diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,23833 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.21789284798769545, + "eval_steps": 500, + "global_step": 3400, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 6.40861317610869e-05, + "grad_norm": 3.049792280054225, + "learning_rate": 1e-06, + "loss": 0.3756, + "step": 1 + }, + { + "epoch": 0.0001281722635221738, + "grad_norm": 2.736097146503981, + "learning_rate": 1e-06, + "loss": 0.4176, + "step": 2 + }, + { + "epoch": 0.00019225839528326071, + "grad_norm": 2.696671878484295, + "learning_rate": 1e-06, + "loss": 0.4815, + "step": 3 + }, + { + "epoch": 0.0002563445270443476, + "grad_norm": 3.337136551938015, + "learning_rate": 1e-06, + "loss": 0.4904, + "step": 4 + }, + { + "epoch": 0.0003204306588054345, + "grad_norm": 2.6647797201470778, + "learning_rate": 1e-06, + "loss": 0.4185, + "step": 5 + }, + { + "epoch": 0.00038451679056652143, + "grad_norm": 2.5193486744426967, + "learning_rate": 1e-06, + "loss": 0.4688, + "step": 6 + }, + { + "epoch": 0.0004486029223276083, + "grad_norm": 2.66396905228593, + "learning_rate": 1e-06, + "loss": 0.3592, + "step": 7 + }, + { + "epoch": 0.0005126890540886952, + "grad_norm": 3.1409024702898307, + "learning_rate": 1e-06, + "loss": 0.4413, + "step": 8 + }, + { + "epoch": 0.0005767751858497821, + "grad_norm": 2.6431941146744, + "learning_rate": 1e-06, + "loss": 0.449, + "step": 9 + }, + { + "epoch": 0.000640861317610869, + "grad_norm": 2.924241054090828, + "learning_rate": 1e-06, + "loss": 0.451, + "step": 10 + }, + { + "epoch": 0.0007049474493719559, + "grad_norm": 2.622132191231163, + "learning_rate": 1e-06, + "loss": 0.4541, + "step": 11 + }, + { + "epoch": 0.0007690335811330429, + "grad_norm": 2.6442231709945525, + "learning_rate": 1e-06, + "loss": 0.3841, + "step": 12 + }, + { + "epoch": 0.0008331197128941297, + "grad_norm": 2.8066741487653886, + "learning_rate": 1e-06, + "loss": 0.3987, + "step": 13 + }, + { + "epoch": 0.0008972058446552166, + "grad_norm": 2.4952793737222794, + "learning_rate": 1e-06, + "loss": 0.4367, + "step": 14 + }, + { + "epoch": 0.0009612919764163035, + "grad_norm": 2.736382692932069, + "learning_rate": 1e-06, + "loss": 0.3844, + "step": 15 + }, + { + "epoch": 0.0010253781081773904, + "grad_norm": 2.690189041127437, + "learning_rate": 1e-06, + "loss": 0.4179, + "step": 16 + }, + { + "epoch": 0.0010894642399384773, + "grad_norm": 2.4809643616320916, + "learning_rate": 1e-06, + "loss": 0.3971, + "step": 17 + }, + { + "epoch": 0.0011535503716995643, + "grad_norm": 2.470983130778616, + "learning_rate": 1e-06, + "loss": 0.4224, + "step": 18 + }, + { + "epoch": 0.001217636503460651, + "grad_norm": 2.9011921770535833, + "learning_rate": 1e-06, + "loss": 0.4941, + "step": 19 + }, + { + "epoch": 0.001281722635221738, + "grad_norm": 2.592260682165995, + "learning_rate": 1e-06, + "loss": 0.3991, + "step": 20 + }, + { + "epoch": 0.001345808766982825, + "grad_norm": 2.7190365461237502, + "learning_rate": 1e-06, + "loss": 0.4386, + "step": 21 + }, + { + "epoch": 0.0014098948987439118, + "grad_norm": 2.623825797975284, + "learning_rate": 1e-06, + "loss": 0.4459, + "step": 22 + }, + { + "epoch": 0.0014739810305049988, + "grad_norm": 4.130313126154051, + "learning_rate": 1e-06, + "loss": 0.4236, + "step": 23 + }, + { + "epoch": 0.0015380671622660857, + "grad_norm": 2.6968346941151387, + "learning_rate": 1e-06, + "loss": 0.4088, + "step": 24 + }, + { + "epoch": 0.0016021532940271724, + "grad_norm": 2.478154009816993, + "learning_rate": 1e-06, + "loss": 0.4064, + "step": 25 + }, + { + "epoch": 0.0016662394257882594, + "grad_norm": 2.878743074857694, + "learning_rate": 1e-06, + "loss": 0.4746, + "step": 26 + }, + { + "epoch": 0.0017303255575493463, + "grad_norm": 2.572012522185562, + "learning_rate": 1e-06, + "loss": 0.4063, + "step": 27 + }, + { + "epoch": 0.0017944116893104333, + "grad_norm": 2.6480699294244108, + "learning_rate": 1e-06, + "loss": 0.4893, + "step": 28 + }, + { + "epoch": 0.0018584978210715202, + "grad_norm": 2.5117617032878536, + "learning_rate": 1e-06, + "loss": 0.4271, + "step": 29 + }, + { + "epoch": 0.001922583952832607, + "grad_norm": 2.6045342374650704, + "learning_rate": 1e-06, + "loss": 0.4077, + "step": 30 + }, + { + "epoch": 0.001986670084593694, + "grad_norm": 2.598933479100832, + "learning_rate": 1e-06, + "loss": 0.447, + "step": 31 + }, + { + "epoch": 0.002050756216354781, + "grad_norm": 2.6774168288479685, + "learning_rate": 1e-06, + "loss": 0.4337, + "step": 32 + }, + { + "epoch": 0.0021148423481158678, + "grad_norm": 2.559910547832753, + "learning_rate": 1e-06, + "loss": 0.3952, + "step": 33 + }, + { + "epoch": 0.0021789284798769547, + "grad_norm": 2.8131005080313427, + "learning_rate": 1e-06, + "loss": 0.3904, + "step": 34 + }, + { + "epoch": 0.0022430146116380416, + "grad_norm": 2.6128742530557196, + "learning_rate": 1e-06, + "loss": 0.4635, + "step": 35 + }, + { + "epoch": 0.0023071007433991286, + "grad_norm": 2.790837783591932, + "learning_rate": 1e-06, + "loss": 0.4692, + "step": 36 + }, + { + "epoch": 0.0023711868751602155, + "grad_norm": 2.677228029063294, + "learning_rate": 1e-06, + "loss": 0.4263, + "step": 37 + }, + { + "epoch": 0.002435273006921302, + "grad_norm": 2.899817329376979, + "learning_rate": 1e-06, + "loss": 0.4265, + "step": 38 + }, + { + "epoch": 0.002499359138682389, + "grad_norm": 2.5848276627009055, + "learning_rate": 1e-06, + "loss": 0.4095, + "step": 39 + }, + { + "epoch": 0.002563445270443476, + "grad_norm": 2.5591688177632674, + "learning_rate": 1e-06, + "loss": 0.4879, + "step": 40 + }, + { + "epoch": 0.002627531402204563, + "grad_norm": 2.628718605922321, + "learning_rate": 1e-06, + "loss": 0.4264, + "step": 41 + }, + { + "epoch": 0.00269161753396565, + "grad_norm": 2.5750637451230385, + "learning_rate": 1e-06, + "loss": 0.3824, + "step": 42 + }, + { + "epoch": 0.0027557036657267367, + "grad_norm": 2.583335490883839, + "learning_rate": 1e-06, + "loss": 0.4289, + "step": 43 + }, + { + "epoch": 0.0028197897974878237, + "grad_norm": 2.5301462397652985, + "learning_rate": 1e-06, + "loss": 0.4284, + "step": 44 + }, + { + "epoch": 0.0028838759292489106, + "grad_norm": 2.629254554816327, + "learning_rate": 1e-06, + "loss": 0.4944, + "step": 45 + }, + { + "epoch": 0.0029479620610099976, + "grad_norm": 2.3796843494411215, + "learning_rate": 1e-06, + "loss": 0.4264, + "step": 46 + }, + { + "epoch": 0.0030120481927710845, + "grad_norm": 2.337285306067337, + "learning_rate": 1e-06, + "loss": 0.3691, + "step": 47 + }, + { + "epoch": 0.0030761343245321714, + "grad_norm": 2.4625613624689686, + "learning_rate": 1e-06, + "loss": 0.435, + "step": 48 + }, + { + "epoch": 0.003140220456293258, + "grad_norm": 2.6425603802991904, + "learning_rate": 1e-06, + "loss": 0.4156, + "step": 49 + }, + { + "epoch": 0.003204306588054345, + "grad_norm": 2.640849648601871, + "learning_rate": 1e-06, + "loss": 0.4261, + "step": 50 + }, + { + "epoch": 0.003268392719815432, + "grad_norm": 2.5632636645352895, + "learning_rate": 1e-06, + "loss": 0.4231, + "step": 51 + }, + { + "epoch": 0.0033324788515765188, + "grad_norm": 2.576641529620584, + "learning_rate": 1e-06, + "loss": 0.4591, + "step": 52 + }, + { + "epoch": 0.0033965649833376057, + "grad_norm": 2.485785384457921, + "learning_rate": 1e-06, + "loss": 0.436, + "step": 53 + }, + { + "epoch": 0.0034606511150986926, + "grad_norm": 2.546660745565414, + "learning_rate": 1e-06, + "loss": 0.4197, + "step": 54 + }, + { + "epoch": 0.0035247372468597796, + "grad_norm": 2.4421749353029747, + "learning_rate": 1e-06, + "loss": 0.3905, + "step": 55 + }, + { + "epoch": 0.0035888233786208665, + "grad_norm": 2.660810200932209, + "learning_rate": 1e-06, + "loss": 0.3768, + "step": 56 + }, + { + "epoch": 0.0036529095103819535, + "grad_norm": 2.4907286529936625, + "learning_rate": 1e-06, + "loss": 0.4203, + "step": 57 + }, + { + "epoch": 0.0037169956421430404, + "grad_norm": 2.5185462082716965, + "learning_rate": 1e-06, + "loss": 0.4264, + "step": 58 + }, + { + "epoch": 0.0037810817739041274, + "grad_norm": 2.5075831979076475, + "learning_rate": 1e-06, + "loss": 0.4246, + "step": 59 + }, + { + "epoch": 0.003845167905665214, + "grad_norm": 2.5893984879897434, + "learning_rate": 1e-06, + "loss": 0.4869, + "step": 60 + }, + { + "epoch": 0.003909254037426301, + "grad_norm": 2.5513574941347916, + "learning_rate": 1e-06, + "loss": 0.3985, + "step": 61 + }, + { + "epoch": 0.003973340169187388, + "grad_norm": 2.3634405046489824, + "learning_rate": 1e-06, + "loss": 0.4067, + "step": 62 + }, + { + "epoch": 0.004037426300948475, + "grad_norm": 2.794196161444718, + "learning_rate": 1e-06, + "loss": 0.474, + "step": 63 + }, + { + "epoch": 0.004101512432709562, + "grad_norm": 2.7786386772732636, + "learning_rate": 1e-06, + "loss": 0.4546, + "step": 64 + }, + { + "epoch": 0.0041655985644706486, + "grad_norm": 2.6427053217076577, + "learning_rate": 1e-06, + "loss": 0.4149, + "step": 65 + }, + { + "epoch": 0.0042296846962317355, + "grad_norm": 2.9375865167158963, + "learning_rate": 1e-06, + "loss": 0.4606, + "step": 66 + }, + { + "epoch": 0.0042937708279928224, + "grad_norm": 2.3163933176028584, + "learning_rate": 1e-06, + "loss": 0.4355, + "step": 67 + }, + { + "epoch": 0.004357856959753909, + "grad_norm": 2.4717258151766455, + "learning_rate": 1e-06, + "loss": 0.4136, + "step": 68 + }, + { + "epoch": 0.004421943091514996, + "grad_norm": 2.6985368472703435, + "learning_rate": 1e-06, + "loss": 0.4447, + "step": 69 + }, + { + "epoch": 0.004486029223276083, + "grad_norm": 2.6854921516916264, + "learning_rate": 1e-06, + "loss": 0.4055, + "step": 70 + }, + { + "epoch": 0.00455011535503717, + "grad_norm": 2.69736848706748, + "learning_rate": 1e-06, + "loss": 0.4256, + "step": 71 + }, + { + "epoch": 0.004614201486798257, + "grad_norm": 2.596490884499458, + "learning_rate": 1e-06, + "loss": 0.4069, + "step": 72 + }, + { + "epoch": 0.004678287618559344, + "grad_norm": 2.8156615368629216, + "learning_rate": 1e-06, + "loss": 0.3991, + "step": 73 + }, + { + "epoch": 0.004742373750320431, + "grad_norm": 2.4653779008574017, + "learning_rate": 1e-06, + "loss": 0.4834, + "step": 74 + }, + { + "epoch": 0.004806459882081518, + "grad_norm": 2.619661319531094, + "learning_rate": 1e-06, + "loss": 0.4507, + "step": 75 + }, + { + "epoch": 0.004870546013842604, + "grad_norm": 2.611916458285565, + "learning_rate": 1e-06, + "loss": 0.4103, + "step": 76 + }, + { + "epoch": 0.004934632145603691, + "grad_norm": 2.5636207784022225, + "learning_rate": 1e-06, + "loss": 0.4467, + "step": 77 + }, + { + "epoch": 0.004998718277364778, + "grad_norm": 2.4963493016425935, + "learning_rate": 1e-06, + "loss": 0.4775, + "step": 78 + }, + { + "epoch": 0.005062804409125865, + "grad_norm": 2.8481558198252133, + "learning_rate": 1e-06, + "loss": 0.4645, + "step": 79 + }, + { + "epoch": 0.005126890540886952, + "grad_norm": 2.724709371989913, + "learning_rate": 1e-06, + "loss": 0.4768, + "step": 80 + }, + { + "epoch": 0.005190976672648039, + "grad_norm": 2.474072117134112, + "learning_rate": 1e-06, + "loss": 0.415, + "step": 81 + }, + { + "epoch": 0.005255062804409126, + "grad_norm": 2.6671004940790204, + "learning_rate": 1e-06, + "loss": 0.4297, + "step": 82 + }, + { + "epoch": 0.005319148936170213, + "grad_norm": 2.636356788264366, + "learning_rate": 1e-06, + "loss": 0.4816, + "step": 83 + }, + { + "epoch": 0.0053832350679313, + "grad_norm": 2.5333332281115566, + "learning_rate": 1e-06, + "loss": 0.4175, + "step": 84 + }, + { + "epoch": 0.0054473211996923865, + "grad_norm": 2.5815065252078324, + "learning_rate": 1e-06, + "loss": 0.4338, + "step": 85 + }, + { + "epoch": 0.0055114073314534735, + "grad_norm": 2.532968445439128, + "learning_rate": 1e-06, + "loss": 0.4621, + "step": 86 + }, + { + "epoch": 0.00557549346321456, + "grad_norm": 2.5729492311933275, + "learning_rate": 1e-06, + "loss": 0.4469, + "step": 87 + }, + { + "epoch": 0.005639579594975647, + "grad_norm": 2.550980847594244, + "learning_rate": 1e-06, + "loss": 0.4541, + "step": 88 + }, + { + "epoch": 0.005703665726736734, + "grad_norm": 2.341442489669966, + "learning_rate": 1e-06, + "loss": 0.3631, + "step": 89 + }, + { + "epoch": 0.005767751858497821, + "grad_norm": 2.4357259388266046, + "learning_rate": 1e-06, + "loss": 0.3696, + "step": 90 + }, + { + "epoch": 0.005831837990258908, + "grad_norm": 2.8327584495472657, + "learning_rate": 1e-06, + "loss": 0.4202, + "step": 91 + }, + { + "epoch": 0.005895924122019995, + "grad_norm": 2.67552409067869, + "learning_rate": 1e-06, + "loss": 0.4439, + "step": 92 + }, + { + "epoch": 0.005960010253781082, + "grad_norm": 2.358660039118358, + "learning_rate": 1e-06, + "loss": 0.4177, + "step": 93 + }, + { + "epoch": 0.006024096385542169, + "grad_norm": 2.56361346512011, + "learning_rate": 1e-06, + "loss": 0.369, + "step": 94 + }, + { + "epoch": 0.006088182517303256, + "grad_norm": 2.5406342261425143, + "learning_rate": 1e-06, + "loss": 0.4096, + "step": 95 + }, + { + "epoch": 0.006152268649064343, + "grad_norm": 2.4792750353456876, + "learning_rate": 1e-06, + "loss": 0.4104, + "step": 96 + }, + { + "epoch": 0.006216354780825429, + "grad_norm": 2.632504157019107, + "learning_rate": 1e-06, + "loss": 0.4541, + "step": 97 + }, + { + "epoch": 0.006280440912586516, + "grad_norm": 2.6260305723231485, + "learning_rate": 1e-06, + "loss": 0.4319, + "step": 98 + }, + { + "epoch": 0.006344527044347603, + "grad_norm": 2.6485300234091356, + "learning_rate": 1e-06, + "loss": 0.3941, + "step": 99 + }, + { + "epoch": 0.00640861317610869, + "grad_norm": 2.611785234749641, + "learning_rate": 1e-06, + "loss": 0.4118, + "step": 100 + }, + { + "epoch": 0.006472699307869777, + "grad_norm": 2.485020096159456, + "learning_rate": 1e-06, + "loss": 0.4099, + "step": 101 + }, + { + "epoch": 0.006536785439630864, + "grad_norm": 2.515516066727853, + "learning_rate": 1e-06, + "loss": 0.4082, + "step": 102 + }, + { + "epoch": 0.006600871571391951, + "grad_norm": 2.6842572405670575, + "learning_rate": 1e-06, + "loss": 0.4795, + "step": 103 + }, + { + "epoch": 0.0066649577031530375, + "grad_norm": 2.7120822334389514, + "learning_rate": 1e-06, + "loss": 0.4101, + "step": 104 + }, + { + "epoch": 0.0067290438349141245, + "grad_norm": 2.69713806260704, + "learning_rate": 1e-06, + "loss": 0.4411, + "step": 105 + }, + { + "epoch": 0.006793129966675211, + "grad_norm": 2.823995680469878, + "learning_rate": 1e-06, + "loss": 0.4376, + "step": 106 + }, + { + "epoch": 0.006857216098436298, + "grad_norm": 2.971152302019369, + "learning_rate": 1e-06, + "loss": 0.4351, + "step": 107 + }, + { + "epoch": 0.006921302230197385, + "grad_norm": 2.6318324758558522, + "learning_rate": 1e-06, + "loss": 0.4086, + "step": 108 + }, + { + "epoch": 0.006985388361958472, + "grad_norm": 2.663343057511306, + "learning_rate": 1e-06, + "loss": 0.4143, + "step": 109 + }, + { + "epoch": 0.007049474493719559, + "grad_norm": 2.5787240892091043, + "learning_rate": 1e-06, + "loss": 0.4553, + "step": 110 + }, + { + "epoch": 0.007113560625480646, + "grad_norm": 2.904813690584719, + "learning_rate": 1e-06, + "loss": 0.4463, + "step": 111 + }, + { + "epoch": 0.007177646757241733, + "grad_norm": 2.6164388520336543, + "learning_rate": 1e-06, + "loss": 0.4237, + "step": 112 + }, + { + "epoch": 0.00724173288900282, + "grad_norm": 3.62468026756726, + "learning_rate": 1e-06, + "loss": 0.5197, + "step": 113 + }, + { + "epoch": 0.007305819020763907, + "grad_norm": 2.6283606516515183, + "learning_rate": 1e-06, + "loss": 0.4176, + "step": 114 + }, + { + "epoch": 0.007369905152524994, + "grad_norm": 2.733978140448335, + "learning_rate": 1e-06, + "loss": 0.4945, + "step": 115 + }, + { + "epoch": 0.007433991284286081, + "grad_norm": 2.6142513762887365, + "learning_rate": 1e-06, + "loss": 0.3966, + "step": 116 + }, + { + "epoch": 0.007498077416047168, + "grad_norm": 2.9462705802375964, + "learning_rate": 1e-06, + "loss": 0.3969, + "step": 117 + }, + { + "epoch": 0.007562163547808255, + "grad_norm": 2.5077415901431492, + "learning_rate": 1e-06, + "loss": 0.4053, + "step": 118 + }, + { + "epoch": 0.007626249679569341, + "grad_norm": 2.484246468433061, + "learning_rate": 1e-06, + "loss": 0.4683, + "step": 119 + }, + { + "epoch": 0.007690335811330428, + "grad_norm": 2.544320892555243, + "learning_rate": 1e-06, + "loss": 0.4388, + "step": 120 + }, + { + "epoch": 0.007754421943091515, + "grad_norm": 2.4444504338812396, + "learning_rate": 1e-06, + "loss": 0.4482, + "step": 121 + }, + { + "epoch": 0.007818508074852602, + "grad_norm": 2.6270003993720485, + "learning_rate": 1e-06, + "loss": 0.4309, + "step": 122 + }, + { + "epoch": 0.00788259420661369, + "grad_norm": 2.5560433655850656, + "learning_rate": 1e-06, + "loss": 0.4204, + "step": 123 + }, + { + "epoch": 0.007946680338374775, + "grad_norm": 2.583021760048913, + "learning_rate": 1e-06, + "loss": 0.4224, + "step": 124 + }, + { + "epoch": 0.008010766470135863, + "grad_norm": 2.516809486753932, + "learning_rate": 1e-06, + "loss": 0.4302, + "step": 125 + }, + { + "epoch": 0.00807485260189695, + "grad_norm": 2.7202512962911167, + "learning_rate": 1e-06, + "loss": 0.4496, + "step": 126 + }, + { + "epoch": 0.008138938733658037, + "grad_norm": 2.561504469620483, + "learning_rate": 1e-06, + "loss": 0.3797, + "step": 127 + }, + { + "epoch": 0.008203024865419123, + "grad_norm": 2.70197541717228, + "learning_rate": 1e-06, + "loss": 0.4708, + "step": 128 + }, + { + "epoch": 0.008267110997180211, + "grad_norm": 2.703655222038152, + "learning_rate": 1e-06, + "loss": 0.4013, + "step": 129 + }, + { + "epoch": 0.008331197128941297, + "grad_norm": 2.801075229740231, + "learning_rate": 1e-06, + "loss": 0.4957, + "step": 130 + }, + { + "epoch": 0.008395283260702383, + "grad_norm": 2.7000542176029283, + "learning_rate": 1e-06, + "loss": 0.4056, + "step": 131 + }, + { + "epoch": 0.008459369392463471, + "grad_norm": 2.728367510056538, + "learning_rate": 1e-06, + "loss": 0.4412, + "step": 132 + }, + { + "epoch": 0.008523455524224557, + "grad_norm": 2.4926115155921744, + "learning_rate": 1e-06, + "loss": 0.4143, + "step": 133 + }, + { + "epoch": 0.008587541655985645, + "grad_norm": 2.5170415635469663, + "learning_rate": 1e-06, + "loss": 0.4472, + "step": 134 + }, + { + "epoch": 0.008651627787746731, + "grad_norm": 2.766298946998558, + "learning_rate": 1e-06, + "loss": 0.3751, + "step": 135 + }, + { + "epoch": 0.008715713919507819, + "grad_norm": 2.725850116300731, + "learning_rate": 1e-06, + "loss": 0.4556, + "step": 136 + }, + { + "epoch": 0.008779800051268905, + "grad_norm": 2.53900556364546, + "learning_rate": 1e-06, + "loss": 0.4355, + "step": 137 + }, + { + "epoch": 0.008843886183029993, + "grad_norm": 2.477924972118881, + "learning_rate": 1e-06, + "loss": 0.4386, + "step": 138 + }, + { + "epoch": 0.008907972314791079, + "grad_norm": 2.826510511701279, + "learning_rate": 1e-06, + "loss": 0.4638, + "step": 139 + }, + { + "epoch": 0.008972058446552167, + "grad_norm": 2.6376643369143764, + "learning_rate": 1e-06, + "loss": 0.4355, + "step": 140 + }, + { + "epoch": 0.009036144578313253, + "grad_norm": 2.554145291137539, + "learning_rate": 1e-06, + "loss": 0.4451, + "step": 141 + }, + { + "epoch": 0.00910023071007434, + "grad_norm": 2.6743052984416154, + "learning_rate": 1e-06, + "loss": 0.4498, + "step": 142 + }, + { + "epoch": 0.009164316841835426, + "grad_norm": 2.56483434081286, + "learning_rate": 1e-06, + "loss": 0.4436, + "step": 143 + }, + { + "epoch": 0.009228402973596514, + "grad_norm": 2.4254491231280455, + "learning_rate": 1e-06, + "loss": 0.42, + "step": 144 + }, + { + "epoch": 0.0092924891053576, + "grad_norm": 2.558112650901851, + "learning_rate": 1e-06, + "loss": 0.4531, + "step": 145 + }, + { + "epoch": 0.009356575237118688, + "grad_norm": 2.405260114602089, + "learning_rate": 1e-06, + "loss": 0.4223, + "step": 146 + }, + { + "epoch": 0.009420661368879774, + "grad_norm": 2.60820574354277, + "learning_rate": 1e-06, + "loss": 0.3979, + "step": 147 + }, + { + "epoch": 0.009484747500640862, + "grad_norm": 2.6897694601294533, + "learning_rate": 1e-06, + "loss": 0.478, + "step": 148 + }, + { + "epoch": 0.009548833632401948, + "grad_norm": 2.579453394198776, + "learning_rate": 1e-06, + "loss": 0.4296, + "step": 149 + }, + { + "epoch": 0.009612919764163036, + "grad_norm": 2.7424643174105716, + "learning_rate": 1e-06, + "loss": 0.3937, + "step": 150 + }, + { + "epoch": 0.009677005895924122, + "grad_norm": 2.7023142583494875, + "learning_rate": 1e-06, + "loss": 0.4058, + "step": 151 + }, + { + "epoch": 0.009741092027685208, + "grad_norm": 2.506484730613173, + "learning_rate": 1e-06, + "loss": 0.3936, + "step": 152 + }, + { + "epoch": 0.009805178159446296, + "grad_norm": 2.4749366097633874, + "learning_rate": 1e-06, + "loss": 0.4255, + "step": 153 + }, + { + "epoch": 0.009869264291207382, + "grad_norm": 2.3269287496593414, + "learning_rate": 1e-06, + "loss": 0.3664, + "step": 154 + }, + { + "epoch": 0.00993335042296847, + "grad_norm": 2.4212911977056604, + "learning_rate": 1e-06, + "loss": 0.4353, + "step": 155 + }, + { + "epoch": 0.009997436554729556, + "grad_norm": 2.4468505337849593, + "learning_rate": 1e-06, + "loss": 0.4622, + "step": 156 + }, + { + "epoch": 0.010061522686490644, + "grad_norm": 2.5645734340411206, + "learning_rate": 1e-06, + "loss": 0.3703, + "step": 157 + }, + { + "epoch": 0.01012560881825173, + "grad_norm": 2.6534084292524764, + "learning_rate": 1e-06, + "loss": 0.4307, + "step": 158 + }, + { + "epoch": 0.010189694950012818, + "grad_norm": 2.510221138642233, + "learning_rate": 1e-06, + "loss": 0.4469, + "step": 159 + }, + { + "epoch": 0.010253781081773904, + "grad_norm": 2.6653843643185047, + "learning_rate": 1e-06, + "loss": 0.4187, + "step": 160 + }, + { + "epoch": 0.010317867213534991, + "grad_norm": 2.647156935213225, + "learning_rate": 1e-06, + "loss": 0.4281, + "step": 161 + }, + { + "epoch": 0.010381953345296078, + "grad_norm": 2.4645904023031364, + "learning_rate": 1e-06, + "loss": 0.4186, + "step": 162 + }, + { + "epoch": 0.010446039477057165, + "grad_norm": 2.5273636798670447, + "learning_rate": 1e-06, + "loss": 0.4196, + "step": 163 + }, + { + "epoch": 0.010510125608818251, + "grad_norm": 2.5759837139960355, + "learning_rate": 1e-06, + "loss": 0.4219, + "step": 164 + }, + { + "epoch": 0.01057421174057934, + "grad_norm": 2.7530163451644483, + "learning_rate": 1e-06, + "loss": 0.4473, + "step": 165 + }, + { + "epoch": 0.010638297872340425, + "grad_norm": 2.771144588040565, + "learning_rate": 1e-06, + "loss": 0.3832, + "step": 166 + }, + { + "epoch": 0.010702384004101513, + "grad_norm": 2.6609737067836194, + "learning_rate": 1e-06, + "loss": 0.4258, + "step": 167 + }, + { + "epoch": 0.0107664701358626, + "grad_norm": 2.76214436031771, + "learning_rate": 1e-06, + "loss": 0.3864, + "step": 168 + }, + { + "epoch": 0.010830556267623687, + "grad_norm": 2.722402456149691, + "learning_rate": 1e-06, + "loss": 0.404, + "step": 169 + }, + { + "epoch": 0.010894642399384773, + "grad_norm": 2.5998788986620696, + "learning_rate": 1e-06, + "loss": 0.4311, + "step": 170 + }, + { + "epoch": 0.01095872853114586, + "grad_norm": 2.7559088943343335, + "learning_rate": 1e-06, + "loss": 0.4491, + "step": 171 + }, + { + "epoch": 0.011022814662906947, + "grad_norm": 2.7409712519131686, + "learning_rate": 1e-06, + "loss": 0.4435, + "step": 172 + }, + { + "epoch": 0.011086900794668033, + "grad_norm": 2.664956072118899, + "learning_rate": 1e-06, + "loss": 0.4193, + "step": 173 + }, + { + "epoch": 0.01115098692642912, + "grad_norm": 3.1192465909037774, + "learning_rate": 1e-06, + "loss": 0.4669, + "step": 174 + }, + { + "epoch": 0.011215073058190207, + "grad_norm": 2.563621451938816, + "learning_rate": 1e-06, + "loss": 0.3767, + "step": 175 + }, + { + "epoch": 0.011279159189951295, + "grad_norm": 2.570045233440838, + "learning_rate": 1e-06, + "loss": 0.4765, + "step": 176 + }, + { + "epoch": 0.01134324532171238, + "grad_norm": 2.4916231423662043, + "learning_rate": 1e-06, + "loss": 0.4392, + "step": 177 + }, + { + "epoch": 0.011407331453473469, + "grad_norm": 2.7651096653059195, + "learning_rate": 1e-06, + "loss": 0.4099, + "step": 178 + }, + { + "epoch": 0.011471417585234555, + "grad_norm": 2.562958698429324, + "learning_rate": 1e-06, + "loss": 0.4092, + "step": 179 + }, + { + "epoch": 0.011535503716995642, + "grad_norm": 2.464035920659328, + "learning_rate": 1e-06, + "loss": 0.3709, + "step": 180 + }, + { + "epoch": 0.011599589848756729, + "grad_norm": 2.751881851503854, + "learning_rate": 1e-06, + "loss": 0.3687, + "step": 181 + }, + { + "epoch": 0.011663675980517816, + "grad_norm": 2.559719631734224, + "learning_rate": 1e-06, + "loss": 0.378, + "step": 182 + }, + { + "epoch": 0.011727762112278902, + "grad_norm": 2.4418892922224074, + "learning_rate": 1e-06, + "loss": 0.4095, + "step": 183 + }, + { + "epoch": 0.01179184824403999, + "grad_norm": 2.6724559730934545, + "learning_rate": 1e-06, + "loss": 0.4036, + "step": 184 + }, + { + "epoch": 0.011855934375801076, + "grad_norm": 2.6322280599536936, + "learning_rate": 1e-06, + "loss": 0.4517, + "step": 185 + }, + { + "epoch": 0.011920020507562164, + "grad_norm": 2.605353550494746, + "learning_rate": 1e-06, + "loss": 0.4107, + "step": 186 + }, + { + "epoch": 0.01198410663932325, + "grad_norm": 2.564073940099639, + "learning_rate": 1e-06, + "loss": 0.3855, + "step": 187 + }, + { + "epoch": 0.012048192771084338, + "grad_norm": 2.535479880548571, + "learning_rate": 1e-06, + "loss": 0.4454, + "step": 188 + }, + { + "epoch": 0.012112278902845424, + "grad_norm": 2.566704466588977, + "learning_rate": 1e-06, + "loss": 0.4025, + "step": 189 + }, + { + "epoch": 0.012176365034606512, + "grad_norm": 2.5806716178681293, + "learning_rate": 1e-06, + "loss": 0.3537, + "step": 190 + }, + { + "epoch": 0.012240451166367598, + "grad_norm": 2.825882756789249, + "learning_rate": 1e-06, + "loss": 0.4407, + "step": 191 + }, + { + "epoch": 0.012304537298128686, + "grad_norm": 2.578508321046613, + "learning_rate": 1e-06, + "loss": 0.4242, + "step": 192 + }, + { + "epoch": 0.012368623429889772, + "grad_norm": 2.460610078013274, + "learning_rate": 1e-06, + "loss": 0.3796, + "step": 193 + }, + { + "epoch": 0.012432709561650858, + "grad_norm": 2.52758953552192, + "learning_rate": 1e-06, + "loss": 0.3969, + "step": 194 + }, + { + "epoch": 0.012496795693411946, + "grad_norm": 2.680670213978359, + "learning_rate": 1e-06, + "loss": 0.4494, + "step": 195 + }, + { + "epoch": 0.012560881825173032, + "grad_norm": 2.5727240318324345, + "learning_rate": 1e-06, + "loss": 0.3651, + "step": 196 + }, + { + "epoch": 0.01262496795693412, + "grad_norm": 2.5479336539567035, + "learning_rate": 1e-06, + "loss": 0.4283, + "step": 197 + }, + { + "epoch": 0.012689054088695206, + "grad_norm": 2.5115294399067687, + "learning_rate": 1e-06, + "loss": 0.4641, + "step": 198 + }, + { + "epoch": 0.012753140220456293, + "grad_norm": 2.78330697315584, + "learning_rate": 1e-06, + "loss": 0.4351, + "step": 199 + }, + { + "epoch": 0.01281722635221738, + "grad_norm": 2.45488310212574, + "learning_rate": 1e-06, + "loss": 0.414, + "step": 200 + }, + { + "epoch": 0.012881312483978467, + "grad_norm": 2.3852827309025404, + "learning_rate": 1e-06, + "loss": 0.3923, + "step": 201 + }, + { + "epoch": 0.012945398615739553, + "grad_norm": 2.5864768221972354, + "learning_rate": 1e-06, + "loss": 0.4649, + "step": 202 + }, + { + "epoch": 0.013009484747500641, + "grad_norm": 2.7974137736304803, + "learning_rate": 1e-06, + "loss": 0.4582, + "step": 203 + }, + { + "epoch": 0.013073570879261727, + "grad_norm": 2.406651303043376, + "learning_rate": 1e-06, + "loss": 0.409, + "step": 204 + }, + { + "epoch": 0.013137657011022815, + "grad_norm": 2.5980042928944735, + "learning_rate": 1e-06, + "loss": 0.4234, + "step": 205 + }, + { + "epoch": 0.013201743142783901, + "grad_norm": 2.5707835336578237, + "learning_rate": 1e-06, + "loss": 0.4188, + "step": 206 + }, + { + "epoch": 0.013265829274544989, + "grad_norm": 2.5628325989007665, + "learning_rate": 1e-06, + "loss": 0.4019, + "step": 207 + }, + { + "epoch": 0.013329915406306075, + "grad_norm": 2.3178028538939057, + "learning_rate": 1e-06, + "loss": 0.3841, + "step": 208 + }, + { + "epoch": 0.013394001538067163, + "grad_norm": 2.4943832148029372, + "learning_rate": 1e-06, + "loss": 0.4318, + "step": 209 + }, + { + "epoch": 0.013458087669828249, + "grad_norm": 2.634069670687289, + "learning_rate": 1e-06, + "loss": 0.4756, + "step": 210 + }, + { + "epoch": 0.013522173801589337, + "grad_norm": 2.557527545449268, + "learning_rate": 1e-06, + "loss": 0.4653, + "step": 211 + }, + { + "epoch": 0.013586259933350423, + "grad_norm": 2.463861613281935, + "learning_rate": 1e-06, + "loss": 0.3948, + "step": 212 + }, + { + "epoch": 0.01365034606511151, + "grad_norm": 2.3854742264069397, + "learning_rate": 1e-06, + "loss": 0.3689, + "step": 213 + }, + { + "epoch": 0.013714432196872597, + "grad_norm": 2.4260505846618186, + "learning_rate": 1e-06, + "loss": 0.3644, + "step": 214 + }, + { + "epoch": 0.013778518328633685, + "grad_norm": 2.5804340331860343, + "learning_rate": 1e-06, + "loss": 0.4042, + "step": 215 + }, + { + "epoch": 0.01384260446039477, + "grad_norm": 2.6587083377401455, + "learning_rate": 1e-06, + "loss": 0.4072, + "step": 216 + }, + { + "epoch": 0.013906690592155857, + "grad_norm": 2.617929149718754, + "learning_rate": 1e-06, + "loss": 0.4605, + "step": 217 + }, + { + "epoch": 0.013970776723916944, + "grad_norm": 2.5803501828834143, + "learning_rate": 1e-06, + "loss": 0.454, + "step": 218 + }, + { + "epoch": 0.01403486285567803, + "grad_norm": 2.788641206889281, + "learning_rate": 1e-06, + "loss": 0.404, + "step": 219 + }, + { + "epoch": 0.014098948987439118, + "grad_norm": 2.506065493967879, + "learning_rate": 1e-06, + "loss": 0.3746, + "step": 220 + }, + { + "epoch": 0.014163035119200204, + "grad_norm": 2.62297260217538, + "learning_rate": 1e-06, + "loss": 0.4524, + "step": 221 + }, + { + "epoch": 0.014227121250961292, + "grad_norm": 2.396202673688117, + "learning_rate": 1e-06, + "loss": 0.423, + "step": 222 + }, + { + "epoch": 0.014291207382722378, + "grad_norm": 2.5608484595985788, + "learning_rate": 1e-06, + "loss": 0.394, + "step": 223 + }, + { + "epoch": 0.014355293514483466, + "grad_norm": 2.7259635787986722, + "learning_rate": 1e-06, + "loss": 0.4533, + "step": 224 + }, + { + "epoch": 0.014419379646244552, + "grad_norm": 2.7655406885266967, + "learning_rate": 1e-06, + "loss": 0.4553, + "step": 225 + }, + { + "epoch": 0.01448346577800564, + "grad_norm": 2.548446603379672, + "learning_rate": 1e-06, + "loss": 0.3889, + "step": 226 + }, + { + "epoch": 0.014547551909766726, + "grad_norm": 2.4670836949298565, + "learning_rate": 1e-06, + "loss": 0.4037, + "step": 227 + }, + { + "epoch": 0.014611638041527814, + "grad_norm": 2.4857360759200557, + "learning_rate": 1e-06, + "loss": 0.3866, + "step": 228 + }, + { + "epoch": 0.0146757241732889, + "grad_norm": 2.6690124449544754, + "learning_rate": 1e-06, + "loss": 0.4127, + "step": 229 + }, + { + "epoch": 0.014739810305049988, + "grad_norm": 2.6132105365625464, + "learning_rate": 1e-06, + "loss": 0.4843, + "step": 230 + }, + { + "epoch": 0.014803896436811074, + "grad_norm": 2.7432970666186347, + "learning_rate": 1e-06, + "loss": 0.4536, + "step": 231 + }, + { + "epoch": 0.014867982568572162, + "grad_norm": 2.7658731495653672, + "learning_rate": 1e-06, + "loss": 0.3939, + "step": 232 + }, + { + "epoch": 0.014932068700333248, + "grad_norm": 2.580362454305093, + "learning_rate": 1e-06, + "loss": 0.3823, + "step": 233 + }, + { + "epoch": 0.014996154832094336, + "grad_norm": 2.6052893301698745, + "learning_rate": 1e-06, + "loss": 0.4735, + "step": 234 + }, + { + "epoch": 0.015060240963855422, + "grad_norm": 2.48609104419947, + "learning_rate": 1e-06, + "loss": 0.4368, + "step": 235 + }, + { + "epoch": 0.01512432709561651, + "grad_norm": 2.4309934849231962, + "learning_rate": 1e-06, + "loss": 0.3756, + "step": 236 + }, + { + "epoch": 0.015188413227377595, + "grad_norm": 2.437877921605441, + "learning_rate": 1e-06, + "loss": 0.4504, + "step": 237 + }, + { + "epoch": 0.015252499359138682, + "grad_norm": 2.445601223786844, + "learning_rate": 1e-06, + "loss": 0.3719, + "step": 238 + }, + { + "epoch": 0.01531658549089977, + "grad_norm": 2.566610647012455, + "learning_rate": 1e-06, + "loss": 0.4522, + "step": 239 + }, + { + "epoch": 0.015380671622660855, + "grad_norm": 2.3629967174097883, + "learning_rate": 1e-06, + "loss": 0.3975, + "step": 240 + }, + { + "epoch": 0.015444757754421943, + "grad_norm": 2.7502843636448358, + "learning_rate": 1e-06, + "loss": 0.4292, + "step": 241 + }, + { + "epoch": 0.01550884388618303, + "grad_norm": 2.708324699769681, + "learning_rate": 1e-06, + "loss": 0.4187, + "step": 242 + }, + { + "epoch": 0.015572930017944117, + "grad_norm": 2.444268394765568, + "learning_rate": 1e-06, + "loss": 0.4329, + "step": 243 + }, + { + "epoch": 0.015637016149705203, + "grad_norm": 2.3920689319195025, + "learning_rate": 1e-06, + "loss": 0.3964, + "step": 244 + }, + { + "epoch": 0.01570110228146629, + "grad_norm": 2.4825639040106156, + "learning_rate": 1e-06, + "loss": 0.4123, + "step": 245 + }, + { + "epoch": 0.01576518841322738, + "grad_norm": 2.4814384615199123, + "learning_rate": 1e-06, + "loss": 0.4215, + "step": 246 + }, + { + "epoch": 0.015829274544988465, + "grad_norm": 2.724229129070956, + "learning_rate": 1e-06, + "loss": 0.4317, + "step": 247 + }, + { + "epoch": 0.01589336067674955, + "grad_norm": 2.7175053474836766, + "learning_rate": 1e-06, + "loss": 0.3619, + "step": 248 + }, + { + "epoch": 0.015957446808510637, + "grad_norm": 2.4697050879589275, + "learning_rate": 1e-06, + "loss": 0.3635, + "step": 249 + }, + { + "epoch": 0.016021532940271727, + "grad_norm": 2.4595339460868573, + "learning_rate": 1e-06, + "loss": 0.476, + "step": 250 + }, + { + "epoch": 0.016085619072032813, + "grad_norm": 2.6445467786638814, + "learning_rate": 1e-06, + "loss": 0.4999, + "step": 251 + }, + { + "epoch": 0.0161497052037939, + "grad_norm": 2.4693317351326414, + "learning_rate": 1e-06, + "loss": 0.4237, + "step": 252 + }, + { + "epoch": 0.016213791335554985, + "grad_norm": 2.4702693466347125, + "learning_rate": 1e-06, + "loss": 0.4443, + "step": 253 + }, + { + "epoch": 0.016277877467316074, + "grad_norm": 2.717654695399045, + "learning_rate": 1e-06, + "loss": 0.4019, + "step": 254 + }, + { + "epoch": 0.01634196359907716, + "grad_norm": 2.603236954441173, + "learning_rate": 1e-06, + "loss": 0.469, + "step": 255 + }, + { + "epoch": 0.016406049730838246, + "grad_norm": 2.7996949150270867, + "learning_rate": 1e-06, + "loss": 0.4296, + "step": 256 + }, + { + "epoch": 0.016470135862599333, + "grad_norm": 2.6193082828705125, + "learning_rate": 1e-06, + "loss": 0.4333, + "step": 257 + }, + { + "epoch": 0.016534221994360422, + "grad_norm": 2.473437533047886, + "learning_rate": 1e-06, + "loss": 0.4774, + "step": 258 + }, + { + "epoch": 0.016598308126121508, + "grad_norm": 2.564354049838647, + "learning_rate": 1e-06, + "loss": 0.3997, + "step": 259 + }, + { + "epoch": 0.016662394257882594, + "grad_norm": 2.547984763105764, + "learning_rate": 1e-06, + "loss": 0.4982, + "step": 260 + }, + { + "epoch": 0.01672648038964368, + "grad_norm": 2.693431115018885, + "learning_rate": 1e-06, + "loss": 0.4535, + "step": 261 + }, + { + "epoch": 0.016790566521404766, + "grad_norm": 2.5918028116724146, + "learning_rate": 1e-06, + "loss": 0.4384, + "step": 262 + }, + { + "epoch": 0.016854652653165856, + "grad_norm": 2.586212163796859, + "learning_rate": 1e-06, + "loss": 0.4386, + "step": 263 + }, + { + "epoch": 0.016918738784926942, + "grad_norm": 2.5085076025134962, + "learning_rate": 1e-06, + "loss": 0.4387, + "step": 264 + }, + { + "epoch": 0.016982824916688028, + "grad_norm": 2.7284858517103294, + "learning_rate": 1e-06, + "loss": 0.4369, + "step": 265 + }, + { + "epoch": 0.017046911048449114, + "grad_norm": 2.698912647175665, + "learning_rate": 1e-06, + "loss": 0.3981, + "step": 266 + }, + { + "epoch": 0.017110997180210204, + "grad_norm": 2.433393766997923, + "learning_rate": 1e-06, + "loss": 0.3888, + "step": 267 + }, + { + "epoch": 0.01717508331197129, + "grad_norm": 2.6338501838202606, + "learning_rate": 1e-06, + "loss": 0.4465, + "step": 268 + }, + { + "epoch": 0.017239169443732376, + "grad_norm": 2.728265238380636, + "learning_rate": 1e-06, + "loss": 0.471, + "step": 269 + }, + { + "epoch": 0.017303255575493462, + "grad_norm": 2.588489728565312, + "learning_rate": 1e-06, + "loss": 0.4759, + "step": 270 + }, + { + "epoch": 0.01736734170725455, + "grad_norm": 2.611963629217684, + "learning_rate": 1e-06, + "loss": 0.4743, + "step": 271 + }, + { + "epoch": 0.017431427839015638, + "grad_norm": 2.633648442017364, + "learning_rate": 1e-06, + "loss": 0.3899, + "step": 272 + }, + { + "epoch": 0.017495513970776724, + "grad_norm": 2.9530713121350414, + "learning_rate": 1e-06, + "loss": 0.3967, + "step": 273 + }, + { + "epoch": 0.01755960010253781, + "grad_norm": 2.499549993514414, + "learning_rate": 1e-06, + "loss": 0.4312, + "step": 274 + }, + { + "epoch": 0.0176236862342989, + "grad_norm": 2.6602932818625926, + "learning_rate": 1e-06, + "loss": 0.4594, + "step": 275 + }, + { + "epoch": 0.017687772366059985, + "grad_norm": 2.4729820946013943, + "learning_rate": 1e-06, + "loss": 0.3503, + "step": 276 + }, + { + "epoch": 0.01775185849782107, + "grad_norm": 2.568481943130322, + "learning_rate": 1e-06, + "loss": 0.4095, + "step": 277 + }, + { + "epoch": 0.017815944629582157, + "grad_norm": 2.5410630389732356, + "learning_rate": 1e-06, + "loss": 0.4558, + "step": 278 + }, + { + "epoch": 0.017880030761343247, + "grad_norm": 2.8834562082794633, + "learning_rate": 1e-06, + "loss": 0.4472, + "step": 279 + }, + { + "epoch": 0.017944116893104333, + "grad_norm": 2.9142632748077593, + "learning_rate": 1e-06, + "loss": 0.3867, + "step": 280 + }, + { + "epoch": 0.01800820302486542, + "grad_norm": 2.685459029147027, + "learning_rate": 1e-06, + "loss": 0.4265, + "step": 281 + }, + { + "epoch": 0.018072289156626505, + "grad_norm": 2.8592794071139687, + "learning_rate": 1e-06, + "loss": 0.4207, + "step": 282 + }, + { + "epoch": 0.01813637528838759, + "grad_norm": 2.577623379545379, + "learning_rate": 1e-06, + "loss": 0.3723, + "step": 283 + }, + { + "epoch": 0.01820046142014868, + "grad_norm": 2.639894455655153, + "learning_rate": 1e-06, + "loss": 0.4035, + "step": 284 + }, + { + "epoch": 0.018264547551909767, + "grad_norm": 2.744866333497609, + "learning_rate": 1e-06, + "loss": 0.4558, + "step": 285 + }, + { + "epoch": 0.018328633683670853, + "grad_norm": 2.7701088109503855, + "learning_rate": 1e-06, + "loss": 0.4056, + "step": 286 + }, + { + "epoch": 0.01839271981543194, + "grad_norm": 2.7164498040000233, + "learning_rate": 1e-06, + "loss": 0.4093, + "step": 287 + }, + { + "epoch": 0.01845680594719303, + "grad_norm": 2.8216916740476417, + "learning_rate": 1e-06, + "loss": 0.4087, + "step": 288 + }, + { + "epoch": 0.018520892078954115, + "grad_norm": 2.570695596833045, + "learning_rate": 1e-06, + "loss": 0.3929, + "step": 289 + }, + { + "epoch": 0.0185849782107152, + "grad_norm": 2.5436410160123093, + "learning_rate": 1e-06, + "loss": 0.4209, + "step": 290 + }, + { + "epoch": 0.018649064342476287, + "grad_norm": 2.464564508450358, + "learning_rate": 1e-06, + "loss": 0.4682, + "step": 291 + }, + { + "epoch": 0.018713150474237376, + "grad_norm": 2.6414925530399103, + "learning_rate": 1e-06, + "loss": 0.4286, + "step": 292 + }, + { + "epoch": 0.018777236605998462, + "grad_norm": 2.612675091607748, + "learning_rate": 1e-06, + "loss": 0.4331, + "step": 293 + }, + { + "epoch": 0.01884132273775955, + "grad_norm": 2.443575754509268, + "learning_rate": 1e-06, + "loss": 0.3982, + "step": 294 + }, + { + "epoch": 0.018905408869520635, + "grad_norm": 2.580805303286195, + "learning_rate": 1e-06, + "loss": 0.4145, + "step": 295 + }, + { + "epoch": 0.018969495001281724, + "grad_norm": 2.5273581888412453, + "learning_rate": 1e-06, + "loss": 0.3773, + "step": 296 + }, + { + "epoch": 0.01903358113304281, + "grad_norm": 2.489408203399578, + "learning_rate": 1e-06, + "loss": 0.4248, + "step": 297 + }, + { + "epoch": 0.019097667264803896, + "grad_norm": 2.649191810622033, + "learning_rate": 1e-06, + "loss": 0.4376, + "step": 298 + }, + { + "epoch": 0.019161753396564982, + "grad_norm": 2.531864638870876, + "learning_rate": 1e-06, + "loss": 0.4435, + "step": 299 + }, + { + "epoch": 0.019225839528326072, + "grad_norm": 2.405100794839331, + "learning_rate": 1e-06, + "loss": 0.4023, + "step": 300 + }, + { + "epoch": 0.019289925660087158, + "grad_norm": 2.8102035159421064, + "learning_rate": 1e-06, + "loss": 0.4716, + "step": 301 + }, + { + "epoch": 0.019354011791848244, + "grad_norm": 2.502247457851022, + "learning_rate": 1e-06, + "loss": 0.4135, + "step": 302 + }, + { + "epoch": 0.01941809792360933, + "grad_norm": 2.4034822507856375, + "learning_rate": 1e-06, + "loss": 0.375, + "step": 303 + }, + { + "epoch": 0.019482184055370416, + "grad_norm": 2.6665336665510355, + "learning_rate": 1e-06, + "loss": 0.4282, + "step": 304 + }, + { + "epoch": 0.019546270187131506, + "grad_norm": 2.6706007873681754, + "learning_rate": 1e-06, + "loss": 0.4281, + "step": 305 + }, + { + "epoch": 0.019610356318892592, + "grad_norm": 2.9943586682230596, + "learning_rate": 1e-06, + "loss": 0.4563, + "step": 306 + }, + { + "epoch": 0.019674442450653678, + "grad_norm": 2.529442058426431, + "learning_rate": 1e-06, + "loss": 0.4235, + "step": 307 + }, + { + "epoch": 0.019738528582414764, + "grad_norm": 2.6846719971532287, + "learning_rate": 1e-06, + "loss": 0.4301, + "step": 308 + }, + { + "epoch": 0.019802614714175854, + "grad_norm": 2.656046389414779, + "learning_rate": 1e-06, + "loss": 0.4214, + "step": 309 + }, + { + "epoch": 0.01986670084593694, + "grad_norm": 2.3228700793409316, + "learning_rate": 1e-06, + "loss": 0.4039, + "step": 310 + }, + { + "epoch": 0.019930786977698026, + "grad_norm": 2.3570092668653313, + "learning_rate": 1e-06, + "loss": 0.4244, + "step": 311 + }, + { + "epoch": 0.01999487310945911, + "grad_norm": 2.6047654223851087, + "learning_rate": 1e-06, + "loss": 0.408, + "step": 312 + }, + { + "epoch": 0.0200589592412202, + "grad_norm": 2.762796362913787, + "learning_rate": 1e-06, + "loss": 0.3973, + "step": 313 + }, + { + "epoch": 0.020123045372981287, + "grad_norm": 2.684177046272746, + "learning_rate": 1e-06, + "loss": 0.4441, + "step": 314 + }, + { + "epoch": 0.020187131504742373, + "grad_norm": 2.7783820778054875, + "learning_rate": 1e-06, + "loss": 0.4093, + "step": 315 + }, + { + "epoch": 0.02025121763650346, + "grad_norm": 2.6167852095843864, + "learning_rate": 1e-06, + "loss": 0.4247, + "step": 316 + }, + { + "epoch": 0.02031530376826455, + "grad_norm": 2.6845860637575756, + "learning_rate": 1e-06, + "loss": 0.4023, + "step": 317 + }, + { + "epoch": 0.020379389900025635, + "grad_norm": 2.602696481988515, + "learning_rate": 1e-06, + "loss": 0.4382, + "step": 318 + }, + { + "epoch": 0.02044347603178672, + "grad_norm": 2.5342579150615006, + "learning_rate": 1e-06, + "loss": 0.4055, + "step": 319 + }, + { + "epoch": 0.020507562163547807, + "grad_norm": 2.5854239688820377, + "learning_rate": 1e-06, + "loss": 0.465, + "step": 320 + }, + { + "epoch": 0.020571648295308897, + "grad_norm": 2.792877790090399, + "learning_rate": 1e-06, + "loss": 0.4172, + "step": 321 + }, + { + "epoch": 0.020635734427069983, + "grad_norm": 2.6879878145297913, + "learning_rate": 1e-06, + "loss": 0.3912, + "step": 322 + }, + { + "epoch": 0.02069982055883107, + "grad_norm": 2.5764792723219703, + "learning_rate": 1e-06, + "loss": 0.4293, + "step": 323 + }, + { + "epoch": 0.020763906690592155, + "grad_norm": 2.584799346003916, + "learning_rate": 1e-06, + "loss": 0.4235, + "step": 324 + }, + { + "epoch": 0.02082799282235324, + "grad_norm": 2.52310368174129, + "learning_rate": 1e-06, + "loss": 0.4061, + "step": 325 + }, + { + "epoch": 0.02089207895411433, + "grad_norm": 2.631813901191913, + "learning_rate": 1e-06, + "loss": 0.3927, + "step": 326 + }, + { + "epoch": 0.020956165085875417, + "grad_norm": 2.640467081871137, + "learning_rate": 1e-06, + "loss": 0.4428, + "step": 327 + }, + { + "epoch": 0.021020251217636503, + "grad_norm": 2.9492291344272874, + "learning_rate": 1e-06, + "loss": 0.4127, + "step": 328 + }, + { + "epoch": 0.02108433734939759, + "grad_norm": 2.5242390714272114, + "learning_rate": 1e-06, + "loss": 0.5015, + "step": 329 + }, + { + "epoch": 0.02114842348115868, + "grad_norm": 2.6479950311376954, + "learning_rate": 1e-06, + "loss": 0.4574, + "step": 330 + }, + { + "epoch": 0.021212509612919764, + "grad_norm": 2.5907035344735116, + "learning_rate": 1e-06, + "loss": 0.4418, + "step": 331 + }, + { + "epoch": 0.02127659574468085, + "grad_norm": 2.4904752366096203, + "learning_rate": 1e-06, + "loss": 0.4231, + "step": 332 + }, + { + "epoch": 0.021340681876441937, + "grad_norm": 2.762482929644451, + "learning_rate": 1e-06, + "loss": 0.4319, + "step": 333 + }, + { + "epoch": 0.021404768008203026, + "grad_norm": 2.257106119148726, + "learning_rate": 1e-06, + "loss": 0.4214, + "step": 334 + }, + { + "epoch": 0.021468854139964112, + "grad_norm": 2.687648586593707, + "learning_rate": 1e-06, + "loss": 0.4059, + "step": 335 + }, + { + "epoch": 0.0215329402717252, + "grad_norm": 2.6657368936212715, + "learning_rate": 1e-06, + "loss": 0.3806, + "step": 336 + }, + { + "epoch": 0.021597026403486284, + "grad_norm": 2.6482827833947997, + "learning_rate": 1e-06, + "loss": 0.4097, + "step": 337 + }, + { + "epoch": 0.021661112535247374, + "grad_norm": 2.7093340741541425, + "learning_rate": 1e-06, + "loss": 0.4485, + "step": 338 + }, + { + "epoch": 0.02172519866700846, + "grad_norm": 2.3762505723651057, + "learning_rate": 1e-06, + "loss": 0.428, + "step": 339 + }, + { + "epoch": 0.021789284798769546, + "grad_norm": 2.423159148830173, + "learning_rate": 1e-06, + "loss": 0.3828, + "step": 340 + }, + { + "epoch": 0.021853370930530632, + "grad_norm": 2.716362646966935, + "learning_rate": 1e-06, + "loss": 0.4466, + "step": 341 + }, + { + "epoch": 0.02191745706229172, + "grad_norm": 2.6724315541317494, + "learning_rate": 1e-06, + "loss": 0.4175, + "step": 342 + }, + { + "epoch": 0.021981543194052808, + "grad_norm": 2.622452376572196, + "learning_rate": 1e-06, + "loss": 0.4326, + "step": 343 + }, + { + "epoch": 0.022045629325813894, + "grad_norm": 2.6641408148600485, + "learning_rate": 1e-06, + "loss": 0.4213, + "step": 344 + }, + { + "epoch": 0.02210971545757498, + "grad_norm": 2.587246078788654, + "learning_rate": 1e-06, + "loss": 0.4081, + "step": 345 + }, + { + "epoch": 0.022173801589336066, + "grad_norm": 2.684412921798805, + "learning_rate": 1e-06, + "loss": 0.4255, + "step": 346 + }, + { + "epoch": 0.022237887721097156, + "grad_norm": 2.4607146739904007, + "learning_rate": 1e-06, + "loss": 0.4439, + "step": 347 + }, + { + "epoch": 0.02230197385285824, + "grad_norm": 2.7249177701726817, + "learning_rate": 1e-06, + "loss": 0.4347, + "step": 348 + }, + { + "epoch": 0.022366059984619328, + "grad_norm": 2.7097559627929053, + "learning_rate": 1e-06, + "loss": 0.4123, + "step": 349 + }, + { + "epoch": 0.022430146116380414, + "grad_norm": 2.6075455207333387, + "learning_rate": 1e-06, + "loss": 0.3605, + "step": 350 + }, + { + "epoch": 0.022494232248141503, + "grad_norm": 2.526788415971298, + "learning_rate": 1e-06, + "loss": 0.4505, + "step": 351 + }, + { + "epoch": 0.02255831837990259, + "grad_norm": 2.6078284647598204, + "learning_rate": 1e-06, + "loss": 0.4309, + "step": 352 + }, + { + "epoch": 0.022622404511663675, + "grad_norm": 2.443089947367771, + "learning_rate": 1e-06, + "loss": 0.394, + "step": 353 + }, + { + "epoch": 0.02268649064342476, + "grad_norm": 2.694646337485449, + "learning_rate": 1e-06, + "loss": 0.4094, + "step": 354 + }, + { + "epoch": 0.02275057677518585, + "grad_norm": 2.539210140474977, + "learning_rate": 1e-06, + "loss": 0.4183, + "step": 355 + }, + { + "epoch": 0.022814662906946937, + "grad_norm": 2.5494243341233602, + "learning_rate": 1e-06, + "loss": 0.3934, + "step": 356 + }, + { + "epoch": 0.022878749038708023, + "grad_norm": 2.518198059218748, + "learning_rate": 1e-06, + "loss": 0.3786, + "step": 357 + }, + { + "epoch": 0.02294283517046911, + "grad_norm": 2.61819759245241, + "learning_rate": 1e-06, + "loss": 0.4574, + "step": 358 + }, + { + "epoch": 0.0230069213022302, + "grad_norm": 2.5581859112297005, + "learning_rate": 1e-06, + "loss": 0.3627, + "step": 359 + }, + { + "epoch": 0.023071007433991285, + "grad_norm": 2.587465107279955, + "learning_rate": 1e-06, + "loss": 0.3798, + "step": 360 + }, + { + "epoch": 0.02313509356575237, + "grad_norm": 2.648677113740983, + "learning_rate": 1e-06, + "loss": 0.4366, + "step": 361 + }, + { + "epoch": 0.023199179697513457, + "grad_norm": 2.7540181767323664, + "learning_rate": 1e-06, + "loss": 0.3658, + "step": 362 + }, + { + "epoch": 0.023263265829274547, + "grad_norm": 2.755584999096141, + "learning_rate": 1e-06, + "loss": 0.4769, + "step": 363 + }, + { + "epoch": 0.023327351961035633, + "grad_norm": 2.6396695649199176, + "learning_rate": 1e-06, + "loss": 0.4281, + "step": 364 + }, + { + "epoch": 0.02339143809279672, + "grad_norm": 2.6761169006385845, + "learning_rate": 1e-06, + "loss": 0.4838, + "step": 365 + }, + { + "epoch": 0.023455524224557805, + "grad_norm": 2.6889950878228004, + "learning_rate": 1e-06, + "loss": 0.4501, + "step": 366 + }, + { + "epoch": 0.02351961035631889, + "grad_norm": 2.7548073384891003, + "learning_rate": 1e-06, + "loss": 0.4148, + "step": 367 + }, + { + "epoch": 0.02358369648807998, + "grad_norm": 2.542233508453795, + "learning_rate": 1e-06, + "loss": 0.4433, + "step": 368 + }, + { + "epoch": 0.023647782619841066, + "grad_norm": 2.5380547510477487, + "learning_rate": 1e-06, + "loss": 0.4246, + "step": 369 + }, + { + "epoch": 0.023711868751602153, + "grad_norm": 2.5495340569743252, + "learning_rate": 1e-06, + "loss": 0.3672, + "step": 370 + }, + { + "epoch": 0.02377595488336324, + "grad_norm": 2.775600728413592, + "learning_rate": 1e-06, + "loss": 0.3908, + "step": 371 + }, + { + "epoch": 0.023840041015124328, + "grad_norm": 2.60490757100387, + "learning_rate": 1e-06, + "loss": 0.3787, + "step": 372 + }, + { + "epoch": 0.023904127146885414, + "grad_norm": 2.6266435496694682, + "learning_rate": 1e-06, + "loss": 0.4148, + "step": 373 + }, + { + "epoch": 0.0239682132786465, + "grad_norm": 2.3238570132425744, + "learning_rate": 1e-06, + "loss": 0.4079, + "step": 374 + }, + { + "epoch": 0.024032299410407586, + "grad_norm": 2.4584692833391917, + "learning_rate": 1e-06, + "loss": 0.4871, + "step": 375 + }, + { + "epoch": 0.024096385542168676, + "grad_norm": 2.6874135978719065, + "learning_rate": 1e-06, + "loss": 0.3946, + "step": 376 + }, + { + "epoch": 0.024160471673929762, + "grad_norm": 2.5538834092647704, + "learning_rate": 1e-06, + "loss": 0.3631, + "step": 377 + }, + { + "epoch": 0.024224557805690848, + "grad_norm": 2.8327606606394724, + "learning_rate": 1e-06, + "loss": 0.3849, + "step": 378 + }, + { + "epoch": 0.024288643937451934, + "grad_norm": 2.514808684223404, + "learning_rate": 1e-06, + "loss": 0.4206, + "step": 379 + }, + { + "epoch": 0.024352730069213024, + "grad_norm": 2.6324540798174185, + "learning_rate": 1e-06, + "loss": 0.4487, + "step": 380 + }, + { + "epoch": 0.02441681620097411, + "grad_norm": 2.6056342440608526, + "learning_rate": 1e-06, + "loss": 0.3951, + "step": 381 + }, + { + "epoch": 0.024480902332735196, + "grad_norm": 2.675148296855455, + "learning_rate": 1e-06, + "loss": 0.4448, + "step": 382 + }, + { + "epoch": 0.024544988464496282, + "grad_norm": 2.5831013701900947, + "learning_rate": 1e-06, + "loss": 0.4318, + "step": 383 + }, + { + "epoch": 0.02460907459625737, + "grad_norm": 2.5554733350090575, + "learning_rate": 1e-06, + "loss": 0.4105, + "step": 384 + }, + { + "epoch": 0.024673160728018458, + "grad_norm": 2.825553455546965, + "learning_rate": 1e-06, + "loss": 0.4427, + "step": 385 + }, + { + "epoch": 0.024737246859779544, + "grad_norm": 2.598203447299543, + "learning_rate": 1e-06, + "loss": 0.3917, + "step": 386 + }, + { + "epoch": 0.02480133299154063, + "grad_norm": 2.7532512685061796, + "learning_rate": 1e-06, + "loss": 0.3665, + "step": 387 + }, + { + "epoch": 0.024865419123301716, + "grad_norm": 2.7759105997753264, + "learning_rate": 1e-06, + "loss": 0.4646, + "step": 388 + }, + { + "epoch": 0.024929505255062805, + "grad_norm": 2.486755526528995, + "learning_rate": 1e-06, + "loss": 0.4424, + "step": 389 + }, + { + "epoch": 0.02499359138682389, + "grad_norm": 2.5947708705458394, + "learning_rate": 1e-06, + "loss": 0.3967, + "step": 390 + }, + { + "epoch": 0.025057677518584977, + "grad_norm": 2.4297539299884683, + "learning_rate": 1e-06, + "loss": 0.4141, + "step": 391 + }, + { + "epoch": 0.025121763650346064, + "grad_norm": 2.434233908402395, + "learning_rate": 1e-06, + "loss": 0.4528, + "step": 392 + }, + { + "epoch": 0.025185849782107153, + "grad_norm": 2.757448942774155, + "learning_rate": 1e-06, + "loss": 0.4467, + "step": 393 + }, + { + "epoch": 0.02524993591386824, + "grad_norm": 2.4998807548143014, + "learning_rate": 1e-06, + "loss": 0.4334, + "step": 394 + }, + { + "epoch": 0.025314022045629325, + "grad_norm": 2.419465872584267, + "learning_rate": 1e-06, + "loss": 0.3803, + "step": 395 + }, + { + "epoch": 0.02537810817739041, + "grad_norm": 2.679576216317354, + "learning_rate": 1e-06, + "loss": 0.4507, + "step": 396 + }, + { + "epoch": 0.0254421943091515, + "grad_norm": 2.5869533374512335, + "learning_rate": 1e-06, + "loss": 0.464, + "step": 397 + }, + { + "epoch": 0.025506280440912587, + "grad_norm": 2.8162022718685678, + "learning_rate": 1e-06, + "loss": 0.4536, + "step": 398 + }, + { + "epoch": 0.025570366572673673, + "grad_norm": 2.49440453274208, + "learning_rate": 1e-06, + "loss": 0.4472, + "step": 399 + }, + { + "epoch": 0.02563445270443476, + "grad_norm": 2.7267037535153813, + "learning_rate": 1e-06, + "loss": 0.3788, + "step": 400 + }, + { + "epoch": 0.02569853883619585, + "grad_norm": 2.6275422608283123, + "learning_rate": 1e-06, + "loss": 0.455, + "step": 401 + }, + { + "epoch": 0.025762624967956935, + "grad_norm": 2.468485363505419, + "learning_rate": 1e-06, + "loss": 0.4348, + "step": 402 + }, + { + "epoch": 0.02582671109971802, + "grad_norm": 2.6322373705642015, + "learning_rate": 1e-06, + "loss": 0.4474, + "step": 403 + }, + { + "epoch": 0.025890797231479107, + "grad_norm": 2.535796436272985, + "learning_rate": 1e-06, + "loss": 0.4392, + "step": 404 + }, + { + "epoch": 0.025954883363240196, + "grad_norm": 2.5836972859728915, + "learning_rate": 1e-06, + "loss": 0.429, + "step": 405 + }, + { + "epoch": 0.026018969495001282, + "grad_norm": 2.7316052338513432, + "learning_rate": 1e-06, + "loss": 0.396, + "step": 406 + }, + { + "epoch": 0.02608305562676237, + "grad_norm": 2.736398307464729, + "learning_rate": 1e-06, + "loss": 0.4456, + "step": 407 + }, + { + "epoch": 0.026147141758523455, + "grad_norm": 2.7440451094469394, + "learning_rate": 1e-06, + "loss": 0.4556, + "step": 408 + }, + { + "epoch": 0.026211227890284544, + "grad_norm": 2.4414167181146635, + "learning_rate": 1e-06, + "loss": 0.4086, + "step": 409 + }, + { + "epoch": 0.02627531402204563, + "grad_norm": 2.7337025666681787, + "learning_rate": 1e-06, + "loss": 0.4728, + "step": 410 + }, + { + "epoch": 0.026339400153806716, + "grad_norm": 2.6555366248003076, + "learning_rate": 1e-06, + "loss": 0.4541, + "step": 411 + }, + { + "epoch": 0.026403486285567802, + "grad_norm": 2.5538593566825405, + "learning_rate": 1e-06, + "loss": 0.4807, + "step": 412 + }, + { + "epoch": 0.02646757241732889, + "grad_norm": 2.5362557233108833, + "learning_rate": 1e-06, + "loss": 0.3934, + "step": 413 + }, + { + "epoch": 0.026531658549089978, + "grad_norm": 2.8275479404432775, + "learning_rate": 1e-06, + "loss": 0.4276, + "step": 414 + }, + { + "epoch": 0.026595744680851064, + "grad_norm": 2.8543582707538655, + "learning_rate": 1e-06, + "loss": 0.4315, + "step": 415 + }, + { + "epoch": 0.02665983081261215, + "grad_norm": 2.950759907375976, + "learning_rate": 1e-06, + "loss": 0.5034, + "step": 416 + }, + { + "epoch": 0.026723916944373236, + "grad_norm": 2.6064488094160705, + "learning_rate": 1e-06, + "loss": 0.4089, + "step": 417 + }, + { + "epoch": 0.026788003076134326, + "grad_norm": 2.7096530877343534, + "learning_rate": 1e-06, + "loss": 0.4349, + "step": 418 + }, + { + "epoch": 0.026852089207895412, + "grad_norm": 2.563785025605755, + "learning_rate": 1e-06, + "loss": 0.4283, + "step": 419 + }, + { + "epoch": 0.026916175339656498, + "grad_norm": 2.5884296422116213, + "learning_rate": 1e-06, + "loss": 0.3884, + "step": 420 + }, + { + "epoch": 0.026980261471417584, + "grad_norm": 2.652177567014019, + "learning_rate": 1e-06, + "loss": 0.3815, + "step": 421 + }, + { + "epoch": 0.027044347603178674, + "grad_norm": 2.8145378730635477, + "learning_rate": 1e-06, + "loss": 0.469, + "step": 422 + }, + { + "epoch": 0.02710843373493976, + "grad_norm": 2.6413005099038918, + "learning_rate": 1e-06, + "loss": 0.4117, + "step": 423 + }, + { + "epoch": 0.027172519866700846, + "grad_norm": 2.6259835527792803, + "learning_rate": 1e-06, + "loss": 0.4584, + "step": 424 + }, + { + "epoch": 0.02723660599846193, + "grad_norm": 2.59772191403426, + "learning_rate": 1e-06, + "loss": 0.4332, + "step": 425 + }, + { + "epoch": 0.02730069213022302, + "grad_norm": 2.6447993885707106, + "learning_rate": 1e-06, + "loss": 0.4098, + "step": 426 + }, + { + "epoch": 0.027364778261984107, + "grad_norm": 2.8823068371516865, + "learning_rate": 1e-06, + "loss": 0.4728, + "step": 427 + }, + { + "epoch": 0.027428864393745193, + "grad_norm": 2.6895802156764996, + "learning_rate": 1e-06, + "loss": 0.4462, + "step": 428 + }, + { + "epoch": 0.02749295052550628, + "grad_norm": 2.775049320708902, + "learning_rate": 1e-06, + "loss": 0.4015, + "step": 429 + }, + { + "epoch": 0.02755703665726737, + "grad_norm": 2.4854036703312783, + "learning_rate": 1e-06, + "loss": 0.4104, + "step": 430 + }, + { + "epoch": 0.027621122789028455, + "grad_norm": 2.628854745968563, + "learning_rate": 1e-06, + "loss": 0.4176, + "step": 431 + }, + { + "epoch": 0.02768520892078954, + "grad_norm": 2.543581473195899, + "learning_rate": 1e-06, + "loss": 0.4403, + "step": 432 + }, + { + "epoch": 0.027749295052550627, + "grad_norm": 2.5870776263158115, + "learning_rate": 1e-06, + "loss": 0.4503, + "step": 433 + }, + { + "epoch": 0.027813381184311713, + "grad_norm": 2.5550230493211266, + "learning_rate": 1e-06, + "loss": 0.4454, + "step": 434 + }, + { + "epoch": 0.027877467316072803, + "grad_norm": 2.517836595844116, + "learning_rate": 1e-06, + "loss": 0.3876, + "step": 435 + }, + { + "epoch": 0.02794155344783389, + "grad_norm": 2.532564498900615, + "learning_rate": 1e-06, + "loss": 0.3899, + "step": 436 + }, + { + "epoch": 0.028005639579594975, + "grad_norm": 2.617217239557506, + "learning_rate": 1e-06, + "loss": 0.4032, + "step": 437 + }, + { + "epoch": 0.02806972571135606, + "grad_norm": 2.4932234326126985, + "learning_rate": 1e-06, + "loss": 0.4112, + "step": 438 + }, + { + "epoch": 0.02813381184311715, + "grad_norm": 2.632681267604381, + "learning_rate": 1e-06, + "loss": 0.4631, + "step": 439 + }, + { + "epoch": 0.028197897974878237, + "grad_norm": 2.700129131791681, + "learning_rate": 1e-06, + "loss": 0.3841, + "step": 440 + }, + { + "epoch": 0.028261984106639323, + "grad_norm": 2.3125223839321034, + "learning_rate": 1e-06, + "loss": 0.4322, + "step": 441 + }, + { + "epoch": 0.02832607023840041, + "grad_norm": 2.6480507139738187, + "learning_rate": 1e-06, + "loss": 0.4028, + "step": 442 + }, + { + "epoch": 0.0283901563701615, + "grad_norm": 2.5860040775843087, + "learning_rate": 1e-06, + "loss": 0.441, + "step": 443 + }, + { + "epoch": 0.028454242501922584, + "grad_norm": 2.623689520293853, + "learning_rate": 1e-06, + "loss": 0.4302, + "step": 444 + }, + { + "epoch": 0.02851832863368367, + "grad_norm": 2.558893067356012, + "learning_rate": 1e-06, + "loss": 0.4472, + "step": 445 + }, + { + "epoch": 0.028582414765444757, + "grad_norm": 2.827213781887415, + "learning_rate": 1e-06, + "loss": 0.4805, + "step": 446 + }, + { + "epoch": 0.028646500897205846, + "grad_norm": 2.626705801630092, + "learning_rate": 1e-06, + "loss": 0.4232, + "step": 447 + }, + { + "epoch": 0.028710587028966932, + "grad_norm": 2.3921237896610315, + "learning_rate": 1e-06, + "loss": 0.4341, + "step": 448 + }, + { + "epoch": 0.02877467316072802, + "grad_norm": 2.594901028723936, + "learning_rate": 1e-06, + "loss": 0.4329, + "step": 449 + }, + { + "epoch": 0.028838759292489104, + "grad_norm": 2.6928586153085465, + "learning_rate": 1e-06, + "loss": 0.4559, + "step": 450 + }, + { + "epoch": 0.028902845424250194, + "grad_norm": 2.5095686668184713, + "learning_rate": 1e-06, + "loss": 0.4052, + "step": 451 + }, + { + "epoch": 0.02896693155601128, + "grad_norm": 2.7755150350769773, + "learning_rate": 1e-06, + "loss": 0.443, + "step": 452 + }, + { + "epoch": 0.029031017687772366, + "grad_norm": 2.3618305061019202, + "learning_rate": 1e-06, + "loss": 0.4055, + "step": 453 + }, + { + "epoch": 0.029095103819533452, + "grad_norm": 2.5548014934465026, + "learning_rate": 1e-06, + "loss": 0.4431, + "step": 454 + }, + { + "epoch": 0.029159189951294538, + "grad_norm": 2.565871109083848, + "learning_rate": 1e-06, + "loss": 0.4101, + "step": 455 + }, + { + "epoch": 0.029223276083055628, + "grad_norm": 2.653111195337013, + "learning_rate": 1e-06, + "loss": 0.4435, + "step": 456 + }, + { + "epoch": 0.029287362214816714, + "grad_norm": 2.4368979780190245, + "learning_rate": 1e-06, + "loss": 0.367, + "step": 457 + }, + { + "epoch": 0.0293514483465778, + "grad_norm": 2.7911192897610047, + "learning_rate": 1e-06, + "loss": 0.3813, + "step": 458 + }, + { + "epoch": 0.029415534478338886, + "grad_norm": 2.4492006758826106, + "learning_rate": 1e-06, + "loss": 0.4112, + "step": 459 + }, + { + "epoch": 0.029479620610099976, + "grad_norm": 2.6617405331446555, + "learning_rate": 1e-06, + "loss": 0.4823, + "step": 460 + }, + { + "epoch": 0.02954370674186106, + "grad_norm": 2.43862787048992, + "learning_rate": 1e-06, + "loss": 0.4443, + "step": 461 + }, + { + "epoch": 0.029607792873622148, + "grad_norm": 2.4792268303893783, + "learning_rate": 1e-06, + "loss": 0.4563, + "step": 462 + }, + { + "epoch": 0.029671879005383234, + "grad_norm": 2.4850255441260267, + "learning_rate": 1e-06, + "loss": 0.4743, + "step": 463 + }, + { + "epoch": 0.029735965137144323, + "grad_norm": 2.5140153994819427, + "learning_rate": 1e-06, + "loss": 0.4696, + "step": 464 + }, + { + "epoch": 0.02980005126890541, + "grad_norm": 2.6261266820771585, + "learning_rate": 1e-06, + "loss": 0.3756, + "step": 465 + }, + { + "epoch": 0.029864137400666495, + "grad_norm": 2.6310050025757508, + "learning_rate": 1e-06, + "loss": 0.3847, + "step": 466 + }, + { + "epoch": 0.02992822353242758, + "grad_norm": 2.7082985977439176, + "learning_rate": 1e-06, + "loss": 0.3926, + "step": 467 + }, + { + "epoch": 0.02999230966418867, + "grad_norm": 2.553112208833126, + "learning_rate": 1e-06, + "loss": 0.396, + "step": 468 + }, + { + "epoch": 0.030056395795949757, + "grad_norm": 2.612291745367972, + "learning_rate": 1e-06, + "loss": 0.371, + "step": 469 + }, + { + "epoch": 0.030120481927710843, + "grad_norm": 2.621969421745666, + "learning_rate": 1e-06, + "loss": 0.4045, + "step": 470 + }, + { + "epoch": 0.03018456805947193, + "grad_norm": 2.5381238717470183, + "learning_rate": 1e-06, + "loss": 0.4543, + "step": 471 + }, + { + "epoch": 0.03024865419123302, + "grad_norm": 2.665916627779648, + "learning_rate": 1e-06, + "loss": 0.4564, + "step": 472 + }, + { + "epoch": 0.030312740322994105, + "grad_norm": 2.4444266781182664, + "learning_rate": 1e-06, + "loss": 0.4394, + "step": 473 + }, + { + "epoch": 0.03037682645475519, + "grad_norm": 2.669252093062087, + "learning_rate": 1e-06, + "loss": 0.4722, + "step": 474 + }, + { + "epoch": 0.030440912586516277, + "grad_norm": 2.5986644470297136, + "learning_rate": 1e-06, + "loss": 0.4343, + "step": 475 + }, + { + "epoch": 0.030504998718277363, + "grad_norm": 2.516452232958566, + "learning_rate": 1e-06, + "loss": 0.453, + "step": 476 + }, + { + "epoch": 0.030569084850038453, + "grad_norm": 2.4799852114064214, + "learning_rate": 1e-06, + "loss": 0.4186, + "step": 477 + }, + { + "epoch": 0.03063317098179954, + "grad_norm": 2.691271105524432, + "learning_rate": 1e-06, + "loss": 0.4461, + "step": 478 + }, + { + "epoch": 0.030697257113560625, + "grad_norm": 2.6177226276670886, + "learning_rate": 1e-06, + "loss": 0.3892, + "step": 479 + }, + { + "epoch": 0.03076134324532171, + "grad_norm": 2.6112474010731055, + "learning_rate": 1e-06, + "loss": 0.4304, + "step": 480 + }, + { + "epoch": 0.0308254293770828, + "grad_norm": 2.43910372990363, + "learning_rate": 1e-06, + "loss": 0.4058, + "step": 481 + }, + { + "epoch": 0.030889515508843886, + "grad_norm": 2.560767071357324, + "learning_rate": 1e-06, + "loss": 0.4067, + "step": 482 + }, + { + "epoch": 0.030953601640604973, + "grad_norm": 2.545682557199924, + "learning_rate": 1e-06, + "loss": 0.4691, + "step": 483 + }, + { + "epoch": 0.03101768777236606, + "grad_norm": 2.575507981220968, + "learning_rate": 1e-06, + "loss": 0.419, + "step": 484 + }, + { + "epoch": 0.031081773904127148, + "grad_norm": 2.398778610972209, + "learning_rate": 1e-06, + "loss": 0.4322, + "step": 485 + }, + { + "epoch": 0.031145860035888234, + "grad_norm": 2.468832999943894, + "learning_rate": 1e-06, + "loss": 0.4069, + "step": 486 + }, + { + "epoch": 0.03120994616764932, + "grad_norm": 2.6723453958820045, + "learning_rate": 1e-06, + "loss": 0.4476, + "step": 487 + }, + { + "epoch": 0.031274032299410406, + "grad_norm": 3.007937745124945, + "learning_rate": 1e-06, + "loss": 0.4408, + "step": 488 + }, + { + "epoch": 0.03133811843117149, + "grad_norm": 2.5372976061475625, + "learning_rate": 1e-06, + "loss": 0.4811, + "step": 489 + }, + { + "epoch": 0.03140220456293258, + "grad_norm": 2.4099062889186245, + "learning_rate": 1e-06, + "loss": 0.4239, + "step": 490 + }, + { + "epoch": 0.03146629069469367, + "grad_norm": 2.561389579372859, + "learning_rate": 1e-06, + "loss": 0.4182, + "step": 491 + }, + { + "epoch": 0.03153037682645476, + "grad_norm": 2.7289712094507492, + "learning_rate": 1e-06, + "loss": 0.3899, + "step": 492 + }, + { + "epoch": 0.031594462958215844, + "grad_norm": 2.8885100719999253, + "learning_rate": 1e-06, + "loss": 0.4456, + "step": 493 + }, + { + "epoch": 0.03165854908997693, + "grad_norm": 2.5064397467754573, + "learning_rate": 1e-06, + "loss": 0.407, + "step": 494 + }, + { + "epoch": 0.031722635221738016, + "grad_norm": 2.412203696247204, + "learning_rate": 1e-06, + "loss": 0.3731, + "step": 495 + }, + { + "epoch": 0.0317867213534991, + "grad_norm": 2.629095888173398, + "learning_rate": 1e-06, + "loss": 0.3977, + "step": 496 + }, + { + "epoch": 0.03185080748526019, + "grad_norm": 2.4280514452263406, + "learning_rate": 1e-06, + "loss": 0.4447, + "step": 497 + }, + { + "epoch": 0.031914893617021274, + "grad_norm": 2.5575542980194443, + "learning_rate": 1e-06, + "loss": 0.3789, + "step": 498 + }, + { + "epoch": 0.03197897974878236, + "grad_norm": 2.5872529006915412, + "learning_rate": 1e-06, + "loss": 0.4307, + "step": 499 + }, + { + "epoch": 0.03204306588054345, + "grad_norm": 2.5440251604452992, + "learning_rate": 1e-06, + "loss": 0.3903, + "step": 500 + }, + { + "epoch": 0.03210715201230454, + "grad_norm": 2.7137592808907884, + "learning_rate": 1e-06, + "loss": 0.4266, + "step": 501 + }, + { + "epoch": 0.032171238144065625, + "grad_norm": 2.6186486459780167, + "learning_rate": 1e-06, + "loss": 0.434, + "step": 502 + }, + { + "epoch": 0.03223532427582671, + "grad_norm": 2.570526490487541, + "learning_rate": 1e-06, + "loss": 0.3787, + "step": 503 + }, + { + "epoch": 0.0322994104075878, + "grad_norm": 2.60892271445792, + "learning_rate": 1e-06, + "loss": 0.3674, + "step": 504 + }, + { + "epoch": 0.032363496539348884, + "grad_norm": 2.8058791368375835, + "learning_rate": 1e-06, + "loss": 0.371, + "step": 505 + }, + { + "epoch": 0.03242758267110997, + "grad_norm": 2.3428317112470345, + "learning_rate": 1e-06, + "loss": 0.4225, + "step": 506 + }, + { + "epoch": 0.032491668802871056, + "grad_norm": 2.532222954783407, + "learning_rate": 1e-06, + "loss": 0.4619, + "step": 507 + }, + { + "epoch": 0.03255575493463215, + "grad_norm": 2.7351971936980672, + "learning_rate": 1e-06, + "loss": 0.3857, + "step": 508 + }, + { + "epoch": 0.032619841066393235, + "grad_norm": 2.4604615768666416, + "learning_rate": 1e-06, + "loss": 0.4439, + "step": 509 + }, + { + "epoch": 0.03268392719815432, + "grad_norm": 2.831748431791256, + "learning_rate": 1e-06, + "loss": 0.4475, + "step": 510 + }, + { + "epoch": 0.03274801332991541, + "grad_norm": 2.441688126278881, + "learning_rate": 1e-06, + "loss": 0.4578, + "step": 511 + }, + { + "epoch": 0.03281209946167649, + "grad_norm": 2.5457154513847606, + "learning_rate": 1e-06, + "loss": 0.3945, + "step": 512 + }, + { + "epoch": 0.03287618559343758, + "grad_norm": 2.6596542207786333, + "learning_rate": 1e-06, + "loss": 0.4234, + "step": 513 + }, + { + "epoch": 0.032940271725198665, + "grad_norm": 2.577103093592158, + "learning_rate": 1e-06, + "loss": 0.4362, + "step": 514 + }, + { + "epoch": 0.03300435785695975, + "grad_norm": 2.6865205065165476, + "learning_rate": 1e-06, + "loss": 0.457, + "step": 515 + }, + { + "epoch": 0.033068443988720844, + "grad_norm": 2.7935603445979593, + "learning_rate": 1e-06, + "loss": 0.3538, + "step": 516 + }, + { + "epoch": 0.03313253012048193, + "grad_norm": 2.5709612528638646, + "learning_rate": 1e-06, + "loss": 0.4058, + "step": 517 + }, + { + "epoch": 0.033196616252243016, + "grad_norm": 2.679906178087004, + "learning_rate": 1e-06, + "loss": 0.419, + "step": 518 + }, + { + "epoch": 0.0332607023840041, + "grad_norm": 2.631332537538208, + "learning_rate": 1e-06, + "loss": 0.3987, + "step": 519 + }, + { + "epoch": 0.03332478851576519, + "grad_norm": 2.742087588652849, + "learning_rate": 1e-06, + "loss": 0.3902, + "step": 520 + }, + { + "epoch": 0.033388874647526275, + "grad_norm": 2.607913480915616, + "learning_rate": 1e-06, + "loss": 0.3595, + "step": 521 + }, + { + "epoch": 0.03345296077928736, + "grad_norm": 2.6446962621851564, + "learning_rate": 1e-06, + "loss": 0.3932, + "step": 522 + }, + { + "epoch": 0.03351704691104845, + "grad_norm": 2.7401141685722337, + "learning_rate": 1e-06, + "loss": 0.4399, + "step": 523 + }, + { + "epoch": 0.03358113304280953, + "grad_norm": 2.505928983276003, + "learning_rate": 1e-06, + "loss": 0.501, + "step": 524 + }, + { + "epoch": 0.033645219174570626, + "grad_norm": 2.7600474954065444, + "learning_rate": 1e-06, + "loss": 0.4258, + "step": 525 + }, + { + "epoch": 0.03370930530633171, + "grad_norm": 2.7364210341460997, + "learning_rate": 1e-06, + "loss": 0.4107, + "step": 526 + }, + { + "epoch": 0.0337733914380928, + "grad_norm": 2.750607243162528, + "learning_rate": 1e-06, + "loss": 0.4495, + "step": 527 + }, + { + "epoch": 0.033837477569853884, + "grad_norm": 2.8190619687628273, + "learning_rate": 1e-06, + "loss": 0.3647, + "step": 528 + }, + { + "epoch": 0.03390156370161497, + "grad_norm": 2.7488605163437243, + "learning_rate": 1e-06, + "loss": 0.3953, + "step": 529 + }, + { + "epoch": 0.033965649833376056, + "grad_norm": 2.4591430760247475, + "learning_rate": 1e-06, + "loss": 0.4333, + "step": 530 + }, + { + "epoch": 0.03402973596513714, + "grad_norm": 2.456421101011619, + "learning_rate": 1e-06, + "loss": 0.3921, + "step": 531 + }, + { + "epoch": 0.03409382209689823, + "grad_norm": 2.6154427035143994, + "learning_rate": 1e-06, + "loss": 0.3879, + "step": 532 + }, + { + "epoch": 0.03415790822865932, + "grad_norm": 2.6735778265914716, + "learning_rate": 1e-06, + "loss": 0.3936, + "step": 533 + }, + { + "epoch": 0.03422199436042041, + "grad_norm": 2.890439024855884, + "learning_rate": 1e-06, + "loss": 0.4686, + "step": 534 + }, + { + "epoch": 0.034286080492181494, + "grad_norm": 2.4630446860653006, + "learning_rate": 1e-06, + "loss": 0.427, + "step": 535 + }, + { + "epoch": 0.03435016662394258, + "grad_norm": 2.6268371157846073, + "learning_rate": 1e-06, + "loss": 0.4102, + "step": 536 + }, + { + "epoch": 0.034414252755703666, + "grad_norm": 2.610285178020935, + "learning_rate": 1e-06, + "loss": 0.4184, + "step": 537 + }, + { + "epoch": 0.03447833888746475, + "grad_norm": 2.6839716680166577, + "learning_rate": 1e-06, + "loss": 0.463, + "step": 538 + }, + { + "epoch": 0.03454242501922584, + "grad_norm": 2.749663653273078, + "learning_rate": 1e-06, + "loss": 0.4618, + "step": 539 + }, + { + "epoch": 0.034606511150986924, + "grad_norm": 2.7732262749372962, + "learning_rate": 1e-06, + "loss": 0.4329, + "step": 540 + }, + { + "epoch": 0.03467059728274801, + "grad_norm": 2.4709968867303482, + "learning_rate": 1e-06, + "loss": 0.4149, + "step": 541 + }, + { + "epoch": 0.0347346834145091, + "grad_norm": 2.636652286257393, + "learning_rate": 1e-06, + "loss": 0.4409, + "step": 542 + }, + { + "epoch": 0.03479876954627019, + "grad_norm": 2.6364517480767207, + "learning_rate": 1e-06, + "loss": 0.3699, + "step": 543 + }, + { + "epoch": 0.034862855678031275, + "grad_norm": 2.602465187056353, + "learning_rate": 1e-06, + "loss": 0.4382, + "step": 544 + }, + { + "epoch": 0.03492694180979236, + "grad_norm": 2.5206229660480135, + "learning_rate": 1e-06, + "loss": 0.4329, + "step": 545 + }, + { + "epoch": 0.03499102794155345, + "grad_norm": 2.404100546779136, + "learning_rate": 1e-06, + "loss": 0.4556, + "step": 546 + }, + { + "epoch": 0.03505511407331453, + "grad_norm": 2.5357402525324373, + "learning_rate": 1e-06, + "loss": 0.4263, + "step": 547 + }, + { + "epoch": 0.03511920020507562, + "grad_norm": 2.759792501919909, + "learning_rate": 1e-06, + "loss": 0.4423, + "step": 548 + }, + { + "epoch": 0.035183286336836705, + "grad_norm": 2.6433173572474207, + "learning_rate": 1e-06, + "loss": 0.4271, + "step": 549 + }, + { + "epoch": 0.0352473724685978, + "grad_norm": 2.4955771953873986, + "learning_rate": 1e-06, + "loss": 0.4189, + "step": 550 + }, + { + "epoch": 0.035311458600358885, + "grad_norm": 2.6502029354718526, + "learning_rate": 1e-06, + "loss": 0.4253, + "step": 551 + }, + { + "epoch": 0.03537554473211997, + "grad_norm": 2.5023269519541556, + "learning_rate": 1e-06, + "loss": 0.4171, + "step": 552 + }, + { + "epoch": 0.03543963086388106, + "grad_norm": 2.427919137521944, + "learning_rate": 1e-06, + "loss": 0.3839, + "step": 553 + }, + { + "epoch": 0.03550371699564214, + "grad_norm": 2.547181435232563, + "learning_rate": 1e-06, + "loss": 0.4268, + "step": 554 + }, + { + "epoch": 0.03556780312740323, + "grad_norm": 2.8027164655802412, + "learning_rate": 1e-06, + "loss": 0.4133, + "step": 555 + }, + { + "epoch": 0.035631889259164315, + "grad_norm": 2.6901481129495024, + "learning_rate": 1e-06, + "loss": 0.4793, + "step": 556 + }, + { + "epoch": 0.0356959753909254, + "grad_norm": 2.5079034785333008, + "learning_rate": 1e-06, + "loss": 0.4461, + "step": 557 + }, + { + "epoch": 0.035760061522686494, + "grad_norm": 2.403696079239236, + "learning_rate": 1e-06, + "loss": 0.444, + "step": 558 + }, + { + "epoch": 0.03582414765444758, + "grad_norm": 2.569305452424107, + "learning_rate": 1e-06, + "loss": 0.3475, + "step": 559 + }, + { + "epoch": 0.035888233786208666, + "grad_norm": 2.6545444286825983, + "learning_rate": 1e-06, + "loss": 0.3869, + "step": 560 + }, + { + "epoch": 0.03595231991796975, + "grad_norm": 2.440955279891903, + "learning_rate": 1e-06, + "loss": 0.3581, + "step": 561 + }, + { + "epoch": 0.03601640604973084, + "grad_norm": 2.847512451653497, + "learning_rate": 1e-06, + "loss": 0.4296, + "step": 562 + }, + { + "epoch": 0.036080492181491924, + "grad_norm": 2.6466369305096666, + "learning_rate": 1e-06, + "loss": 0.4061, + "step": 563 + }, + { + "epoch": 0.03614457831325301, + "grad_norm": 2.6832876331361475, + "learning_rate": 1e-06, + "loss": 0.4232, + "step": 564 + }, + { + "epoch": 0.0362086644450141, + "grad_norm": 2.5616868912458157, + "learning_rate": 1e-06, + "loss": 0.4081, + "step": 565 + }, + { + "epoch": 0.03627275057677518, + "grad_norm": 2.4878609095378854, + "learning_rate": 1e-06, + "loss": 0.455, + "step": 566 + }, + { + "epoch": 0.036336836708536276, + "grad_norm": 2.46531966648379, + "learning_rate": 1e-06, + "loss": 0.4627, + "step": 567 + }, + { + "epoch": 0.03640092284029736, + "grad_norm": 2.516195232483696, + "learning_rate": 1e-06, + "loss": 0.4258, + "step": 568 + }, + { + "epoch": 0.03646500897205845, + "grad_norm": 2.6044742204218174, + "learning_rate": 1e-06, + "loss": 0.4174, + "step": 569 + }, + { + "epoch": 0.036529095103819534, + "grad_norm": 2.5714531773465312, + "learning_rate": 1e-06, + "loss": 0.4185, + "step": 570 + }, + { + "epoch": 0.03659318123558062, + "grad_norm": 2.5553692530565004, + "learning_rate": 1e-06, + "loss": 0.4518, + "step": 571 + }, + { + "epoch": 0.036657267367341706, + "grad_norm": 2.5353351960631363, + "learning_rate": 1e-06, + "loss": 0.4417, + "step": 572 + }, + { + "epoch": 0.03672135349910279, + "grad_norm": 2.690225025998638, + "learning_rate": 1e-06, + "loss": 0.4141, + "step": 573 + }, + { + "epoch": 0.03678543963086388, + "grad_norm": 2.709566881886002, + "learning_rate": 1e-06, + "loss": 0.4066, + "step": 574 + }, + { + "epoch": 0.03684952576262497, + "grad_norm": 2.755681344115124, + "learning_rate": 1e-06, + "loss": 0.4367, + "step": 575 + }, + { + "epoch": 0.03691361189438606, + "grad_norm": 2.5693249686778685, + "learning_rate": 1e-06, + "loss": 0.4407, + "step": 576 + }, + { + "epoch": 0.03697769802614714, + "grad_norm": 2.4624250441741364, + "learning_rate": 1e-06, + "loss": 0.3905, + "step": 577 + }, + { + "epoch": 0.03704178415790823, + "grad_norm": 2.73004795352225, + "learning_rate": 1e-06, + "loss": 0.456, + "step": 578 + }, + { + "epoch": 0.037105870289669315, + "grad_norm": 2.7527083634276592, + "learning_rate": 1e-06, + "loss": 0.3965, + "step": 579 + }, + { + "epoch": 0.0371699564214304, + "grad_norm": 2.8028124217258004, + "learning_rate": 1e-06, + "loss": 0.471, + "step": 580 + }, + { + "epoch": 0.03723404255319149, + "grad_norm": 2.5880344409030807, + "learning_rate": 1e-06, + "loss": 0.3902, + "step": 581 + }, + { + "epoch": 0.037298128684952574, + "grad_norm": 2.4467098820143263, + "learning_rate": 1e-06, + "loss": 0.4309, + "step": 582 + }, + { + "epoch": 0.03736221481671366, + "grad_norm": 2.6909263668214414, + "learning_rate": 1e-06, + "loss": 0.4424, + "step": 583 + }, + { + "epoch": 0.03742630094847475, + "grad_norm": 2.5028705246054064, + "learning_rate": 1e-06, + "loss": 0.4534, + "step": 584 + }, + { + "epoch": 0.03749038708023584, + "grad_norm": 2.632149240813846, + "learning_rate": 1e-06, + "loss": 0.4429, + "step": 585 + }, + { + "epoch": 0.037554473211996925, + "grad_norm": 2.676547497619598, + "learning_rate": 1e-06, + "loss": 0.4092, + "step": 586 + }, + { + "epoch": 0.03761855934375801, + "grad_norm": 2.593412911659521, + "learning_rate": 1e-06, + "loss": 0.3827, + "step": 587 + }, + { + "epoch": 0.0376826454755191, + "grad_norm": 2.5646155672189868, + "learning_rate": 1e-06, + "loss": 0.4445, + "step": 588 + }, + { + "epoch": 0.03774673160728018, + "grad_norm": 2.6958374982600857, + "learning_rate": 1e-06, + "loss": 0.4088, + "step": 589 + }, + { + "epoch": 0.03781081773904127, + "grad_norm": 2.944875349581748, + "learning_rate": 1e-06, + "loss": 0.4577, + "step": 590 + }, + { + "epoch": 0.037874903870802355, + "grad_norm": 2.4909466851587285, + "learning_rate": 1e-06, + "loss": 0.3851, + "step": 591 + }, + { + "epoch": 0.03793899000256345, + "grad_norm": 2.6354951431246474, + "learning_rate": 1e-06, + "loss": 0.4192, + "step": 592 + }, + { + "epoch": 0.038003076134324534, + "grad_norm": 2.5659179494331514, + "learning_rate": 1e-06, + "loss": 0.3952, + "step": 593 + }, + { + "epoch": 0.03806716226608562, + "grad_norm": 2.7030130238756387, + "learning_rate": 1e-06, + "loss": 0.4631, + "step": 594 + }, + { + "epoch": 0.038131248397846706, + "grad_norm": 2.5036363687595413, + "learning_rate": 1e-06, + "loss": 0.3573, + "step": 595 + }, + { + "epoch": 0.03819533452960779, + "grad_norm": 2.568713565345006, + "learning_rate": 1e-06, + "loss": 0.4051, + "step": 596 + }, + { + "epoch": 0.03825942066136888, + "grad_norm": 2.7356219550249543, + "learning_rate": 1e-06, + "loss": 0.3943, + "step": 597 + }, + { + "epoch": 0.038323506793129965, + "grad_norm": 2.589753760976193, + "learning_rate": 1e-06, + "loss": 0.4184, + "step": 598 + }, + { + "epoch": 0.03838759292489105, + "grad_norm": 2.5004031202976598, + "learning_rate": 1e-06, + "loss": 0.4061, + "step": 599 + }, + { + "epoch": 0.038451679056652144, + "grad_norm": 2.640214926702102, + "learning_rate": 1e-06, + "loss": 0.4696, + "step": 600 + }, + { + "epoch": 0.03851576518841323, + "grad_norm": 2.4476421037580227, + "learning_rate": 1e-06, + "loss": 0.3422, + "step": 601 + }, + { + "epoch": 0.038579851320174316, + "grad_norm": 2.6129179653473673, + "learning_rate": 1e-06, + "loss": 0.385, + "step": 602 + }, + { + "epoch": 0.0386439374519354, + "grad_norm": 2.8065814805198643, + "learning_rate": 1e-06, + "loss": 0.4985, + "step": 603 + }, + { + "epoch": 0.03870802358369649, + "grad_norm": 2.7506065468121665, + "learning_rate": 1e-06, + "loss": 0.38, + "step": 604 + }, + { + "epoch": 0.038772109715457574, + "grad_norm": 2.743913537942316, + "learning_rate": 1e-06, + "loss": 0.3987, + "step": 605 + }, + { + "epoch": 0.03883619584721866, + "grad_norm": 2.7130495638219982, + "learning_rate": 1e-06, + "loss": 0.4095, + "step": 606 + }, + { + "epoch": 0.038900281978979746, + "grad_norm": 2.5914175128432153, + "learning_rate": 1e-06, + "loss": 0.4005, + "step": 607 + }, + { + "epoch": 0.03896436811074083, + "grad_norm": 2.5482424585290366, + "learning_rate": 1e-06, + "loss": 0.4562, + "step": 608 + }, + { + "epoch": 0.039028454242501925, + "grad_norm": 2.7290427011123395, + "learning_rate": 1e-06, + "loss": 0.4013, + "step": 609 + }, + { + "epoch": 0.03909254037426301, + "grad_norm": 2.5427635742774117, + "learning_rate": 1e-06, + "loss": 0.3788, + "step": 610 + }, + { + "epoch": 0.0391566265060241, + "grad_norm": 2.561653737428255, + "learning_rate": 1e-06, + "loss": 0.4245, + "step": 611 + }, + { + "epoch": 0.039220712637785184, + "grad_norm": 2.5955070887416323, + "learning_rate": 1e-06, + "loss": 0.4266, + "step": 612 + }, + { + "epoch": 0.03928479876954627, + "grad_norm": 2.54265029522293, + "learning_rate": 1e-06, + "loss": 0.4145, + "step": 613 + }, + { + "epoch": 0.039348884901307356, + "grad_norm": 2.606212536478632, + "learning_rate": 1e-06, + "loss": 0.43, + "step": 614 + }, + { + "epoch": 0.03941297103306844, + "grad_norm": 2.6840178730682682, + "learning_rate": 1e-06, + "loss": 0.4871, + "step": 615 + }, + { + "epoch": 0.03947705716482953, + "grad_norm": 2.650414922571063, + "learning_rate": 1e-06, + "loss": 0.4061, + "step": 616 + }, + { + "epoch": 0.03954114329659062, + "grad_norm": 2.673526302945202, + "learning_rate": 1e-06, + "loss": 0.5218, + "step": 617 + }, + { + "epoch": 0.03960522942835171, + "grad_norm": 2.553206108448424, + "learning_rate": 1e-06, + "loss": 0.4068, + "step": 618 + }, + { + "epoch": 0.03966931556011279, + "grad_norm": 3.8623068130329914, + "learning_rate": 1e-06, + "loss": 0.4685, + "step": 619 + }, + { + "epoch": 0.03973340169187388, + "grad_norm": 2.611209530336014, + "learning_rate": 1e-06, + "loss": 0.4692, + "step": 620 + }, + { + "epoch": 0.039797487823634965, + "grad_norm": 2.612983189021595, + "learning_rate": 1e-06, + "loss": 0.3927, + "step": 621 + }, + { + "epoch": 0.03986157395539605, + "grad_norm": 2.642479051481998, + "learning_rate": 1e-06, + "loss": 0.4703, + "step": 622 + }, + { + "epoch": 0.03992566008715714, + "grad_norm": 2.7796089353924867, + "learning_rate": 1e-06, + "loss": 0.4151, + "step": 623 + }, + { + "epoch": 0.03998974621891822, + "grad_norm": 2.58513886175909, + "learning_rate": 1e-06, + "loss": 0.3998, + "step": 624 + }, + { + "epoch": 0.040053832350679316, + "grad_norm": 2.5219815352139916, + "learning_rate": 1e-06, + "loss": 0.4732, + "step": 625 + }, + { + "epoch": 0.0401179184824404, + "grad_norm": 2.427279598308452, + "learning_rate": 1e-06, + "loss": 0.4312, + "step": 626 + }, + { + "epoch": 0.04018200461420149, + "grad_norm": 2.7451148950328763, + "learning_rate": 1e-06, + "loss": 0.451, + "step": 627 + }, + { + "epoch": 0.040246090745962575, + "grad_norm": 2.935612939952673, + "learning_rate": 1e-06, + "loss": 0.408, + "step": 628 + }, + { + "epoch": 0.04031017687772366, + "grad_norm": 2.315351642452188, + "learning_rate": 1e-06, + "loss": 0.42, + "step": 629 + }, + { + "epoch": 0.04037426300948475, + "grad_norm": 2.616335080998465, + "learning_rate": 1e-06, + "loss": 0.459, + "step": 630 + }, + { + "epoch": 0.04043834914124583, + "grad_norm": 2.7994650340660527, + "learning_rate": 1e-06, + "loss": 0.4556, + "step": 631 + }, + { + "epoch": 0.04050243527300692, + "grad_norm": 2.7990589665063315, + "learning_rate": 1e-06, + "loss": 0.433, + "step": 632 + }, + { + "epoch": 0.040566521404768005, + "grad_norm": 2.726625812194557, + "learning_rate": 1e-06, + "loss": 0.4934, + "step": 633 + }, + { + "epoch": 0.0406306075365291, + "grad_norm": 2.6595974411965004, + "learning_rate": 1e-06, + "loss": 0.3912, + "step": 634 + }, + { + "epoch": 0.040694693668290184, + "grad_norm": 2.6317170132064054, + "learning_rate": 1e-06, + "loss": 0.4233, + "step": 635 + }, + { + "epoch": 0.04075877980005127, + "grad_norm": 2.7113262836328507, + "learning_rate": 1e-06, + "loss": 0.3809, + "step": 636 + }, + { + "epoch": 0.040822865931812356, + "grad_norm": 2.7276916836582963, + "learning_rate": 1e-06, + "loss": 0.4244, + "step": 637 + }, + { + "epoch": 0.04088695206357344, + "grad_norm": 2.511155076141722, + "learning_rate": 1e-06, + "loss": 0.3905, + "step": 638 + }, + { + "epoch": 0.04095103819533453, + "grad_norm": 2.6231514302817867, + "learning_rate": 1e-06, + "loss": 0.4993, + "step": 639 + }, + { + "epoch": 0.041015124327095615, + "grad_norm": 2.8738210559312836, + "learning_rate": 1e-06, + "loss": 0.4589, + "step": 640 + }, + { + "epoch": 0.0410792104588567, + "grad_norm": 2.630954950788824, + "learning_rate": 1e-06, + "loss": 0.4045, + "step": 641 + }, + { + "epoch": 0.041143296590617794, + "grad_norm": 2.3813698177760885, + "learning_rate": 1e-06, + "loss": 0.3638, + "step": 642 + }, + { + "epoch": 0.04120738272237888, + "grad_norm": 2.559089835922891, + "learning_rate": 1e-06, + "loss": 0.4153, + "step": 643 + }, + { + "epoch": 0.041271468854139966, + "grad_norm": 2.6301614669286764, + "learning_rate": 1e-06, + "loss": 0.3787, + "step": 644 + }, + { + "epoch": 0.04133555498590105, + "grad_norm": 2.826724753336134, + "learning_rate": 1e-06, + "loss": 0.4785, + "step": 645 + }, + { + "epoch": 0.04139964111766214, + "grad_norm": 2.5852268129152094, + "learning_rate": 1e-06, + "loss": 0.4127, + "step": 646 + }, + { + "epoch": 0.041463727249423224, + "grad_norm": 2.5517838435167337, + "learning_rate": 1e-06, + "loss": 0.4187, + "step": 647 + }, + { + "epoch": 0.04152781338118431, + "grad_norm": 2.5003343630514485, + "learning_rate": 1e-06, + "loss": 0.4664, + "step": 648 + }, + { + "epoch": 0.041591899512945396, + "grad_norm": 2.660162354251772, + "learning_rate": 1e-06, + "loss": 0.4587, + "step": 649 + }, + { + "epoch": 0.04165598564470648, + "grad_norm": 2.519788649872128, + "learning_rate": 1e-06, + "loss": 0.4124, + "step": 650 + }, + { + "epoch": 0.041720071776467575, + "grad_norm": 2.6811482996986684, + "learning_rate": 1e-06, + "loss": 0.4979, + "step": 651 + }, + { + "epoch": 0.04178415790822866, + "grad_norm": 2.617520724416409, + "learning_rate": 1e-06, + "loss": 0.4485, + "step": 652 + }, + { + "epoch": 0.04184824403998975, + "grad_norm": 2.488909344232931, + "learning_rate": 1e-06, + "loss": 0.4211, + "step": 653 + }, + { + "epoch": 0.04191233017175083, + "grad_norm": 2.6149410343930644, + "learning_rate": 1e-06, + "loss": 0.3825, + "step": 654 + }, + { + "epoch": 0.04197641630351192, + "grad_norm": 2.5128514869435525, + "learning_rate": 1e-06, + "loss": 0.4126, + "step": 655 + }, + { + "epoch": 0.042040502435273006, + "grad_norm": 2.677894898570438, + "learning_rate": 1e-06, + "loss": 0.3868, + "step": 656 + }, + { + "epoch": 0.04210458856703409, + "grad_norm": 2.5073846226183494, + "learning_rate": 1e-06, + "loss": 0.4001, + "step": 657 + }, + { + "epoch": 0.04216867469879518, + "grad_norm": 2.5959443672213887, + "learning_rate": 1e-06, + "loss": 0.3518, + "step": 658 + }, + { + "epoch": 0.04223276083055627, + "grad_norm": 2.6503459543932113, + "learning_rate": 1e-06, + "loss": 0.4283, + "step": 659 + }, + { + "epoch": 0.04229684696231736, + "grad_norm": 2.43063879482731, + "learning_rate": 1e-06, + "loss": 0.4251, + "step": 660 + }, + { + "epoch": 0.04236093309407844, + "grad_norm": 2.349576722965158, + "learning_rate": 1e-06, + "loss": 0.3834, + "step": 661 + }, + { + "epoch": 0.04242501922583953, + "grad_norm": 2.537089027277767, + "learning_rate": 1e-06, + "loss": 0.3797, + "step": 662 + }, + { + "epoch": 0.042489105357600615, + "grad_norm": 2.7224838638581756, + "learning_rate": 1e-06, + "loss": 0.4099, + "step": 663 + }, + { + "epoch": 0.0425531914893617, + "grad_norm": 2.7403703661711303, + "learning_rate": 1e-06, + "loss": 0.4155, + "step": 664 + }, + { + "epoch": 0.04261727762112279, + "grad_norm": 2.481829066148711, + "learning_rate": 1e-06, + "loss": 0.4395, + "step": 665 + }, + { + "epoch": 0.04268136375288387, + "grad_norm": 2.7058959104350286, + "learning_rate": 1e-06, + "loss": 0.3839, + "step": 666 + }, + { + "epoch": 0.042745449884644966, + "grad_norm": 2.664470985436932, + "learning_rate": 1e-06, + "loss": 0.4535, + "step": 667 + }, + { + "epoch": 0.04280953601640605, + "grad_norm": 2.989381219322504, + "learning_rate": 1e-06, + "loss": 0.4469, + "step": 668 + }, + { + "epoch": 0.04287362214816714, + "grad_norm": 2.456391983663052, + "learning_rate": 1e-06, + "loss": 0.4497, + "step": 669 + }, + { + "epoch": 0.042937708279928224, + "grad_norm": 2.5686791944245355, + "learning_rate": 1e-06, + "loss": 0.3879, + "step": 670 + }, + { + "epoch": 0.04300179441168931, + "grad_norm": 2.5171257630542496, + "learning_rate": 1e-06, + "loss": 0.3855, + "step": 671 + }, + { + "epoch": 0.0430658805434504, + "grad_norm": 2.7514044064489958, + "learning_rate": 1e-06, + "loss": 0.4274, + "step": 672 + }, + { + "epoch": 0.04312996667521148, + "grad_norm": 2.588593452159588, + "learning_rate": 1e-06, + "loss": 0.4361, + "step": 673 + }, + { + "epoch": 0.04319405280697257, + "grad_norm": 2.5337204798422133, + "learning_rate": 1e-06, + "loss": 0.4651, + "step": 674 + }, + { + "epoch": 0.043258138938733655, + "grad_norm": 2.80848924235811, + "learning_rate": 1e-06, + "loss": 0.3884, + "step": 675 + }, + { + "epoch": 0.04332222507049475, + "grad_norm": 2.4781318443632703, + "learning_rate": 1e-06, + "loss": 0.4071, + "step": 676 + }, + { + "epoch": 0.043386311202255834, + "grad_norm": 2.6636080398686017, + "learning_rate": 1e-06, + "loss": 0.4434, + "step": 677 + }, + { + "epoch": 0.04345039733401692, + "grad_norm": 2.6875653127583643, + "learning_rate": 1e-06, + "loss": 0.4095, + "step": 678 + }, + { + "epoch": 0.043514483465778006, + "grad_norm": 2.593545387753299, + "learning_rate": 1e-06, + "loss": 0.4077, + "step": 679 + }, + { + "epoch": 0.04357856959753909, + "grad_norm": 2.6927729543543393, + "learning_rate": 1e-06, + "loss": 0.4338, + "step": 680 + }, + { + "epoch": 0.04364265572930018, + "grad_norm": 2.683321727952175, + "learning_rate": 1e-06, + "loss": 0.4527, + "step": 681 + }, + { + "epoch": 0.043706741861061264, + "grad_norm": 2.4442159701665838, + "learning_rate": 1e-06, + "loss": 0.3911, + "step": 682 + }, + { + "epoch": 0.04377082799282235, + "grad_norm": 2.6443057440007722, + "learning_rate": 1e-06, + "loss": 0.4226, + "step": 683 + }, + { + "epoch": 0.04383491412458344, + "grad_norm": 2.771577021353034, + "learning_rate": 1e-06, + "loss": 0.4209, + "step": 684 + }, + { + "epoch": 0.04389900025634453, + "grad_norm": 2.5830604191286577, + "learning_rate": 1e-06, + "loss": 0.4699, + "step": 685 + }, + { + "epoch": 0.043963086388105616, + "grad_norm": 2.7145392457704367, + "learning_rate": 1e-06, + "loss": 0.3852, + "step": 686 + }, + { + "epoch": 0.0440271725198667, + "grad_norm": 2.6292966393223773, + "learning_rate": 1e-06, + "loss": 0.4188, + "step": 687 + }, + { + "epoch": 0.04409125865162779, + "grad_norm": 2.549423161227808, + "learning_rate": 1e-06, + "loss": 0.4607, + "step": 688 + }, + { + "epoch": 0.044155344783388874, + "grad_norm": 2.8191967007014416, + "learning_rate": 1e-06, + "loss": 0.4822, + "step": 689 + }, + { + "epoch": 0.04421943091514996, + "grad_norm": 2.7238873369280596, + "learning_rate": 1e-06, + "loss": 0.4639, + "step": 690 + }, + { + "epoch": 0.044283517046911046, + "grad_norm": 2.633151381982928, + "learning_rate": 1e-06, + "loss": 0.4557, + "step": 691 + }, + { + "epoch": 0.04434760317867213, + "grad_norm": 2.7228749845037745, + "learning_rate": 1e-06, + "loss": 0.4778, + "step": 692 + }, + { + "epoch": 0.044411689310433225, + "grad_norm": 2.7679898994678944, + "learning_rate": 1e-06, + "loss": 0.4181, + "step": 693 + }, + { + "epoch": 0.04447577544219431, + "grad_norm": 2.589789446094907, + "learning_rate": 1e-06, + "loss": 0.4236, + "step": 694 + }, + { + "epoch": 0.0445398615739554, + "grad_norm": 2.511018179407139, + "learning_rate": 1e-06, + "loss": 0.4469, + "step": 695 + }, + { + "epoch": 0.04460394770571648, + "grad_norm": 2.7885072030309206, + "learning_rate": 1e-06, + "loss": 0.4553, + "step": 696 + }, + { + "epoch": 0.04466803383747757, + "grad_norm": 2.7021041650758506, + "learning_rate": 1e-06, + "loss": 0.3864, + "step": 697 + }, + { + "epoch": 0.044732119969238655, + "grad_norm": 2.788293327898669, + "learning_rate": 1e-06, + "loss": 0.4676, + "step": 698 + }, + { + "epoch": 0.04479620610099974, + "grad_norm": 2.5530815918507215, + "learning_rate": 1e-06, + "loss": 0.4304, + "step": 699 + }, + { + "epoch": 0.04486029223276083, + "grad_norm": 2.3701294923623166, + "learning_rate": 1e-06, + "loss": 0.3835, + "step": 700 + }, + { + "epoch": 0.04492437836452192, + "grad_norm": 2.521705987774555, + "learning_rate": 1e-06, + "loss": 0.4147, + "step": 701 + }, + { + "epoch": 0.04498846449628301, + "grad_norm": 2.7337919356294273, + "learning_rate": 1e-06, + "loss": 0.4036, + "step": 702 + }, + { + "epoch": 0.04505255062804409, + "grad_norm": 2.7334466833635713, + "learning_rate": 1e-06, + "loss": 0.4737, + "step": 703 + }, + { + "epoch": 0.04511663675980518, + "grad_norm": 2.412574306067936, + "learning_rate": 1e-06, + "loss": 0.4092, + "step": 704 + }, + { + "epoch": 0.045180722891566265, + "grad_norm": 2.6647181788543675, + "learning_rate": 1e-06, + "loss": 0.4644, + "step": 705 + }, + { + "epoch": 0.04524480902332735, + "grad_norm": 2.8176686628649867, + "learning_rate": 1e-06, + "loss": 0.3859, + "step": 706 + }, + { + "epoch": 0.04530889515508844, + "grad_norm": 3.5376406758684555, + "learning_rate": 1e-06, + "loss": 0.4637, + "step": 707 + }, + { + "epoch": 0.04537298128684952, + "grad_norm": 2.648046979088884, + "learning_rate": 1e-06, + "loss": 0.436, + "step": 708 + }, + { + "epoch": 0.045437067418610616, + "grad_norm": 2.714391240179539, + "learning_rate": 1e-06, + "loss": 0.4007, + "step": 709 + }, + { + "epoch": 0.0455011535503717, + "grad_norm": 2.4256539477484993, + "learning_rate": 1e-06, + "loss": 0.4697, + "step": 710 + }, + { + "epoch": 0.04556523968213279, + "grad_norm": 2.53217824501919, + "learning_rate": 1e-06, + "loss": 0.4386, + "step": 711 + }, + { + "epoch": 0.045629325813893874, + "grad_norm": 2.627890030554265, + "learning_rate": 1e-06, + "loss": 0.3706, + "step": 712 + }, + { + "epoch": 0.04569341194565496, + "grad_norm": 2.518196492721495, + "learning_rate": 1e-06, + "loss": 0.4087, + "step": 713 + }, + { + "epoch": 0.045757498077416046, + "grad_norm": 2.496806126024874, + "learning_rate": 1e-06, + "loss": 0.4553, + "step": 714 + }, + { + "epoch": 0.04582158420917713, + "grad_norm": 2.5758802594573242, + "learning_rate": 1e-06, + "loss": 0.3946, + "step": 715 + }, + { + "epoch": 0.04588567034093822, + "grad_norm": 2.6327983887455875, + "learning_rate": 1e-06, + "loss": 0.4266, + "step": 716 + }, + { + "epoch": 0.045949756472699305, + "grad_norm": 2.5940640401988104, + "learning_rate": 1e-06, + "loss": 0.4717, + "step": 717 + }, + { + "epoch": 0.0460138426044604, + "grad_norm": 2.6292851190752056, + "learning_rate": 1e-06, + "loss": 0.4645, + "step": 718 + }, + { + "epoch": 0.046077928736221484, + "grad_norm": 2.5407643752820706, + "learning_rate": 1e-06, + "loss": 0.4496, + "step": 719 + }, + { + "epoch": 0.04614201486798257, + "grad_norm": 2.7968598288395667, + "learning_rate": 1e-06, + "loss": 0.4301, + "step": 720 + }, + { + "epoch": 0.046206100999743656, + "grad_norm": 2.5396197101295246, + "learning_rate": 1e-06, + "loss": 0.4702, + "step": 721 + }, + { + "epoch": 0.04627018713150474, + "grad_norm": 2.6880034453423662, + "learning_rate": 1e-06, + "loss": 0.4089, + "step": 722 + }, + { + "epoch": 0.04633427326326583, + "grad_norm": 2.534639229971012, + "learning_rate": 1e-06, + "loss": 0.4418, + "step": 723 + }, + { + "epoch": 0.046398359395026914, + "grad_norm": 2.7035380317990705, + "learning_rate": 1e-06, + "loss": 0.4194, + "step": 724 + }, + { + "epoch": 0.046462445526788, + "grad_norm": 2.5100605403625913, + "learning_rate": 1e-06, + "loss": 0.4387, + "step": 725 + }, + { + "epoch": 0.04652653165854909, + "grad_norm": 2.787377804865803, + "learning_rate": 1e-06, + "loss": 0.4391, + "step": 726 + }, + { + "epoch": 0.04659061779031018, + "grad_norm": 2.758137307249647, + "learning_rate": 1e-06, + "loss": 0.4297, + "step": 727 + }, + { + "epoch": 0.046654703922071265, + "grad_norm": 2.6893335848609987, + "learning_rate": 1e-06, + "loss": 0.4442, + "step": 728 + }, + { + "epoch": 0.04671879005383235, + "grad_norm": 2.596806970800409, + "learning_rate": 1e-06, + "loss": 0.4434, + "step": 729 + }, + { + "epoch": 0.04678287618559344, + "grad_norm": 2.607569512934613, + "learning_rate": 1e-06, + "loss": 0.3982, + "step": 730 + }, + { + "epoch": 0.046846962317354524, + "grad_norm": 2.5297392747277994, + "learning_rate": 1e-06, + "loss": 0.4196, + "step": 731 + }, + { + "epoch": 0.04691104844911561, + "grad_norm": 2.6521936744466195, + "learning_rate": 1e-06, + "loss": 0.4147, + "step": 732 + }, + { + "epoch": 0.046975134580876696, + "grad_norm": 2.876213805294052, + "learning_rate": 1e-06, + "loss": 0.395, + "step": 733 + }, + { + "epoch": 0.04703922071263778, + "grad_norm": 2.4043885041918243, + "learning_rate": 1e-06, + "loss": 0.4338, + "step": 734 + }, + { + "epoch": 0.047103306844398875, + "grad_norm": 2.5867612136864055, + "learning_rate": 1e-06, + "loss": 0.4444, + "step": 735 + }, + { + "epoch": 0.04716739297615996, + "grad_norm": 2.5203294696988134, + "learning_rate": 1e-06, + "loss": 0.437, + "step": 736 + }, + { + "epoch": 0.04723147910792105, + "grad_norm": 2.6451605123098587, + "learning_rate": 1e-06, + "loss": 0.4449, + "step": 737 + }, + { + "epoch": 0.04729556523968213, + "grad_norm": 2.6941293120299137, + "learning_rate": 1e-06, + "loss": 0.4292, + "step": 738 + }, + { + "epoch": 0.04735965137144322, + "grad_norm": 2.6302676678079546, + "learning_rate": 1e-06, + "loss": 0.4505, + "step": 739 + }, + { + "epoch": 0.047423737503204305, + "grad_norm": 2.592261969149947, + "learning_rate": 1e-06, + "loss": 0.4215, + "step": 740 + }, + { + "epoch": 0.04748782363496539, + "grad_norm": 2.682832075298025, + "learning_rate": 1e-06, + "loss": 0.4149, + "step": 741 + }, + { + "epoch": 0.04755190976672648, + "grad_norm": 2.5907747282023887, + "learning_rate": 1e-06, + "loss": 0.4235, + "step": 742 + }, + { + "epoch": 0.04761599589848757, + "grad_norm": 2.6833032775341943, + "learning_rate": 1e-06, + "loss": 0.399, + "step": 743 + }, + { + "epoch": 0.047680082030248656, + "grad_norm": 2.518606400690256, + "learning_rate": 1e-06, + "loss": 0.3945, + "step": 744 + }, + { + "epoch": 0.04774416816200974, + "grad_norm": 2.479270755919287, + "learning_rate": 1e-06, + "loss": 0.4386, + "step": 745 + }, + { + "epoch": 0.04780825429377083, + "grad_norm": 2.72515135116294, + "learning_rate": 1e-06, + "loss": 0.4247, + "step": 746 + }, + { + "epoch": 0.047872340425531915, + "grad_norm": 2.4749722434919716, + "learning_rate": 1e-06, + "loss": 0.4729, + "step": 747 + }, + { + "epoch": 0.047936426557293, + "grad_norm": 2.5106869846945323, + "learning_rate": 1e-06, + "loss": 0.4118, + "step": 748 + }, + { + "epoch": 0.04800051268905409, + "grad_norm": 2.665593207338953, + "learning_rate": 1e-06, + "loss": 0.4037, + "step": 749 + }, + { + "epoch": 0.04806459882081517, + "grad_norm": 2.7037283823905405, + "learning_rate": 1e-06, + "loss": 0.451, + "step": 750 + }, + { + "epoch": 0.048128684952576266, + "grad_norm": 2.433991765145583, + "learning_rate": 1e-06, + "loss": 0.4518, + "step": 751 + }, + { + "epoch": 0.04819277108433735, + "grad_norm": 2.6227479993797833, + "learning_rate": 1e-06, + "loss": 0.327, + "step": 752 + }, + { + "epoch": 0.04825685721609844, + "grad_norm": 2.5651412483654323, + "learning_rate": 1e-06, + "loss": 0.4565, + "step": 753 + }, + { + "epoch": 0.048320943347859524, + "grad_norm": 2.4244933108312683, + "learning_rate": 1e-06, + "loss": 0.4195, + "step": 754 + }, + { + "epoch": 0.04838502947962061, + "grad_norm": 2.41146156297117, + "learning_rate": 1e-06, + "loss": 0.4178, + "step": 755 + }, + { + "epoch": 0.048449115611381696, + "grad_norm": 2.8872723239228097, + "learning_rate": 1e-06, + "loss": 0.4369, + "step": 756 + }, + { + "epoch": 0.04851320174314278, + "grad_norm": 2.762725037885391, + "learning_rate": 1e-06, + "loss": 0.3781, + "step": 757 + }, + { + "epoch": 0.04857728787490387, + "grad_norm": 3.836723679916666, + "learning_rate": 1e-06, + "loss": 0.4037, + "step": 758 + }, + { + "epoch": 0.048641374006664954, + "grad_norm": 2.8109148560486727, + "learning_rate": 1e-06, + "loss": 0.4655, + "step": 759 + }, + { + "epoch": 0.04870546013842605, + "grad_norm": 2.577080298605142, + "learning_rate": 1e-06, + "loss": 0.4606, + "step": 760 + }, + { + "epoch": 0.048769546270187134, + "grad_norm": 2.65526854422781, + "learning_rate": 1e-06, + "loss": 0.5098, + "step": 761 + }, + { + "epoch": 0.04883363240194822, + "grad_norm": 2.525793932433678, + "learning_rate": 1e-06, + "loss": 0.3975, + "step": 762 + }, + { + "epoch": 0.048897718533709306, + "grad_norm": 2.7031533973927915, + "learning_rate": 1e-06, + "loss": 0.4672, + "step": 763 + }, + { + "epoch": 0.04896180466547039, + "grad_norm": 2.772546979327886, + "learning_rate": 1e-06, + "loss": 0.4592, + "step": 764 + }, + { + "epoch": 0.04902589079723148, + "grad_norm": 2.7047272161441316, + "learning_rate": 1e-06, + "loss": 0.4611, + "step": 765 + }, + { + "epoch": 0.049089976928992564, + "grad_norm": 2.6979587125081963, + "learning_rate": 1e-06, + "loss": 0.4094, + "step": 766 + }, + { + "epoch": 0.04915406306075365, + "grad_norm": 2.525668570583122, + "learning_rate": 1e-06, + "loss": 0.38, + "step": 767 + }, + { + "epoch": 0.04921814919251474, + "grad_norm": 2.538200424294564, + "learning_rate": 1e-06, + "loss": 0.3992, + "step": 768 + }, + { + "epoch": 0.04928223532427583, + "grad_norm": 2.4589512816952372, + "learning_rate": 1e-06, + "loss": 0.3886, + "step": 769 + }, + { + "epoch": 0.049346321456036915, + "grad_norm": 2.74365547311966, + "learning_rate": 1e-06, + "loss": 0.4395, + "step": 770 + }, + { + "epoch": 0.049410407587798, + "grad_norm": 2.8613608191226456, + "learning_rate": 1e-06, + "loss": 0.417, + "step": 771 + }, + { + "epoch": 0.04947449371955909, + "grad_norm": 2.4818805818190293, + "learning_rate": 1e-06, + "loss": 0.4414, + "step": 772 + }, + { + "epoch": 0.04953857985132017, + "grad_norm": 3.073554752078375, + "learning_rate": 1e-06, + "loss": 0.3623, + "step": 773 + }, + { + "epoch": 0.04960266598308126, + "grad_norm": 2.6000204539317986, + "learning_rate": 1e-06, + "loss": 0.4448, + "step": 774 + }, + { + "epoch": 0.049666752114842345, + "grad_norm": 2.619867444073265, + "learning_rate": 1e-06, + "loss": 0.4846, + "step": 775 + }, + { + "epoch": 0.04973083824660343, + "grad_norm": 2.670953914332981, + "learning_rate": 1e-06, + "loss": 0.4124, + "step": 776 + }, + { + "epoch": 0.049794924378364525, + "grad_norm": 2.6431772566079537, + "learning_rate": 1e-06, + "loss": 0.4111, + "step": 777 + }, + { + "epoch": 0.04985901051012561, + "grad_norm": 2.379782074934585, + "learning_rate": 1e-06, + "loss": 0.4399, + "step": 778 + }, + { + "epoch": 0.0499230966418867, + "grad_norm": 2.5242383740225645, + "learning_rate": 1e-06, + "loss": 0.3977, + "step": 779 + }, + { + "epoch": 0.04998718277364778, + "grad_norm": 2.4585612977491214, + "learning_rate": 1e-06, + "loss": 0.4342, + "step": 780 + }, + { + "epoch": 0.05005126890540887, + "grad_norm": 2.5353463721233833, + "learning_rate": 1e-06, + "loss": 0.409, + "step": 781 + }, + { + "epoch": 0.050115355037169955, + "grad_norm": 2.771798641700129, + "learning_rate": 1e-06, + "loss": 0.4163, + "step": 782 + }, + { + "epoch": 0.05017944116893104, + "grad_norm": 2.4567227275972785, + "learning_rate": 1e-06, + "loss": 0.4343, + "step": 783 + }, + { + "epoch": 0.05024352730069213, + "grad_norm": 2.776241471884275, + "learning_rate": 1e-06, + "loss": 0.3772, + "step": 784 + }, + { + "epoch": 0.05030761343245322, + "grad_norm": 2.73917716665947, + "learning_rate": 1e-06, + "loss": 0.421, + "step": 785 + }, + { + "epoch": 0.050371699564214306, + "grad_norm": 2.4608060384184696, + "learning_rate": 1e-06, + "loss": 0.4078, + "step": 786 + }, + { + "epoch": 0.05043578569597539, + "grad_norm": 2.7782795289847333, + "learning_rate": 1e-06, + "loss": 0.4286, + "step": 787 + }, + { + "epoch": 0.05049987182773648, + "grad_norm": 2.4978989115524106, + "learning_rate": 1e-06, + "loss": 0.3732, + "step": 788 + }, + { + "epoch": 0.050563957959497564, + "grad_norm": 2.5533215903850977, + "learning_rate": 1e-06, + "loss": 0.3687, + "step": 789 + }, + { + "epoch": 0.05062804409125865, + "grad_norm": 2.7587787615557113, + "learning_rate": 1e-06, + "loss": 0.427, + "step": 790 + }, + { + "epoch": 0.05069213022301974, + "grad_norm": 2.6589908486758125, + "learning_rate": 1e-06, + "loss": 0.4661, + "step": 791 + }, + { + "epoch": 0.05075621635478082, + "grad_norm": 2.574238405368089, + "learning_rate": 1e-06, + "loss": 0.4779, + "step": 792 + }, + { + "epoch": 0.050820302486541916, + "grad_norm": 2.6274966108889255, + "learning_rate": 1e-06, + "loss": 0.4217, + "step": 793 + }, + { + "epoch": 0.050884388618303, + "grad_norm": 3.2801557437132556, + "learning_rate": 1e-06, + "loss": 0.3728, + "step": 794 + }, + { + "epoch": 0.05094847475006409, + "grad_norm": 2.6941030070682874, + "learning_rate": 1e-06, + "loss": 0.466, + "step": 795 + }, + { + "epoch": 0.051012560881825174, + "grad_norm": 2.5176002839833918, + "learning_rate": 1e-06, + "loss": 0.4492, + "step": 796 + }, + { + "epoch": 0.05107664701358626, + "grad_norm": 2.7530193209358567, + "learning_rate": 1e-06, + "loss": 0.4408, + "step": 797 + }, + { + "epoch": 0.051140733145347346, + "grad_norm": 2.565968695255342, + "learning_rate": 1e-06, + "loss": 0.3498, + "step": 798 + }, + { + "epoch": 0.05120481927710843, + "grad_norm": 2.6259720811960445, + "learning_rate": 1e-06, + "loss": 0.3989, + "step": 799 + }, + { + "epoch": 0.05126890540886952, + "grad_norm": 2.6154860713109773, + "learning_rate": 1e-06, + "loss": 0.4164, + "step": 800 + }, + { + "epoch": 0.051332991540630604, + "grad_norm": 2.7563490613331054, + "learning_rate": 1e-06, + "loss": 0.4331, + "step": 801 + }, + { + "epoch": 0.0513970776723917, + "grad_norm": 2.588060868381382, + "learning_rate": 1e-06, + "loss": 0.4109, + "step": 802 + }, + { + "epoch": 0.05146116380415278, + "grad_norm": 2.4579680369870234, + "learning_rate": 1e-06, + "loss": 0.4286, + "step": 803 + }, + { + "epoch": 0.05152524993591387, + "grad_norm": 2.4116736631822464, + "learning_rate": 1e-06, + "loss": 0.4438, + "step": 804 + }, + { + "epoch": 0.051589336067674955, + "grad_norm": 2.695563644208034, + "learning_rate": 1e-06, + "loss": 0.4688, + "step": 805 + }, + { + "epoch": 0.05165342219943604, + "grad_norm": 2.675047438176051, + "learning_rate": 1e-06, + "loss": 0.4056, + "step": 806 + }, + { + "epoch": 0.05171750833119713, + "grad_norm": 2.748181109672208, + "learning_rate": 1e-06, + "loss": 0.4171, + "step": 807 + }, + { + "epoch": 0.051781594462958214, + "grad_norm": 2.3059289383795263, + "learning_rate": 1e-06, + "loss": 0.3945, + "step": 808 + }, + { + "epoch": 0.0518456805947193, + "grad_norm": 2.5384132760352434, + "learning_rate": 1e-06, + "loss": 0.4316, + "step": 809 + }, + { + "epoch": 0.05190976672648039, + "grad_norm": 2.647356877858427, + "learning_rate": 1e-06, + "loss": 0.4583, + "step": 810 + }, + { + "epoch": 0.05197385285824148, + "grad_norm": 2.6468297347585636, + "learning_rate": 1e-06, + "loss": 0.4099, + "step": 811 + }, + { + "epoch": 0.052037938990002565, + "grad_norm": 2.694772877787142, + "learning_rate": 1e-06, + "loss": 0.4127, + "step": 812 + }, + { + "epoch": 0.05210202512176365, + "grad_norm": 2.7739794514088914, + "learning_rate": 1e-06, + "loss": 0.4089, + "step": 813 + }, + { + "epoch": 0.05216611125352474, + "grad_norm": 2.7432106258958444, + "learning_rate": 1e-06, + "loss": 0.4478, + "step": 814 + }, + { + "epoch": 0.05223019738528582, + "grad_norm": 2.59447934250964, + "learning_rate": 1e-06, + "loss": 0.4175, + "step": 815 + }, + { + "epoch": 0.05229428351704691, + "grad_norm": 2.482449882094235, + "learning_rate": 1e-06, + "loss": 0.3899, + "step": 816 + }, + { + "epoch": 0.052358369648807995, + "grad_norm": 2.6200358287917687, + "learning_rate": 1e-06, + "loss": 0.4187, + "step": 817 + }, + { + "epoch": 0.05242245578056909, + "grad_norm": 2.6787235723782645, + "learning_rate": 1e-06, + "loss": 0.4276, + "step": 818 + }, + { + "epoch": 0.052486541912330174, + "grad_norm": 2.857976316647533, + "learning_rate": 1e-06, + "loss": 0.3971, + "step": 819 + }, + { + "epoch": 0.05255062804409126, + "grad_norm": 2.4674037209798287, + "learning_rate": 1e-06, + "loss": 0.4056, + "step": 820 + }, + { + "epoch": 0.052614714175852347, + "grad_norm": 2.581328829253118, + "learning_rate": 1e-06, + "loss": 0.4306, + "step": 821 + }, + { + "epoch": 0.05267880030761343, + "grad_norm": 2.5713295173776984, + "learning_rate": 1e-06, + "loss": 0.4197, + "step": 822 + }, + { + "epoch": 0.05274288643937452, + "grad_norm": 2.6617855766435903, + "learning_rate": 1e-06, + "loss": 0.453, + "step": 823 + }, + { + "epoch": 0.052806972571135605, + "grad_norm": 2.5223890207481925, + "learning_rate": 1e-06, + "loss": 0.3927, + "step": 824 + }, + { + "epoch": 0.05287105870289669, + "grad_norm": 2.532919291158444, + "learning_rate": 1e-06, + "loss": 0.4401, + "step": 825 + }, + { + "epoch": 0.05293514483465778, + "grad_norm": 2.6565628872172256, + "learning_rate": 1e-06, + "loss": 0.4112, + "step": 826 + }, + { + "epoch": 0.05299923096641887, + "grad_norm": 2.593028943009522, + "learning_rate": 1e-06, + "loss": 0.429, + "step": 827 + }, + { + "epoch": 0.053063317098179956, + "grad_norm": 2.713800694208457, + "learning_rate": 1e-06, + "loss": 0.4659, + "step": 828 + }, + { + "epoch": 0.05312740322994104, + "grad_norm": 2.4576869289326706, + "learning_rate": 1e-06, + "loss": 0.4199, + "step": 829 + }, + { + "epoch": 0.05319148936170213, + "grad_norm": 2.4801827666076552, + "learning_rate": 1e-06, + "loss": 0.4272, + "step": 830 + }, + { + "epoch": 0.053255575493463214, + "grad_norm": 2.582789118414582, + "learning_rate": 1e-06, + "loss": 0.4289, + "step": 831 + }, + { + "epoch": 0.0533196616252243, + "grad_norm": 2.48821093605307, + "learning_rate": 1e-06, + "loss": 0.4121, + "step": 832 + }, + { + "epoch": 0.053383747756985386, + "grad_norm": 2.6186323943996435, + "learning_rate": 1e-06, + "loss": 0.441, + "step": 833 + }, + { + "epoch": 0.05344783388874647, + "grad_norm": 2.5517032828506334, + "learning_rate": 1e-06, + "loss": 0.421, + "step": 834 + }, + { + "epoch": 0.053511920020507565, + "grad_norm": 2.5356349325994647, + "learning_rate": 1e-06, + "loss": 0.4274, + "step": 835 + }, + { + "epoch": 0.05357600615226865, + "grad_norm": 2.428198485601343, + "learning_rate": 1e-06, + "loss": 0.4706, + "step": 836 + }, + { + "epoch": 0.05364009228402974, + "grad_norm": 2.587909226071667, + "learning_rate": 1e-06, + "loss": 0.4062, + "step": 837 + }, + { + "epoch": 0.053704178415790824, + "grad_norm": 2.309073113032567, + "learning_rate": 1e-06, + "loss": 0.4032, + "step": 838 + }, + { + "epoch": 0.05376826454755191, + "grad_norm": 2.6703159072154077, + "learning_rate": 1e-06, + "loss": 0.419, + "step": 839 + }, + { + "epoch": 0.053832350679312996, + "grad_norm": 2.71350739697573, + "learning_rate": 1e-06, + "loss": 0.4495, + "step": 840 + }, + { + "epoch": 0.05389643681107408, + "grad_norm": 2.434892539079085, + "learning_rate": 1e-06, + "loss": 0.4535, + "step": 841 + }, + { + "epoch": 0.05396052294283517, + "grad_norm": 2.5963354526268425, + "learning_rate": 1e-06, + "loss": 0.425, + "step": 842 + }, + { + "epoch": 0.054024609074596254, + "grad_norm": 2.483702809890466, + "learning_rate": 1e-06, + "loss": 0.4452, + "step": 843 + }, + { + "epoch": 0.05408869520635735, + "grad_norm": 2.5440750960584957, + "learning_rate": 1e-06, + "loss": 0.3545, + "step": 844 + }, + { + "epoch": 0.05415278133811843, + "grad_norm": 2.483060350154876, + "learning_rate": 1e-06, + "loss": 0.4014, + "step": 845 + }, + { + "epoch": 0.05421686746987952, + "grad_norm": 2.65872885985031, + "learning_rate": 1e-06, + "loss": 0.3978, + "step": 846 + }, + { + "epoch": 0.054280953601640605, + "grad_norm": 2.5818989112102173, + "learning_rate": 1e-06, + "loss": 0.3821, + "step": 847 + }, + { + "epoch": 0.05434503973340169, + "grad_norm": 2.4076980150335734, + "learning_rate": 1e-06, + "loss": 0.3354, + "step": 848 + }, + { + "epoch": 0.05440912586516278, + "grad_norm": 2.698007185792458, + "learning_rate": 1e-06, + "loss": 0.4506, + "step": 849 + }, + { + "epoch": 0.05447321199692386, + "grad_norm": 2.519075483872261, + "learning_rate": 1e-06, + "loss": 0.4133, + "step": 850 + }, + { + "epoch": 0.05453729812868495, + "grad_norm": 2.717885074101232, + "learning_rate": 1e-06, + "loss": 0.4175, + "step": 851 + }, + { + "epoch": 0.05460138426044604, + "grad_norm": 2.5856521915485646, + "learning_rate": 1e-06, + "loss": 0.4253, + "step": 852 + }, + { + "epoch": 0.05466547039220713, + "grad_norm": 2.4556651119087127, + "learning_rate": 1e-06, + "loss": 0.3957, + "step": 853 + }, + { + "epoch": 0.054729556523968215, + "grad_norm": 2.7721277540478146, + "learning_rate": 1e-06, + "loss": 0.3671, + "step": 854 + }, + { + "epoch": 0.0547936426557293, + "grad_norm": 2.657352492338648, + "learning_rate": 1e-06, + "loss": 0.4178, + "step": 855 + }, + { + "epoch": 0.05485772878749039, + "grad_norm": 2.7006089581535075, + "learning_rate": 1e-06, + "loss": 0.4183, + "step": 856 + }, + { + "epoch": 0.05492181491925147, + "grad_norm": 2.5670709358295647, + "learning_rate": 1e-06, + "loss": 0.4558, + "step": 857 + }, + { + "epoch": 0.05498590105101256, + "grad_norm": 2.628637826655462, + "learning_rate": 1e-06, + "loss": 0.4069, + "step": 858 + }, + { + "epoch": 0.055049987182773645, + "grad_norm": 2.6148005379775783, + "learning_rate": 1e-06, + "loss": 0.4136, + "step": 859 + }, + { + "epoch": 0.05511407331453474, + "grad_norm": 2.591000403813115, + "learning_rate": 1e-06, + "loss": 0.3866, + "step": 860 + }, + { + "epoch": 0.055178159446295824, + "grad_norm": 2.507601296484238, + "learning_rate": 1e-06, + "loss": 0.3998, + "step": 861 + }, + { + "epoch": 0.05524224557805691, + "grad_norm": 2.787771022744685, + "learning_rate": 1e-06, + "loss": 0.4026, + "step": 862 + }, + { + "epoch": 0.055306331709817996, + "grad_norm": 2.518082388993556, + "learning_rate": 1e-06, + "loss": 0.4215, + "step": 863 + }, + { + "epoch": 0.05537041784157908, + "grad_norm": 2.4587915178077777, + "learning_rate": 1e-06, + "loss": 0.4266, + "step": 864 + }, + { + "epoch": 0.05543450397334017, + "grad_norm": 2.615686357034752, + "learning_rate": 1e-06, + "loss": 0.368, + "step": 865 + }, + { + "epoch": 0.055498590105101255, + "grad_norm": 2.761059961241006, + "learning_rate": 1e-06, + "loss": 0.5015, + "step": 866 + }, + { + "epoch": 0.05556267623686234, + "grad_norm": 2.5272432083229535, + "learning_rate": 1e-06, + "loss": 0.4705, + "step": 867 + }, + { + "epoch": 0.05562676236862343, + "grad_norm": 2.59284819194525, + "learning_rate": 1e-06, + "loss": 0.403, + "step": 868 + }, + { + "epoch": 0.05569084850038452, + "grad_norm": 2.533049808944894, + "learning_rate": 1e-06, + "loss": 0.394, + "step": 869 + }, + { + "epoch": 0.055754934632145606, + "grad_norm": 2.504421255278961, + "learning_rate": 1e-06, + "loss": 0.3689, + "step": 870 + }, + { + "epoch": 0.05581902076390669, + "grad_norm": 2.771424162674612, + "learning_rate": 1e-06, + "loss": 0.4824, + "step": 871 + }, + { + "epoch": 0.05588310689566778, + "grad_norm": 2.5494936421830285, + "learning_rate": 1e-06, + "loss": 0.4088, + "step": 872 + }, + { + "epoch": 0.055947193027428864, + "grad_norm": 2.626180064034078, + "learning_rate": 1e-06, + "loss": 0.4615, + "step": 873 + }, + { + "epoch": 0.05601127915918995, + "grad_norm": 2.845992108008082, + "learning_rate": 1e-06, + "loss": 0.4442, + "step": 874 + }, + { + "epoch": 0.056075365290951036, + "grad_norm": 2.741059913668609, + "learning_rate": 1e-06, + "loss": 0.4047, + "step": 875 + }, + { + "epoch": 0.05613945142271212, + "grad_norm": 2.870070922473511, + "learning_rate": 1e-06, + "loss": 0.4648, + "step": 876 + }, + { + "epoch": 0.056203537554473215, + "grad_norm": 2.5058746300769434, + "learning_rate": 1e-06, + "loss": 0.3809, + "step": 877 + }, + { + "epoch": 0.0562676236862343, + "grad_norm": 2.560616702011279, + "learning_rate": 1e-06, + "loss": 0.4123, + "step": 878 + }, + { + "epoch": 0.05633170981799539, + "grad_norm": 2.6594905071835955, + "learning_rate": 1e-06, + "loss": 0.4145, + "step": 879 + }, + { + "epoch": 0.05639579594975647, + "grad_norm": 2.5214292041437827, + "learning_rate": 1e-06, + "loss": 0.4427, + "step": 880 + }, + { + "epoch": 0.05645988208151756, + "grad_norm": 2.5077576684919407, + "learning_rate": 1e-06, + "loss": 0.4656, + "step": 881 + }, + { + "epoch": 0.056523968213278646, + "grad_norm": 2.5092725959002546, + "learning_rate": 1e-06, + "loss": 0.39, + "step": 882 + }, + { + "epoch": 0.05658805434503973, + "grad_norm": 2.580445517000057, + "learning_rate": 1e-06, + "loss": 0.4093, + "step": 883 + }, + { + "epoch": 0.05665214047680082, + "grad_norm": 2.5735197772804193, + "learning_rate": 1e-06, + "loss": 0.4221, + "step": 884 + }, + { + "epoch": 0.056716226608561904, + "grad_norm": 2.574408375918761, + "learning_rate": 1e-06, + "loss": 0.4423, + "step": 885 + }, + { + "epoch": 0.056780312740323, + "grad_norm": 2.780282548148286, + "learning_rate": 1e-06, + "loss": 0.4947, + "step": 886 + }, + { + "epoch": 0.05684439887208408, + "grad_norm": 2.4928626518173274, + "learning_rate": 1e-06, + "loss": 0.3504, + "step": 887 + }, + { + "epoch": 0.05690848500384517, + "grad_norm": 2.553051950988801, + "learning_rate": 1e-06, + "loss": 0.3975, + "step": 888 + }, + { + "epoch": 0.056972571135606255, + "grad_norm": 2.658416930515607, + "learning_rate": 1e-06, + "loss": 0.4394, + "step": 889 + }, + { + "epoch": 0.05703665726736734, + "grad_norm": 2.485961423782349, + "learning_rate": 1e-06, + "loss": 0.4279, + "step": 890 + }, + { + "epoch": 0.05710074339912843, + "grad_norm": 2.667713489712823, + "learning_rate": 1e-06, + "loss": 0.4705, + "step": 891 + }, + { + "epoch": 0.05716482953088951, + "grad_norm": 2.534564282196064, + "learning_rate": 1e-06, + "loss": 0.3889, + "step": 892 + }, + { + "epoch": 0.0572289156626506, + "grad_norm": 2.4356599669017496, + "learning_rate": 1e-06, + "loss": 0.3985, + "step": 893 + }, + { + "epoch": 0.05729300179441169, + "grad_norm": 2.7300363917836066, + "learning_rate": 1e-06, + "loss": 0.4208, + "step": 894 + }, + { + "epoch": 0.05735708792617278, + "grad_norm": 2.4426263028281503, + "learning_rate": 1e-06, + "loss": 0.4138, + "step": 895 + }, + { + "epoch": 0.057421174057933864, + "grad_norm": 2.4273359063929116, + "learning_rate": 1e-06, + "loss": 0.3869, + "step": 896 + }, + { + "epoch": 0.05748526018969495, + "grad_norm": 2.683961557261951, + "learning_rate": 1e-06, + "loss": 0.4253, + "step": 897 + }, + { + "epoch": 0.05754934632145604, + "grad_norm": 2.716449807419154, + "learning_rate": 1e-06, + "loss": 0.4144, + "step": 898 + }, + { + "epoch": 0.05761343245321712, + "grad_norm": 2.489474578355706, + "learning_rate": 1e-06, + "loss": 0.4242, + "step": 899 + }, + { + "epoch": 0.05767751858497821, + "grad_norm": 2.621299699094432, + "learning_rate": 1e-06, + "loss": 0.3961, + "step": 900 + }, + { + "epoch": 0.057741604716739295, + "grad_norm": 2.7042519805441545, + "learning_rate": 1e-06, + "loss": 0.4085, + "step": 901 + }, + { + "epoch": 0.05780569084850039, + "grad_norm": 2.8709190595594793, + "learning_rate": 1e-06, + "loss": 0.4351, + "step": 902 + }, + { + "epoch": 0.057869776980261474, + "grad_norm": 2.7239595338843543, + "learning_rate": 1e-06, + "loss": 0.4457, + "step": 903 + }, + { + "epoch": 0.05793386311202256, + "grad_norm": 2.6942314635143743, + "learning_rate": 1e-06, + "loss": 0.437, + "step": 904 + }, + { + "epoch": 0.057997949243783646, + "grad_norm": 2.561892748218017, + "learning_rate": 1e-06, + "loss": 0.4445, + "step": 905 + }, + { + "epoch": 0.05806203537554473, + "grad_norm": 2.6940166517257977, + "learning_rate": 1e-06, + "loss": 0.4407, + "step": 906 + }, + { + "epoch": 0.05812612150730582, + "grad_norm": 2.8794761385227083, + "learning_rate": 1e-06, + "loss": 0.4117, + "step": 907 + }, + { + "epoch": 0.058190207639066904, + "grad_norm": 2.6035218967013662, + "learning_rate": 1e-06, + "loss": 0.4566, + "step": 908 + }, + { + "epoch": 0.05825429377082799, + "grad_norm": 2.658548866501046, + "learning_rate": 1e-06, + "loss": 0.4506, + "step": 909 + }, + { + "epoch": 0.058318379902589076, + "grad_norm": 2.7266152577743723, + "learning_rate": 1e-06, + "loss": 0.502, + "step": 910 + }, + { + "epoch": 0.05838246603435017, + "grad_norm": 2.480477229574691, + "learning_rate": 1e-06, + "loss": 0.4381, + "step": 911 + }, + { + "epoch": 0.058446552166111256, + "grad_norm": 2.571579618617921, + "learning_rate": 1e-06, + "loss": 0.4126, + "step": 912 + }, + { + "epoch": 0.05851063829787234, + "grad_norm": 2.673325790509034, + "learning_rate": 1e-06, + "loss": 0.4085, + "step": 913 + }, + { + "epoch": 0.05857472442963343, + "grad_norm": 2.585655375519541, + "learning_rate": 1e-06, + "loss": 0.4323, + "step": 914 + }, + { + "epoch": 0.058638810561394514, + "grad_norm": 2.5704842362508846, + "learning_rate": 1e-06, + "loss": 0.4449, + "step": 915 + }, + { + "epoch": 0.0587028966931556, + "grad_norm": 2.6248859605653276, + "learning_rate": 1e-06, + "loss": 0.4606, + "step": 916 + }, + { + "epoch": 0.058766982824916686, + "grad_norm": 2.5772275270301703, + "learning_rate": 1e-06, + "loss": 0.4135, + "step": 917 + }, + { + "epoch": 0.05883106895667777, + "grad_norm": 2.524083865479341, + "learning_rate": 1e-06, + "loss": 0.4281, + "step": 918 + }, + { + "epoch": 0.058895155088438865, + "grad_norm": 2.5691457666125697, + "learning_rate": 1e-06, + "loss": 0.4609, + "step": 919 + }, + { + "epoch": 0.05895924122019995, + "grad_norm": 2.613129783593128, + "learning_rate": 1e-06, + "loss": 0.4304, + "step": 920 + }, + { + "epoch": 0.05902332735196104, + "grad_norm": 2.645453363847247, + "learning_rate": 1e-06, + "loss": 0.4506, + "step": 921 + }, + { + "epoch": 0.05908741348372212, + "grad_norm": 2.5056096531333516, + "learning_rate": 1e-06, + "loss": 0.3821, + "step": 922 + }, + { + "epoch": 0.05915149961548321, + "grad_norm": 2.8282030249135, + "learning_rate": 1e-06, + "loss": 0.4979, + "step": 923 + }, + { + "epoch": 0.059215585747244295, + "grad_norm": 2.545655906976193, + "learning_rate": 1e-06, + "loss": 0.4205, + "step": 924 + }, + { + "epoch": 0.05927967187900538, + "grad_norm": 2.4742283942662486, + "learning_rate": 1e-06, + "loss": 0.3709, + "step": 925 + }, + { + "epoch": 0.05934375801076647, + "grad_norm": 2.609486423567062, + "learning_rate": 1e-06, + "loss": 0.3883, + "step": 926 + }, + { + "epoch": 0.059407844142527554, + "grad_norm": 2.433383851079032, + "learning_rate": 1e-06, + "loss": 0.465, + "step": 927 + }, + { + "epoch": 0.05947193027428865, + "grad_norm": 2.4780835152841405, + "learning_rate": 1e-06, + "loss": 0.4182, + "step": 928 + }, + { + "epoch": 0.05953601640604973, + "grad_norm": 2.5995419876722883, + "learning_rate": 1e-06, + "loss": 0.4444, + "step": 929 + }, + { + "epoch": 0.05960010253781082, + "grad_norm": 2.802476257110543, + "learning_rate": 1e-06, + "loss": 0.4203, + "step": 930 + }, + { + "epoch": 0.059664188669571905, + "grad_norm": 2.696579952644756, + "learning_rate": 1e-06, + "loss": 0.369, + "step": 931 + }, + { + "epoch": 0.05972827480133299, + "grad_norm": 2.764140417821217, + "learning_rate": 1e-06, + "loss": 0.4162, + "step": 932 + }, + { + "epoch": 0.05979236093309408, + "grad_norm": 2.7153933671973727, + "learning_rate": 1e-06, + "loss": 0.4039, + "step": 933 + }, + { + "epoch": 0.05985644706485516, + "grad_norm": 2.5802340494659806, + "learning_rate": 1e-06, + "loss": 0.423, + "step": 934 + }, + { + "epoch": 0.05992053319661625, + "grad_norm": 2.8596854328037984, + "learning_rate": 1e-06, + "loss": 0.4285, + "step": 935 + }, + { + "epoch": 0.05998461932837734, + "grad_norm": 2.4558109539962976, + "learning_rate": 1e-06, + "loss": 0.3674, + "step": 936 + }, + { + "epoch": 0.06004870546013843, + "grad_norm": 2.565179500088762, + "learning_rate": 1e-06, + "loss": 0.4408, + "step": 937 + }, + { + "epoch": 0.060112791591899514, + "grad_norm": 2.8418067743587625, + "learning_rate": 1e-06, + "loss": 0.3897, + "step": 938 + }, + { + "epoch": 0.0601768777236606, + "grad_norm": 2.664804134655138, + "learning_rate": 1e-06, + "loss": 0.4296, + "step": 939 + }, + { + "epoch": 0.060240963855421686, + "grad_norm": 2.4571750352285804, + "learning_rate": 1e-06, + "loss": 0.3878, + "step": 940 + }, + { + "epoch": 0.06030504998718277, + "grad_norm": 2.61630303904412, + "learning_rate": 1e-06, + "loss": 0.4347, + "step": 941 + }, + { + "epoch": 0.06036913611894386, + "grad_norm": 2.7027575415190896, + "learning_rate": 1e-06, + "loss": 0.3921, + "step": 942 + }, + { + "epoch": 0.060433222250704945, + "grad_norm": 3.010427817021502, + "learning_rate": 1e-06, + "loss": 0.3863, + "step": 943 + }, + { + "epoch": 0.06049730838246604, + "grad_norm": 2.7087312174774065, + "learning_rate": 1e-06, + "loss": 0.3995, + "step": 944 + }, + { + "epoch": 0.060561394514227124, + "grad_norm": 2.4384517149103226, + "learning_rate": 1e-06, + "loss": 0.3957, + "step": 945 + }, + { + "epoch": 0.06062548064598821, + "grad_norm": 2.3482061954090616, + "learning_rate": 1e-06, + "loss": 0.3846, + "step": 946 + }, + { + "epoch": 0.060689566777749296, + "grad_norm": 2.5136646304971033, + "learning_rate": 1e-06, + "loss": 0.4334, + "step": 947 + }, + { + "epoch": 0.06075365290951038, + "grad_norm": 2.5457038683619015, + "learning_rate": 1e-06, + "loss": 0.3798, + "step": 948 + }, + { + "epoch": 0.06081773904127147, + "grad_norm": 2.420555660582623, + "learning_rate": 1e-06, + "loss": 0.3932, + "step": 949 + }, + { + "epoch": 0.060881825173032554, + "grad_norm": 2.6412791172215524, + "learning_rate": 1e-06, + "loss": 0.4662, + "step": 950 + }, + { + "epoch": 0.06094591130479364, + "grad_norm": 2.9291322019172688, + "learning_rate": 1e-06, + "loss": 0.4281, + "step": 951 + }, + { + "epoch": 0.061009997436554726, + "grad_norm": 2.7145651389710177, + "learning_rate": 1e-06, + "loss": 0.4171, + "step": 952 + }, + { + "epoch": 0.06107408356831582, + "grad_norm": 2.6986578195358666, + "learning_rate": 1e-06, + "loss": 0.3915, + "step": 953 + }, + { + "epoch": 0.061138169700076905, + "grad_norm": 2.554566833482712, + "learning_rate": 1e-06, + "loss": 0.4024, + "step": 954 + }, + { + "epoch": 0.06120225583183799, + "grad_norm": 2.476875377384491, + "learning_rate": 1e-06, + "loss": 0.398, + "step": 955 + }, + { + "epoch": 0.06126634196359908, + "grad_norm": 2.587063107108279, + "learning_rate": 1e-06, + "loss": 0.3987, + "step": 956 + }, + { + "epoch": 0.061330428095360164, + "grad_norm": 2.5604522601800404, + "learning_rate": 1e-06, + "loss": 0.4748, + "step": 957 + }, + { + "epoch": 0.06139451422712125, + "grad_norm": 2.5258207322773614, + "learning_rate": 1e-06, + "loss": 0.44, + "step": 958 + }, + { + "epoch": 0.061458600358882336, + "grad_norm": 2.6373864349591107, + "learning_rate": 1e-06, + "loss": 0.4521, + "step": 959 + }, + { + "epoch": 0.06152268649064342, + "grad_norm": 2.702180991893202, + "learning_rate": 1e-06, + "loss": 0.424, + "step": 960 + }, + { + "epoch": 0.061586772622404515, + "grad_norm": 2.4921759193243074, + "learning_rate": 1e-06, + "loss": 0.3869, + "step": 961 + }, + { + "epoch": 0.0616508587541656, + "grad_norm": 2.450635064392266, + "learning_rate": 1e-06, + "loss": 0.407, + "step": 962 + }, + { + "epoch": 0.06171494488592669, + "grad_norm": 2.489839839860473, + "learning_rate": 1e-06, + "loss": 0.4472, + "step": 963 + }, + { + "epoch": 0.06177903101768777, + "grad_norm": 2.563181821436447, + "learning_rate": 1e-06, + "loss": 0.4138, + "step": 964 + }, + { + "epoch": 0.06184311714944886, + "grad_norm": 2.7120831137952637, + "learning_rate": 1e-06, + "loss": 0.3688, + "step": 965 + }, + { + "epoch": 0.061907203281209945, + "grad_norm": 2.623528912470007, + "learning_rate": 1e-06, + "loss": 0.4902, + "step": 966 + }, + { + "epoch": 0.06197128941297103, + "grad_norm": 2.70653288950328, + "learning_rate": 1e-06, + "loss": 0.3872, + "step": 967 + }, + { + "epoch": 0.06203537554473212, + "grad_norm": 2.589210582203363, + "learning_rate": 1e-06, + "loss": 0.4239, + "step": 968 + }, + { + "epoch": 0.06209946167649321, + "grad_norm": 2.5398221503803544, + "learning_rate": 1e-06, + "loss": 0.4281, + "step": 969 + }, + { + "epoch": 0.062163547808254296, + "grad_norm": 2.454628695755026, + "learning_rate": 1e-06, + "loss": 0.4235, + "step": 970 + }, + { + "epoch": 0.06222763394001538, + "grad_norm": 2.848700306726633, + "learning_rate": 1e-06, + "loss": 0.4546, + "step": 971 + }, + { + "epoch": 0.06229172007177647, + "grad_norm": 2.6158789298430816, + "learning_rate": 1e-06, + "loss": 0.4367, + "step": 972 + }, + { + "epoch": 0.062355806203537555, + "grad_norm": 2.4948748651670978, + "learning_rate": 1e-06, + "loss": 0.4011, + "step": 973 + }, + { + "epoch": 0.06241989233529864, + "grad_norm": 2.559300985421126, + "learning_rate": 1e-06, + "loss": 0.3945, + "step": 974 + }, + { + "epoch": 0.06248397846705973, + "grad_norm": 2.506236058687631, + "learning_rate": 1e-06, + "loss": 0.3754, + "step": 975 + }, + { + "epoch": 0.06254806459882081, + "grad_norm": 2.802408713917196, + "learning_rate": 1e-06, + "loss": 0.4177, + "step": 976 + }, + { + "epoch": 0.0626121507305819, + "grad_norm": 2.5662771461384297, + "learning_rate": 1e-06, + "loss": 0.4466, + "step": 977 + }, + { + "epoch": 0.06267623686234298, + "grad_norm": 2.4273852123129736, + "learning_rate": 1e-06, + "loss": 0.431, + "step": 978 + }, + { + "epoch": 0.06274032299410408, + "grad_norm": 2.695506665685708, + "learning_rate": 1e-06, + "loss": 0.4042, + "step": 979 + }, + { + "epoch": 0.06280440912586516, + "grad_norm": 2.5089248203469117, + "learning_rate": 1e-06, + "loss": 0.4118, + "step": 980 + }, + { + "epoch": 0.06286849525762625, + "grad_norm": 2.610225075497294, + "learning_rate": 1e-06, + "loss": 0.425, + "step": 981 + }, + { + "epoch": 0.06293258138938734, + "grad_norm": 2.6515675578565365, + "learning_rate": 1e-06, + "loss": 0.4858, + "step": 982 + }, + { + "epoch": 0.06299666752114842, + "grad_norm": 2.662942034843004, + "learning_rate": 1e-06, + "loss": 0.4146, + "step": 983 + }, + { + "epoch": 0.06306075365290952, + "grad_norm": 2.483294863504189, + "learning_rate": 1e-06, + "loss": 0.4229, + "step": 984 + }, + { + "epoch": 0.0631248397846706, + "grad_norm": 2.774408912609533, + "learning_rate": 1e-06, + "loss": 0.4477, + "step": 985 + }, + { + "epoch": 0.06318892591643169, + "grad_norm": 2.6992735362714746, + "learning_rate": 1e-06, + "loss": 0.3877, + "step": 986 + }, + { + "epoch": 0.06325301204819277, + "grad_norm": 2.574806326857814, + "learning_rate": 1e-06, + "loss": 0.4137, + "step": 987 + }, + { + "epoch": 0.06331709817995386, + "grad_norm": 2.6815055430989316, + "learning_rate": 1e-06, + "loss": 0.4214, + "step": 988 + }, + { + "epoch": 0.06338118431171494, + "grad_norm": 2.519521321534601, + "learning_rate": 1e-06, + "loss": 0.389, + "step": 989 + }, + { + "epoch": 0.06344527044347603, + "grad_norm": 2.33602785029866, + "learning_rate": 1e-06, + "loss": 0.4372, + "step": 990 + }, + { + "epoch": 0.06350935657523712, + "grad_norm": 2.71707391218128, + "learning_rate": 1e-06, + "loss": 0.4465, + "step": 991 + }, + { + "epoch": 0.0635734427069982, + "grad_norm": 2.636719039925872, + "learning_rate": 1e-06, + "loss": 0.3749, + "step": 992 + }, + { + "epoch": 0.0636375288387593, + "grad_norm": 2.4058618312611983, + "learning_rate": 1e-06, + "loss": 0.3711, + "step": 993 + }, + { + "epoch": 0.06370161497052038, + "grad_norm": 2.6341360576740214, + "learning_rate": 1e-06, + "loss": 0.4046, + "step": 994 + }, + { + "epoch": 0.06376570110228147, + "grad_norm": 2.6734271373551226, + "learning_rate": 1e-06, + "loss": 0.4286, + "step": 995 + }, + { + "epoch": 0.06382978723404255, + "grad_norm": 2.448615128413396, + "learning_rate": 1e-06, + "loss": 0.3743, + "step": 996 + }, + { + "epoch": 0.06389387336580364, + "grad_norm": 2.465073956868142, + "learning_rate": 1e-06, + "loss": 0.4479, + "step": 997 + }, + { + "epoch": 0.06395795949756472, + "grad_norm": 2.740786732279467, + "learning_rate": 1e-06, + "loss": 0.4407, + "step": 998 + }, + { + "epoch": 0.06402204562932581, + "grad_norm": 2.9571363244427626, + "learning_rate": 1e-06, + "loss": 0.4127, + "step": 999 + }, + { + "epoch": 0.0640861317610869, + "grad_norm": 2.6266858312799912, + "learning_rate": 1e-06, + "loss": 0.4022, + "step": 1000 + }, + { + "epoch": 0.06415021789284799, + "grad_norm": 2.566332751578703, + "learning_rate": 1e-06, + "loss": 0.4415, + "step": 1001 + }, + { + "epoch": 0.06421430402460908, + "grad_norm": 2.8348967920960946, + "learning_rate": 1e-06, + "loss": 0.4631, + "step": 1002 + }, + { + "epoch": 0.06427839015637016, + "grad_norm": 2.783573396035872, + "learning_rate": 1e-06, + "loss": 0.4117, + "step": 1003 + }, + { + "epoch": 0.06434247628813125, + "grad_norm": 2.573686094710799, + "learning_rate": 1e-06, + "loss": 0.4196, + "step": 1004 + }, + { + "epoch": 0.06440656241989233, + "grad_norm": 2.4367691385400554, + "learning_rate": 1e-06, + "loss": 0.4291, + "step": 1005 + }, + { + "epoch": 0.06447064855165342, + "grad_norm": 2.5403039658130417, + "learning_rate": 1e-06, + "loss": 0.4099, + "step": 1006 + }, + { + "epoch": 0.06453473468341452, + "grad_norm": 2.6028748942628352, + "learning_rate": 1e-06, + "loss": 0.4034, + "step": 1007 + }, + { + "epoch": 0.0645988208151756, + "grad_norm": 2.4662519444773148, + "learning_rate": 1e-06, + "loss": 0.399, + "step": 1008 + }, + { + "epoch": 0.06466290694693669, + "grad_norm": 2.51916140654751, + "learning_rate": 1e-06, + "loss": 0.4602, + "step": 1009 + }, + { + "epoch": 0.06472699307869777, + "grad_norm": 2.5823634639004185, + "learning_rate": 1e-06, + "loss": 0.4225, + "step": 1010 + }, + { + "epoch": 0.06479107921045886, + "grad_norm": 2.778381438547188, + "learning_rate": 1e-06, + "loss": 0.4606, + "step": 1011 + }, + { + "epoch": 0.06485516534221994, + "grad_norm": 2.556163096331897, + "learning_rate": 1e-06, + "loss": 0.3915, + "step": 1012 + }, + { + "epoch": 0.06491925147398103, + "grad_norm": 2.759435920952366, + "learning_rate": 1e-06, + "loss": 0.4893, + "step": 1013 + }, + { + "epoch": 0.06498333760574211, + "grad_norm": 2.6059925655979033, + "learning_rate": 1e-06, + "loss": 0.4138, + "step": 1014 + }, + { + "epoch": 0.0650474237375032, + "grad_norm": 2.6958934341111997, + "learning_rate": 1e-06, + "loss": 0.4431, + "step": 1015 + }, + { + "epoch": 0.0651115098692643, + "grad_norm": 2.5129883965336983, + "learning_rate": 1e-06, + "loss": 0.3692, + "step": 1016 + }, + { + "epoch": 0.06517559600102538, + "grad_norm": 2.617160055089755, + "learning_rate": 1e-06, + "loss": 0.3773, + "step": 1017 + }, + { + "epoch": 0.06523968213278647, + "grad_norm": 2.596654082237727, + "learning_rate": 1e-06, + "loss": 0.3705, + "step": 1018 + }, + { + "epoch": 0.06530376826454755, + "grad_norm": 2.6853485295779964, + "learning_rate": 1e-06, + "loss": 0.4032, + "step": 1019 + }, + { + "epoch": 0.06536785439630864, + "grad_norm": 2.731951426496971, + "learning_rate": 1e-06, + "loss": 0.4325, + "step": 1020 + }, + { + "epoch": 0.06543194052806972, + "grad_norm": 2.615306745896242, + "learning_rate": 1e-06, + "loss": 0.4863, + "step": 1021 + }, + { + "epoch": 0.06549602665983081, + "grad_norm": 2.7173205806268603, + "learning_rate": 1e-06, + "loss": 0.4243, + "step": 1022 + }, + { + "epoch": 0.06556011279159189, + "grad_norm": 2.6805449715216567, + "learning_rate": 1e-06, + "loss": 0.394, + "step": 1023 + }, + { + "epoch": 0.06562419892335299, + "grad_norm": 2.580930131918419, + "learning_rate": 1e-06, + "loss": 0.4065, + "step": 1024 + }, + { + "epoch": 0.06568828505511408, + "grad_norm": 2.597849794416895, + "learning_rate": 1e-06, + "loss": 0.4519, + "step": 1025 + }, + { + "epoch": 0.06575237118687516, + "grad_norm": 2.599092164234108, + "learning_rate": 1e-06, + "loss": 0.4076, + "step": 1026 + }, + { + "epoch": 0.06581645731863625, + "grad_norm": 2.496092273953171, + "learning_rate": 1e-06, + "loss": 0.3984, + "step": 1027 + }, + { + "epoch": 0.06588054345039733, + "grad_norm": 2.5031105951259547, + "learning_rate": 1e-06, + "loss": 0.3922, + "step": 1028 + }, + { + "epoch": 0.06594462958215842, + "grad_norm": 2.580869432971234, + "learning_rate": 1e-06, + "loss": 0.4078, + "step": 1029 + }, + { + "epoch": 0.0660087157139195, + "grad_norm": 2.3408972954509957, + "learning_rate": 1e-06, + "loss": 0.3927, + "step": 1030 + }, + { + "epoch": 0.0660728018456806, + "grad_norm": 2.404906585724594, + "learning_rate": 1e-06, + "loss": 0.4575, + "step": 1031 + }, + { + "epoch": 0.06613688797744169, + "grad_norm": 2.635559029994182, + "learning_rate": 1e-06, + "loss": 0.431, + "step": 1032 + }, + { + "epoch": 0.06620097410920277, + "grad_norm": 2.7759398385370195, + "learning_rate": 1e-06, + "loss": 0.4244, + "step": 1033 + }, + { + "epoch": 0.06626506024096386, + "grad_norm": 2.7080679996277444, + "learning_rate": 1e-06, + "loss": 0.4226, + "step": 1034 + }, + { + "epoch": 0.06632914637272494, + "grad_norm": 2.5479868488751345, + "learning_rate": 1e-06, + "loss": 0.4295, + "step": 1035 + }, + { + "epoch": 0.06639323250448603, + "grad_norm": 2.5285529800347284, + "learning_rate": 1e-06, + "loss": 0.4695, + "step": 1036 + }, + { + "epoch": 0.06645731863624711, + "grad_norm": 2.6519770537147407, + "learning_rate": 1e-06, + "loss": 0.4483, + "step": 1037 + }, + { + "epoch": 0.0665214047680082, + "grad_norm": 2.533724568813007, + "learning_rate": 1e-06, + "loss": 0.4122, + "step": 1038 + }, + { + "epoch": 0.06658549089976928, + "grad_norm": 2.521873472809296, + "learning_rate": 1e-06, + "loss": 0.3602, + "step": 1039 + }, + { + "epoch": 0.06664957703153038, + "grad_norm": 2.8267483275910443, + "learning_rate": 1e-06, + "loss": 0.4167, + "step": 1040 + }, + { + "epoch": 0.06671366316329147, + "grad_norm": 2.467528084282326, + "learning_rate": 1e-06, + "loss": 0.4311, + "step": 1041 + }, + { + "epoch": 0.06677774929505255, + "grad_norm": 2.561755755376321, + "learning_rate": 1e-06, + "loss": 0.4501, + "step": 1042 + }, + { + "epoch": 0.06684183542681364, + "grad_norm": 2.6711046370183373, + "learning_rate": 1e-06, + "loss": 0.3674, + "step": 1043 + }, + { + "epoch": 0.06690592155857472, + "grad_norm": 2.7762462243085735, + "learning_rate": 1e-06, + "loss": 0.434, + "step": 1044 + }, + { + "epoch": 0.06697000769033581, + "grad_norm": 2.5653129293074213, + "learning_rate": 1e-06, + "loss": 0.372, + "step": 1045 + }, + { + "epoch": 0.0670340938220969, + "grad_norm": 2.706282450560898, + "learning_rate": 1e-06, + "loss": 0.444, + "step": 1046 + }, + { + "epoch": 0.06709817995385799, + "grad_norm": 2.622565934269703, + "learning_rate": 1e-06, + "loss": 0.4176, + "step": 1047 + }, + { + "epoch": 0.06716226608561907, + "grad_norm": 2.781376720798491, + "learning_rate": 1e-06, + "loss": 0.4173, + "step": 1048 + }, + { + "epoch": 0.06722635221738016, + "grad_norm": 2.8092295996044228, + "learning_rate": 1e-06, + "loss": 0.4043, + "step": 1049 + }, + { + "epoch": 0.06729043834914125, + "grad_norm": 2.4394636221770556, + "learning_rate": 1e-06, + "loss": 0.5048, + "step": 1050 + }, + { + "epoch": 0.06735452448090233, + "grad_norm": 2.452372374028152, + "learning_rate": 1e-06, + "loss": 0.4492, + "step": 1051 + }, + { + "epoch": 0.06741861061266342, + "grad_norm": 2.3619178075691307, + "learning_rate": 1e-06, + "loss": 0.3886, + "step": 1052 + }, + { + "epoch": 0.0674826967444245, + "grad_norm": 2.800387735902425, + "learning_rate": 1e-06, + "loss": 0.4693, + "step": 1053 + }, + { + "epoch": 0.0675467828761856, + "grad_norm": 2.6335374299550316, + "learning_rate": 1e-06, + "loss": 0.386, + "step": 1054 + }, + { + "epoch": 0.06761086900794668, + "grad_norm": 2.6774069184367177, + "learning_rate": 1e-06, + "loss": 0.3891, + "step": 1055 + }, + { + "epoch": 0.06767495513970777, + "grad_norm": 2.7446624322855886, + "learning_rate": 1e-06, + "loss": 0.4485, + "step": 1056 + }, + { + "epoch": 0.06773904127146885, + "grad_norm": 2.5979310178492585, + "learning_rate": 1e-06, + "loss": 0.3936, + "step": 1057 + }, + { + "epoch": 0.06780312740322994, + "grad_norm": 2.565876902003627, + "learning_rate": 1e-06, + "loss": 0.4408, + "step": 1058 + }, + { + "epoch": 0.06786721353499103, + "grad_norm": 2.670634426279264, + "learning_rate": 1e-06, + "loss": 0.4304, + "step": 1059 + }, + { + "epoch": 0.06793129966675211, + "grad_norm": 2.6113141898048293, + "learning_rate": 1e-06, + "loss": 0.3744, + "step": 1060 + }, + { + "epoch": 0.0679953857985132, + "grad_norm": 2.7067225739696212, + "learning_rate": 1e-06, + "loss": 0.4545, + "step": 1061 + }, + { + "epoch": 0.06805947193027428, + "grad_norm": 2.793453856124571, + "learning_rate": 1e-06, + "loss": 0.4007, + "step": 1062 + }, + { + "epoch": 0.06812355806203538, + "grad_norm": 2.756931561264504, + "learning_rate": 1e-06, + "loss": 0.4316, + "step": 1063 + }, + { + "epoch": 0.06818764419379646, + "grad_norm": 2.597642965053177, + "learning_rate": 1e-06, + "loss": 0.3865, + "step": 1064 + }, + { + "epoch": 0.06825173032555755, + "grad_norm": 2.6663999203185385, + "learning_rate": 1e-06, + "loss": 0.4348, + "step": 1065 + }, + { + "epoch": 0.06831581645731864, + "grad_norm": 2.8609809542202487, + "learning_rate": 1e-06, + "loss": 0.3609, + "step": 1066 + }, + { + "epoch": 0.06837990258907972, + "grad_norm": 2.652883468791596, + "learning_rate": 1e-06, + "loss": 0.4343, + "step": 1067 + }, + { + "epoch": 0.06844398872084081, + "grad_norm": 2.9494281680723646, + "learning_rate": 1e-06, + "loss": 0.3445, + "step": 1068 + }, + { + "epoch": 0.0685080748526019, + "grad_norm": 2.6494659758233983, + "learning_rate": 1e-06, + "loss": 0.4532, + "step": 1069 + }, + { + "epoch": 0.06857216098436299, + "grad_norm": 2.687343190573424, + "learning_rate": 1e-06, + "loss": 0.4343, + "step": 1070 + }, + { + "epoch": 0.06863624711612407, + "grad_norm": 2.522789534611682, + "learning_rate": 1e-06, + "loss": 0.4329, + "step": 1071 + }, + { + "epoch": 0.06870033324788516, + "grad_norm": 2.668827458426054, + "learning_rate": 1e-06, + "loss": 0.3978, + "step": 1072 + }, + { + "epoch": 0.06876441937964624, + "grad_norm": 2.5380058649439676, + "learning_rate": 1e-06, + "loss": 0.3881, + "step": 1073 + }, + { + "epoch": 0.06882850551140733, + "grad_norm": 2.5193129973843287, + "learning_rate": 1e-06, + "loss": 0.3855, + "step": 1074 + }, + { + "epoch": 0.06889259164316842, + "grad_norm": 2.6544689506543473, + "learning_rate": 1e-06, + "loss": 0.4046, + "step": 1075 + }, + { + "epoch": 0.0689566777749295, + "grad_norm": 2.752467894455635, + "learning_rate": 1e-06, + "loss": 0.4065, + "step": 1076 + }, + { + "epoch": 0.0690207639066906, + "grad_norm": 2.6254575219907803, + "learning_rate": 1e-06, + "loss": 0.3959, + "step": 1077 + }, + { + "epoch": 0.06908485003845168, + "grad_norm": 2.5639116654438254, + "learning_rate": 1e-06, + "loss": 0.4176, + "step": 1078 + }, + { + "epoch": 0.06914893617021277, + "grad_norm": 2.816930112267722, + "learning_rate": 1e-06, + "loss": 0.4839, + "step": 1079 + }, + { + "epoch": 0.06921302230197385, + "grad_norm": 2.6424585821266313, + "learning_rate": 1e-06, + "loss": 0.4152, + "step": 1080 + }, + { + "epoch": 0.06927710843373494, + "grad_norm": 2.598199833909251, + "learning_rate": 1e-06, + "loss": 0.4057, + "step": 1081 + }, + { + "epoch": 0.06934119456549602, + "grad_norm": 2.6818415968968385, + "learning_rate": 1e-06, + "loss": 0.3833, + "step": 1082 + }, + { + "epoch": 0.06940528069725711, + "grad_norm": 2.7587247856867463, + "learning_rate": 1e-06, + "loss": 0.4621, + "step": 1083 + }, + { + "epoch": 0.0694693668290182, + "grad_norm": 2.669134371598925, + "learning_rate": 1e-06, + "loss": 0.4269, + "step": 1084 + }, + { + "epoch": 0.06953345296077929, + "grad_norm": 2.506076752346152, + "learning_rate": 1e-06, + "loss": 0.4204, + "step": 1085 + }, + { + "epoch": 0.06959753909254038, + "grad_norm": 2.5858512706812475, + "learning_rate": 1e-06, + "loss": 0.4303, + "step": 1086 + }, + { + "epoch": 0.06966162522430146, + "grad_norm": 2.661423397910268, + "learning_rate": 1e-06, + "loss": 0.4027, + "step": 1087 + }, + { + "epoch": 0.06972571135606255, + "grad_norm": 2.5027730869065947, + "learning_rate": 1e-06, + "loss": 0.369, + "step": 1088 + }, + { + "epoch": 0.06978979748782363, + "grad_norm": 2.5945275163137262, + "learning_rate": 1e-06, + "loss": 0.4553, + "step": 1089 + }, + { + "epoch": 0.06985388361958472, + "grad_norm": 2.847369043442018, + "learning_rate": 1e-06, + "loss": 0.4604, + "step": 1090 + }, + { + "epoch": 0.06991796975134582, + "grad_norm": 2.513424892957305, + "learning_rate": 1e-06, + "loss": 0.3673, + "step": 1091 + }, + { + "epoch": 0.0699820558831069, + "grad_norm": 2.7317286397718914, + "learning_rate": 1e-06, + "loss": 0.4119, + "step": 1092 + }, + { + "epoch": 0.07004614201486799, + "grad_norm": 2.4113150816480986, + "learning_rate": 1e-06, + "loss": 0.415, + "step": 1093 + }, + { + "epoch": 0.07011022814662907, + "grad_norm": 2.6688521108098375, + "learning_rate": 1e-06, + "loss": 0.4192, + "step": 1094 + }, + { + "epoch": 0.07017431427839016, + "grad_norm": 2.5120122115920633, + "learning_rate": 1e-06, + "loss": 0.4205, + "step": 1095 + }, + { + "epoch": 0.07023840041015124, + "grad_norm": 2.65132626428691, + "learning_rate": 1e-06, + "loss": 0.4336, + "step": 1096 + }, + { + "epoch": 0.07030248654191233, + "grad_norm": 2.6400689871000056, + "learning_rate": 1e-06, + "loss": 0.4401, + "step": 1097 + }, + { + "epoch": 0.07036657267367341, + "grad_norm": 2.467297818144308, + "learning_rate": 1e-06, + "loss": 0.4335, + "step": 1098 + }, + { + "epoch": 0.0704306588054345, + "grad_norm": 2.478710251122422, + "learning_rate": 1e-06, + "loss": 0.4625, + "step": 1099 + }, + { + "epoch": 0.0704947449371956, + "grad_norm": 2.506389963438959, + "learning_rate": 1e-06, + "loss": 0.4214, + "step": 1100 + }, + { + "epoch": 0.07055883106895668, + "grad_norm": 2.571801825624722, + "learning_rate": 1e-06, + "loss": 0.3919, + "step": 1101 + }, + { + "epoch": 0.07062291720071777, + "grad_norm": 2.419710867732795, + "learning_rate": 1e-06, + "loss": 0.3828, + "step": 1102 + }, + { + "epoch": 0.07068700333247885, + "grad_norm": 2.599190425523394, + "learning_rate": 1e-06, + "loss": 0.3828, + "step": 1103 + }, + { + "epoch": 0.07075108946423994, + "grad_norm": 2.607254163899496, + "learning_rate": 1e-06, + "loss": 0.4243, + "step": 1104 + }, + { + "epoch": 0.07081517559600102, + "grad_norm": 2.358621192505561, + "learning_rate": 1e-06, + "loss": 0.3716, + "step": 1105 + }, + { + "epoch": 0.07087926172776211, + "grad_norm": 2.539685863939085, + "learning_rate": 1e-06, + "loss": 0.4441, + "step": 1106 + }, + { + "epoch": 0.07094334785952319, + "grad_norm": 2.4938855723828772, + "learning_rate": 1e-06, + "loss": 0.3537, + "step": 1107 + }, + { + "epoch": 0.07100743399128429, + "grad_norm": 2.5238087008493304, + "learning_rate": 1e-06, + "loss": 0.45, + "step": 1108 + }, + { + "epoch": 0.07107152012304538, + "grad_norm": 2.7442192669650094, + "learning_rate": 1e-06, + "loss": 0.3776, + "step": 1109 + }, + { + "epoch": 0.07113560625480646, + "grad_norm": 2.5839741345460627, + "learning_rate": 1e-06, + "loss": 0.5042, + "step": 1110 + }, + { + "epoch": 0.07119969238656755, + "grad_norm": 2.6663297246069355, + "learning_rate": 1e-06, + "loss": 0.3484, + "step": 1111 + }, + { + "epoch": 0.07126377851832863, + "grad_norm": 2.598906923733249, + "learning_rate": 1e-06, + "loss": 0.431, + "step": 1112 + }, + { + "epoch": 0.07132786465008972, + "grad_norm": 2.6202715077828085, + "learning_rate": 1e-06, + "loss": 0.388, + "step": 1113 + }, + { + "epoch": 0.0713919507818508, + "grad_norm": 2.4825489453871974, + "learning_rate": 1e-06, + "loss": 0.4029, + "step": 1114 + }, + { + "epoch": 0.0714560369136119, + "grad_norm": 3.6753116517960454, + "learning_rate": 1e-06, + "loss": 0.4002, + "step": 1115 + }, + { + "epoch": 0.07152012304537299, + "grad_norm": 2.6693428456376393, + "learning_rate": 1e-06, + "loss": 0.4982, + "step": 1116 + }, + { + "epoch": 0.07158420917713407, + "grad_norm": 2.3868609496718136, + "learning_rate": 1e-06, + "loss": 0.462, + "step": 1117 + }, + { + "epoch": 0.07164829530889516, + "grad_norm": 2.7746202720611635, + "learning_rate": 1e-06, + "loss": 0.3875, + "step": 1118 + }, + { + "epoch": 0.07171238144065624, + "grad_norm": 2.4389606051674395, + "learning_rate": 1e-06, + "loss": 0.427, + "step": 1119 + }, + { + "epoch": 0.07177646757241733, + "grad_norm": 2.743220624439705, + "learning_rate": 1e-06, + "loss": 0.4451, + "step": 1120 + }, + { + "epoch": 0.07184055370417841, + "grad_norm": 2.430870254174703, + "learning_rate": 1e-06, + "loss": 0.3746, + "step": 1121 + }, + { + "epoch": 0.0719046398359395, + "grad_norm": 2.534393202382834, + "learning_rate": 1e-06, + "loss": 0.371, + "step": 1122 + }, + { + "epoch": 0.07196872596770058, + "grad_norm": 2.7817519211332393, + "learning_rate": 1e-06, + "loss": 0.4502, + "step": 1123 + }, + { + "epoch": 0.07203281209946168, + "grad_norm": 2.9100386240706517, + "learning_rate": 1e-06, + "loss": 0.3884, + "step": 1124 + }, + { + "epoch": 0.07209689823122277, + "grad_norm": 2.826915075639032, + "learning_rate": 1e-06, + "loss": 0.4718, + "step": 1125 + }, + { + "epoch": 0.07216098436298385, + "grad_norm": 2.6944257999670826, + "learning_rate": 1e-06, + "loss": 0.4484, + "step": 1126 + }, + { + "epoch": 0.07222507049474494, + "grad_norm": 2.4687223760052435, + "learning_rate": 1e-06, + "loss": 0.4122, + "step": 1127 + }, + { + "epoch": 0.07228915662650602, + "grad_norm": 2.4178966947676646, + "learning_rate": 1e-06, + "loss": 0.3825, + "step": 1128 + }, + { + "epoch": 0.07235324275826711, + "grad_norm": 2.752834047078145, + "learning_rate": 1e-06, + "loss": 0.4372, + "step": 1129 + }, + { + "epoch": 0.0724173288900282, + "grad_norm": 2.6488729911222206, + "learning_rate": 1e-06, + "loss": 0.42, + "step": 1130 + }, + { + "epoch": 0.07248141502178929, + "grad_norm": 2.501959418170512, + "learning_rate": 1e-06, + "loss": 0.3996, + "step": 1131 + }, + { + "epoch": 0.07254550115355037, + "grad_norm": 2.596725048264129, + "learning_rate": 1e-06, + "loss": 0.401, + "step": 1132 + }, + { + "epoch": 0.07260958728531146, + "grad_norm": 2.787403116588525, + "learning_rate": 1e-06, + "loss": 0.4004, + "step": 1133 + }, + { + "epoch": 0.07267367341707255, + "grad_norm": 2.658465773541618, + "learning_rate": 1e-06, + "loss": 0.4132, + "step": 1134 + }, + { + "epoch": 0.07273775954883363, + "grad_norm": 2.482592801042877, + "learning_rate": 1e-06, + "loss": 0.4216, + "step": 1135 + }, + { + "epoch": 0.07280184568059472, + "grad_norm": 2.6974112554570926, + "learning_rate": 1e-06, + "loss": 0.4661, + "step": 1136 + }, + { + "epoch": 0.0728659318123558, + "grad_norm": 2.7703393520834614, + "learning_rate": 1e-06, + "loss": 0.4758, + "step": 1137 + }, + { + "epoch": 0.0729300179441169, + "grad_norm": 2.52696180165321, + "learning_rate": 1e-06, + "loss": 0.4251, + "step": 1138 + }, + { + "epoch": 0.07299410407587797, + "grad_norm": 2.543291457166555, + "learning_rate": 1e-06, + "loss": 0.412, + "step": 1139 + }, + { + "epoch": 0.07305819020763907, + "grad_norm": 2.518067877477988, + "learning_rate": 1e-06, + "loss": 0.4186, + "step": 1140 + }, + { + "epoch": 0.07312227633940016, + "grad_norm": 2.649865343001732, + "learning_rate": 1e-06, + "loss": 0.4216, + "step": 1141 + }, + { + "epoch": 0.07318636247116124, + "grad_norm": 2.6317013476687454, + "learning_rate": 1e-06, + "loss": 0.4166, + "step": 1142 + }, + { + "epoch": 0.07325044860292233, + "grad_norm": 2.5713731021703294, + "learning_rate": 1e-06, + "loss": 0.3974, + "step": 1143 + }, + { + "epoch": 0.07331453473468341, + "grad_norm": 2.7931203073361788, + "learning_rate": 1e-06, + "loss": 0.4184, + "step": 1144 + }, + { + "epoch": 0.0733786208664445, + "grad_norm": 2.599003814107677, + "learning_rate": 1e-06, + "loss": 0.4507, + "step": 1145 + }, + { + "epoch": 0.07344270699820558, + "grad_norm": 2.7684915380392354, + "learning_rate": 1e-06, + "loss": 0.4471, + "step": 1146 + }, + { + "epoch": 0.07350679312996668, + "grad_norm": 2.4796130862109735, + "learning_rate": 1e-06, + "loss": 0.4646, + "step": 1147 + }, + { + "epoch": 0.07357087926172776, + "grad_norm": 2.691420190189633, + "learning_rate": 1e-06, + "loss": 0.4762, + "step": 1148 + }, + { + "epoch": 0.07363496539348885, + "grad_norm": 2.7664381558741065, + "learning_rate": 1e-06, + "loss": 0.448, + "step": 1149 + }, + { + "epoch": 0.07369905152524994, + "grad_norm": 2.5162422680138814, + "learning_rate": 1e-06, + "loss": 0.4012, + "step": 1150 + }, + { + "epoch": 0.07376313765701102, + "grad_norm": 2.5781952395234633, + "learning_rate": 1e-06, + "loss": 0.3675, + "step": 1151 + }, + { + "epoch": 0.07382722378877211, + "grad_norm": 2.6625898353464157, + "learning_rate": 1e-06, + "loss": 0.379, + "step": 1152 + }, + { + "epoch": 0.0738913099205332, + "grad_norm": 2.5901429074988553, + "learning_rate": 1e-06, + "loss": 0.4186, + "step": 1153 + }, + { + "epoch": 0.07395539605229429, + "grad_norm": 2.7942392683449846, + "learning_rate": 1e-06, + "loss": 0.3987, + "step": 1154 + }, + { + "epoch": 0.07401948218405537, + "grad_norm": 2.6654460748878286, + "learning_rate": 1e-06, + "loss": 0.409, + "step": 1155 + }, + { + "epoch": 0.07408356831581646, + "grad_norm": 2.5476306648846294, + "learning_rate": 1e-06, + "loss": 0.4411, + "step": 1156 + }, + { + "epoch": 0.07414765444757754, + "grad_norm": 2.591611279936759, + "learning_rate": 1e-06, + "loss": 0.4648, + "step": 1157 + }, + { + "epoch": 0.07421174057933863, + "grad_norm": 2.6704534006186087, + "learning_rate": 1e-06, + "loss": 0.4079, + "step": 1158 + }, + { + "epoch": 0.07427582671109972, + "grad_norm": 2.671768868673718, + "learning_rate": 1e-06, + "loss": 0.4416, + "step": 1159 + }, + { + "epoch": 0.0743399128428608, + "grad_norm": 2.4873744200996404, + "learning_rate": 1e-06, + "loss": 0.4139, + "step": 1160 + }, + { + "epoch": 0.0744039989746219, + "grad_norm": 2.4953535418400232, + "learning_rate": 1e-06, + "loss": 0.4331, + "step": 1161 + }, + { + "epoch": 0.07446808510638298, + "grad_norm": 2.526056293976957, + "learning_rate": 1e-06, + "loss": 0.4408, + "step": 1162 + }, + { + "epoch": 0.07453217123814407, + "grad_norm": 2.588818835559356, + "learning_rate": 1e-06, + "loss": 0.4776, + "step": 1163 + }, + { + "epoch": 0.07459625736990515, + "grad_norm": 2.666576479172023, + "learning_rate": 1e-06, + "loss": 0.4057, + "step": 1164 + }, + { + "epoch": 0.07466034350166624, + "grad_norm": 2.961651092072556, + "learning_rate": 1e-06, + "loss": 0.5216, + "step": 1165 + }, + { + "epoch": 0.07472442963342732, + "grad_norm": 2.520589368360706, + "learning_rate": 1e-06, + "loss": 0.4646, + "step": 1166 + }, + { + "epoch": 0.07478851576518841, + "grad_norm": 2.5775375985686435, + "learning_rate": 1e-06, + "loss": 0.4166, + "step": 1167 + }, + { + "epoch": 0.0748526018969495, + "grad_norm": 2.618379504403705, + "learning_rate": 1e-06, + "loss": 0.3859, + "step": 1168 + }, + { + "epoch": 0.07491668802871058, + "grad_norm": 2.701174665426732, + "learning_rate": 1e-06, + "loss": 0.4412, + "step": 1169 + }, + { + "epoch": 0.07498077416047168, + "grad_norm": 2.879711459534598, + "learning_rate": 1e-06, + "loss": 0.4384, + "step": 1170 + }, + { + "epoch": 0.07504486029223276, + "grad_norm": 2.437675394493719, + "learning_rate": 1e-06, + "loss": 0.4061, + "step": 1171 + }, + { + "epoch": 0.07510894642399385, + "grad_norm": 2.378217037869276, + "learning_rate": 1e-06, + "loss": 0.4029, + "step": 1172 + }, + { + "epoch": 0.07517303255575493, + "grad_norm": 2.6864533867239104, + "learning_rate": 1e-06, + "loss": 0.4255, + "step": 1173 + }, + { + "epoch": 0.07523711868751602, + "grad_norm": 2.9070178703797755, + "learning_rate": 1e-06, + "loss": 0.4432, + "step": 1174 + }, + { + "epoch": 0.07530120481927711, + "grad_norm": 2.394244267169891, + "learning_rate": 1e-06, + "loss": 0.3855, + "step": 1175 + }, + { + "epoch": 0.0753652909510382, + "grad_norm": 2.613237904551317, + "learning_rate": 1e-06, + "loss": 0.3646, + "step": 1176 + }, + { + "epoch": 0.07542937708279929, + "grad_norm": 2.5704274998773995, + "learning_rate": 1e-06, + "loss": 0.4264, + "step": 1177 + }, + { + "epoch": 0.07549346321456037, + "grad_norm": 2.647086455310922, + "learning_rate": 1e-06, + "loss": 0.4598, + "step": 1178 + }, + { + "epoch": 0.07555754934632146, + "grad_norm": 2.4851338144046866, + "learning_rate": 1e-06, + "loss": 0.3922, + "step": 1179 + }, + { + "epoch": 0.07562163547808254, + "grad_norm": 2.6992112402066373, + "learning_rate": 1e-06, + "loss": 0.4165, + "step": 1180 + }, + { + "epoch": 0.07568572160984363, + "grad_norm": 2.443187282946552, + "learning_rate": 1e-06, + "loss": 0.421, + "step": 1181 + }, + { + "epoch": 0.07574980774160471, + "grad_norm": 2.5382215760682336, + "learning_rate": 1e-06, + "loss": 0.3895, + "step": 1182 + }, + { + "epoch": 0.0758138938733658, + "grad_norm": 2.6296100830741005, + "learning_rate": 1e-06, + "loss": 0.427, + "step": 1183 + }, + { + "epoch": 0.0758779800051269, + "grad_norm": 2.437492691585318, + "learning_rate": 1e-06, + "loss": 0.4015, + "step": 1184 + }, + { + "epoch": 0.07594206613688798, + "grad_norm": 2.5072485842248082, + "learning_rate": 1e-06, + "loss": 0.3903, + "step": 1185 + }, + { + "epoch": 0.07600615226864907, + "grad_norm": 2.6751785189294455, + "learning_rate": 1e-06, + "loss": 0.4258, + "step": 1186 + }, + { + "epoch": 0.07607023840041015, + "grad_norm": 2.603241674792852, + "learning_rate": 1e-06, + "loss": 0.3732, + "step": 1187 + }, + { + "epoch": 0.07613432453217124, + "grad_norm": 2.4459489689703724, + "learning_rate": 1e-06, + "loss": 0.4164, + "step": 1188 + }, + { + "epoch": 0.07619841066393232, + "grad_norm": 2.936377164469067, + "learning_rate": 1e-06, + "loss": 0.4382, + "step": 1189 + }, + { + "epoch": 0.07626249679569341, + "grad_norm": 2.581801369589373, + "learning_rate": 1e-06, + "loss": 0.4646, + "step": 1190 + }, + { + "epoch": 0.07632658292745449, + "grad_norm": 2.6802303045344513, + "learning_rate": 1e-06, + "loss": 0.3503, + "step": 1191 + }, + { + "epoch": 0.07639066905921559, + "grad_norm": 2.6291026196838994, + "learning_rate": 1e-06, + "loss": 0.3919, + "step": 1192 + }, + { + "epoch": 0.07645475519097668, + "grad_norm": 2.5858158792263684, + "learning_rate": 1e-06, + "loss": 0.4262, + "step": 1193 + }, + { + "epoch": 0.07651884132273776, + "grad_norm": 2.658961870365221, + "learning_rate": 1e-06, + "loss": 0.4472, + "step": 1194 + }, + { + "epoch": 0.07658292745449885, + "grad_norm": 2.6170446639780445, + "learning_rate": 1e-06, + "loss": 0.3768, + "step": 1195 + }, + { + "epoch": 0.07664701358625993, + "grad_norm": 2.449050801281128, + "learning_rate": 1e-06, + "loss": 0.4166, + "step": 1196 + }, + { + "epoch": 0.07671109971802102, + "grad_norm": 2.510377162453613, + "learning_rate": 1e-06, + "loss": 0.4748, + "step": 1197 + }, + { + "epoch": 0.0767751858497821, + "grad_norm": 2.408176741926398, + "learning_rate": 1e-06, + "loss": 0.3704, + "step": 1198 + }, + { + "epoch": 0.0768392719815432, + "grad_norm": 2.510120365800847, + "learning_rate": 1e-06, + "loss": 0.4427, + "step": 1199 + }, + { + "epoch": 0.07690335811330429, + "grad_norm": 2.570673514283029, + "learning_rate": 1e-06, + "loss": 0.3971, + "step": 1200 + }, + { + "epoch": 0.07696744424506537, + "grad_norm": 2.560138302433304, + "learning_rate": 1e-06, + "loss": 0.4463, + "step": 1201 + }, + { + "epoch": 0.07703153037682646, + "grad_norm": 2.6974205413109953, + "learning_rate": 1e-06, + "loss": 0.4467, + "step": 1202 + }, + { + "epoch": 0.07709561650858754, + "grad_norm": 2.6092456235211183, + "learning_rate": 1e-06, + "loss": 0.3956, + "step": 1203 + }, + { + "epoch": 0.07715970264034863, + "grad_norm": 2.64955554722162, + "learning_rate": 1e-06, + "loss": 0.4856, + "step": 1204 + }, + { + "epoch": 0.07722378877210971, + "grad_norm": 2.8128688811623337, + "learning_rate": 1e-06, + "loss": 0.4009, + "step": 1205 + }, + { + "epoch": 0.0772878749038708, + "grad_norm": 2.5192936294423074, + "learning_rate": 1e-06, + "loss": 0.4242, + "step": 1206 + }, + { + "epoch": 0.07735196103563188, + "grad_norm": 2.68278659957716, + "learning_rate": 1e-06, + "loss": 0.4704, + "step": 1207 + }, + { + "epoch": 0.07741604716739298, + "grad_norm": 2.6168633523606064, + "learning_rate": 1e-06, + "loss": 0.3387, + "step": 1208 + }, + { + "epoch": 0.07748013329915407, + "grad_norm": 2.6114164138499887, + "learning_rate": 1e-06, + "loss": 0.4408, + "step": 1209 + }, + { + "epoch": 0.07754421943091515, + "grad_norm": 2.6161792038015053, + "learning_rate": 1e-06, + "loss": 0.406, + "step": 1210 + }, + { + "epoch": 0.07760830556267624, + "grad_norm": 2.7520314159563033, + "learning_rate": 1e-06, + "loss": 0.3833, + "step": 1211 + }, + { + "epoch": 0.07767239169443732, + "grad_norm": 2.774588610799305, + "learning_rate": 1e-06, + "loss": 0.5036, + "step": 1212 + }, + { + "epoch": 0.07773647782619841, + "grad_norm": 2.5033422537991994, + "learning_rate": 1e-06, + "loss": 0.4058, + "step": 1213 + }, + { + "epoch": 0.07780056395795949, + "grad_norm": 2.73481240056627, + "learning_rate": 1e-06, + "loss": 0.4345, + "step": 1214 + }, + { + "epoch": 0.07786465008972059, + "grad_norm": 2.6851998419349026, + "learning_rate": 1e-06, + "loss": 0.4647, + "step": 1215 + }, + { + "epoch": 0.07792873622148166, + "grad_norm": 2.7237868971195596, + "learning_rate": 1e-06, + "loss": 0.4302, + "step": 1216 + }, + { + "epoch": 0.07799282235324276, + "grad_norm": 2.541489190154758, + "learning_rate": 1e-06, + "loss": 0.3638, + "step": 1217 + }, + { + "epoch": 0.07805690848500385, + "grad_norm": 2.464482316323183, + "learning_rate": 1e-06, + "loss": 0.4373, + "step": 1218 + }, + { + "epoch": 0.07812099461676493, + "grad_norm": 2.572853991725539, + "learning_rate": 1e-06, + "loss": 0.4129, + "step": 1219 + }, + { + "epoch": 0.07818508074852602, + "grad_norm": 2.6668459095154216, + "learning_rate": 1e-06, + "loss": 0.4249, + "step": 1220 + }, + { + "epoch": 0.0782491668802871, + "grad_norm": 2.782296312797821, + "learning_rate": 1e-06, + "loss": 0.4296, + "step": 1221 + }, + { + "epoch": 0.0783132530120482, + "grad_norm": 2.6235991335311937, + "learning_rate": 1e-06, + "loss": 0.4165, + "step": 1222 + }, + { + "epoch": 0.07837733914380927, + "grad_norm": 2.7714978455242028, + "learning_rate": 1e-06, + "loss": 0.364, + "step": 1223 + }, + { + "epoch": 0.07844142527557037, + "grad_norm": 2.3752856980488266, + "learning_rate": 1e-06, + "loss": 0.3848, + "step": 1224 + }, + { + "epoch": 0.07850551140733146, + "grad_norm": 2.7992009765527515, + "learning_rate": 1e-06, + "loss": 0.3977, + "step": 1225 + }, + { + "epoch": 0.07856959753909254, + "grad_norm": 2.633436222450441, + "learning_rate": 1e-06, + "loss": 0.4539, + "step": 1226 + }, + { + "epoch": 0.07863368367085363, + "grad_norm": 2.577156182418178, + "learning_rate": 1e-06, + "loss": 0.4118, + "step": 1227 + }, + { + "epoch": 0.07869776980261471, + "grad_norm": 2.720243320628882, + "learning_rate": 1e-06, + "loss": 0.4416, + "step": 1228 + }, + { + "epoch": 0.0787618559343758, + "grad_norm": 2.7155698373395354, + "learning_rate": 1e-06, + "loss": 0.4508, + "step": 1229 + }, + { + "epoch": 0.07882594206613688, + "grad_norm": 2.343026422126699, + "learning_rate": 1e-06, + "loss": 0.4059, + "step": 1230 + }, + { + "epoch": 0.07889002819789798, + "grad_norm": 2.5559831262150574, + "learning_rate": 1e-06, + "loss": 0.4686, + "step": 1231 + }, + { + "epoch": 0.07895411432965906, + "grad_norm": 2.493013880324132, + "learning_rate": 1e-06, + "loss": 0.4157, + "step": 1232 + }, + { + "epoch": 0.07901820046142015, + "grad_norm": 2.5095068573703463, + "learning_rate": 1e-06, + "loss": 0.4078, + "step": 1233 + }, + { + "epoch": 0.07908228659318124, + "grad_norm": 2.6581528818814975, + "learning_rate": 1e-06, + "loss": 0.4285, + "step": 1234 + }, + { + "epoch": 0.07914637272494232, + "grad_norm": 2.7654271167249793, + "learning_rate": 1e-06, + "loss": 0.47, + "step": 1235 + }, + { + "epoch": 0.07921045885670341, + "grad_norm": 2.6893412954625853, + "learning_rate": 1e-06, + "loss": 0.4038, + "step": 1236 + }, + { + "epoch": 0.0792745449884645, + "grad_norm": 2.456340889110768, + "learning_rate": 1e-06, + "loss": 0.4103, + "step": 1237 + }, + { + "epoch": 0.07933863112022559, + "grad_norm": 2.556312174382247, + "learning_rate": 1e-06, + "loss": 0.4516, + "step": 1238 + }, + { + "epoch": 0.07940271725198667, + "grad_norm": 2.6753669316919417, + "learning_rate": 1e-06, + "loss": 0.4232, + "step": 1239 + }, + { + "epoch": 0.07946680338374776, + "grad_norm": 2.6035452011928903, + "learning_rate": 1e-06, + "loss": 0.3405, + "step": 1240 + }, + { + "epoch": 0.07953088951550884, + "grad_norm": 2.4434288383339946, + "learning_rate": 1e-06, + "loss": 0.4881, + "step": 1241 + }, + { + "epoch": 0.07959497564726993, + "grad_norm": 2.662120824414917, + "learning_rate": 1e-06, + "loss": 0.382, + "step": 1242 + }, + { + "epoch": 0.07965906177903102, + "grad_norm": 2.6177518200932, + "learning_rate": 1e-06, + "loss": 0.4306, + "step": 1243 + }, + { + "epoch": 0.0797231479107921, + "grad_norm": 2.4114846966775767, + "learning_rate": 1e-06, + "loss": 0.3994, + "step": 1244 + }, + { + "epoch": 0.0797872340425532, + "grad_norm": 2.6835316926141863, + "learning_rate": 1e-06, + "loss": 0.4706, + "step": 1245 + }, + { + "epoch": 0.07985132017431427, + "grad_norm": 2.5651819030486496, + "learning_rate": 1e-06, + "loss": 0.4262, + "step": 1246 + }, + { + "epoch": 0.07991540630607537, + "grad_norm": 2.3934181889049126, + "learning_rate": 1e-06, + "loss": 0.4224, + "step": 1247 + }, + { + "epoch": 0.07997949243783645, + "grad_norm": 2.643579877871764, + "learning_rate": 1e-06, + "loss": 0.4524, + "step": 1248 + }, + { + "epoch": 0.08004357856959754, + "grad_norm": 2.5313599900999337, + "learning_rate": 1e-06, + "loss": 0.364, + "step": 1249 + }, + { + "epoch": 0.08010766470135863, + "grad_norm": 2.619558211523681, + "learning_rate": 1e-06, + "loss": 0.3862, + "step": 1250 + }, + { + "epoch": 0.08017175083311971, + "grad_norm": 2.5146257629070097, + "learning_rate": 1e-06, + "loss": 0.4005, + "step": 1251 + }, + { + "epoch": 0.0802358369648808, + "grad_norm": 2.591349223638374, + "learning_rate": 1e-06, + "loss": 0.4145, + "step": 1252 + }, + { + "epoch": 0.08029992309664188, + "grad_norm": 2.59587465957244, + "learning_rate": 1e-06, + "loss": 0.437, + "step": 1253 + }, + { + "epoch": 0.08036400922840298, + "grad_norm": 2.9832966117856667, + "learning_rate": 1e-06, + "loss": 0.4246, + "step": 1254 + }, + { + "epoch": 0.08042809536016406, + "grad_norm": 2.6040546777081657, + "learning_rate": 1e-06, + "loss": 0.3766, + "step": 1255 + }, + { + "epoch": 0.08049218149192515, + "grad_norm": 2.496979332905621, + "learning_rate": 1e-06, + "loss": 0.4219, + "step": 1256 + }, + { + "epoch": 0.08055626762368623, + "grad_norm": 2.629851175282855, + "learning_rate": 1e-06, + "loss": 0.3876, + "step": 1257 + }, + { + "epoch": 0.08062035375544732, + "grad_norm": 2.5674825630257896, + "learning_rate": 1e-06, + "loss": 0.4325, + "step": 1258 + }, + { + "epoch": 0.08068443988720841, + "grad_norm": 2.940799171634024, + "learning_rate": 1e-06, + "loss": 0.468, + "step": 1259 + }, + { + "epoch": 0.0807485260189695, + "grad_norm": 2.650948845848417, + "learning_rate": 1e-06, + "loss": 0.4211, + "step": 1260 + }, + { + "epoch": 0.08081261215073059, + "grad_norm": 2.679500087359039, + "learning_rate": 1e-06, + "loss": 0.4538, + "step": 1261 + }, + { + "epoch": 0.08087669828249167, + "grad_norm": 2.390614137526815, + "learning_rate": 1e-06, + "loss": 0.3643, + "step": 1262 + }, + { + "epoch": 0.08094078441425276, + "grad_norm": 2.6276093482133867, + "learning_rate": 1e-06, + "loss": 0.4292, + "step": 1263 + }, + { + "epoch": 0.08100487054601384, + "grad_norm": 2.61781590271726, + "learning_rate": 1e-06, + "loss": 0.4178, + "step": 1264 + }, + { + "epoch": 0.08106895667777493, + "grad_norm": 2.779503761768041, + "learning_rate": 1e-06, + "loss": 0.4117, + "step": 1265 + }, + { + "epoch": 0.08113304280953601, + "grad_norm": 2.561366232789244, + "learning_rate": 1e-06, + "loss": 0.4184, + "step": 1266 + }, + { + "epoch": 0.0811971289412971, + "grad_norm": 2.8717807574636174, + "learning_rate": 1e-06, + "loss": 0.4089, + "step": 1267 + }, + { + "epoch": 0.0812612150730582, + "grad_norm": 2.7921707046886977, + "learning_rate": 1e-06, + "loss": 0.4527, + "step": 1268 + }, + { + "epoch": 0.08132530120481928, + "grad_norm": 2.5928313310418565, + "learning_rate": 1e-06, + "loss": 0.4103, + "step": 1269 + }, + { + "epoch": 0.08138938733658037, + "grad_norm": 2.9418199515370085, + "learning_rate": 1e-06, + "loss": 0.4114, + "step": 1270 + }, + { + "epoch": 0.08145347346834145, + "grad_norm": 2.474771537185245, + "learning_rate": 1e-06, + "loss": 0.4398, + "step": 1271 + }, + { + "epoch": 0.08151755960010254, + "grad_norm": 2.4288233134888104, + "learning_rate": 1e-06, + "loss": 0.4071, + "step": 1272 + }, + { + "epoch": 0.08158164573186362, + "grad_norm": 2.728783148369517, + "learning_rate": 1e-06, + "loss": 0.3944, + "step": 1273 + }, + { + "epoch": 0.08164573186362471, + "grad_norm": 2.595463299855329, + "learning_rate": 1e-06, + "loss": 0.3961, + "step": 1274 + }, + { + "epoch": 0.08170981799538579, + "grad_norm": 2.4186223539565472, + "learning_rate": 1e-06, + "loss": 0.3815, + "step": 1275 + }, + { + "epoch": 0.08177390412714688, + "grad_norm": 2.5691674213334412, + "learning_rate": 1e-06, + "loss": 0.4347, + "step": 1276 + }, + { + "epoch": 0.08183799025890798, + "grad_norm": 2.772988911794295, + "learning_rate": 1e-06, + "loss": 0.4498, + "step": 1277 + }, + { + "epoch": 0.08190207639066906, + "grad_norm": 2.5770033358021176, + "learning_rate": 1e-06, + "loss": 0.4306, + "step": 1278 + }, + { + "epoch": 0.08196616252243015, + "grad_norm": 2.7479896736468534, + "learning_rate": 1e-06, + "loss": 0.4102, + "step": 1279 + }, + { + "epoch": 0.08203024865419123, + "grad_norm": 2.463361879732375, + "learning_rate": 1e-06, + "loss": 0.4234, + "step": 1280 + }, + { + "epoch": 0.08209433478595232, + "grad_norm": 2.5773689957088974, + "learning_rate": 1e-06, + "loss": 0.4061, + "step": 1281 + }, + { + "epoch": 0.0821584209177134, + "grad_norm": 2.7031134347973205, + "learning_rate": 1e-06, + "loss": 0.3718, + "step": 1282 + }, + { + "epoch": 0.0822225070494745, + "grad_norm": 2.3685374130275436, + "learning_rate": 1e-06, + "loss": 0.4034, + "step": 1283 + }, + { + "epoch": 0.08228659318123559, + "grad_norm": 2.5898467699040904, + "learning_rate": 1e-06, + "loss": 0.5035, + "step": 1284 + }, + { + "epoch": 0.08235067931299667, + "grad_norm": 2.566905337207484, + "learning_rate": 1e-06, + "loss": 0.4253, + "step": 1285 + }, + { + "epoch": 0.08241476544475776, + "grad_norm": 2.7563322922109172, + "learning_rate": 1e-06, + "loss": 0.3726, + "step": 1286 + }, + { + "epoch": 0.08247885157651884, + "grad_norm": 2.491496676280438, + "learning_rate": 1e-06, + "loss": 0.37, + "step": 1287 + }, + { + "epoch": 0.08254293770827993, + "grad_norm": 2.6730510178537608, + "learning_rate": 1e-06, + "loss": 0.384, + "step": 1288 + }, + { + "epoch": 0.08260702384004101, + "grad_norm": 2.607473514155551, + "learning_rate": 1e-06, + "loss": 0.4547, + "step": 1289 + }, + { + "epoch": 0.0826711099718021, + "grad_norm": 2.664953352790864, + "learning_rate": 1e-06, + "loss": 0.4457, + "step": 1290 + }, + { + "epoch": 0.08273519610356318, + "grad_norm": 2.9065089288452675, + "learning_rate": 1e-06, + "loss": 0.4133, + "step": 1291 + }, + { + "epoch": 0.08279928223532428, + "grad_norm": 2.761268347894189, + "learning_rate": 1e-06, + "loss": 0.4013, + "step": 1292 + }, + { + "epoch": 0.08286336836708537, + "grad_norm": 2.53874487571015, + "learning_rate": 1e-06, + "loss": 0.4745, + "step": 1293 + }, + { + "epoch": 0.08292745449884645, + "grad_norm": 2.5396881198425603, + "learning_rate": 1e-06, + "loss": 0.4437, + "step": 1294 + }, + { + "epoch": 0.08299154063060754, + "grad_norm": 2.547768008887249, + "learning_rate": 1e-06, + "loss": 0.3651, + "step": 1295 + }, + { + "epoch": 0.08305562676236862, + "grad_norm": 2.499185905675141, + "learning_rate": 1e-06, + "loss": 0.4153, + "step": 1296 + }, + { + "epoch": 0.08311971289412971, + "grad_norm": 2.4836012038697457, + "learning_rate": 1e-06, + "loss": 0.4259, + "step": 1297 + }, + { + "epoch": 0.08318379902589079, + "grad_norm": 2.621155242220239, + "learning_rate": 1e-06, + "loss": 0.4255, + "step": 1298 + }, + { + "epoch": 0.08324788515765189, + "grad_norm": 2.592565931013178, + "learning_rate": 1e-06, + "loss": 0.3688, + "step": 1299 + }, + { + "epoch": 0.08331197128941296, + "grad_norm": 2.5498345680795924, + "learning_rate": 1e-06, + "loss": 0.3821, + "step": 1300 + }, + { + "epoch": 0.08337605742117406, + "grad_norm": 2.4929209233901295, + "learning_rate": 1e-06, + "loss": 0.4552, + "step": 1301 + }, + { + "epoch": 0.08344014355293515, + "grad_norm": 2.6882977211963905, + "learning_rate": 1e-06, + "loss": 0.4062, + "step": 1302 + }, + { + "epoch": 0.08350422968469623, + "grad_norm": 2.473705037392068, + "learning_rate": 1e-06, + "loss": 0.3817, + "step": 1303 + }, + { + "epoch": 0.08356831581645732, + "grad_norm": 2.5317476990261465, + "learning_rate": 1e-06, + "loss": 0.3704, + "step": 1304 + }, + { + "epoch": 0.0836324019482184, + "grad_norm": 2.5664114423982074, + "learning_rate": 1e-06, + "loss": 0.4016, + "step": 1305 + }, + { + "epoch": 0.0836964880799795, + "grad_norm": 2.5912791482520943, + "learning_rate": 1e-06, + "loss": 0.392, + "step": 1306 + }, + { + "epoch": 0.08376057421174057, + "grad_norm": 2.9901280232724976, + "learning_rate": 1e-06, + "loss": 0.4434, + "step": 1307 + }, + { + "epoch": 0.08382466034350167, + "grad_norm": 2.5967756023713995, + "learning_rate": 1e-06, + "loss": 0.3927, + "step": 1308 + }, + { + "epoch": 0.08388874647526276, + "grad_norm": 2.73521407336545, + "learning_rate": 1e-06, + "loss": 0.4241, + "step": 1309 + }, + { + "epoch": 0.08395283260702384, + "grad_norm": 2.919428684494416, + "learning_rate": 1e-06, + "loss": 0.4205, + "step": 1310 + }, + { + "epoch": 0.08401691873878493, + "grad_norm": 2.7758438993320484, + "learning_rate": 1e-06, + "loss": 0.4519, + "step": 1311 + }, + { + "epoch": 0.08408100487054601, + "grad_norm": 2.715976314244008, + "learning_rate": 1e-06, + "loss": 0.3993, + "step": 1312 + }, + { + "epoch": 0.0841450910023071, + "grad_norm": 2.5924326736887213, + "learning_rate": 1e-06, + "loss": 0.4025, + "step": 1313 + }, + { + "epoch": 0.08420917713406818, + "grad_norm": 2.7752283664217017, + "learning_rate": 1e-06, + "loss": 0.3956, + "step": 1314 + }, + { + "epoch": 0.08427326326582928, + "grad_norm": 2.4160129671872337, + "learning_rate": 1e-06, + "loss": 0.4106, + "step": 1315 + }, + { + "epoch": 0.08433734939759036, + "grad_norm": 2.5407463488739053, + "learning_rate": 1e-06, + "loss": 0.4163, + "step": 1316 + }, + { + "epoch": 0.08440143552935145, + "grad_norm": 2.801380710310921, + "learning_rate": 1e-06, + "loss": 0.3871, + "step": 1317 + }, + { + "epoch": 0.08446552166111254, + "grad_norm": 2.6269565817549925, + "learning_rate": 1e-06, + "loss": 0.4203, + "step": 1318 + }, + { + "epoch": 0.08452960779287362, + "grad_norm": 2.5243065786237944, + "learning_rate": 1e-06, + "loss": 0.373, + "step": 1319 + }, + { + "epoch": 0.08459369392463471, + "grad_norm": 2.8116644364078986, + "learning_rate": 1e-06, + "loss": 0.4163, + "step": 1320 + }, + { + "epoch": 0.08465778005639579, + "grad_norm": 2.575799701620765, + "learning_rate": 1e-06, + "loss": 0.4277, + "step": 1321 + }, + { + "epoch": 0.08472186618815689, + "grad_norm": 2.75865288811887, + "learning_rate": 1e-06, + "loss": 0.4687, + "step": 1322 + }, + { + "epoch": 0.08478595231991796, + "grad_norm": 2.7042258614953467, + "learning_rate": 1e-06, + "loss": 0.4051, + "step": 1323 + }, + { + "epoch": 0.08485003845167906, + "grad_norm": 2.6044252860407213, + "learning_rate": 1e-06, + "loss": 0.4243, + "step": 1324 + }, + { + "epoch": 0.08491412458344014, + "grad_norm": 2.655140452465128, + "learning_rate": 1e-06, + "loss": 0.4242, + "step": 1325 + }, + { + "epoch": 0.08497821071520123, + "grad_norm": 2.6694681152353072, + "learning_rate": 1e-06, + "loss": 0.4693, + "step": 1326 + }, + { + "epoch": 0.08504229684696232, + "grad_norm": 2.6321438736595364, + "learning_rate": 1e-06, + "loss": 0.447, + "step": 1327 + }, + { + "epoch": 0.0851063829787234, + "grad_norm": 2.4939365988778572, + "learning_rate": 1e-06, + "loss": 0.384, + "step": 1328 + }, + { + "epoch": 0.0851704691104845, + "grad_norm": 2.7686831102984186, + "learning_rate": 1e-06, + "loss": 0.4096, + "step": 1329 + }, + { + "epoch": 0.08523455524224557, + "grad_norm": 2.4734804109635533, + "learning_rate": 1e-06, + "loss": 0.4172, + "step": 1330 + }, + { + "epoch": 0.08529864137400667, + "grad_norm": 2.3310389802867224, + "learning_rate": 1e-06, + "loss": 0.4499, + "step": 1331 + }, + { + "epoch": 0.08536272750576775, + "grad_norm": 2.5427992914352733, + "learning_rate": 1e-06, + "loss": 0.5163, + "step": 1332 + }, + { + "epoch": 0.08542681363752884, + "grad_norm": 2.550742214022169, + "learning_rate": 1e-06, + "loss": 0.3351, + "step": 1333 + }, + { + "epoch": 0.08549089976928993, + "grad_norm": 2.3491321424112823, + "learning_rate": 1e-06, + "loss": 0.3927, + "step": 1334 + }, + { + "epoch": 0.08555498590105101, + "grad_norm": 7.183934088342243, + "learning_rate": 1e-06, + "loss": 0.4585, + "step": 1335 + }, + { + "epoch": 0.0856190720328121, + "grad_norm": 2.727510164110277, + "learning_rate": 1e-06, + "loss": 0.403, + "step": 1336 + }, + { + "epoch": 0.08568315816457318, + "grad_norm": 2.463122951016014, + "learning_rate": 1e-06, + "loss": 0.3738, + "step": 1337 + }, + { + "epoch": 0.08574724429633428, + "grad_norm": 2.4246794004524856, + "learning_rate": 1e-06, + "loss": 0.4131, + "step": 1338 + }, + { + "epoch": 0.08581133042809536, + "grad_norm": 2.505910282661219, + "learning_rate": 1e-06, + "loss": 0.3954, + "step": 1339 + }, + { + "epoch": 0.08587541655985645, + "grad_norm": 2.7449797274573906, + "learning_rate": 1e-06, + "loss": 0.4332, + "step": 1340 + }, + { + "epoch": 0.08593950269161753, + "grad_norm": 2.602286871420609, + "learning_rate": 1e-06, + "loss": 0.4568, + "step": 1341 + }, + { + "epoch": 0.08600358882337862, + "grad_norm": 2.4878615903294303, + "learning_rate": 1e-06, + "loss": 0.3549, + "step": 1342 + }, + { + "epoch": 0.08606767495513971, + "grad_norm": 2.377510719482693, + "learning_rate": 1e-06, + "loss": 0.4109, + "step": 1343 + }, + { + "epoch": 0.0861317610869008, + "grad_norm": 2.5420518872600377, + "learning_rate": 1e-06, + "loss": 0.402, + "step": 1344 + }, + { + "epoch": 0.08619584721866189, + "grad_norm": 2.619739587147752, + "learning_rate": 1e-06, + "loss": 0.431, + "step": 1345 + }, + { + "epoch": 0.08625993335042297, + "grad_norm": 2.5009317514537237, + "learning_rate": 1e-06, + "loss": 0.433, + "step": 1346 + }, + { + "epoch": 0.08632401948218406, + "grad_norm": 3.129107498488318, + "learning_rate": 1e-06, + "loss": 0.3725, + "step": 1347 + }, + { + "epoch": 0.08638810561394514, + "grad_norm": 2.475758470143281, + "learning_rate": 1e-06, + "loss": 0.45, + "step": 1348 + }, + { + "epoch": 0.08645219174570623, + "grad_norm": 2.181341026400154, + "learning_rate": 1e-06, + "loss": 0.3971, + "step": 1349 + }, + { + "epoch": 0.08651627787746731, + "grad_norm": 2.4954104610238024, + "learning_rate": 1e-06, + "loss": 0.4054, + "step": 1350 + }, + { + "epoch": 0.0865803640092284, + "grad_norm": 2.6628673858751255, + "learning_rate": 1e-06, + "loss": 0.4717, + "step": 1351 + }, + { + "epoch": 0.0866444501409895, + "grad_norm": 2.6090665119674967, + "learning_rate": 1e-06, + "loss": 0.4544, + "step": 1352 + }, + { + "epoch": 0.08670853627275057, + "grad_norm": 2.6018719333061986, + "learning_rate": 1e-06, + "loss": 0.4399, + "step": 1353 + }, + { + "epoch": 0.08677262240451167, + "grad_norm": 2.4434199739785827, + "learning_rate": 1e-06, + "loss": 0.415, + "step": 1354 + }, + { + "epoch": 0.08683670853627275, + "grad_norm": 2.769828363674665, + "learning_rate": 1e-06, + "loss": 0.4179, + "step": 1355 + }, + { + "epoch": 0.08690079466803384, + "grad_norm": 2.7397769472949247, + "learning_rate": 1e-06, + "loss": 0.4056, + "step": 1356 + }, + { + "epoch": 0.08696488079979492, + "grad_norm": 2.6172378352859726, + "learning_rate": 1e-06, + "loss": 0.4165, + "step": 1357 + }, + { + "epoch": 0.08702896693155601, + "grad_norm": 2.539813707503197, + "learning_rate": 1e-06, + "loss": 0.4665, + "step": 1358 + }, + { + "epoch": 0.08709305306331709, + "grad_norm": 2.6327431750979695, + "learning_rate": 1e-06, + "loss": 0.4685, + "step": 1359 + }, + { + "epoch": 0.08715713919507818, + "grad_norm": 2.7709842385532455, + "learning_rate": 1e-06, + "loss": 0.495, + "step": 1360 + }, + { + "epoch": 0.08722122532683928, + "grad_norm": 2.5015074677105753, + "learning_rate": 1e-06, + "loss": 0.4131, + "step": 1361 + }, + { + "epoch": 0.08728531145860036, + "grad_norm": 2.5041650372229305, + "learning_rate": 1e-06, + "loss": 0.4544, + "step": 1362 + }, + { + "epoch": 0.08734939759036145, + "grad_norm": 2.71401356392254, + "learning_rate": 1e-06, + "loss": 0.4458, + "step": 1363 + }, + { + "epoch": 0.08741348372212253, + "grad_norm": 2.531157702107666, + "learning_rate": 1e-06, + "loss": 0.3734, + "step": 1364 + }, + { + "epoch": 0.08747756985388362, + "grad_norm": 2.626679235090447, + "learning_rate": 1e-06, + "loss": 0.4067, + "step": 1365 + }, + { + "epoch": 0.0875416559856447, + "grad_norm": 2.5951080452068704, + "learning_rate": 1e-06, + "loss": 0.3876, + "step": 1366 + }, + { + "epoch": 0.0876057421174058, + "grad_norm": 2.646236280137177, + "learning_rate": 1e-06, + "loss": 0.4023, + "step": 1367 + }, + { + "epoch": 0.08766982824916689, + "grad_norm": 2.565685965978931, + "learning_rate": 1e-06, + "loss": 0.3883, + "step": 1368 + }, + { + "epoch": 0.08773391438092797, + "grad_norm": 2.566570629622735, + "learning_rate": 1e-06, + "loss": 0.406, + "step": 1369 + }, + { + "epoch": 0.08779800051268906, + "grad_norm": 2.558331006355462, + "learning_rate": 1e-06, + "loss": 0.4179, + "step": 1370 + }, + { + "epoch": 0.08786208664445014, + "grad_norm": 2.812026841029743, + "learning_rate": 1e-06, + "loss": 0.4102, + "step": 1371 + }, + { + "epoch": 0.08792617277621123, + "grad_norm": 2.593914945874754, + "learning_rate": 1e-06, + "loss": 0.4115, + "step": 1372 + }, + { + "epoch": 0.08799025890797231, + "grad_norm": 2.673191899027595, + "learning_rate": 1e-06, + "loss": 0.4581, + "step": 1373 + }, + { + "epoch": 0.0880543450397334, + "grad_norm": 2.6943890741457452, + "learning_rate": 1e-06, + "loss": 0.3679, + "step": 1374 + }, + { + "epoch": 0.08811843117149448, + "grad_norm": 2.7105087759870248, + "learning_rate": 1e-06, + "loss": 0.3861, + "step": 1375 + }, + { + "epoch": 0.08818251730325558, + "grad_norm": 2.6588773058390567, + "learning_rate": 1e-06, + "loss": 0.4262, + "step": 1376 + }, + { + "epoch": 0.08824660343501667, + "grad_norm": 2.624668121030694, + "learning_rate": 1e-06, + "loss": 0.4408, + "step": 1377 + }, + { + "epoch": 0.08831068956677775, + "grad_norm": 2.567696520761241, + "learning_rate": 1e-06, + "loss": 0.3944, + "step": 1378 + }, + { + "epoch": 0.08837477569853884, + "grad_norm": 2.525791516217746, + "learning_rate": 1e-06, + "loss": 0.3737, + "step": 1379 + }, + { + "epoch": 0.08843886183029992, + "grad_norm": 2.5589736672483916, + "learning_rate": 1e-06, + "loss": 0.376, + "step": 1380 + }, + { + "epoch": 0.08850294796206101, + "grad_norm": 2.8070743006208843, + "learning_rate": 1e-06, + "loss": 0.456, + "step": 1381 + }, + { + "epoch": 0.08856703409382209, + "grad_norm": 2.5993874207613006, + "learning_rate": 1e-06, + "loss": 0.4197, + "step": 1382 + }, + { + "epoch": 0.08863112022558318, + "grad_norm": 2.5051066249655825, + "learning_rate": 1e-06, + "loss": 0.385, + "step": 1383 + }, + { + "epoch": 0.08869520635734426, + "grad_norm": 2.405748485125088, + "learning_rate": 1e-06, + "loss": 0.4049, + "step": 1384 + }, + { + "epoch": 0.08875929248910536, + "grad_norm": 2.6145829424746965, + "learning_rate": 1e-06, + "loss": 0.4247, + "step": 1385 + }, + { + "epoch": 0.08882337862086645, + "grad_norm": 2.6173900351223964, + "learning_rate": 1e-06, + "loss": 0.4221, + "step": 1386 + }, + { + "epoch": 0.08888746475262753, + "grad_norm": 2.6806113776153246, + "learning_rate": 1e-06, + "loss": 0.4292, + "step": 1387 + }, + { + "epoch": 0.08895155088438862, + "grad_norm": 2.796773269226567, + "learning_rate": 1e-06, + "loss": 0.4554, + "step": 1388 + }, + { + "epoch": 0.0890156370161497, + "grad_norm": 2.570155349023824, + "learning_rate": 1e-06, + "loss": 0.4609, + "step": 1389 + }, + { + "epoch": 0.0890797231479108, + "grad_norm": 2.651717663982742, + "learning_rate": 1e-06, + "loss": 0.3871, + "step": 1390 + }, + { + "epoch": 0.08914380927967187, + "grad_norm": 2.9890601911555574, + "learning_rate": 1e-06, + "loss": 0.4246, + "step": 1391 + }, + { + "epoch": 0.08920789541143297, + "grad_norm": 2.816018482470761, + "learning_rate": 1e-06, + "loss": 0.4105, + "step": 1392 + }, + { + "epoch": 0.08927198154319406, + "grad_norm": 2.806504598338548, + "learning_rate": 1e-06, + "loss": 0.4205, + "step": 1393 + }, + { + "epoch": 0.08933606767495514, + "grad_norm": 2.773138983571822, + "learning_rate": 1e-06, + "loss": 0.439, + "step": 1394 + }, + { + "epoch": 0.08940015380671623, + "grad_norm": 2.685056882343485, + "learning_rate": 1e-06, + "loss": 0.4444, + "step": 1395 + }, + { + "epoch": 0.08946423993847731, + "grad_norm": 2.9257692535431485, + "learning_rate": 1e-06, + "loss": 0.4641, + "step": 1396 + }, + { + "epoch": 0.0895283260702384, + "grad_norm": 2.4710812395258905, + "learning_rate": 1e-06, + "loss": 0.4463, + "step": 1397 + }, + { + "epoch": 0.08959241220199948, + "grad_norm": 2.70288826605684, + "learning_rate": 1e-06, + "loss": 0.4057, + "step": 1398 + }, + { + "epoch": 0.08965649833376058, + "grad_norm": 2.4331622087280045, + "learning_rate": 1e-06, + "loss": 0.452, + "step": 1399 + }, + { + "epoch": 0.08972058446552166, + "grad_norm": 2.324146163478173, + "learning_rate": 1e-06, + "loss": 0.4225, + "step": 1400 + }, + { + "epoch": 0.08978467059728275, + "grad_norm": 2.6508507688510754, + "learning_rate": 1e-06, + "loss": 0.4437, + "step": 1401 + }, + { + "epoch": 0.08984875672904384, + "grad_norm": 2.6863035227529344, + "learning_rate": 1e-06, + "loss": 0.4067, + "step": 1402 + }, + { + "epoch": 0.08991284286080492, + "grad_norm": 2.5644169132895493, + "learning_rate": 1e-06, + "loss": 0.431, + "step": 1403 + }, + { + "epoch": 0.08997692899256601, + "grad_norm": 2.4924959261581043, + "learning_rate": 1e-06, + "loss": 0.4128, + "step": 1404 + }, + { + "epoch": 0.09004101512432709, + "grad_norm": 2.680924691244528, + "learning_rate": 1e-06, + "loss": 0.4216, + "step": 1405 + }, + { + "epoch": 0.09010510125608819, + "grad_norm": 2.5648825557723898, + "learning_rate": 1e-06, + "loss": 0.3999, + "step": 1406 + }, + { + "epoch": 0.09016918738784926, + "grad_norm": 2.6750967093694578, + "learning_rate": 1e-06, + "loss": 0.3778, + "step": 1407 + }, + { + "epoch": 0.09023327351961036, + "grad_norm": 2.6244024961560157, + "learning_rate": 1e-06, + "loss": 0.4582, + "step": 1408 + }, + { + "epoch": 0.09029735965137144, + "grad_norm": 2.632057969960539, + "learning_rate": 1e-06, + "loss": 0.4357, + "step": 1409 + }, + { + "epoch": 0.09036144578313253, + "grad_norm": 2.6751906186648946, + "learning_rate": 1e-06, + "loss": 0.413, + "step": 1410 + }, + { + "epoch": 0.09042553191489362, + "grad_norm": 2.561674070651734, + "learning_rate": 1e-06, + "loss": 0.4063, + "step": 1411 + }, + { + "epoch": 0.0904896180466547, + "grad_norm": 2.6877044774483827, + "learning_rate": 1e-06, + "loss": 0.4689, + "step": 1412 + }, + { + "epoch": 0.0905537041784158, + "grad_norm": 2.5896953889358607, + "learning_rate": 1e-06, + "loss": 0.4043, + "step": 1413 + }, + { + "epoch": 0.09061779031017687, + "grad_norm": 2.421302707798909, + "learning_rate": 1e-06, + "loss": 0.3714, + "step": 1414 + }, + { + "epoch": 0.09068187644193797, + "grad_norm": 2.7539329450540277, + "learning_rate": 1e-06, + "loss": 0.3832, + "step": 1415 + }, + { + "epoch": 0.09074596257369905, + "grad_norm": 2.6356933313255277, + "learning_rate": 1e-06, + "loss": 0.3381, + "step": 1416 + }, + { + "epoch": 0.09081004870546014, + "grad_norm": 2.36289210209112, + "learning_rate": 1e-06, + "loss": 0.4371, + "step": 1417 + }, + { + "epoch": 0.09087413483722123, + "grad_norm": 2.6024801679680905, + "learning_rate": 1e-06, + "loss": 0.4301, + "step": 1418 + }, + { + "epoch": 0.09093822096898231, + "grad_norm": 2.5452100945357117, + "learning_rate": 1e-06, + "loss": 0.4905, + "step": 1419 + }, + { + "epoch": 0.0910023071007434, + "grad_norm": 2.471533347199074, + "learning_rate": 1e-06, + "loss": 0.3574, + "step": 1420 + }, + { + "epoch": 0.09106639323250448, + "grad_norm": 2.684599051779771, + "learning_rate": 1e-06, + "loss": 0.4455, + "step": 1421 + }, + { + "epoch": 0.09113047936426558, + "grad_norm": 2.6706289104425345, + "learning_rate": 1e-06, + "loss": 0.4298, + "step": 1422 + }, + { + "epoch": 0.09119456549602666, + "grad_norm": 2.7372861347057706, + "learning_rate": 1e-06, + "loss": 0.4044, + "step": 1423 + }, + { + "epoch": 0.09125865162778775, + "grad_norm": 2.6311831051474974, + "learning_rate": 1e-06, + "loss": 0.4472, + "step": 1424 + }, + { + "epoch": 0.09132273775954883, + "grad_norm": 2.505239288524563, + "learning_rate": 1e-06, + "loss": 0.4608, + "step": 1425 + }, + { + "epoch": 0.09138682389130992, + "grad_norm": 2.455646364875542, + "learning_rate": 1e-06, + "loss": 0.4488, + "step": 1426 + }, + { + "epoch": 0.09145091002307101, + "grad_norm": 2.732934220116128, + "learning_rate": 1e-06, + "loss": 0.4206, + "step": 1427 + }, + { + "epoch": 0.09151499615483209, + "grad_norm": 2.74323981935251, + "learning_rate": 1e-06, + "loss": 0.4234, + "step": 1428 + }, + { + "epoch": 0.09157908228659319, + "grad_norm": 2.7158136731219646, + "learning_rate": 1e-06, + "loss": 0.4209, + "step": 1429 + }, + { + "epoch": 0.09164316841835426, + "grad_norm": 2.6247631977279595, + "learning_rate": 1e-06, + "loss": 0.4421, + "step": 1430 + }, + { + "epoch": 0.09170725455011536, + "grad_norm": 2.637304136296822, + "learning_rate": 1e-06, + "loss": 0.484, + "step": 1431 + }, + { + "epoch": 0.09177134068187644, + "grad_norm": 2.852998156294522, + "learning_rate": 1e-06, + "loss": 0.4155, + "step": 1432 + }, + { + "epoch": 0.09183542681363753, + "grad_norm": 2.781417431625871, + "learning_rate": 1e-06, + "loss": 0.4231, + "step": 1433 + }, + { + "epoch": 0.09189951294539861, + "grad_norm": 2.628138673747872, + "learning_rate": 1e-06, + "loss": 0.3949, + "step": 1434 + }, + { + "epoch": 0.0919635990771597, + "grad_norm": 2.667218051474725, + "learning_rate": 1e-06, + "loss": 0.3949, + "step": 1435 + }, + { + "epoch": 0.0920276852089208, + "grad_norm": 2.786734610663218, + "learning_rate": 1e-06, + "loss": 0.4462, + "step": 1436 + }, + { + "epoch": 0.09209177134068187, + "grad_norm": 2.5652996360198013, + "learning_rate": 1e-06, + "loss": 0.4285, + "step": 1437 + }, + { + "epoch": 0.09215585747244297, + "grad_norm": 2.614618685141593, + "learning_rate": 1e-06, + "loss": 0.4204, + "step": 1438 + }, + { + "epoch": 0.09221994360420405, + "grad_norm": 2.60209536356399, + "learning_rate": 1e-06, + "loss": 0.4186, + "step": 1439 + }, + { + "epoch": 0.09228402973596514, + "grad_norm": 2.669408041556916, + "learning_rate": 1e-06, + "loss": 0.4278, + "step": 1440 + }, + { + "epoch": 0.09234811586772622, + "grad_norm": 2.5842411481731635, + "learning_rate": 1e-06, + "loss": 0.3888, + "step": 1441 + }, + { + "epoch": 0.09241220199948731, + "grad_norm": 2.4305355503889077, + "learning_rate": 1e-06, + "loss": 0.4215, + "step": 1442 + }, + { + "epoch": 0.0924762881312484, + "grad_norm": 2.6165578444721995, + "learning_rate": 1e-06, + "loss": 0.4132, + "step": 1443 + }, + { + "epoch": 0.09254037426300948, + "grad_norm": 2.559076650522681, + "learning_rate": 1e-06, + "loss": 0.468, + "step": 1444 + }, + { + "epoch": 0.09260446039477058, + "grad_norm": 2.789845249548872, + "learning_rate": 1e-06, + "loss": 0.4552, + "step": 1445 + }, + { + "epoch": 0.09266854652653166, + "grad_norm": 2.602140727223901, + "learning_rate": 1e-06, + "loss": 0.3931, + "step": 1446 + }, + { + "epoch": 0.09273263265829275, + "grad_norm": 2.7240550984204632, + "learning_rate": 1e-06, + "loss": 0.431, + "step": 1447 + }, + { + "epoch": 0.09279671879005383, + "grad_norm": 2.5188910583037294, + "learning_rate": 1e-06, + "loss": 0.4161, + "step": 1448 + }, + { + "epoch": 0.09286080492181492, + "grad_norm": 2.5994000439724054, + "learning_rate": 1e-06, + "loss": 0.4029, + "step": 1449 + }, + { + "epoch": 0.092924891053576, + "grad_norm": 2.6047446760108954, + "learning_rate": 1e-06, + "loss": 0.4495, + "step": 1450 + }, + { + "epoch": 0.0929889771853371, + "grad_norm": 2.537159921941756, + "learning_rate": 1e-06, + "loss": 0.4281, + "step": 1451 + }, + { + "epoch": 0.09305306331709819, + "grad_norm": 2.7029522825334475, + "learning_rate": 1e-06, + "loss": 0.4348, + "step": 1452 + }, + { + "epoch": 0.09311714944885927, + "grad_norm": 2.5101563201833286, + "learning_rate": 1e-06, + "loss": 0.3314, + "step": 1453 + }, + { + "epoch": 0.09318123558062036, + "grad_norm": 2.5375880368645523, + "learning_rate": 1e-06, + "loss": 0.3821, + "step": 1454 + }, + { + "epoch": 0.09324532171238144, + "grad_norm": 2.630162183349999, + "learning_rate": 1e-06, + "loss": 0.4482, + "step": 1455 + }, + { + "epoch": 0.09330940784414253, + "grad_norm": 2.770034028918006, + "learning_rate": 1e-06, + "loss": 0.4052, + "step": 1456 + }, + { + "epoch": 0.09337349397590361, + "grad_norm": 2.5993245566678507, + "learning_rate": 1e-06, + "loss": 0.3816, + "step": 1457 + }, + { + "epoch": 0.0934375801076647, + "grad_norm": 2.64778391952047, + "learning_rate": 1e-06, + "loss": 0.4181, + "step": 1458 + }, + { + "epoch": 0.09350166623942578, + "grad_norm": 2.7001931669814008, + "learning_rate": 1e-06, + "loss": 0.3884, + "step": 1459 + }, + { + "epoch": 0.09356575237118687, + "grad_norm": 2.7310231499002313, + "learning_rate": 1e-06, + "loss": 0.4263, + "step": 1460 + }, + { + "epoch": 0.09362983850294797, + "grad_norm": 2.5333368747726444, + "learning_rate": 1e-06, + "loss": 0.4217, + "step": 1461 + }, + { + "epoch": 0.09369392463470905, + "grad_norm": 2.573504068195423, + "learning_rate": 1e-06, + "loss": 0.4123, + "step": 1462 + }, + { + "epoch": 0.09375801076647014, + "grad_norm": 2.586567658193003, + "learning_rate": 1e-06, + "loss": 0.4025, + "step": 1463 + }, + { + "epoch": 0.09382209689823122, + "grad_norm": 2.561777711659588, + "learning_rate": 1e-06, + "loss": 0.4438, + "step": 1464 + }, + { + "epoch": 0.09388618302999231, + "grad_norm": 2.8983923631902617, + "learning_rate": 1e-06, + "loss": 0.4852, + "step": 1465 + }, + { + "epoch": 0.09395026916175339, + "grad_norm": 2.6375367410550017, + "learning_rate": 1e-06, + "loss": 0.39, + "step": 1466 + }, + { + "epoch": 0.09401435529351448, + "grad_norm": 2.8361863431121375, + "learning_rate": 1e-06, + "loss": 0.3983, + "step": 1467 + }, + { + "epoch": 0.09407844142527556, + "grad_norm": 2.6342268308154058, + "learning_rate": 1e-06, + "loss": 0.4052, + "step": 1468 + }, + { + "epoch": 0.09414252755703666, + "grad_norm": 2.608017548825659, + "learning_rate": 1e-06, + "loss": 0.4211, + "step": 1469 + }, + { + "epoch": 0.09420661368879775, + "grad_norm": 2.449627254791894, + "learning_rate": 1e-06, + "loss": 0.4113, + "step": 1470 + }, + { + "epoch": 0.09427069982055883, + "grad_norm": 2.4811525375719556, + "learning_rate": 1e-06, + "loss": 0.377, + "step": 1471 + }, + { + "epoch": 0.09433478595231992, + "grad_norm": 2.6371030366074617, + "learning_rate": 1e-06, + "loss": 0.389, + "step": 1472 + }, + { + "epoch": 0.094398872084081, + "grad_norm": 2.499025005200448, + "learning_rate": 1e-06, + "loss": 0.4178, + "step": 1473 + }, + { + "epoch": 0.0944629582158421, + "grad_norm": 2.449095256001988, + "learning_rate": 1e-06, + "loss": 0.4191, + "step": 1474 + }, + { + "epoch": 0.09452704434760317, + "grad_norm": 2.5906968749902073, + "learning_rate": 1e-06, + "loss": 0.4099, + "step": 1475 + }, + { + "epoch": 0.09459113047936427, + "grad_norm": 2.7404480397879336, + "learning_rate": 1e-06, + "loss": 0.456, + "step": 1476 + }, + { + "epoch": 0.09465521661112536, + "grad_norm": 2.72078552266344, + "learning_rate": 1e-06, + "loss": 0.4425, + "step": 1477 + }, + { + "epoch": 0.09471930274288644, + "grad_norm": 2.6857231786650315, + "learning_rate": 1e-06, + "loss": 0.4032, + "step": 1478 + }, + { + "epoch": 0.09478338887464753, + "grad_norm": 2.788718665825192, + "learning_rate": 1e-06, + "loss": 0.4309, + "step": 1479 + }, + { + "epoch": 0.09484747500640861, + "grad_norm": 2.5435811015181735, + "learning_rate": 1e-06, + "loss": 0.4508, + "step": 1480 + }, + { + "epoch": 0.0949115611381697, + "grad_norm": 2.6178528404278567, + "learning_rate": 1e-06, + "loss": 0.375, + "step": 1481 + }, + { + "epoch": 0.09497564726993078, + "grad_norm": 2.690631217688418, + "learning_rate": 1e-06, + "loss": 0.4573, + "step": 1482 + }, + { + "epoch": 0.09503973340169188, + "grad_norm": 2.8404450272707704, + "learning_rate": 1e-06, + "loss": 0.4252, + "step": 1483 + }, + { + "epoch": 0.09510381953345295, + "grad_norm": 2.546535336347818, + "learning_rate": 1e-06, + "loss": 0.3952, + "step": 1484 + }, + { + "epoch": 0.09516790566521405, + "grad_norm": 2.825421925183025, + "learning_rate": 1e-06, + "loss": 0.4806, + "step": 1485 + }, + { + "epoch": 0.09523199179697514, + "grad_norm": 2.6961291272224654, + "learning_rate": 1e-06, + "loss": 0.3879, + "step": 1486 + }, + { + "epoch": 0.09529607792873622, + "grad_norm": 2.7748192803256577, + "learning_rate": 1e-06, + "loss": 0.4424, + "step": 1487 + }, + { + "epoch": 0.09536016406049731, + "grad_norm": 2.598558063958087, + "learning_rate": 1e-06, + "loss": 0.483, + "step": 1488 + }, + { + "epoch": 0.09542425019225839, + "grad_norm": 2.5230579044355554, + "learning_rate": 1e-06, + "loss": 0.4463, + "step": 1489 + }, + { + "epoch": 0.09548833632401948, + "grad_norm": 2.7678832307087347, + "learning_rate": 1e-06, + "loss": 0.3969, + "step": 1490 + }, + { + "epoch": 0.09555242245578056, + "grad_norm": 2.6987595821256365, + "learning_rate": 1e-06, + "loss": 0.4439, + "step": 1491 + }, + { + "epoch": 0.09561650858754166, + "grad_norm": 2.5431991351880656, + "learning_rate": 1e-06, + "loss": 0.4091, + "step": 1492 + }, + { + "epoch": 0.09568059471930274, + "grad_norm": 2.684245256638835, + "learning_rate": 1e-06, + "loss": 0.4585, + "step": 1493 + }, + { + "epoch": 0.09574468085106383, + "grad_norm": 2.552815133009136, + "learning_rate": 1e-06, + "loss": 0.4242, + "step": 1494 + }, + { + "epoch": 0.09580876698282492, + "grad_norm": 2.547549931917095, + "learning_rate": 1e-06, + "loss": 0.3841, + "step": 1495 + }, + { + "epoch": 0.095872853114586, + "grad_norm": 2.583837338026024, + "learning_rate": 1e-06, + "loss": 0.4405, + "step": 1496 + }, + { + "epoch": 0.0959369392463471, + "grad_norm": 2.6677862575379168, + "learning_rate": 1e-06, + "loss": 0.473, + "step": 1497 + }, + { + "epoch": 0.09600102537810817, + "grad_norm": 2.9204657147492794, + "learning_rate": 1e-06, + "loss": 0.5309, + "step": 1498 + }, + { + "epoch": 0.09606511150986927, + "grad_norm": 2.677637058756192, + "learning_rate": 1e-06, + "loss": 0.3905, + "step": 1499 + }, + { + "epoch": 0.09612919764163035, + "grad_norm": 2.7688834587766435, + "learning_rate": 1e-06, + "loss": 0.4399, + "step": 1500 + }, + { + "epoch": 0.09619328377339144, + "grad_norm": 2.740388435781337, + "learning_rate": 1e-06, + "loss": 0.4204, + "step": 1501 + }, + { + "epoch": 0.09625736990515253, + "grad_norm": 2.3972218665471514, + "learning_rate": 1e-06, + "loss": 0.4034, + "step": 1502 + }, + { + "epoch": 0.09632145603691361, + "grad_norm": 2.6633730067165526, + "learning_rate": 1e-06, + "loss": 0.4054, + "step": 1503 + }, + { + "epoch": 0.0963855421686747, + "grad_norm": 2.456909671813905, + "learning_rate": 1e-06, + "loss": 0.4143, + "step": 1504 + }, + { + "epoch": 0.09644962830043578, + "grad_norm": 2.6087008055470235, + "learning_rate": 1e-06, + "loss": 0.4382, + "step": 1505 + }, + { + "epoch": 0.09651371443219688, + "grad_norm": 2.934760294530396, + "learning_rate": 1e-06, + "loss": 0.4752, + "step": 1506 + }, + { + "epoch": 0.09657780056395796, + "grad_norm": 2.7777807926127185, + "learning_rate": 1e-06, + "loss": 0.3972, + "step": 1507 + }, + { + "epoch": 0.09664188669571905, + "grad_norm": 2.5245228852446124, + "learning_rate": 1e-06, + "loss": 0.4838, + "step": 1508 + }, + { + "epoch": 0.09670597282748013, + "grad_norm": 2.556699635104518, + "learning_rate": 1e-06, + "loss": 0.3796, + "step": 1509 + }, + { + "epoch": 0.09677005895924122, + "grad_norm": 2.6734250594176294, + "learning_rate": 1e-06, + "loss": 0.3988, + "step": 1510 + }, + { + "epoch": 0.09683414509100231, + "grad_norm": 2.779651783350114, + "learning_rate": 1e-06, + "loss": 0.4375, + "step": 1511 + }, + { + "epoch": 0.09689823122276339, + "grad_norm": 2.7072349412684886, + "learning_rate": 1e-06, + "loss": 0.4093, + "step": 1512 + }, + { + "epoch": 0.09696231735452449, + "grad_norm": 2.9895637028649746, + "learning_rate": 1e-06, + "loss": 0.4021, + "step": 1513 + }, + { + "epoch": 0.09702640348628556, + "grad_norm": 2.6235342457993416, + "learning_rate": 1e-06, + "loss": 0.4582, + "step": 1514 + }, + { + "epoch": 0.09709048961804666, + "grad_norm": 2.6685946420553273, + "learning_rate": 1e-06, + "loss": 0.491, + "step": 1515 + }, + { + "epoch": 0.09715457574980774, + "grad_norm": 2.6912226523276086, + "learning_rate": 1e-06, + "loss": 0.3626, + "step": 1516 + }, + { + "epoch": 0.09721866188156883, + "grad_norm": 2.5055739358842346, + "learning_rate": 1e-06, + "loss": 0.4461, + "step": 1517 + }, + { + "epoch": 0.09728274801332991, + "grad_norm": 3.0575599131857327, + "learning_rate": 1e-06, + "loss": 0.3804, + "step": 1518 + }, + { + "epoch": 0.097346834145091, + "grad_norm": 2.4144353166108634, + "learning_rate": 1e-06, + "loss": 0.383, + "step": 1519 + }, + { + "epoch": 0.0974109202768521, + "grad_norm": 2.485527122413705, + "learning_rate": 1e-06, + "loss": 0.4088, + "step": 1520 + }, + { + "epoch": 0.09747500640861317, + "grad_norm": 2.727634679983602, + "learning_rate": 1e-06, + "loss": 0.391, + "step": 1521 + }, + { + "epoch": 0.09753909254037427, + "grad_norm": 2.688199231816162, + "learning_rate": 1e-06, + "loss": 0.4088, + "step": 1522 + }, + { + "epoch": 0.09760317867213535, + "grad_norm": 2.597138289171914, + "learning_rate": 1e-06, + "loss": 0.436, + "step": 1523 + }, + { + "epoch": 0.09766726480389644, + "grad_norm": 2.4989362972544966, + "learning_rate": 1e-06, + "loss": 0.3662, + "step": 1524 + }, + { + "epoch": 0.09773135093565752, + "grad_norm": 2.6467220685385926, + "learning_rate": 1e-06, + "loss": 0.4828, + "step": 1525 + }, + { + "epoch": 0.09779543706741861, + "grad_norm": 2.570899529787848, + "learning_rate": 1e-06, + "loss": 0.4307, + "step": 1526 + }, + { + "epoch": 0.0978595231991797, + "grad_norm": 2.5504651035630803, + "learning_rate": 1e-06, + "loss": 0.4346, + "step": 1527 + }, + { + "epoch": 0.09792360933094078, + "grad_norm": 2.658388218479122, + "learning_rate": 1e-06, + "loss": 0.3835, + "step": 1528 + }, + { + "epoch": 0.09798769546270188, + "grad_norm": 2.5854205257184955, + "learning_rate": 1e-06, + "loss": 0.4067, + "step": 1529 + }, + { + "epoch": 0.09805178159446296, + "grad_norm": 2.549513145186835, + "learning_rate": 1e-06, + "loss": 0.4431, + "step": 1530 + }, + { + "epoch": 0.09811586772622405, + "grad_norm": 2.7201381698108715, + "learning_rate": 1e-06, + "loss": 0.4101, + "step": 1531 + }, + { + "epoch": 0.09817995385798513, + "grad_norm": 2.5040689871867934, + "learning_rate": 1e-06, + "loss": 0.3734, + "step": 1532 + }, + { + "epoch": 0.09824403998974622, + "grad_norm": 2.714762788781154, + "learning_rate": 1e-06, + "loss": 0.4831, + "step": 1533 + }, + { + "epoch": 0.0983081261215073, + "grad_norm": 2.552541268474821, + "learning_rate": 1e-06, + "loss": 0.3674, + "step": 1534 + }, + { + "epoch": 0.09837221225326839, + "grad_norm": 2.57284676055446, + "learning_rate": 1e-06, + "loss": 0.3585, + "step": 1535 + }, + { + "epoch": 0.09843629838502949, + "grad_norm": 2.721429265477203, + "learning_rate": 1e-06, + "loss": 0.4301, + "step": 1536 + }, + { + "epoch": 0.09850038451679057, + "grad_norm": 2.774050751464011, + "learning_rate": 1e-06, + "loss": 0.4887, + "step": 1537 + }, + { + "epoch": 0.09856447064855166, + "grad_norm": 2.7744727611174964, + "learning_rate": 1e-06, + "loss": 0.3775, + "step": 1538 + }, + { + "epoch": 0.09862855678031274, + "grad_norm": 2.510003290063987, + "learning_rate": 1e-06, + "loss": 0.3722, + "step": 1539 + }, + { + "epoch": 0.09869264291207383, + "grad_norm": 2.535552620114136, + "learning_rate": 1e-06, + "loss": 0.4426, + "step": 1540 + }, + { + "epoch": 0.09875672904383491, + "grad_norm": 2.564367168182366, + "learning_rate": 1e-06, + "loss": 0.4083, + "step": 1541 + }, + { + "epoch": 0.098820815175596, + "grad_norm": 2.7394137242469947, + "learning_rate": 1e-06, + "loss": 0.4465, + "step": 1542 + }, + { + "epoch": 0.09888490130735708, + "grad_norm": 2.545975890654039, + "learning_rate": 1e-06, + "loss": 0.3485, + "step": 1543 + }, + { + "epoch": 0.09894898743911817, + "grad_norm": 2.774458084017365, + "learning_rate": 1e-06, + "loss": 0.4045, + "step": 1544 + }, + { + "epoch": 0.09901307357087927, + "grad_norm": 2.5574985197779805, + "learning_rate": 1e-06, + "loss": 0.4363, + "step": 1545 + }, + { + "epoch": 0.09907715970264035, + "grad_norm": 2.6854709411400104, + "learning_rate": 1e-06, + "loss": 0.4384, + "step": 1546 + }, + { + "epoch": 0.09914124583440144, + "grad_norm": 2.596155628657547, + "learning_rate": 1e-06, + "loss": 0.3991, + "step": 1547 + }, + { + "epoch": 0.09920533196616252, + "grad_norm": 2.725319331359241, + "learning_rate": 1e-06, + "loss": 0.4757, + "step": 1548 + }, + { + "epoch": 0.09926941809792361, + "grad_norm": 2.4312408844513835, + "learning_rate": 1e-06, + "loss": 0.3759, + "step": 1549 + }, + { + "epoch": 0.09933350422968469, + "grad_norm": 3.0975479055506665, + "learning_rate": 1e-06, + "loss": 0.405, + "step": 1550 + }, + { + "epoch": 0.09939759036144578, + "grad_norm": 2.6897850500620626, + "learning_rate": 1e-06, + "loss": 0.478, + "step": 1551 + }, + { + "epoch": 0.09946167649320686, + "grad_norm": 2.4014922518328414, + "learning_rate": 1e-06, + "loss": 0.3817, + "step": 1552 + }, + { + "epoch": 0.09952576262496796, + "grad_norm": 2.5781390912257507, + "learning_rate": 1e-06, + "loss": 0.3929, + "step": 1553 + }, + { + "epoch": 0.09958984875672905, + "grad_norm": 2.6814168690887734, + "learning_rate": 1e-06, + "loss": 0.4167, + "step": 1554 + }, + { + "epoch": 0.09965393488849013, + "grad_norm": 2.8229320999460366, + "learning_rate": 1e-06, + "loss": 0.4424, + "step": 1555 + }, + { + "epoch": 0.09971802102025122, + "grad_norm": 2.671523799350974, + "learning_rate": 1e-06, + "loss": 0.417, + "step": 1556 + }, + { + "epoch": 0.0997821071520123, + "grad_norm": 2.816317550333446, + "learning_rate": 1e-06, + "loss": 0.4308, + "step": 1557 + }, + { + "epoch": 0.0998461932837734, + "grad_norm": 2.8263116326882476, + "learning_rate": 1e-06, + "loss": 0.461, + "step": 1558 + }, + { + "epoch": 0.09991027941553447, + "grad_norm": 2.773664845095712, + "learning_rate": 1e-06, + "loss": 0.3831, + "step": 1559 + }, + { + "epoch": 0.09997436554729557, + "grad_norm": 2.465271500495268, + "learning_rate": 1e-06, + "loss": 0.4111, + "step": 1560 + }, + { + "epoch": 0.10003845167905666, + "grad_norm": 2.5630490824452696, + "learning_rate": 1e-06, + "loss": 0.3999, + "step": 1561 + }, + { + "epoch": 0.10010253781081774, + "grad_norm": 2.443896329725649, + "learning_rate": 1e-06, + "loss": 0.3384, + "step": 1562 + }, + { + "epoch": 0.10016662394257883, + "grad_norm": 2.679277621913989, + "learning_rate": 1e-06, + "loss": 0.4006, + "step": 1563 + }, + { + "epoch": 0.10023071007433991, + "grad_norm": 2.687935697149264, + "learning_rate": 1e-06, + "loss": 0.3993, + "step": 1564 + }, + { + "epoch": 0.100294796206101, + "grad_norm": 2.672827135397584, + "learning_rate": 1e-06, + "loss": 0.4441, + "step": 1565 + }, + { + "epoch": 0.10035888233786208, + "grad_norm": 2.5859584068225305, + "learning_rate": 1e-06, + "loss": 0.3911, + "step": 1566 + }, + { + "epoch": 0.10042296846962318, + "grad_norm": 2.6623095578704845, + "learning_rate": 1e-06, + "loss": 0.4869, + "step": 1567 + }, + { + "epoch": 0.10048705460138425, + "grad_norm": 2.751166514445602, + "learning_rate": 1e-06, + "loss": 0.3948, + "step": 1568 + }, + { + "epoch": 0.10055114073314535, + "grad_norm": 2.5789596779897996, + "learning_rate": 1e-06, + "loss": 0.4196, + "step": 1569 + }, + { + "epoch": 0.10061522686490644, + "grad_norm": 2.733584791811764, + "learning_rate": 1e-06, + "loss": 0.4257, + "step": 1570 + }, + { + "epoch": 0.10067931299666752, + "grad_norm": 2.771740197510259, + "learning_rate": 1e-06, + "loss": 0.4188, + "step": 1571 + }, + { + "epoch": 0.10074339912842861, + "grad_norm": 2.8752303657524387, + "learning_rate": 1e-06, + "loss": 0.4678, + "step": 1572 + }, + { + "epoch": 0.10080748526018969, + "grad_norm": 2.558260771757404, + "learning_rate": 1e-06, + "loss": 0.3632, + "step": 1573 + }, + { + "epoch": 0.10087157139195078, + "grad_norm": 2.5885122104751193, + "learning_rate": 1e-06, + "loss": 0.3969, + "step": 1574 + }, + { + "epoch": 0.10093565752371186, + "grad_norm": 2.535083774597389, + "learning_rate": 1e-06, + "loss": 0.413, + "step": 1575 + }, + { + "epoch": 0.10099974365547296, + "grad_norm": 2.4010853501994966, + "learning_rate": 1e-06, + "loss": 0.451, + "step": 1576 + }, + { + "epoch": 0.10106382978723404, + "grad_norm": 2.649504940578061, + "learning_rate": 1e-06, + "loss": 0.4474, + "step": 1577 + }, + { + "epoch": 0.10112791591899513, + "grad_norm": 2.410664702100842, + "learning_rate": 1e-06, + "loss": 0.3643, + "step": 1578 + }, + { + "epoch": 0.10119200205075622, + "grad_norm": 2.614304760449613, + "learning_rate": 1e-06, + "loss": 0.4721, + "step": 1579 + }, + { + "epoch": 0.1012560881825173, + "grad_norm": 2.5811032100103417, + "learning_rate": 1e-06, + "loss": 0.4441, + "step": 1580 + }, + { + "epoch": 0.1013201743142784, + "grad_norm": 2.479567803200828, + "learning_rate": 1e-06, + "loss": 0.4105, + "step": 1581 + }, + { + "epoch": 0.10138426044603947, + "grad_norm": 2.822694534916483, + "learning_rate": 1e-06, + "loss": 0.4662, + "step": 1582 + }, + { + "epoch": 0.10144834657780057, + "grad_norm": 2.7523444736975406, + "learning_rate": 1e-06, + "loss": 0.4016, + "step": 1583 + }, + { + "epoch": 0.10151243270956165, + "grad_norm": 2.7855545296692408, + "learning_rate": 1e-06, + "loss": 0.4851, + "step": 1584 + }, + { + "epoch": 0.10157651884132274, + "grad_norm": 2.69264141081618, + "learning_rate": 1e-06, + "loss": 0.4044, + "step": 1585 + }, + { + "epoch": 0.10164060497308383, + "grad_norm": 2.5649058822118764, + "learning_rate": 1e-06, + "loss": 0.3722, + "step": 1586 + }, + { + "epoch": 0.10170469110484491, + "grad_norm": 2.5866879553520246, + "learning_rate": 1e-06, + "loss": 0.4265, + "step": 1587 + }, + { + "epoch": 0.101768777236606, + "grad_norm": 2.4330295355820577, + "learning_rate": 1e-06, + "loss": 0.4152, + "step": 1588 + }, + { + "epoch": 0.10183286336836708, + "grad_norm": 2.570725714010572, + "learning_rate": 1e-06, + "loss": 0.3802, + "step": 1589 + }, + { + "epoch": 0.10189694950012818, + "grad_norm": 2.6275641529435845, + "learning_rate": 1e-06, + "loss": 0.3901, + "step": 1590 + }, + { + "epoch": 0.10196103563188925, + "grad_norm": 2.7067442146798943, + "learning_rate": 1e-06, + "loss": 0.4823, + "step": 1591 + }, + { + "epoch": 0.10202512176365035, + "grad_norm": 2.4594883517967885, + "learning_rate": 1e-06, + "loss": 0.3733, + "step": 1592 + }, + { + "epoch": 0.10208920789541143, + "grad_norm": 2.588977297029969, + "learning_rate": 1e-06, + "loss": 0.4596, + "step": 1593 + }, + { + "epoch": 0.10215329402717252, + "grad_norm": 2.6727644456390935, + "learning_rate": 1e-06, + "loss": 0.4501, + "step": 1594 + }, + { + "epoch": 0.10221738015893361, + "grad_norm": 2.5011109727208813, + "learning_rate": 1e-06, + "loss": 0.3772, + "step": 1595 + }, + { + "epoch": 0.10228146629069469, + "grad_norm": 2.729101743448196, + "learning_rate": 1e-06, + "loss": 0.3365, + "step": 1596 + }, + { + "epoch": 0.10234555242245578, + "grad_norm": 2.7459192458327633, + "learning_rate": 1e-06, + "loss": 0.4438, + "step": 1597 + }, + { + "epoch": 0.10240963855421686, + "grad_norm": 2.9193827751910133, + "learning_rate": 1e-06, + "loss": 0.4325, + "step": 1598 + }, + { + "epoch": 0.10247372468597796, + "grad_norm": 2.5884629470491247, + "learning_rate": 1e-06, + "loss": 0.4163, + "step": 1599 + }, + { + "epoch": 0.10253781081773904, + "grad_norm": 2.6180046140478512, + "learning_rate": 1e-06, + "loss": 0.4598, + "step": 1600 + }, + { + "epoch": 0.10260189694950013, + "grad_norm": 2.6215411725840774, + "learning_rate": 1e-06, + "loss": 0.4109, + "step": 1601 + }, + { + "epoch": 0.10266598308126121, + "grad_norm": 2.7502317600306654, + "learning_rate": 1e-06, + "loss": 0.4366, + "step": 1602 + }, + { + "epoch": 0.1027300692130223, + "grad_norm": 2.473642725106791, + "learning_rate": 1e-06, + "loss": 0.3513, + "step": 1603 + }, + { + "epoch": 0.1027941553447834, + "grad_norm": 3.027137495649638, + "learning_rate": 1e-06, + "loss": 0.4232, + "step": 1604 + }, + { + "epoch": 0.10285824147654447, + "grad_norm": 2.589351540135811, + "learning_rate": 1e-06, + "loss": 0.3928, + "step": 1605 + }, + { + "epoch": 0.10292232760830557, + "grad_norm": 2.605232656127217, + "learning_rate": 1e-06, + "loss": 0.3833, + "step": 1606 + }, + { + "epoch": 0.10298641374006665, + "grad_norm": 2.5160966475689373, + "learning_rate": 1e-06, + "loss": 0.4443, + "step": 1607 + }, + { + "epoch": 0.10305049987182774, + "grad_norm": 2.9291249601398945, + "learning_rate": 1e-06, + "loss": 0.3807, + "step": 1608 + }, + { + "epoch": 0.10311458600358882, + "grad_norm": 2.5567951767508417, + "learning_rate": 1e-06, + "loss": 0.4061, + "step": 1609 + }, + { + "epoch": 0.10317867213534991, + "grad_norm": 2.73170975100072, + "learning_rate": 1e-06, + "loss": 0.4131, + "step": 1610 + }, + { + "epoch": 0.103242758267111, + "grad_norm": 2.635090816171602, + "learning_rate": 1e-06, + "loss": 0.3814, + "step": 1611 + }, + { + "epoch": 0.10330684439887208, + "grad_norm": 2.461341775873606, + "learning_rate": 1e-06, + "loss": 0.4498, + "step": 1612 + }, + { + "epoch": 0.10337093053063318, + "grad_norm": 2.62340434297987, + "learning_rate": 1e-06, + "loss": 0.4303, + "step": 1613 + }, + { + "epoch": 0.10343501666239426, + "grad_norm": 2.418377756799505, + "learning_rate": 1e-06, + "loss": 0.423, + "step": 1614 + }, + { + "epoch": 0.10349910279415535, + "grad_norm": 2.480451698537808, + "learning_rate": 1e-06, + "loss": 0.4039, + "step": 1615 + }, + { + "epoch": 0.10356318892591643, + "grad_norm": 2.5930294009627506, + "learning_rate": 1e-06, + "loss": 0.4249, + "step": 1616 + }, + { + "epoch": 0.10362727505767752, + "grad_norm": 2.5585904418749568, + "learning_rate": 1e-06, + "loss": 0.4271, + "step": 1617 + }, + { + "epoch": 0.1036913611894386, + "grad_norm": 2.8192930639241442, + "learning_rate": 1e-06, + "loss": 0.4281, + "step": 1618 + }, + { + "epoch": 0.10375544732119969, + "grad_norm": 2.6233847466921216, + "learning_rate": 1e-06, + "loss": 0.4337, + "step": 1619 + }, + { + "epoch": 0.10381953345296079, + "grad_norm": 2.476019776497116, + "learning_rate": 1e-06, + "loss": 0.3443, + "step": 1620 + }, + { + "epoch": 0.10388361958472186, + "grad_norm": 2.65079762175131, + "learning_rate": 1e-06, + "loss": 0.4388, + "step": 1621 + }, + { + "epoch": 0.10394770571648296, + "grad_norm": 2.617383658663692, + "learning_rate": 1e-06, + "loss": 0.4207, + "step": 1622 + }, + { + "epoch": 0.10401179184824404, + "grad_norm": 2.6226899184245305, + "learning_rate": 1e-06, + "loss": 0.4373, + "step": 1623 + }, + { + "epoch": 0.10407587798000513, + "grad_norm": 2.6357591101547806, + "learning_rate": 1e-06, + "loss": 0.4235, + "step": 1624 + }, + { + "epoch": 0.10413996411176621, + "grad_norm": 2.6520449138547604, + "learning_rate": 1e-06, + "loss": 0.4487, + "step": 1625 + }, + { + "epoch": 0.1042040502435273, + "grad_norm": 2.6677348499168803, + "learning_rate": 1e-06, + "loss": 0.3955, + "step": 1626 + }, + { + "epoch": 0.10426813637528838, + "grad_norm": 2.485785549204256, + "learning_rate": 1e-06, + "loss": 0.4121, + "step": 1627 + }, + { + "epoch": 0.10433222250704947, + "grad_norm": 2.6469670405473655, + "learning_rate": 1e-06, + "loss": 0.4526, + "step": 1628 + }, + { + "epoch": 0.10439630863881057, + "grad_norm": 2.6217926498103394, + "learning_rate": 1e-06, + "loss": 0.4237, + "step": 1629 + }, + { + "epoch": 0.10446039477057165, + "grad_norm": 2.5329404663917767, + "learning_rate": 1e-06, + "loss": 0.4267, + "step": 1630 + }, + { + "epoch": 0.10452448090233274, + "grad_norm": 2.799258811339821, + "learning_rate": 1e-06, + "loss": 0.3584, + "step": 1631 + }, + { + "epoch": 0.10458856703409382, + "grad_norm": 2.6785670772725885, + "learning_rate": 1e-06, + "loss": 0.4077, + "step": 1632 + }, + { + "epoch": 0.10465265316585491, + "grad_norm": 2.8053785359525754, + "learning_rate": 1e-06, + "loss": 0.4357, + "step": 1633 + }, + { + "epoch": 0.10471673929761599, + "grad_norm": 2.617661148969151, + "learning_rate": 1e-06, + "loss": 0.4612, + "step": 1634 + }, + { + "epoch": 0.10478082542937708, + "grad_norm": 2.7631606730879077, + "learning_rate": 1e-06, + "loss": 0.4398, + "step": 1635 + }, + { + "epoch": 0.10484491156113818, + "grad_norm": 2.6745121058036068, + "learning_rate": 1e-06, + "loss": 0.3933, + "step": 1636 + }, + { + "epoch": 0.10490899769289926, + "grad_norm": 2.6261615730588677, + "learning_rate": 1e-06, + "loss": 0.4276, + "step": 1637 + }, + { + "epoch": 0.10497308382466035, + "grad_norm": 2.4662550009165747, + "learning_rate": 1e-06, + "loss": 0.3995, + "step": 1638 + }, + { + "epoch": 0.10503716995642143, + "grad_norm": 2.586298212854455, + "learning_rate": 1e-06, + "loss": 0.3858, + "step": 1639 + }, + { + "epoch": 0.10510125608818252, + "grad_norm": 2.6534127259916374, + "learning_rate": 1e-06, + "loss": 0.4182, + "step": 1640 + }, + { + "epoch": 0.1051653422199436, + "grad_norm": 2.795573062823909, + "learning_rate": 1e-06, + "loss": 0.4032, + "step": 1641 + }, + { + "epoch": 0.10522942835170469, + "grad_norm": 2.7539546932567767, + "learning_rate": 1e-06, + "loss": 0.4529, + "step": 1642 + }, + { + "epoch": 0.10529351448346577, + "grad_norm": 2.6383536806239123, + "learning_rate": 1e-06, + "loss": 0.4243, + "step": 1643 + }, + { + "epoch": 0.10535760061522687, + "grad_norm": 2.5524878339737436, + "learning_rate": 1e-06, + "loss": 0.4276, + "step": 1644 + }, + { + "epoch": 0.10542168674698796, + "grad_norm": 2.365115184404631, + "learning_rate": 1e-06, + "loss": 0.4012, + "step": 1645 + }, + { + "epoch": 0.10548577287874904, + "grad_norm": 2.562456499015141, + "learning_rate": 1e-06, + "loss": 0.4182, + "step": 1646 + }, + { + "epoch": 0.10554985901051013, + "grad_norm": 2.540218757528349, + "learning_rate": 1e-06, + "loss": 0.3861, + "step": 1647 + }, + { + "epoch": 0.10561394514227121, + "grad_norm": 2.7507708144988645, + "learning_rate": 1e-06, + "loss": 0.4476, + "step": 1648 + }, + { + "epoch": 0.1056780312740323, + "grad_norm": 2.569575329697874, + "learning_rate": 1e-06, + "loss": 0.4077, + "step": 1649 + }, + { + "epoch": 0.10574211740579338, + "grad_norm": 2.820273901570955, + "learning_rate": 1e-06, + "loss": 0.3955, + "step": 1650 + }, + { + "epoch": 0.10580620353755447, + "grad_norm": 2.5636949127298423, + "learning_rate": 1e-06, + "loss": 0.4195, + "step": 1651 + }, + { + "epoch": 0.10587028966931555, + "grad_norm": 2.4593346132223326, + "learning_rate": 1e-06, + "loss": 0.4317, + "step": 1652 + }, + { + "epoch": 0.10593437580107665, + "grad_norm": 2.4987644523028387, + "learning_rate": 1e-06, + "loss": 0.3765, + "step": 1653 + }, + { + "epoch": 0.10599846193283774, + "grad_norm": 2.658818194022743, + "learning_rate": 1e-06, + "loss": 0.4174, + "step": 1654 + }, + { + "epoch": 0.10606254806459882, + "grad_norm": 2.5958535933853213, + "learning_rate": 1e-06, + "loss": 0.3846, + "step": 1655 + }, + { + "epoch": 0.10612663419635991, + "grad_norm": 2.664277085039511, + "learning_rate": 1e-06, + "loss": 0.4129, + "step": 1656 + }, + { + "epoch": 0.10619072032812099, + "grad_norm": 2.6513772408963665, + "learning_rate": 1e-06, + "loss": 0.4107, + "step": 1657 + }, + { + "epoch": 0.10625480645988208, + "grad_norm": 2.7469582827871566, + "learning_rate": 1e-06, + "loss": 0.4148, + "step": 1658 + }, + { + "epoch": 0.10631889259164316, + "grad_norm": 2.7045725526549576, + "learning_rate": 1e-06, + "loss": 0.4139, + "step": 1659 + }, + { + "epoch": 0.10638297872340426, + "grad_norm": 2.623229446283425, + "learning_rate": 1e-06, + "loss": 0.4439, + "step": 1660 + }, + { + "epoch": 0.10644706485516534, + "grad_norm": 2.586774207872217, + "learning_rate": 1e-06, + "loss": 0.4244, + "step": 1661 + }, + { + "epoch": 0.10651115098692643, + "grad_norm": 2.953381892885236, + "learning_rate": 1e-06, + "loss": 0.4256, + "step": 1662 + }, + { + "epoch": 0.10657523711868752, + "grad_norm": 2.769340797223471, + "learning_rate": 1e-06, + "loss": 0.4559, + "step": 1663 + }, + { + "epoch": 0.1066393232504486, + "grad_norm": 2.638792643534101, + "learning_rate": 1e-06, + "loss": 0.4563, + "step": 1664 + }, + { + "epoch": 0.1067034093822097, + "grad_norm": 2.780562883204113, + "learning_rate": 1e-06, + "loss": 0.4674, + "step": 1665 + }, + { + "epoch": 0.10676749551397077, + "grad_norm": 2.6954405632484923, + "learning_rate": 1e-06, + "loss": 0.4098, + "step": 1666 + }, + { + "epoch": 0.10683158164573187, + "grad_norm": 2.7618401225967273, + "learning_rate": 1e-06, + "loss": 0.4703, + "step": 1667 + }, + { + "epoch": 0.10689566777749294, + "grad_norm": 2.5574016056264837, + "learning_rate": 1e-06, + "loss": 0.4101, + "step": 1668 + }, + { + "epoch": 0.10695975390925404, + "grad_norm": 2.8001123266252077, + "learning_rate": 1e-06, + "loss": 0.3912, + "step": 1669 + }, + { + "epoch": 0.10702384004101513, + "grad_norm": 2.378615768916347, + "learning_rate": 1e-06, + "loss": 0.3575, + "step": 1670 + }, + { + "epoch": 0.10708792617277621, + "grad_norm": 2.8999182844615885, + "learning_rate": 1e-06, + "loss": 0.4575, + "step": 1671 + }, + { + "epoch": 0.1071520123045373, + "grad_norm": 2.698293857397677, + "learning_rate": 1e-06, + "loss": 0.4647, + "step": 1672 + }, + { + "epoch": 0.10721609843629838, + "grad_norm": 2.7334643214109278, + "learning_rate": 1e-06, + "loss": 0.4029, + "step": 1673 + }, + { + "epoch": 0.10728018456805948, + "grad_norm": 2.711296438189429, + "learning_rate": 1e-06, + "loss": 0.3844, + "step": 1674 + }, + { + "epoch": 0.10734427069982055, + "grad_norm": 2.3686553393233916, + "learning_rate": 1e-06, + "loss": 0.404, + "step": 1675 + }, + { + "epoch": 0.10740835683158165, + "grad_norm": 2.4640210766920694, + "learning_rate": 1e-06, + "loss": 0.4225, + "step": 1676 + }, + { + "epoch": 0.10747244296334273, + "grad_norm": 2.694372214956423, + "learning_rate": 1e-06, + "loss": 0.3985, + "step": 1677 + }, + { + "epoch": 0.10753652909510382, + "grad_norm": 2.629781628214181, + "learning_rate": 1e-06, + "loss": 0.464, + "step": 1678 + }, + { + "epoch": 0.10760061522686491, + "grad_norm": 2.491325092259068, + "learning_rate": 1e-06, + "loss": 0.4328, + "step": 1679 + }, + { + "epoch": 0.10766470135862599, + "grad_norm": 2.788493606737418, + "learning_rate": 1e-06, + "loss": 0.3886, + "step": 1680 + }, + { + "epoch": 0.10772878749038708, + "grad_norm": 2.916286988190873, + "learning_rate": 1e-06, + "loss": 0.4411, + "step": 1681 + }, + { + "epoch": 0.10779287362214816, + "grad_norm": 2.583616470716141, + "learning_rate": 1e-06, + "loss": 0.4642, + "step": 1682 + }, + { + "epoch": 0.10785695975390926, + "grad_norm": 2.4721597951784906, + "learning_rate": 1e-06, + "loss": 0.3899, + "step": 1683 + }, + { + "epoch": 0.10792104588567034, + "grad_norm": 2.67527569790069, + "learning_rate": 1e-06, + "loss": 0.4089, + "step": 1684 + }, + { + "epoch": 0.10798513201743143, + "grad_norm": 2.5895651711542, + "learning_rate": 1e-06, + "loss": 0.3923, + "step": 1685 + }, + { + "epoch": 0.10804921814919251, + "grad_norm": 2.5846016622522403, + "learning_rate": 1e-06, + "loss": 0.4352, + "step": 1686 + }, + { + "epoch": 0.1081133042809536, + "grad_norm": 2.777517387535592, + "learning_rate": 1e-06, + "loss": 0.3758, + "step": 1687 + }, + { + "epoch": 0.1081773904127147, + "grad_norm": 2.6384771823414557, + "learning_rate": 1e-06, + "loss": 0.395, + "step": 1688 + }, + { + "epoch": 0.10824147654447577, + "grad_norm": 2.3574131829286817, + "learning_rate": 1e-06, + "loss": 0.455, + "step": 1689 + }, + { + "epoch": 0.10830556267623687, + "grad_norm": 2.6567719560903567, + "learning_rate": 1e-06, + "loss": 0.3988, + "step": 1690 + }, + { + "epoch": 0.10836964880799795, + "grad_norm": 2.5692755223481507, + "learning_rate": 1e-06, + "loss": 0.4512, + "step": 1691 + }, + { + "epoch": 0.10843373493975904, + "grad_norm": 2.499408417636477, + "learning_rate": 1e-06, + "loss": 0.4029, + "step": 1692 + }, + { + "epoch": 0.10849782107152012, + "grad_norm": 2.7581677889012046, + "learning_rate": 1e-06, + "loss": 0.4781, + "step": 1693 + }, + { + "epoch": 0.10856190720328121, + "grad_norm": 2.7449113590760703, + "learning_rate": 1e-06, + "loss": 0.3819, + "step": 1694 + }, + { + "epoch": 0.1086259933350423, + "grad_norm": 2.4758292398159507, + "learning_rate": 1e-06, + "loss": 0.4012, + "step": 1695 + }, + { + "epoch": 0.10869007946680338, + "grad_norm": 2.776264244021826, + "learning_rate": 1e-06, + "loss": 0.4143, + "step": 1696 + }, + { + "epoch": 0.10875416559856448, + "grad_norm": 2.638478281868802, + "learning_rate": 1e-06, + "loss": 0.4193, + "step": 1697 + }, + { + "epoch": 0.10881825173032555, + "grad_norm": 2.481263674878327, + "learning_rate": 1e-06, + "loss": 0.4411, + "step": 1698 + }, + { + "epoch": 0.10888233786208665, + "grad_norm": 2.552295863623659, + "learning_rate": 1e-06, + "loss": 0.433, + "step": 1699 + }, + { + "epoch": 0.10894642399384773, + "grad_norm": 2.52105190848856, + "learning_rate": 1e-06, + "loss": 0.3908, + "step": 1700 + }, + { + "epoch": 0.10901051012560882, + "grad_norm": 2.518675874240785, + "learning_rate": 1e-06, + "loss": 0.4043, + "step": 1701 + }, + { + "epoch": 0.1090745962573699, + "grad_norm": 2.614317095507154, + "learning_rate": 1e-06, + "loss": 0.387, + "step": 1702 + }, + { + "epoch": 0.10913868238913099, + "grad_norm": 2.670338340620114, + "learning_rate": 1e-06, + "loss": 0.4327, + "step": 1703 + }, + { + "epoch": 0.10920276852089209, + "grad_norm": 2.679878055563821, + "learning_rate": 1e-06, + "loss": 0.4729, + "step": 1704 + }, + { + "epoch": 0.10926685465265316, + "grad_norm": 2.493372948578064, + "learning_rate": 1e-06, + "loss": 0.4325, + "step": 1705 + }, + { + "epoch": 0.10933094078441426, + "grad_norm": 2.5642512118942187, + "learning_rate": 1e-06, + "loss": 0.4222, + "step": 1706 + }, + { + "epoch": 0.10939502691617534, + "grad_norm": 2.5169199510593927, + "learning_rate": 1e-06, + "loss": 0.426, + "step": 1707 + }, + { + "epoch": 0.10945911304793643, + "grad_norm": 2.4887182192261665, + "learning_rate": 1e-06, + "loss": 0.409, + "step": 1708 + }, + { + "epoch": 0.10952319917969751, + "grad_norm": 2.7289202425480417, + "learning_rate": 1e-06, + "loss": 0.383, + "step": 1709 + }, + { + "epoch": 0.1095872853114586, + "grad_norm": 2.61721341106042, + "learning_rate": 1e-06, + "loss": 0.4173, + "step": 1710 + }, + { + "epoch": 0.10965137144321968, + "grad_norm": 2.72774723157587, + "learning_rate": 1e-06, + "loss": 0.4001, + "step": 1711 + }, + { + "epoch": 0.10971545757498077, + "grad_norm": 2.413345581192089, + "learning_rate": 1e-06, + "loss": 0.3807, + "step": 1712 + }, + { + "epoch": 0.10977954370674187, + "grad_norm": 2.7211350379570955, + "learning_rate": 1e-06, + "loss": 0.3684, + "step": 1713 + }, + { + "epoch": 0.10984362983850295, + "grad_norm": 2.737851682076553, + "learning_rate": 1e-06, + "loss": 0.4509, + "step": 1714 + }, + { + "epoch": 0.10990771597026404, + "grad_norm": 2.5769530061758275, + "learning_rate": 1e-06, + "loss": 0.3692, + "step": 1715 + }, + { + "epoch": 0.10997180210202512, + "grad_norm": 2.570743845028058, + "learning_rate": 1e-06, + "loss": 0.4236, + "step": 1716 + }, + { + "epoch": 0.11003588823378621, + "grad_norm": 2.756598918829375, + "learning_rate": 1e-06, + "loss": 0.4437, + "step": 1717 + }, + { + "epoch": 0.11009997436554729, + "grad_norm": 2.57919783563559, + "learning_rate": 1e-06, + "loss": 0.3837, + "step": 1718 + }, + { + "epoch": 0.11016406049730838, + "grad_norm": 2.5580986528574887, + "learning_rate": 1e-06, + "loss": 0.4406, + "step": 1719 + }, + { + "epoch": 0.11022814662906948, + "grad_norm": 2.668776644544978, + "learning_rate": 1e-06, + "loss": 0.451, + "step": 1720 + }, + { + "epoch": 0.11029223276083056, + "grad_norm": 2.523514496339625, + "learning_rate": 1e-06, + "loss": 0.41, + "step": 1721 + }, + { + "epoch": 0.11035631889259165, + "grad_norm": 2.603462247414295, + "learning_rate": 1e-06, + "loss": 0.4276, + "step": 1722 + }, + { + "epoch": 0.11042040502435273, + "grad_norm": 2.855953751352866, + "learning_rate": 1e-06, + "loss": 0.4488, + "step": 1723 + }, + { + "epoch": 0.11048449115611382, + "grad_norm": 2.459019889378647, + "learning_rate": 1e-06, + "loss": 0.3524, + "step": 1724 + }, + { + "epoch": 0.1105485772878749, + "grad_norm": 2.6543139169949668, + "learning_rate": 1e-06, + "loss": 0.3758, + "step": 1725 + }, + { + "epoch": 0.11061266341963599, + "grad_norm": 2.732939934915263, + "learning_rate": 1e-06, + "loss": 0.4017, + "step": 1726 + }, + { + "epoch": 0.11067674955139707, + "grad_norm": 2.5573541578391663, + "learning_rate": 1e-06, + "loss": 0.4032, + "step": 1727 + }, + { + "epoch": 0.11074083568315816, + "grad_norm": 2.48054846948909, + "learning_rate": 1e-06, + "loss": 0.4306, + "step": 1728 + }, + { + "epoch": 0.11080492181491926, + "grad_norm": 2.521020872733791, + "learning_rate": 1e-06, + "loss": 0.435, + "step": 1729 + }, + { + "epoch": 0.11086900794668034, + "grad_norm": 2.401450393189567, + "learning_rate": 1e-06, + "loss": 0.3783, + "step": 1730 + }, + { + "epoch": 0.11093309407844143, + "grad_norm": 2.617773165324972, + "learning_rate": 1e-06, + "loss": 0.4264, + "step": 1731 + }, + { + "epoch": 0.11099718021020251, + "grad_norm": 2.6692631322553035, + "learning_rate": 1e-06, + "loss": 0.442, + "step": 1732 + }, + { + "epoch": 0.1110612663419636, + "grad_norm": 2.6413046769270627, + "learning_rate": 1e-06, + "loss": 0.4591, + "step": 1733 + }, + { + "epoch": 0.11112535247372468, + "grad_norm": 2.5232036530580353, + "learning_rate": 1e-06, + "loss": 0.408, + "step": 1734 + }, + { + "epoch": 0.11118943860548577, + "grad_norm": 2.598170322205431, + "learning_rate": 1e-06, + "loss": 0.3985, + "step": 1735 + }, + { + "epoch": 0.11125352473724685, + "grad_norm": 2.5204863843861713, + "learning_rate": 1e-06, + "loss": 0.3859, + "step": 1736 + }, + { + "epoch": 0.11131761086900795, + "grad_norm": 2.531925371710578, + "learning_rate": 1e-06, + "loss": 0.4006, + "step": 1737 + }, + { + "epoch": 0.11138169700076904, + "grad_norm": 2.5551159173093456, + "learning_rate": 1e-06, + "loss": 0.382, + "step": 1738 + }, + { + "epoch": 0.11144578313253012, + "grad_norm": 2.76432440791053, + "learning_rate": 1e-06, + "loss": 0.4738, + "step": 1739 + }, + { + "epoch": 0.11150986926429121, + "grad_norm": 2.7599097775430863, + "learning_rate": 1e-06, + "loss": 0.4049, + "step": 1740 + }, + { + "epoch": 0.11157395539605229, + "grad_norm": 2.727649566997873, + "learning_rate": 1e-06, + "loss": 0.4498, + "step": 1741 + }, + { + "epoch": 0.11163804152781338, + "grad_norm": 2.548484134262224, + "learning_rate": 1e-06, + "loss": 0.3875, + "step": 1742 + }, + { + "epoch": 0.11170212765957446, + "grad_norm": 2.4432877629217225, + "learning_rate": 1e-06, + "loss": 0.3908, + "step": 1743 + }, + { + "epoch": 0.11176621379133556, + "grad_norm": 3.1019322307948913, + "learning_rate": 1e-06, + "loss": 0.4498, + "step": 1744 + }, + { + "epoch": 0.11183029992309665, + "grad_norm": 3.0096044794329613, + "learning_rate": 1e-06, + "loss": 0.4709, + "step": 1745 + }, + { + "epoch": 0.11189438605485773, + "grad_norm": 2.469796239735023, + "learning_rate": 1e-06, + "loss": 0.4759, + "step": 1746 + }, + { + "epoch": 0.11195847218661882, + "grad_norm": 2.5208667157610196, + "learning_rate": 1e-06, + "loss": 0.4266, + "step": 1747 + }, + { + "epoch": 0.1120225583183799, + "grad_norm": 2.5523207697871175, + "learning_rate": 1e-06, + "loss": 0.4218, + "step": 1748 + }, + { + "epoch": 0.112086644450141, + "grad_norm": 2.517061409866147, + "learning_rate": 1e-06, + "loss": 0.3933, + "step": 1749 + }, + { + "epoch": 0.11215073058190207, + "grad_norm": 2.5726688003570772, + "learning_rate": 1e-06, + "loss": 0.4698, + "step": 1750 + }, + { + "epoch": 0.11221481671366317, + "grad_norm": 2.660408633103373, + "learning_rate": 1e-06, + "loss": 0.4223, + "step": 1751 + }, + { + "epoch": 0.11227890284542424, + "grad_norm": 2.4592087263046505, + "learning_rate": 1e-06, + "loss": 0.3798, + "step": 1752 + }, + { + "epoch": 0.11234298897718534, + "grad_norm": 2.583126620448884, + "learning_rate": 1e-06, + "loss": 0.444, + "step": 1753 + }, + { + "epoch": 0.11240707510894643, + "grad_norm": 2.385137220467907, + "learning_rate": 1e-06, + "loss": 0.3733, + "step": 1754 + }, + { + "epoch": 0.11247116124070751, + "grad_norm": 2.583408504897648, + "learning_rate": 1e-06, + "loss": 0.3652, + "step": 1755 + }, + { + "epoch": 0.1125352473724686, + "grad_norm": 2.528928465202496, + "learning_rate": 1e-06, + "loss": 0.4839, + "step": 1756 + }, + { + "epoch": 0.11259933350422968, + "grad_norm": 2.4844004028727578, + "learning_rate": 1e-06, + "loss": 0.3589, + "step": 1757 + }, + { + "epoch": 0.11266341963599077, + "grad_norm": 2.6836038257816552, + "learning_rate": 1e-06, + "loss": 0.3876, + "step": 1758 + }, + { + "epoch": 0.11272750576775185, + "grad_norm": 2.5384485116106017, + "learning_rate": 1e-06, + "loss": 0.4044, + "step": 1759 + }, + { + "epoch": 0.11279159189951295, + "grad_norm": 2.753583038091332, + "learning_rate": 1e-06, + "loss": 0.4455, + "step": 1760 + }, + { + "epoch": 0.11285567803127403, + "grad_norm": 2.9345543598136827, + "learning_rate": 1e-06, + "loss": 0.4116, + "step": 1761 + }, + { + "epoch": 0.11291976416303512, + "grad_norm": 2.7326619237198067, + "learning_rate": 1e-06, + "loss": 0.3968, + "step": 1762 + }, + { + "epoch": 0.11298385029479621, + "grad_norm": 2.64669411763611, + "learning_rate": 1e-06, + "loss": 0.3625, + "step": 1763 + }, + { + "epoch": 0.11304793642655729, + "grad_norm": 2.552350284148369, + "learning_rate": 1e-06, + "loss": 0.3777, + "step": 1764 + }, + { + "epoch": 0.11311202255831838, + "grad_norm": 2.8295097832904132, + "learning_rate": 1e-06, + "loss": 0.4645, + "step": 1765 + }, + { + "epoch": 0.11317610869007946, + "grad_norm": 2.77247592560161, + "learning_rate": 1e-06, + "loss": 0.4338, + "step": 1766 + }, + { + "epoch": 0.11324019482184056, + "grad_norm": 2.5665589634227968, + "learning_rate": 1e-06, + "loss": 0.4078, + "step": 1767 + }, + { + "epoch": 0.11330428095360164, + "grad_norm": 2.5795506039181593, + "learning_rate": 1e-06, + "loss": 0.4332, + "step": 1768 + }, + { + "epoch": 0.11336836708536273, + "grad_norm": 2.509627569113794, + "learning_rate": 1e-06, + "loss": 0.3402, + "step": 1769 + }, + { + "epoch": 0.11343245321712381, + "grad_norm": 2.8115985620573087, + "learning_rate": 1e-06, + "loss": 0.4901, + "step": 1770 + }, + { + "epoch": 0.1134965393488849, + "grad_norm": 2.839001751369005, + "learning_rate": 1e-06, + "loss": 0.4355, + "step": 1771 + }, + { + "epoch": 0.113560625480646, + "grad_norm": 2.565272919502216, + "learning_rate": 1e-06, + "loss": 0.378, + "step": 1772 + }, + { + "epoch": 0.11362471161240707, + "grad_norm": 2.5032995137142797, + "learning_rate": 1e-06, + "loss": 0.371, + "step": 1773 + }, + { + "epoch": 0.11368879774416817, + "grad_norm": 2.5599129867744015, + "learning_rate": 1e-06, + "loss": 0.4237, + "step": 1774 + }, + { + "epoch": 0.11375288387592924, + "grad_norm": 2.8282808159804227, + "learning_rate": 1e-06, + "loss": 0.397, + "step": 1775 + }, + { + "epoch": 0.11381697000769034, + "grad_norm": 2.435898564425463, + "learning_rate": 1e-06, + "loss": 0.4122, + "step": 1776 + }, + { + "epoch": 0.11388105613945142, + "grad_norm": 2.590842457237392, + "learning_rate": 1e-06, + "loss": 0.4441, + "step": 1777 + }, + { + "epoch": 0.11394514227121251, + "grad_norm": 2.78652717011761, + "learning_rate": 1e-06, + "loss": 0.4343, + "step": 1778 + }, + { + "epoch": 0.1140092284029736, + "grad_norm": 2.8160506518694404, + "learning_rate": 1e-06, + "loss": 0.4206, + "step": 1779 + }, + { + "epoch": 0.11407331453473468, + "grad_norm": 2.7610789996086043, + "learning_rate": 1e-06, + "loss": 0.4261, + "step": 1780 + }, + { + "epoch": 0.11413740066649578, + "grad_norm": 2.6617844651355846, + "learning_rate": 1e-06, + "loss": 0.3811, + "step": 1781 + }, + { + "epoch": 0.11420148679825685, + "grad_norm": 2.496233276933666, + "learning_rate": 1e-06, + "loss": 0.4673, + "step": 1782 + }, + { + "epoch": 0.11426557293001795, + "grad_norm": 2.455381531806013, + "learning_rate": 1e-06, + "loss": 0.3537, + "step": 1783 + }, + { + "epoch": 0.11432965906177903, + "grad_norm": 2.7512371176528867, + "learning_rate": 1e-06, + "loss": 0.3862, + "step": 1784 + }, + { + "epoch": 0.11439374519354012, + "grad_norm": 2.455432753073087, + "learning_rate": 1e-06, + "loss": 0.3869, + "step": 1785 + }, + { + "epoch": 0.1144578313253012, + "grad_norm": 2.5204235042326544, + "learning_rate": 1e-06, + "loss": 0.3941, + "step": 1786 + }, + { + "epoch": 0.11452191745706229, + "grad_norm": 2.8503009414302665, + "learning_rate": 1e-06, + "loss": 0.3999, + "step": 1787 + }, + { + "epoch": 0.11458600358882338, + "grad_norm": 2.6189598564105228, + "learning_rate": 1e-06, + "loss": 0.4742, + "step": 1788 + }, + { + "epoch": 0.11465008972058446, + "grad_norm": 2.605962177702438, + "learning_rate": 1e-06, + "loss": 0.4198, + "step": 1789 + }, + { + "epoch": 0.11471417585234556, + "grad_norm": 2.6260862237551996, + "learning_rate": 1e-06, + "loss": 0.4471, + "step": 1790 + }, + { + "epoch": 0.11477826198410664, + "grad_norm": 2.9144544195926447, + "learning_rate": 1e-06, + "loss": 0.4141, + "step": 1791 + }, + { + "epoch": 0.11484234811586773, + "grad_norm": 2.7848484757662604, + "learning_rate": 1e-06, + "loss": 0.4851, + "step": 1792 + }, + { + "epoch": 0.11490643424762881, + "grad_norm": 2.6432933600668926, + "learning_rate": 1e-06, + "loss": 0.4358, + "step": 1793 + }, + { + "epoch": 0.1149705203793899, + "grad_norm": 2.5969710533282697, + "learning_rate": 1e-06, + "loss": 0.3901, + "step": 1794 + }, + { + "epoch": 0.11503460651115098, + "grad_norm": 2.6038648664893076, + "learning_rate": 1e-06, + "loss": 0.3698, + "step": 1795 + }, + { + "epoch": 0.11509869264291207, + "grad_norm": 2.664492510496455, + "learning_rate": 1e-06, + "loss": 0.4344, + "step": 1796 + }, + { + "epoch": 0.11516277877467317, + "grad_norm": 2.4865877780180248, + "learning_rate": 1e-06, + "loss": 0.4152, + "step": 1797 + }, + { + "epoch": 0.11522686490643425, + "grad_norm": 2.6980174755024864, + "learning_rate": 1e-06, + "loss": 0.4049, + "step": 1798 + }, + { + "epoch": 0.11529095103819534, + "grad_norm": 2.7132888219218447, + "learning_rate": 1e-06, + "loss": 0.4101, + "step": 1799 + }, + { + "epoch": 0.11535503716995642, + "grad_norm": 2.6163019786785755, + "learning_rate": 1e-06, + "loss": 0.3957, + "step": 1800 + }, + { + "epoch": 0.11541912330171751, + "grad_norm": 2.438070830380359, + "learning_rate": 1e-06, + "loss": 0.3896, + "step": 1801 + }, + { + "epoch": 0.11548320943347859, + "grad_norm": 2.777361514104172, + "learning_rate": 1e-06, + "loss": 0.4361, + "step": 1802 + }, + { + "epoch": 0.11554729556523968, + "grad_norm": 2.4615952090613153, + "learning_rate": 1e-06, + "loss": 0.4725, + "step": 1803 + }, + { + "epoch": 0.11561138169700078, + "grad_norm": 2.786019550812383, + "learning_rate": 1e-06, + "loss": 0.4232, + "step": 1804 + }, + { + "epoch": 0.11567546782876185, + "grad_norm": 2.582580806012161, + "learning_rate": 1e-06, + "loss": 0.4174, + "step": 1805 + }, + { + "epoch": 0.11573955396052295, + "grad_norm": 2.6551798788814307, + "learning_rate": 1e-06, + "loss": 0.4031, + "step": 1806 + }, + { + "epoch": 0.11580364009228403, + "grad_norm": 2.628264393086419, + "learning_rate": 1e-06, + "loss": 0.4183, + "step": 1807 + }, + { + "epoch": 0.11586772622404512, + "grad_norm": 2.554298651583823, + "learning_rate": 1e-06, + "loss": 0.4543, + "step": 1808 + }, + { + "epoch": 0.1159318123558062, + "grad_norm": 2.5870692588572477, + "learning_rate": 1e-06, + "loss": 0.4007, + "step": 1809 + }, + { + "epoch": 0.11599589848756729, + "grad_norm": 2.536498615958139, + "learning_rate": 1e-06, + "loss": 0.4527, + "step": 1810 + }, + { + "epoch": 0.11605998461932837, + "grad_norm": 2.7350417193936605, + "learning_rate": 1e-06, + "loss": 0.4948, + "step": 1811 + }, + { + "epoch": 0.11612407075108946, + "grad_norm": 2.8962172722341446, + "learning_rate": 1e-06, + "loss": 0.4139, + "step": 1812 + }, + { + "epoch": 0.11618815688285056, + "grad_norm": 2.447551642523489, + "learning_rate": 1e-06, + "loss": 0.4313, + "step": 1813 + }, + { + "epoch": 0.11625224301461164, + "grad_norm": 2.7581062937740324, + "learning_rate": 1e-06, + "loss": 0.4046, + "step": 1814 + }, + { + "epoch": 0.11631632914637273, + "grad_norm": 2.6613780988530387, + "learning_rate": 1e-06, + "loss": 0.4378, + "step": 1815 + }, + { + "epoch": 0.11638041527813381, + "grad_norm": 2.7024783765058067, + "learning_rate": 1e-06, + "loss": 0.3638, + "step": 1816 + }, + { + "epoch": 0.1164445014098949, + "grad_norm": 2.610733885000389, + "learning_rate": 1e-06, + "loss": 0.4581, + "step": 1817 + }, + { + "epoch": 0.11650858754165598, + "grad_norm": 2.7818604058811367, + "learning_rate": 1e-06, + "loss": 0.3369, + "step": 1818 + }, + { + "epoch": 0.11657267367341707, + "grad_norm": 2.852704189136639, + "learning_rate": 1e-06, + "loss": 0.4626, + "step": 1819 + }, + { + "epoch": 0.11663675980517815, + "grad_norm": 2.6925423286197496, + "learning_rate": 1e-06, + "loss": 0.4053, + "step": 1820 + }, + { + "epoch": 0.11670084593693925, + "grad_norm": 2.8192484239080056, + "learning_rate": 1e-06, + "loss": 0.4099, + "step": 1821 + }, + { + "epoch": 0.11676493206870034, + "grad_norm": 2.8485436665584536, + "learning_rate": 1e-06, + "loss": 0.4731, + "step": 1822 + }, + { + "epoch": 0.11682901820046142, + "grad_norm": 2.6475121457375645, + "learning_rate": 1e-06, + "loss": 0.4149, + "step": 1823 + }, + { + "epoch": 0.11689310433222251, + "grad_norm": 2.8733685444969903, + "learning_rate": 1e-06, + "loss": 0.3954, + "step": 1824 + }, + { + "epoch": 0.11695719046398359, + "grad_norm": 2.6062579224564595, + "learning_rate": 1e-06, + "loss": 0.386, + "step": 1825 + }, + { + "epoch": 0.11702127659574468, + "grad_norm": 2.6613936407149485, + "learning_rate": 1e-06, + "loss": 0.3948, + "step": 1826 + }, + { + "epoch": 0.11708536272750576, + "grad_norm": 2.607041472301362, + "learning_rate": 1e-06, + "loss": 0.4213, + "step": 1827 + }, + { + "epoch": 0.11714944885926686, + "grad_norm": 2.595697100885704, + "learning_rate": 1e-06, + "loss": 0.3763, + "step": 1828 + }, + { + "epoch": 0.11721353499102795, + "grad_norm": 2.627079136159903, + "learning_rate": 1e-06, + "loss": 0.4152, + "step": 1829 + }, + { + "epoch": 0.11727762112278903, + "grad_norm": 2.8799355704445695, + "learning_rate": 1e-06, + "loss": 0.4191, + "step": 1830 + }, + { + "epoch": 0.11734170725455012, + "grad_norm": 2.66793046478117, + "learning_rate": 1e-06, + "loss": 0.4221, + "step": 1831 + }, + { + "epoch": 0.1174057933863112, + "grad_norm": 2.5435157561676003, + "learning_rate": 1e-06, + "loss": 0.4753, + "step": 1832 + }, + { + "epoch": 0.11746987951807229, + "grad_norm": 2.6227753499816684, + "learning_rate": 1e-06, + "loss": 0.4436, + "step": 1833 + }, + { + "epoch": 0.11753396564983337, + "grad_norm": 2.6787886244585413, + "learning_rate": 1e-06, + "loss": 0.3946, + "step": 1834 + }, + { + "epoch": 0.11759805178159446, + "grad_norm": 2.6989424714818435, + "learning_rate": 1e-06, + "loss": 0.3964, + "step": 1835 + }, + { + "epoch": 0.11766213791335554, + "grad_norm": 2.693869459395887, + "learning_rate": 1e-06, + "loss": 0.3903, + "step": 1836 + }, + { + "epoch": 0.11772622404511664, + "grad_norm": 2.5534714081744165, + "learning_rate": 1e-06, + "loss": 0.4123, + "step": 1837 + }, + { + "epoch": 0.11779031017687773, + "grad_norm": 2.5396904316255298, + "learning_rate": 1e-06, + "loss": 0.4625, + "step": 1838 + }, + { + "epoch": 0.11785439630863881, + "grad_norm": 2.6864914728899834, + "learning_rate": 1e-06, + "loss": 0.4628, + "step": 1839 + }, + { + "epoch": 0.1179184824403999, + "grad_norm": 2.4031481570736, + "learning_rate": 1e-06, + "loss": 0.4412, + "step": 1840 + }, + { + "epoch": 0.11798256857216098, + "grad_norm": 2.8510808627208566, + "learning_rate": 1e-06, + "loss": 0.4295, + "step": 1841 + }, + { + "epoch": 0.11804665470392207, + "grad_norm": 2.625121495397035, + "learning_rate": 1e-06, + "loss": 0.4293, + "step": 1842 + }, + { + "epoch": 0.11811074083568315, + "grad_norm": 2.379041716484206, + "learning_rate": 1e-06, + "loss": 0.3987, + "step": 1843 + }, + { + "epoch": 0.11817482696744425, + "grad_norm": 2.941854719930437, + "learning_rate": 1e-06, + "loss": 0.4159, + "step": 1844 + }, + { + "epoch": 0.11823891309920533, + "grad_norm": 2.659123817605973, + "learning_rate": 1e-06, + "loss": 0.3979, + "step": 1845 + }, + { + "epoch": 0.11830299923096642, + "grad_norm": 2.664461683615228, + "learning_rate": 1e-06, + "loss": 0.372, + "step": 1846 + }, + { + "epoch": 0.11836708536272751, + "grad_norm": 2.4454388383128145, + "learning_rate": 1e-06, + "loss": 0.3694, + "step": 1847 + }, + { + "epoch": 0.11843117149448859, + "grad_norm": 2.9853732892969007, + "learning_rate": 1e-06, + "loss": 0.4965, + "step": 1848 + }, + { + "epoch": 0.11849525762624968, + "grad_norm": 2.5862583091122637, + "learning_rate": 1e-06, + "loss": 0.429, + "step": 1849 + }, + { + "epoch": 0.11855934375801076, + "grad_norm": 2.650195995580348, + "learning_rate": 1e-06, + "loss": 0.4274, + "step": 1850 + }, + { + "epoch": 0.11862342988977186, + "grad_norm": 2.4840409762343776, + "learning_rate": 1e-06, + "loss": 0.4511, + "step": 1851 + }, + { + "epoch": 0.11868751602153294, + "grad_norm": 2.8313952437027763, + "learning_rate": 1e-06, + "loss": 0.3909, + "step": 1852 + }, + { + "epoch": 0.11875160215329403, + "grad_norm": 2.598131250280352, + "learning_rate": 1e-06, + "loss": 0.4383, + "step": 1853 + }, + { + "epoch": 0.11881568828505511, + "grad_norm": 2.5956430213471644, + "learning_rate": 1e-06, + "loss": 0.3624, + "step": 1854 + }, + { + "epoch": 0.1188797744168162, + "grad_norm": 2.819100294041442, + "learning_rate": 1e-06, + "loss": 0.4846, + "step": 1855 + }, + { + "epoch": 0.1189438605485773, + "grad_norm": 2.659323824569532, + "learning_rate": 1e-06, + "loss": 0.4342, + "step": 1856 + }, + { + "epoch": 0.11900794668033837, + "grad_norm": 2.5236617424640784, + "learning_rate": 1e-06, + "loss": 0.4015, + "step": 1857 + }, + { + "epoch": 0.11907203281209947, + "grad_norm": 2.4090615191739526, + "learning_rate": 1e-06, + "loss": 0.4327, + "step": 1858 + }, + { + "epoch": 0.11913611894386054, + "grad_norm": 2.769880388165618, + "learning_rate": 1e-06, + "loss": 0.381, + "step": 1859 + }, + { + "epoch": 0.11920020507562164, + "grad_norm": 2.819628662213885, + "learning_rate": 1e-06, + "loss": 0.4547, + "step": 1860 + }, + { + "epoch": 0.11926429120738272, + "grad_norm": 2.5754514969457873, + "learning_rate": 1e-06, + "loss": 0.4594, + "step": 1861 + }, + { + "epoch": 0.11932837733914381, + "grad_norm": 2.5022880249203325, + "learning_rate": 1e-06, + "loss": 0.4144, + "step": 1862 + }, + { + "epoch": 0.1193924634709049, + "grad_norm": 2.4699745018799404, + "learning_rate": 1e-06, + "loss": 0.3806, + "step": 1863 + }, + { + "epoch": 0.11945654960266598, + "grad_norm": 2.6456565415557374, + "learning_rate": 1e-06, + "loss": 0.4065, + "step": 1864 + }, + { + "epoch": 0.11952063573442707, + "grad_norm": 2.4005550753618015, + "learning_rate": 1e-06, + "loss": 0.4158, + "step": 1865 + }, + { + "epoch": 0.11958472186618815, + "grad_norm": 2.5060801713618983, + "learning_rate": 1e-06, + "loss": 0.4542, + "step": 1866 + }, + { + "epoch": 0.11964880799794925, + "grad_norm": 2.4513719291485585, + "learning_rate": 1e-06, + "loss": 0.4339, + "step": 1867 + }, + { + "epoch": 0.11971289412971033, + "grad_norm": 2.6769921439010753, + "learning_rate": 1e-06, + "loss": 0.4052, + "step": 1868 + }, + { + "epoch": 0.11977698026147142, + "grad_norm": 2.6840492380158834, + "learning_rate": 1e-06, + "loss": 0.4007, + "step": 1869 + }, + { + "epoch": 0.1198410663932325, + "grad_norm": 2.803156210251872, + "learning_rate": 1e-06, + "loss": 0.39, + "step": 1870 + }, + { + "epoch": 0.11990515252499359, + "grad_norm": 2.6987854999821597, + "learning_rate": 1e-06, + "loss": 0.4034, + "step": 1871 + }, + { + "epoch": 0.11996923865675468, + "grad_norm": 2.665721870443163, + "learning_rate": 1e-06, + "loss": 0.4221, + "step": 1872 + }, + { + "epoch": 0.12003332478851576, + "grad_norm": 2.6560564490065093, + "learning_rate": 1e-06, + "loss": 0.3591, + "step": 1873 + }, + { + "epoch": 0.12009741092027686, + "grad_norm": 2.7021179621334004, + "learning_rate": 1e-06, + "loss": 0.4218, + "step": 1874 + }, + { + "epoch": 0.12016149705203794, + "grad_norm": 2.6055942460970827, + "learning_rate": 1e-06, + "loss": 0.4245, + "step": 1875 + }, + { + "epoch": 0.12022558318379903, + "grad_norm": 2.416483571969232, + "learning_rate": 1e-06, + "loss": 0.4176, + "step": 1876 + }, + { + "epoch": 0.12028966931556011, + "grad_norm": 2.8828531541836506, + "learning_rate": 1e-06, + "loss": 0.4091, + "step": 1877 + }, + { + "epoch": 0.1203537554473212, + "grad_norm": 2.6443922479248787, + "learning_rate": 1e-06, + "loss": 0.4548, + "step": 1878 + }, + { + "epoch": 0.12041784157908228, + "grad_norm": 2.688894934206754, + "learning_rate": 1e-06, + "loss": 0.4015, + "step": 1879 + }, + { + "epoch": 0.12048192771084337, + "grad_norm": 2.633924101866232, + "learning_rate": 1e-06, + "loss": 0.3938, + "step": 1880 + }, + { + "epoch": 0.12054601384260447, + "grad_norm": 2.807137697859837, + "learning_rate": 1e-06, + "loss": 0.413, + "step": 1881 + }, + { + "epoch": 0.12061009997436554, + "grad_norm": 2.552820173004839, + "learning_rate": 1e-06, + "loss": 0.4118, + "step": 1882 + }, + { + "epoch": 0.12067418610612664, + "grad_norm": 2.6265690322793978, + "learning_rate": 1e-06, + "loss": 0.3926, + "step": 1883 + }, + { + "epoch": 0.12073827223788772, + "grad_norm": 2.678479232293037, + "learning_rate": 1e-06, + "loss": 0.4639, + "step": 1884 + }, + { + "epoch": 0.12080235836964881, + "grad_norm": 2.6050744958084264, + "learning_rate": 1e-06, + "loss": 0.4349, + "step": 1885 + }, + { + "epoch": 0.12086644450140989, + "grad_norm": 2.7082918851876427, + "learning_rate": 1e-06, + "loss": 0.4765, + "step": 1886 + }, + { + "epoch": 0.12093053063317098, + "grad_norm": 2.526845919082137, + "learning_rate": 1e-06, + "loss": 0.3662, + "step": 1887 + }, + { + "epoch": 0.12099461676493208, + "grad_norm": 2.6302833083000703, + "learning_rate": 1e-06, + "loss": 0.366, + "step": 1888 + }, + { + "epoch": 0.12105870289669315, + "grad_norm": 2.553849692507201, + "learning_rate": 1e-06, + "loss": 0.423, + "step": 1889 + }, + { + "epoch": 0.12112278902845425, + "grad_norm": 2.683491080670172, + "learning_rate": 1e-06, + "loss": 0.4514, + "step": 1890 + }, + { + "epoch": 0.12118687516021533, + "grad_norm": 2.6364753658326125, + "learning_rate": 1e-06, + "loss": 0.4189, + "step": 1891 + }, + { + "epoch": 0.12125096129197642, + "grad_norm": 2.7165916312599423, + "learning_rate": 1e-06, + "loss": 0.3922, + "step": 1892 + }, + { + "epoch": 0.1213150474237375, + "grad_norm": 2.549170313734264, + "learning_rate": 1e-06, + "loss": 0.4106, + "step": 1893 + }, + { + "epoch": 0.12137913355549859, + "grad_norm": 2.6838182220230906, + "learning_rate": 1e-06, + "loss": 0.3677, + "step": 1894 + }, + { + "epoch": 0.12144321968725967, + "grad_norm": 2.6414043011658865, + "learning_rate": 1e-06, + "loss": 0.3912, + "step": 1895 + }, + { + "epoch": 0.12150730581902076, + "grad_norm": 2.5632629923340775, + "learning_rate": 1e-06, + "loss": 0.4224, + "step": 1896 + }, + { + "epoch": 0.12157139195078186, + "grad_norm": 2.544984397970424, + "learning_rate": 1e-06, + "loss": 0.4227, + "step": 1897 + }, + { + "epoch": 0.12163547808254294, + "grad_norm": 2.5847715613939455, + "learning_rate": 1e-06, + "loss": 0.4267, + "step": 1898 + }, + { + "epoch": 0.12169956421430403, + "grad_norm": 2.65803391386428, + "learning_rate": 1e-06, + "loss": 0.3746, + "step": 1899 + }, + { + "epoch": 0.12176365034606511, + "grad_norm": 2.6897903682954847, + "learning_rate": 1e-06, + "loss": 0.4652, + "step": 1900 + }, + { + "epoch": 0.1218277364778262, + "grad_norm": 2.56277118816797, + "learning_rate": 1e-06, + "loss": 0.4529, + "step": 1901 + }, + { + "epoch": 0.12189182260958728, + "grad_norm": 2.5517501695285016, + "learning_rate": 1e-06, + "loss": 0.465, + "step": 1902 + }, + { + "epoch": 0.12195590874134837, + "grad_norm": 2.648252581171726, + "learning_rate": 1e-06, + "loss": 0.455, + "step": 1903 + }, + { + "epoch": 0.12201999487310945, + "grad_norm": 2.6367007538455938, + "learning_rate": 1e-06, + "loss": 0.4412, + "step": 1904 + }, + { + "epoch": 0.12208408100487055, + "grad_norm": 2.6880014376963044, + "learning_rate": 1e-06, + "loss": 0.4533, + "step": 1905 + }, + { + "epoch": 0.12214816713663164, + "grad_norm": 2.6551564910990058, + "learning_rate": 1e-06, + "loss": 0.3949, + "step": 1906 + }, + { + "epoch": 0.12221225326839272, + "grad_norm": 2.814098153913346, + "learning_rate": 1e-06, + "loss": 0.37, + "step": 1907 + }, + { + "epoch": 0.12227633940015381, + "grad_norm": 2.5032472998077466, + "learning_rate": 1e-06, + "loss": 0.3843, + "step": 1908 + }, + { + "epoch": 0.12234042553191489, + "grad_norm": 2.8306387124319192, + "learning_rate": 1e-06, + "loss": 0.4515, + "step": 1909 + }, + { + "epoch": 0.12240451166367598, + "grad_norm": 2.42122372122656, + "learning_rate": 1e-06, + "loss": 0.4164, + "step": 1910 + }, + { + "epoch": 0.12246859779543706, + "grad_norm": 2.537249205277831, + "learning_rate": 1e-06, + "loss": 0.4379, + "step": 1911 + }, + { + "epoch": 0.12253268392719815, + "grad_norm": 2.6442746159357924, + "learning_rate": 1e-06, + "loss": 0.4344, + "step": 1912 + }, + { + "epoch": 0.12259677005895925, + "grad_norm": 2.656606308028338, + "learning_rate": 1e-06, + "loss": 0.379, + "step": 1913 + }, + { + "epoch": 0.12266085619072033, + "grad_norm": 2.8088794752870014, + "learning_rate": 1e-06, + "loss": 0.4047, + "step": 1914 + }, + { + "epoch": 0.12272494232248142, + "grad_norm": 2.582575451213974, + "learning_rate": 1e-06, + "loss": 0.4024, + "step": 1915 + }, + { + "epoch": 0.1227890284542425, + "grad_norm": 2.710660924430504, + "learning_rate": 1e-06, + "loss": 0.4155, + "step": 1916 + }, + { + "epoch": 0.12285311458600359, + "grad_norm": 2.42215293270404, + "learning_rate": 1e-06, + "loss": 0.4298, + "step": 1917 + }, + { + "epoch": 0.12291720071776467, + "grad_norm": 2.5921801959218316, + "learning_rate": 1e-06, + "loss": 0.3942, + "step": 1918 + }, + { + "epoch": 0.12298128684952576, + "grad_norm": 2.5278890165586057, + "learning_rate": 1e-06, + "loss": 0.3819, + "step": 1919 + }, + { + "epoch": 0.12304537298128684, + "grad_norm": 2.524488936361231, + "learning_rate": 1e-06, + "loss": 0.4046, + "step": 1920 + }, + { + "epoch": 0.12310945911304794, + "grad_norm": 2.4328966559195675, + "learning_rate": 1e-06, + "loss": 0.4195, + "step": 1921 + }, + { + "epoch": 0.12317354524480903, + "grad_norm": 2.774360894158552, + "learning_rate": 1e-06, + "loss": 0.4204, + "step": 1922 + }, + { + "epoch": 0.12323763137657011, + "grad_norm": 2.518717450808227, + "learning_rate": 1e-06, + "loss": 0.4391, + "step": 1923 + }, + { + "epoch": 0.1233017175083312, + "grad_norm": 2.832159155238884, + "learning_rate": 1e-06, + "loss": 0.4229, + "step": 1924 + }, + { + "epoch": 0.12336580364009228, + "grad_norm": 2.511577878519737, + "learning_rate": 1e-06, + "loss": 0.4165, + "step": 1925 + }, + { + "epoch": 0.12342988977185337, + "grad_norm": 3.5208364019882885, + "learning_rate": 1e-06, + "loss": 0.3873, + "step": 1926 + }, + { + "epoch": 0.12349397590361445, + "grad_norm": 2.427688682125614, + "learning_rate": 1e-06, + "loss": 0.377, + "step": 1927 + }, + { + "epoch": 0.12355806203537555, + "grad_norm": 2.6404501242098837, + "learning_rate": 1e-06, + "loss": 0.454, + "step": 1928 + }, + { + "epoch": 0.12362214816713663, + "grad_norm": 2.441849984145885, + "learning_rate": 1e-06, + "loss": 0.3963, + "step": 1929 + }, + { + "epoch": 0.12368623429889772, + "grad_norm": 2.443122503103709, + "learning_rate": 1e-06, + "loss": 0.4631, + "step": 1930 + }, + { + "epoch": 0.12375032043065881, + "grad_norm": 2.8171252209762474, + "learning_rate": 1e-06, + "loss": 0.4887, + "step": 1931 + }, + { + "epoch": 0.12381440656241989, + "grad_norm": 2.4480320743078887, + "learning_rate": 1e-06, + "loss": 0.3477, + "step": 1932 + }, + { + "epoch": 0.12387849269418098, + "grad_norm": 2.6932808980425107, + "learning_rate": 1e-06, + "loss": 0.4569, + "step": 1933 + }, + { + "epoch": 0.12394257882594206, + "grad_norm": 2.820935235598579, + "learning_rate": 1e-06, + "loss": 0.4038, + "step": 1934 + }, + { + "epoch": 0.12400666495770316, + "grad_norm": 2.6341412685655983, + "learning_rate": 1e-06, + "loss": 0.4254, + "step": 1935 + }, + { + "epoch": 0.12407075108946423, + "grad_norm": 2.6019885732357326, + "learning_rate": 1e-06, + "loss": 0.4036, + "step": 1936 + }, + { + "epoch": 0.12413483722122533, + "grad_norm": 2.6751182734908965, + "learning_rate": 1e-06, + "loss": 0.4484, + "step": 1937 + }, + { + "epoch": 0.12419892335298642, + "grad_norm": 2.5870567275864325, + "learning_rate": 1e-06, + "loss": 0.4145, + "step": 1938 + }, + { + "epoch": 0.1242630094847475, + "grad_norm": 2.5353320352240525, + "learning_rate": 1e-06, + "loss": 0.4034, + "step": 1939 + }, + { + "epoch": 0.12432709561650859, + "grad_norm": 2.598518940629256, + "learning_rate": 1e-06, + "loss": 0.3844, + "step": 1940 + }, + { + "epoch": 0.12439118174826967, + "grad_norm": 2.562879451548365, + "learning_rate": 1e-06, + "loss": 0.3962, + "step": 1941 + }, + { + "epoch": 0.12445526788003076, + "grad_norm": 2.645786720650379, + "learning_rate": 1e-06, + "loss": 0.4838, + "step": 1942 + }, + { + "epoch": 0.12451935401179184, + "grad_norm": 2.6403502102882808, + "learning_rate": 1e-06, + "loss": 0.4711, + "step": 1943 + }, + { + "epoch": 0.12458344014355294, + "grad_norm": 2.876774406457867, + "learning_rate": 1e-06, + "loss": 0.4092, + "step": 1944 + }, + { + "epoch": 0.12464752627531402, + "grad_norm": 2.751100020052121, + "learning_rate": 1e-06, + "loss": 0.4066, + "step": 1945 + }, + { + "epoch": 0.12471161240707511, + "grad_norm": 2.721494548343405, + "learning_rate": 1e-06, + "loss": 0.4333, + "step": 1946 + }, + { + "epoch": 0.1247756985388362, + "grad_norm": 3.0707931228709318, + "learning_rate": 1e-06, + "loss": 0.4444, + "step": 1947 + }, + { + "epoch": 0.12483978467059728, + "grad_norm": 2.5536278780711874, + "learning_rate": 1e-06, + "loss": 0.4432, + "step": 1948 + }, + { + "epoch": 0.12490387080235837, + "grad_norm": 2.5952295096194327, + "learning_rate": 1e-06, + "loss": 0.4064, + "step": 1949 + }, + { + "epoch": 0.12496795693411945, + "grad_norm": 2.804966831875095, + "learning_rate": 1e-06, + "loss": 0.4377, + "step": 1950 + }, + { + "epoch": 0.12503204306588053, + "grad_norm": 2.6254645986466016, + "learning_rate": 1e-06, + "loss": 0.3889, + "step": 1951 + }, + { + "epoch": 0.12509612919764163, + "grad_norm": 2.8575547875752734, + "learning_rate": 1e-06, + "loss": 0.4404, + "step": 1952 + }, + { + "epoch": 0.12516021532940272, + "grad_norm": 2.890946909114489, + "learning_rate": 1e-06, + "loss": 0.3916, + "step": 1953 + }, + { + "epoch": 0.1252243014611638, + "grad_norm": 2.4936005818287588, + "learning_rate": 1e-06, + "loss": 0.4075, + "step": 1954 + }, + { + "epoch": 0.1252883875929249, + "grad_norm": 2.6488281056040917, + "learning_rate": 1e-06, + "loss": 0.4452, + "step": 1955 + }, + { + "epoch": 0.12535247372468597, + "grad_norm": 2.5861471919909875, + "learning_rate": 1e-06, + "loss": 0.3942, + "step": 1956 + }, + { + "epoch": 0.12541655985644706, + "grad_norm": 3.5318637594090085, + "learning_rate": 1e-06, + "loss": 0.3976, + "step": 1957 + }, + { + "epoch": 0.12548064598820816, + "grad_norm": 2.399050718582829, + "learning_rate": 1e-06, + "loss": 0.3801, + "step": 1958 + }, + { + "epoch": 0.12554473211996925, + "grad_norm": 2.597714681002882, + "learning_rate": 1e-06, + "loss": 0.3961, + "step": 1959 + }, + { + "epoch": 0.12560881825173031, + "grad_norm": 2.4663048518140362, + "learning_rate": 1e-06, + "loss": 0.4151, + "step": 1960 + }, + { + "epoch": 0.1256729043834914, + "grad_norm": 2.5624663402691126, + "learning_rate": 1e-06, + "loss": 0.4148, + "step": 1961 + }, + { + "epoch": 0.1257369905152525, + "grad_norm": 3.140524750218864, + "learning_rate": 1e-06, + "loss": 0.4058, + "step": 1962 + }, + { + "epoch": 0.1258010766470136, + "grad_norm": 2.810127821809344, + "learning_rate": 1e-06, + "loss": 0.4366, + "step": 1963 + }, + { + "epoch": 0.1258651627787747, + "grad_norm": 2.5209202909604, + "learning_rate": 1e-06, + "loss": 0.4337, + "step": 1964 + }, + { + "epoch": 0.12592924891053575, + "grad_norm": 2.651740149209284, + "learning_rate": 1e-06, + "loss": 0.4091, + "step": 1965 + }, + { + "epoch": 0.12599333504229684, + "grad_norm": 2.6638550091846, + "learning_rate": 1e-06, + "loss": 0.3951, + "step": 1966 + }, + { + "epoch": 0.12605742117405794, + "grad_norm": 2.7390919328674106, + "learning_rate": 1e-06, + "loss": 0.4224, + "step": 1967 + }, + { + "epoch": 0.12612150730581903, + "grad_norm": 2.568954836564736, + "learning_rate": 1e-06, + "loss": 0.4179, + "step": 1968 + }, + { + "epoch": 0.1261855934375801, + "grad_norm": 2.623192224449949, + "learning_rate": 1e-06, + "loss": 0.4082, + "step": 1969 + }, + { + "epoch": 0.1262496795693412, + "grad_norm": 2.772705656163098, + "learning_rate": 1e-06, + "loss": 0.4291, + "step": 1970 + }, + { + "epoch": 0.12631376570110228, + "grad_norm": 3.0653029335133293, + "learning_rate": 1e-06, + "loss": 0.4196, + "step": 1971 + }, + { + "epoch": 0.12637785183286337, + "grad_norm": 2.71401827712792, + "learning_rate": 1e-06, + "loss": 0.4095, + "step": 1972 + }, + { + "epoch": 0.12644193796462447, + "grad_norm": 2.3069147395316767, + "learning_rate": 1e-06, + "loss": 0.4296, + "step": 1973 + }, + { + "epoch": 0.12650602409638553, + "grad_norm": 2.944709378967964, + "learning_rate": 1e-06, + "loss": 0.4009, + "step": 1974 + }, + { + "epoch": 0.12657011022814663, + "grad_norm": 2.6377150914839675, + "learning_rate": 1e-06, + "loss": 0.4023, + "step": 1975 + }, + { + "epoch": 0.12663419635990772, + "grad_norm": 2.8380873293773226, + "learning_rate": 1e-06, + "loss": 0.4077, + "step": 1976 + }, + { + "epoch": 0.1266982824916688, + "grad_norm": 2.677246943205837, + "learning_rate": 1e-06, + "loss": 0.4423, + "step": 1977 + }, + { + "epoch": 0.12676236862342988, + "grad_norm": 2.5127787900075402, + "learning_rate": 1e-06, + "loss": 0.3851, + "step": 1978 + }, + { + "epoch": 0.12682645475519097, + "grad_norm": 2.4144112563671793, + "learning_rate": 1e-06, + "loss": 0.4408, + "step": 1979 + }, + { + "epoch": 0.12689054088695206, + "grad_norm": 2.592467710784995, + "learning_rate": 1e-06, + "loss": 0.4076, + "step": 1980 + }, + { + "epoch": 0.12695462701871316, + "grad_norm": 2.702943687034177, + "learning_rate": 1e-06, + "loss": 0.4539, + "step": 1981 + }, + { + "epoch": 0.12701871315047425, + "grad_norm": 2.555904697910308, + "learning_rate": 1e-06, + "loss": 0.3962, + "step": 1982 + }, + { + "epoch": 0.12708279928223531, + "grad_norm": 2.6617695924556775, + "learning_rate": 1e-06, + "loss": 0.4315, + "step": 1983 + }, + { + "epoch": 0.1271468854139964, + "grad_norm": 2.5865121829306155, + "learning_rate": 1e-06, + "loss": 0.378, + "step": 1984 + }, + { + "epoch": 0.1272109715457575, + "grad_norm": 2.4733075250800067, + "learning_rate": 1e-06, + "loss": 0.4381, + "step": 1985 + }, + { + "epoch": 0.1272750576775186, + "grad_norm": 2.7177984913035558, + "learning_rate": 1e-06, + "loss": 0.4293, + "step": 1986 + }, + { + "epoch": 0.12733914380927966, + "grad_norm": 2.68776734487849, + "learning_rate": 1e-06, + "loss": 0.3889, + "step": 1987 + }, + { + "epoch": 0.12740322994104075, + "grad_norm": 2.619007004179668, + "learning_rate": 1e-06, + "loss": 0.4024, + "step": 1988 + }, + { + "epoch": 0.12746731607280185, + "grad_norm": 2.65050880017839, + "learning_rate": 1e-06, + "loss": 0.4148, + "step": 1989 + }, + { + "epoch": 0.12753140220456294, + "grad_norm": 2.6588622202293077, + "learning_rate": 1e-06, + "loss": 0.4207, + "step": 1990 + }, + { + "epoch": 0.12759548833632403, + "grad_norm": 2.708910525725563, + "learning_rate": 1e-06, + "loss": 0.4158, + "step": 1991 + }, + { + "epoch": 0.1276595744680851, + "grad_norm": 2.5687777946786285, + "learning_rate": 1e-06, + "loss": 0.3429, + "step": 1992 + }, + { + "epoch": 0.1277236605998462, + "grad_norm": 2.8445974840088857, + "learning_rate": 1e-06, + "loss": 0.3966, + "step": 1993 + }, + { + "epoch": 0.12778774673160728, + "grad_norm": 2.4812860715534737, + "learning_rate": 1e-06, + "loss": 0.4086, + "step": 1994 + }, + { + "epoch": 0.12785183286336838, + "grad_norm": 2.5067894831823274, + "learning_rate": 1e-06, + "loss": 0.3674, + "step": 1995 + }, + { + "epoch": 0.12791591899512944, + "grad_norm": 3.120477088832285, + "learning_rate": 1e-06, + "loss": 0.4126, + "step": 1996 + }, + { + "epoch": 0.12798000512689053, + "grad_norm": 2.68728983577007, + "learning_rate": 1e-06, + "loss": 0.3998, + "step": 1997 + }, + { + "epoch": 0.12804409125865163, + "grad_norm": 2.5708000190381006, + "learning_rate": 1e-06, + "loss": 0.3876, + "step": 1998 + }, + { + "epoch": 0.12810817739041272, + "grad_norm": 2.816934859878245, + "learning_rate": 1e-06, + "loss": 0.4359, + "step": 1999 + }, + { + "epoch": 0.1281722635221738, + "grad_norm": 2.7079638576190903, + "learning_rate": 1e-06, + "loss": 0.4471, + "step": 2000 + }, + { + "epoch": 0.12823634965393488, + "grad_norm": 2.5800674482671635, + "learning_rate": 1e-06, + "loss": 0.4317, + "step": 2001 + }, + { + "epoch": 0.12830043578569597, + "grad_norm": 2.4591428885818507, + "learning_rate": 1e-06, + "loss": 0.421, + "step": 2002 + }, + { + "epoch": 0.12836452191745706, + "grad_norm": 2.6827533850410337, + "learning_rate": 1e-06, + "loss": 0.3498, + "step": 2003 + }, + { + "epoch": 0.12842860804921816, + "grad_norm": 2.709188331291668, + "learning_rate": 1e-06, + "loss": 0.4272, + "step": 2004 + }, + { + "epoch": 0.12849269418097922, + "grad_norm": 2.499208486274247, + "learning_rate": 1e-06, + "loss": 0.4053, + "step": 2005 + }, + { + "epoch": 0.12855678031274032, + "grad_norm": 2.6909496268940214, + "learning_rate": 1e-06, + "loss": 0.4524, + "step": 2006 + }, + { + "epoch": 0.1286208664445014, + "grad_norm": 2.802587101271318, + "learning_rate": 1e-06, + "loss": 0.4081, + "step": 2007 + }, + { + "epoch": 0.1286849525762625, + "grad_norm": 2.4601661186060673, + "learning_rate": 1e-06, + "loss": 0.3981, + "step": 2008 + }, + { + "epoch": 0.1287490387080236, + "grad_norm": 2.654138037986819, + "learning_rate": 1e-06, + "loss": 0.3885, + "step": 2009 + }, + { + "epoch": 0.12881312483978466, + "grad_norm": 2.8370521642356876, + "learning_rate": 1e-06, + "loss": 0.434, + "step": 2010 + }, + { + "epoch": 0.12887721097154575, + "grad_norm": 2.5252795408324964, + "learning_rate": 1e-06, + "loss": 0.4169, + "step": 2011 + }, + { + "epoch": 0.12894129710330685, + "grad_norm": 2.4799437101808866, + "learning_rate": 1e-06, + "loss": 0.3828, + "step": 2012 + }, + { + "epoch": 0.12900538323506794, + "grad_norm": 2.7494549099164614, + "learning_rate": 1e-06, + "loss": 0.4594, + "step": 2013 + }, + { + "epoch": 0.12906946936682903, + "grad_norm": 2.4795137457784926, + "learning_rate": 1e-06, + "loss": 0.4005, + "step": 2014 + }, + { + "epoch": 0.1291335554985901, + "grad_norm": 2.7520680465218175, + "learning_rate": 1e-06, + "loss": 0.4466, + "step": 2015 + }, + { + "epoch": 0.1291976416303512, + "grad_norm": 2.722284636249325, + "learning_rate": 1e-06, + "loss": 0.3849, + "step": 2016 + }, + { + "epoch": 0.12926172776211228, + "grad_norm": 2.6524413526819237, + "learning_rate": 1e-06, + "loss": 0.4324, + "step": 2017 + }, + { + "epoch": 0.12932581389387338, + "grad_norm": 2.66292985459627, + "learning_rate": 1e-06, + "loss": 0.4615, + "step": 2018 + }, + { + "epoch": 0.12938990002563444, + "grad_norm": 2.6159139687609576, + "learning_rate": 1e-06, + "loss": 0.4115, + "step": 2019 + }, + { + "epoch": 0.12945398615739553, + "grad_norm": 2.7173243701716823, + "learning_rate": 1e-06, + "loss": 0.4108, + "step": 2020 + }, + { + "epoch": 0.12951807228915663, + "grad_norm": 2.7996012785431357, + "learning_rate": 1e-06, + "loss": 0.418, + "step": 2021 + }, + { + "epoch": 0.12958215842091772, + "grad_norm": 2.6421635527383063, + "learning_rate": 1e-06, + "loss": 0.5005, + "step": 2022 + }, + { + "epoch": 0.1296462445526788, + "grad_norm": 2.701234561198063, + "learning_rate": 1e-06, + "loss": 0.4144, + "step": 2023 + }, + { + "epoch": 0.12971033068443988, + "grad_norm": 2.5982870193962793, + "learning_rate": 1e-06, + "loss": 0.4336, + "step": 2024 + }, + { + "epoch": 0.12977441681620097, + "grad_norm": 3.0523581053313165, + "learning_rate": 1e-06, + "loss": 0.4677, + "step": 2025 + }, + { + "epoch": 0.12983850294796206, + "grad_norm": 2.498398555961267, + "learning_rate": 1e-06, + "loss": 0.4339, + "step": 2026 + }, + { + "epoch": 0.12990258907972316, + "grad_norm": 2.6602278770071623, + "learning_rate": 1e-06, + "loss": 0.4301, + "step": 2027 + }, + { + "epoch": 0.12996667521148422, + "grad_norm": 2.4848270044860645, + "learning_rate": 1e-06, + "loss": 0.4618, + "step": 2028 + }, + { + "epoch": 0.13003076134324532, + "grad_norm": 2.6426560908155805, + "learning_rate": 1e-06, + "loss": 0.4431, + "step": 2029 + }, + { + "epoch": 0.1300948474750064, + "grad_norm": 2.685119281293971, + "learning_rate": 1e-06, + "loss": 0.4239, + "step": 2030 + }, + { + "epoch": 0.1301589336067675, + "grad_norm": 2.653893626434556, + "learning_rate": 1e-06, + "loss": 0.4626, + "step": 2031 + }, + { + "epoch": 0.1302230197385286, + "grad_norm": 2.693815977647124, + "learning_rate": 1e-06, + "loss": 0.4073, + "step": 2032 + }, + { + "epoch": 0.13028710587028966, + "grad_norm": 2.7742930167368316, + "learning_rate": 1e-06, + "loss": 0.4358, + "step": 2033 + }, + { + "epoch": 0.13035119200205075, + "grad_norm": 2.4521871241228532, + "learning_rate": 1e-06, + "loss": 0.3665, + "step": 2034 + }, + { + "epoch": 0.13041527813381185, + "grad_norm": 2.5778926021918656, + "learning_rate": 1e-06, + "loss": 0.4419, + "step": 2035 + }, + { + "epoch": 0.13047936426557294, + "grad_norm": 2.5233030916242307, + "learning_rate": 1e-06, + "loss": 0.4635, + "step": 2036 + }, + { + "epoch": 0.130543450397334, + "grad_norm": 2.7769656735415227, + "learning_rate": 1e-06, + "loss": 0.4503, + "step": 2037 + }, + { + "epoch": 0.1306075365290951, + "grad_norm": 2.6061311223083528, + "learning_rate": 1e-06, + "loss": 0.4037, + "step": 2038 + }, + { + "epoch": 0.1306716226608562, + "grad_norm": 2.6441536424858416, + "learning_rate": 1e-06, + "loss": 0.4084, + "step": 2039 + }, + { + "epoch": 0.13073570879261728, + "grad_norm": 2.646813268165125, + "learning_rate": 1e-06, + "loss": 0.404, + "step": 2040 + }, + { + "epoch": 0.13079979492437838, + "grad_norm": 2.718568711377987, + "learning_rate": 1e-06, + "loss": 0.3933, + "step": 2041 + }, + { + "epoch": 0.13086388105613944, + "grad_norm": 2.4331819490330076, + "learning_rate": 1e-06, + "loss": 0.4277, + "step": 2042 + }, + { + "epoch": 0.13092796718790053, + "grad_norm": 2.5525577204591787, + "learning_rate": 1e-06, + "loss": 0.4228, + "step": 2043 + }, + { + "epoch": 0.13099205331966163, + "grad_norm": 2.591680461417647, + "learning_rate": 1e-06, + "loss": 0.3663, + "step": 2044 + }, + { + "epoch": 0.13105613945142272, + "grad_norm": 2.7473732680920464, + "learning_rate": 1e-06, + "loss": 0.4217, + "step": 2045 + }, + { + "epoch": 0.13112022558318379, + "grad_norm": 2.512421236575192, + "learning_rate": 1e-06, + "loss": 0.4137, + "step": 2046 + }, + { + "epoch": 0.13118431171494488, + "grad_norm": 2.745025052977934, + "learning_rate": 1e-06, + "loss": 0.4289, + "step": 2047 + }, + { + "epoch": 0.13124839784670597, + "grad_norm": 2.6990060791654327, + "learning_rate": 1e-06, + "loss": 0.4014, + "step": 2048 + }, + { + "epoch": 0.13131248397846706, + "grad_norm": 2.603256256896644, + "learning_rate": 1e-06, + "loss": 0.4655, + "step": 2049 + }, + { + "epoch": 0.13137657011022816, + "grad_norm": 2.6494921431521687, + "learning_rate": 1e-06, + "loss": 0.3774, + "step": 2050 + }, + { + "epoch": 0.13144065624198922, + "grad_norm": 2.8543194864950965, + "learning_rate": 1e-06, + "loss": 0.4587, + "step": 2051 + }, + { + "epoch": 0.13150474237375032, + "grad_norm": 2.7739591966852166, + "learning_rate": 1e-06, + "loss": 0.3972, + "step": 2052 + }, + { + "epoch": 0.1315688285055114, + "grad_norm": 2.4982817995739444, + "learning_rate": 1e-06, + "loss": 0.3863, + "step": 2053 + }, + { + "epoch": 0.1316329146372725, + "grad_norm": 2.6312308133622286, + "learning_rate": 1e-06, + "loss": 0.4148, + "step": 2054 + }, + { + "epoch": 0.13169700076903357, + "grad_norm": 3.0636204926026425, + "learning_rate": 1e-06, + "loss": 0.4028, + "step": 2055 + }, + { + "epoch": 0.13176108690079466, + "grad_norm": 2.7556518659223626, + "learning_rate": 1e-06, + "loss": 0.4212, + "step": 2056 + }, + { + "epoch": 0.13182517303255575, + "grad_norm": 2.7385712365433936, + "learning_rate": 1e-06, + "loss": 0.4222, + "step": 2057 + }, + { + "epoch": 0.13188925916431685, + "grad_norm": 2.533880989808742, + "learning_rate": 1e-06, + "loss": 0.4315, + "step": 2058 + }, + { + "epoch": 0.13195334529607794, + "grad_norm": 2.426764055890877, + "learning_rate": 1e-06, + "loss": 0.4003, + "step": 2059 + }, + { + "epoch": 0.132017431427839, + "grad_norm": 2.5769365252329246, + "learning_rate": 1e-06, + "loss": 0.4141, + "step": 2060 + }, + { + "epoch": 0.1320815175596001, + "grad_norm": 2.5672902578483447, + "learning_rate": 1e-06, + "loss": 0.3979, + "step": 2061 + }, + { + "epoch": 0.1321456036913612, + "grad_norm": 2.727113752369044, + "learning_rate": 1e-06, + "loss": 0.407, + "step": 2062 + }, + { + "epoch": 0.13220968982312228, + "grad_norm": 2.598448088421263, + "learning_rate": 1e-06, + "loss": 0.3973, + "step": 2063 + }, + { + "epoch": 0.13227377595488338, + "grad_norm": 2.808346442547483, + "learning_rate": 1e-06, + "loss": 0.3907, + "step": 2064 + }, + { + "epoch": 0.13233786208664444, + "grad_norm": 2.5537562254410817, + "learning_rate": 1e-06, + "loss": 0.3879, + "step": 2065 + }, + { + "epoch": 0.13240194821840554, + "grad_norm": 2.507844085434063, + "learning_rate": 1e-06, + "loss": 0.3912, + "step": 2066 + }, + { + "epoch": 0.13246603435016663, + "grad_norm": 2.6236845142825556, + "learning_rate": 1e-06, + "loss": 0.3805, + "step": 2067 + }, + { + "epoch": 0.13253012048192772, + "grad_norm": 2.5809839489336666, + "learning_rate": 1e-06, + "loss": 0.391, + "step": 2068 + }, + { + "epoch": 0.1325942066136888, + "grad_norm": 2.5456272047937096, + "learning_rate": 1e-06, + "loss": 0.3996, + "step": 2069 + }, + { + "epoch": 0.13265829274544988, + "grad_norm": 2.7029859623452235, + "learning_rate": 1e-06, + "loss": 0.3932, + "step": 2070 + }, + { + "epoch": 0.13272237887721097, + "grad_norm": 3.113893501887302, + "learning_rate": 1e-06, + "loss": 0.4189, + "step": 2071 + }, + { + "epoch": 0.13278646500897207, + "grad_norm": 2.895251806197545, + "learning_rate": 1e-06, + "loss": 0.4129, + "step": 2072 + }, + { + "epoch": 0.13285055114073316, + "grad_norm": 2.692808938525751, + "learning_rate": 1e-06, + "loss": 0.3822, + "step": 2073 + }, + { + "epoch": 0.13291463727249422, + "grad_norm": 2.666108543162362, + "learning_rate": 1e-06, + "loss": 0.4093, + "step": 2074 + }, + { + "epoch": 0.13297872340425532, + "grad_norm": 2.628539864622391, + "learning_rate": 1e-06, + "loss": 0.4195, + "step": 2075 + }, + { + "epoch": 0.1330428095360164, + "grad_norm": 2.6457888313754854, + "learning_rate": 1e-06, + "loss": 0.3665, + "step": 2076 + }, + { + "epoch": 0.1331068956677775, + "grad_norm": 2.73962603052359, + "learning_rate": 1e-06, + "loss": 0.4013, + "step": 2077 + }, + { + "epoch": 0.13317098179953857, + "grad_norm": 2.73176395407253, + "learning_rate": 1e-06, + "loss": 0.4025, + "step": 2078 + }, + { + "epoch": 0.13323506793129966, + "grad_norm": 2.698908843828074, + "learning_rate": 1e-06, + "loss": 0.4199, + "step": 2079 + }, + { + "epoch": 0.13329915406306075, + "grad_norm": 2.717511171326012, + "learning_rate": 1e-06, + "loss": 0.4055, + "step": 2080 + }, + { + "epoch": 0.13336324019482185, + "grad_norm": 2.663032680662114, + "learning_rate": 1e-06, + "loss": 0.4314, + "step": 2081 + }, + { + "epoch": 0.13342732632658294, + "grad_norm": 2.67227144538236, + "learning_rate": 1e-06, + "loss": 0.3949, + "step": 2082 + }, + { + "epoch": 0.133491412458344, + "grad_norm": 2.7192184119233347, + "learning_rate": 1e-06, + "loss": 0.3971, + "step": 2083 + }, + { + "epoch": 0.1335554985901051, + "grad_norm": 2.80607421933027, + "learning_rate": 1e-06, + "loss": 0.442, + "step": 2084 + }, + { + "epoch": 0.1336195847218662, + "grad_norm": 2.4282833747159365, + "learning_rate": 1e-06, + "loss": 0.3403, + "step": 2085 + }, + { + "epoch": 0.13368367085362728, + "grad_norm": 2.779501854115342, + "learning_rate": 1e-06, + "loss": 0.3793, + "step": 2086 + }, + { + "epoch": 0.13374775698538835, + "grad_norm": 2.605891767733823, + "learning_rate": 1e-06, + "loss": 0.4258, + "step": 2087 + }, + { + "epoch": 0.13381184311714944, + "grad_norm": 2.4367002776811315, + "learning_rate": 1e-06, + "loss": 0.4342, + "step": 2088 + }, + { + "epoch": 0.13387592924891054, + "grad_norm": 2.462742753214077, + "learning_rate": 1e-06, + "loss": 0.408, + "step": 2089 + }, + { + "epoch": 0.13394001538067163, + "grad_norm": 2.717385353303187, + "learning_rate": 1e-06, + "loss": 0.3728, + "step": 2090 + }, + { + "epoch": 0.13400410151243272, + "grad_norm": 2.5506542547042583, + "learning_rate": 1e-06, + "loss": 0.3751, + "step": 2091 + }, + { + "epoch": 0.1340681876441938, + "grad_norm": 2.48170579062675, + "learning_rate": 1e-06, + "loss": 0.3592, + "step": 2092 + }, + { + "epoch": 0.13413227377595488, + "grad_norm": 2.5466247727125646, + "learning_rate": 1e-06, + "loss": 0.3816, + "step": 2093 + }, + { + "epoch": 0.13419635990771597, + "grad_norm": 2.792497217280625, + "learning_rate": 1e-06, + "loss": 0.4217, + "step": 2094 + }, + { + "epoch": 0.13426044603947707, + "grad_norm": 2.8605300019220246, + "learning_rate": 1e-06, + "loss": 0.4964, + "step": 2095 + }, + { + "epoch": 0.13432453217123813, + "grad_norm": 2.5038991233158363, + "learning_rate": 1e-06, + "loss": 0.4279, + "step": 2096 + }, + { + "epoch": 0.13438861830299922, + "grad_norm": 2.868839229735299, + "learning_rate": 1e-06, + "loss": 0.4488, + "step": 2097 + }, + { + "epoch": 0.13445270443476032, + "grad_norm": 2.545144883407565, + "learning_rate": 1e-06, + "loss": 0.4652, + "step": 2098 + }, + { + "epoch": 0.1345167905665214, + "grad_norm": 2.5425577896010565, + "learning_rate": 1e-06, + "loss": 0.3585, + "step": 2099 + }, + { + "epoch": 0.1345808766982825, + "grad_norm": 2.6384130176202962, + "learning_rate": 1e-06, + "loss": 0.4988, + "step": 2100 + }, + { + "epoch": 0.13464496283004357, + "grad_norm": 2.6027499158121215, + "learning_rate": 1e-06, + "loss": 0.4604, + "step": 2101 + }, + { + "epoch": 0.13470904896180466, + "grad_norm": 2.6048158509628614, + "learning_rate": 1e-06, + "loss": 0.3841, + "step": 2102 + }, + { + "epoch": 0.13477313509356575, + "grad_norm": 2.8293034564567248, + "learning_rate": 1e-06, + "loss": 0.408, + "step": 2103 + }, + { + "epoch": 0.13483722122532685, + "grad_norm": 2.592521713422449, + "learning_rate": 1e-06, + "loss": 0.4517, + "step": 2104 + }, + { + "epoch": 0.1349013073570879, + "grad_norm": 2.6693886764853407, + "learning_rate": 1e-06, + "loss": 0.3943, + "step": 2105 + }, + { + "epoch": 0.134965393488849, + "grad_norm": 2.504808703738117, + "learning_rate": 1e-06, + "loss": 0.3935, + "step": 2106 + }, + { + "epoch": 0.1350294796206101, + "grad_norm": 2.907032906917195, + "learning_rate": 1e-06, + "loss": 0.4508, + "step": 2107 + }, + { + "epoch": 0.1350935657523712, + "grad_norm": 2.679610851400339, + "learning_rate": 1e-06, + "loss": 0.3949, + "step": 2108 + }, + { + "epoch": 0.13515765188413228, + "grad_norm": 2.748289961872713, + "learning_rate": 1e-06, + "loss": 0.4391, + "step": 2109 + }, + { + "epoch": 0.13522173801589335, + "grad_norm": 2.7234870239625493, + "learning_rate": 1e-06, + "loss": 0.4582, + "step": 2110 + }, + { + "epoch": 0.13528582414765444, + "grad_norm": 2.7140754867383023, + "learning_rate": 1e-06, + "loss": 0.4414, + "step": 2111 + }, + { + "epoch": 0.13534991027941554, + "grad_norm": 2.6496771127358527, + "learning_rate": 1e-06, + "loss": 0.4467, + "step": 2112 + }, + { + "epoch": 0.13541399641117663, + "grad_norm": 2.730252973494939, + "learning_rate": 1e-06, + "loss": 0.4407, + "step": 2113 + }, + { + "epoch": 0.1354780825429377, + "grad_norm": 2.655324313984564, + "learning_rate": 1e-06, + "loss": 0.4708, + "step": 2114 + }, + { + "epoch": 0.1355421686746988, + "grad_norm": 2.8728419806844454, + "learning_rate": 1e-06, + "loss": 0.473, + "step": 2115 + }, + { + "epoch": 0.13560625480645988, + "grad_norm": 2.5073844860924184, + "learning_rate": 1e-06, + "loss": 0.3807, + "step": 2116 + }, + { + "epoch": 0.13567034093822097, + "grad_norm": 2.60753256323996, + "learning_rate": 1e-06, + "loss": 0.4838, + "step": 2117 + }, + { + "epoch": 0.13573442706998207, + "grad_norm": 2.7082299427616907, + "learning_rate": 1e-06, + "loss": 0.3978, + "step": 2118 + }, + { + "epoch": 0.13579851320174313, + "grad_norm": 2.4118401233619253, + "learning_rate": 1e-06, + "loss": 0.4567, + "step": 2119 + }, + { + "epoch": 0.13586259933350422, + "grad_norm": 2.4643325105841503, + "learning_rate": 1e-06, + "loss": 0.4006, + "step": 2120 + }, + { + "epoch": 0.13592668546526532, + "grad_norm": 2.5275100662919554, + "learning_rate": 1e-06, + "loss": 0.4766, + "step": 2121 + }, + { + "epoch": 0.1359907715970264, + "grad_norm": 2.655859438369582, + "learning_rate": 1e-06, + "loss": 0.4157, + "step": 2122 + }, + { + "epoch": 0.1360548577287875, + "grad_norm": 2.6427058505139187, + "learning_rate": 1e-06, + "loss": 0.3924, + "step": 2123 + }, + { + "epoch": 0.13611894386054857, + "grad_norm": 2.6632577787547, + "learning_rate": 1e-06, + "loss": 0.4052, + "step": 2124 + }, + { + "epoch": 0.13618302999230966, + "grad_norm": 2.6287929102425003, + "learning_rate": 1e-06, + "loss": 0.3918, + "step": 2125 + }, + { + "epoch": 0.13624711612407076, + "grad_norm": 2.8016371088295475, + "learning_rate": 1e-06, + "loss": 0.4356, + "step": 2126 + }, + { + "epoch": 0.13631120225583185, + "grad_norm": 3.3386006555368057, + "learning_rate": 1e-06, + "loss": 0.3933, + "step": 2127 + }, + { + "epoch": 0.1363752883875929, + "grad_norm": 2.7097965814412226, + "learning_rate": 1e-06, + "loss": 0.4021, + "step": 2128 + }, + { + "epoch": 0.136439374519354, + "grad_norm": 2.7644699066410983, + "learning_rate": 1e-06, + "loss": 0.4464, + "step": 2129 + }, + { + "epoch": 0.1365034606511151, + "grad_norm": 2.4771892454096793, + "learning_rate": 1e-06, + "loss": 0.3746, + "step": 2130 + }, + { + "epoch": 0.1365675467828762, + "grad_norm": 2.760564997369234, + "learning_rate": 1e-06, + "loss": 0.3566, + "step": 2131 + }, + { + "epoch": 0.13663163291463729, + "grad_norm": 2.661760302416408, + "learning_rate": 1e-06, + "loss": 0.5215, + "step": 2132 + }, + { + "epoch": 0.13669571904639835, + "grad_norm": 2.623671240870864, + "learning_rate": 1e-06, + "loss": 0.4042, + "step": 2133 + }, + { + "epoch": 0.13675980517815944, + "grad_norm": 2.9925947621163673, + "learning_rate": 1e-06, + "loss": 0.4295, + "step": 2134 + }, + { + "epoch": 0.13682389130992054, + "grad_norm": 2.7417367216554207, + "learning_rate": 1e-06, + "loss": 0.4468, + "step": 2135 + }, + { + "epoch": 0.13688797744168163, + "grad_norm": 2.669990654570271, + "learning_rate": 1e-06, + "loss": 0.4087, + "step": 2136 + }, + { + "epoch": 0.1369520635734427, + "grad_norm": 2.8533638654559805, + "learning_rate": 1e-06, + "loss": 0.4808, + "step": 2137 + }, + { + "epoch": 0.1370161497052038, + "grad_norm": 2.7643977807163793, + "learning_rate": 1e-06, + "loss": 0.3754, + "step": 2138 + }, + { + "epoch": 0.13708023583696488, + "grad_norm": 2.536365532104418, + "learning_rate": 1e-06, + "loss": 0.3998, + "step": 2139 + }, + { + "epoch": 0.13714432196872597, + "grad_norm": 2.637414739617339, + "learning_rate": 1e-06, + "loss": 0.4129, + "step": 2140 + }, + { + "epoch": 0.13720840810048707, + "grad_norm": 2.5290361793526963, + "learning_rate": 1e-06, + "loss": 0.4058, + "step": 2141 + }, + { + "epoch": 0.13727249423224813, + "grad_norm": 2.600362258487816, + "learning_rate": 1e-06, + "loss": 0.3776, + "step": 2142 + }, + { + "epoch": 0.13733658036400923, + "grad_norm": 2.5573978268464623, + "learning_rate": 1e-06, + "loss": 0.3914, + "step": 2143 + }, + { + "epoch": 0.13740066649577032, + "grad_norm": 2.680943994912968, + "learning_rate": 1e-06, + "loss": 0.4214, + "step": 2144 + }, + { + "epoch": 0.1374647526275314, + "grad_norm": 2.6925962306306146, + "learning_rate": 1e-06, + "loss": 0.4573, + "step": 2145 + }, + { + "epoch": 0.13752883875929248, + "grad_norm": 2.689427306845986, + "learning_rate": 1e-06, + "loss": 0.4184, + "step": 2146 + }, + { + "epoch": 0.13759292489105357, + "grad_norm": 2.7293189543709766, + "learning_rate": 1e-06, + "loss": 0.3834, + "step": 2147 + }, + { + "epoch": 0.13765701102281466, + "grad_norm": 2.8777615193581676, + "learning_rate": 1e-06, + "loss": 0.4442, + "step": 2148 + }, + { + "epoch": 0.13772109715457576, + "grad_norm": 2.6444638826899634, + "learning_rate": 1e-06, + "loss": 0.4287, + "step": 2149 + }, + { + "epoch": 0.13778518328633685, + "grad_norm": 2.7759250640439217, + "learning_rate": 1e-06, + "loss": 0.3773, + "step": 2150 + }, + { + "epoch": 0.13784926941809791, + "grad_norm": 2.616339376376333, + "learning_rate": 1e-06, + "loss": 0.3957, + "step": 2151 + }, + { + "epoch": 0.137913355549859, + "grad_norm": 2.538884599658828, + "learning_rate": 1e-06, + "loss": 0.4364, + "step": 2152 + }, + { + "epoch": 0.1379774416816201, + "grad_norm": 2.75943965508161, + "learning_rate": 1e-06, + "loss": 0.404, + "step": 2153 + }, + { + "epoch": 0.1380415278133812, + "grad_norm": 2.5818273891275867, + "learning_rate": 1e-06, + "loss": 0.455, + "step": 2154 + }, + { + "epoch": 0.13810561394514226, + "grad_norm": 2.9759449959751176, + "learning_rate": 1e-06, + "loss": 0.4126, + "step": 2155 + }, + { + "epoch": 0.13816970007690335, + "grad_norm": 2.595854159746892, + "learning_rate": 1e-06, + "loss": 0.4279, + "step": 2156 + }, + { + "epoch": 0.13823378620866444, + "grad_norm": 2.970107649172304, + "learning_rate": 1e-06, + "loss": 0.3991, + "step": 2157 + }, + { + "epoch": 0.13829787234042554, + "grad_norm": 2.5620174383001, + "learning_rate": 1e-06, + "loss": 0.3853, + "step": 2158 + }, + { + "epoch": 0.13836195847218663, + "grad_norm": 2.682996794741283, + "learning_rate": 1e-06, + "loss": 0.3926, + "step": 2159 + }, + { + "epoch": 0.1384260446039477, + "grad_norm": 2.640623080908026, + "learning_rate": 1e-06, + "loss": 0.4437, + "step": 2160 + }, + { + "epoch": 0.1384901307357088, + "grad_norm": 2.562450172958601, + "learning_rate": 1e-06, + "loss": 0.4001, + "step": 2161 + }, + { + "epoch": 0.13855421686746988, + "grad_norm": 2.625741101049455, + "learning_rate": 1e-06, + "loss": 0.4063, + "step": 2162 + }, + { + "epoch": 0.13861830299923097, + "grad_norm": 2.926628566586209, + "learning_rate": 1e-06, + "loss": 0.4099, + "step": 2163 + }, + { + "epoch": 0.13868238913099204, + "grad_norm": 2.532631132337974, + "learning_rate": 1e-06, + "loss": 0.4167, + "step": 2164 + }, + { + "epoch": 0.13874647526275313, + "grad_norm": 2.41138765437668, + "learning_rate": 1e-06, + "loss": 0.3966, + "step": 2165 + }, + { + "epoch": 0.13881056139451423, + "grad_norm": 2.7163462717552247, + "learning_rate": 1e-06, + "loss": 0.4605, + "step": 2166 + }, + { + "epoch": 0.13887464752627532, + "grad_norm": 2.7349179779173682, + "learning_rate": 1e-06, + "loss": 0.4129, + "step": 2167 + }, + { + "epoch": 0.1389387336580364, + "grad_norm": 2.705844709704702, + "learning_rate": 1e-06, + "loss": 0.3875, + "step": 2168 + }, + { + "epoch": 0.13900281978979748, + "grad_norm": 3.077466061475745, + "learning_rate": 1e-06, + "loss": 0.461, + "step": 2169 + }, + { + "epoch": 0.13906690592155857, + "grad_norm": 2.7230463846895163, + "learning_rate": 1e-06, + "loss": 0.4436, + "step": 2170 + }, + { + "epoch": 0.13913099205331966, + "grad_norm": 2.8623601827737355, + "learning_rate": 1e-06, + "loss": 0.4651, + "step": 2171 + }, + { + "epoch": 0.13919507818508076, + "grad_norm": 2.681961203424956, + "learning_rate": 1e-06, + "loss": 0.3852, + "step": 2172 + }, + { + "epoch": 0.13925916431684185, + "grad_norm": 2.646391464509139, + "learning_rate": 1e-06, + "loss": 0.3898, + "step": 2173 + }, + { + "epoch": 0.13932325044860291, + "grad_norm": 2.7593356672279805, + "learning_rate": 1e-06, + "loss": 0.3854, + "step": 2174 + }, + { + "epoch": 0.139387336580364, + "grad_norm": 2.742273416019991, + "learning_rate": 1e-06, + "loss": 0.4334, + "step": 2175 + }, + { + "epoch": 0.1394514227121251, + "grad_norm": 2.609949808118975, + "learning_rate": 1e-06, + "loss": 0.3883, + "step": 2176 + }, + { + "epoch": 0.1395155088438862, + "grad_norm": 2.688282306305331, + "learning_rate": 1e-06, + "loss": 0.3954, + "step": 2177 + }, + { + "epoch": 0.13957959497564726, + "grad_norm": 2.664368149793294, + "learning_rate": 1e-06, + "loss": 0.3988, + "step": 2178 + }, + { + "epoch": 0.13964368110740835, + "grad_norm": 2.5754163962200782, + "learning_rate": 1e-06, + "loss": 0.3572, + "step": 2179 + }, + { + "epoch": 0.13970776723916944, + "grad_norm": 2.7535770236048647, + "learning_rate": 1e-06, + "loss": 0.4486, + "step": 2180 + }, + { + "epoch": 0.13977185337093054, + "grad_norm": 2.5586750005179812, + "learning_rate": 1e-06, + "loss": 0.4398, + "step": 2181 + }, + { + "epoch": 0.13983593950269163, + "grad_norm": 2.5801312411637634, + "learning_rate": 1e-06, + "loss": 0.4103, + "step": 2182 + }, + { + "epoch": 0.1399000256344527, + "grad_norm": 2.6418380770398233, + "learning_rate": 1e-06, + "loss": 0.384, + "step": 2183 + }, + { + "epoch": 0.1399641117662138, + "grad_norm": 2.8406851039525205, + "learning_rate": 1e-06, + "loss": 0.4637, + "step": 2184 + }, + { + "epoch": 0.14002819789797488, + "grad_norm": 2.6629292430069524, + "learning_rate": 1e-06, + "loss": 0.4501, + "step": 2185 + }, + { + "epoch": 0.14009228402973598, + "grad_norm": 2.654798177824672, + "learning_rate": 1e-06, + "loss": 0.4054, + "step": 2186 + }, + { + "epoch": 0.14015637016149704, + "grad_norm": 2.9168291610332195, + "learning_rate": 1e-06, + "loss": 0.4182, + "step": 2187 + }, + { + "epoch": 0.14022045629325813, + "grad_norm": 2.600267663410967, + "learning_rate": 1e-06, + "loss": 0.4414, + "step": 2188 + }, + { + "epoch": 0.14028454242501923, + "grad_norm": 2.5417581238364906, + "learning_rate": 1e-06, + "loss": 0.4318, + "step": 2189 + }, + { + "epoch": 0.14034862855678032, + "grad_norm": 2.454596888462331, + "learning_rate": 1e-06, + "loss": 0.3628, + "step": 2190 + }, + { + "epoch": 0.1404127146885414, + "grad_norm": 2.8569961486821867, + "learning_rate": 1e-06, + "loss": 0.4434, + "step": 2191 + }, + { + "epoch": 0.14047680082030248, + "grad_norm": 2.8200076888696706, + "learning_rate": 1e-06, + "loss": 0.3914, + "step": 2192 + }, + { + "epoch": 0.14054088695206357, + "grad_norm": 2.7538541707394315, + "learning_rate": 1e-06, + "loss": 0.3932, + "step": 2193 + }, + { + "epoch": 0.14060497308382466, + "grad_norm": 2.756896816994633, + "learning_rate": 1e-06, + "loss": 0.3879, + "step": 2194 + }, + { + "epoch": 0.14066905921558576, + "grad_norm": 2.387845561333487, + "learning_rate": 1e-06, + "loss": 0.418, + "step": 2195 + }, + { + "epoch": 0.14073314534734682, + "grad_norm": 2.7006071057783876, + "learning_rate": 1e-06, + "loss": 0.4469, + "step": 2196 + }, + { + "epoch": 0.14079723147910791, + "grad_norm": 2.6467935141543086, + "learning_rate": 1e-06, + "loss": 0.4038, + "step": 2197 + }, + { + "epoch": 0.140861317610869, + "grad_norm": 2.6198363896498282, + "learning_rate": 1e-06, + "loss": 0.425, + "step": 2198 + }, + { + "epoch": 0.1409254037426301, + "grad_norm": 2.5636411622366264, + "learning_rate": 1e-06, + "loss": 0.4789, + "step": 2199 + }, + { + "epoch": 0.1409894898743912, + "grad_norm": 2.4631479300062114, + "learning_rate": 1e-06, + "loss": 0.4166, + "step": 2200 + }, + { + "epoch": 0.14105357600615226, + "grad_norm": 2.691147262921431, + "learning_rate": 1e-06, + "loss": 0.4125, + "step": 2201 + }, + { + "epoch": 0.14111766213791335, + "grad_norm": 2.653751571043374, + "learning_rate": 1e-06, + "loss": 0.4041, + "step": 2202 + }, + { + "epoch": 0.14118174826967445, + "grad_norm": 2.457050148938937, + "learning_rate": 1e-06, + "loss": 0.3652, + "step": 2203 + }, + { + "epoch": 0.14124583440143554, + "grad_norm": 2.3973561946298103, + "learning_rate": 1e-06, + "loss": 0.3735, + "step": 2204 + }, + { + "epoch": 0.1413099205331966, + "grad_norm": 2.7407883027389155, + "learning_rate": 1e-06, + "loss": 0.4125, + "step": 2205 + }, + { + "epoch": 0.1413740066649577, + "grad_norm": 2.6688132463232317, + "learning_rate": 1e-06, + "loss": 0.4421, + "step": 2206 + }, + { + "epoch": 0.1414380927967188, + "grad_norm": 2.643644919200164, + "learning_rate": 1e-06, + "loss": 0.3683, + "step": 2207 + }, + { + "epoch": 0.14150217892847988, + "grad_norm": 2.65080615124718, + "learning_rate": 1e-06, + "loss": 0.4559, + "step": 2208 + }, + { + "epoch": 0.14156626506024098, + "grad_norm": 2.776708320652998, + "learning_rate": 1e-06, + "loss": 0.4148, + "step": 2209 + }, + { + "epoch": 0.14163035119200204, + "grad_norm": 2.7020700688413513, + "learning_rate": 1e-06, + "loss": 0.4608, + "step": 2210 + }, + { + "epoch": 0.14169443732376313, + "grad_norm": 2.4552041930115904, + "learning_rate": 1e-06, + "loss": 0.391, + "step": 2211 + }, + { + "epoch": 0.14175852345552423, + "grad_norm": 2.5767945349775854, + "learning_rate": 1e-06, + "loss": 0.401, + "step": 2212 + }, + { + "epoch": 0.14182260958728532, + "grad_norm": 2.6559174118536046, + "learning_rate": 1e-06, + "loss": 0.4242, + "step": 2213 + }, + { + "epoch": 0.14188669571904639, + "grad_norm": 2.7023876372387985, + "learning_rate": 1e-06, + "loss": 0.4348, + "step": 2214 + }, + { + "epoch": 0.14195078185080748, + "grad_norm": 2.610694095494199, + "learning_rate": 1e-06, + "loss": 0.4084, + "step": 2215 + }, + { + "epoch": 0.14201486798256857, + "grad_norm": 2.6285159451990623, + "learning_rate": 1e-06, + "loss": 0.4375, + "step": 2216 + }, + { + "epoch": 0.14207895411432966, + "grad_norm": 2.640627442850314, + "learning_rate": 1e-06, + "loss": 0.4541, + "step": 2217 + }, + { + "epoch": 0.14214304024609076, + "grad_norm": 2.6134124413492756, + "learning_rate": 1e-06, + "loss": 0.4073, + "step": 2218 + }, + { + "epoch": 0.14220712637785182, + "grad_norm": 2.6094590815898457, + "learning_rate": 1e-06, + "loss": 0.4346, + "step": 2219 + }, + { + "epoch": 0.14227121250961292, + "grad_norm": 2.6490584464866247, + "learning_rate": 1e-06, + "loss": 0.4433, + "step": 2220 + }, + { + "epoch": 0.142335298641374, + "grad_norm": 2.5338550549194956, + "learning_rate": 1e-06, + "loss": 0.4395, + "step": 2221 + }, + { + "epoch": 0.1423993847731351, + "grad_norm": 2.6873280903762553, + "learning_rate": 1e-06, + "loss": 0.3746, + "step": 2222 + }, + { + "epoch": 0.14246347090489617, + "grad_norm": 2.882287571490444, + "learning_rate": 1e-06, + "loss": 0.4053, + "step": 2223 + }, + { + "epoch": 0.14252755703665726, + "grad_norm": 2.6621684855736145, + "learning_rate": 1e-06, + "loss": 0.4098, + "step": 2224 + }, + { + "epoch": 0.14259164316841835, + "grad_norm": 2.5399516145610406, + "learning_rate": 1e-06, + "loss": 0.4198, + "step": 2225 + }, + { + "epoch": 0.14265572930017945, + "grad_norm": 5.304960869663389, + "learning_rate": 1e-06, + "loss": 0.4114, + "step": 2226 + }, + { + "epoch": 0.14271981543194054, + "grad_norm": 2.6465131106356563, + "learning_rate": 1e-06, + "loss": 0.4118, + "step": 2227 + }, + { + "epoch": 0.1427839015637016, + "grad_norm": 2.823785893975285, + "learning_rate": 1e-06, + "loss": 0.35, + "step": 2228 + }, + { + "epoch": 0.1428479876954627, + "grad_norm": 2.6252988135275728, + "learning_rate": 1e-06, + "loss": 0.4251, + "step": 2229 + }, + { + "epoch": 0.1429120738272238, + "grad_norm": 2.5715196655427395, + "learning_rate": 1e-06, + "loss": 0.4214, + "step": 2230 + }, + { + "epoch": 0.14297615995898488, + "grad_norm": 2.5374478270268215, + "learning_rate": 1e-06, + "loss": 0.392, + "step": 2231 + }, + { + "epoch": 0.14304024609074598, + "grad_norm": 2.693764343659205, + "learning_rate": 1e-06, + "loss": 0.4185, + "step": 2232 + }, + { + "epoch": 0.14310433222250704, + "grad_norm": 2.5538527593794145, + "learning_rate": 1e-06, + "loss": 0.4159, + "step": 2233 + }, + { + "epoch": 0.14316841835426813, + "grad_norm": 2.643995614450662, + "learning_rate": 1e-06, + "loss": 0.4139, + "step": 2234 + }, + { + "epoch": 0.14323250448602923, + "grad_norm": 2.525937144073715, + "learning_rate": 1e-06, + "loss": 0.4341, + "step": 2235 + }, + { + "epoch": 0.14329659061779032, + "grad_norm": 2.451146215447884, + "learning_rate": 1e-06, + "loss": 0.4535, + "step": 2236 + }, + { + "epoch": 0.14336067674955139, + "grad_norm": 2.9775043321883623, + "learning_rate": 1e-06, + "loss": 0.4192, + "step": 2237 + }, + { + "epoch": 0.14342476288131248, + "grad_norm": 2.6372943700404314, + "learning_rate": 1e-06, + "loss": 0.3781, + "step": 2238 + }, + { + "epoch": 0.14348884901307357, + "grad_norm": 2.681809857926576, + "learning_rate": 1e-06, + "loss": 0.3501, + "step": 2239 + }, + { + "epoch": 0.14355293514483466, + "grad_norm": 2.6330684169629888, + "learning_rate": 1e-06, + "loss": 0.4125, + "step": 2240 + }, + { + "epoch": 0.14361702127659576, + "grad_norm": 2.5635021280630146, + "learning_rate": 1e-06, + "loss": 0.4494, + "step": 2241 + }, + { + "epoch": 0.14368110740835682, + "grad_norm": 2.5420088820932087, + "learning_rate": 1e-06, + "loss": 0.4034, + "step": 2242 + }, + { + "epoch": 0.14374519354011792, + "grad_norm": 2.5737910098068064, + "learning_rate": 1e-06, + "loss": 0.3646, + "step": 2243 + }, + { + "epoch": 0.143809279671879, + "grad_norm": 2.824372043653846, + "learning_rate": 1e-06, + "loss": 0.4076, + "step": 2244 + }, + { + "epoch": 0.1438733658036401, + "grad_norm": 2.4381851437931386, + "learning_rate": 1e-06, + "loss": 0.377, + "step": 2245 + }, + { + "epoch": 0.14393745193540117, + "grad_norm": 2.508371697735693, + "learning_rate": 1e-06, + "loss": 0.4176, + "step": 2246 + }, + { + "epoch": 0.14400153806716226, + "grad_norm": 2.720177214825913, + "learning_rate": 1e-06, + "loss": 0.4186, + "step": 2247 + }, + { + "epoch": 0.14406562419892335, + "grad_norm": 2.6059820662859936, + "learning_rate": 1e-06, + "loss": 0.4461, + "step": 2248 + }, + { + "epoch": 0.14412971033068445, + "grad_norm": 2.6734684323623865, + "learning_rate": 1e-06, + "loss": 0.4172, + "step": 2249 + }, + { + "epoch": 0.14419379646244554, + "grad_norm": 2.71227587875518, + "learning_rate": 1e-06, + "loss": 0.3907, + "step": 2250 + }, + { + "epoch": 0.1442578825942066, + "grad_norm": 2.63605657042964, + "learning_rate": 1e-06, + "loss": 0.4382, + "step": 2251 + }, + { + "epoch": 0.1443219687259677, + "grad_norm": 2.596026537683679, + "learning_rate": 1e-06, + "loss": 0.4052, + "step": 2252 + }, + { + "epoch": 0.1443860548577288, + "grad_norm": 2.6134149718188047, + "learning_rate": 1e-06, + "loss": 0.4782, + "step": 2253 + }, + { + "epoch": 0.14445014098948988, + "grad_norm": 2.692257498204221, + "learning_rate": 1e-06, + "loss": 0.4078, + "step": 2254 + }, + { + "epoch": 0.14451422712125095, + "grad_norm": 2.4847848114936006, + "learning_rate": 1e-06, + "loss": 0.3496, + "step": 2255 + }, + { + "epoch": 0.14457831325301204, + "grad_norm": 2.642340400044151, + "learning_rate": 1e-06, + "loss": 0.3692, + "step": 2256 + }, + { + "epoch": 0.14464239938477313, + "grad_norm": 2.8905906794526293, + "learning_rate": 1e-06, + "loss": 0.4426, + "step": 2257 + }, + { + "epoch": 0.14470648551653423, + "grad_norm": 2.6608506377012247, + "learning_rate": 1e-06, + "loss": 0.408, + "step": 2258 + }, + { + "epoch": 0.14477057164829532, + "grad_norm": 2.6869403811241064, + "learning_rate": 1e-06, + "loss": 0.4095, + "step": 2259 + }, + { + "epoch": 0.1448346577800564, + "grad_norm": 2.512352252639373, + "learning_rate": 1e-06, + "loss": 0.3989, + "step": 2260 + }, + { + "epoch": 0.14489874391181748, + "grad_norm": 2.622494628319042, + "learning_rate": 1e-06, + "loss": 0.4209, + "step": 2261 + }, + { + "epoch": 0.14496283004357857, + "grad_norm": 2.7770276907918974, + "learning_rate": 1e-06, + "loss": 0.4578, + "step": 2262 + }, + { + "epoch": 0.14502691617533967, + "grad_norm": 2.666828543397899, + "learning_rate": 1e-06, + "loss": 0.3942, + "step": 2263 + }, + { + "epoch": 0.14509100230710073, + "grad_norm": 2.765447332478381, + "learning_rate": 1e-06, + "loss": 0.4545, + "step": 2264 + }, + { + "epoch": 0.14515508843886182, + "grad_norm": 2.4732133303198967, + "learning_rate": 1e-06, + "loss": 0.4159, + "step": 2265 + }, + { + "epoch": 0.14521917457062292, + "grad_norm": 2.715075164848874, + "learning_rate": 1e-06, + "loss": 0.4377, + "step": 2266 + }, + { + "epoch": 0.145283260702384, + "grad_norm": 2.647220900812757, + "learning_rate": 1e-06, + "loss": 0.414, + "step": 2267 + }, + { + "epoch": 0.1453473468341451, + "grad_norm": 2.799960861199678, + "learning_rate": 1e-06, + "loss": 0.3925, + "step": 2268 + }, + { + "epoch": 0.14541143296590617, + "grad_norm": 2.4458853797244573, + "learning_rate": 1e-06, + "loss": 0.4423, + "step": 2269 + }, + { + "epoch": 0.14547551909766726, + "grad_norm": 2.6759906456774503, + "learning_rate": 1e-06, + "loss": 0.4692, + "step": 2270 + }, + { + "epoch": 0.14553960522942835, + "grad_norm": 2.570308914424076, + "learning_rate": 1e-06, + "loss": 0.3719, + "step": 2271 + }, + { + "epoch": 0.14560369136118945, + "grad_norm": 2.4318165071673215, + "learning_rate": 1e-06, + "loss": 0.3948, + "step": 2272 + }, + { + "epoch": 0.1456677774929505, + "grad_norm": 2.747814037626672, + "learning_rate": 1e-06, + "loss": 0.4243, + "step": 2273 + }, + { + "epoch": 0.1457318636247116, + "grad_norm": 2.6257289323903743, + "learning_rate": 1e-06, + "loss": 0.4053, + "step": 2274 + }, + { + "epoch": 0.1457959497564727, + "grad_norm": 2.4752797780084985, + "learning_rate": 1e-06, + "loss": 0.4473, + "step": 2275 + }, + { + "epoch": 0.1458600358882338, + "grad_norm": 2.3987218490349473, + "learning_rate": 1e-06, + "loss": 0.3736, + "step": 2276 + }, + { + "epoch": 0.14592412201999488, + "grad_norm": 2.8867281314680997, + "learning_rate": 1e-06, + "loss": 0.3696, + "step": 2277 + }, + { + "epoch": 0.14598820815175595, + "grad_norm": 2.7716528014364146, + "learning_rate": 1e-06, + "loss": 0.3906, + "step": 2278 + }, + { + "epoch": 0.14605229428351704, + "grad_norm": 2.8974582112907363, + "learning_rate": 1e-06, + "loss": 0.421, + "step": 2279 + }, + { + "epoch": 0.14611638041527814, + "grad_norm": 2.880131925690081, + "learning_rate": 1e-06, + "loss": 0.4175, + "step": 2280 + }, + { + "epoch": 0.14618046654703923, + "grad_norm": 2.8585496372328723, + "learning_rate": 1e-06, + "loss": 0.3892, + "step": 2281 + }, + { + "epoch": 0.14624455267880032, + "grad_norm": 2.4339756990712744, + "learning_rate": 1e-06, + "loss": 0.4184, + "step": 2282 + }, + { + "epoch": 0.1463086388105614, + "grad_norm": 2.5992029115648316, + "learning_rate": 1e-06, + "loss": 0.3717, + "step": 2283 + }, + { + "epoch": 0.14637272494232248, + "grad_norm": 2.797688498581479, + "learning_rate": 1e-06, + "loss": 0.4105, + "step": 2284 + }, + { + "epoch": 0.14643681107408357, + "grad_norm": 2.4833588908902557, + "learning_rate": 1e-06, + "loss": 0.3732, + "step": 2285 + }, + { + "epoch": 0.14650089720584467, + "grad_norm": 2.58446876123646, + "learning_rate": 1e-06, + "loss": 0.3839, + "step": 2286 + }, + { + "epoch": 0.14656498333760573, + "grad_norm": 2.606881969638494, + "learning_rate": 1e-06, + "loss": 0.4249, + "step": 2287 + }, + { + "epoch": 0.14662906946936682, + "grad_norm": 2.6604866091293475, + "learning_rate": 1e-06, + "loss": 0.4077, + "step": 2288 + }, + { + "epoch": 0.14669315560112792, + "grad_norm": 2.6989920957953246, + "learning_rate": 1e-06, + "loss": 0.4315, + "step": 2289 + }, + { + "epoch": 0.146757241732889, + "grad_norm": 2.725493235912062, + "learning_rate": 1e-06, + "loss": 0.3907, + "step": 2290 + }, + { + "epoch": 0.1468213278646501, + "grad_norm": 2.69132660874459, + "learning_rate": 1e-06, + "loss": 0.3964, + "step": 2291 + }, + { + "epoch": 0.14688541399641117, + "grad_norm": 2.610103572519288, + "learning_rate": 1e-06, + "loss": 0.4651, + "step": 2292 + }, + { + "epoch": 0.14694950012817226, + "grad_norm": 2.4991515605638455, + "learning_rate": 1e-06, + "loss": 0.4226, + "step": 2293 + }, + { + "epoch": 0.14701358625993335, + "grad_norm": 2.538974285721927, + "learning_rate": 1e-06, + "loss": 0.4055, + "step": 2294 + }, + { + "epoch": 0.14707767239169445, + "grad_norm": 2.8066736531056464, + "learning_rate": 1e-06, + "loss": 0.387, + "step": 2295 + }, + { + "epoch": 0.1471417585234555, + "grad_norm": 2.855173144029738, + "learning_rate": 1e-06, + "loss": 0.4252, + "step": 2296 + }, + { + "epoch": 0.1472058446552166, + "grad_norm": 2.8441153351557293, + "learning_rate": 1e-06, + "loss": 0.3698, + "step": 2297 + }, + { + "epoch": 0.1472699307869777, + "grad_norm": 2.6372999046635357, + "learning_rate": 1e-06, + "loss": 0.4374, + "step": 2298 + }, + { + "epoch": 0.1473340169187388, + "grad_norm": 2.561119349595809, + "learning_rate": 1e-06, + "loss": 0.4233, + "step": 2299 + }, + { + "epoch": 0.14739810305049988, + "grad_norm": 2.6508050362160485, + "learning_rate": 1e-06, + "loss": 0.4292, + "step": 2300 + }, + { + "epoch": 0.14746218918226095, + "grad_norm": 2.627610268613044, + "learning_rate": 1e-06, + "loss": 0.419, + "step": 2301 + }, + { + "epoch": 0.14752627531402204, + "grad_norm": 2.811581016043246, + "learning_rate": 1e-06, + "loss": 0.4672, + "step": 2302 + }, + { + "epoch": 0.14759036144578314, + "grad_norm": 2.8176789877873993, + "learning_rate": 1e-06, + "loss": 0.44, + "step": 2303 + }, + { + "epoch": 0.14765444757754423, + "grad_norm": 2.6136882447196848, + "learning_rate": 1e-06, + "loss": 0.4411, + "step": 2304 + }, + { + "epoch": 0.1477185337093053, + "grad_norm": 2.631618330649024, + "learning_rate": 1e-06, + "loss": 0.4069, + "step": 2305 + }, + { + "epoch": 0.1477826198410664, + "grad_norm": 2.8307020910722445, + "learning_rate": 1e-06, + "loss": 0.445, + "step": 2306 + }, + { + "epoch": 0.14784670597282748, + "grad_norm": 2.859794204166223, + "learning_rate": 1e-06, + "loss": 0.3921, + "step": 2307 + }, + { + "epoch": 0.14791079210458857, + "grad_norm": 3.029930250277341, + "learning_rate": 1e-06, + "loss": 0.3989, + "step": 2308 + }, + { + "epoch": 0.14797487823634967, + "grad_norm": 2.769615320491857, + "learning_rate": 1e-06, + "loss": 0.467, + "step": 2309 + }, + { + "epoch": 0.14803896436811073, + "grad_norm": 2.8163409163128414, + "learning_rate": 1e-06, + "loss": 0.4292, + "step": 2310 + }, + { + "epoch": 0.14810305049987182, + "grad_norm": 2.9181548672688438, + "learning_rate": 1e-06, + "loss": 0.398, + "step": 2311 + }, + { + "epoch": 0.14816713663163292, + "grad_norm": 2.515786063454079, + "learning_rate": 1e-06, + "loss": 0.396, + "step": 2312 + }, + { + "epoch": 0.148231222763394, + "grad_norm": 2.6501859655841136, + "learning_rate": 1e-06, + "loss": 0.3726, + "step": 2313 + }, + { + "epoch": 0.14829530889515508, + "grad_norm": 2.727336603534406, + "learning_rate": 1e-06, + "loss": 0.3959, + "step": 2314 + }, + { + "epoch": 0.14835939502691617, + "grad_norm": 2.7322829607818786, + "learning_rate": 1e-06, + "loss": 0.3931, + "step": 2315 + }, + { + "epoch": 0.14842348115867726, + "grad_norm": 2.4220749790430496, + "learning_rate": 1e-06, + "loss": 0.4277, + "step": 2316 + }, + { + "epoch": 0.14848756729043835, + "grad_norm": 2.6338981053065598, + "learning_rate": 1e-06, + "loss": 0.3565, + "step": 2317 + }, + { + "epoch": 0.14855165342219945, + "grad_norm": 2.773079409430958, + "learning_rate": 1e-06, + "loss": 0.4168, + "step": 2318 + }, + { + "epoch": 0.1486157395539605, + "grad_norm": 2.5455067815807704, + "learning_rate": 1e-06, + "loss": 0.4192, + "step": 2319 + }, + { + "epoch": 0.1486798256857216, + "grad_norm": 2.5151484665388555, + "learning_rate": 1e-06, + "loss": 0.3734, + "step": 2320 + }, + { + "epoch": 0.1487439118174827, + "grad_norm": 2.6818970061144043, + "learning_rate": 1e-06, + "loss": 0.4192, + "step": 2321 + }, + { + "epoch": 0.1488079979492438, + "grad_norm": 2.831179594461404, + "learning_rate": 1e-06, + "loss": 0.4075, + "step": 2322 + }, + { + "epoch": 0.14887208408100486, + "grad_norm": 2.8459896863335987, + "learning_rate": 1e-06, + "loss": 0.4173, + "step": 2323 + }, + { + "epoch": 0.14893617021276595, + "grad_norm": 2.393045917617131, + "learning_rate": 1e-06, + "loss": 0.3683, + "step": 2324 + }, + { + "epoch": 0.14900025634452704, + "grad_norm": 2.592468826362506, + "learning_rate": 1e-06, + "loss": 0.448, + "step": 2325 + }, + { + "epoch": 0.14906434247628814, + "grad_norm": 2.5695432484816036, + "learning_rate": 1e-06, + "loss": 0.4202, + "step": 2326 + }, + { + "epoch": 0.14912842860804923, + "grad_norm": 2.7467870217923966, + "learning_rate": 1e-06, + "loss": 0.3948, + "step": 2327 + }, + { + "epoch": 0.1491925147398103, + "grad_norm": 2.7219181762815765, + "learning_rate": 1e-06, + "loss": 0.3861, + "step": 2328 + }, + { + "epoch": 0.1492566008715714, + "grad_norm": 2.6606309439094575, + "learning_rate": 1e-06, + "loss": 0.3892, + "step": 2329 + }, + { + "epoch": 0.14932068700333248, + "grad_norm": 2.4477701980906903, + "learning_rate": 1e-06, + "loss": 0.3845, + "step": 2330 + }, + { + "epoch": 0.14938477313509357, + "grad_norm": 2.5710900386400555, + "learning_rate": 1e-06, + "loss": 0.4313, + "step": 2331 + }, + { + "epoch": 0.14944885926685464, + "grad_norm": 2.9202947763246847, + "learning_rate": 1e-06, + "loss": 0.4593, + "step": 2332 + }, + { + "epoch": 0.14951294539861573, + "grad_norm": 2.8278221339598812, + "learning_rate": 1e-06, + "loss": 0.4257, + "step": 2333 + }, + { + "epoch": 0.14957703153037682, + "grad_norm": 2.8006536711305854, + "learning_rate": 1e-06, + "loss": 0.3817, + "step": 2334 + }, + { + "epoch": 0.14964111766213792, + "grad_norm": 2.5025824047107026, + "learning_rate": 1e-06, + "loss": 0.4401, + "step": 2335 + }, + { + "epoch": 0.149705203793899, + "grad_norm": 2.647992832106811, + "learning_rate": 1e-06, + "loss": 0.3943, + "step": 2336 + }, + { + "epoch": 0.14976928992566008, + "grad_norm": 2.534047410212519, + "learning_rate": 1e-06, + "loss": 0.4123, + "step": 2337 + }, + { + "epoch": 0.14983337605742117, + "grad_norm": 2.544201360530356, + "learning_rate": 1e-06, + "loss": 0.4222, + "step": 2338 + }, + { + "epoch": 0.14989746218918226, + "grad_norm": 2.598334654859618, + "learning_rate": 1e-06, + "loss": 0.4778, + "step": 2339 + }, + { + "epoch": 0.14996154832094336, + "grad_norm": 2.6037963285409806, + "learning_rate": 1e-06, + "loss": 0.4313, + "step": 2340 + }, + { + "epoch": 0.15002563445270445, + "grad_norm": 2.5511618288184152, + "learning_rate": 1e-06, + "loss": 0.4308, + "step": 2341 + }, + { + "epoch": 0.1500897205844655, + "grad_norm": 2.596673832687979, + "learning_rate": 1e-06, + "loss": 0.392, + "step": 2342 + }, + { + "epoch": 0.1501538067162266, + "grad_norm": 2.5935959167733493, + "learning_rate": 1e-06, + "loss": 0.3799, + "step": 2343 + }, + { + "epoch": 0.1502178928479877, + "grad_norm": 2.8157616888072776, + "learning_rate": 1e-06, + "loss": 0.3875, + "step": 2344 + }, + { + "epoch": 0.1502819789797488, + "grad_norm": 2.7239095516119964, + "learning_rate": 1e-06, + "loss": 0.3579, + "step": 2345 + }, + { + "epoch": 0.15034606511150986, + "grad_norm": 2.8732628594396914, + "learning_rate": 1e-06, + "loss": 0.4426, + "step": 2346 + }, + { + "epoch": 0.15041015124327095, + "grad_norm": 2.6053261102088596, + "learning_rate": 1e-06, + "loss": 0.4361, + "step": 2347 + }, + { + "epoch": 0.15047423737503204, + "grad_norm": 2.7501538950837006, + "learning_rate": 1e-06, + "loss": 0.4225, + "step": 2348 + }, + { + "epoch": 0.15053832350679314, + "grad_norm": 2.6748933692579016, + "learning_rate": 1e-06, + "loss": 0.4139, + "step": 2349 + }, + { + "epoch": 0.15060240963855423, + "grad_norm": 2.5140175391129467, + "learning_rate": 1e-06, + "loss": 0.396, + "step": 2350 + }, + { + "epoch": 0.1506664957703153, + "grad_norm": 2.51220473120615, + "learning_rate": 1e-06, + "loss": 0.4299, + "step": 2351 + }, + { + "epoch": 0.1507305819020764, + "grad_norm": 2.5533477749431985, + "learning_rate": 1e-06, + "loss": 0.4048, + "step": 2352 + }, + { + "epoch": 0.15079466803383748, + "grad_norm": 2.7622224664376276, + "learning_rate": 1e-06, + "loss": 0.3982, + "step": 2353 + }, + { + "epoch": 0.15085875416559857, + "grad_norm": 2.745535809237263, + "learning_rate": 1e-06, + "loss": 0.4543, + "step": 2354 + }, + { + "epoch": 0.15092284029735964, + "grad_norm": 2.7300386583191885, + "learning_rate": 1e-06, + "loss": 0.4357, + "step": 2355 + }, + { + "epoch": 0.15098692642912073, + "grad_norm": 2.7017399198890613, + "learning_rate": 1e-06, + "loss": 0.3976, + "step": 2356 + }, + { + "epoch": 0.15105101256088183, + "grad_norm": 2.641428987366016, + "learning_rate": 1e-06, + "loss": 0.4643, + "step": 2357 + }, + { + "epoch": 0.15111509869264292, + "grad_norm": 2.6569814357613186, + "learning_rate": 1e-06, + "loss": 0.4498, + "step": 2358 + }, + { + "epoch": 0.151179184824404, + "grad_norm": 2.5401208849561416, + "learning_rate": 1e-06, + "loss": 0.4235, + "step": 2359 + }, + { + "epoch": 0.15124327095616508, + "grad_norm": 2.5054149135493136, + "learning_rate": 1e-06, + "loss": 0.4279, + "step": 2360 + }, + { + "epoch": 0.15130735708792617, + "grad_norm": 2.6694292857092905, + "learning_rate": 1e-06, + "loss": 0.4528, + "step": 2361 + }, + { + "epoch": 0.15137144321968726, + "grad_norm": 2.5114041686713646, + "learning_rate": 1e-06, + "loss": 0.4483, + "step": 2362 + }, + { + "epoch": 0.15143552935144836, + "grad_norm": 2.6480189800249803, + "learning_rate": 1e-06, + "loss": 0.4174, + "step": 2363 + }, + { + "epoch": 0.15149961548320942, + "grad_norm": 2.4534126730482573, + "learning_rate": 1e-06, + "loss": 0.3561, + "step": 2364 + }, + { + "epoch": 0.15156370161497051, + "grad_norm": 2.6671292785945866, + "learning_rate": 1e-06, + "loss": 0.4201, + "step": 2365 + }, + { + "epoch": 0.1516277877467316, + "grad_norm": 2.726065605093534, + "learning_rate": 1e-06, + "loss": 0.4679, + "step": 2366 + }, + { + "epoch": 0.1516918738784927, + "grad_norm": 2.545323534174155, + "learning_rate": 1e-06, + "loss": 0.401, + "step": 2367 + }, + { + "epoch": 0.1517559600102538, + "grad_norm": 2.791402105635689, + "learning_rate": 1e-06, + "loss": 0.3948, + "step": 2368 + }, + { + "epoch": 0.15182004614201486, + "grad_norm": 2.6468669705500565, + "learning_rate": 1e-06, + "loss": 0.4129, + "step": 2369 + }, + { + "epoch": 0.15188413227377595, + "grad_norm": 2.5476833598403528, + "learning_rate": 1e-06, + "loss": 0.4278, + "step": 2370 + }, + { + "epoch": 0.15194821840553704, + "grad_norm": 2.506687865478058, + "learning_rate": 1e-06, + "loss": 0.3836, + "step": 2371 + }, + { + "epoch": 0.15201230453729814, + "grad_norm": 2.750669796319302, + "learning_rate": 1e-06, + "loss": 0.4487, + "step": 2372 + }, + { + "epoch": 0.1520763906690592, + "grad_norm": 2.6545688375212264, + "learning_rate": 1e-06, + "loss": 0.4, + "step": 2373 + }, + { + "epoch": 0.1521404768008203, + "grad_norm": 2.5413046028139235, + "learning_rate": 1e-06, + "loss": 0.4104, + "step": 2374 + }, + { + "epoch": 0.1522045629325814, + "grad_norm": 2.554444200234787, + "learning_rate": 1e-06, + "loss": 0.395, + "step": 2375 + }, + { + "epoch": 0.15226864906434248, + "grad_norm": 2.579414741406172, + "learning_rate": 1e-06, + "loss": 0.3627, + "step": 2376 + }, + { + "epoch": 0.15233273519610357, + "grad_norm": 2.6671368075780446, + "learning_rate": 1e-06, + "loss": 0.4369, + "step": 2377 + }, + { + "epoch": 0.15239682132786464, + "grad_norm": 2.6810079593267346, + "learning_rate": 1e-06, + "loss": 0.4162, + "step": 2378 + }, + { + "epoch": 0.15246090745962573, + "grad_norm": 2.683487202560093, + "learning_rate": 1e-06, + "loss": 0.4557, + "step": 2379 + }, + { + "epoch": 0.15252499359138683, + "grad_norm": 2.589917499641482, + "learning_rate": 1e-06, + "loss": 0.3939, + "step": 2380 + }, + { + "epoch": 0.15258907972314792, + "grad_norm": 2.8205977099230477, + "learning_rate": 1e-06, + "loss": 0.4398, + "step": 2381 + }, + { + "epoch": 0.15265316585490898, + "grad_norm": 2.592663049668745, + "learning_rate": 1e-06, + "loss": 0.3787, + "step": 2382 + }, + { + "epoch": 0.15271725198667008, + "grad_norm": 2.5460240252707265, + "learning_rate": 1e-06, + "loss": 0.4179, + "step": 2383 + }, + { + "epoch": 0.15278133811843117, + "grad_norm": 2.5645214681159696, + "learning_rate": 1e-06, + "loss": 0.3777, + "step": 2384 + }, + { + "epoch": 0.15284542425019226, + "grad_norm": 2.7015014505643604, + "learning_rate": 1e-06, + "loss": 0.4084, + "step": 2385 + }, + { + "epoch": 0.15290951038195336, + "grad_norm": 2.6132337605579186, + "learning_rate": 1e-06, + "loss": 0.3922, + "step": 2386 + }, + { + "epoch": 0.15297359651371442, + "grad_norm": 2.565897987852246, + "learning_rate": 1e-06, + "loss": 0.4156, + "step": 2387 + }, + { + "epoch": 0.15303768264547551, + "grad_norm": 2.5809983715596645, + "learning_rate": 1e-06, + "loss": 0.3894, + "step": 2388 + }, + { + "epoch": 0.1531017687772366, + "grad_norm": 2.5560680323288376, + "learning_rate": 1e-06, + "loss": 0.3725, + "step": 2389 + }, + { + "epoch": 0.1531658549089977, + "grad_norm": 2.6934001340487073, + "learning_rate": 1e-06, + "loss": 0.4946, + "step": 2390 + }, + { + "epoch": 0.1532299410407588, + "grad_norm": 2.4350422689075937, + "learning_rate": 1e-06, + "loss": 0.3635, + "step": 2391 + }, + { + "epoch": 0.15329402717251986, + "grad_norm": 2.5978625164002436, + "learning_rate": 1e-06, + "loss": 0.3755, + "step": 2392 + }, + { + "epoch": 0.15335811330428095, + "grad_norm": 2.7060153807077065, + "learning_rate": 1e-06, + "loss": 0.3844, + "step": 2393 + }, + { + "epoch": 0.15342219943604204, + "grad_norm": 2.5977471758004533, + "learning_rate": 1e-06, + "loss": 0.398, + "step": 2394 + }, + { + "epoch": 0.15348628556780314, + "grad_norm": 2.8192146974315007, + "learning_rate": 1e-06, + "loss": 0.4224, + "step": 2395 + }, + { + "epoch": 0.1535503716995642, + "grad_norm": 2.8507772617205984, + "learning_rate": 1e-06, + "loss": 0.4593, + "step": 2396 + }, + { + "epoch": 0.1536144578313253, + "grad_norm": 2.757912230882947, + "learning_rate": 1e-06, + "loss": 0.376, + "step": 2397 + }, + { + "epoch": 0.1536785439630864, + "grad_norm": 2.914634273117128, + "learning_rate": 1e-06, + "loss": 0.3884, + "step": 2398 + }, + { + "epoch": 0.15374263009484748, + "grad_norm": 2.7028190039958164, + "learning_rate": 1e-06, + "loss": 0.429, + "step": 2399 + }, + { + "epoch": 0.15380671622660858, + "grad_norm": 2.805975220284593, + "learning_rate": 1e-06, + "loss": 0.4336, + "step": 2400 + }, + { + "epoch": 0.15387080235836964, + "grad_norm": 2.612350568348285, + "learning_rate": 1e-06, + "loss": 0.3692, + "step": 2401 + }, + { + "epoch": 0.15393488849013073, + "grad_norm": 2.6235487697848012, + "learning_rate": 1e-06, + "loss": 0.3941, + "step": 2402 + }, + { + "epoch": 0.15399897462189183, + "grad_norm": 2.5075163297958416, + "learning_rate": 1e-06, + "loss": 0.443, + "step": 2403 + }, + { + "epoch": 0.15406306075365292, + "grad_norm": 2.5941254002232106, + "learning_rate": 1e-06, + "loss": 0.4505, + "step": 2404 + }, + { + "epoch": 0.15412714688541398, + "grad_norm": 2.646111568843153, + "learning_rate": 1e-06, + "loss": 0.4259, + "step": 2405 + }, + { + "epoch": 0.15419123301717508, + "grad_norm": 2.627844708101551, + "learning_rate": 1e-06, + "loss": 0.4292, + "step": 2406 + }, + { + "epoch": 0.15425531914893617, + "grad_norm": 2.6155267950712315, + "learning_rate": 1e-06, + "loss": 0.3833, + "step": 2407 + }, + { + "epoch": 0.15431940528069726, + "grad_norm": 2.80650626799486, + "learning_rate": 1e-06, + "loss": 0.4245, + "step": 2408 + }, + { + "epoch": 0.15438349141245836, + "grad_norm": 2.61101795676931, + "learning_rate": 1e-06, + "loss": 0.359, + "step": 2409 + }, + { + "epoch": 0.15444757754421942, + "grad_norm": 2.707694683825272, + "learning_rate": 1e-06, + "loss": 0.4255, + "step": 2410 + }, + { + "epoch": 0.15451166367598052, + "grad_norm": 2.5831471926173624, + "learning_rate": 1e-06, + "loss": 0.4173, + "step": 2411 + }, + { + "epoch": 0.1545757498077416, + "grad_norm": 2.6242498641680334, + "learning_rate": 1e-06, + "loss": 0.448, + "step": 2412 + }, + { + "epoch": 0.1546398359395027, + "grad_norm": 2.72655226953322, + "learning_rate": 1e-06, + "loss": 0.4317, + "step": 2413 + }, + { + "epoch": 0.15470392207126377, + "grad_norm": 2.7383815110827583, + "learning_rate": 1e-06, + "loss": 0.4047, + "step": 2414 + }, + { + "epoch": 0.15476800820302486, + "grad_norm": 2.5026739932341155, + "learning_rate": 1e-06, + "loss": 0.3867, + "step": 2415 + }, + { + "epoch": 0.15483209433478595, + "grad_norm": 2.674607203930584, + "learning_rate": 1e-06, + "loss": 0.3793, + "step": 2416 + }, + { + "epoch": 0.15489618046654705, + "grad_norm": 2.754064762987339, + "learning_rate": 1e-06, + "loss": 0.4281, + "step": 2417 + }, + { + "epoch": 0.15496026659830814, + "grad_norm": 2.570533524925514, + "learning_rate": 1e-06, + "loss": 0.4598, + "step": 2418 + }, + { + "epoch": 0.1550243527300692, + "grad_norm": 2.597622703572672, + "learning_rate": 1e-06, + "loss": 0.371, + "step": 2419 + }, + { + "epoch": 0.1550884388618303, + "grad_norm": 2.8288481023355776, + "learning_rate": 1e-06, + "loss": 0.4526, + "step": 2420 + }, + { + "epoch": 0.1551525249935914, + "grad_norm": 2.4192969540719527, + "learning_rate": 1e-06, + "loss": 0.4239, + "step": 2421 + }, + { + "epoch": 0.15521661112535248, + "grad_norm": 2.570303124194049, + "learning_rate": 1e-06, + "loss": 0.4859, + "step": 2422 + }, + { + "epoch": 0.15528069725711355, + "grad_norm": 2.5962623992860463, + "learning_rate": 1e-06, + "loss": 0.414, + "step": 2423 + }, + { + "epoch": 0.15534478338887464, + "grad_norm": 2.6407161187844586, + "learning_rate": 1e-06, + "loss": 0.4539, + "step": 2424 + }, + { + "epoch": 0.15540886952063573, + "grad_norm": 2.7032853850977983, + "learning_rate": 1e-06, + "loss": 0.4852, + "step": 2425 + }, + { + "epoch": 0.15547295565239683, + "grad_norm": 2.6186822936277574, + "learning_rate": 1e-06, + "loss": 0.4252, + "step": 2426 + }, + { + "epoch": 0.15553704178415792, + "grad_norm": 2.7692792260713768, + "learning_rate": 1e-06, + "loss": 0.4144, + "step": 2427 + }, + { + "epoch": 0.15560112791591899, + "grad_norm": 2.819212145403548, + "learning_rate": 1e-06, + "loss": 0.3948, + "step": 2428 + }, + { + "epoch": 0.15566521404768008, + "grad_norm": 2.7820635144861376, + "learning_rate": 1e-06, + "loss": 0.4357, + "step": 2429 + }, + { + "epoch": 0.15572930017944117, + "grad_norm": 2.540855770044114, + "learning_rate": 1e-06, + "loss": 0.3847, + "step": 2430 + }, + { + "epoch": 0.15579338631120226, + "grad_norm": 2.5881439472756202, + "learning_rate": 1e-06, + "loss": 0.3905, + "step": 2431 + }, + { + "epoch": 0.15585747244296333, + "grad_norm": 2.628112799287706, + "learning_rate": 1e-06, + "loss": 0.4897, + "step": 2432 + }, + { + "epoch": 0.15592155857472442, + "grad_norm": 2.8470469351320666, + "learning_rate": 1e-06, + "loss": 0.4372, + "step": 2433 + }, + { + "epoch": 0.15598564470648552, + "grad_norm": 2.959718617534213, + "learning_rate": 1e-06, + "loss": 0.3556, + "step": 2434 + }, + { + "epoch": 0.1560497308382466, + "grad_norm": 2.63349545605712, + "learning_rate": 1e-06, + "loss": 0.3966, + "step": 2435 + }, + { + "epoch": 0.1561138169700077, + "grad_norm": 2.6920193790596687, + "learning_rate": 1e-06, + "loss": 0.4002, + "step": 2436 + }, + { + "epoch": 0.15617790310176877, + "grad_norm": 2.5515125234377916, + "learning_rate": 1e-06, + "loss": 0.4704, + "step": 2437 + }, + { + "epoch": 0.15624198923352986, + "grad_norm": 2.7845164985771738, + "learning_rate": 1e-06, + "loss": 0.3741, + "step": 2438 + }, + { + "epoch": 0.15630607536529095, + "grad_norm": 2.7051788981886715, + "learning_rate": 1e-06, + "loss": 0.4248, + "step": 2439 + }, + { + "epoch": 0.15637016149705205, + "grad_norm": 2.5027319502036103, + "learning_rate": 1e-06, + "loss": 0.4275, + "step": 2440 + }, + { + "epoch": 0.1564342476288131, + "grad_norm": 2.67872865736227, + "learning_rate": 1e-06, + "loss": 0.3707, + "step": 2441 + }, + { + "epoch": 0.1564983337605742, + "grad_norm": 2.6234605610607797, + "learning_rate": 1e-06, + "loss": 0.4122, + "step": 2442 + }, + { + "epoch": 0.1565624198923353, + "grad_norm": 2.660891907681123, + "learning_rate": 1e-06, + "loss": 0.4079, + "step": 2443 + }, + { + "epoch": 0.1566265060240964, + "grad_norm": 2.469269141511082, + "learning_rate": 1e-06, + "loss": 0.3787, + "step": 2444 + }, + { + "epoch": 0.15669059215585748, + "grad_norm": 2.9128965615675084, + "learning_rate": 1e-06, + "loss": 0.4011, + "step": 2445 + }, + { + "epoch": 0.15675467828761855, + "grad_norm": 2.3634946944799338, + "learning_rate": 1e-06, + "loss": 0.4086, + "step": 2446 + }, + { + "epoch": 0.15681876441937964, + "grad_norm": 2.487803849131197, + "learning_rate": 1e-06, + "loss": 0.381, + "step": 2447 + }, + { + "epoch": 0.15688285055114073, + "grad_norm": 2.73280436377226, + "learning_rate": 1e-06, + "loss": 0.4247, + "step": 2448 + }, + { + "epoch": 0.15694693668290183, + "grad_norm": 2.645994090911685, + "learning_rate": 1e-06, + "loss": 0.4245, + "step": 2449 + }, + { + "epoch": 0.15701102281466292, + "grad_norm": 2.7045428079749887, + "learning_rate": 1e-06, + "loss": 0.3821, + "step": 2450 + }, + { + "epoch": 0.15707510894642399, + "grad_norm": 2.6721213176800283, + "learning_rate": 1e-06, + "loss": 0.3749, + "step": 2451 + }, + { + "epoch": 0.15713919507818508, + "grad_norm": 2.5446333446323575, + "learning_rate": 1e-06, + "loss": 0.4501, + "step": 2452 + }, + { + "epoch": 0.15720328120994617, + "grad_norm": 2.7025878221632214, + "learning_rate": 1e-06, + "loss": 0.4365, + "step": 2453 + }, + { + "epoch": 0.15726736734170726, + "grad_norm": 2.4377600454198673, + "learning_rate": 1e-06, + "loss": 0.393, + "step": 2454 + }, + { + "epoch": 0.15733145347346833, + "grad_norm": 2.3395968765113704, + "learning_rate": 1e-06, + "loss": 0.4026, + "step": 2455 + }, + { + "epoch": 0.15739553960522942, + "grad_norm": 2.5508080145151895, + "learning_rate": 1e-06, + "loss": 0.4117, + "step": 2456 + }, + { + "epoch": 0.15745962573699052, + "grad_norm": 2.9131013972018893, + "learning_rate": 1e-06, + "loss": 0.406, + "step": 2457 + }, + { + "epoch": 0.1575237118687516, + "grad_norm": 2.6926586987036005, + "learning_rate": 1e-06, + "loss": 0.383, + "step": 2458 + }, + { + "epoch": 0.1575877980005127, + "grad_norm": 2.661373242133559, + "learning_rate": 1e-06, + "loss": 0.4544, + "step": 2459 + }, + { + "epoch": 0.15765188413227377, + "grad_norm": 2.865019871633759, + "learning_rate": 1e-06, + "loss": 0.3793, + "step": 2460 + }, + { + "epoch": 0.15771597026403486, + "grad_norm": 2.7673917992905857, + "learning_rate": 1e-06, + "loss": 0.3693, + "step": 2461 + }, + { + "epoch": 0.15778005639579595, + "grad_norm": 2.6496428816256183, + "learning_rate": 1e-06, + "loss": 0.4039, + "step": 2462 + }, + { + "epoch": 0.15784414252755705, + "grad_norm": 2.5833209913336788, + "learning_rate": 1e-06, + "loss": 0.3253, + "step": 2463 + }, + { + "epoch": 0.1579082286593181, + "grad_norm": 2.9927054051853594, + "learning_rate": 1e-06, + "loss": 0.4455, + "step": 2464 + }, + { + "epoch": 0.1579723147910792, + "grad_norm": 2.688891946243902, + "learning_rate": 1e-06, + "loss": 0.4572, + "step": 2465 + }, + { + "epoch": 0.1580364009228403, + "grad_norm": 2.8014701175146275, + "learning_rate": 1e-06, + "loss": 0.4, + "step": 2466 + }, + { + "epoch": 0.1581004870546014, + "grad_norm": 2.6209516151437016, + "learning_rate": 1e-06, + "loss": 0.4293, + "step": 2467 + }, + { + "epoch": 0.15816457318636248, + "grad_norm": 2.6968176829286077, + "learning_rate": 1e-06, + "loss": 0.4418, + "step": 2468 + }, + { + "epoch": 0.15822865931812355, + "grad_norm": 2.716996681830155, + "learning_rate": 1e-06, + "loss": 0.4425, + "step": 2469 + }, + { + "epoch": 0.15829274544988464, + "grad_norm": 2.7238337604748373, + "learning_rate": 1e-06, + "loss": 0.4307, + "step": 2470 + }, + { + "epoch": 0.15835683158164574, + "grad_norm": 2.739047909106007, + "learning_rate": 1e-06, + "loss": 0.4299, + "step": 2471 + }, + { + "epoch": 0.15842091771340683, + "grad_norm": 2.526469438645234, + "learning_rate": 1e-06, + "loss": 0.4244, + "step": 2472 + }, + { + "epoch": 0.1584850038451679, + "grad_norm": 2.6278675097925763, + "learning_rate": 1e-06, + "loss": 0.4373, + "step": 2473 + }, + { + "epoch": 0.158549089976929, + "grad_norm": 2.5826759075688055, + "learning_rate": 1e-06, + "loss": 0.4258, + "step": 2474 + }, + { + "epoch": 0.15861317610869008, + "grad_norm": 2.77590367710701, + "learning_rate": 1e-06, + "loss": 0.4196, + "step": 2475 + }, + { + "epoch": 0.15867726224045117, + "grad_norm": 2.7835205766483893, + "learning_rate": 1e-06, + "loss": 0.3705, + "step": 2476 + }, + { + "epoch": 0.15874134837221227, + "grad_norm": 2.863480380954405, + "learning_rate": 1e-06, + "loss": 0.4371, + "step": 2477 + }, + { + "epoch": 0.15880543450397333, + "grad_norm": 2.8855879891557374, + "learning_rate": 1e-06, + "loss": 0.4355, + "step": 2478 + }, + { + "epoch": 0.15886952063573442, + "grad_norm": 2.721920994882, + "learning_rate": 1e-06, + "loss": 0.4384, + "step": 2479 + }, + { + "epoch": 0.15893360676749552, + "grad_norm": 2.6621503936105686, + "learning_rate": 1e-06, + "loss": 0.4144, + "step": 2480 + }, + { + "epoch": 0.1589976928992566, + "grad_norm": 2.625862393236496, + "learning_rate": 1e-06, + "loss": 0.4072, + "step": 2481 + }, + { + "epoch": 0.15906177903101767, + "grad_norm": 2.698437048521841, + "learning_rate": 1e-06, + "loss": 0.4212, + "step": 2482 + }, + { + "epoch": 0.15912586516277877, + "grad_norm": 2.747135808007895, + "learning_rate": 1e-06, + "loss": 0.3773, + "step": 2483 + }, + { + "epoch": 0.15918995129453986, + "grad_norm": 2.6924238591324436, + "learning_rate": 1e-06, + "loss": 0.4419, + "step": 2484 + }, + { + "epoch": 0.15925403742630095, + "grad_norm": 2.864874679432206, + "learning_rate": 1e-06, + "loss": 0.4116, + "step": 2485 + }, + { + "epoch": 0.15931812355806205, + "grad_norm": 2.584090760115717, + "learning_rate": 1e-06, + "loss": 0.4204, + "step": 2486 + }, + { + "epoch": 0.1593822096898231, + "grad_norm": 2.3501320571795565, + "learning_rate": 1e-06, + "loss": 0.4432, + "step": 2487 + }, + { + "epoch": 0.1594462958215842, + "grad_norm": 2.62447790662663, + "learning_rate": 1e-06, + "loss": 0.3803, + "step": 2488 + }, + { + "epoch": 0.1595103819533453, + "grad_norm": 2.809122378803803, + "learning_rate": 1e-06, + "loss": 0.4318, + "step": 2489 + }, + { + "epoch": 0.1595744680851064, + "grad_norm": 2.560302012329596, + "learning_rate": 1e-06, + "loss": 0.4023, + "step": 2490 + }, + { + "epoch": 0.15963855421686746, + "grad_norm": 2.7315313434872364, + "learning_rate": 1e-06, + "loss": 0.419, + "step": 2491 + }, + { + "epoch": 0.15970264034862855, + "grad_norm": 2.7487724870084707, + "learning_rate": 1e-06, + "loss": 0.4438, + "step": 2492 + }, + { + "epoch": 0.15976672648038964, + "grad_norm": 2.5211532370360317, + "learning_rate": 1e-06, + "loss": 0.4091, + "step": 2493 + }, + { + "epoch": 0.15983081261215074, + "grad_norm": 2.69198583244194, + "learning_rate": 1e-06, + "loss": 0.3919, + "step": 2494 + }, + { + "epoch": 0.15989489874391183, + "grad_norm": 2.6710258015363175, + "learning_rate": 1e-06, + "loss": 0.3612, + "step": 2495 + }, + { + "epoch": 0.1599589848756729, + "grad_norm": 2.6456233748049787, + "learning_rate": 1e-06, + "loss": 0.4136, + "step": 2496 + }, + { + "epoch": 0.160023071007434, + "grad_norm": 2.5859243386442037, + "learning_rate": 1e-06, + "loss": 0.4355, + "step": 2497 + }, + { + "epoch": 0.16008715713919508, + "grad_norm": 2.7520692263098416, + "learning_rate": 1e-06, + "loss": 0.4324, + "step": 2498 + }, + { + "epoch": 0.16015124327095617, + "grad_norm": 2.7816574091909723, + "learning_rate": 1e-06, + "loss": 0.3854, + "step": 2499 + }, + { + "epoch": 0.16021532940271727, + "grad_norm": 2.6344550709051044, + "learning_rate": 1e-06, + "loss": 0.4278, + "step": 2500 + }, + { + "epoch": 0.16027941553447833, + "grad_norm": 3.197606757914543, + "learning_rate": 1e-06, + "loss": 0.3987, + "step": 2501 + }, + { + "epoch": 0.16034350166623942, + "grad_norm": 2.6471620383147676, + "learning_rate": 1e-06, + "loss": 0.4028, + "step": 2502 + }, + { + "epoch": 0.16040758779800052, + "grad_norm": 2.882593526216847, + "learning_rate": 1e-06, + "loss": 0.4053, + "step": 2503 + }, + { + "epoch": 0.1604716739297616, + "grad_norm": 2.6876055696015246, + "learning_rate": 1e-06, + "loss": 0.4112, + "step": 2504 + }, + { + "epoch": 0.16053576006152268, + "grad_norm": 2.6908732798019157, + "learning_rate": 1e-06, + "loss": 0.4084, + "step": 2505 + }, + { + "epoch": 0.16059984619328377, + "grad_norm": 2.696421366995068, + "learning_rate": 1e-06, + "loss": 0.391, + "step": 2506 + }, + { + "epoch": 0.16066393232504486, + "grad_norm": 2.5708012883800184, + "learning_rate": 1e-06, + "loss": 0.4442, + "step": 2507 + }, + { + "epoch": 0.16072801845680595, + "grad_norm": 2.5577193706830594, + "learning_rate": 1e-06, + "loss": 0.3862, + "step": 2508 + }, + { + "epoch": 0.16079210458856705, + "grad_norm": 2.7187413971424266, + "learning_rate": 1e-06, + "loss": 0.4327, + "step": 2509 + }, + { + "epoch": 0.1608561907203281, + "grad_norm": 2.5817179456414037, + "learning_rate": 1e-06, + "loss": 0.3879, + "step": 2510 + }, + { + "epoch": 0.1609202768520892, + "grad_norm": 2.7269113290519593, + "learning_rate": 1e-06, + "loss": 0.4325, + "step": 2511 + }, + { + "epoch": 0.1609843629838503, + "grad_norm": 2.572578117655859, + "learning_rate": 1e-06, + "loss": 0.3851, + "step": 2512 + }, + { + "epoch": 0.1610484491156114, + "grad_norm": 2.6450180571019937, + "learning_rate": 1e-06, + "loss": 0.4411, + "step": 2513 + }, + { + "epoch": 0.16111253524737246, + "grad_norm": 2.5789710591909545, + "learning_rate": 1e-06, + "loss": 0.4102, + "step": 2514 + }, + { + "epoch": 0.16117662137913355, + "grad_norm": 2.4838537947972177, + "learning_rate": 1e-06, + "loss": 0.405, + "step": 2515 + }, + { + "epoch": 0.16124070751089464, + "grad_norm": 2.534907881809524, + "learning_rate": 1e-06, + "loss": 0.3469, + "step": 2516 + }, + { + "epoch": 0.16130479364265574, + "grad_norm": 2.500800182833469, + "learning_rate": 1e-06, + "loss": 0.3948, + "step": 2517 + }, + { + "epoch": 0.16136887977441683, + "grad_norm": 2.6929560857938974, + "learning_rate": 1e-06, + "loss": 0.4144, + "step": 2518 + }, + { + "epoch": 0.1614329659061779, + "grad_norm": 2.6113591154458287, + "learning_rate": 1e-06, + "loss": 0.3989, + "step": 2519 + }, + { + "epoch": 0.161497052037939, + "grad_norm": 2.537840773620322, + "learning_rate": 1e-06, + "loss": 0.3568, + "step": 2520 + }, + { + "epoch": 0.16156113816970008, + "grad_norm": 2.7707118455203767, + "learning_rate": 1e-06, + "loss": 0.3732, + "step": 2521 + }, + { + "epoch": 0.16162522430146117, + "grad_norm": 2.7225252383237275, + "learning_rate": 1e-06, + "loss": 0.3935, + "step": 2522 + }, + { + "epoch": 0.16168931043322224, + "grad_norm": 2.828852689766831, + "learning_rate": 1e-06, + "loss": 0.4267, + "step": 2523 + }, + { + "epoch": 0.16175339656498333, + "grad_norm": 2.603720737428551, + "learning_rate": 1e-06, + "loss": 0.4425, + "step": 2524 + }, + { + "epoch": 0.16181748269674442, + "grad_norm": 2.5478451431697033, + "learning_rate": 1e-06, + "loss": 0.3837, + "step": 2525 + }, + { + "epoch": 0.16188156882850552, + "grad_norm": 2.661807124509187, + "learning_rate": 1e-06, + "loss": 0.384, + "step": 2526 + }, + { + "epoch": 0.1619456549602666, + "grad_norm": 2.6041595365534604, + "learning_rate": 1e-06, + "loss": 0.3774, + "step": 2527 + }, + { + "epoch": 0.16200974109202768, + "grad_norm": 2.570551916458256, + "learning_rate": 1e-06, + "loss": 0.4089, + "step": 2528 + }, + { + "epoch": 0.16207382722378877, + "grad_norm": 2.691936352986267, + "learning_rate": 1e-06, + "loss": 0.3846, + "step": 2529 + }, + { + "epoch": 0.16213791335554986, + "grad_norm": 2.630771561034015, + "learning_rate": 1e-06, + "loss": 0.4067, + "step": 2530 + }, + { + "epoch": 0.16220199948731095, + "grad_norm": 2.812901706927222, + "learning_rate": 1e-06, + "loss": 0.4493, + "step": 2531 + }, + { + "epoch": 0.16226608561907202, + "grad_norm": 2.895255542589514, + "learning_rate": 1e-06, + "loss": 0.4126, + "step": 2532 + }, + { + "epoch": 0.1623301717508331, + "grad_norm": 2.689542921264475, + "learning_rate": 1e-06, + "loss": 0.4611, + "step": 2533 + }, + { + "epoch": 0.1623942578825942, + "grad_norm": 2.7578900917283318, + "learning_rate": 1e-06, + "loss": 0.4068, + "step": 2534 + }, + { + "epoch": 0.1624583440143553, + "grad_norm": 2.7909225863700104, + "learning_rate": 1e-06, + "loss": 0.4066, + "step": 2535 + }, + { + "epoch": 0.1625224301461164, + "grad_norm": 2.7296440103447535, + "learning_rate": 1e-06, + "loss": 0.4286, + "step": 2536 + }, + { + "epoch": 0.16258651627787746, + "grad_norm": 2.7413253180770747, + "learning_rate": 1e-06, + "loss": 0.398, + "step": 2537 + }, + { + "epoch": 0.16265060240963855, + "grad_norm": 2.968023303872132, + "learning_rate": 1e-06, + "loss": 0.4264, + "step": 2538 + }, + { + "epoch": 0.16271468854139964, + "grad_norm": 2.5240106724224285, + "learning_rate": 1e-06, + "loss": 0.4667, + "step": 2539 + }, + { + "epoch": 0.16277877467316074, + "grad_norm": 2.8291621239977434, + "learning_rate": 1e-06, + "loss": 0.4637, + "step": 2540 + }, + { + "epoch": 0.1628428608049218, + "grad_norm": 2.685586301138915, + "learning_rate": 1e-06, + "loss": 0.4277, + "step": 2541 + }, + { + "epoch": 0.1629069469366829, + "grad_norm": 2.7011354343541827, + "learning_rate": 1e-06, + "loss": 0.4338, + "step": 2542 + }, + { + "epoch": 0.162971033068444, + "grad_norm": 2.652397024902033, + "learning_rate": 1e-06, + "loss": 0.454, + "step": 2543 + }, + { + "epoch": 0.16303511920020508, + "grad_norm": 2.977816057807566, + "learning_rate": 1e-06, + "loss": 0.4084, + "step": 2544 + }, + { + "epoch": 0.16309920533196617, + "grad_norm": 2.7882639087445233, + "learning_rate": 1e-06, + "loss": 0.4856, + "step": 2545 + }, + { + "epoch": 0.16316329146372724, + "grad_norm": 2.5045074481642526, + "learning_rate": 1e-06, + "loss": 0.4176, + "step": 2546 + }, + { + "epoch": 0.16322737759548833, + "grad_norm": 2.617452267392621, + "learning_rate": 1e-06, + "loss": 0.4572, + "step": 2547 + }, + { + "epoch": 0.16329146372724943, + "grad_norm": 2.658894789581529, + "learning_rate": 1e-06, + "loss": 0.4675, + "step": 2548 + }, + { + "epoch": 0.16335554985901052, + "grad_norm": 2.852357932664824, + "learning_rate": 1e-06, + "loss": 0.4418, + "step": 2549 + }, + { + "epoch": 0.16341963599077158, + "grad_norm": 2.6360746418969017, + "learning_rate": 1e-06, + "loss": 0.3874, + "step": 2550 + }, + { + "epoch": 0.16348372212253268, + "grad_norm": 2.8766537155424214, + "learning_rate": 1e-06, + "loss": 0.4384, + "step": 2551 + }, + { + "epoch": 0.16354780825429377, + "grad_norm": 2.625135910693668, + "learning_rate": 1e-06, + "loss": 0.4223, + "step": 2552 + }, + { + "epoch": 0.16361189438605486, + "grad_norm": 2.564132710733575, + "learning_rate": 1e-06, + "loss": 0.4263, + "step": 2553 + }, + { + "epoch": 0.16367598051781596, + "grad_norm": 2.533594805547048, + "learning_rate": 1e-06, + "loss": 0.4281, + "step": 2554 + }, + { + "epoch": 0.16374006664957702, + "grad_norm": 2.6563807475777854, + "learning_rate": 1e-06, + "loss": 0.474, + "step": 2555 + }, + { + "epoch": 0.1638041527813381, + "grad_norm": 2.6555639273705953, + "learning_rate": 1e-06, + "loss": 0.386, + "step": 2556 + }, + { + "epoch": 0.1638682389130992, + "grad_norm": 3.423681981284491, + "learning_rate": 1e-06, + "loss": 0.3963, + "step": 2557 + }, + { + "epoch": 0.1639323250448603, + "grad_norm": 2.6343181562972364, + "learning_rate": 1e-06, + "loss": 0.4535, + "step": 2558 + }, + { + "epoch": 0.1639964111766214, + "grad_norm": 2.558480507768559, + "learning_rate": 1e-06, + "loss": 0.4296, + "step": 2559 + }, + { + "epoch": 0.16406049730838246, + "grad_norm": 2.5653732110638154, + "learning_rate": 1e-06, + "loss": 0.4264, + "step": 2560 + }, + { + "epoch": 0.16412458344014355, + "grad_norm": 2.7535149483462025, + "learning_rate": 1e-06, + "loss": 0.408, + "step": 2561 + }, + { + "epoch": 0.16418866957190464, + "grad_norm": 2.7378653936579553, + "learning_rate": 1e-06, + "loss": 0.4314, + "step": 2562 + }, + { + "epoch": 0.16425275570366574, + "grad_norm": 2.6625159731761268, + "learning_rate": 1e-06, + "loss": 0.3955, + "step": 2563 + }, + { + "epoch": 0.1643168418354268, + "grad_norm": 2.6482824209454137, + "learning_rate": 1e-06, + "loss": 0.3903, + "step": 2564 + }, + { + "epoch": 0.1643809279671879, + "grad_norm": 2.8188480793044666, + "learning_rate": 1e-06, + "loss": 0.3948, + "step": 2565 + }, + { + "epoch": 0.164445014098949, + "grad_norm": 2.563521159898662, + "learning_rate": 1e-06, + "loss": 0.401, + "step": 2566 + }, + { + "epoch": 0.16450910023071008, + "grad_norm": 2.7131990194512623, + "learning_rate": 1e-06, + "loss": 0.4144, + "step": 2567 + }, + { + "epoch": 0.16457318636247117, + "grad_norm": 2.4797567023822937, + "learning_rate": 1e-06, + "loss": 0.3669, + "step": 2568 + }, + { + "epoch": 0.16463727249423224, + "grad_norm": 2.467398013163161, + "learning_rate": 1e-06, + "loss": 0.3637, + "step": 2569 + }, + { + "epoch": 0.16470135862599333, + "grad_norm": 2.544381041213776, + "learning_rate": 1e-06, + "loss": 0.4389, + "step": 2570 + }, + { + "epoch": 0.16476544475775443, + "grad_norm": 2.5132164527889427, + "learning_rate": 1e-06, + "loss": 0.3867, + "step": 2571 + }, + { + "epoch": 0.16482953088951552, + "grad_norm": 2.5655755742496074, + "learning_rate": 1e-06, + "loss": 0.3894, + "step": 2572 + }, + { + "epoch": 0.16489361702127658, + "grad_norm": 2.5545087984725794, + "learning_rate": 1e-06, + "loss": 0.4292, + "step": 2573 + }, + { + "epoch": 0.16495770315303768, + "grad_norm": 2.724930949542696, + "learning_rate": 1e-06, + "loss": 0.4255, + "step": 2574 + }, + { + "epoch": 0.16502178928479877, + "grad_norm": 2.9083920762744464, + "learning_rate": 1e-06, + "loss": 0.4484, + "step": 2575 + }, + { + "epoch": 0.16508587541655986, + "grad_norm": 2.647945420853656, + "learning_rate": 1e-06, + "loss": 0.4454, + "step": 2576 + }, + { + "epoch": 0.16514996154832096, + "grad_norm": 2.656887468380951, + "learning_rate": 1e-06, + "loss": 0.3911, + "step": 2577 + }, + { + "epoch": 0.16521404768008202, + "grad_norm": 2.4060271256432095, + "learning_rate": 1e-06, + "loss": 0.3968, + "step": 2578 + }, + { + "epoch": 0.16527813381184311, + "grad_norm": 2.673050335073366, + "learning_rate": 1e-06, + "loss": 0.4298, + "step": 2579 + }, + { + "epoch": 0.1653422199436042, + "grad_norm": 2.6629568710429488, + "learning_rate": 1e-06, + "loss": 0.4281, + "step": 2580 + }, + { + "epoch": 0.1654063060753653, + "grad_norm": 2.6124306527589414, + "learning_rate": 1e-06, + "loss": 0.4531, + "step": 2581 + }, + { + "epoch": 0.16547039220712637, + "grad_norm": 2.7811035047254764, + "learning_rate": 1e-06, + "loss": 0.4631, + "step": 2582 + }, + { + "epoch": 0.16553447833888746, + "grad_norm": 2.5917185980322937, + "learning_rate": 1e-06, + "loss": 0.4477, + "step": 2583 + }, + { + "epoch": 0.16559856447064855, + "grad_norm": 2.591034780645144, + "learning_rate": 1e-06, + "loss": 0.4439, + "step": 2584 + }, + { + "epoch": 0.16566265060240964, + "grad_norm": 2.794625908520945, + "learning_rate": 1e-06, + "loss": 0.3689, + "step": 2585 + }, + { + "epoch": 0.16572673673417074, + "grad_norm": 2.6106494060829046, + "learning_rate": 1e-06, + "loss": 0.4439, + "step": 2586 + }, + { + "epoch": 0.1657908228659318, + "grad_norm": 2.6363465931070245, + "learning_rate": 1e-06, + "loss": 0.4187, + "step": 2587 + }, + { + "epoch": 0.1658549089976929, + "grad_norm": 2.8385488313877363, + "learning_rate": 1e-06, + "loss": 0.4257, + "step": 2588 + }, + { + "epoch": 0.165918995129454, + "grad_norm": 2.5718888642010556, + "learning_rate": 1e-06, + "loss": 0.4606, + "step": 2589 + }, + { + "epoch": 0.16598308126121508, + "grad_norm": 2.824032193073902, + "learning_rate": 1e-06, + "loss": 0.4144, + "step": 2590 + }, + { + "epoch": 0.16604716739297615, + "grad_norm": 2.6419319673150614, + "learning_rate": 1e-06, + "loss": 0.3877, + "step": 2591 + }, + { + "epoch": 0.16611125352473724, + "grad_norm": 2.6349648425656222, + "learning_rate": 1e-06, + "loss": 0.4523, + "step": 2592 + }, + { + "epoch": 0.16617533965649833, + "grad_norm": 2.5687018277447673, + "learning_rate": 1e-06, + "loss": 0.4163, + "step": 2593 + }, + { + "epoch": 0.16623942578825943, + "grad_norm": 2.7502135847373, + "learning_rate": 1e-06, + "loss": 0.3938, + "step": 2594 + }, + { + "epoch": 0.16630351192002052, + "grad_norm": 2.7870082246775536, + "learning_rate": 1e-06, + "loss": 0.3712, + "step": 2595 + }, + { + "epoch": 0.16636759805178158, + "grad_norm": 2.664471483180359, + "learning_rate": 1e-06, + "loss": 0.3785, + "step": 2596 + }, + { + "epoch": 0.16643168418354268, + "grad_norm": 2.8006579800788405, + "learning_rate": 1e-06, + "loss": 0.4322, + "step": 2597 + }, + { + "epoch": 0.16649577031530377, + "grad_norm": 2.651191581602123, + "learning_rate": 1e-06, + "loss": 0.428, + "step": 2598 + }, + { + "epoch": 0.16655985644706486, + "grad_norm": 2.7140854600700695, + "learning_rate": 1e-06, + "loss": 0.3945, + "step": 2599 + }, + { + "epoch": 0.16662394257882593, + "grad_norm": 2.7856278395057856, + "learning_rate": 1e-06, + "loss": 0.4592, + "step": 2600 + }, + { + "epoch": 0.16668802871058702, + "grad_norm": 2.607410882542063, + "learning_rate": 1e-06, + "loss": 0.4324, + "step": 2601 + }, + { + "epoch": 0.16675211484234811, + "grad_norm": 2.732714333661932, + "learning_rate": 1e-06, + "loss": 0.4252, + "step": 2602 + }, + { + "epoch": 0.1668162009741092, + "grad_norm": 2.921716514174632, + "learning_rate": 1e-06, + "loss": 0.4345, + "step": 2603 + }, + { + "epoch": 0.1668802871058703, + "grad_norm": 2.4597493378828497, + "learning_rate": 1e-06, + "loss": 0.4015, + "step": 2604 + }, + { + "epoch": 0.16694437323763137, + "grad_norm": 2.6806802003367127, + "learning_rate": 1e-06, + "loss": 0.3903, + "step": 2605 + }, + { + "epoch": 0.16700845936939246, + "grad_norm": 2.5394049202373004, + "learning_rate": 1e-06, + "loss": 0.3787, + "step": 2606 + }, + { + "epoch": 0.16707254550115355, + "grad_norm": 2.537658335645286, + "learning_rate": 1e-06, + "loss": 0.4692, + "step": 2607 + }, + { + "epoch": 0.16713663163291465, + "grad_norm": 2.486190723866445, + "learning_rate": 1e-06, + "loss": 0.4244, + "step": 2608 + }, + { + "epoch": 0.1672007177646757, + "grad_norm": 2.6499647012087504, + "learning_rate": 1e-06, + "loss": 0.3738, + "step": 2609 + }, + { + "epoch": 0.1672648038964368, + "grad_norm": 2.4785537911649365, + "learning_rate": 1e-06, + "loss": 0.415, + "step": 2610 + }, + { + "epoch": 0.1673288900281979, + "grad_norm": 2.6183916526906894, + "learning_rate": 1e-06, + "loss": 0.5018, + "step": 2611 + }, + { + "epoch": 0.167392976159959, + "grad_norm": 2.6916594666182996, + "learning_rate": 1e-06, + "loss": 0.4613, + "step": 2612 + }, + { + "epoch": 0.16745706229172008, + "grad_norm": 2.731776274248501, + "learning_rate": 1e-06, + "loss": 0.338, + "step": 2613 + }, + { + "epoch": 0.16752114842348115, + "grad_norm": 2.5633563923010367, + "learning_rate": 1e-06, + "loss": 0.4063, + "step": 2614 + }, + { + "epoch": 0.16758523455524224, + "grad_norm": 2.6792211962059986, + "learning_rate": 1e-06, + "loss": 0.4274, + "step": 2615 + }, + { + "epoch": 0.16764932068700333, + "grad_norm": 2.704490572915443, + "learning_rate": 1e-06, + "loss": 0.3596, + "step": 2616 + }, + { + "epoch": 0.16771340681876443, + "grad_norm": 2.7040420901466127, + "learning_rate": 1e-06, + "loss": 0.3669, + "step": 2617 + }, + { + "epoch": 0.16777749295052552, + "grad_norm": 2.508603329270507, + "learning_rate": 1e-06, + "loss": 0.383, + "step": 2618 + }, + { + "epoch": 0.16784157908228658, + "grad_norm": 2.667668834729716, + "learning_rate": 1e-06, + "loss": 0.3944, + "step": 2619 + }, + { + "epoch": 0.16790566521404768, + "grad_norm": 2.788009681704992, + "learning_rate": 1e-06, + "loss": 0.393, + "step": 2620 + }, + { + "epoch": 0.16796975134580877, + "grad_norm": 2.9943683712234295, + "learning_rate": 1e-06, + "loss": 0.4375, + "step": 2621 + }, + { + "epoch": 0.16803383747756986, + "grad_norm": 2.4580934140045576, + "learning_rate": 1e-06, + "loss": 0.4295, + "step": 2622 + }, + { + "epoch": 0.16809792360933093, + "grad_norm": 2.682255157750206, + "learning_rate": 1e-06, + "loss": 0.415, + "step": 2623 + }, + { + "epoch": 0.16816200974109202, + "grad_norm": 2.6326858723207915, + "learning_rate": 1e-06, + "loss": 0.4016, + "step": 2624 + }, + { + "epoch": 0.16822609587285312, + "grad_norm": 2.7531460345683945, + "learning_rate": 1e-06, + "loss": 0.4366, + "step": 2625 + }, + { + "epoch": 0.1682901820046142, + "grad_norm": 2.4582924292223063, + "learning_rate": 1e-06, + "loss": 0.3714, + "step": 2626 + }, + { + "epoch": 0.1683542681363753, + "grad_norm": 2.4189133463074413, + "learning_rate": 1e-06, + "loss": 0.3395, + "step": 2627 + }, + { + "epoch": 0.16841835426813637, + "grad_norm": 2.4978464701468854, + "learning_rate": 1e-06, + "loss": 0.3388, + "step": 2628 + }, + { + "epoch": 0.16848244039989746, + "grad_norm": 2.6293887606762003, + "learning_rate": 1e-06, + "loss": 0.4318, + "step": 2629 + }, + { + "epoch": 0.16854652653165855, + "grad_norm": 2.688163877974156, + "learning_rate": 1e-06, + "loss": 0.4566, + "step": 2630 + }, + { + "epoch": 0.16861061266341965, + "grad_norm": 2.6615797889998385, + "learning_rate": 1e-06, + "loss": 0.4286, + "step": 2631 + }, + { + "epoch": 0.1686746987951807, + "grad_norm": 2.6150907943144452, + "learning_rate": 1e-06, + "loss": 0.3935, + "step": 2632 + }, + { + "epoch": 0.1687387849269418, + "grad_norm": 2.920403244685697, + "learning_rate": 1e-06, + "loss": 0.4117, + "step": 2633 + }, + { + "epoch": 0.1688028710587029, + "grad_norm": 2.748707499569203, + "learning_rate": 1e-06, + "loss": 0.4312, + "step": 2634 + }, + { + "epoch": 0.168866957190464, + "grad_norm": 2.7044982203747323, + "learning_rate": 1e-06, + "loss": 0.3967, + "step": 2635 + }, + { + "epoch": 0.16893104332222508, + "grad_norm": 2.695912823716649, + "learning_rate": 1e-06, + "loss": 0.4081, + "step": 2636 + }, + { + "epoch": 0.16899512945398615, + "grad_norm": 2.5569026627371625, + "learning_rate": 1e-06, + "loss": 0.4057, + "step": 2637 + }, + { + "epoch": 0.16905921558574724, + "grad_norm": 2.617933717691224, + "learning_rate": 1e-06, + "loss": 0.4184, + "step": 2638 + }, + { + "epoch": 0.16912330171750833, + "grad_norm": 2.828806100247617, + "learning_rate": 1e-06, + "loss": 0.4145, + "step": 2639 + }, + { + "epoch": 0.16918738784926943, + "grad_norm": 2.6495628081292613, + "learning_rate": 1e-06, + "loss": 0.4279, + "step": 2640 + }, + { + "epoch": 0.1692514739810305, + "grad_norm": 2.762389598809753, + "learning_rate": 1e-06, + "loss": 0.4279, + "step": 2641 + }, + { + "epoch": 0.16931556011279159, + "grad_norm": 2.5771748842832647, + "learning_rate": 1e-06, + "loss": 0.3541, + "step": 2642 + }, + { + "epoch": 0.16937964624455268, + "grad_norm": 2.5680137462087944, + "learning_rate": 1e-06, + "loss": 0.3764, + "step": 2643 + }, + { + "epoch": 0.16944373237631377, + "grad_norm": 2.533760471382165, + "learning_rate": 1e-06, + "loss": 0.3492, + "step": 2644 + }, + { + "epoch": 0.16950781850807486, + "grad_norm": 2.537156219708976, + "learning_rate": 1e-06, + "loss": 0.4071, + "step": 2645 + }, + { + "epoch": 0.16957190463983593, + "grad_norm": 2.662693914933848, + "learning_rate": 1e-06, + "loss": 0.3899, + "step": 2646 + }, + { + "epoch": 0.16963599077159702, + "grad_norm": 2.6963640114525202, + "learning_rate": 1e-06, + "loss": 0.3973, + "step": 2647 + }, + { + "epoch": 0.16970007690335812, + "grad_norm": 2.381564264533936, + "learning_rate": 1e-06, + "loss": 0.393, + "step": 2648 + }, + { + "epoch": 0.1697641630351192, + "grad_norm": 2.666993056664914, + "learning_rate": 1e-06, + "loss": 0.4031, + "step": 2649 + }, + { + "epoch": 0.16982824916688027, + "grad_norm": 2.7697832526098214, + "learning_rate": 1e-06, + "loss": 0.4172, + "step": 2650 + }, + { + "epoch": 0.16989233529864137, + "grad_norm": 2.77774648574453, + "learning_rate": 1e-06, + "loss": 0.3955, + "step": 2651 + }, + { + "epoch": 0.16995642143040246, + "grad_norm": 2.773426081646378, + "learning_rate": 1e-06, + "loss": 0.4396, + "step": 2652 + }, + { + "epoch": 0.17002050756216355, + "grad_norm": 2.349409710796212, + "learning_rate": 1e-06, + "loss": 0.3712, + "step": 2653 + }, + { + "epoch": 0.17008459369392465, + "grad_norm": 2.6672881479515587, + "learning_rate": 1e-06, + "loss": 0.4144, + "step": 2654 + }, + { + "epoch": 0.1701486798256857, + "grad_norm": 2.5934156121316865, + "learning_rate": 1e-06, + "loss": 0.4574, + "step": 2655 + }, + { + "epoch": 0.1702127659574468, + "grad_norm": 3.0540709948188893, + "learning_rate": 1e-06, + "loss": 0.4623, + "step": 2656 + }, + { + "epoch": 0.1702768520892079, + "grad_norm": 2.5464630959332926, + "learning_rate": 1e-06, + "loss": 0.3893, + "step": 2657 + }, + { + "epoch": 0.170340938220969, + "grad_norm": 2.6894763480997854, + "learning_rate": 1e-06, + "loss": 0.4528, + "step": 2658 + }, + { + "epoch": 0.17040502435273006, + "grad_norm": 2.640883933595137, + "learning_rate": 1e-06, + "loss": 0.4243, + "step": 2659 + }, + { + "epoch": 0.17046911048449115, + "grad_norm": 2.84348627663285, + "learning_rate": 1e-06, + "loss": 0.3694, + "step": 2660 + }, + { + "epoch": 0.17053319661625224, + "grad_norm": 2.7193478058528147, + "learning_rate": 1e-06, + "loss": 0.4163, + "step": 2661 + }, + { + "epoch": 0.17059728274801333, + "grad_norm": 2.3724490315311173, + "learning_rate": 1e-06, + "loss": 0.4099, + "step": 2662 + }, + { + "epoch": 0.17066136887977443, + "grad_norm": 2.7397106390320998, + "learning_rate": 1e-06, + "loss": 0.4298, + "step": 2663 + }, + { + "epoch": 0.1707254550115355, + "grad_norm": 2.928854559865521, + "learning_rate": 1e-06, + "loss": 0.3542, + "step": 2664 + }, + { + "epoch": 0.17078954114329659, + "grad_norm": 3.174576889900393, + "learning_rate": 1e-06, + "loss": 0.4487, + "step": 2665 + }, + { + "epoch": 0.17085362727505768, + "grad_norm": 2.768162935078266, + "learning_rate": 1e-06, + "loss": 0.4126, + "step": 2666 + }, + { + "epoch": 0.17091771340681877, + "grad_norm": 2.6517696398341006, + "learning_rate": 1e-06, + "loss": 0.419, + "step": 2667 + }, + { + "epoch": 0.17098179953857987, + "grad_norm": 2.4838150857756656, + "learning_rate": 1e-06, + "loss": 0.4224, + "step": 2668 + }, + { + "epoch": 0.17104588567034093, + "grad_norm": 2.6833635728441023, + "learning_rate": 1e-06, + "loss": 0.4171, + "step": 2669 + }, + { + "epoch": 0.17110997180210202, + "grad_norm": 2.603487459743295, + "learning_rate": 1e-06, + "loss": 0.3906, + "step": 2670 + }, + { + "epoch": 0.17117405793386312, + "grad_norm": 2.5774146996288136, + "learning_rate": 1e-06, + "loss": 0.3956, + "step": 2671 + }, + { + "epoch": 0.1712381440656242, + "grad_norm": 2.938289624792572, + "learning_rate": 1e-06, + "loss": 0.3997, + "step": 2672 + }, + { + "epoch": 0.17130223019738527, + "grad_norm": 2.8039532904551585, + "learning_rate": 1e-06, + "loss": 0.4422, + "step": 2673 + }, + { + "epoch": 0.17136631632914637, + "grad_norm": 2.750175646020322, + "learning_rate": 1e-06, + "loss": 0.4173, + "step": 2674 + }, + { + "epoch": 0.17143040246090746, + "grad_norm": 2.4505574568016857, + "learning_rate": 1e-06, + "loss": 0.386, + "step": 2675 + }, + { + "epoch": 0.17149448859266855, + "grad_norm": 2.5900624275998285, + "learning_rate": 1e-06, + "loss": 0.4086, + "step": 2676 + }, + { + "epoch": 0.17155857472442965, + "grad_norm": 2.573632710786655, + "learning_rate": 1e-06, + "loss": 0.4297, + "step": 2677 + }, + { + "epoch": 0.1716226608561907, + "grad_norm": 2.868666362706998, + "learning_rate": 1e-06, + "loss": 0.4225, + "step": 2678 + }, + { + "epoch": 0.1716867469879518, + "grad_norm": 2.653904345921513, + "learning_rate": 1e-06, + "loss": 0.433, + "step": 2679 + }, + { + "epoch": 0.1717508331197129, + "grad_norm": 2.55586101100343, + "learning_rate": 1e-06, + "loss": 0.3604, + "step": 2680 + }, + { + "epoch": 0.171814919251474, + "grad_norm": 2.8104819328821535, + "learning_rate": 1e-06, + "loss": 0.4086, + "step": 2681 + }, + { + "epoch": 0.17187900538323506, + "grad_norm": 2.706876954065412, + "learning_rate": 1e-06, + "loss": 0.506, + "step": 2682 + }, + { + "epoch": 0.17194309151499615, + "grad_norm": 2.510943495458629, + "learning_rate": 1e-06, + "loss": 0.4211, + "step": 2683 + }, + { + "epoch": 0.17200717764675724, + "grad_norm": 2.9015099476835777, + "learning_rate": 1e-06, + "loss": 0.3541, + "step": 2684 + }, + { + "epoch": 0.17207126377851834, + "grad_norm": 2.6421129558259504, + "learning_rate": 1e-06, + "loss": 0.4685, + "step": 2685 + }, + { + "epoch": 0.17213534991027943, + "grad_norm": 2.6910006860551583, + "learning_rate": 1e-06, + "loss": 0.4363, + "step": 2686 + }, + { + "epoch": 0.1721994360420405, + "grad_norm": 3.002895992145651, + "learning_rate": 1e-06, + "loss": 0.4241, + "step": 2687 + }, + { + "epoch": 0.1722635221738016, + "grad_norm": 2.630505341919852, + "learning_rate": 1e-06, + "loss": 0.3906, + "step": 2688 + }, + { + "epoch": 0.17232760830556268, + "grad_norm": 2.5058688409268504, + "learning_rate": 1e-06, + "loss": 0.4239, + "step": 2689 + }, + { + "epoch": 0.17239169443732377, + "grad_norm": 2.9556913711850963, + "learning_rate": 1e-06, + "loss": 0.4375, + "step": 2690 + }, + { + "epoch": 0.17245578056908484, + "grad_norm": 2.7664247622675684, + "learning_rate": 1e-06, + "loss": 0.4922, + "step": 2691 + }, + { + "epoch": 0.17251986670084593, + "grad_norm": 2.619465355014572, + "learning_rate": 1e-06, + "loss": 0.4407, + "step": 2692 + }, + { + "epoch": 0.17258395283260702, + "grad_norm": 2.686744915784644, + "learning_rate": 1e-06, + "loss": 0.4147, + "step": 2693 + }, + { + "epoch": 0.17264803896436812, + "grad_norm": 2.6546237480998762, + "learning_rate": 1e-06, + "loss": 0.391, + "step": 2694 + }, + { + "epoch": 0.1727121250961292, + "grad_norm": 2.898017643183714, + "learning_rate": 1e-06, + "loss": 0.4154, + "step": 2695 + }, + { + "epoch": 0.17277621122789028, + "grad_norm": 2.6544734467667817, + "learning_rate": 1e-06, + "loss": 0.4169, + "step": 2696 + }, + { + "epoch": 0.17284029735965137, + "grad_norm": 2.6309737535988593, + "learning_rate": 1e-06, + "loss": 0.4016, + "step": 2697 + }, + { + "epoch": 0.17290438349141246, + "grad_norm": 2.643895300769825, + "learning_rate": 1e-06, + "loss": 0.4367, + "step": 2698 + }, + { + "epoch": 0.17296846962317355, + "grad_norm": 2.6365834195624682, + "learning_rate": 1e-06, + "loss": 0.466, + "step": 2699 + }, + { + "epoch": 0.17303255575493462, + "grad_norm": 2.5310055725501313, + "learning_rate": 1e-06, + "loss": 0.4427, + "step": 2700 + }, + { + "epoch": 0.1730966418866957, + "grad_norm": 2.6053804421251727, + "learning_rate": 1e-06, + "loss": 0.3722, + "step": 2701 + }, + { + "epoch": 0.1731607280184568, + "grad_norm": 2.5949485014712925, + "learning_rate": 1e-06, + "loss": 0.3716, + "step": 2702 + }, + { + "epoch": 0.1732248141502179, + "grad_norm": 2.461700301283681, + "learning_rate": 1e-06, + "loss": 0.4243, + "step": 2703 + }, + { + "epoch": 0.173288900281979, + "grad_norm": 2.667172217132449, + "learning_rate": 1e-06, + "loss": 0.4409, + "step": 2704 + }, + { + "epoch": 0.17335298641374006, + "grad_norm": 2.676509271450921, + "learning_rate": 1e-06, + "loss": 0.3738, + "step": 2705 + }, + { + "epoch": 0.17341707254550115, + "grad_norm": 2.8242422751607963, + "learning_rate": 1e-06, + "loss": 0.4045, + "step": 2706 + }, + { + "epoch": 0.17348115867726224, + "grad_norm": 2.476650612552705, + "learning_rate": 1e-06, + "loss": 0.4105, + "step": 2707 + }, + { + "epoch": 0.17354524480902334, + "grad_norm": 2.993760001961813, + "learning_rate": 1e-06, + "loss": 0.4191, + "step": 2708 + }, + { + "epoch": 0.1736093309407844, + "grad_norm": 3.0046453722930457, + "learning_rate": 1e-06, + "loss": 0.4066, + "step": 2709 + }, + { + "epoch": 0.1736734170725455, + "grad_norm": 2.9357275097901137, + "learning_rate": 1e-06, + "loss": 0.4118, + "step": 2710 + }, + { + "epoch": 0.1737375032043066, + "grad_norm": 2.610844895489968, + "learning_rate": 1e-06, + "loss": 0.4334, + "step": 2711 + }, + { + "epoch": 0.17380158933606768, + "grad_norm": 2.825996241401419, + "learning_rate": 1e-06, + "loss": 0.4395, + "step": 2712 + }, + { + "epoch": 0.17386567546782877, + "grad_norm": 2.8467469193619923, + "learning_rate": 1e-06, + "loss": 0.4601, + "step": 2713 + }, + { + "epoch": 0.17392976159958984, + "grad_norm": 2.908167454995991, + "learning_rate": 1e-06, + "loss": 0.4204, + "step": 2714 + }, + { + "epoch": 0.17399384773135093, + "grad_norm": 2.67761276554012, + "learning_rate": 1e-06, + "loss": 0.386, + "step": 2715 + }, + { + "epoch": 0.17405793386311202, + "grad_norm": 2.6317302005851384, + "learning_rate": 1e-06, + "loss": 0.4263, + "step": 2716 + }, + { + "epoch": 0.17412201999487312, + "grad_norm": 2.747385895018015, + "learning_rate": 1e-06, + "loss": 0.3776, + "step": 2717 + }, + { + "epoch": 0.17418610612663418, + "grad_norm": 2.3660503979944356, + "learning_rate": 1e-06, + "loss": 0.3765, + "step": 2718 + }, + { + "epoch": 0.17425019225839528, + "grad_norm": 2.5496448902297444, + "learning_rate": 1e-06, + "loss": 0.4378, + "step": 2719 + }, + { + "epoch": 0.17431427839015637, + "grad_norm": 2.6347752728926244, + "learning_rate": 1e-06, + "loss": 0.4169, + "step": 2720 + }, + { + "epoch": 0.17437836452191746, + "grad_norm": 2.5835556645401136, + "learning_rate": 1e-06, + "loss": 0.4, + "step": 2721 + }, + { + "epoch": 0.17444245065367855, + "grad_norm": 2.712552510654322, + "learning_rate": 1e-06, + "loss": 0.4285, + "step": 2722 + }, + { + "epoch": 0.17450653678543962, + "grad_norm": 2.641969170333069, + "learning_rate": 1e-06, + "loss": 0.4099, + "step": 2723 + }, + { + "epoch": 0.1745706229172007, + "grad_norm": 3.0637998879801187, + "learning_rate": 1e-06, + "loss": 0.4272, + "step": 2724 + }, + { + "epoch": 0.1746347090489618, + "grad_norm": 2.727006901877962, + "learning_rate": 1e-06, + "loss": 0.374, + "step": 2725 + }, + { + "epoch": 0.1746987951807229, + "grad_norm": 3.023247813234385, + "learning_rate": 1e-06, + "loss": 0.4034, + "step": 2726 + }, + { + "epoch": 0.174762881312484, + "grad_norm": 2.814083489198847, + "learning_rate": 1e-06, + "loss": 0.393, + "step": 2727 + }, + { + "epoch": 0.17482696744424506, + "grad_norm": 2.449041367738676, + "learning_rate": 1e-06, + "loss": 0.4182, + "step": 2728 + }, + { + "epoch": 0.17489105357600615, + "grad_norm": 2.8231543234962806, + "learning_rate": 1e-06, + "loss": 0.3886, + "step": 2729 + }, + { + "epoch": 0.17495513970776724, + "grad_norm": 2.7766829388477325, + "learning_rate": 1e-06, + "loss": 0.4284, + "step": 2730 + }, + { + "epoch": 0.17501922583952834, + "grad_norm": 2.847867939571184, + "learning_rate": 1e-06, + "loss": 0.4885, + "step": 2731 + }, + { + "epoch": 0.1750833119712894, + "grad_norm": 2.4603559459072897, + "learning_rate": 1e-06, + "loss": 0.3952, + "step": 2732 + }, + { + "epoch": 0.1751473981030505, + "grad_norm": 2.9335988133488913, + "learning_rate": 1e-06, + "loss": 0.367, + "step": 2733 + }, + { + "epoch": 0.1752114842348116, + "grad_norm": 2.801463474487573, + "learning_rate": 1e-06, + "loss": 0.4461, + "step": 2734 + }, + { + "epoch": 0.17527557036657268, + "grad_norm": 2.6157190091020146, + "learning_rate": 1e-06, + "loss": 0.4039, + "step": 2735 + }, + { + "epoch": 0.17533965649833377, + "grad_norm": 2.860532946073638, + "learning_rate": 1e-06, + "loss": 0.4914, + "step": 2736 + }, + { + "epoch": 0.17540374263009484, + "grad_norm": 2.6380029352671186, + "learning_rate": 1e-06, + "loss": 0.4389, + "step": 2737 + }, + { + "epoch": 0.17546782876185593, + "grad_norm": 2.7757439534033725, + "learning_rate": 1e-06, + "loss": 0.3723, + "step": 2738 + }, + { + "epoch": 0.17553191489361702, + "grad_norm": 2.5525342561888293, + "learning_rate": 1e-06, + "loss": 0.4242, + "step": 2739 + }, + { + "epoch": 0.17559600102537812, + "grad_norm": 2.8172344485968264, + "learning_rate": 1e-06, + "loss": 0.447, + "step": 2740 + }, + { + "epoch": 0.17566008715713918, + "grad_norm": 2.609296582257566, + "learning_rate": 1e-06, + "loss": 0.3896, + "step": 2741 + }, + { + "epoch": 0.17572417328890028, + "grad_norm": 2.4000086904756732, + "learning_rate": 1e-06, + "loss": 0.3644, + "step": 2742 + }, + { + "epoch": 0.17578825942066137, + "grad_norm": 2.8350571223146908, + "learning_rate": 1e-06, + "loss": 0.3783, + "step": 2743 + }, + { + "epoch": 0.17585234555242246, + "grad_norm": 2.7509441663604552, + "learning_rate": 1e-06, + "loss": 0.4094, + "step": 2744 + }, + { + "epoch": 0.17591643168418356, + "grad_norm": 2.5301745440674135, + "learning_rate": 1e-06, + "loss": 0.3968, + "step": 2745 + }, + { + "epoch": 0.17598051781594462, + "grad_norm": 2.798905110335789, + "learning_rate": 1e-06, + "loss": 0.3853, + "step": 2746 + }, + { + "epoch": 0.1760446039477057, + "grad_norm": 2.705049111037033, + "learning_rate": 1e-06, + "loss": 0.4149, + "step": 2747 + }, + { + "epoch": 0.1761086900794668, + "grad_norm": 2.8010432178773823, + "learning_rate": 1e-06, + "loss": 0.3797, + "step": 2748 + }, + { + "epoch": 0.1761727762112279, + "grad_norm": 2.4673188572577125, + "learning_rate": 1e-06, + "loss": 0.3891, + "step": 2749 + }, + { + "epoch": 0.17623686234298896, + "grad_norm": 2.345419411995933, + "learning_rate": 1e-06, + "loss": 0.3931, + "step": 2750 + }, + { + "epoch": 0.17630094847475006, + "grad_norm": 2.652929007808521, + "learning_rate": 1e-06, + "loss": 0.402, + "step": 2751 + }, + { + "epoch": 0.17636503460651115, + "grad_norm": 2.4664570988941397, + "learning_rate": 1e-06, + "loss": 0.3661, + "step": 2752 + }, + { + "epoch": 0.17642912073827224, + "grad_norm": 2.5442145233577733, + "learning_rate": 1e-06, + "loss": 0.387, + "step": 2753 + }, + { + "epoch": 0.17649320687003334, + "grad_norm": 2.663665531636393, + "learning_rate": 1e-06, + "loss": 0.5124, + "step": 2754 + }, + { + "epoch": 0.1765572930017944, + "grad_norm": 2.6084492727596986, + "learning_rate": 1e-06, + "loss": 0.4015, + "step": 2755 + }, + { + "epoch": 0.1766213791335555, + "grad_norm": 2.6078674777982007, + "learning_rate": 1e-06, + "loss": 0.477, + "step": 2756 + }, + { + "epoch": 0.1766854652653166, + "grad_norm": 2.548574378570834, + "learning_rate": 1e-06, + "loss": 0.3738, + "step": 2757 + }, + { + "epoch": 0.17674955139707768, + "grad_norm": 2.5789832769642036, + "learning_rate": 1e-06, + "loss": 0.3945, + "step": 2758 + }, + { + "epoch": 0.17681363752883875, + "grad_norm": 2.717502081315403, + "learning_rate": 1e-06, + "loss": 0.4145, + "step": 2759 + }, + { + "epoch": 0.17687772366059984, + "grad_norm": 2.568127521668392, + "learning_rate": 1e-06, + "loss": 0.3814, + "step": 2760 + }, + { + "epoch": 0.17694180979236093, + "grad_norm": 2.789005011677915, + "learning_rate": 1e-06, + "loss": 0.4004, + "step": 2761 + }, + { + "epoch": 0.17700589592412203, + "grad_norm": 2.477575732446859, + "learning_rate": 1e-06, + "loss": 0.4279, + "step": 2762 + }, + { + "epoch": 0.17706998205588312, + "grad_norm": 2.7186598813432994, + "learning_rate": 1e-06, + "loss": 0.4073, + "step": 2763 + }, + { + "epoch": 0.17713406818764418, + "grad_norm": 2.7710934300979773, + "learning_rate": 1e-06, + "loss": 0.4294, + "step": 2764 + }, + { + "epoch": 0.17719815431940528, + "grad_norm": 2.7693788880209076, + "learning_rate": 1e-06, + "loss": 0.3503, + "step": 2765 + }, + { + "epoch": 0.17726224045116637, + "grad_norm": 2.6829776861796404, + "learning_rate": 1e-06, + "loss": 0.4504, + "step": 2766 + }, + { + "epoch": 0.17732632658292746, + "grad_norm": 2.4899668268787583, + "learning_rate": 1e-06, + "loss": 0.4042, + "step": 2767 + }, + { + "epoch": 0.17739041271468853, + "grad_norm": 2.5376629009155374, + "learning_rate": 1e-06, + "loss": 0.4113, + "step": 2768 + }, + { + "epoch": 0.17745449884644962, + "grad_norm": 2.6190589519635847, + "learning_rate": 1e-06, + "loss": 0.4423, + "step": 2769 + }, + { + "epoch": 0.17751858497821071, + "grad_norm": 2.733587140196812, + "learning_rate": 1e-06, + "loss": 0.3711, + "step": 2770 + }, + { + "epoch": 0.1775826711099718, + "grad_norm": 2.7646668438152378, + "learning_rate": 1e-06, + "loss": 0.4172, + "step": 2771 + }, + { + "epoch": 0.1776467572417329, + "grad_norm": 2.595584633191652, + "learning_rate": 1e-06, + "loss": 0.3571, + "step": 2772 + }, + { + "epoch": 0.17771084337349397, + "grad_norm": 2.7361207178039266, + "learning_rate": 1e-06, + "loss": 0.407, + "step": 2773 + }, + { + "epoch": 0.17777492950525506, + "grad_norm": 2.876956812662481, + "learning_rate": 1e-06, + "loss": 0.4357, + "step": 2774 + }, + { + "epoch": 0.17783901563701615, + "grad_norm": 2.5906483777638485, + "learning_rate": 1e-06, + "loss": 0.4096, + "step": 2775 + }, + { + "epoch": 0.17790310176877724, + "grad_norm": 2.604898683351699, + "learning_rate": 1e-06, + "loss": 0.3946, + "step": 2776 + }, + { + "epoch": 0.17796718790053834, + "grad_norm": 2.6280129282501243, + "learning_rate": 1e-06, + "loss": 0.4328, + "step": 2777 + }, + { + "epoch": 0.1780312740322994, + "grad_norm": 2.773509257973345, + "learning_rate": 1e-06, + "loss": 0.4847, + "step": 2778 + }, + { + "epoch": 0.1780953601640605, + "grad_norm": 2.669028302656756, + "learning_rate": 1e-06, + "loss": 0.4049, + "step": 2779 + }, + { + "epoch": 0.1781594462958216, + "grad_norm": 2.5119037751744466, + "learning_rate": 1e-06, + "loss": 0.3751, + "step": 2780 + }, + { + "epoch": 0.17822353242758268, + "grad_norm": 2.719489579389454, + "learning_rate": 1e-06, + "loss": 0.4567, + "step": 2781 + }, + { + "epoch": 0.17828761855934375, + "grad_norm": 2.802127986585095, + "learning_rate": 1e-06, + "loss": 0.4662, + "step": 2782 + }, + { + "epoch": 0.17835170469110484, + "grad_norm": 3.488029860507148, + "learning_rate": 1e-06, + "loss": 0.419, + "step": 2783 + }, + { + "epoch": 0.17841579082286593, + "grad_norm": 2.7624002719807117, + "learning_rate": 1e-06, + "loss": 0.4514, + "step": 2784 + }, + { + "epoch": 0.17847987695462703, + "grad_norm": 2.7263045947433677, + "learning_rate": 1e-06, + "loss": 0.4049, + "step": 2785 + }, + { + "epoch": 0.17854396308638812, + "grad_norm": 2.664083750414536, + "learning_rate": 1e-06, + "loss": 0.4095, + "step": 2786 + }, + { + "epoch": 0.17860804921814918, + "grad_norm": 2.5985944580963025, + "learning_rate": 1e-06, + "loss": 0.4391, + "step": 2787 + }, + { + "epoch": 0.17867213534991028, + "grad_norm": 2.6064454548319884, + "learning_rate": 1e-06, + "loss": 0.4302, + "step": 2788 + }, + { + "epoch": 0.17873622148167137, + "grad_norm": 2.5802910154946668, + "learning_rate": 1e-06, + "loss": 0.3561, + "step": 2789 + }, + { + "epoch": 0.17880030761343246, + "grad_norm": 2.571872682594762, + "learning_rate": 1e-06, + "loss": 0.3856, + "step": 2790 + }, + { + "epoch": 0.17886439374519353, + "grad_norm": 2.6983518939151896, + "learning_rate": 1e-06, + "loss": 0.4202, + "step": 2791 + }, + { + "epoch": 0.17892847987695462, + "grad_norm": 2.599966184965254, + "learning_rate": 1e-06, + "loss": 0.4119, + "step": 2792 + }, + { + "epoch": 0.17899256600871571, + "grad_norm": 3.207940867000454, + "learning_rate": 1e-06, + "loss": 0.4075, + "step": 2793 + }, + { + "epoch": 0.1790566521404768, + "grad_norm": 2.714492976153829, + "learning_rate": 1e-06, + "loss": 0.406, + "step": 2794 + }, + { + "epoch": 0.1791207382722379, + "grad_norm": 2.6020945100146124, + "learning_rate": 1e-06, + "loss": 0.4085, + "step": 2795 + }, + { + "epoch": 0.17918482440399897, + "grad_norm": 2.751146216088707, + "learning_rate": 1e-06, + "loss": 0.3513, + "step": 2796 + }, + { + "epoch": 0.17924891053576006, + "grad_norm": 2.7125630629926847, + "learning_rate": 1e-06, + "loss": 0.4104, + "step": 2797 + }, + { + "epoch": 0.17931299666752115, + "grad_norm": 2.5691944151852666, + "learning_rate": 1e-06, + "loss": 0.3925, + "step": 2798 + }, + { + "epoch": 0.17937708279928224, + "grad_norm": 2.4952783268939, + "learning_rate": 1e-06, + "loss": 0.3993, + "step": 2799 + }, + { + "epoch": 0.1794411689310433, + "grad_norm": 2.6788205945919157, + "learning_rate": 1e-06, + "loss": 0.3886, + "step": 2800 + }, + { + "epoch": 0.1795052550628044, + "grad_norm": 2.6060361462237736, + "learning_rate": 1e-06, + "loss": 0.3713, + "step": 2801 + }, + { + "epoch": 0.1795693411945655, + "grad_norm": 2.825696798513869, + "learning_rate": 1e-06, + "loss": 0.4254, + "step": 2802 + }, + { + "epoch": 0.1796334273263266, + "grad_norm": 2.6233720527199287, + "learning_rate": 1e-06, + "loss": 0.3849, + "step": 2803 + }, + { + "epoch": 0.17969751345808768, + "grad_norm": 2.72767923666059, + "learning_rate": 1e-06, + "loss": 0.3825, + "step": 2804 + }, + { + "epoch": 0.17976159958984875, + "grad_norm": 2.6288577192437903, + "learning_rate": 1e-06, + "loss": 0.4019, + "step": 2805 + }, + { + "epoch": 0.17982568572160984, + "grad_norm": 2.897506293696981, + "learning_rate": 1e-06, + "loss": 0.4932, + "step": 2806 + }, + { + "epoch": 0.17988977185337093, + "grad_norm": 2.6155573925589124, + "learning_rate": 1e-06, + "loss": 0.3817, + "step": 2807 + }, + { + "epoch": 0.17995385798513203, + "grad_norm": 2.7440180522487627, + "learning_rate": 1e-06, + "loss": 0.3971, + "step": 2808 + }, + { + "epoch": 0.1800179441168931, + "grad_norm": 2.8646368360187378, + "learning_rate": 1e-06, + "loss": 0.4188, + "step": 2809 + }, + { + "epoch": 0.18008203024865418, + "grad_norm": 2.569865671345972, + "learning_rate": 1e-06, + "loss": 0.382, + "step": 2810 + }, + { + "epoch": 0.18014611638041528, + "grad_norm": 2.5413764281681055, + "learning_rate": 1e-06, + "loss": 0.3716, + "step": 2811 + }, + { + "epoch": 0.18021020251217637, + "grad_norm": 2.438466070103246, + "learning_rate": 1e-06, + "loss": 0.366, + "step": 2812 + }, + { + "epoch": 0.18027428864393746, + "grad_norm": 2.5496759695348814, + "learning_rate": 1e-06, + "loss": 0.4054, + "step": 2813 + }, + { + "epoch": 0.18033837477569853, + "grad_norm": 2.6402962681310065, + "learning_rate": 1e-06, + "loss": 0.4161, + "step": 2814 + }, + { + "epoch": 0.18040246090745962, + "grad_norm": 2.670866895541432, + "learning_rate": 1e-06, + "loss": 0.4144, + "step": 2815 + }, + { + "epoch": 0.18046654703922071, + "grad_norm": 2.791334110746202, + "learning_rate": 1e-06, + "loss": 0.3661, + "step": 2816 + }, + { + "epoch": 0.1805306331709818, + "grad_norm": 2.509276363670558, + "learning_rate": 1e-06, + "loss": 0.4076, + "step": 2817 + }, + { + "epoch": 0.18059471930274287, + "grad_norm": 2.5760759381567775, + "learning_rate": 1e-06, + "loss": 0.3704, + "step": 2818 + }, + { + "epoch": 0.18065880543450397, + "grad_norm": 2.823011500253391, + "learning_rate": 1e-06, + "loss": 0.3794, + "step": 2819 + }, + { + "epoch": 0.18072289156626506, + "grad_norm": 2.5784308273181895, + "learning_rate": 1e-06, + "loss": 0.4494, + "step": 2820 + }, + { + "epoch": 0.18078697769802615, + "grad_norm": 2.8354990552703767, + "learning_rate": 1e-06, + "loss": 0.4224, + "step": 2821 + }, + { + "epoch": 0.18085106382978725, + "grad_norm": 2.545519235918881, + "learning_rate": 1e-06, + "loss": 0.3917, + "step": 2822 + }, + { + "epoch": 0.1809151499615483, + "grad_norm": 2.650679339899046, + "learning_rate": 1e-06, + "loss": 0.4316, + "step": 2823 + }, + { + "epoch": 0.1809792360933094, + "grad_norm": 2.5115419988434167, + "learning_rate": 1e-06, + "loss": 0.4015, + "step": 2824 + }, + { + "epoch": 0.1810433222250705, + "grad_norm": 2.5266254302391333, + "learning_rate": 1e-06, + "loss": 0.3854, + "step": 2825 + }, + { + "epoch": 0.1811074083568316, + "grad_norm": 2.7285294453224895, + "learning_rate": 1e-06, + "loss": 0.3494, + "step": 2826 + }, + { + "epoch": 0.18117149448859265, + "grad_norm": 2.6957667643101453, + "learning_rate": 1e-06, + "loss": 0.4484, + "step": 2827 + }, + { + "epoch": 0.18123558062035375, + "grad_norm": 2.5773499213332327, + "learning_rate": 1e-06, + "loss": 0.3713, + "step": 2828 + }, + { + "epoch": 0.18129966675211484, + "grad_norm": 2.605720550620889, + "learning_rate": 1e-06, + "loss": 0.4316, + "step": 2829 + }, + { + "epoch": 0.18136375288387593, + "grad_norm": 2.852712735022763, + "learning_rate": 1e-06, + "loss": 0.4289, + "step": 2830 + }, + { + "epoch": 0.18142783901563703, + "grad_norm": 2.75059295048179, + "learning_rate": 1e-06, + "loss": 0.3943, + "step": 2831 + }, + { + "epoch": 0.1814919251473981, + "grad_norm": 3.210091013374015, + "learning_rate": 1e-06, + "loss": 0.4356, + "step": 2832 + }, + { + "epoch": 0.18155601127915919, + "grad_norm": 2.7176044087136577, + "learning_rate": 1e-06, + "loss": 0.4638, + "step": 2833 + }, + { + "epoch": 0.18162009741092028, + "grad_norm": 2.6859048781918156, + "learning_rate": 1e-06, + "loss": 0.3808, + "step": 2834 + }, + { + "epoch": 0.18168418354268137, + "grad_norm": 2.690898630883437, + "learning_rate": 1e-06, + "loss": 0.4282, + "step": 2835 + }, + { + "epoch": 0.18174826967444246, + "grad_norm": 2.670934781807855, + "learning_rate": 1e-06, + "loss": 0.3632, + "step": 2836 + }, + { + "epoch": 0.18181235580620353, + "grad_norm": 2.5683697269090775, + "learning_rate": 1e-06, + "loss": 0.3937, + "step": 2837 + }, + { + "epoch": 0.18187644193796462, + "grad_norm": 2.721953999228212, + "learning_rate": 1e-06, + "loss": 0.4822, + "step": 2838 + }, + { + "epoch": 0.18194052806972572, + "grad_norm": 2.761994453753278, + "learning_rate": 1e-06, + "loss": 0.4528, + "step": 2839 + }, + { + "epoch": 0.1820046142014868, + "grad_norm": 2.4924789215698047, + "learning_rate": 1e-06, + "loss": 0.4148, + "step": 2840 + }, + { + "epoch": 0.18206870033324787, + "grad_norm": 2.6911603552831953, + "learning_rate": 1e-06, + "loss": 0.4053, + "step": 2841 + }, + { + "epoch": 0.18213278646500897, + "grad_norm": 2.4914369771221514, + "learning_rate": 1e-06, + "loss": 0.4142, + "step": 2842 + }, + { + "epoch": 0.18219687259677006, + "grad_norm": 2.670264209025931, + "learning_rate": 1e-06, + "loss": 0.4526, + "step": 2843 + }, + { + "epoch": 0.18226095872853115, + "grad_norm": 2.655186995685657, + "learning_rate": 1e-06, + "loss": 0.4319, + "step": 2844 + }, + { + "epoch": 0.18232504486029225, + "grad_norm": 2.7994660696485654, + "learning_rate": 1e-06, + "loss": 0.3766, + "step": 2845 + }, + { + "epoch": 0.1823891309920533, + "grad_norm": 2.6672566061037983, + "learning_rate": 1e-06, + "loss": 0.3663, + "step": 2846 + }, + { + "epoch": 0.1824532171238144, + "grad_norm": 2.4995281084688403, + "learning_rate": 1e-06, + "loss": 0.3993, + "step": 2847 + }, + { + "epoch": 0.1825173032555755, + "grad_norm": 2.9307889954258664, + "learning_rate": 1e-06, + "loss": 0.4265, + "step": 2848 + }, + { + "epoch": 0.1825813893873366, + "grad_norm": 2.418817204994986, + "learning_rate": 1e-06, + "loss": 0.399, + "step": 2849 + }, + { + "epoch": 0.18264547551909766, + "grad_norm": 2.683877473241475, + "learning_rate": 1e-06, + "loss": 0.4107, + "step": 2850 + }, + { + "epoch": 0.18270956165085875, + "grad_norm": 2.4528568962910615, + "learning_rate": 1e-06, + "loss": 0.4044, + "step": 2851 + }, + { + "epoch": 0.18277364778261984, + "grad_norm": 2.507911995843844, + "learning_rate": 1e-06, + "loss": 0.4158, + "step": 2852 + }, + { + "epoch": 0.18283773391438093, + "grad_norm": 2.6538955269222164, + "learning_rate": 1e-06, + "loss": 0.4015, + "step": 2853 + }, + { + "epoch": 0.18290182004614203, + "grad_norm": 2.3812884712864055, + "learning_rate": 1e-06, + "loss": 0.3987, + "step": 2854 + }, + { + "epoch": 0.1829659061779031, + "grad_norm": 2.7519692295648457, + "learning_rate": 1e-06, + "loss": 0.4249, + "step": 2855 + }, + { + "epoch": 0.18302999230966419, + "grad_norm": 2.5769076247826717, + "learning_rate": 1e-06, + "loss": 0.4546, + "step": 2856 + }, + { + "epoch": 0.18309407844142528, + "grad_norm": 2.7671252476956343, + "learning_rate": 1e-06, + "loss": 0.4223, + "step": 2857 + }, + { + "epoch": 0.18315816457318637, + "grad_norm": 2.7236996174804853, + "learning_rate": 1e-06, + "loss": 0.4156, + "step": 2858 + }, + { + "epoch": 0.18322225070494744, + "grad_norm": 2.777018765978659, + "learning_rate": 1e-06, + "loss": 0.4837, + "step": 2859 + }, + { + "epoch": 0.18328633683670853, + "grad_norm": 2.7820808558817247, + "learning_rate": 1e-06, + "loss": 0.4043, + "step": 2860 + }, + { + "epoch": 0.18335042296846962, + "grad_norm": 2.728486937802698, + "learning_rate": 1e-06, + "loss": 0.4269, + "step": 2861 + }, + { + "epoch": 0.18341450910023072, + "grad_norm": 2.4941825007689644, + "learning_rate": 1e-06, + "loss": 0.4187, + "step": 2862 + }, + { + "epoch": 0.1834785952319918, + "grad_norm": 2.5636554116823262, + "learning_rate": 1e-06, + "loss": 0.4574, + "step": 2863 + }, + { + "epoch": 0.18354268136375287, + "grad_norm": 2.745168426594291, + "learning_rate": 1e-06, + "loss": 0.4249, + "step": 2864 + }, + { + "epoch": 0.18360676749551397, + "grad_norm": 2.55893475455084, + "learning_rate": 1e-06, + "loss": 0.3796, + "step": 2865 + }, + { + "epoch": 0.18367085362727506, + "grad_norm": 2.48777488032714, + "learning_rate": 1e-06, + "loss": 0.432, + "step": 2866 + }, + { + "epoch": 0.18373493975903615, + "grad_norm": 3.1189156752141023, + "learning_rate": 1e-06, + "loss": 0.4482, + "step": 2867 + }, + { + "epoch": 0.18379902589079722, + "grad_norm": 2.5609793040069584, + "learning_rate": 1e-06, + "loss": 0.4464, + "step": 2868 + }, + { + "epoch": 0.1838631120225583, + "grad_norm": 2.538218566718866, + "learning_rate": 1e-06, + "loss": 0.4741, + "step": 2869 + }, + { + "epoch": 0.1839271981543194, + "grad_norm": 2.473628489684195, + "learning_rate": 1e-06, + "loss": 0.4571, + "step": 2870 + }, + { + "epoch": 0.1839912842860805, + "grad_norm": 2.6655771340155425, + "learning_rate": 1e-06, + "loss": 0.4382, + "step": 2871 + }, + { + "epoch": 0.1840553704178416, + "grad_norm": 2.5370217160344426, + "learning_rate": 1e-06, + "loss": 0.363, + "step": 2872 + }, + { + "epoch": 0.18411945654960266, + "grad_norm": 2.4867648521029797, + "learning_rate": 1e-06, + "loss": 0.3996, + "step": 2873 + }, + { + "epoch": 0.18418354268136375, + "grad_norm": 2.835924752123543, + "learning_rate": 1e-06, + "loss": 0.4362, + "step": 2874 + }, + { + "epoch": 0.18424762881312484, + "grad_norm": 2.405751638794753, + "learning_rate": 1e-06, + "loss": 0.4, + "step": 2875 + }, + { + "epoch": 0.18431171494488593, + "grad_norm": 2.8384661298225087, + "learning_rate": 1e-06, + "loss": 0.3972, + "step": 2876 + }, + { + "epoch": 0.184375801076647, + "grad_norm": 2.4594890443427184, + "learning_rate": 1e-06, + "loss": 0.4313, + "step": 2877 + }, + { + "epoch": 0.1844398872084081, + "grad_norm": 2.526104120739134, + "learning_rate": 1e-06, + "loss": 0.3513, + "step": 2878 + }, + { + "epoch": 0.1845039733401692, + "grad_norm": 2.6250238766805585, + "learning_rate": 1e-06, + "loss": 0.4057, + "step": 2879 + }, + { + "epoch": 0.18456805947193028, + "grad_norm": 2.633967543674042, + "learning_rate": 1e-06, + "loss": 0.435, + "step": 2880 + }, + { + "epoch": 0.18463214560369137, + "grad_norm": 2.7900500444927507, + "learning_rate": 1e-06, + "loss": 0.462, + "step": 2881 + }, + { + "epoch": 0.18469623173545244, + "grad_norm": 2.9289848277770814, + "learning_rate": 1e-06, + "loss": 0.453, + "step": 2882 + }, + { + "epoch": 0.18476031786721353, + "grad_norm": 2.6918411760353536, + "learning_rate": 1e-06, + "loss": 0.3917, + "step": 2883 + }, + { + "epoch": 0.18482440399897462, + "grad_norm": 2.9527899042701615, + "learning_rate": 1e-06, + "loss": 0.3835, + "step": 2884 + }, + { + "epoch": 0.18488849013073572, + "grad_norm": 2.6012306739317785, + "learning_rate": 1e-06, + "loss": 0.4306, + "step": 2885 + }, + { + "epoch": 0.1849525762624968, + "grad_norm": 2.9987158571479977, + "learning_rate": 1e-06, + "loss": 0.365, + "step": 2886 + }, + { + "epoch": 0.18501666239425787, + "grad_norm": 2.800871847179712, + "learning_rate": 1e-06, + "loss": 0.3935, + "step": 2887 + }, + { + "epoch": 0.18508074852601897, + "grad_norm": 2.828246130679259, + "learning_rate": 1e-06, + "loss": 0.3956, + "step": 2888 + }, + { + "epoch": 0.18514483465778006, + "grad_norm": 2.619825070399056, + "learning_rate": 1e-06, + "loss": 0.4172, + "step": 2889 + }, + { + "epoch": 0.18520892078954115, + "grad_norm": 2.624254162054244, + "learning_rate": 1e-06, + "loss": 0.4387, + "step": 2890 + }, + { + "epoch": 0.18527300692130222, + "grad_norm": 2.669571750177714, + "learning_rate": 1e-06, + "loss": 0.4586, + "step": 2891 + }, + { + "epoch": 0.1853370930530633, + "grad_norm": 2.58413530972886, + "learning_rate": 1e-06, + "loss": 0.4605, + "step": 2892 + }, + { + "epoch": 0.1854011791848244, + "grad_norm": 2.750414580107718, + "learning_rate": 1e-06, + "loss": 0.4133, + "step": 2893 + }, + { + "epoch": 0.1854652653165855, + "grad_norm": 2.5738479042844635, + "learning_rate": 1e-06, + "loss": 0.3957, + "step": 2894 + }, + { + "epoch": 0.1855293514483466, + "grad_norm": 2.5740432194741394, + "learning_rate": 1e-06, + "loss": 0.3783, + "step": 2895 + }, + { + "epoch": 0.18559343758010766, + "grad_norm": 2.7328320766355745, + "learning_rate": 1e-06, + "loss": 0.4155, + "step": 2896 + }, + { + "epoch": 0.18565752371186875, + "grad_norm": 2.6266822727179617, + "learning_rate": 1e-06, + "loss": 0.4322, + "step": 2897 + }, + { + "epoch": 0.18572160984362984, + "grad_norm": 2.7423123147741624, + "learning_rate": 1e-06, + "loss": 0.3899, + "step": 2898 + }, + { + "epoch": 0.18578569597539094, + "grad_norm": 2.6644779759023405, + "learning_rate": 1e-06, + "loss": 0.3878, + "step": 2899 + }, + { + "epoch": 0.185849782107152, + "grad_norm": 2.8416891170218164, + "learning_rate": 1e-06, + "loss": 0.3696, + "step": 2900 + }, + { + "epoch": 0.1859138682389131, + "grad_norm": 2.651614683525343, + "learning_rate": 1e-06, + "loss": 0.4604, + "step": 2901 + }, + { + "epoch": 0.1859779543706742, + "grad_norm": 2.6711252166907253, + "learning_rate": 1e-06, + "loss": 0.4333, + "step": 2902 + }, + { + "epoch": 0.18604204050243528, + "grad_norm": 2.6775615588910275, + "learning_rate": 1e-06, + "loss": 0.3517, + "step": 2903 + }, + { + "epoch": 0.18610612663419637, + "grad_norm": 2.702059561627033, + "learning_rate": 1e-06, + "loss": 0.4167, + "step": 2904 + }, + { + "epoch": 0.18617021276595744, + "grad_norm": 2.536263599797668, + "learning_rate": 1e-06, + "loss": 0.3772, + "step": 2905 + }, + { + "epoch": 0.18623429889771853, + "grad_norm": 2.845862941191916, + "learning_rate": 1e-06, + "loss": 0.4259, + "step": 2906 + }, + { + "epoch": 0.18629838502947962, + "grad_norm": 2.700477381709106, + "learning_rate": 1e-06, + "loss": 0.3883, + "step": 2907 + }, + { + "epoch": 0.18636247116124072, + "grad_norm": 2.682607738754587, + "learning_rate": 1e-06, + "loss": 0.3964, + "step": 2908 + }, + { + "epoch": 0.18642655729300178, + "grad_norm": 2.554052321832966, + "learning_rate": 1e-06, + "loss": 0.3855, + "step": 2909 + }, + { + "epoch": 0.18649064342476288, + "grad_norm": 2.6403351465968274, + "learning_rate": 1e-06, + "loss": 0.4517, + "step": 2910 + }, + { + "epoch": 0.18655472955652397, + "grad_norm": 2.5477057026439476, + "learning_rate": 1e-06, + "loss": 0.3922, + "step": 2911 + }, + { + "epoch": 0.18661881568828506, + "grad_norm": 2.5704484585325926, + "learning_rate": 1e-06, + "loss": 0.4367, + "step": 2912 + }, + { + "epoch": 0.18668290182004615, + "grad_norm": 2.6724804046778594, + "learning_rate": 1e-06, + "loss": 0.3791, + "step": 2913 + }, + { + "epoch": 0.18674698795180722, + "grad_norm": 2.705919945837271, + "learning_rate": 1e-06, + "loss": 0.4193, + "step": 2914 + }, + { + "epoch": 0.1868110740835683, + "grad_norm": 2.6275085521905037, + "learning_rate": 1e-06, + "loss": 0.3668, + "step": 2915 + }, + { + "epoch": 0.1868751602153294, + "grad_norm": 2.5644893129504993, + "learning_rate": 1e-06, + "loss": 0.385, + "step": 2916 + }, + { + "epoch": 0.1869392463470905, + "grad_norm": 2.6374626451766843, + "learning_rate": 1e-06, + "loss": 0.3871, + "step": 2917 + }, + { + "epoch": 0.18700333247885156, + "grad_norm": 2.916365013642046, + "learning_rate": 1e-06, + "loss": 0.4274, + "step": 2918 + }, + { + "epoch": 0.18706741861061266, + "grad_norm": 2.5881011852417557, + "learning_rate": 1e-06, + "loss": 0.4246, + "step": 2919 + }, + { + "epoch": 0.18713150474237375, + "grad_norm": 2.455367104725631, + "learning_rate": 1e-06, + "loss": 0.4351, + "step": 2920 + }, + { + "epoch": 0.18719559087413484, + "grad_norm": 2.675434496637276, + "learning_rate": 1e-06, + "loss": 0.4039, + "step": 2921 + }, + { + "epoch": 0.18725967700589594, + "grad_norm": 2.7514060307588823, + "learning_rate": 1e-06, + "loss": 0.3964, + "step": 2922 + }, + { + "epoch": 0.187323763137657, + "grad_norm": 2.8818081578888135, + "learning_rate": 1e-06, + "loss": 0.4365, + "step": 2923 + }, + { + "epoch": 0.1873878492694181, + "grad_norm": 2.722378179353567, + "learning_rate": 1e-06, + "loss": 0.4214, + "step": 2924 + }, + { + "epoch": 0.1874519354011792, + "grad_norm": 2.6386298247760442, + "learning_rate": 1e-06, + "loss": 0.4116, + "step": 2925 + }, + { + "epoch": 0.18751602153294028, + "grad_norm": 2.6530040293298693, + "learning_rate": 1e-06, + "loss": 0.3865, + "step": 2926 + }, + { + "epoch": 0.18758010766470135, + "grad_norm": 2.8852720002345364, + "learning_rate": 1e-06, + "loss": 0.4578, + "step": 2927 + }, + { + "epoch": 0.18764419379646244, + "grad_norm": 2.624833979077509, + "learning_rate": 1e-06, + "loss": 0.4535, + "step": 2928 + }, + { + "epoch": 0.18770827992822353, + "grad_norm": 2.5224928226685557, + "learning_rate": 1e-06, + "loss": 0.3461, + "step": 2929 + }, + { + "epoch": 0.18777236605998462, + "grad_norm": 2.783588647866934, + "learning_rate": 1e-06, + "loss": 0.349, + "step": 2930 + }, + { + "epoch": 0.18783645219174572, + "grad_norm": 2.5465407705725, + "learning_rate": 1e-06, + "loss": 0.4535, + "step": 2931 + }, + { + "epoch": 0.18790053832350678, + "grad_norm": 2.5538413117725964, + "learning_rate": 1e-06, + "loss": 0.421, + "step": 2932 + }, + { + "epoch": 0.18796462445526788, + "grad_norm": 2.551250691707148, + "learning_rate": 1e-06, + "loss": 0.4594, + "step": 2933 + }, + { + "epoch": 0.18802871058702897, + "grad_norm": 3.1452067199513345, + "learning_rate": 1e-06, + "loss": 0.4302, + "step": 2934 + }, + { + "epoch": 0.18809279671879006, + "grad_norm": 2.743176730793466, + "learning_rate": 1e-06, + "loss": 0.3967, + "step": 2935 + }, + { + "epoch": 0.18815688285055113, + "grad_norm": 2.8114938453740113, + "learning_rate": 1e-06, + "loss": 0.4454, + "step": 2936 + }, + { + "epoch": 0.18822096898231222, + "grad_norm": 2.8125690365615457, + "learning_rate": 1e-06, + "loss": 0.3852, + "step": 2937 + }, + { + "epoch": 0.1882850551140733, + "grad_norm": 2.7093968409248954, + "learning_rate": 1e-06, + "loss": 0.4598, + "step": 2938 + }, + { + "epoch": 0.1883491412458344, + "grad_norm": 2.7084723079218977, + "learning_rate": 1e-06, + "loss": 0.3802, + "step": 2939 + }, + { + "epoch": 0.1884132273775955, + "grad_norm": 2.924608356651168, + "learning_rate": 1e-06, + "loss": 0.4494, + "step": 2940 + }, + { + "epoch": 0.18847731350935656, + "grad_norm": 2.7623725533237296, + "learning_rate": 1e-06, + "loss": 0.4018, + "step": 2941 + }, + { + "epoch": 0.18854139964111766, + "grad_norm": 2.570143623445525, + "learning_rate": 1e-06, + "loss": 0.3848, + "step": 2942 + }, + { + "epoch": 0.18860548577287875, + "grad_norm": 2.590379184495267, + "learning_rate": 1e-06, + "loss": 0.3513, + "step": 2943 + }, + { + "epoch": 0.18866957190463984, + "grad_norm": 2.6091991566587796, + "learning_rate": 1e-06, + "loss": 0.3754, + "step": 2944 + }, + { + "epoch": 0.18873365803640094, + "grad_norm": 2.5426130853423685, + "learning_rate": 1e-06, + "loss": 0.4436, + "step": 2945 + }, + { + "epoch": 0.188797744168162, + "grad_norm": 2.4880023974397534, + "learning_rate": 1e-06, + "loss": 0.3936, + "step": 2946 + }, + { + "epoch": 0.1888618302999231, + "grad_norm": 2.7254911181430734, + "learning_rate": 1e-06, + "loss": 0.4069, + "step": 2947 + }, + { + "epoch": 0.1889259164316842, + "grad_norm": 2.855985760623235, + "learning_rate": 1e-06, + "loss": 0.4405, + "step": 2948 + }, + { + "epoch": 0.18899000256344528, + "grad_norm": 2.738984275461709, + "learning_rate": 1e-06, + "loss": 0.4785, + "step": 2949 + }, + { + "epoch": 0.18905408869520635, + "grad_norm": 2.693467333245631, + "learning_rate": 1e-06, + "loss": 0.4415, + "step": 2950 + }, + { + "epoch": 0.18911817482696744, + "grad_norm": 2.725268201313213, + "learning_rate": 1e-06, + "loss": 0.4535, + "step": 2951 + }, + { + "epoch": 0.18918226095872853, + "grad_norm": 2.649478170221162, + "learning_rate": 1e-06, + "loss": 0.4313, + "step": 2952 + }, + { + "epoch": 0.18924634709048962, + "grad_norm": 2.7059280275569053, + "learning_rate": 1e-06, + "loss": 0.4168, + "step": 2953 + }, + { + "epoch": 0.18931043322225072, + "grad_norm": 2.9448804667390966, + "learning_rate": 1e-06, + "loss": 0.4091, + "step": 2954 + }, + { + "epoch": 0.18937451935401178, + "grad_norm": 2.6088760199937036, + "learning_rate": 1e-06, + "loss": 0.4026, + "step": 2955 + }, + { + "epoch": 0.18943860548577288, + "grad_norm": 2.792982465429231, + "learning_rate": 1e-06, + "loss": 0.4355, + "step": 2956 + }, + { + "epoch": 0.18950269161753397, + "grad_norm": 2.856731613527467, + "learning_rate": 1e-06, + "loss": 0.4374, + "step": 2957 + }, + { + "epoch": 0.18956677774929506, + "grad_norm": 2.6591552161854044, + "learning_rate": 1e-06, + "loss": 0.4328, + "step": 2958 + }, + { + "epoch": 0.18963086388105613, + "grad_norm": 2.820863474787736, + "learning_rate": 1e-06, + "loss": 0.4068, + "step": 2959 + }, + { + "epoch": 0.18969495001281722, + "grad_norm": 3.1564803034432067, + "learning_rate": 1e-06, + "loss": 0.4894, + "step": 2960 + }, + { + "epoch": 0.1897590361445783, + "grad_norm": 2.7775467867516723, + "learning_rate": 1e-06, + "loss": 0.4219, + "step": 2961 + }, + { + "epoch": 0.1898231222763394, + "grad_norm": 2.570709104763904, + "learning_rate": 1e-06, + "loss": 0.3836, + "step": 2962 + }, + { + "epoch": 0.1898872084081005, + "grad_norm": 3.02681267671875, + "learning_rate": 1e-06, + "loss": 0.4426, + "step": 2963 + }, + { + "epoch": 0.18995129453986156, + "grad_norm": 2.510522158823479, + "learning_rate": 1e-06, + "loss": 0.4054, + "step": 2964 + }, + { + "epoch": 0.19001538067162266, + "grad_norm": 2.693774060885273, + "learning_rate": 1e-06, + "loss": 0.4705, + "step": 2965 + }, + { + "epoch": 0.19007946680338375, + "grad_norm": 2.748528804109493, + "learning_rate": 1e-06, + "loss": 0.4332, + "step": 2966 + }, + { + "epoch": 0.19014355293514484, + "grad_norm": 2.776594086545492, + "learning_rate": 1e-06, + "loss": 0.4031, + "step": 2967 + }, + { + "epoch": 0.1902076390669059, + "grad_norm": 2.797526799261737, + "learning_rate": 1e-06, + "loss": 0.4434, + "step": 2968 + }, + { + "epoch": 0.190271725198667, + "grad_norm": 2.716540472496669, + "learning_rate": 1e-06, + "loss": 0.3527, + "step": 2969 + }, + { + "epoch": 0.1903358113304281, + "grad_norm": 2.5129230641901255, + "learning_rate": 1e-06, + "loss": 0.4134, + "step": 2970 + }, + { + "epoch": 0.1903998974621892, + "grad_norm": 2.600501120594301, + "learning_rate": 1e-06, + "loss": 0.4419, + "step": 2971 + }, + { + "epoch": 0.19046398359395028, + "grad_norm": 2.976313279269352, + "learning_rate": 1e-06, + "loss": 0.4721, + "step": 2972 + }, + { + "epoch": 0.19052806972571135, + "grad_norm": 2.771766068236897, + "learning_rate": 1e-06, + "loss": 0.3931, + "step": 2973 + }, + { + "epoch": 0.19059215585747244, + "grad_norm": 2.6489145777810394, + "learning_rate": 1e-06, + "loss": 0.4546, + "step": 2974 + }, + { + "epoch": 0.19065624198923353, + "grad_norm": 2.580312074382169, + "learning_rate": 1e-06, + "loss": 0.3992, + "step": 2975 + }, + { + "epoch": 0.19072032812099463, + "grad_norm": 2.7443330078441237, + "learning_rate": 1e-06, + "loss": 0.4155, + "step": 2976 + }, + { + "epoch": 0.1907844142527557, + "grad_norm": 2.633306061091404, + "learning_rate": 1e-06, + "loss": 0.4246, + "step": 2977 + }, + { + "epoch": 0.19084850038451678, + "grad_norm": 2.604761994040819, + "learning_rate": 1e-06, + "loss": 0.3895, + "step": 2978 + }, + { + "epoch": 0.19091258651627788, + "grad_norm": 2.683095016513032, + "learning_rate": 1e-06, + "loss": 0.4331, + "step": 2979 + }, + { + "epoch": 0.19097667264803897, + "grad_norm": 2.6835183303069003, + "learning_rate": 1e-06, + "loss": 0.4427, + "step": 2980 + }, + { + "epoch": 0.19104075877980006, + "grad_norm": 2.502521760012333, + "learning_rate": 1e-06, + "loss": 0.3837, + "step": 2981 + }, + { + "epoch": 0.19110484491156113, + "grad_norm": 2.4357388695095055, + "learning_rate": 1e-06, + "loss": 0.3592, + "step": 2982 + }, + { + "epoch": 0.19116893104332222, + "grad_norm": 2.7305965671623036, + "learning_rate": 1e-06, + "loss": 0.4131, + "step": 2983 + }, + { + "epoch": 0.19123301717508331, + "grad_norm": 2.362646675213141, + "learning_rate": 1e-06, + "loss": 0.3743, + "step": 2984 + }, + { + "epoch": 0.1912971033068444, + "grad_norm": 2.7384238709786413, + "learning_rate": 1e-06, + "loss": 0.413, + "step": 2985 + }, + { + "epoch": 0.19136118943860547, + "grad_norm": 2.5007223098863647, + "learning_rate": 1e-06, + "loss": 0.3885, + "step": 2986 + }, + { + "epoch": 0.19142527557036657, + "grad_norm": 2.700176270926466, + "learning_rate": 1e-06, + "loss": 0.4215, + "step": 2987 + }, + { + "epoch": 0.19148936170212766, + "grad_norm": 2.7085371035105346, + "learning_rate": 1e-06, + "loss": 0.4206, + "step": 2988 + }, + { + "epoch": 0.19155344783388875, + "grad_norm": 2.79283849751298, + "learning_rate": 1e-06, + "loss": 0.4543, + "step": 2989 + }, + { + "epoch": 0.19161753396564984, + "grad_norm": 2.7877343008577946, + "learning_rate": 1e-06, + "loss": 0.4294, + "step": 2990 + }, + { + "epoch": 0.1916816200974109, + "grad_norm": 2.7778817346697493, + "learning_rate": 1e-06, + "loss": 0.3539, + "step": 2991 + }, + { + "epoch": 0.191745706229172, + "grad_norm": 2.6074940866398157, + "learning_rate": 1e-06, + "loss": 0.4145, + "step": 2992 + }, + { + "epoch": 0.1918097923609331, + "grad_norm": 2.7947299787206887, + "learning_rate": 1e-06, + "loss": 0.4358, + "step": 2993 + }, + { + "epoch": 0.1918738784926942, + "grad_norm": 2.550657685199485, + "learning_rate": 1e-06, + "loss": 0.4123, + "step": 2994 + }, + { + "epoch": 0.19193796462445528, + "grad_norm": 2.6333770331684154, + "learning_rate": 1e-06, + "loss": 0.3853, + "step": 2995 + }, + { + "epoch": 0.19200205075621635, + "grad_norm": 2.6548130755758126, + "learning_rate": 1e-06, + "loss": 0.3742, + "step": 2996 + }, + { + "epoch": 0.19206613688797744, + "grad_norm": 2.5175421052166915, + "learning_rate": 1e-06, + "loss": 0.4083, + "step": 2997 + }, + { + "epoch": 0.19213022301973853, + "grad_norm": 2.6092498555990433, + "learning_rate": 1e-06, + "loss": 0.4065, + "step": 2998 + }, + { + "epoch": 0.19219430915149963, + "grad_norm": 2.6746199446994057, + "learning_rate": 1e-06, + "loss": 0.3676, + "step": 2999 + }, + { + "epoch": 0.1922583952832607, + "grad_norm": 2.689102338246197, + "learning_rate": 1e-06, + "loss": 0.4043, + "step": 3000 + }, + { + "epoch": 0.19232248141502178, + "grad_norm": 2.8545617517948174, + "learning_rate": 1e-06, + "loss": 0.3947, + "step": 3001 + }, + { + "epoch": 0.19238656754678288, + "grad_norm": 2.72399379976401, + "learning_rate": 1e-06, + "loss": 0.4357, + "step": 3002 + }, + { + "epoch": 0.19245065367854397, + "grad_norm": 2.6992917271041943, + "learning_rate": 1e-06, + "loss": 0.3954, + "step": 3003 + }, + { + "epoch": 0.19251473981030506, + "grad_norm": 2.7469258281355975, + "learning_rate": 1e-06, + "loss": 0.4817, + "step": 3004 + }, + { + "epoch": 0.19257882594206613, + "grad_norm": 2.8151942792068714, + "learning_rate": 1e-06, + "loss": 0.4743, + "step": 3005 + }, + { + "epoch": 0.19264291207382722, + "grad_norm": 2.717535241339008, + "learning_rate": 1e-06, + "loss": 0.4756, + "step": 3006 + }, + { + "epoch": 0.19270699820558831, + "grad_norm": 2.6518470118186936, + "learning_rate": 1e-06, + "loss": 0.4416, + "step": 3007 + }, + { + "epoch": 0.1927710843373494, + "grad_norm": 2.745889107117633, + "learning_rate": 1e-06, + "loss": 0.4452, + "step": 3008 + }, + { + "epoch": 0.19283517046911047, + "grad_norm": 2.6299402934811225, + "learning_rate": 1e-06, + "loss": 0.3689, + "step": 3009 + }, + { + "epoch": 0.19289925660087157, + "grad_norm": 3.3013282687495265, + "learning_rate": 1e-06, + "loss": 0.4228, + "step": 3010 + }, + { + "epoch": 0.19296334273263266, + "grad_norm": 2.7392676652603045, + "learning_rate": 1e-06, + "loss": 0.439, + "step": 3011 + }, + { + "epoch": 0.19302742886439375, + "grad_norm": 2.5699388413274815, + "learning_rate": 1e-06, + "loss": 0.4317, + "step": 3012 + }, + { + "epoch": 0.19309151499615484, + "grad_norm": 2.527230563956275, + "learning_rate": 1e-06, + "loss": 0.4066, + "step": 3013 + }, + { + "epoch": 0.1931556011279159, + "grad_norm": 2.4490387549595685, + "learning_rate": 1e-06, + "loss": 0.4148, + "step": 3014 + }, + { + "epoch": 0.193219687259677, + "grad_norm": 2.7023787880147108, + "learning_rate": 1e-06, + "loss": 0.4121, + "step": 3015 + }, + { + "epoch": 0.1932837733914381, + "grad_norm": 2.6746716675529654, + "learning_rate": 1e-06, + "loss": 0.4117, + "step": 3016 + }, + { + "epoch": 0.1933478595231992, + "grad_norm": 2.6345823971450373, + "learning_rate": 1e-06, + "loss": 0.4575, + "step": 3017 + }, + { + "epoch": 0.19341194565496025, + "grad_norm": 2.7711286043104755, + "learning_rate": 1e-06, + "loss": 0.4484, + "step": 3018 + }, + { + "epoch": 0.19347603178672135, + "grad_norm": 2.4680613743712616, + "learning_rate": 1e-06, + "loss": 0.4018, + "step": 3019 + }, + { + "epoch": 0.19354011791848244, + "grad_norm": 2.6003860886706556, + "learning_rate": 1e-06, + "loss": 0.384, + "step": 3020 + }, + { + "epoch": 0.19360420405024353, + "grad_norm": 2.6108666448050384, + "learning_rate": 1e-06, + "loss": 0.4367, + "step": 3021 + }, + { + "epoch": 0.19366829018200463, + "grad_norm": 2.758026508393086, + "learning_rate": 1e-06, + "loss": 0.3995, + "step": 3022 + }, + { + "epoch": 0.1937323763137657, + "grad_norm": 2.4812630624240546, + "learning_rate": 1e-06, + "loss": 0.373, + "step": 3023 + }, + { + "epoch": 0.19379646244552678, + "grad_norm": 2.5059074556288685, + "learning_rate": 1e-06, + "loss": 0.4, + "step": 3024 + }, + { + "epoch": 0.19386054857728788, + "grad_norm": 2.700687588961143, + "learning_rate": 1e-06, + "loss": 0.4318, + "step": 3025 + }, + { + "epoch": 0.19392463470904897, + "grad_norm": 2.5778023098429625, + "learning_rate": 1e-06, + "loss": 0.4004, + "step": 3026 + }, + { + "epoch": 0.19398872084081004, + "grad_norm": 2.8092589531817236, + "learning_rate": 1e-06, + "loss": 0.4292, + "step": 3027 + }, + { + "epoch": 0.19405280697257113, + "grad_norm": 2.6755152209276702, + "learning_rate": 1e-06, + "loss": 0.4104, + "step": 3028 + }, + { + "epoch": 0.19411689310433222, + "grad_norm": 2.717840638318553, + "learning_rate": 1e-06, + "loss": 0.3854, + "step": 3029 + }, + { + "epoch": 0.19418097923609332, + "grad_norm": 2.8100797610129864, + "learning_rate": 1e-06, + "loss": 0.3904, + "step": 3030 + }, + { + "epoch": 0.1942450653678544, + "grad_norm": 2.8468932816579846, + "learning_rate": 1e-06, + "loss": 0.4597, + "step": 3031 + }, + { + "epoch": 0.19430915149961547, + "grad_norm": 2.7200125501208388, + "learning_rate": 1e-06, + "loss": 0.4227, + "step": 3032 + }, + { + "epoch": 0.19437323763137657, + "grad_norm": 2.745674467926342, + "learning_rate": 1e-06, + "loss": 0.3926, + "step": 3033 + }, + { + "epoch": 0.19443732376313766, + "grad_norm": 2.66764518189243, + "learning_rate": 1e-06, + "loss": 0.4487, + "step": 3034 + }, + { + "epoch": 0.19450140989489875, + "grad_norm": 2.72148919183853, + "learning_rate": 1e-06, + "loss": 0.431, + "step": 3035 + }, + { + "epoch": 0.19456549602665982, + "grad_norm": 2.766399378265474, + "learning_rate": 1e-06, + "loss": 0.3967, + "step": 3036 + }, + { + "epoch": 0.1946295821584209, + "grad_norm": 2.4568489231818256, + "learning_rate": 1e-06, + "loss": 0.3821, + "step": 3037 + }, + { + "epoch": 0.194693668290182, + "grad_norm": 2.8245628648840047, + "learning_rate": 1e-06, + "loss": 0.3791, + "step": 3038 + }, + { + "epoch": 0.1947577544219431, + "grad_norm": 2.8867994809759514, + "learning_rate": 1e-06, + "loss": 0.4492, + "step": 3039 + }, + { + "epoch": 0.1948218405537042, + "grad_norm": 2.7372903443602343, + "learning_rate": 1e-06, + "loss": 0.3816, + "step": 3040 + }, + { + "epoch": 0.19488592668546525, + "grad_norm": 2.654619328065438, + "learning_rate": 1e-06, + "loss": 0.3837, + "step": 3041 + }, + { + "epoch": 0.19495001281722635, + "grad_norm": 2.6189824595138953, + "learning_rate": 1e-06, + "loss": 0.4104, + "step": 3042 + }, + { + "epoch": 0.19501409894898744, + "grad_norm": 2.549845796501405, + "learning_rate": 1e-06, + "loss": 0.3848, + "step": 3043 + }, + { + "epoch": 0.19507818508074853, + "grad_norm": 2.7252239626651438, + "learning_rate": 1e-06, + "loss": 0.4229, + "step": 3044 + }, + { + "epoch": 0.1951422712125096, + "grad_norm": 2.8890697357641875, + "learning_rate": 1e-06, + "loss": 0.4447, + "step": 3045 + }, + { + "epoch": 0.1952063573442707, + "grad_norm": 2.72838732384927, + "learning_rate": 1e-06, + "loss": 0.4355, + "step": 3046 + }, + { + "epoch": 0.19527044347603179, + "grad_norm": 2.656321398304245, + "learning_rate": 1e-06, + "loss": 0.422, + "step": 3047 + }, + { + "epoch": 0.19533452960779288, + "grad_norm": 2.6185422333681867, + "learning_rate": 1e-06, + "loss": 0.4524, + "step": 3048 + }, + { + "epoch": 0.19539861573955397, + "grad_norm": 2.5406421196673437, + "learning_rate": 1e-06, + "loss": 0.3771, + "step": 3049 + }, + { + "epoch": 0.19546270187131504, + "grad_norm": 2.7469839128125497, + "learning_rate": 1e-06, + "loss": 0.3864, + "step": 3050 + }, + { + "epoch": 0.19552678800307613, + "grad_norm": 2.4965398898043585, + "learning_rate": 1e-06, + "loss": 0.4065, + "step": 3051 + }, + { + "epoch": 0.19559087413483722, + "grad_norm": 2.4349462204869314, + "learning_rate": 1e-06, + "loss": 0.3925, + "step": 3052 + }, + { + "epoch": 0.19565496026659832, + "grad_norm": 2.466053052427371, + "learning_rate": 1e-06, + "loss": 0.4485, + "step": 3053 + }, + { + "epoch": 0.1957190463983594, + "grad_norm": 2.644641993283081, + "learning_rate": 1e-06, + "loss": 0.3925, + "step": 3054 + }, + { + "epoch": 0.19578313253012047, + "grad_norm": 2.7286313655607666, + "learning_rate": 1e-06, + "loss": 0.3796, + "step": 3055 + }, + { + "epoch": 0.19584721866188157, + "grad_norm": 3.288638532373253, + "learning_rate": 1e-06, + "loss": 0.5013, + "step": 3056 + }, + { + "epoch": 0.19591130479364266, + "grad_norm": 2.6024355715319603, + "learning_rate": 1e-06, + "loss": 0.4452, + "step": 3057 + }, + { + "epoch": 0.19597539092540375, + "grad_norm": 2.781418640960001, + "learning_rate": 1e-06, + "loss": 0.4034, + "step": 3058 + }, + { + "epoch": 0.19603947705716482, + "grad_norm": 2.5251976082178884, + "learning_rate": 1e-06, + "loss": 0.3554, + "step": 3059 + }, + { + "epoch": 0.1961035631889259, + "grad_norm": 2.7218225486270264, + "learning_rate": 1e-06, + "loss": 0.3854, + "step": 3060 + }, + { + "epoch": 0.196167649320687, + "grad_norm": 2.465634369754827, + "learning_rate": 1e-06, + "loss": 0.3811, + "step": 3061 + }, + { + "epoch": 0.1962317354524481, + "grad_norm": 2.637439083827366, + "learning_rate": 1e-06, + "loss": 0.4004, + "step": 3062 + }, + { + "epoch": 0.1962958215842092, + "grad_norm": 2.615911737590689, + "learning_rate": 1e-06, + "loss": 0.4588, + "step": 3063 + }, + { + "epoch": 0.19635990771597026, + "grad_norm": 2.5254899516331526, + "learning_rate": 1e-06, + "loss": 0.4321, + "step": 3064 + }, + { + "epoch": 0.19642399384773135, + "grad_norm": 2.6078553888979394, + "learning_rate": 1e-06, + "loss": 0.3808, + "step": 3065 + }, + { + "epoch": 0.19648807997949244, + "grad_norm": 2.7298428056608466, + "learning_rate": 1e-06, + "loss": 0.4116, + "step": 3066 + }, + { + "epoch": 0.19655216611125353, + "grad_norm": 2.636498777465297, + "learning_rate": 1e-06, + "loss": 0.4114, + "step": 3067 + }, + { + "epoch": 0.1966162522430146, + "grad_norm": 2.6319615141742934, + "learning_rate": 1e-06, + "loss": 0.3911, + "step": 3068 + }, + { + "epoch": 0.1966803383747757, + "grad_norm": 2.5563489648377313, + "learning_rate": 1e-06, + "loss": 0.468, + "step": 3069 + }, + { + "epoch": 0.19674442450653679, + "grad_norm": 2.4495929287604232, + "learning_rate": 1e-06, + "loss": 0.3965, + "step": 3070 + }, + { + "epoch": 0.19680851063829788, + "grad_norm": 2.4717484721741254, + "learning_rate": 1e-06, + "loss": 0.4132, + "step": 3071 + }, + { + "epoch": 0.19687259677005897, + "grad_norm": 2.590478233813598, + "learning_rate": 1e-06, + "loss": 0.4312, + "step": 3072 + }, + { + "epoch": 0.19693668290182004, + "grad_norm": 2.7616833115828707, + "learning_rate": 1e-06, + "loss": 0.3987, + "step": 3073 + }, + { + "epoch": 0.19700076903358113, + "grad_norm": 2.5261292650531884, + "learning_rate": 1e-06, + "loss": 0.3813, + "step": 3074 + }, + { + "epoch": 0.19706485516534222, + "grad_norm": 2.7011738929384284, + "learning_rate": 1e-06, + "loss": 0.3816, + "step": 3075 + }, + { + "epoch": 0.19712894129710332, + "grad_norm": 2.879203428383538, + "learning_rate": 1e-06, + "loss": 0.4445, + "step": 3076 + }, + { + "epoch": 0.19719302742886438, + "grad_norm": 2.595528866437978, + "learning_rate": 1e-06, + "loss": 0.4057, + "step": 3077 + }, + { + "epoch": 0.19725711356062547, + "grad_norm": 2.94515771596298, + "learning_rate": 1e-06, + "loss": 0.4529, + "step": 3078 + }, + { + "epoch": 0.19732119969238657, + "grad_norm": 2.6736790736746445, + "learning_rate": 1e-06, + "loss": 0.4309, + "step": 3079 + }, + { + "epoch": 0.19738528582414766, + "grad_norm": 2.5140252504503726, + "learning_rate": 1e-06, + "loss": 0.4023, + "step": 3080 + }, + { + "epoch": 0.19744937195590875, + "grad_norm": 2.81911651094041, + "learning_rate": 1e-06, + "loss": 0.4125, + "step": 3081 + }, + { + "epoch": 0.19751345808766982, + "grad_norm": 2.5216675333964185, + "learning_rate": 1e-06, + "loss": 0.4035, + "step": 3082 + }, + { + "epoch": 0.1975775442194309, + "grad_norm": 2.597791352413284, + "learning_rate": 1e-06, + "loss": 0.4371, + "step": 3083 + }, + { + "epoch": 0.197641630351192, + "grad_norm": 2.6051238849878824, + "learning_rate": 1e-06, + "loss": 0.4179, + "step": 3084 + }, + { + "epoch": 0.1977057164829531, + "grad_norm": 2.4626622081291916, + "learning_rate": 1e-06, + "loss": 0.3564, + "step": 3085 + }, + { + "epoch": 0.19776980261471416, + "grad_norm": 2.6135316427042787, + "learning_rate": 1e-06, + "loss": 0.4043, + "step": 3086 + }, + { + "epoch": 0.19783388874647526, + "grad_norm": 2.497469648863096, + "learning_rate": 1e-06, + "loss": 0.3922, + "step": 3087 + }, + { + "epoch": 0.19789797487823635, + "grad_norm": 2.5303900994698476, + "learning_rate": 1e-06, + "loss": 0.3694, + "step": 3088 + }, + { + "epoch": 0.19796206100999744, + "grad_norm": 2.6047418213304003, + "learning_rate": 1e-06, + "loss": 0.4524, + "step": 3089 + }, + { + "epoch": 0.19802614714175854, + "grad_norm": 2.6321807982006797, + "learning_rate": 1e-06, + "loss": 0.3732, + "step": 3090 + }, + { + "epoch": 0.1980902332735196, + "grad_norm": 2.5343685708188834, + "learning_rate": 1e-06, + "loss": 0.4256, + "step": 3091 + }, + { + "epoch": 0.1981543194052807, + "grad_norm": 2.8626866297721123, + "learning_rate": 1e-06, + "loss": 0.3833, + "step": 3092 + }, + { + "epoch": 0.1982184055370418, + "grad_norm": 2.648616331118623, + "learning_rate": 1e-06, + "loss": 0.3815, + "step": 3093 + }, + { + "epoch": 0.19828249166880288, + "grad_norm": 2.7536516489830976, + "learning_rate": 1e-06, + "loss": 0.4149, + "step": 3094 + }, + { + "epoch": 0.19834657780056394, + "grad_norm": 2.7451936420156873, + "learning_rate": 1e-06, + "loss": 0.4143, + "step": 3095 + }, + { + "epoch": 0.19841066393232504, + "grad_norm": 2.559391512260891, + "learning_rate": 1e-06, + "loss": 0.4169, + "step": 3096 + }, + { + "epoch": 0.19847475006408613, + "grad_norm": 2.5924661584948376, + "learning_rate": 1e-06, + "loss": 0.398, + "step": 3097 + }, + { + "epoch": 0.19853883619584722, + "grad_norm": 2.7921584349548145, + "learning_rate": 1e-06, + "loss": 0.45, + "step": 3098 + }, + { + "epoch": 0.19860292232760832, + "grad_norm": 2.659738608230625, + "learning_rate": 1e-06, + "loss": 0.3917, + "step": 3099 + }, + { + "epoch": 0.19866700845936938, + "grad_norm": 2.5305396379037197, + "learning_rate": 1e-06, + "loss": 0.3791, + "step": 3100 + }, + { + "epoch": 0.19873109459113047, + "grad_norm": 2.756190416272402, + "learning_rate": 1e-06, + "loss": 0.3839, + "step": 3101 + }, + { + "epoch": 0.19879518072289157, + "grad_norm": 2.539580583309545, + "learning_rate": 1e-06, + "loss": 0.4275, + "step": 3102 + }, + { + "epoch": 0.19885926685465266, + "grad_norm": 2.67813254086445, + "learning_rate": 1e-06, + "loss": 0.3512, + "step": 3103 + }, + { + "epoch": 0.19892335298641373, + "grad_norm": 2.640991640157843, + "learning_rate": 1e-06, + "loss": 0.4512, + "step": 3104 + }, + { + "epoch": 0.19898743911817482, + "grad_norm": 2.5533029619456853, + "learning_rate": 1e-06, + "loss": 0.3762, + "step": 3105 + }, + { + "epoch": 0.1990515252499359, + "grad_norm": 2.6127542430810635, + "learning_rate": 1e-06, + "loss": 0.3938, + "step": 3106 + }, + { + "epoch": 0.199115611381697, + "grad_norm": 2.6179136331616215, + "learning_rate": 1e-06, + "loss": 0.3971, + "step": 3107 + }, + { + "epoch": 0.1991796975134581, + "grad_norm": 2.6944181569516914, + "learning_rate": 1e-06, + "loss": 0.4073, + "step": 3108 + }, + { + "epoch": 0.19924378364521916, + "grad_norm": 2.8686676163236613, + "learning_rate": 1e-06, + "loss": 0.4284, + "step": 3109 + }, + { + "epoch": 0.19930786977698026, + "grad_norm": 2.679658713052307, + "learning_rate": 1e-06, + "loss": 0.3844, + "step": 3110 + }, + { + "epoch": 0.19937195590874135, + "grad_norm": 2.905490286430153, + "learning_rate": 1e-06, + "loss": 0.4576, + "step": 3111 + }, + { + "epoch": 0.19943604204050244, + "grad_norm": 2.77202972977206, + "learning_rate": 1e-06, + "loss": 0.4071, + "step": 3112 + }, + { + "epoch": 0.19950012817226354, + "grad_norm": 2.6987682861458877, + "learning_rate": 1e-06, + "loss": 0.3778, + "step": 3113 + }, + { + "epoch": 0.1995642143040246, + "grad_norm": 2.9302419214602664, + "learning_rate": 1e-06, + "loss": 0.4353, + "step": 3114 + }, + { + "epoch": 0.1996283004357857, + "grad_norm": 2.7112189817286887, + "learning_rate": 1e-06, + "loss": 0.4163, + "step": 3115 + }, + { + "epoch": 0.1996923865675468, + "grad_norm": 2.6842811724796687, + "learning_rate": 1e-06, + "loss": 0.4416, + "step": 3116 + }, + { + "epoch": 0.19975647269930788, + "grad_norm": 2.5186808347832708, + "learning_rate": 1e-06, + "loss": 0.3756, + "step": 3117 + }, + { + "epoch": 0.19982055883106895, + "grad_norm": 2.891265829188758, + "learning_rate": 1e-06, + "loss": 0.4248, + "step": 3118 + }, + { + "epoch": 0.19988464496283004, + "grad_norm": 2.445277384717998, + "learning_rate": 1e-06, + "loss": 0.3837, + "step": 3119 + }, + { + "epoch": 0.19994873109459113, + "grad_norm": 2.4402702476916227, + "learning_rate": 1e-06, + "loss": 0.38, + "step": 3120 + }, + { + "epoch": 0.20001281722635222, + "grad_norm": 2.6536177038610727, + "learning_rate": 1e-06, + "loss": 0.3974, + "step": 3121 + }, + { + "epoch": 0.20007690335811332, + "grad_norm": 2.68284801164674, + "learning_rate": 1e-06, + "loss": 0.4214, + "step": 3122 + }, + { + "epoch": 0.20014098948987438, + "grad_norm": 2.9008015151571342, + "learning_rate": 1e-06, + "loss": 0.4362, + "step": 3123 + }, + { + "epoch": 0.20020507562163548, + "grad_norm": 2.640711993107474, + "learning_rate": 1e-06, + "loss": 0.4139, + "step": 3124 + }, + { + "epoch": 0.20026916175339657, + "grad_norm": 2.7448204574462416, + "learning_rate": 1e-06, + "loss": 0.407, + "step": 3125 + }, + { + "epoch": 0.20033324788515766, + "grad_norm": 2.6303804716884183, + "learning_rate": 1e-06, + "loss": 0.3674, + "step": 3126 + }, + { + "epoch": 0.20039733401691873, + "grad_norm": 2.7394724745610897, + "learning_rate": 1e-06, + "loss": 0.3837, + "step": 3127 + }, + { + "epoch": 0.20046142014867982, + "grad_norm": 2.407575755511553, + "learning_rate": 1e-06, + "loss": 0.3786, + "step": 3128 + }, + { + "epoch": 0.2005255062804409, + "grad_norm": 2.7833525732649353, + "learning_rate": 1e-06, + "loss": 0.4346, + "step": 3129 + }, + { + "epoch": 0.200589592412202, + "grad_norm": 2.7948666514800657, + "learning_rate": 1e-06, + "loss": 0.4425, + "step": 3130 + }, + { + "epoch": 0.2006536785439631, + "grad_norm": 2.619044236684681, + "learning_rate": 1e-06, + "loss": 0.3851, + "step": 3131 + }, + { + "epoch": 0.20071776467572416, + "grad_norm": 2.7631433482197596, + "learning_rate": 1e-06, + "loss": 0.3831, + "step": 3132 + }, + { + "epoch": 0.20078185080748526, + "grad_norm": 2.7735001022993155, + "learning_rate": 1e-06, + "loss": 0.4302, + "step": 3133 + }, + { + "epoch": 0.20084593693924635, + "grad_norm": 2.6649390403366198, + "learning_rate": 1e-06, + "loss": 0.4513, + "step": 3134 + }, + { + "epoch": 0.20091002307100744, + "grad_norm": 2.642064035206794, + "learning_rate": 1e-06, + "loss": 0.4495, + "step": 3135 + }, + { + "epoch": 0.2009741092027685, + "grad_norm": 2.6129073896361414, + "learning_rate": 1e-06, + "loss": 0.3816, + "step": 3136 + }, + { + "epoch": 0.2010381953345296, + "grad_norm": 2.9648669735240447, + "learning_rate": 1e-06, + "loss": 0.4165, + "step": 3137 + }, + { + "epoch": 0.2011022814662907, + "grad_norm": 2.8669137466757326, + "learning_rate": 1e-06, + "loss": 0.3595, + "step": 3138 + }, + { + "epoch": 0.2011663675980518, + "grad_norm": 2.5460585330292003, + "learning_rate": 1e-06, + "loss": 0.3667, + "step": 3139 + }, + { + "epoch": 0.20123045372981288, + "grad_norm": 2.3987256825811274, + "learning_rate": 1e-06, + "loss": 0.3765, + "step": 3140 + }, + { + "epoch": 0.20129453986157395, + "grad_norm": 2.5263050591029867, + "learning_rate": 1e-06, + "loss": 0.4062, + "step": 3141 + }, + { + "epoch": 0.20135862599333504, + "grad_norm": 2.985354785460851, + "learning_rate": 1e-06, + "loss": 0.396, + "step": 3142 + }, + { + "epoch": 0.20142271212509613, + "grad_norm": 2.7659888764684264, + "learning_rate": 1e-06, + "loss": 0.4453, + "step": 3143 + }, + { + "epoch": 0.20148679825685722, + "grad_norm": 2.591450612167699, + "learning_rate": 1e-06, + "loss": 0.4083, + "step": 3144 + }, + { + "epoch": 0.2015508843886183, + "grad_norm": 2.8521468462132544, + "learning_rate": 1e-06, + "loss": 0.4404, + "step": 3145 + }, + { + "epoch": 0.20161497052037938, + "grad_norm": 2.436797817729498, + "learning_rate": 1e-06, + "loss": 0.4238, + "step": 3146 + }, + { + "epoch": 0.20167905665214048, + "grad_norm": 2.8810258226794265, + "learning_rate": 1e-06, + "loss": 0.4529, + "step": 3147 + }, + { + "epoch": 0.20174314278390157, + "grad_norm": 2.749593307275725, + "learning_rate": 1e-06, + "loss": 0.3718, + "step": 3148 + }, + { + "epoch": 0.20180722891566266, + "grad_norm": 2.5819133802039698, + "learning_rate": 1e-06, + "loss": 0.4202, + "step": 3149 + }, + { + "epoch": 0.20187131504742373, + "grad_norm": 2.814319540956971, + "learning_rate": 1e-06, + "loss": 0.4702, + "step": 3150 + }, + { + "epoch": 0.20193540117918482, + "grad_norm": 2.755121410304437, + "learning_rate": 1e-06, + "loss": 0.4292, + "step": 3151 + }, + { + "epoch": 0.2019994873109459, + "grad_norm": 2.8607357345261186, + "learning_rate": 1e-06, + "loss": 0.4081, + "step": 3152 + }, + { + "epoch": 0.202063573442707, + "grad_norm": 2.7644998286160414, + "learning_rate": 1e-06, + "loss": 0.4148, + "step": 3153 + }, + { + "epoch": 0.20212765957446807, + "grad_norm": 3.1791677414919937, + "learning_rate": 1e-06, + "loss": 0.4024, + "step": 3154 + }, + { + "epoch": 0.20219174570622916, + "grad_norm": 2.761445190168834, + "learning_rate": 1e-06, + "loss": 0.4556, + "step": 3155 + }, + { + "epoch": 0.20225583183799026, + "grad_norm": 2.6524443855935744, + "learning_rate": 1e-06, + "loss": 0.3857, + "step": 3156 + }, + { + "epoch": 0.20231991796975135, + "grad_norm": 2.6616266519428486, + "learning_rate": 1e-06, + "loss": 0.4252, + "step": 3157 + }, + { + "epoch": 0.20238400410151244, + "grad_norm": 2.5016076188972827, + "learning_rate": 1e-06, + "loss": 0.3676, + "step": 3158 + }, + { + "epoch": 0.2024480902332735, + "grad_norm": 2.530401535505697, + "learning_rate": 1e-06, + "loss": 0.3893, + "step": 3159 + }, + { + "epoch": 0.2025121763650346, + "grad_norm": 2.6587919579612147, + "learning_rate": 1e-06, + "loss": 0.4009, + "step": 3160 + }, + { + "epoch": 0.2025762624967957, + "grad_norm": 2.9566194209341, + "learning_rate": 1e-06, + "loss": 0.42, + "step": 3161 + }, + { + "epoch": 0.2026403486285568, + "grad_norm": 2.6206510556300033, + "learning_rate": 1e-06, + "loss": 0.4262, + "step": 3162 + }, + { + "epoch": 0.20270443476031788, + "grad_norm": 2.927652472371367, + "learning_rate": 1e-06, + "loss": 0.4889, + "step": 3163 + }, + { + "epoch": 0.20276852089207895, + "grad_norm": 2.65170729751374, + "learning_rate": 1e-06, + "loss": 0.4269, + "step": 3164 + }, + { + "epoch": 0.20283260702384004, + "grad_norm": 2.5867425876599865, + "learning_rate": 1e-06, + "loss": 0.4889, + "step": 3165 + }, + { + "epoch": 0.20289669315560113, + "grad_norm": 2.7063246897863285, + "learning_rate": 1e-06, + "loss": 0.4004, + "step": 3166 + }, + { + "epoch": 0.20296077928736223, + "grad_norm": 2.624579109415967, + "learning_rate": 1e-06, + "loss": 0.4464, + "step": 3167 + }, + { + "epoch": 0.2030248654191233, + "grad_norm": 2.7380146988562504, + "learning_rate": 1e-06, + "loss": 0.3857, + "step": 3168 + }, + { + "epoch": 0.20308895155088438, + "grad_norm": 2.4525698832360465, + "learning_rate": 1e-06, + "loss": 0.4224, + "step": 3169 + }, + { + "epoch": 0.20315303768264548, + "grad_norm": 2.717087375931541, + "learning_rate": 1e-06, + "loss": 0.3979, + "step": 3170 + }, + { + "epoch": 0.20321712381440657, + "grad_norm": 2.7561886591445672, + "learning_rate": 1e-06, + "loss": 0.364, + "step": 3171 + }, + { + "epoch": 0.20328120994616766, + "grad_norm": 2.7637234247630853, + "learning_rate": 1e-06, + "loss": 0.3985, + "step": 3172 + }, + { + "epoch": 0.20334529607792873, + "grad_norm": 2.660887630337373, + "learning_rate": 1e-06, + "loss": 0.3672, + "step": 3173 + }, + { + "epoch": 0.20340938220968982, + "grad_norm": 2.7012411259252924, + "learning_rate": 1e-06, + "loss": 0.5082, + "step": 3174 + }, + { + "epoch": 0.2034734683414509, + "grad_norm": 2.7199733769770913, + "learning_rate": 1e-06, + "loss": 0.4146, + "step": 3175 + }, + { + "epoch": 0.203537554473212, + "grad_norm": 2.8716087266272314, + "learning_rate": 1e-06, + "loss": 0.3657, + "step": 3176 + }, + { + "epoch": 0.20360164060497307, + "grad_norm": 2.757204262047918, + "learning_rate": 1e-06, + "loss": 0.418, + "step": 3177 + }, + { + "epoch": 0.20366572673673417, + "grad_norm": 2.7868293714034356, + "learning_rate": 1e-06, + "loss": 0.4133, + "step": 3178 + }, + { + "epoch": 0.20372981286849526, + "grad_norm": 2.4386658649656097, + "learning_rate": 1e-06, + "loss": 0.3242, + "step": 3179 + }, + { + "epoch": 0.20379389900025635, + "grad_norm": 2.745982025953506, + "learning_rate": 1e-06, + "loss": 0.4203, + "step": 3180 + }, + { + "epoch": 0.20385798513201744, + "grad_norm": 2.8324684296501697, + "learning_rate": 1e-06, + "loss": 0.4359, + "step": 3181 + }, + { + "epoch": 0.2039220712637785, + "grad_norm": 2.997115716317168, + "learning_rate": 1e-06, + "loss": 0.4222, + "step": 3182 + }, + { + "epoch": 0.2039861573955396, + "grad_norm": 2.6677174069426366, + "learning_rate": 1e-06, + "loss": 0.4169, + "step": 3183 + }, + { + "epoch": 0.2040502435273007, + "grad_norm": 2.5669472730439704, + "learning_rate": 1e-06, + "loss": 0.4617, + "step": 3184 + }, + { + "epoch": 0.2041143296590618, + "grad_norm": 2.5467043969436656, + "learning_rate": 1e-06, + "loss": 0.4144, + "step": 3185 + }, + { + "epoch": 0.20417841579082285, + "grad_norm": 2.6219195102759723, + "learning_rate": 1e-06, + "loss": 0.4482, + "step": 3186 + }, + { + "epoch": 0.20424250192258395, + "grad_norm": 2.753937077422682, + "learning_rate": 1e-06, + "loss": 0.4078, + "step": 3187 + }, + { + "epoch": 0.20430658805434504, + "grad_norm": 2.5606663484409355, + "learning_rate": 1e-06, + "loss": 0.3949, + "step": 3188 + }, + { + "epoch": 0.20437067418610613, + "grad_norm": 2.6646453207007994, + "learning_rate": 1e-06, + "loss": 0.4301, + "step": 3189 + }, + { + "epoch": 0.20443476031786723, + "grad_norm": 2.6862551366161513, + "learning_rate": 1e-06, + "loss": 0.3938, + "step": 3190 + }, + { + "epoch": 0.2044988464496283, + "grad_norm": 2.523615636687127, + "learning_rate": 1e-06, + "loss": 0.4181, + "step": 3191 + }, + { + "epoch": 0.20456293258138938, + "grad_norm": 2.6884484446187873, + "learning_rate": 1e-06, + "loss": 0.3618, + "step": 3192 + }, + { + "epoch": 0.20462701871315048, + "grad_norm": 2.829665325764848, + "learning_rate": 1e-06, + "loss": 0.4052, + "step": 3193 + }, + { + "epoch": 0.20469110484491157, + "grad_norm": 2.5953374746890403, + "learning_rate": 1e-06, + "loss": 0.4389, + "step": 3194 + }, + { + "epoch": 0.20475519097667264, + "grad_norm": 2.8466331258627604, + "learning_rate": 1e-06, + "loss": 0.4039, + "step": 3195 + }, + { + "epoch": 0.20481927710843373, + "grad_norm": 2.7486183212115547, + "learning_rate": 1e-06, + "loss": 0.3876, + "step": 3196 + }, + { + "epoch": 0.20488336324019482, + "grad_norm": 2.801298683547888, + "learning_rate": 1e-06, + "loss": 0.367, + "step": 3197 + }, + { + "epoch": 0.20494744937195591, + "grad_norm": 2.695730931752156, + "learning_rate": 1e-06, + "loss": 0.4933, + "step": 3198 + }, + { + "epoch": 0.205011535503717, + "grad_norm": 2.8125270494981027, + "learning_rate": 1e-06, + "loss": 0.3954, + "step": 3199 + }, + { + "epoch": 0.20507562163547807, + "grad_norm": 3.2577167938002796, + "learning_rate": 1e-06, + "loss": 0.4277, + "step": 3200 + }, + { + "epoch": 0.20513970776723917, + "grad_norm": 2.758478341893175, + "learning_rate": 1e-06, + "loss": 0.4352, + "step": 3201 + }, + { + "epoch": 0.20520379389900026, + "grad_norm": 2.879050943230032, + "learning_rate": 1e-06, + "loss": 0.4305, + "step": 3202 + }, + { + "epoch": 0.20526788003076135, + "grad_norm": 2.686643342867524, + "learning_rate": 1e-06, + "loss": 0.3573, + "step": 3203 + }, + { + "epoch": 0.20533196616252242, + "grad_norm": 2.721258382735068, + "learning_rate": 1e-06, + "loss": 0.4019, + "step": 3204 + }, + { + "epoch": 0.2053960522942835, + "grad_norm": 2.7158203178493854, + "learning_rate": 1e-06, + "loss": 0.43, + "step": 3205 + }, + { + "epoch": 0.2054601384260446, + "grad_norm": 2.624134577750634, + "learning_rate": 1e-06, + "loss": 0.3608, + "step": 3206 + }, + { + "epoch": 0.2055242245578057, + "grad_norm": 2.720833641453419, + "learning_rate": 1e-06, + "loss": 0.4991, + "step": 3207 + }, + { + "epoch": 0.2055883106895668, + "grad_norm": 2.7278965888939255, + "learning_rate": 1e-06, + "loss": 0.4445, + "step": 3208 + }, + { + "epoch": 0.20565239682132785, + "grad_norm": 2.9254754982044346, + "learning_rate": 1e-06, + "loss": 0.3889, + "step": 3209 + }, + { + "epoch": 0.20571648295308895, + "grad_norm": 2.6542985721484382, + "learning_rate": 1e-06, + "loss": 0.3975, + "step": 3210 + }, + { + "epoch": 0.20578056908485004, + "grad_norm": 2.699604037727829, + "learning_rate": 1e-06, + "loss": 0.3911, + "step": 3211 + }, + { + "epoch": 0.20584465521661113, + "grad_norm": 2.7276421722192525, + "learning_rate": 1e-06, + "loss": 0.4107, + "step": 3212 + }, + { + "epoch": 0.2059087413483722, + "grad_norm": 2.6977864355232715, + "learning_rate": 1e-06, + "loss": 0.4003, + "step": 3213 + }, + { + "epoch": 0.2059728274801333, + "grad_norm": 2.6477848771201886, + "learning_rate": 1e-06, + "loss": 0.4429, + "step": 3214 + }, + { + "epoch": 0.20603691361189438, + "grad_norm": 2.672606633365545, + "learning_rate": 1e-06, + "loss": 0.3984, + "step": 3215 + }, + { + "epoch": 0.20610099974365548, + "grad_norm": 2.4604151399516945, + "learning_rate": 1e-06, + "loss": 0.4922, + "step": 3216 + }, + { + "epoch": 0.20616508587541657, + "grad_norm": 2.857586432951352, + "learning_rate": 1e-06, + "loss": 0.3976, + "step": 3217 + }, + { + "epoch": 0.20622917200717764, + "grad_norm": 2.4822416440627264, + "learning_rate": 1e-06, + "loss": 0.4069, + "step": 3218 + }, + { + "epoch": 0.20629325813893873, + "grad_norm": 2.6436355434453565, + "learning_rate": 1e-06, + "loss": 0.3792, + "step": 3219 + }, + { + "epoch": 0.20635734427069982, + "grad_norm": 2.6742355296426212, + "learning_rate": 1e-06, + "loss": 0.4404, + "step": 3220 + }, + { + "epoch": 0.20642143040246091, + "grad_norm": 2.8105861214534453, + "learning_rate": 1e-06, + "loss": 0.39, + "step": 3221 + }, + { + "epoch": 0.206485516534222, + "grad_norm": 2.5930834712675876, + "learning_rate": 1e-06, + "loss": 0.4299, + "step": 3222 + }, + { + "epoch": 0.20654960266598307, + "grad_norm": 2.5829548868133654, + "learning_rate": 1e-06, + "loss": 0.4105, + "step": 3223 + }, + { + "epoch": 0.20661368879774417, + "grad_norm": 2.719837435467215, + "learning_rate": 1e-06, + "loss": 0.3834, + "step": 3224 + }, + { + "epoch": 0.20667777492950526, + "grad_norm": 2.61535271989138, + "learning_rate": 1e-06, + "loss": 0.371, + "step": 3225 + }, + { + "epoch": 0.20674186106126635, + "grad_norm": 2.5771180704151755, + "learning_rate": 1e-06, + "loss": 0.4275, + "step": 3226 + }, + { + "epoch": 0.20680594719302742, + "grad_norm": 2.672183893932333, + "learning_rate": 1e-06, + "loss": 0.4107, + "step": 3227 + }, + { + "epoch": 0.2068700333247885, + "grad_norm": 2.773792137100908, + "learning_rate": 1e-06, + "loss": 0.4827, + "step": 3228 + }, + { + "epoch": 0.2069341194565496, + "grad_norm": 2.681428202763583, + "learning_rate": 1e-06, + "loss": 0.4176, + "step": 3229 + }, + { + "epoch": 0.2069982055883107, + "grad_norm": 2.9492158799161796, + "learning_rate": 1e-06, + "loss": 0.4708, + "step": 3230 + }, + { + "epoch": 0.2070622917200718, + "grad_norm": 2.8897345694841534, + "learning_rate": 1e-06, + "loss": 0.4303, + "step": 3231 + }, + { + "epoch": 0.20712637785183285, + "grad_norm": 2.759597729311294, + "learning_rate": 1e-06, + "loss": 0.4035, + "step": 3232 + }, + { + "epoch": 0.20719046398359395, + "grad_norm": 2.840421542454811, + "learning_rate": 1e-06, + "loss": 0.4273, + "step": 3233 + }, + { + "epoch": 0.20725455011535504, + "grad_norm": 2.739728384977558, + "learning_rate": 1e-06, + "loss": 0.4528, + "step": 3234 + }, + { + "epoch": 0.20731863624711613, + "grad_norm": 2.824433398735102, + "learning_rate": 1e-06, + "loss": 0.3889, + "step": 3235 + }, + { + "epoch": 0.2073827223788772, + "grad_norm": 2.6354881167840505, + "learning_rate": 1e-06, + "loss": 0.3858, + "step": 3236 + }, + { + "epoch": 0.2074468085106383, + "grad_norm": 2.8945305522820477, + "learning_rate": 1e-06, + "loss": 0.4126, + "step": 3237 + }, + { + "epoch": 0.20751089464239938, + "grad_norm": 2.764839911328175, + "learning_rate": 1e-06, + "loss": 0.4274, + "step": 3238 + }, + { + "epoch": 0.20757498077416048, + "grad_norm": 2.773496463928611, + "learning_rate": 1e-06, + "loss": 0.5385, + "step": 3239 + }, + { + "epoch": 0.20763906690592157, + "grad_norm": 2.7564360890775084, + "learning_rate": 1e-06, + "loss": 0.4388, + "step": 3240 + }, + { + "epoch": 0.20770315303768264, + "grad_norm": 2.6660067624046637, + "learning_rate": 1e-06, + "loss": 0.4451, + "step": 3241 + }, + { + "epoch": 0.20776723916944373, + "grad_norm": 2.3442950677854797, + "learning_rate": 1e-06, + "loss": 0.3913, + "step": 3242 + }, + { + "epoch": 0.20783132530120482, + "grad_norm": 2.6071034795612014, + "learning_rate": 1e-06, + "loss": 0.35, + "step": 3243 + }, + { + "epoch": 0.20789541143296592, + "grad_norm": 2.5404513415884065, + "learning_rate": 1e-06, + "loss": 0.4069, + "step": 3244 + }, + { + "epoch": 0.20795949756472698, + "grad_norm": 2.732016662782567, + "learning_rate": 1e-06, + "loss": 0.4083, + "step": 3245 + }, + { + "epoch": 0.20802358369648807, + "grad_norm": 2.470865179331822, + "learning_rate": 1e-06, + "loss": 0.366, + "step": 3246 + }, + { + "epoch": 0.20808766982824917, + "grad_norm": 2.8312892904633227, + "learning_rate": 1e-06, + "loss": 0.3434, + "step": 3247 + }, + { + "epoch": 0.20815175596001026, + "grad_norm": 2.9626699856363143, + "learning_rate": 1e-06, + "loss": 0.3919, + "step": 3248 + }, + { + "epoch": 0.20821584209177135, + "grad_norm": 2.7202318287031617, + "learning_rate": 1e-06, + "loss": 0.3718, + "step": 3249 + }, + { + "epoch": 0.20827992822353242, + "grad_norm": 2.5074669078149814, + "learning_rate": 1e-06, + "loss": 0.4725, + "step": 3250 + }, + { + "epoch": 0.2083440143552935, + "grad_norm": 2.9385941662449473, + "learning_rate": 1e-06, + "loss": 0.4403, + "step": 3251 + }, + { + "epoch": 0.2084081004870546, + "grad_norm": 2.748212967840643, + "learning_rate": 1e-06, + "loss": 0.3987, + "step": 3252 + }, + { + "epoch": 0.2084721866188157, + "grad_norm": 2.6400904306111572, + "learning_rate": 1e-06, + "loss": 0.4194, + "step": 3253 + }, + { + "epoch": 0.20853627275057676, + "grad_norm": 2.7657922553991554, + "learning_rate": 1e-06, + "loss": 0.4343, + "step": 3254 + }, + { + "epoch": 0.20860035888233786, + "grad_norm": 2.6880821418013046, + "learning_rate": 1e-06, + "loss": 0.3921, + "step": 3255 + }, + { + "epoch": 0.20866444501409895, + "grad_norm": 2.5586073503420756, + "learning_rate": 1e-06, + "loss": 0.3641, + "step": 3256 + }, + { + "epoch": 0.20872853114586004, + "grad_norm": 2.7781977676069314, + "learning_rate": 1e-06, + "loss": 0.4149, + "step": 3257 + }, + { + "epoch": 0.20879261727762113, + "grad_norm": 2.6557419528008372, + "learning_rate": 1e-06, + "loss": 0.38, + "step": 3258 + }, + { + "epoch": 0.2088567034093822, + "grad_norm": 2.5621653160572184, + "learning_rate": 1e-06, + "loss": 0.3763, + "step": 3259 + }, + { + "epoch": 0.2089207895411433, + "grad_norm": 2.9311715852645515, + "learning_rate": 1e-06, + "loss": 0.3435, + "step": 3260 + }, + { + "epoch": 0.20898487567290439, + "grad_norm": 3.0964528455275517, + "learning_rate": 1e-06, + "loss": 0.4291, + "step": 3261 + }, + { + "epoch": 0.20904896180466548, + "grad_norm": 2.9231040164499826, + "learning_rate": 1e-06, + "loss": 0.3814, + "step": 3262 + }, + { + "epoch": 0.20911304793642654, + "grad_norm": 2.449860144323429, + "learning_rate": 1e-06, + "loss": 0.3969, + "step": 3263 + }, + { + "epoch": 0.20917713406818764, + "grad_norm": 2.7797757734601145, + "learning_rate": 1e-06, + "loss": 0.4768, + "step": 3264 + }, + { + "epoch": 0.20924122019994873, + "grad_norm": 2.813532878048226, + "learning_rate": 1e-06, + "loss": 0.4363, + "step": 3265 + }, + { + "epoch": 0.20930530633170982, + "grad_norm": 2.66716937995186, + "learning_rate": 1e-06, + "loss": 0.4468, + "step": 3266 + }, + { + "epoch": 0.20936939246347092, + "grad_norm": 2.5170598395883825, + "learning_rate": 1e-06, + "loss": 0.3783, + "step": 3267 + }, + { + "epoch": 0.20943347859523198, + "grad_norm": 2.674849349647163, + "learning_rate": 1e-06, + "loss": 0.4077, + "step": 3268 + }, + { + "epoch": 0.20949756472699307, + "grad_norm": 2.624379949822509, + "learning_rate": 1e-06, + "loss": 0.3982, + "step": 3269 + }, + { + "epoch": 0.20956165085875417, + "grad_norm": 2.820816841414604, + "learning_rate": 1e-06, + "loss": 0.395, + "step": 3270 + }, + { + "epoch": 0.20962573699051526, + "grad_norm": 2.527956786895495, + "learning_rate": 1e-06, + "loss": 0.4627, + "step": 3271 + }, + { + "epoch": 0.20968982312227635, + "grad_norm": 2.725541845425045, + "learning_rate": 1e-06, + "loss": 0.4151, + "step": 3272 + }, + { + "epoch": 0.20975390925403742, + "grad_norm": 2.878927720490599, + "learning_rate": 1e-06, + "loss": 0.4412, + "step": 3273 + }, + { + "epoch": 0.2098179953857985, + "grad_norm": 2.9259836698705963, + "learning_rate": 1e-06, + "loss": 0.4036, + "step": 3274 + }, + { + "epoch": 0.2098820815175596, + "grad_norm": 2.7982781093411586, + "learning_rate": 1e-06, + "loss": 0.3854, + "step": 3275 + }, + { + "epoch": 0.2099461676493207, + "grad_norm": 2.5793222625428953, + "learning_rate": 1e-06, + "loss": 0.4232, + "step": 3276 + }, + { + "epoch": 0.21001025378108176, + "grad_norm": 2.9796808226633757, + "learning_rate": 1e-06, + "loss": 0.3803, + "step": 3277 + }, + { + "epoch": 0.21007433991284286, + "grad_norm": 2.4441804847492343, + "learning_rate": 1e-06, + "loss": 0.3494, + "step": 3278 + }, + { + "epoch": 0.21013842604460395, + "grad_norm": 2.925764791415414, + "learning_rate": 1e-06, + "loss": 0.3994, + "step": 3279 + }, + { + "epoch": 0.21020251217636504, + "grad_norm": 2.633148554481942, + "learning_rate": 1e-06, + "loss": 0.4208, + "step": 3280 + }, + { + "epoch": 0.21026659830812613, + "grad_norm": 2.5106080788505984, + "learning_rate": 1e-06, + "loss": 0.4771, + "step": 3281 + }, + { + "epoch": 0.2103306844398872, + "grad_norm": 2.6172458561738865, + "learning_rate": 1e-06, + "loss": 0.4521, + "step": 3282 + }, + { + "epoch": 0.2103947705716483, + "grad_norm": 2.5929935788049487, + "learning_rate": 1e-06, + "loss": 0.3985, + "step": 3283 + }, + { + "epoch": 0.21045885670340939, + "grad_norm": 2.7905184923144657, + "learning_rate": 1e-06, + "loss": 0.4423, + "step": 3284 + }, + { + "epoch": 0.21052294283517048, + "grad_norm": 2.5347958919401874, + "learning_rate": 1e-06, + "loss": 0.3942, + "step": 3285 + }, + { + "epoch": 0.21058702896693154, + "grad_norm": 2.8626532701014487, + "learning_rate": 1e-06, + "loss": 0.4876, + "step": 3286 + }, + { + "epoch": 0.21065111509869264, + "grad_norm": 2.757452199734976, + "learning_rate": 1e-06, + "loss": 0.4211, + "step": 3287 + }, + { + "epoch": 0.21071520123045373, + "grad_norm": 2.903120359363833, + "learning_rate": 1e-06, + "loss": 0.4326, + "step": 3288 + }, + { + "epoch": 0.21077928736221482, + "grad_norm": 2.880237709233351, + "learning_rate": 1e-06, + "loss": 0.4016, + "step": 3289 + }, + { + "epoch": 0.21084337349397592, + "grad_norm": 2.553661952738896, + "learning_rate": 1e-06, + "loss": 0.3995, + "step": 3290 + }, + { + "epoch": 0.21090745962573698, + "grad_norm": 2.5558492231930363, + "learning_rate": 1e-06, + "loss": 0.3656, + "step": 3291 + }, + { + "epoch": 0.21097154575749807, + "grad_norm": 2.7074908240403066, + "learning_rate": 1e-06, + "loss": 0.4687, + "step": 3292 + }, + { + "epoch": 0.21103563188925917, + "grad_norm": 2.505986301029715, + "learning_rate": 1e-06, + "loss": 0.4268, + "step": 3293 + }, + { + "epoch": 0.21109971802102026, + "grad_norm": 2.663176507171353, + "learning_rate": 1e-06, + "loss": 0.3707, + "step": 3294 + }, + { + "epoch": 0.21116380415278133, + "grad_norm": 2.680302070013859, + "learning_rate": 1e-06, + "loss": 0.4296, + "step": 3295 + }, + { + "epoch": 0.21122789028454242, + "grad_norm": 2.561409369888171, + "learning_rate": 1e-06, + "loss": 0.3794, + "step": 3296 + }, + { + "epoch": 0.2112919764163035, + "grad_norm": 2.419773725787716, + "learning_rate": 1e-06, + "loss": 0.4255, + "step": 3297 + }, + { + "epoch": 0.2113560625480646, + "grad_norm": 2.982157224673509, + "learning_rate": 1e-06, + "loss": 0.4163, + "step": 3298 + }, + { + "epoch": 0.2114201486798257, + "grad_norm": 2.5932585562032577, + "learning_rate": 1e-06, + "loss": 0.3908, + "step": 3299 + }, + { + "epoch": 0.21148423481158676, + "grad_norm": 2.5018110031718033, + "learning_rate": 1e-06, + "loss": 0.4069, + "step": 3300 + }, + { + "epoch": 0.21154832094334786, + "grad_norm": 2.742542865770265, + "learning_rate": 1e-06, + "loss": 0.4739, + "step": 3301 + }, + { + "epoch": 0.21161240707510895, + "grad_norm": 2.5360610155104784, + "learning_rate": 1e-06, + "loss": 0.4523, + "step": 3302 + }, + { + "epoch": 0.21167649320687004, + "grad_norm": 2.551525084190843, + "learning_rate": 1e-06, + "loss": 0.4041, + "step": 3303 + }, + { + "epoch": 0.2117405793386311, + "grad_norm": 2.434809756110649, + "learning_rate": 1e-06, + "loss": 0.4214, + "step": 3304 + }, + { + "epoch": 0.2118046654703922, + "grad_norm": 2.411436200322004, + "learning_rate": 1e-06, + "loss": 0.4291, + "step": 3305 + }, + { + "epoch": 0.2118687516021533, + "grad_norm": 2.905538512840825, + "learning_rate": 1e-06, + "loss": 0.4184, + "step": 3306 + }, + { + "epoch": 0.2119328377339144, + "grad_norm": 2.9864965713027205, + "learning_rate": 1e-06, + "loss": 0.4506, + "step": 3307 + }, + { + "epoch": 0.21199692386567548, + "grad_norm": 2.742136252581274, + "learning_rate": 1e-06, + "loss": 0.3788, + "step": 3308 + }, + { + "epoch": 0.21206100999743654, + "grad_norm": 2.5326228134750206, + "learning_rate": 1e-06, + "loss": 0.398, + "step": 3309 + }, + { + "epoch": 0.21212509612919764, + "grad_norm": 2.830915338256494, + "learning_rate": 1e-06, + "loss": 0.417, + "step": 3310 + }, + { + "epoch": 0.21218918226095873, + "grad_norm": 2.562052867540231, + "learning_rate": 1e-06, + "loss": 0.4002, + "step": 3311 + }, + { + "epoch": 0.21225326839271982, + "grad_norm": 2.7686620430274655, + "learning_rate": 1e-06, + "loss": 0.3997, + "step": 3312 + }, + { + "epoch": 0.2123173545244809, + "grad_norm": 2.7678823836226916, + "learning_rate": 1e-06, + "loss": 0.4272, + "step": 3313 + }, + { + "epoch": 0.21238144065624198, + "grad_norm": 2.5608644724506515, + "learning_rate": 1e-06, + "loss": 0.3728, + "step": 3314 + }, + { + "epoch": 0.21244552678800308, + "grad_norm": 2.7431587712595635, + "learning_rate": 1e-06, + "loss": 0.4088, + "step": 3315 + }, + { + "epoch": 0.21250961291976417, + "grad_norm": 2.6725004227866846, + "learning_rate": 1e-06, + "loss": 0.4367, + "step": 3316 + }, + { + "epoch": 0.21257369905152526, + "grad_norm": 2.6414894647068774, + "learning_rate": 1e-06, + "loss": 0.4033, + "step": 3317 + }, + { + "epoch": 0.21263778518328633, + "grad_norm": 2.6201335968447186, + "learning_rate": 1e-06, + "loss": 0.4295, + "step": 3318 + }, + { + "epoch": 0.21270187131504742, + "grad_norm": 2.6344494226288595, + "learning_rate": 1e-06, + "loss": 0.443, + "step": 3319 + }, + { + "epoch": 0.2127659574468085, + "grad_norm": 2.6728837469432785, + "learning_rate": 1e-06, + "loss": 0.4037, + "step": 3320 + }, + { + "epoch": 0.2128300435785696, + "grad_norm": 2.5284088187012155, + "learning_rate": 1e-06, + "loss": 0.3615, + "step": 3321 + }, + { + "epoch": 0.21289412971033067, + "grad_norm": 2.7400867894135406, + "learning_rate": 1e-06, + "loss": 0.4012, + "step": 3322 + }, + { + "epoch": 0.21295821584209176, + "grad_norm": 2.8026338036476974, + "learning_rate": 1e-06, + "loss": 0.432, + "step": 3323 + }, + { + "epoch": 0.21302230197385286, + "grad_norm": 2.733888181563427, + "learning_rate": 1e-06, + "loss": 0.4293, + "step": 3324 + }, + { + "epoch": 0.21308638810561395, + "grad_norm": 2.587643683316493, + "learning_rate": 1e-06, + "loss": 0.374, + "step": 3325 + }, + { + "epoch": 0.21315047423737504, + "grad_norm": 2.6521670469030143, + "learning_rate": 1e-06, + "loss": 0.4076, + "step": 3326 + }, + { + "epoch": 0.2132145603691361, + "grad_norm": 2.858247171493016, + "learning_rate": 1e-06, + "loss": 0.4151, + "step": 3327 + }, + { + "epoch": 0.2132786465008972, + "grad_norm": 2.9518990696313554, + "learning_rate": 1e-06, + "loss": 0.3808, + "step": 3328 + }, + { + "epoch": 0.2133427326326583, + "grad_norm": 2.5807436920995706, + "learning_rate": 1e-06, + "loss": 0.3848, + "step": 3329 + }, + { + "epoch": 0.2134068187644194, + "grad_norm": 2.692268668879906, + "learning_rate": 1e-06, + "loss": 0.4469, + "step": 3330 + }, + { + "epoch": 0.21347090489618048, + "grad_norm": 2.548409043303377, + "learning_rate": 1e-06, + "loss": 0.4276, + "step": 3331 + }, + { + "epoch": 0.21353499102794155, + "grad_norm": 2.457547361289557, + "learning_rate": 1e-06, + "loss": 0.4154, + "step": 3332 + }, + { + "epoch": 0.21359907715970264, + "grad_norm": 3.140234343189398, + "learning_rate": 1e-06, + "loss": 0.3672, + "step": 3333 + }, + { + "epoch": 0.21366316329146373, + "grad_norm": 2.5891886639159987, + "learning_rate": 1e-06, + "loss": 0.4243, + "step": 3334 + }, + { + "epoch": 0.21372724942322482, + "grad_norm": 2.8625826627525175, + "learning_rate": 1e-06, + "loss": 0.4242, + "step": 3335 + }, + { + "epoch": 0.2137913355549859, + "grad_norm": 2.509069922553237, + "learning_rate": 1e-06, + "loss": 0.4424, + "step": 3336 + }, + { + "epoch": 0.21385542168674698, + "grad_norm": 2.6593254396801727, + "learning_rate": 1e-06, + "loss": 0.3739, + "step": 3337 + }, + { + "epoch": 0.21391950781850808, + "grad_norm": 2.5553613531487565, + "learning_rate": 1e-06, + "loss": 0.4033, + "step": 3338 + }, + { + "epoch": 0.21398359395026917, + "grad_norm": 2.7534326763941133, + "learning_rate": 1e-06, + "loss": 0.4361, + "step": 3339 + }, + { + "epoch": 0.21404768008203026, + "grad_norm": 2.809417915523082, + "learning_rate": 1e-06, + "loss": 0.4618, + "step": 3340 + }, + { + "epoch": 0.21411176621379133, + "grad_norm": 2.5990675730161152, + "learning_rate": 1e-06, + "loss": 0.4367, + "step": 3341 + }, + { + "epoch": 0.21417585234555242, + "grad_norm": 2.7348091981913756, + "learning_rate": 1e-06, + "loss": 0.4644, + "step": 3342 + }, + { + "epoch": 0.2142399384773135, + "grad_norm": 2.4841998191988854, + "learning_rate": 1e-06, + "loss": 0.4541, + "step": 3343 + }, + { + "epoch": 0.2143040246090746, + "grad_norm": 2.513354725827738, + "learning_rate": 1e-06, + "loss": 0.3605, + "step": 3344 + }, + { + "epoch": 0.21436811074083567, + "grad_norm": 2.6741945967742975, + "learning_rate": 1e-06, + "loss": 0.3704, + "step": 3345 + }, + { + "epoch": 0.21443219687259676, + "grad_norm": 2.6481448811114743, + "learning_rate": 1e-06, + "loss": 0.4211, + "step": 3346 + }, + { + "epoch": 0.21449628300435786, + "grad_norm": 2.881638476738493, + "learning_rate": 1e-06, + "loss": 0.4405, + "step": 3347 + }, + { + "epoch": 0.21456036913611895, + "grad_norm": 2.5160389638004594, + "learning_rate": 1e-06, + "loss": 0.4447, + "step": 3348 + }, + { + "epoch": 0.21462445526788004, + "grad_norm": 2.6219438331059846, + "learning_rate": 1e-06, + "loss": 0.4039, + "step": 3349 + }, + { + "epoch": 0.2146885413996411, + "grad_norm": 2.7939679761335587, + "learning_rate": 1e-06, + "loss": 0.4316, + "step": 3350 + }, + { + "epoch": 0.2147526275314022, + "grad_norm": 2.742546179444123, + "learning_rate": 1e-06, + "loss": 0.4001, + "step": 3351 + }, + { + "epoch": 0.2148167136631633, + "grad_norm": 2.7588570120175753, + "learning_rate": 1e-06, + "loss": 0.4035, + "step": 3352 + }, + { + "epoch": 0.2148807997949244, + "grad_norm": 2.9727016909828863, + "learning_rate": 1e-06, + "loss": 0.4292, + "step": 3353 + }, + { + "epoch": 0.21494488592668545, + "grad_norm": 2.700187053010241, + "learning_rate": 1e-06, + "loss": 0.377, + "step": 3354 + }, + { + "epoch": 0.21500897205844655, + "grad_norm": 2.8029003823348546, + "learning_rate": 1e-06, + "loss": 0.4384, + "step": 3355 + }, + { + "epoch": 0.21507305819020764, + "grad_norm": 2.6629430980581237, + "learning_rate": 1e-06, + "loss": 0.3895, + "step": 3356 + }, + { + "epoch": 0.21513714432196873, + "grad_norm": 2.757531931586532, + "learning_rate": 1e-06, + "loss": 0.398, + "step": 3357 + }, + { + "epoch": 0.21520123045372982, + "grad_norm": 2.918032036734727, + "learning_rate": 1e-06, + "loss": 0.3756, + "step": 3358 + }, + { + "epoch": 0.2152653165854909, + "grad_norm": 2.989072875608744, + "learning_rate": 1e-06, + "loss": 0.417, + "step": 3359 + }, + { + "epoch": 0.21532940271725198, + "grad_norm": 2.754298990911609, + "learning_rate": 1e-06, + "loss": 0.4508, + "step": 3360 + }, + { + "epoch": 0.21539348884901308, + "grad_norm": 2.6404117189428407, + "learning_rate": 1e-06, + "loss": 0.4318, + "step": 3361 + }, + { + "epoch": 0.21545757498077417, + "grad_norm": 3.3312953457728525, + "learning_rate": 1e-06, + "loss": 0.3877, + "step": 3362 + }, + { + "epoch": 0.21552166111253523, + "grad_norm": 2.775685646675011, + "learning_rate": 1e-06, + "loss": 0.4369, + "step": 3363 + }, + { + "epoch": 0.21558574724429633, + "grad_norm": 2.7227277955938005, + "learning_rate": 1e-06, + "loss": 0.439, + "step": 3364 + }, + { + "epoch": 0.21564983337605742, + "grad_norm": 2.7479986060865693, + "learning_rate": 1e-06, + "loss": 0.4015, + "step": 3365 + }, + { + "epoch": 0.2157139195078185, + "grad_norm": 2.749304608901622, + "learning_rate": 1e-06, + "loss": 0.407, + "step": 3366 + }, + { + "epoch": 0.2157780056395796, + "grad_norm": 2.677492736617638, + "learning_rate": 1e-06, + "loss": 0.4595, + "step": 3367 + }, + { + "epoch": 0.21584209177134067, + "grad_norm": 2.721007325759014, + "learning_rate": 1e-06, + "loss": 0.3814, + "step": 3368 + }, + { + "epoch": 0.21590617790310176, + "grad_norm": 2.8715102553121805, + "learning_rate": 1e-06, + "loss": 0.4608, + "step": 3369 + }, + { + "epoch": 0.21597026403486286, + "grad_norm": 2.7047494169852784, + "learning_rate": 1e-06, + "loss": 0.4142, + "step": 3370 + }, + { + "epoch": 0.21603435016662395, + "grad_norm": 2.390858891462131, + "learning_rate": 1e-06, + "loss": 0.4006, + "step": 3371 + }, + { + "epoch": 0.21609843629838502, + "grad_norm": 2.688612240628484, + "learning_rate": 1e-06, + "loss": 0.3934, + "step": 3372 + }, + { + "epoch": 0.2161625224301461, + "grad_norm": 2.3726230078498354, + "learning_rate": 1e-06, + "loss": 0.3854, + "step": 3373 + }, + { + "epoch": 0.2162266085619072, + "grad_norm": 2.4845931896277924, + "learning_rate": 1e-06, + "loss": 0.3693, + "step": 3374 + }, + { + "epoch": 0.2162906946936683, + "grad_norm": 2.569118902647708, + "learning_rate": 1e-06, + "loss": 0.4183, + "step": 3375 + }, + { + "epoch": 0.2163547808254294, + "grad_norm": 2.8433001863545884, + "learning_rate": 1e-06, + "loss": 0.4537, + "step": 3376 + }, + { + "epoch": 0.21641886695719045, + "grad_norm": 2.6467296962041003, + "learning_rate": 1e-06, + "loss": 0.3823, + "step": 3377 + }, + { + "epoch": 0.21648295308895155, + "grad_norm": 2.5467529724807223, + "learning_rate": 1e-06, + "loss": 0.384, + "step": 3378 + }, + { + "epoch": 0.21654703922071264, + "grad_norm": 2.6583699584853515, + "learning_rate": 1e-06, + "loss": 0.5313, + "step": 3379 + }, + { + "epoch": 0.21661112535247373, + "grad_norm": 2.7413522163195903, + "learning_rate": 1e-06, + "loss": 0.3918, + "step": 3380 + }, + { + "epoch": 0.21667521148423483, + "grad_norm": 2.850274265246485, + "learning_rate": 1e-06, + "loss": 0.3708, + "step": 3381 + }, + { + "epoch": 0.2167392976159959, + "grad_norm": 2.4545951876671444, + "learning_rate": 1e-06, + "loss": 0.4202, + "step": 3382 + }, + { + "epoch": 0.21680338374775698, + "grad_norm": 2.7541006484296826, + "learning_rate": 1e-06, + "loss": 0.4132, + "step": 3383 + }, + { + "epoch": 0.21686746987951808, + "grad_norm": 2.654628145662869, + "learning_rate": 1e-06, + "loss": 0.391, + "step": 3384 + }, + { + "epoch": 0.21693155601127917, + "grad_norm": 2.679905549390188, + "learning_rate": 1e-06, + "loss": 0.3952, + "step": 3385 + }, + { + "epoch": 0.21699564214304023, + "grad_norm": 2.7026520249743853, + "learning_rate": 1e-06, + "loss": 0.411, + "step": 3386 + }, + { + "epoch": 0.21705972827480133, + "grad_norm": 2.8935314931031972, + "learning_rate": 1e-06, + "loss": 0.4039, + "step": 3387 + }, + { + "epoch": 0.21712381440656242, + "grad_norm": 2.8256596007112678, + "learning_rate": 1e-06, + "loss": 0.4511, + "step": 3388 + }, + { + "epoch": 0.21718790053832351, + "grad_norm": 2.666041222942203, + "learning_rate": 1e-06, + "loss": 0.38, + "step": 3389 + }, + { + "epoch": 0.2172519866700846, + "grad_norm": 2.6944711686890104, + "learning_rate": 1e-06, + "loss": 0.3871, + "step": 3390 + }, + { + "epoch": 0.21731607280184567, + "grad_norm": 2.550478836812208, + "learning_rate": 1e-06, + "loss": 0.399, + "step": 3391 + }, + { + "epoch": 0.21738015893360677, + "grad_norm": 2.614636062866319, + "learning_rate": 1e-06, + "loss": 0.4499, + "step": 3392 + }, + { + "epoch": 0.21744424506536786, + "grad_norm": 2.668976199272245, + "learning_rate": 1e-06, + "loss": 0.459, + "step": 3393 + }, + { + "epoch": 0.21750833119712895, + "grad_norm": 2.5668912370881976, + "learning_rate": 1e-06, + "loss": 0.3908, + "step": 3394 + }, + { + "epoch": 0.21757241732889002, + "grad_norm": 2.8245523377207307, + "learning_rate": 1e-06, + "loss": 0.4191, + "step": 3395 + }, + { + "epoch": 0.2176365034606511, + "grad_norm": 2.6904693651871763, + "learning_rate": 1e-06, + "loss": 0.386, + "step": 3396 + }, + { + "epoch": 0.2177005895924122, + "grad_norm": 2.754562661101229, + "learning_rate": 1e-06, + "loss": 0.3947, + "step": 3397 + }, + { + "epoch": 0.2177646757241733, + "grad_norm": 2.63518445508415, + "learning_rate": 1e-06, + "loss": 0.4651, + "step": 3398 + }, + { + "epoch": 0.2178287618559344, + "grad_norm": 2.377123367412555, + "learning_rate": 1e-06, + "loss": 0.3997, + "step": 3399 + }, + { + "epoch": 0.21789284798769545, + "grad_norm": 2.674782214632184, + "learning_rate": 1e-06, + "loss": 0.453, + "step": 3400 + } + ], + "logging_steps": 1.0, + "max_steps": 15604, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 825505490534400.0, + "train_batch_size": 10, + "trial_name": null, + "trial_params": null +}