diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,4151 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9987239472564866, + "eval_steps": 500, + "global_step": 587, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0017014036580178648, + "grad_norm": 2.168562940412461, + "learning_rate": 1.111111111111111e-06, + "loss": 1.1424, + "step": 1 + }, + { + "epoch": 0.0034028073160357296, + "grad_norm": 2.250032849091299, + "learning_rate": 2.222222222222222e-06, + "loss": 1.1697, + "step": 2 + }, + { + "epoch": 0.005104210974053594, + "grad_norm": 2.124744531823449, + "learning_rate": 3.3333333333333333e-06, + "loss": 1.1364, + "step": 3 + }, + { + "epoch": 0.006805614632071459, + "grad_norm": 2.060484893473413, + "learning_rate": 4.444444444444444e-06, + "loss": 1.1471, + "step": 4 + }, + { + "epoch": 0.008507018290089324, + "grad_norm": 1.89749522427158, + "learning_rate": 5.555555555555557e-06, + "loss": 1.1181, + "step": 5 + }, + { + "epoch": 0.010208421948107189, + "grad_norm": 1.3357851941439673, + "learning_rate": 6.666666666666667e-06, + "loss": 1.0893, + "step": 6 + }, + { + "epoch": 0.011909825606125054, + "grad_norm": 1.2850663628144225, + "learning_rate": 7.77777777777778e-06, + "loss": 1.0484, + "step": 7 + }, + { + "epoch": 0.013611229264142918, + "grad_norm": 1.636445160753731, + "learning_rate": 8.888888888888888e-06, + "loss": 1.0308, + "step": 8 + }, + { + "epoch": 0.015312632922160783, + "grad_norm": 1.3002228280073427, + "learning_rate": 1e-05, + "loss": 1.0234, + "step": 9 + }, + { + "epoch": 0.017014036580178648, + "grad_norm": 1.2380680351381246, + "learning_rate": 1.1111111111111113e-05, + "loss": 0.9064, + "step": 10 + }, + { + "epoch": 0.01871544023819651, + "grad_norm": 1.2765329463296096, + "learning_rate": 1.2222222222222224e-05, + "loss": 0.8696, + "step": 11 + }, + { + "epoch": 0.020416843896214378, + "grad_norm": 1.060585818467504, + "learning_rate": 1.3333333333333333e-05, + "loss": 0.8444, + "step": 12 + }, + { + "epoch": 0.02211824755423224, + "grad_norm": 1.1232952035148502, + "learning_rate": 1.4444444444444446e-05, + "loss": 0.841, + "step": 13 + }, + { + "epoch": 0.023819651212250107, + "grad_norm": 1.107943114775762, + "learning_rate": 1.555555555555556e-05, + "loss": 0.7655, + "step": 14 + }, + { + "epoch": 0.02552105487026797, + "grad_norm": 1.0059483053370937, + "learning_rate": 1.6666666666666667e-05, + "loss": 0.7063, + "step": 15 + }, + { + "epoch": 0.027222458528285837, + "grad_norm": 0.9803969915500654, + "learning_rate": 1.7777777777777777e-05, + "loss": 0.6458, + "step": 16 + }, + { + "epoch": 0.0289238621863037, + "grad_norm": 0.9105498405702808, + "learning_rate": 1.888888888888889e-05, + "loss": 0.6321, + "step": 17 + }, + { + "epoch": 0.030625265844321566, + "grad_norm": 0.9433546278214147, + "learning_rate": 2e-05, + "loss": 0.493, + "step": 18 + }, + { + "epoch": 0.03232666950233943, + "grad_norm": 0.8625027370409255, + "learning_rate": 1.9999847579243196e-05, + "loss": 0.5339, + "step": 19 + }, + { + "epoch": 0.034028073160357296, + "grad_norm": 0.8101152462345085, + "learning_rate": 1.9999390321619196e-05, + "loss": 0.4566, + "step": 20 + }, + { + "epoch": 0.03572947681837516, + "grad_norm": 0.7923424067716904, + "learning_rate": 1.9998628241067113e-05, + "loss": 0.4507, + "step": 21 + }, + { + "epoch": 0.03743088047639302, + "grad_norm": 0.7681394677537327, + "learning_rate": 1.9997561360818322e-05, + "loss": 0.4265, + "step": 22 + }, + { + "epoch": 0.03913228413441089, + "grad_norm": 0.7031739222010267, + "learning_rate": 1.999618971339577e-05, + "loss": 0.4174, + "step": 23 + }, + { + "epoch": 0.040833687792428755, + "grad_norm": 0.6603135403892707, + "learning_rate": 1.9994513340612957e-05, + "loss": 0.3599, + "step": 24 + }, + { + "epoch": 0.04253509145044662, + "grad_norm": 0.7260664469398356, + "learning_rate": 1.9992532293572688e-05, + "loss": 0.3398, + "step": 25 + }, + { + "epoch": 0.04423649510846448, + "grad_norm": 0.590669951494315, + "learning_rate": 1.9990246632665503e-05, + "loss": 0.2616, + "step": 26 + }, + { + "epoch": 0.04593789876648235, + "grad_norm": 0.6319158523814317, + "learning_rate": 1.998765642756783e-05, + "loss": 0.3082, + "step": 27 + }, + { + "epoch": 0.047639302424500214, + "grad_norm": 0.6331615276638973, + "learning_rate": 1.9984761757239878e-05, + "loss": 0.2933, + "step": 28 + }, + { + "epoch": 0.04934070608251808, + "grad_norm": 0.6376428367891515, + "learning_rate": 1.998156270992321e-05, + "loss": 0.2612, + "step": 29 + }, + { + "epoch": 0.05104210974053594, + "grad_norm": 0.6322085618139611, + "learning_rate": 1.9978059383138073e-05, + "loss": 0.2397, + "step": 30 + }, + { + "epoch": 0.05274351339855381, + "grad_norm": 0.6174252074442552, + "learning_rate": 1.997425188368041e-05, + "loss": 0.2566, + "step": 31 + }, + { + "epoch": 0.05444491705657167, + "grad_norm": 0.5327916753070241, + "learning_rate": 1.9970140327618612e-05, + "loss": 0.2416, + "step": 32 + }, + { + "epoch": 0.05614632071458953, + "grad_norm": 0.5479078907290673, + "learning_rate": 1.9965724840289972e-05, + "loss": 0.1781, + "step": 33 + }, + { + "epoch": 0.0578477243726074, + "grad_norm": 0.5315756790547577, + "learning_rate": 1.9961005556296875e-05, + "loss": 0.2258, + "step": 34 + }, + { + "epoch": 0.059549128030625266, + "grad_norm": 0.48386295113930106, + "learning_rate": 1.9955982619502693e-05, + "loss": 0.1679, + "step": 35 + }, + { + "epoch": 0.06125053168864313, + "grad_norm": 0.5185655350233463, + "learning_rate": 1.9950656183027392e-05, + "loss": 0.2022, + "step": 36 + }, + { + "epoch": 0.06295193534666099, + "grad_norm": 0.5425773293966369, + "learning_rate": 1.994502640924286e-05, + "loss": 0.2144, + "step": 37 + }, + { + "epoch": 0.06465333900467886, + "grad_norm": 0.39700624229353654, + "learning_rate": 1.993909346976798e-05, + "loss": 0.1361, + "step": 38 + }, + { + "epoch": 0.06635474266269673, + "grad_norm": 0.5024564814818023, + "learning_rate": 1.993285754546338e-05, + "loss": 0.1618, + "step": 39 + }, + { + "epoch": 0.06805614632071459, + "grad_norm": 0.4298514607710804, + "learning_rate": 1.9926318826425905e-05, + "loss": 0.1801, + "step": 40 + }, + { + "epoch": 0.06975754997873246, + "grad_norm": 0.4631733150959672, + "learning_rate": 1.9919477511982873e-05, + "loss": 0.1212, + "step": 41 + }, + { + "epoch": 0.07145895363675032, + "grad_norm": 0.46629563189251916, + "learning_rate": 1.991233381068594e-05, + "loss": 0.1538, + "step": 42 + }, + { + "epoch": 0.07316035729476818, + "grad_norm": 0.4077353003444054, + "learning_rate": 1.990488794030478e-05, + "loss": 0.1601, + "step": 43 + }, + { + "epoch": 0.07486176095278604, + "grad_norm": 0.3874990470468177, + "learning_rate": 1.9897140127820432e-05, + "loss": 0.125, + "step": 44 + }, + { + "epoch": 0.07656316461080391, + "grad_norm": 0.3638523211055093, + "learning_rate": 1.9889090609418384e-05, + "loss": 0.1366, + "step": 45 + }, + { + "epoch": 0.07826456826882178, + "grad_norm": 0.5006533990444182, + "learning_rate": 1.9880739630481376e-05, + "loss": 0.1466, + "step": 46 + }, + { + "epoch": 0.07996597192683964, + "grad_norm": 0.4320089119157406, + "learning_rate": 1.9872087445581912e-05, + "loss": 0.1256, + "step": 47 + }, + { + "epoch": 0.08166737558485751, + "grad_norm": 0.3330675873783333, + "learning_rate": 1.9863134318474504e-05, + "loss": 0.1066, + "step": 48 + }, + { + "epoch": 0.08336877924287538, + "grad_norm": 0.4410537592519413, + "learning_rate": 1.985388052208764e-05, + "loss": 0.1381, + "step": 49 + }, + { + "epoch": 0.08507018290089324, + "grad_norm": 0.4003560082493989, + "learning_rate": 1.9844326338515444e-05, + "loss": 0.1202, + "step": 50 + }, + { + "epoch": 0.0867715865589111, + "grad_norm": 0.41012939531437076, + "learning_rate": 1.9834472059009097e-05, + "loss": 0.1246, + "step": 51 + }, + { + "epoch": 0.08847299021692896, + "grad_norm": 0.408091381990657, + "learning_rate": 1.982431798396794e-05, + "loss": 0.1289, + "step": 52 + }, + { + "epoch": 0.09017439387494683, + "grad_norm": 0.4821914250638501, + "learning_rate": 1.9813864422930345e-05, + "loss": 0.1312, + "step": 53 + }, + { + "epoch": 0.0918757975329647, + "grad_norm": 0.40793016164627693, + "learning_rate": 1.9803111694564246e-05, + "loss": 0.1237, + "step": 54 + }, + { + "epoch": 0.09357720119098256, + "grad_norm": 0.3734013818822545, + "learning_rate": 1.9792060126657437e-05, + "loss": 0.1049, + "step": 55 + }, + { + "epoch": 0.09527860484900043, + "grad_norm": 0.3186480775193613, + "learning_rate": 1.9780710056107587e-05, + "loss": 0.0911, + "step": 56 + }, + { + "epoch": 0.0969800085070183, + "grad_norm": 0.39667217504563396, + "learning_rate": 1.976906182891197e-05, + "loss": 0.0918, + "step": 57 + }, + { + "epoch": 0.09868141216503616, + "grad_norm": 0.4482874203710604, + "learning_rate": 1.97571158001569e-05, + "loss": 0.1209, + "step": 58 + }, + { + "epoch": 0.10038281582305401, + "grad_norm": 0.2965972804159924, + "learning_rate": 1.9744872334006936e-05, + "loss": 0.0932, + "step": 59 + }, + { + "epoch": 0.10208421948107188, + "grad_norm": 0.3566484676555622, + "learning_rate": 1.973233180369374e-05, + "loss": 0.1248, + "step": 60 + }, + { + "epoch": 0.10378562313908975, + "grad_norm": 0.30896394609488814, + "learning_rate": 1.9719494591504747e-05, + "loss": 0.0851, + "step": 61 + }, + { + "epoch": 0.10548702679710761, + "grad_norm": 0.2903927724315073, + "learning_rate": 1.9706361088771474e-05, + "loss": 0.0763, + "step": 62 + }, + { + "epoch": 0.10718843045512548, + "grad_norm": 0.3405843767078732, + "learning_rate": 1.96929316958576e-05, + "loss": 0.103, + "step": 63 + }, + { + "epoch": 0.10888983411314335, + "grad_norm": 0.40152278275783043, + "learning_rate": 1.9679206822146776e-05, + "loss": 0.1084, + "step": 64 + }, + { + "epoch": 0.11059123777116121, + "grad_norm": 0.3372032580489575, + "learning_rate": 1.9665186886030135e-05, + "loss": 0.1022, + "step": 65 + }, + { + "epoch": 0.11229264142917907, + "grad_norm": 0.332966030135067, + "learning_rate": 1.9650872314893523e-05, + "loss": 0.0891, + "step": 66 + }, + { + "epoch": 0.11399404508719693, + "grad_norm": 0.3832837550171733, + "learning_rate": 1.9636263545104498e-05, + "loss": 0.1181, + "step": 67 + }, + { + "epoch": 0.1156954487452148, + "grad_norm": 0.40356329916123346, + "learning_rate": 1.962136102199901e-05, + "loss": 0.1262, + "step": 68 + }, + { + "epoch": 0.11739685240323267, + "grad_norm": 0.3689423450550846, + "learning_rate": 1.9606165199867822e-05, + "loss": 0.0826, + "step": 69 + }, + { + "epoch": 0.11909825606125053, + "grad_norm": 0.3457217662571401, + "learning_rate": 1.959067654194268e-05, + "loss": 0.1007, + "step": 70 + }, + { + "epoch": 0.1207996597192684, + "grad_norm": 0.3511006571491925, + "learning_rate": 1.9574895520382183e-05, + "loss": 0.1263, + "step": 71 + }, + { + "epoch": 0.12250106337728627, + "grad_norm": 0.36819206266348997, + "learning_rate": 1.955882261625737e-05, + "loss": 0.1137, + "step": 72 + }, + { + "epoch": 0.12420246703530413, + "grad_norm": 0.2607004742274578, + "learning_rate": 1.9542458319537094e-05, + "loss": 0.0914, + "step": 73 + }, + { + "epoch": 0.12590387069332198, + "grad_norm": 0.2773076381582836, + "learning_rate": 1.9525803129073046e-05, + "loss": 0.0592, + "step": 74 + }, + { + "epoch": 0.12760527435133986, + "grad_norm": 0.28951070788934496, + "learning_rate": 1.9508857552584574e-05, + "loss": 0.0928, + "step": 75 + }, + { + "epoch": 0.12930667800935772, + "grad_norm": 0.35718485474187367, + "learning_rate": 1.9491622106643195e-05, + "loss": 0.0952, + "step": 76 + }, + { + "epoch": 0.1310080816673756, + "grad_norm": 0.34891501382529494, + "learning_rate": 1.9474097316656856e-05, + "loss": 0.1004, + "step": 77 + }, + { + "epoch": 0.13270948532539345, + "grad_norm": 0.28733346641687174, + "learning_rate": 1.9456283716853906e-05, + "loss": 0.0823, + "step": 78 + }, + { + "epoch": 0.1344108889834113, + "grad_norm": 0.3735237899238901, + "learning_rate": 1.9438181850266815e-05, + "loss": 0.121, + "step": 79 + }, + { + "epoch": 0.13611229264142918, + "grad_norm": 0.2904974285120679, + "learning_rate": 1.941979226871563e-05, + "loss": 0.0768, + "step": 80 + }, + { + "epoch": 0.13781369629944704, + "grad_norm": 0.32117427662146836, + "learning_rate": 1.9401115532791134e-05, + "loss": 0.0837, + "step": 81 + }, + { + "epoch": 0.13951509995746492, + "grad_norm": 0.40967624537549835, + "learning_rate": 1.938215221183777e-05, + "loss": 0.1061, + "step": 82 + }, + { + "epoch": 0.14121650361548277, + "grad_norm": 0.33629708001931885, + "learning_rate": 1.936290288393629e-05, + "loss": 0.0892, + "step": 83 + }, + { + "epoch": 0.14291790727350065, + "grad_norm": 0.31834590604771723, + "learning_rate": 1.9343368135886112e-05, + "loss": 0.1088, + "step": 84 + }, + { + "epoch": 0.1446193109315185, + "grad_norm": 0.30988146926514176, + "learning_rate": 1.932354856318746e-05, + "loss": 0.0864, + "step": 85 + }, + { + "epoch": 0.14632071458953635, + "grad_norm": 0.41148714128819724, + "learning_rate": 1.9303444770023184e-05, + "loss": 0.0867, + "step": 86 + }, + { + "epoch": 0.14802211824755424, + "grad_norm": 0.3641579696865324, + "learning_rate": 1.9283057369240358e-05, + "loss": 0.1149, + "step": 87 + }, + { + "epoch": 0.1497235219055721, + "grad_norm": 0.3359968169832728, + "learning_rate": 1.9262386982331596e-05, + "loss": 0.0959, + "step": 88 + }, + { + "epoch": 0.15142492556358997, + "grad_norm": 0.2849324477084618, + "learning_rate": 1.9241434239416093e-05, + "loss": 0.0957, + "step": 89 + }, + { + "epoch": 0.15312632922160782, + "grad_norm": 0.29673544787967593, + "learning_rate": 1.922019977922045e-05, + "loss": 0.0823, + "step": 90 + }, + { + "epoch": 0.1548277328796257, + "grad_norm": 0.28046315847608455, + "learning_rate": 1.919868424905915e-05, + "loss": 0.0885, + "step": 91 + }, + { + "epoch": 0.15652913653764355, + "grad_norm": 0.3024116439443639, + "learning_rate": 1.9176888304814882e-05, + "loss": 0.0919, + "step": 92 + }, + { + "epoch": 0.15823054019566143, + "grad_norm": 0.4139703560637819, + "learning_rate": 1.9154812610918503e-05, + "loss": 0.0938, + "step": 93 + }, + { + "epoch": 0.1599319438536793, + "grad_norm": 0.2618424473915207, + "learning_rate": 1.913245784032881e-05, + "loss": 0.0684, + "step": 94 + }, + { + "epoch": 0.16163334751169714, + "grad_norm": 0.31266175001831703, + "learning_rate": 1.9109824674512014e-05, + "loss": 0.0673, + "step": 95 + }, + { + "epoch": 0.16333475116971502, + "grad_norm": 0.3741829507920355, + "learning_rate": 1.9086913803420966e-05, + "loss": 0.1097, + "step": 96 + }, + { + "epoch": 0.16503615482773287, + "grad_norm": 0.3356628202803492, + "learning_rate": 1.906372592547413e-05, + "loss": 0.1028, + "step": 97 + }, + { + "epoch": 0.16673755848575075, + "grad_norm": 0.34106497626751014, + "learning_rate": 1.9040261747534282e-05, + "loss": 0.0989, + "step": 98 + }, + { + "epoch": 0.1684389621437686, + "grad_norm": 0.29746452263621875, + "learning_rate": 1.9016521984886984e-05, + "loss": 0.0718, + "step": 99 + }, + { + "epoch": 0.1701403658017865, + "grad_norm": 0.2990092253885084, + "learning_rate": 1.8992507361218743e-05, + "loss": 0.0765, + "step": 100 + }, + { + "epoch": 0.17184176945980434, + "grad_norm": 0.2177337033832474, + "learning_rate": 1.8968218608594987e-05, + "loss": 0.056, + "step": 101 + }, + { + "epoch": 0.1735431731178222, + "grad_norm": 0.39840165284276846, + "learning_rate": 1.8943656467437726e-05, + "loss": 0.1111, + "step": 102 + }, + { + "epoch": 0.17524457677584007, + "grad_norm": 0.3378745899319231, + "learning_rate": 1.8918821686502992e-05, + "loss": 0.0931, + "step": 103 + }, + { + "epoch": 0.17694598043385792, + "grad_norm": 0.2637083690262847, + "learning_rate": 1.8893715022858e-05, + "loss": 0.0759, + "step": 104 + }, + { + "epoch": 0.1786473840918758, + "grad_norm": 0.36645660481595893, + "learning_rate": 1.886833724185809e-05, + "loss": 0.0956, + "step": 105 + }, + { + "epoch": 0.18034878774989366, + "grad_norm": 0.2626991205572612, + "learning_rate": 1.8842689117123377e-05, + "loss": 0.0622, + "step": 106 + }, + { + "epoch": 0.18205019140791154, + "grad_norm": 0.28778996504448945, + "learning_rate": 1.8816771430515178e-05, + "loss": 0.0778, + "step": 107 + }, + { + "epoch": 0.1837515950659294, + "grad_norm": 0.30016452527135956, + "learning_rate": 1.8790584972112174e-05, + "loss": 0.0658, + "step": 108 + }, + { + "epoch": 0.18545299872394724, + "grad_norm": 0.5203118503020492, + "learning_rate": 1.876413054018633e-05, + "loss": 0.146, + "step": 109 + }, + { + "epoch": 0.18715440238196512, + "grad_norm": 0.30134320864935904, + "learning_rate": 1.873740894117854e-05, + "loss": 0.0821, + "step": 110 + }, + { + "epoch": 0.18885580603998298, + "grad_norm": 0.34344915555537253, + "learning_rate": 1.8710420989674093e-05, + "loss": 0.075, + "step": 111 + }, + { + "epoch": 0.19055720969800086, + "grad_norm": 0.2458501444323483, + "learning_rate": 1.8683167508377775e-05, + "loss": 0.0596, + "step": 112 + }, + { + "epoch": 0.1922586133560187, + "grad_norm": 0.2651178569441844, + "learning_rate": 1.8655649328088836e-05, + "loss": 0.0674, + "step": 113 + }, + { + "epoch": 0.1939600170140366, + "grad_norm": 0.2662036987842302, + "learning_rate": 1.862786728767565e-05, + "loss": 0.0769, + "step": 114 + }, + { + "epoch": 0.19566142067205444, + "grad_norm": 0.28498333925442115, + "learning_rate": 1.8599822234050143e-05, + "loss": 0.0778, + "step": 115 + }, + { + "epoch": 0.19736282433007232, + "grad_norm": 0.26717816913116776, + "learning_rate": 1.8571515022141974e-05, + "loss": 0.0896, + "step": 116 + }, + { + "epoch": 0.19906422798809018, + "grad_norm": 0.3374842493414297, + "learning_rate": 1.8542946514872478e-05, + "loss": 0.0992, + "step": 117 + }, + { + "epoch": 0.20076563164610803, + "grad_norm": 0.21117730752547184, + "learning_rate": 1.851411758312835e-05, + "loss": 0.0454, + "step": 118 + }, + { + "epoch": 0.2024670353041259, + "grad_norm": 0.19203553955386238, + "learning_rate": 1.8485029105735112e-05, + "loss": 0.0611, + "step": 119 + }, + { + "epoch": 0.20416843896214376, + "grad_norm": 0.22445524786962626, + "learning_rate": 1.8455681969430307e-05, + "loss": 0.0584, + "step": 120 + }, + { + "epoch": 0.20586984262016164, + "grad_norm": 0.23148081400071185, + "learning_rate": 1.8426077068836487e-05, + "loss": 0.0629, + "step": 121 + }, + { + "epoch": 0.2075712462781795, + "grad_norm": 0.17709972528549517, + "learning_rate": 1.839621530643392e-05, + "loss": 0.0563, + "step": 122 + }, + { + "epoch": 0.20927264993619737, + "grad_norm": 0.3048264391849215, + "learning_rate": 1.8366097592533095e-05, + "loss": 0.0778, + "step": 123 + }, + { + "epoch": 0.21097405359421523, + "grad_norm": 0.32208607157512736, + "learning_rate": 1.8335724845246948e-05, + "loss": 0.1028, + "step": 124 + }, + { + "epoch": 0.21267545725223308, + "grad_norm": 0.23725153079241285, + "learning_rate": 1.830509799046292e-05, + "loss": 0.0803, + "step": 125 + }, + { + "epoch": 0.21437686091025096, + "grad_norm": 0.2996136224560952, + "learning_rate": 1.8274217961814682e-05, + "loss": 0.0718, + "step": 126 + }, + { + "epoch": 0.2160782645682688, + "grad_norm": 0.23529202737787724, + "learning_rate": 1.8243085700653698e-05, + "loss": 0.058, + "step": 127 + }, + { + "epoch": 0.2177796682262867, + "grad_norm": 0.2602674833841576, + "learning_rate": 1.821170215602053e-05, + "loss": 0.0847, + "step": 128 + }, + { + "epoch": 0.21948107188430455, + "grad_norm": 0.23134307276310323, + "learning_rate": 1.818006828461591e-05, + "loss": 0.0568, + "step": 129 + }, + { + "epoch": 0.22118247554232243, + "grad_norm": 0.26453961985179086, + "learning_rate": 1.8148185050771554e-05, + "loss": 0.0801, + "step": 130 + }, + { + "epoch": 0.22288387920034028, + "grad_norm": 0.25318378680529585, + "learning_rate": 1.8116053426420793e-05, + "loss": 0.0749, + "step": 131 + }, + { + "epoch": 0.22458528285835813, + "grad_norm": 0.20502925194857724, + "learning_rate": 1.8083674391068925e-05, + "loss": 0.0629, + "step": 132 + }, + { + "epoch": 0.226286686516376, + "grad_norm": 0.19279005026370763, + "learning_rate": 1.8051048931763366e-05, + "loss": 0.0463, + "step": 133 + }, + { + "epoch": 0.22798809017439386, + "grad_norm": 0.2986931928862026, + "learning_rate": 1.8018178043063554e-05, + "loss": 0.0869, + "step": 134 + }, + { + "epoch": 0.22968949383241175, + "grad_norm": 0.2580936970306385, + "learning_rate": 1.798506272701064e-05, + "loss": 0.0689, + "step": 135 + }, + { + "epoch": 0.2313908974904296, + "grad_norm": 0.35433155424672486, + "learning_rate": 1.795170399309692e-05, + "loss": 0.0889, + "step": 136 + }, + { + "epoch": 0.23309230114844748, + "grad_norm": 0.3342563856245416, + "learning_rate": 1.7918102858235103e-05, + "loss": 0.1088, + "step": 137 + }, + { + "epoch": 0.23479370480646533, + "grad_norm": 0.2658901230535673, + "learning_rate": 1.7884260346727257e-05, + "loss": 0.0967, + "step": 138 + }, + { + "epoch": 0.2364951084644832, + "grad_norm": 0.3116747759735481, + "learning_rate": 1.7850177490233635e-05, + "loss": 0.0763, + "step": 139 + }, + { + "epoch": 0.23819651212250106, + "grad_norm": 0.2561525037232419, + "learning_rate": 1.7815855327741185e-05, + "loss": 0.0956, + "step": 140 + }, + { + "epoch": 0.23989791578051892, + "grad_norm": 0.2246820236163285, + "learning_rate": 1.7781294905531908e-05, + "loss": 0.0792, + "step": 141 + }, + { + "epoch": 0.2415993194385368, + "grad_norm": 0.3264214270885714, + "learning_rate": 1.774649727715094e-05, + "loss": 0.0792, + "step": 142 + }, + { + "epoch": 0.24330072309655465, + "grad_norm": 0.19102790309255646, + "learning_rate": 1.7711463503374466e-05, + "loss": 0.0627, + "step": 143 + }, + { + "epoch": 0.24500212675457253, + "grad_norm": 0.29766375426258634, + "learning_rate": 1.7676194652177333e-05, + "loss": 0.0529, + "step": 144 + }, + { + "epoch": 0.24670353041259038, + "grad_norm": 0.29002494096896503, + "learning_rate": 1.764069179870055e-05, + "loss": 0.0873, + "step": 145 + }, + { + "epoch": 0.24840493407060826, + "grad_norm": 0.1961062743167957, + "learning_rate": 1.760495602521847e-05, + "loss": 0.0549, + "step": 146 + }, + { + "epoch": 0.2501063377286261, + "grad_norm": 0.3391386450150704, + "learning_rate": 1.756898842110582e-05, + "loss": 0.0855, + "step": 147 + }, + { + "epoch": 0.25180774138664397, + "grad_norm": 0.2237841919957392, + "learning_rate": 1.753279008280449e-05, + "loss": 0.0611, + "step": 148 + }, + { + "epoch": 0.2535091450446618, + "grad_norm": 0.1863997964345379, + "learning_rate": 1.74963621137901e-05, + "loss": 0.0557, + "step": 149 + }, + { + "epoch": 0.25521054870267973, + "grad_norm": 0.2322748935860178, + "learning_rate": 1.7459705624538383e-05, + "loss": 0.0744, + "step": 150 + }, + { + "epoch": 0.2569119523606976, + "grad_norm": 0.2811884338929891, + "learning_rate": 1.7422821732491297e-05, + "loss": 0.0869, + "step": 151 + }, + { + "epoch": 0.25861335601871543, + "grad_norm": 0.2336367022405289, + "learning_rate": 1.7385711562022988e-05, + "loss": 0.0662, + "step": 152 + }, + { + "epoch": 0.2603147596767333, + "grad_norm": 0.2525517790918003, + "learning_rate": 1.734837624440551e-05, + "loss": 0.0709, + "step": 153 + }, + { + "epoch": 0.2620161633347512, + "grad_norm": 0.25674452456443186, + "learning_rate": 1.731081691777434e-05, + "loss": 0.0536, + "step": 154 + }, + { + "epoch": 0.26371756699276905, + "grad_norm": 0.2380952734859674, + "learning_rate": 1.7273034727093677e-05, + "loss": 0.0797, + "step": 155 + }, + { + "epoch": 0.2654189706507869, + "grad_norm": 0.22013348660975807, + "learning_rate": 1.7235030824121542e-05, + "loss": 0.0608, + "step": 156 + }, + { + "epoch": 0.26712037430880475, + "grad_norm": 0.22281523456342697, + "learning_rate": 1.7196806367374656e-05, + "loss": 0.0635, + "step": 157 + }, + { + "epoch": 0.2688217779668226, + "grad_norm": 0.2102268959812806, + "learning_rate": 1.7158362522093153e-05, + "loss": 0.0682, + "step": 158 + }, + { + "epoch": 0.2705231816248405, + "grad_norm": 0.2981020750225053, + "learning_rate": 1.7119700460205026e-05, + "loss": 0.0748, + "step": 159 + }, + { + "epoch": 0.27222458528285837, + "grad_norm": 0.23396857925074813, + "learning_rate": 1.7080821360290426e-05, + "loss": 0.0668, + "step": 160 + }, + { + "epoch": 0.2739259889408762, + "grad_norm": 0.14039625188691493, + "learning_rate": 1.7041726407545716e-05, + "loss": 0.0334, + "step": 161 + }, + { + "epoch": 0.27562739259889407, + "grad_norm": 0.2591445596342466, + "learning_rate": 1.7002416793747354e-05, + "loss": 0.0613, + "step": 162 + }, + { + "epoch": 0.2773287962569119, + "grad_norm": 0.29367133177482335, + "learning_rate": 1.696289371721556e-05, + "loss": 0.0713, + "step": 163 + }, + { + "epoch": 0.27903019991492983, + "grad_norm": 0.3690305678977906, + "learning_rate": 1.692315838277778e-05, + "loss": 0.1274, + "step": 164 + }, + { + "epoch": 0.2807316035729477, + "grad_norm": 0.22214467316639894, + "learning_rate": 1.6883212001731956e-05, + "loss": 0.0655, + "step": 165 + }, + { + "epoch": 0.28243300723096554, + "grad_norm": 0.1574976967869379, + "learning_rate": 1.6843055791809623e-05, + "loss": 0.0322, + "step": 166 + }, + { + "epoch": 0.2841344108889834, + "grad_norm": 0.27654091402143877, + "learning_rate": 1.680269097713876e-05, + "loss": 0.0922, + "step": 167 + }, + { + "epoch": 0.2858358145470013, + "grad_norm": 0.28597652739783896, + "learning_rate": 1.6762118788206488e-05, + "loss": 0.079, + "step": 168 + }, + { + "epoch": 0.28753721820501915, + "grad_norm": 0.26938794460799176, + "learning_rate": 1.6721340461821555e-05, + "loss": 0.0871, + "step": 169 + }, + { + "epoch": 0.289238621863037, + "grad_norm": 0.32524563962835895, + "learning_rate": 1.6680357241076632e-05, + "loss": 0.1052, + "step": 170 + }, + { + "epoch": 0.29094002552105486, + "grad_norm": 0.3017728585235927, + "learning_rate": 1.6639170375310422e-05, + "loss": 0.0705, + "step": 171 + }, + { + "epoch": 0.2926414291790727, + "grad_norm": 0.23611080555948813, + "learning_rate": 1.6597781120069584e-05, + "loss": 0.0735, + "step": 172 + }, + { + "epoch": 0.2943428328370906, + "grad_norm": 0.2824684340323033, + "learning_rate": 1.655619073707043e-05, + "loss": 0.0957, + "step": 173 + }, + { + "epoch": 0.29604423649510847, + "grad_norm": 0.22439711399094156, + "learning_rate": 1.6514400494160498e-05, + "loss": 0.0572, + "step": 174 + }, + { + "epoch": 0.2977456401531263, + "grad_norm": 0.18939390048237154, + "learning_rate": 1.6472411665279872e-05, + "loss": 0.057, + "step": 175 + }, + { + "epoch": 0.2994470438111442, + "grad_norm": 0.22456676896884215, + "learning_rate": 1.643022553042237e-05, + "loss": 0.0557, + "step": 176 + }, + { + "epoch": 0.3011484474691621, + "grad_norm": 0.18664835436430186, + "learning_rate": 1.6387843375596513e-05, + "loss": 0.0494, + "step": 177 + }, + { + "epoch": 0.30284985112717994, + "grad_norm": 0.23489112753979385, + "learning_rate": 1.634526649278632e-05, + "loss": 0.0821, + "step": 178 + }, + { + "epoch": 0.3045512547851978, + "grad_norm": 0.2194109402248629, + "learning_rate": 1.630249617991194e-05, + "loss": 0.0672, + "step": 179 + }, + { + "epoch": 0.30625265844321564, + "grad_norm": 0.3321350768766143, + "learning_rate": 1.6259533740790055e-05, + "loss": 0.1135, + "step": 180 + }, + { + "epoch": 0.3079540621012335, + "grad_norm": 0.2502151994073321, + "learning_rate": 1.6216380485094164e-05, + "loss": 0.0718, + "step": 181 + }, + { + "epoch": 0.3096554657592514, + "grad_norm": 0.2670386782819874, + "learning_rate": 1.617303772831465e-05, + "loss": 0.0728, + "step": 182 + }, + { + "epoch": 0.31135686941726926, + "grad_norm": 0.26272098469205685, + "learning_rate": 1.6129506791718665e-05, + "loss": 0.0832, + "step": 183 + }, + { + "epoch": 0.3130582730752871, + "grad_norm": 0.2246929104575992, + "learning_rate": 1.6085789002309873e-05, + "loss": 0.0631, + "step": 184 + }, + { + "epoch": 0.31475967673330496, + "grad_norm": 0.21270610161779943, + "learning_rate": 1.6041885692787985e-05, + "loss": 0.067, + "step": 185 + }, + { + "epoch": 0.31646108039132287, + "grad_norm": 0.21242584741643203, + "learning_rate": 1.599779820150813e-05, + "loss": 0.068, + "step": 186 + }, + { + "epoch": 0.3181624840493407, + "grad_norm": 0.20419485410339863, + "learning_rate": 1.5953527872440063e-05, + "loss": 0.0757, + "step": 187 + }, + { + "epoch": 0.3198638877073586, + "grad_norm": 0.27210472135834674, + "learning_rate": 1.5909076055127202e-05, + "loss": 0.0853, + "step": 188 + }, + { + "epoch": 0.3215652913653764, + "grad_norm": 0.22338089059411625, + "learning_rate": 1.5864444104645473e-05, + "loss": 0.0742, + "step": 189 + }, + { + "epoch": 0.3232666950233943, + "grad_norm": 0.2732168335391855, + "learning_rate": 1.581963338156201e-05, + "loss": 0.0908, + "step": 190 + }, + { + "epoch": 0.3249680986814122, + "grad_norm": 0.2467079865485088, + "learning_rate": 1.5774645251893673e-05, + "loss": 0.0591, + "step": 191 + }, + { + "epoch": 0.32666950233943004, + "grad_norm": 0.2313614767876932, + "learning_rate": 1.5729481087065423e-05, + "loss": 0.0892, + "step": 192 + }, + { + "epoch": 0.3283709059974479, + "grad_norm": 0.2023763116089166, + "learning_rate": 1.5684142263868493e-05, + "loss": 0.0672, + "step": 193 + }, + { + "epoch": 0.33007230965546575, + "grad_norm": 0.31753011619888016, + "learning_rate": 1.5638630164418435e-05, + "loss": 0.0712, + "step": 194 + }, + { + "epoch": 0.3317737133134836, + "grad_norm": 0.2757080957715778, + "learning_rate": 1.5592946176112973e-05, + "loss": 0.1015, + "step": 195 + }, + { + "epoch": 0.3334751169715015, + "grad_norm": 0.2505973926615177, + "learning_rate": 1.554709169158972e-05, + "loss": 0.0971, + "step": 196 + }, + { + "epoch": 0.33517652062951936, + "grad_norm": 0.2226907461950715, + "learning_rate": 1.550106810868373e-05, + "loss": 0.0513, + "step": 197 + }, + { + "epoch": 0.3368779242875372, + "grad_norm": 0.15446367505194006, + "learning_rate": 1.5454876830384868e-05, + "loss": 0.057, + "step": 198 + }, + { + "epoch": 0.33857932794555506, + "grad_norm": 0.30643512505114406, + "learning_rate": 1.540851926479505e-05, + "loss": 0.0975, + "step": 199 + }, + { + "epoch": 0.340280731603573, + "grad_norm": 0.25762810344960363, + "learning_rate": 1.536199682508533e-05, + "loss": 0.0633, + "step": 200 + }, + { + "epoch": 0.3419821352615908, + "grad_norm": 0.19158691969988545, + "learning_rate": 1.531531092945279e-05, + "loss": 0.0569, + "step": 201 + }, + { + "epoch": 0.3436835389196087, + "grad_norm": 0.2834266199692826, + "learning_rate": 1.526846300107734e-05, + "loss": 0.0988, + "step": 202 + }, + { + "epoch": 0.34538494257762653, + "grad_norm": 0.2556087083863337, + "learning_rate": 1.5221454468078336e-05, + "loss": 0.0689, + "step": 203 + }, + { + "epoch": 0.3470863462356444, + "grad_norm": 0.2438742301334132, + "learning_rate": 1.5174286763470995e-05, + "loss": 0.0715, + "step": 204 + }, + { + "epoch": 0.3487877498936623, + "grad_norm": 0.21904569801568857, + "learning_rate": 1.5126961325122773e-05, + "loss": 0.0715, + "step": 205 + }, + { + "epoch": 0.35048915355168014, + "grad_norm": 0.32118887699890014, + "learning_rate": 1.5079479595709493e-05, + "loss": 0.1042, + "step": 206 + }, + { + "epoch": 0.352190557209698, + "grad_norm": 0.22503680461445366, + "learning_rate": 1.5031843022671377e-05, + "loss": 0.0516, + "step": 207 + }, + { + "epoch": 0.35389196086771585, + "grad_norm": 0.20476968089029912, + "learning_rate": 1.4984053058168936e-05, + "loss": 0.0651, + "step": 208 + }, + { + "epoch": 0.35559336452573376, + "grad_norm": 0.26260720806295024, + "learning_rate": 1.4936111159038677e-05, + "loss": 0.078, + "step": 209 + }, + { + "epoch": 0.3572947681837516, + "grad_norm": 0.21335908324799727, + "learning_rate": 1.4888018786748713e-05, + "loss": 0.0531, + "step": 210 + }, + { + "epoch": 0.35899617184176946, + "grad_norm": 0.24776977384172608, + "learning_rate": 1.4839777407354194e-05, + "loss": 0.0711, + "step": 211 + }, + { + "epoch": 0.3606975754997873, + "grad_norm": 0.26283265478228607, + "learning_rate": 1.4791388491452637e-05, + "loss": 0.1005, + "step": 212 + }, + { + "epoch": 0.36239897915780517, + "grad_norm": 0.21912083728071574, + "learning_rate": 1.4742853514139076e-05, + "loss": 0.0759, + "step": 213 + }, + { + "epoch": 0.3641003828158231, + "grad_norm": 0.30538258609344554, + "learning_rate": 1.4694173954961105e-05, + "loss": 0.1009, + "step": 214 + }, + { + "epoch": 0.36580178647384093, + "grad_norm": 0.26764402263401943, + "learning_rate": 1.4645351297873774e-05, + "loss": 0.0792, + "step": 215 + }, + { + "epoch": 0.3675031901318588, + "grad_norm": 0.23675986028315632, + "learning_rate": 1.4596387031194354e-05, + "loss": 0.0743, + "step": 216 + }, + { + "epoch": 0.36920459378987663, + "grad_norm": 0.223485373234782, + "learning_rate": 1.4547282647556964e-05, + "loss": 0.0929, + "step": 217 + }, + { + "epoch": 0.3709059974478945, + "grad_norm": 0.29133370479541304, + "learning_rate": 1.449803964386706e-05, + "loss": 0.0798, + "step": 218 + }, + { + "epoch": 0.3726074011059124, + "grad_norm": 0.18643880537818264, + "learning_rate": 1.4448659521255823e-05, + "loss": 0.0569, + "step": 219 + }, + { + "epoch": 0.37430880476393025, + "grad_norm": 0.280997156813028, + "learning_rate": 1.4399143785034388e-05, + "loss": 0.0999, + "step": 220 + }, + { + "epoch": 0.3760102084219481, + "grad_norm": 0.24715686808729184, + "learning_rate": 1.4349493944647953e-05, + "loss": 0.0627, + "step": 221 + }, + { + "epoch": 0.37771161207996595, + "grad_norm": 0.2653638113311677, + "learning_rate": 1.4299711513629759e-05, + "loss": 0.0863, + "step": 222 + }, + { + "epoch": 0.37941301573798386, + "grad_norm": 0.3152170570199493, + "learning_rate": 1.4249798009554979e-05, + "loss": 0.0962, + "step": 223 + }, + { + "epoch": 0.3811144193960017, + "grad_norm": 0.25435032990077316, + "learning_rate": 1.419975495399442e-05, + "loss": 0.0937, + "step": 224 + }, + { + "epoch": 0.38281582305401957, + "grad_norm": 0.15047418159614007, + "learning_rate": 1.4149583872468165e-05, + "loss": 0.0482, + "step": 225 + }, + { + "epoch": 0.3845172267120374, + "grad_norm": 0.15295980990137148, + "learning_rate": 1.4099286294399051e-05, + "loss": 0.0382, + "step": 226 + }, + { + "epoch": 0.38621863037005527, + "grad_norm": 0.2533178513009288, + "learning_rate": 1.404886375306607e-05, + "loss": 0.0948, + "step": 227 + }, + { + "epoch": 0.3879200340280732, + "grad_norm": 0.23107415073266793, + "learning_rate": 1.3998317785557597e-05, + "loss": 0.0556, + "step": 228 + }, + { + "epoch": 0.38962143768609103, + "grad_norm": 0.22917339521301278, + "learning_rate": 1.3947649932724563e-05, + "loss": 0.0843, + "step": 229 + }, + { + "epoch": 0.3913228413441089, + "grad_norm": 0.17860874606051796, + "learning_rate": 1.3896861739133456e-05, + "loss": 0.0488, + "step": 230 + }, + { + "epoch": 0.39302424500212674, + "grad_norm": 0.20110618464098665, + "learning_rate": 1.384595475301926e-05, + "loss": 0.0707, + "step": 231 + }, + { + "epoch": 0.39472564866014465, + "grad_norm": 0.20490708885008196, + "learning_rate": 1.3794930526238246e-05, + "loss": 0.0638, + "step": 232 + }, + { + "epoch": 0.3964270523181625, + "grad_norm": 0.23510478688343137, + "learning_rate": 1.3743790614220664e-05, + "loss": 0.0795, + "step": 233 + }, + { + "epoch": 0.39812845597618035, + "grad_norm": 0.23100407798838418, + "learning_rate": 1.3692536575923334e-05, + "loss": 0.0761, + "step": 234 + }, + { + "epoch": 0.3998298596341982, + "grad_norm": 0.256927774293719, + "learning_rate": 1.3641169973782117e-05, + "loss": 0.0669, + "step": 235 + }, + { + "epoch": 0.40153126329221606, + "grad_norm": 0.265960997604412, + "learning_rate": 1.3589692373664288e-05, + "loss": 0.0792, + "step": 236 + }, + { + "epoch": 0.40323266695023396, + "grad_norm": 0.2915525993978868, + "learning_rate": 1.3538105344820798e-05, + "loss": 0.0995, + "step": 237 + }, + { + "epoch": 0.4049340706082518, + "grad_norm": 0.2578778351844946, + "learning_rate": 1.3486410459838448e-05, + "loss": 0.0719, + "step": 238 + }, + { + "epoch": 0.40663547426626967, + "grad_norm": 0.23384387017059058, + "learning_rate": 1.343460929459193e-05, + "loss": 0.0712, + "step": 239 + }, + { + "epoch": 0.4083368779242875, + "grad_norm": 0.3019784671101925, + "learning_rate": 1.3382703428195812e-05, + "loss": 0.1115, + "step": 240 + }, + { + "epoch": 0.4100382815823054, + "grad_norm": 0.1352734841435768, + "learning_rate": 1.3330694442956376e-05, + "loss": 0.0464, + "step": 241 + }, + { + "epoch": 0.4117396852403233, + "grad_norm": 0.17202787234184866, + "learning_rate": 1.3278583924323405e-05, + "loss": 0.0454, + "step": 242 + }, + { + "epoch": 0.41344108889834114, + "grad_norm": 0.197390798554472, + "learning_rate": 1.3226373460841835e-05, + "loss": 0.0643, + "step": 243 + }, + { + "epoch": 0.415142492556359, + "grad_norm": 0.21059005106394152, + "learning_rate": 1.3174064644103334e-05, + "loss": 0.0619, + "step": 244 + }, + { + "epoch": 0.41684389621437684, + "grad_norm": 0.1722308196492277, + "learning_rate": 1.3121659068697797e-05, + "loss": 0.0454, + "step": 245 + }, + { + "epoch": 0.41854529987239475, + "grad_norm": 0.32958443926088915, + "learning_rate": 1.306915833216471e-05, + "loss": 0.1037, + "step": 246 + }, + { + "epoch": 0.4202467035304126, + "grad_norm": 0.2544357662171513, + "learning_rate": 1.3016564034944473e-05, + "loss": 0.0706, + "step": 247 + }, + { + "epoch": 0.42194810718843045, + "grad_norm": 0.2585510606952109, + "learning_rate": 1.29638777803296e-05, + "loss": 0.0825, + "step": 248 + }, + { + "epoch": 0.4236495108464483, + "grad_norm": 0.20275321109441732, + "learning_rate": 1.2911101174415861e-05, + "loss": 0.0526, + "step": 249 + }, + { + "epoch": 0.42535091450446616, + "grad_norm": 0.233981891948638, + "learning_rate": 1.2858235826053294e-05, + "loss": 0.0695, + "step": 250 + }, + { + "epoch": 0.42705231816248407, + "grad_norm": 0.21493578674491315, + "learning_rate": 1.2805283346797179e-05, + "loss": 0.0653, + "step": 251 + }, + { + "epoch": 0.4287537218205019, + "grad_norm": 0.24141724836500014, + "learning_rate": 1.2752245350858905e-05, + "loss": 0.0797, + "step": 252 + }, + { + "epoch": 0.4304551254785198, + "grad_norm": 0.1489771140649588, + "learning_rate": 1.2699123455056777e-05, + "loss": 0.03, + "step": 253 + }, + { + "epoch": 0.4321565291365376, + "grad_norm": 0.24828101075324488, + "learning_rate": 1.26459192787667e-05, + "loss": 0.0819, + "step": 254 + }, + { + "epoch": 0.43385793279455553, + "grad_norm": 0.17372013690514643, + "learning_rate": 1.2592634443872842e-05, + "loss": 0.0461, + "step": 255 + }, + { + "epoch": 0.4355593364525734, + "grad_norm": 0.2764346314356569, + "learning_rate": 1.2539270574718172e-05, + "loss": 0.0806, + "step": 256 + }, + { + "epoch": 0.43726074011059124, + "grad_norm": 0.29987546473911214, + "learning_rate": 1.2485829298054952e-05, + "loss": 0.0846, + "step": 257 + }, + { + "epoch": 0.4389621437686091, + "grad_norm": 0.31175687442320515, + "learning_rate": 1.2432312242995158e-05, + "loss": 0.0971, + "step": 258 + }, + { + "epoch": 0.44066354742662694, + "grad_norm": 0.3009904680059143, + "learning_rate": 1.2378721040960788e-05, + "loss": 0.0994, + "step": 259 + }, + { + "epoch": 0.44236495108464485, + "grad_norm": 0.2562384582969849, + "learning_rate": 1.232505732563416e-05, + "loss": 0.0759, + "step": 260 + }, + { + "epoch": 0.4440663547426627, + "grad_norm": 0.2550190452410635, + "learning_rate": 1.2271322732908091e-05, + "loss": 0.0733, + "step": 261 + }, + { + "epoch": 0.44576775840068056, + "grad_norm": 0.2515270792806656, + "learning_rate": 1.2217518900836045e-05, + "loss": 0.0708, + "step": 262 + }, + { + "epoch": 0.4474691620586984, + "grad_norm": 0.18357019713578807, + "learning_rate": 1.2163647469582181e-05, + "loss": 0.0515, + "step": 263 + }, + { + "epoch": 0.44917056571671626, + "grad_norm": 0.2671034404389676, + "learning_rate": 1.210971008137136e-05, + "loss": 0.0825, + "step": 264 + }, + { + "epoch": 0.45087196937473417, + "grad_norm": 0.2571681277129728, + "learning_rate": 1.2055708380439089e-05, + "loss": 0.1042, + "step": 265 + }, + { + "epoch": 0.452573373032752, + "grad_norm": 0.21793020282041717, + "learning_rate": 1.2001644012981392e-05, + "loss": 0.0672, + "step": 266 + }, + { + "epoch": 0.4542747766907699, + "grad_norm": 0.3515840793799933, + "learning_rate": 1.1947518627104637e-05, + "loss": 0.1232, + "step": 267 + }, + { + "epoch": 0.45597618034878773, + "grad_norm": 0.22270456303325817, + "learning_rate": 1.1893333872775275e-05, + "loss": 0.084, + "step": 268 + }, + { + "epoch": 0.45767758400680564, + "grad_norm": 0.1770352144116934, + "learning_rate": 1.1839091401769559e-05, + "loss": 0.0435, + "step": 269 + }, + { + "epoch": 0.4593789876648235, + "grad_norm": 0.19476419487806804, + "learning_rate": 1.1784792867623179e-05, + "loss": 0.0535, + "step": 270 + }, + { + "epoch": 0.46108039132284134, + "grad_norm": 0.2961488962332661, + "learning_rate": 1.1730439925580876e-05, + "loss": 0.1054, + "step": 271 + }, + { + "epoch": 0.4627817949808592, + "grad_norm": 0.23209294477341702, + "learning_rate": 1.1676034232545963e-05, + "loss": 0.0898, + "step": 272 + }, + { + "epoch": 0.46448319863887705, + "grad_norm": 0.3192995286716727, + "learning_rate": 1.1621577447029816e-05, + "loss": 0.0864, + "step": 273 + }, + { + "epoch": 0.46618460229689496, + "grad_norm": 0.22033834311649156, + "learning_rate": 1.1567071229101332e-05, + "loss": 0.061, + "step": 274 + }, + { + "epoch": 0.4678860059549128, + "grad_norm": 0.24431011933295152, + "learning_rate": 1.1512517240336304e-05, + "loss": 0.05, + "step": 275 + }, + { + "epoch": 0.46958740961293066, + "grad_norm": 0.2501002015877452, + "learning_rate": 1.1457917143766786e-05, + "loss": 0.0811, + "step": 276 + }, + { + "epoch": 0.4712888132709485, + "grad_norm": 0.15591637050035256, + "learning_rate": 1.1403272603830384e-05, + "loss": 0.0439, + "step": 277 + }, + { + "epoch": 0.4729902169289664, + "grad_norm": 0.23445891677475122, + "learning_rate": 1.1348585286319529e-05, + "loss": 0.0562, + "step": 278 + }, + { + "epoch": 0.4746916205869843, + "grad_norm": 0.22953431606642624, + "learning_rate": 1.1293856858330678e-05, + "loss": 0.0712, + "step": 279 + }, + { + "epoch": 0.47639302424500213, + "grad_norm": 0.24410355979016798, + "learning_rate": 1.1239088988213522e-05, + "loss": 0.0652, + "step": 280 + }, + { + "epoch": 0.47809442790302, + "grad_norm": 0.18183702432279936, + "learning_rate": 1.11842833455201e-05, + "loss": 0.0464, + "step": 281 + }, + { + "epoch": 0.47979583156103783, + "grad_norm": 0.3068246013758465, + "learning_rate": 1.1129441600953916e-05, + "loss": 0.101, + "step": 282 + }, + { + "epoch": 0.48149723521905574, + "grad_norm": 0.2725071360381625, + "learning_rate": 1.1074565426319014e-05, + "loss": 0.0906, + "step": 283 + }, + { + "epoch": 0.4831986388770736, + "grad_norm": 0.23078769715046143, + "learning_rate": 1.101965649446901e-05, + "loss": 0.0659, + "step": 284 + }, + { + "epoch": 0.48490004253509145, + "grad_norm": 0.24607556757801813, + "learning_rate": 1.0964716479256094e-05, + "loss": 0.0581, + "step": 285 + }, + { + "epoch": 0.4866014461931093, + "grad_norm": 0.2039783514676341, + "learning_rate": 1.0909747055480004e-05, + "loss": 0.042, + "step": 286 + }, + { + "epoch": 0.4883028498511272, + "grad_norm": 0.2132647216715679, + "learning_rate": 1.0854749898836974e-05, + "loss": 0.042, + "step": 287 + }, + { + "epoch": 0.49000425350914506, + "grad_norm": 0.17146104368359683, + "learning_rate": 1.0799726685868648e-05, + "loss": 0.0486, + "step": 288 + }, + { + "epoch": 0.4917056571671629, + "grad_norm": 0.3101626941591736, + "learning_rate": 1.0744679093910987e-05, + "loss": 0.0855, + "step": 289 + }, + { + "epoch": 0.49340706082518077, + "grad_norm": 0.26359884054130955, + "learning_rate": 1.0689608801043107e-05, + "loss": 0.0671, + "step": 290 + }, + { + "epoch": 0.4951084644831986, + "grad_norm": 0.21556126895353891, + "learning_rate": 1.063451748603616e-05, + "loss": 0.076, + "step": 291 + }, + { + "epoch": 0.4968098681412165, + "grad_norm": 0.27181671931172413, + "learning_rate": 1.0579406828302124e-05, + "loss": 0.0847, + "step": 292 + }, + { + "epoch": 0.4985112717992344, + "grad_norm": 0.31490941982013704, + "learning_rate": 1.0524278507842637e-05, + "loss": 0.1254, + "step": 293 + }, + { + "epoch": 0.5002126754572522, + "grad_norm": 0.28738576717755915, + "learning_rate": 1.0469134205197762e-05, + "loss": 0.0741, + "step": 294 + }, + { + "epoch": 0.5019140791152701, + "grad_norm": 0.24967744618311208, + "learning_rate": 1.0413975601394765e-05, + "loss": 0.0952, + "step": 295 + }, + { + "epoch": 0.5036154827732879, + "grad_norm": 0.18653535652798736, + "learning_rate": 1.0358804377896876e-05, + "loss": 0.0666, + "step": 296 + }, + { + "epoch": 0.5053168864313058, + "grad_norm": 0.31524703768298645, + "learning_rate": 1.0303622216552022e-05, + "loss": 0.0821, + "step": 297 + }, + { + "epoch": 0.5070182900893236, + "grad_norm": 0.19693363449938797, + "learning_rate": 1.0248430799541564e-05, + "loss": 0.0486, + "step": 298 + }, + { + "epoch": 0.5087196937473416, + "grad_norm": 0.3100958390915376, + "learning_rate": 1.019323180932901e-05, + "loss": 0.075, + "step": 299 + }, + { + "epoch": 0.5104210974053595, + "grad_norm": 0.2263867395185222, + "learning_rate": 1.013802692860873e-05, + "loss": 0.0729, + "step": 300 + }, + { + "epoch": 0.5121225010633773, + "grad_norm": 0.30499024558777427, + "learning_rate": 1.0082817840254667e-05, + "loss": 0.0949, + "step": 301 + }, + { + "epoch": 0.5138239047213952, + "grad_norm": 0.27412297007732506, + "learning_rate": 1.0027606227269026e-05, + "loss": 0.0711, + "step": 302 + }, + { + "epoch": 0.515525308379413, + "grad_norm": 0.2236619712267448, + "learning_rate": 9.972393772730975e-06, + "loss": 0.0711, + "step": 303 + }, + { + "epoch": 0.5172267120374309, + "grad_norm": 0.21645526531787626, + "learning_rate": 9.917182159745335e-06, + "loss": 0.0696, + "step": 304 + }, + { + "epoch": 0.5189281156954487, + "grad_norm": 0.19297627616781193, + "learning_rate": 9.861973071391272e-06, + "loss": 0.0723, + "step": 305 + }, + { + "epoch": 0.5206295193534666, + "grad_norm": 0.17935274621615926, + "learning_rate": 9.806768190670994e-06, + "loss": 0.0603, + "step": 306 + }, + { + "epoch": 0.5223309230114844, + "grad_norm": 0.36516731843883593, + "learning_rate": 9.751569200458438e-06, + "loss": 0.1183, + "step": 307 + }, + { + "epoch": 0.5240323266695024, + "grad_norm": 0.2666543854374252, + "learning_rate": 9.69637778344798e-06, + "loss": 0.0683, + "step": 308 + }, + { + "epoch": 0.5257337303275202, + "grad_norm": 0.1559589176353223, + "learning_rate": 9.641195622103126e-06, + "loss": 0.0457, + "step": 309 + }, + { + "epoch": 0.5274351339855381, + "grad_norm": 0.21822959242881637, + "learning_rate": 9.586024398605238e-06, + "loss": 0.0728, + "step": 310 + }, + { + "epoch": 0.529136537643556, + "grad_norm": 0.1910067933489864, + "learning_rate": 9.530865794802243e-06, + "loss": 0.0518, + "step": 311 + }, + { + "epoch": 0.5308379413015738, + "grad_norm": 0.2286981911174439, + "learning_rate": 9.475721492157365e-06, + "loss": 0.0538, + "step": 312 + }, + { + "epoch": 0.5325393449595917, + "grad_norm": 0.2649695673029832, + "learning_rate": 9.420593171697876e-06, + "loss": 0.086, + "step": 313 + }, + { + "epoch": 0.5342407486176095, + "grad_norm": 0.2420342613285877, + "learning_rate": 9.365482513963844e-06, + "loss": 0.0972, + "step": 314 + }, + { + "epoch": 0.5359421522756274, + "grad_norm": 0.2855237701478863, + "learning_rate": 9.310391198956896e-06, + "loss": 0.0795, + "step": 315 + }, + { + "epoch": 0.5376435559336452, + "grad_norm": 0.19062890218128994, + "learning_rate": 9.255320906089017e-06, + "loss": 0.0385, + "step": 316 + }, + { + "epoch": 0.5393449595916632, + "grad_norm": 0.12818263127205254, + "learning_rate": 9.200273314131356e-06, + "loss": 0.0358, + "step": 317 + }, + { + "epoch": 0.541046363249681, + "grad_norm": 0.21271005750639677, + "learning_rate": 9.145250101163032e-06, + "loss": 0.0511, + "step": 318 + }, + { + "epoch": 0.5427477669076989, + "grad_norm": 0.3230145385857894, + "learning_rate": 9.090252944520002e-06, + "loss": 0.1249, + "step": 319 + }, + { + "epoch": 0.5444491705657167, + "grad_norm": 0.17129666760859652, + "learning_rate": 9.035283520743911e-06, + "loss": 0.0473, + "step": 320 + }, + { + "epoch": 0.5461505742237346, + "grad_norm": 0.25326344089197617, + "learning_rate": 8.980343505530988e-06, + "loss": 0.0613, + "step": 321 + }, + { + "epoch": 0.5478519778817524, + "grad_norm": 0.2847520686151069, + "learning_rate": 8.925434573680986e-06, + "loss": 0.0883, + "step": 322 + }, + { + "epoch": 0.5495533815397703, + "grad_norm": 0.358576412450163, + "learning_rate": 8.870558399046086e-06, + "loss": 0.1097, + "step": 323 + }, + { + "epoch": 0.5512547851977881, + "grad_norm": 0.21816479879448794, + "learning_rate": 8.815716654479903e-06, + "loss": 0.0766, + "step": 324 + }, + { + "epoch": 0.552956188855806, + "grad_norm": 0.27098416317022683, + "learning_rate": 8.76091101178648e-06, + "loss": 0.0959, + "step": 325 + }, + { + "epoch": 0.5546575925138238, + "grad_norm": 0.16228239529758662, + "learning_rate": 8.706143141669324e-06, + "loss": 0.0427, + "step": 326 + }, + { + "epoch": 0.5563589961718418, + "grad_norm": 0.19457073616768888, + "learning_rate": 8.651414713680474e-06, + "loss": 0.0674, + "step": 327 + }, + { + "epoch": 0.5580603998298597, + "grad_norm": 0.21506185063350097, + "learning_rate": 8.59672739616962e-06, + "loss": 0.0725, + "step": 328 + }, + { + "epoch": 0.5597618034878775, + "grad_norm": 0.25928980929110046, + "learning_rate": 8.542082856233216e-06, + "loss": 0.0926, + "step": 329 + }, + { + "epoch": 0.5614632071458954, + "grad_norm": 0.19301809133671421, + "learning_rate": 8.487482759663696e-06, + "loss": 0.0661, + "step": 330 + }, + { + "epoch": 0.5631646108039132, + "grad_norm": 0.19336796199191325, + "learning_rate": 8.43292877089867e-06, + "loss": 0.0694, + "step": 331 + }, + { + "epoch": 0.5648660144619311, + "grad_norm": 0.2300001624245782, + "learning_rate": 8.378422552970185e-06, + "loss": 0.0746, + "step": 332 + }, + { + "epoch": 0.5665674181199489, + "grad_norm": 0.23998094239502984, + "learning_rate": 8.32396576745404e-06, + "loss": 0.0696, + "step": 333 + }, + { + "epoch": 0.5682688217779668, + "grad_norm": 0.2052361707208072, + "learning_rate": 8.269560074419126e-06, + "loss": 0.0624, + "step": 334 + }, + { + "epoch": 0.5699702254359846, + "grad_norm": 0.2962367385563096, + "learning_rate": 8.215207132376824e-06, + "loss": 0.124, + "step": 335 + }, + { + "epoch": 0.5716716290940026, + "grad_norm": 0.24752974776344203, + "learning_rate": 8.160908598230448e-06, + "loss": 0.0653, + "step": 336 + }, + { + "epoch": 0.5733730327520205, + "grad_norm": 0.18127533151541284, + "learning_rate": 8.10666612722473e-06, + "loss": 0.0591, + "step": 337 + }, + { + "epoch": 0.5750744364100383, + "grad_norm": 0.18346580605719615, + "learning_rate": 8.052481372895363e-06, + "loss": 0.0488, + "step": 338 + }, + { + "epoch": 0.5767758400680562, + "grad_norm": 0.31877710293947625, + "learning_rate": 7.998355987018606e-06, + "loss": 0.0872, + "step": 339 + }, + { + "epoch": 0.578477243726074, + "grad_norm": 0.1344651628761348, + "learning_rate": 7.944291619560914e-06, + "loss": 0.0403, + "step": 340 + }, + { + "epoch": 0.5801786473840919, + "grad_norm": 0.13295165420726127, + "learning_rate": 7.890289918628644e-06, + "loss": 0.0476, + "step": 341 + }, + { + "epoch": 0.5818800510421097, + "grad_norm": 0.27781942031149137, + "learning_rate": 7.836352530417824e-06, + "loss": 0.0925, + "step": 342 + }, + { + "epoch": 0.5835814547001276, + "grad_norm": 0.2923332990407699, + "learning_rate": 7.782481099163958e-06, + "loss": 0.1173, + "step": 343 + }, + { + "epoch": 0.5852828583581454, + "grad_norm": 0.20398201081622527, + "learning_rate": 7.728677267091912e-06, + "loss": 0.0712, + "step": 344 + }, + { + "epoch": 0.5869842620161634, + "grad_norm": 0.1827445671136079, + "learning_rate": 7.674942674365847e-06, + "loss": 0.0588, + "step": 345 + }, + { + "epoch": 0.5886856656741812, + "grad_norm": 0.16240437969547905, + "learning_rate": 7.621278959039217e-06, + "loss": 0.0637, + "step": 346 + }, + { + "epoch": 0.5903870693321991, + "grad_norm": 0.16062106806552065, + "learning_rate": 7.567687757004843e-06, + "loss": 0.0414, + "step": 347 + }, + { + "epoch": 0.5920884729902169, + "grad_norm": 0.2689581192862772, + "learning_rate": 7.514170701945047e-06, + "loss": 0.0897, + "step": 348 + }, + { + "epoch": 0.5937898766482348, + "grad_norm": 0.2681187781079101, + "learning_rate": 7.460729425281831e-06, + "loss": 0.0709, + "step": 349 + }, + { + "epoch": 0.5954912803062526, + "grad_norm": 0.17526558434710648, + "learning_rate": 7.407365556127162e-06, + "loss": 0.0539, + "step": 350 + }, + { + "epoch": 0.5971926839642705, + "grad_norm": 0.1648773603270098, + "learning_rate": 7.354080721233303e-06, + "loss": 0.0503, + "step": 351 + }, + { + "epoch": 0.5988940876222884, + "grad_norm": 0.17999080092184985, + "learning_rate": 7.300876544943227e-06, + "loss": 0.0605, + "step": 352 + }, + { + "epoch": 0.6005954912803062, + "grad_norm": 0.20308998126444186, + "learning_rate": 7.247754649141097e-06, + "loss": 0.0769, + "step": 353 + }, + { + "epoch": 0.6022968949383242, + "grad_norm": 0.16018163485236867, + "learning_rate": 7.194716653202826e-06, + "loss": 0.0545, + "step": 354 + }, + { + "epoch": 0.603998298596342, + "grad_norm": 0.24744753769790884, + "learning_rate": 7.1417641739467104e-06, + "loss": 0.0776, + "step": 355 + }, + { + "epoch": 0.6056997022543599, + "grad_norm": 0.2298041880240223, + "learning_rate": 7.088898825584139e-06, + "loss": 0.0674, + "step": 356 + }, + { + "epoch": 0.6074011059123777, + "grad_norm": 0.19841362318124559, + "learning_rate": 7.036122219670398e-06, + "loss": 0.0635, + "step": 357 + }, + { + "epoch": 0.6091025095703956, + "grad_norm": 0.18877644807321198, + "learning_rate": 6.9834359650555305e-06, + "loss": 0.0777, + "step": 358 + }, + { + "epoch": 0.6108039132284134, + "grad_norm": 0.16102641349173863, + "learning_rate": 6.930841667835295e-06, + "loss": 0.0576, + "step": 359 + }, + { + "epoch": 0.6125053168864313, + "grad_norm": 0.20224797100905906, + "learning_rate": 6.878340931302208e-06, + "loss": 0.0754, + "step": 360 + }, + { + "epoch": 0.6142067205444491, + "grad_norm": 0.2857194289415506, + "learning_rate": 6.825935355896669e-06, + "loss": 0.1052, + "step": 361 + }, + { + "epoch": 0.615908124202467, + "grad_norm": 0.20616924754434873, + "learning_rate": 6.773626539158171e-06, + "loss": 0.0716, + "step": 362 + }, + { + "epoch": 0.617609527860485, + "grad_norm": 0.23846455066099467, + "learning_rate": 6.721416075676601e-06, + "loss": 0.0847, + "step": 363 + }, + { + "epoch": 0.6193109315185028, + "grad_norm": 0.14989055759308637, + "learning_rate": 6.669305557043626e-06, + "loss": 0.0371, + "step": 364 + }, + { + "epoch": 0.6210123351765207, + "grad_norm": 0.25146318527723016, + "learning_rate": 6.617296571804191e-06, + "loss": 0.0938, + "step": 365 + }, + { + "epoch": 0.6227137388345385, + "grad_norm": 0.2795331744757292, + "learning_rate": 6.565390705408072e-06, + "loss": 0.0503, + "step": 366 + }, + { + "epoch": 0.6244151424925564, + "grad_norm": 0.17139219570184439, + "learning_rate": 6.513589540161556e-06, + "loss": 0.0578, + "step": 367 + }, + { + "epoch": 0.6261165461505742, + "grad_norm": 0.157560985399721, + "learning_rate": 6.461894655179204e-06, + "loss": 0.0582, + "step": 368 + }, + { + "epoch": 0.6278179498085921, + "grad_norm": 0.22752999438352467, + "learning_rate": 6.410307626335717e-06, + "loss": 0.0779, + "step": 369 + }, + { + "epoch": 0.6295193534666099, + "grad_norm": 0.27521908636927156, + "learning_rate": 6.358830026217887e-06, + "loss": 0.0826, + "step": 370 + }, + { + "epoch": 0.6312207571246278, + "grad_norm": 0.21611309006232896, + "learning_rate": 6.30746342407667e-06, + "loss": 0.0575, + "step": 371 + }, + { + "epoch": 0.6329221607826457, + "grad_norm": 0.2022628238984182, + "learning_rate": 6.256209385779341e-06, + "loss": 0.0597, + "step": 372 + }, + { + "epoch": 0.6346235644406636, + "grad_norm": 0.20087283890633761, + "learning_rate": 6.205069473761756e-06, + "loss": 0.0565, + "step": 373 + }, + { + "epoch": 0.6363249680986814, + "grad_norm": 0.27973559478745097, + "learning_rate": 6.154045246980742e-06, + "loss": 0.0777, + "step": 374 + }, + { + "epoch": 0.6380263717566993, + "grad_norm": 0.2153647046344647, + "learning_rate": 6.1031382608665456e-06, + "loss": 0.065, + "step": 375 + }, + { + "epoch": 0.6397277754147171, + "grad_norm": 0.15832658385378948, + "learning_rate": 6.052350067275441e-06, + "loss": 0.0463, + "step": 376 + }, + { + "epoch": 0.641429179072735, + "grad_norm": 0.24889956825525697, + "learning_rate": 6.001682214442406e-06, + "loss": 0.0868, + "step": 377 + }, + { + "epoch": 0.6431305827307529, + "grad_norm": 0.2537522589782198, + "learning_rate": 5.951136246933933e-06, + "loss": 0.0771, + "step": 378 + }, + { + "epoch": 0.6448319863887707, + "grad_norm": 0.35384965176549915, + "learning_rate": 5.900713705600951e-06, + "loss": 0.0885, + "step": 379 + }, + { + "epoch": 0.6465333900467886, + "grad_norm": 0.24583176378622248, + "learning_rate": 5.850416127531841e-06, + "loss": 0.076, + "step": 380 + }, + { + "epoch": 0.6482347937048064, + "grad_norm": 0.19401849479737754, + "learning_rate": 5.800245046005585e-06, + "loss": 0.055, + "step": 381 + }, + { + "epoch": 0.6499361973628244, + "grad_norm": 0.22292321754995165, + "learning_rate": 5.750201990445024e-06, + "loss": 0.0837, + "step": 382 + }, + { + "epoch": 0.6516376010208422, + "grad_norm": 0.15980625639550533, + "learning_rate": 5.70028848637024e-06, + "loss": 0.053, + "step": 383 + }, + { + "epoch": 0.6533390046788601, + "grad_norm": 0.17476218437373806, + "learning_rate": 5.650506055352052e-06, + "loss": 0.047, + "step": 384 + }, + { + "epoch": 0.6550404083368779, + "grad_norm": 0.26159431180356163, + "learning_rate": 5.600856214965613e-06, + "loss": 0.075, + "step": 385 + }, + { + "epoch": 0.6567418119948958, + "grad_norm": 0.24983048217170784, + "learning_rate": 5.551340478744176e-06, + "loss": 0.0819, + "step": 386 + }, + { + "epoch": 0.6584432156529136, + "grad_norm": 0.277677983790708, + "learning_rate": 5.501960356132945e-06, + "loss": 0.0743, + "step": 387 + }, + { + "epoch": 0.6601446193109315, + "grad_norm": 0.2687619515031017, + "learning_rate": 5.4527173524430395e-06, + "loss": 0.076, + "step": 388 + }, + { + "epoch": 0.6618460229689493, + "grad_norm": 0.18825889492381687, + "learning_rate": 5.403612968805649e-06, + "loss": 0.0533, + "step": 389 + }, + { + "epoch": 0.6635474266269672, + "grad_norm": 0.22722813251197366, + "learning_rate": 5.354648702126229e-06, + "loss": 0.0669, + "step": 390 + }, + { + "epoch": 0.6652488302849852, + "grad_norm": 0.15428942499610793, + "learning_rate": 5.305826045038899e-06, + "loss": 0.0496, + "step": 391 + }, + { + "epoch": 0.666950233943003, + "grad_norm": 0.29481966419649847, + "learning_rate": 5.257146485860927e-06, + "loss": 0.0871, + "step": 392 + }, + { + "epoch": 0.6686516376010209, + "grad_norm": 0.2132988743676148, + "learning_rate": 5.208611508547367e-06, + "loss": 0.072, + "step": 393 + }, + { + "epoch": 0.6703530412590387, + "grad_norm": 0.1932211736507852, + "learning_rate": 5.160222592645808e-06, + "loss": 0.0672, + "step": 394 + }, + { + "epoch": 0.6720544449170566, + "grad_norm": 0.32017863808058095, + "learning_rate": 5.111981213251293e-06, + "loss": 0.0996, + "step": 395 + }, + { + "epoch": 0.6737558485750744, + "grad_norm": 0.33076228774617505, + "learning_rate": 5.063888840961325e-06, + "loss": 0.1062, + "step": 396 + }, + { + "epoch": 0.6754572522330923, + "grad_norm": 0.2152145333210106, + "learning_rate": 5.015946941831064e-06, + "loss": 0.0682, + "step": 397 + }, + { + "epoch": 0.6771586558911101, + "grad_norm": 0.21543841192984545, + "learning_rate": 4.968156977328626e-06, + "loss": 0.0572, + "step": 398 + }, + { + "epoch": 0.678860059549128, + "grad_norm": 0.16950132912260057, + "learning_rate": 4.920520404290512e-06, + "loss": 0.0577, + "step": 399 + }, + { + "epoch": 0.680561463207146, + "grad_norm": 0.20787322030508298, + "learning_rate": 4.87303867487723e-06, + "loss": 0.0561, + "step": 400 + }, + { + "epoch": 0.6822628668651638, + "grad_norm": 0.1533621298140527, + "learning_rate": 4.825713236529005e-06, + "loss": 0.0435, + "step": 401 + }, + { + "epoch": 0.6839642705231816, + "grad_norm": 0.18296820958014204, + "learning_rate": 4.778545531921668e-06, + "loss": 0.0538, + "step": 402 + }, + { + "epoch": 0.6856656741811995, + "grad_norm": 0.24677189398080018, + "learning_rate": 4.731536998922657e-06, + "loss": 0.0715, + "step": 403 + }, + { + "epoch": 0.6873670778392174, + "grad_norm": 0.18381971512083234, + "learning_rate": 4.684689070547216e-06, + "loss": 0.0589, + "step": 404 + }, + { + "epoch": 0.6890684814972352, + "grad_norm": 0.19563535366138982, + "learning_rate": 4.638003174914675e-06, + "loss": 0.0375, + "step": 405 + }, + { + "epoch": 0.6907698851552531, + "grad_norm": 0.27920320369616836, + "learning_rate": 4.591480735204953e-06, + "loss": 0.0657, + "step": 406 + }, + { + "epoch": 0.6924712888132709, + "grad_norm": 0.22501473081164228, + "learning_rate": 4.545123169615134e-06, + "loss": 0.0754, + "step": 407 + }, + { + "epoch": 0.6941726924712888, + "grad_norm": 0.18685344173190274, + "learning_rate": 4.49893189131627e-06, + "loss": 0.0753, + "step": 408 + }, + { + "epoch": 0.6958740961293067, + "grad_norm": 0.17185385088024444, + "learning_rate": 4.45290830841028e-06, + "loss": 0.0514, + "step": 409 + }, + { + "epoch": 0.6975754997873246, + "grad_norm": 0.2702549233525611, + "learning_rate": 4.407053823887033e-06, + "loss": 0.0833, + "step": 410 + }, + { + "epoch": 0.6992769034453424, + "grad_norm": 0.23362806883478313, + "learning_rate": 4.361369835581569e-06, + "loss": 0.0769, + "step": 411 + }, + { + "epoch": 0.7009783071033603, + "grad_norm": 0.2101592538580294, + "learning_rate": 4.315857736131508e-06, + "loss": 0.0602, + "step": 412 + }, + { + "epoch": 0.7026797107613781, + "grad_norm": 0.20842960868238944, + "learning_rate": 4.2705189129345814e-06, + "loss": 0.074, + "step": 413 + }, + { + "epoch": 0.704381114419396, + "grad_norm": 0.18803427484767865, + "learning_rate": 4.225354748106328e-06, + "loss": 0.07, + "step": 414 + }, + { + "epoch": 0.7060825180774138, + "grad_norm": 0.3066569805512131, + "learning_rate": 4.180366618437996e-06, + "loss": 0.093, + "step": 415 + }, + { + "epoch": 0.7077839217354317, + "grad_norm": 0.1744953221856188, + "learning_rate": 4.13555589535453e-06, + "loss": 0.0555, + "step": 416 + }, + { + "epoch": 0.7094853253934496, + "grad_norm": 0.166243605934049, + "learning_rate": 4.0909239448727985e-06, + "loss": 0.061, + "step": 417 + }, + { + "epoch": 0.7111867290514675, + "grad_norm": 0.20351482232627222, + "learning_rate": 4.046472127559937e-06, + "loss": 0.0715, + "step": 418 + }, + { + "epoch": 0.7128881327094854, + "grad_norm": 0.21225234915881963, + "learning_rate": 4.002201798491875e-06, + "loss": 0.0502, + "step": 419 + }, + { + "epoch": 0.7145895363675032, + "grad_norm": 0.20113105383651986, + "learning_rate": 3.958114307212018e-06, + "loss": 0.0645, + "step": 420 + }, + { + "epoch": 0.7162909400255211, + "grad_norm": 0.18912603242706336, + "learning_rate": 3.91421099769013e-06, + "loss": 0.0642, + "step": 421 + }, + { + "epoch": 0.7179923436835389, + "grad_norm": 0.1879026045376002, + "learning_rate": 3.870493208281337e-06, + "loss": 0.0479, + "step": 422 + }, + { + "epoch": 0.7196937473415568, + "grad_norm": 0.26477442490530756, + "learning_rate": 3.826962271685351e-06, + "loss": 0.0831, + "step": 423 + }, + { + "epoch": 0.7213951509995746, + "grad_norm": 0.29428954187807327, + "learning_rate": 3.7836195149058386e-06, + "loss": 0.0724, + "step": 424 + }, + { + "epoch": 0.7230965546575925, + "grad_norm": 0.23675032676032767, + "learning_rate": 3.7404662592099483e-06, + "loss": 0.0854, + "step": 425 + }, + { + "epoch": 0.7247979583156103, + "grad_norm": 0.283635317471, + "learning_rate": 3.697503820088063e-06, + "loss": 0.0805, + "step": 426 + }, + { + "epoch": 0.7264993619736282, + "grad_norm": 0.20299536681350067, + "learning_rate": 3.654733507213678e-06, + "loss": 0.0629, + "step": 427 + }, + { + "epoch": 0.7282007656316462, + "grad_norm": 0.2737835974179114, + "learning_rate": 3.61215662440349e-06, + "loss": 0.0813, + "step": 428 + }, + { + "epoch": 0.729902169289664, + "grad_norm": 0.2292958769762407, + "learning_rate": 3.5697744695776326e-06, + "loss": 0.0625, + "step": 429 + }, + { + "epoch": 0.7316035729476819, + "grad_norm": 0.2690574028374729, + "learning_rate": 3.5275883347201336e-06, + "loss": 0.0895, + "step": 430 + }, + { + "epoch": 0.7333049766056997, + "grad_norm": 0.2612064488237033, + "learning_rate": 3.4855995058395066e-06, + "loss": 0.076, + "step": 431 + }, + { + "epoch": 0.7350063802637176, + "grad_norm": 0.20866850048700003, + "learning_rate": 3.443809262929575e-06, + "loss": 0.0719, + "step": 432 + }, + { + "epoch": 0.7367077839217354, + "grad_norm": 0.2274495669126989, + "learning_rate": 3.4022188799304214e-06, + "loss": 0.0754, + "step": 433 + }, + { + "epoch": 0.7384091875797533, + "grad_norm": 0.21436057034429756, + "learning_rate": 3.36082962468958e-06, + "loss": 0.0634, + "step": 434 + }, + { + "epoch": 0.7401105912377711, + "grad_norm": 0.2122758067179719, + "learning_rate": 3.3196427589233725e-06, + "loss": 0.0605, + "step": 435 + }, + { + "epoch": 0.741811994895789, + "grad_norm": 0.2387395787166068, + "learning_rate": 3.2786595381784512e-06, + "loss": 0.0679, + "step": 436 + }, + { + "epoch": 0.7435133985538069, + "grad_norm": 0.31722612816219875, + "learning_rate": 3.2378812117935154e-06, + "loss": 0.1076, + "step": 437 + }, + { + "epoch": 0.7452148022118248, + "grad_norm": 0.34631520054017056, + "learning_rate": 3.1973090228612404e-06, + "loss": 0.1121, + "step": 438 + }, + { + "epoch": 0.7469162058698426, + "grad_norm": 0.23043404441429802, + "learning_rate": 3.15694420819038e-06, + "loss": 0.0877, + "step": 439 + }, + { + "epoch": 0.7486176095278605, + "grad_norm": 0.09929972754097662, + "learning_rate": 3.116787998268046e-06, + "loss": 0.0281, + "step": 440 + }, + { + "epoch": 0.7503190131858783, + "grad_norm": 0.2540228794638467, + "learning_rate": 3.076841617222228e-06, + "loss": 0.1016, + "step": 441 + }, + { + "epoch": 0.7520204168438962, + "grad_norm": 0.20530486566659917, + "learning_rate": 3.0371062827844434e-06, + "loss": 0.0759, + "step": 442 + }, + { + "epoch": 0.753721820501914, + "grad_norm": 0.22889025759698128, + "learning_rate": 2.997583206252647e-06, + "loss": 0.0641, + "step": 443 + }, + { + "epoch": 0.7554232241599319, + "grad_norm": 0.22376344098617418, + "learning_rate": 2.958273592454285e-06, + "loss": 0.0696, + "step": 444 + }, + { + "epoch": 0.7571246278179498, + "grad_norm": 0.29335139294143503, + "learning_rate": 2.9191786397095778e-06, + "loss": 0.0722, + "step": 445 + }, + { + "epoch": 0.7588260314759677, + "grad_norm": 0.19904638249374088, + "learning_rate": 2.880299539794975e-06, + "loss": 0.0644, + "step": 446 + }, + { + "epoch": 0.7605274351339856, + "grad_norm": 0.27398415500191214, + "learning_rate": 2.841637477906851e-06, + "loss": 0.097, + "step": 447 + }, + { + "epoch": 0.7622288387920034, + "grad_norm": 0.1909061169980495, + "learning_rate": 2.803193632625346e-06, + "loss": 0.0653, + "step": 448 + }, + { + "epoch": 0.7639302424500213, + "grad_norm": 0.1839577194240098, + "learning_rate": 2.7649691758784603e-06, + "loss": 0.0612, + "step": 449 + }, + { + "epoch": 0.7656316461080391, + "grad_norm": 0.18749598930597564, + "learning_rate": 2.7269652729063233e-06, + "loss": 0.0586, + "step": 450 + }, + { + "epoch": 0.767333049766057, + "grad_norm": 0.2550319493358391, + "learning_rate": 2.689183082225659e-06, + "loss": 0.0784, + "step": 451 + }, + { + "epoch": 0.7690344534240748, + "grad_norm": 0.2702034699283639, + "learning_rate": 2.65162375559449e-06, + "loss": 0.1012, + "step": 452 + }, + { + "epoch": 0.7707358570820927, + "grad_norm": 0.22699731806268653, + "learning_rate": 2.614288437977014e-06, + "loss": 0.08, + "step": 453 + }, + { + "epoch": 0.7724372607401105, + "grad_norm": 0.28841442095335584, + "learning_rate": 2.5771782675087078e-06, + "loss": 0.105, + "step": 454 + }, + { + "epoch": 0.7741386643981285, + "grad_norm": 0.2545180037798505, + "learning_rate": 2.5402943754616182e-06, + "loss": 0.0847, + "step": 455 + }, + { + "epoch": 0.7758400680561464, + "grad_norm": 0.16486311867632228, + "learning_rate": 2.5036378862099e-06, + "loss": 0.0409, + "step": 456 + }, + { + "epoch": 0.7775414717141642, + "grad_norm": 0.263761370929647, + "learning_rate": 2.467209917195513e-06, + "loss": 0.096, + "step": 457 + }, + { + "epoch": 0.7792428753721821, + "grad_norm": 0.235530147590817, + "learning_rate": 2.4310115788941855e-06, + "loss": 0.0595, + "step": 458 + }, + { + "epoch": 0.7809442790301999, + "grad_norm": 0.2110709448726579, + "learning_rate": 2.3950439747815357e-06, + "loss": 0.07, + "step": 459 + }, + { + "epoch": 0.7826456826882178, + "grad_norm": 0.1763868737174647, + "learning_rate": 2.359308201299454e-06, + "loss": 0.0586, + "step": 460 + }, + { + "epoch": 0.7843470863462356, + "grad_norm": 0.16676117425431294, + "learning_rate": 2.3238053478226665e-06, + "loss": 0.0492, + "step": 461 + }, + { + "epoch": 0.7860484900042535, + "grad_norm": 0.15717970250735389, + "learning_rate": 2.2885364966255372e-06, + "loss": 0.0487, + "step": 462 + }, + { + "epoch": 0.7877498936622713, + "grad_norm": 0.28197077618286126, + "learning_rate": 2.2535027228490582e-06, + "loss": 0.0857, + "step": 463 + }, + { + "epoch": 0.7894512973202893, + "grad_norm": 0.264862322279995, + "learning_rate": 2.2187050944680942e-06, + "loss": 0.0937, + "step": 464 + }, + { + "epoch": 0.7911527009783071, + "grad_norm": 0.22066167775922854, + "learning_rate": 2.18414467225882e-06, + "loss": 0.0642, + "step": 465 + }, + { + "epoch": 0.792854104636325, + "grad_norm": 0.2250702122829751, + "learning_rate": 2.1498225097663695e-06, + "loss": 0.0831, + "step": 466 + }, + { + "epoch": 0.7945555082943428, + "grad_norm": 0.22295479048611572, + "learning_rate": 2.115739653272747e-06, + "loss": 0.0631, + "step": 467 + }, + { + "epoch": 0.7962569119523607, + "grad_norm": 0.24242984035739493, + "learning_rate": 2.0818971417649013e-06, + "loss": 0.0591, + "step": 468 + }, + { + "epoch": 0.7979583156103786, + "grad_norm": 0.2612637093823693, + "learning_rate": 2.048296006903081e-06, + "loss": 0.1046, + "step": 469 + }, + { + "epoch": 0.7996597192683964, + "grad_norm": 0.1792782441746806, + "learning_rate": 2.0149372729893646e-06, + "loss": 0.0445, + "step": 470 + }, + { + "epoch": 0.8013611229264143, + "grad_norm": 0.29350099593257656, + "learning_rate": 1.981821956936448e-06, + "loss": 0.0804, + "step": 471 + }, + { + "epoch": 0.8030625265844321, + "grad_norm": 0.22341777662676934, + "learning_rate": 1.9489510682366363e-06, + "loss": 0.0745, + "step": 472 + }, + { + "epoch": 0.8047639302424501, + "grad_norm": 0.1589102792801742, + "learning_rate": 1.916325608931079e-06, + "loss": 0.047, + "step": 473 + }, + { + "epoch": 0.8064653339004679, + "grad_norm": 0.2345326238035068, + "learning_rate": 1.8839465735792095e-06, + "loss": 0.0572, + "step": 474 + }, + { + "epoch": 0.8081667375584858, + "grad_norm": 0.24004839314637838, + "learning_rate": 1.8518149492284477e-06, + "loss": 0.0884, + "step": 475 + }, + { + "epoch": 0.8098681412165036, + "grad_norm": 0.29421822191095054, + "learning_rate": 1.8199317153840933e-06, + "loss": 0.0887, + "step": 476 + }, + { + "epoch": 0.8115695448745215, + "grad_norm": 0.29025654033124915, + "learning_rate": 1.7882978439794708e-06, + "loss": 0.1021, + "step": 477 + }, + { + "epoch": 0.8132709485325393, + "grad_norm": 0.25878138649406646, + "learning_rate": 1.756914299346304e-06, + "loss": 0.0616, + "step": 478 + }, + { + "epoch": 0.8149723521905572, + "grad_norm": 0.2102766744469287, + "learning_rate": 1.7257820381853197e-06, + "loss": 0.0627, + "step": 479 + }, + { + "epoch": 0.816673755848575, + "grad_norm": 0.24781694684746497, + "learning_rate": 1.6949020095370816e-06, + "loss": 0.0766, + "step": 480 + }, + { + "epoch": 0.8183751595065929, + "grad_norm": 0.1734085990747018, + "learning_rate": 1.6642751547530512e-06, + "loss": 0.0514, + "step": 481 + }, + { + "epoch": 0.8200765631646108, + "grad_norm": 0.2117204201108364, + "learning_rate": 1.6339024074669107e-06, + "loss": 0.0717, + "step": 482 + }, + { + "epoch": 0.8217779668226287, + "grad_norm": 0.23022449445835655, + "learning_rate": 1.6037846935660807e-06, + "loss": 0.0697, + "step": 483 + }, + { + "epoch": 0.8234793704806466, + "grad_norm": 0.2031147008426011, + "learning_rate": 1.5739229311635152e-06, + "loss": 0.0647, + "step": 484 + }, + { + "epoch": 0.8251807741386644, + "grad_norm": 0.16452534724080284, + "learning_rate": 1.5443180305696948e-06, + "loss": 0.0477, + "step": 485 + }, + { + "epoch": 0.8268821777966823, + "grad_norm": 0.22807646976291562, + "learning_rate": 1.5149708942648922e-06, + "loss": 0.0814, + "step": 486 + }, + { + "epoch": 0.8285835814547001, + "grad_norm": 0.24827387251547514, + "learning_rate": 1.4858824168716524e-06, + "loss": 0.0755, + "step": 487 + }, + { + "epoch": 0.830284985112718, + "grad_norm": 0.22526986180364844, + "learning_rate": 1.4570534851275241e-06, + "loss": 0.076, + "step": 488 + }, + { + "epoch": 0.8319863887707358, + "grad_norm": 0.20155351877240635, + "learning_rate": 1.4284849778580279e-06, + "loss": 0.0698, + "step": 489 + }, + { + "epoch": 0.8336877924287537, + "grad_norm": 0.1495610561199564, + "learning_rate": 1.4001777659498584e-06, + "loss": 0.04, + "step": 490 + }, + { + "epoch": 0.8353891960867715, + "grad_norm": 0.22042874488356587, + "learning_rate": 1.3721327123243533e-06, + "loss": 0.0696, + "step": 491 + }, + { + "epoch": 0.8370905997447895, + "grad_norm": 0.217650029772456, + "learning_rate": 1.3443506719111666e-06, + "loss": 0.0499, + "step": 492 + }, + { + "epoch": 0.8387920034028074, + "grad_norm": 0.28478642607874244, + "learning_rate": 1.3168324916222296e-06, + "loss": 0.1052, + "step": 493 + }, + { + "epoch": 0.8404934070608252, + "grad_norm": 0.2847136340573529, + "learning_rate": 1.28957901032591e-06, + "loss": 0.0772, + "step": 494 + }, + { + "epoch": 0.8421948107188431, + "grad_norm": 0.21335659065873505, + "learning_rate": 1.2625910588214608e-06, + "loss": 0.0651, + "step": 495 + }, + { + "epoch": 0.8438962143768609, + "grad_norm": 0.2007624647622523, + "learning_rate": 1.2358694598136755e-06, + "loss": 0.0579, + "step": 496 + }, + { + "epoch": 0.8455976180348788, + "grad_norm": 0.22052289165556443, + "learning_rate": 1.2094150278878303e-06, + "loss": 0.0564, + "step": 497 + }, + { + "epoch": 0.8472990216928966, + "grad_norm": 0.23003203856097848, + "learning_rate": 1.1832285694848255e-06, + "loss": 0.0604, + "step": 498 + }, + { + "epoch": 0.8490004253509145, + "grad_norm": 0.18674042534277024, + "learning_rate": 1.1573108828766255e-06, + "loss": 0.0442, + "step": 499 + }, + { + "epoch": 0.8507018290089323, + "grad_norm": 0.21647795156393285, + "learning_rate": 1.1316627581419137e-06, + "loss": 0.0535, + "step": 500 + }, + { + "epoch": 0.8524032326669503, + "grad_norm": 0.3119714991626263, + "learning_rate": 1.1062849771420025e-06, + "loss": 0.1191, + "step": 501 + }, + { + "epoch": 0.8541046363249681, + "grad_norm": 0.21164862051641084, + "learning_rate": 1.0811783134970132e-06, + "loss": 0.0658, + "step": 502 + }, + { + "epoch": 0.855806039982986, + "grad_norm": 0.2724719475504451, + "learning_rate": 1.0563435325622762e-06, + "loss": 0.0736, + "step": 503 + }, + { + "epoch": 0.8575074436410038, + "grad_norm": 0.22358105859093347, + "learning_rate": 1.0317813914050157e-06, + "loss": 0.0711, + "step": 504 + }, + { + "epoch": 0.8592088472990217, + "grad_norm": 0.2978912375008609, + "learning_rate": 1.007492638781259e-06, + "loss": 0.0895, + "step": 505 + }, + { + "epoch": 0.8609102509570395, + "grad_norm": 0.209790599151719, + "learning_rate": 9.834780151130196e-07, + "loss": 0.0718, + "step": 506 + }, + { + "epoch": 0.8626116546150574, + "grad_norm": 0.1681086042265431, + "learning_rate": 9.597382524657173e-07, + "loss": 0.0592, + "step": 507 + }, + { + "epoch": 0.8643130582730753, + "grad_norm": 0.20035287400774016, + "learning_rate": 9.362740745258736e-07, + "loss": 0.074, + "step": 508 + }, + { + "epoch": 0.8660144619310931, + "grad_norm": 0.17882197027245497, + "learning_rate": 9.13086196579035e-07, + "loss": 0.0481, + "step": 509 + }, + { + "epoch": 0.8677158655891111, + "grad_norm": 0.2010107765861354, + "learning_rate": 8.901753254879885e-07, + "loss": 0.0599, + "step": 510 + }, + { + "epoch": 0.8694172692471289, + "grad_norm": 0.1784357377756698, + "learning_rate": 8.67542159671192e-07, + "loss": 0.0422, + "step": 511 + }, + { + "epoch": 0.8711186729051468, + "grad_norm": 0.2595041553389473, + "learning_rate": 8.451873890814988e-07, + "loss": 0.0834, + "step": 512 + }, + { + "epoch": 0.8728200765631646, + "grad_norm": 0.18720029506475785, + "learning_rate": 8.231116951851204e-07, + "loss": 0.0441, + "step": 513 + }, + { + "epoch": 0.8745214802211825, + "grad_norm": 0.13605762437865937, + "learning_rate": 8.013157509408509e-07, + "loss": 0.0499, + "step": 514 + }, + { + "epoch": 0.8762228838792003, + "grad_norm": 0.3392074002609155, + "learning_rate": 7.79800220779554e-07, + "loss": 0.0935, + "step": 515 + }, + { + "epoch": 0.8779242875372182, + "grad_norm": 0.27563422248531844, + "learning_rate": 7.585657605839059e-07, + "loss": 0.0749, + "step": 516 + }, + { + "epoch": 0.879625691195236, + "grad_norm": 0.2928759924757975, + "learning_rate": 7.376130176684082e-07, + "loss": 0.107, + "step": 517 + }, + { + "epoch": 0.8813270948532539, + "grad_norm": 0.20820494565138964, + "learning_rate": 7.169426307596428e-07, + "loss": 0.0711, + "step": 518 + }, + { + "epoch": 0.8830284985112719, + "grad_norm": 0.1778541297140114, + "learning_rate": 6.965552299768186e-07, + "loss": 0.0548, + "step": 519 + }, + { + "epoch": 0.8847299021692897, + "grad_norm": 0.18347737015999377, + "learning_rate": 6.764514368125419e-07, + "loss": 0.0468, + "step": 520 + }, + { + "epoch": 0.8864313058273076, + "grad_norm": 0.22402928085104057, + "learning_rate": 6.566318641138902e-07, + "loss": 0.0819, + "step": 521 + }, + { + "epoch": 0.8881327094853254, + "grad_norm": 0.10087048048840686, + "learning_rate": 6.370971160637129e-07, + "loss": 0.0257, + "step": 522 + }, + { + "epoch": 0.8898341131433433, + "grad_norm": 0.24244337938824437, + "learning_rate": 6.178477881622325e-07, + "loss": 0.0929, + "step": 523 + }, + { + "epoch": 0.8915355168013611, + "grad_norm": 0.2215797215803862, + "learning_rate": 5.98884467208869e-07, + "loss": 0.0707, + "step": 524 + }, + { + "epoch": 0.893236920459379, + "grad_norm": 0.21837004790407136, + "learning_rate": 5.802077312843723e-07, + "loss": 0.0601, + "step": 525 + }, + { + "epoch": 0.8949383241173968, + "grad_norm": 0.13798341621078078, + "learning_rate": 5.618181497331865e-07, + "loss": 0.0387, + "step": 526 + }, + { + "epoch": 0.8966397277754147, + "grad_norm": 0.18913792414386757, + "learning_rate": 5.437162831460962e-07, + "loss": 0.0498, + "step": 527 + }, + { + "epoch": 0.8983411314334325, + "grad_norm": 0.22877307977873534, + "learning_rate": 5.259026833431468e-07, + "loss": 0.0704, + "step": 528 + }, + { + "epoch": 0.9000425350914505, + "grad_norm": 0.20234348876984656, + "learning_rate": 5.083778933568073e-07, + "loss": 0.0649, + "step": 529 + }, + { + "epoch": 0.9017439387494683, + "grad_norm": 0.2531050005740147, + "learning_rate": 4.911424474154314e-07, + "loss": 0.0878, + "step": 530 + }, + { + "epoch": 0.9034453424074862, + "grad_norm": 0.24193903233807051, + "learning_rate": 4.741968709269573e-07, + "loss": 0.073, + "step": 531 + }, + { + "epoch": 0.905146746065504, + "grad_norm": 0.2541146802125869, + "learning_rate": 4.575416804629085e-07, + "loss": 0.0563, + "step": 532 + }, + { + "epoch": 0.9068481497235219, + "grad_norm": 0.16762038698102058, + "learning_rate": 4.411773837426303e-07, + "loss": 0.053, + "step": 533 + }, + { + "epoch": 0.9085495533815398, + "grad_norm": 0.23076714189266884, + "learning_rate": 4.2510447961782055e-07, + "loss": 0.0687, + "step": 534 + }, + { + "epoch": 0.9102509570395576, + "grad_norm": 0.26029701749366, + "learning_rate": 4.093234580573202e-07, + "loss": 0.0765, + "step": 535 + }, + { + "epoch": 0.9119523606975755, + "grad_norm": 0.2130287074707999, + "learning_rate": 3.938348001321812e-07, + "loss": 0.062, + "step": 536 + }, + { + "epoch": 0.9136537643555933, + "grad_norm": 0.18154744728141692, + "learning_rate": 3.786389780009958e-07, + "loss": 0.0521, + "step": 537 + }, + { + "epoch": 0.9153551680136113, + "grad_norm": 0.15469238363767565, + "learning_rate": 3.637364548955047e-07, + "loss": 0.0358, + "step": 538 + }, + { + "epoch": 0.9170565716716291, + "grad_norm": 0.3678961865523596, + "learning_rate": 3.491276851064784e-07, + "loss": 0.0881, + "step": 539 + }, + { + "epoch": 0.918757975329647, + "grad_norm": 0.23428870195639928, + "learning_rate": 3.3481311396986626e-07, + "loss": 0.0844, + "step": 540 + }, + { + "epoch": 0.9204593789876648, + "grad_norm": 0.2491236742624812, + "learning_rate": 3.2079317785322363e-07, + "loss": 0.0767, + "step": 541 + }, + { + "epoch": 0.9221607826456827, + "grad_norm": 0.27866990645467393, + "learning_rate": 3.0706830414240164e-07, + "loss": 0.0862, + "step": 542 + }, + { + "epoch": 0.9238621863037005, + "grad_norm": 0.13956980085260687, + "learning_rate": 2.9363891122853097e-07, + "loss": 0.0437, + "step": 543 + }, + { + "epoch": 0.9255635899617184, + "grad_norm": 0.1975336620060032, + "learning_rate": 2.805054084952552e-07, + "loss": 0.076, + "step": 544 + }, + { + "epoch": 0.9272649936197362, + "grad_norm": 0.22649643287994412, + "learning_rate": 2.6766819630626216e-07, + "loss": 0.0647, + "step": 545 + }, + { + "epoch": 0.9289663972777541, + "grad_norm": 0.1932416005125246, + "learning_rate": 2.5512766599306903e-07, + "loss": 0.0642, + "step": 546 + }, + { + "epoch": 0.9306678009357721, + "grad_norm": 0.1779695034006232, + "learning_rate": 2.4288419984310086e-07, + "loss": 0.0439, + "step": 547 + }, + { + "epoch": 0.9323692045937899, + "grad_norm": 0.18689194535029088, + "learning_rate": 2.3093817108803318e-07, + "loss": 0.0761, + "step": 548 + }, + { + "epoch": 0.9340706082518078, + "grad_norm": 0.13597627329781425, + "learning_rate": 2.1928994389241454e-07, + "loss": 0.0369, + "step": 549 + }, + { + "epoch": 0.9357720119098256, + "grad_norm": 0.21989658377102142, + "learning_rate": 2.0793987334256637e-07, + "loss": 0.0625, + "step": 550 + }, + { + "epoch": 0.9374734155678435, + "grad_norm": 0.21266747025038635, + "learning_rate": 1.968883054357562e-07, + "loss": 0.0689, + "step": 551 + }, + { + "epoch": 0.9391748192258613, + "grad_norm": 0.25928931998255494, + "learning_rate": 1.861355770696549e-07, + "loss": 0.1025, + "step": 552 + }, + { + "epoch": 0.9408762228838792, + "grad_norm": 0.2801452061064216, + "learning_rate": 1.7568201603205827e-07, + "loss": 0.0869, + "step": 553 + }, + { + "epoch": 0.942577626541897, + "grad_norm": 0.3090393640358726, + "learning_rate": 1.6552794099090718e-07, + "loss": 0.1212, + "step": 554 + }, + { + "epoch": 0.9442790301999149, + "grad_norm": 0.17720681659522422, + "learning_rate": 1.5567366148455887e-07, + "loss": 0.0355, + "step": 555 + }, + { + "epoch": 0.9459804338579328, + "grad_norm": 0.15472295591692306, + "learning_rate": 1.4611947791236314e-07, + "loss": 0.0395, + "step": 556 + }, + { + "epoch": 0.9476818375159507, + "grad_norm": 0.26730416240977917, + "learning_rate": 1.3686568152549539e-07, + "loss": 0.0595, + "step": 557 + }, + { + "epoch": 0.9493832411739686, + "grad_norm": 0.21978673843573918, + "learning_rate": 1.2791255441809037e-07, + "loss": 0.064, + "step": 558 + }, + { + "epoch": 0.9510846448319864, + "grad_norm": 0.19683818868694625, + "learning_rate": 1.1926036951862563e-07, + "loss": 0.0672, + "step": 559 + }, + { + "epoch": 0.9527860484900043, + "grad_norm": 0.19189970865124908, + "learning_rate": 1.109093905816172e-07, + "loss": 0.0569, + "step": 560 + }, + { + "epoch": 0.9544874521480221, + "grad_norm": 0.22200476537406752, + "learning_rate": 1.0285987217957038e-07, + "loss": 0.0761, + "step": 561 + }, + { + "epoch": 0.95618885580604, + "grad_norm": 0.2253757142644252, + "learning_rate": 9.511205969522263e-08, + "loss": 0.0645, + "step": 562 + }, + { + "epoch": 0.9578902594640578, + "grad_norm": 0.218059392950448, + "learning_rate": 8.76661893140629e-08, + "loss": 0.0682, + "step": 563 + }, + { + "epoch": 0.9595916631220757, + "grad_norm": 0.2649165636045151, + "learning_rate": 8.052248801712958e-08, + "loss": 0.0912, + "step": 564 + }, + { + "epoch": 0.9612930667800936, + "grad_norm": 0.20253828553816175, + "learning_rate": 7.36811735740961e-08, + "loss": 0.0571, + "step": 565 + }, + { + "epoch": 0.9629944704381115, + "grad_norm": 0.19699087263694953, + "learning_rate": 6.714245453662504e-08, + "loss": 0.0457, + "step": 566 + }, + { + "epoch": 0.9646958740961293, + "grad_norm": 0.27159809390838907, + "learning_rate": 6.090653023201997e-08, + "loss": 0.1057, + "step": 567 + }, + { + "epoch": 0.9663972777541472, + "grad_norm": 0.2589995420361442, + "learning_rate": 5.497359075714026e-08, + "loss": 0.102, + "step": 568 + }, + { + "epoch": 0.968098681412165, + "grad_norm": 0.19790945958126713, + "learning_rate": 4.934381697261015e-08, + "loss": 0.0608, + "step": 569 + }, + { + "epoch": 0.9698000850701829, + "grad_norm": 0.24043673016274164, + "learning_rate": 4.401738049730653e-08, + "loss": 0.0609, + "step": 570 + }, + { + "epoch": 0.9715014887282007, + "grad_norm": 0.2690784354034665, + "learning_rate": 3.899444370312533e-08, + "loss": 0.0861, + "step": 571 + }, + { + "epoch": 0.9732028923862186, + "grad_norm": 0.22841785491856118, + "learning_rate": 3.4275159710032146e-08, + "loss": 0.0712, + "step": 572 + }, + { + "epoch": 0.9749042960442365, + "grad_norm": 0.1950332471845292, + "learning_rate": 2.9859672381392644e-08, + "loss": 0.0532, + "step": 573 + }, + { + "epoch": 0.9766056997022544, + "grad_norm": 0.22678060127201066, + "learning_rate": 2.574811631959273e-08, + "loss": 0.0771, + "step": 574 + }, + { + "epoch": 0.9783071033602723, + "grad_norm": 0.3698823942978684, + "learning_rate": 2.1940616861929608e-08, + "loss": 0.0693, + "step": 575 + }, + { + "epoch": 0.9800085070182901, + "grad_norm": 0.23298311393693824, + "learning_rate": 1.8437290076792624e-08, + "loss": 0.0725, + "step": 576 + }, + { + "epoch": 0.981709910676308, + "grad_norm": 0.2028362088180894, + "learning_rate": 1.5238242760126088e-08, + "loss": 0.0756, + "step": 577 + }, + { + "epoch": 0.9834113143343258, + "grad_norm": 0.29483561036206646, + "learning_rate": 1.234357243217188e-08, + "loss": 0.0988, + "step": 578 + }, + { + "epoch": 0.9851127179923437, + "grad_norm": 0.2525424387277673, + "learning_rate": 9.753367334499608e-09, + "loss": 0.0771, + "step": 579 + }, + { + "epoch": 0.9868141216503615, + "grad_norm": 0.26576715050467775, + "learning_rate": 7.467706427312093e-09, + "loss": 0.0612, + "step": 580 + }, + { + "epoch": 0.9885155253083794, + "grad_norm": 0.1729974193289849, + "learning_rate": 5.486659387043958e-09, + "loss": 0.0371, + "step": 581 + }, + { + "epoch": 0.9902169289663972, + "grad_norm": 0.19337221194799736, + "learning_rate": 3.810286604232216e-09, + "loss": 0.0548, + "step": 582 + }, + { + "epoch": 0.9919183326244151, + "grad_norm": 0.2881735818611327, + "learning_rate": 2.4386391816777488e-09, + "loss": 0.0866, + "step": 583 + }, + { + "epoch": 0.993619736282433, + "grad_norm": 0.25389258821605704, + "learning_rate": 1.3717589328898773e-09, + "loss": 0.0442, + "step": 584 + }, + { + "epoch": 0.9953211399404509, + "grad_norm": 0.2721835551378759, + "learning_rate": 6.096783808062778e-10, + "loss": 0.0977, + "step": 585 + }, + { + "epoch": 0.9970225435984688, + "grad_norm": 0.2313901379023938, + "learning_rate": 1.524207568059932e-10, + "loss": 0.0628, + "step": 586 + }, + { + "epoch": 0.9987239472564866, + "grad_norm": 0.24018936080778358, + "learning_rate": 0.0, + "loss": 0.0614, + "step": 587 + }, + { + "epoch": 0.9987239472564866, + "step": 587, + "total_flos": 1551551083839488.0, + "train_loss": 0.1089990799881683, + "train_runtime": 5367.2694, + "train_samples_per_second": 14.016, + "train_steps_per_second": 0.109 + } + ], + "logging_steps": 1.0, + "max_steps": 587, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 50000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": false, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1551551083839488.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}