diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,43472 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9999194392975107, + "eval_steps": 500, + "global_step": 6206, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 6.53196797006627, + "learning_rate": 1.0695187165775401e-07, + "loss": 1.541, + "step": 1 + }, + { + "epoch": 0.0, + "grad_norm": 8.311062312027166, + "learning_rate": 2.1390374331550802e-07, + "loss": 1.6299, + "step": 2 + }, + { + "epoch": 0.0, + "grad_norm": 7.436112959736087, + "learning_rate": 3.208556149732621e-07, + "loss": 1.5385, + "step": 3 + }, + { + "epoch": 0.0, + "grad_norm": 7.335068617726015, + "learning_rate": 4.2780748663101604e-07, + "loss": 1.6662, + "step": 4 + }, + { + "epoch": 0.0, + "grad_norm": 6.909720869133421, + "learning_rate": 5.347593582887701e-07, + "loss": 1.5632, + "step": 5 + }, + { + "epoch": 0.0, + "grad_norm": 8.666190490609123, + "learning_rate": 6.417112299465242e-07, + "loss": 1.7323, + "step": 6 + }, + { + "epoch": 0.0, + "grad_norm": 7.435249428878192, + "learning_rate": 7.486631016042781e-07, + "loss": 1.5217, + "step": 7 + }, + { + "epoch": 0.0, + "grad_norm": 0.6538306097949063, + "learning_rate": 8.556149732620321e-07, + "loss": 0.283, + "step": 8 + }, + { + "epoch": 0.0, + "grad_norm": 7.340246462572206, + "learning_rate": 9.625668449197862e-07, + "loss": 1.6522, + "step": 9 + }, + { + "epoch": 0.0, + "grad_norm": 6.123407602103352, + "learning_rate": 1.0695187165775401e-06, + "loss": 1.5194, + "step": 10 + }, + { + "epoch": 0.0, + "grad_norm": 7.011151051186081, + "learning_rate": 1.1764705882352942e-06, + "loss": 1.7549, + "step": 11 + }, + { + "epoch": 0.0, + "grad_norm": 0.7412442035271506, + "learning_rate": 1.2834224598930483e-06, + "loss": 0.2853, + "step": 12 + }, + { + "epoch": 0.0, + "grad_norm": 5.479597583899963, + "learning_rate": 1.3903743315508022e-06, + "loss": 1.6564, + "step": 13 + }, + { + "epoch": 0.0, + "grad_norm": 5.287917688005597, + "learning_rate": 1.4973262032085562e-06, + "loss": 1.4943, + "step": 14 + }, + { + "epoch": 0.0, + "grad_norm": 4.45375430027414, + "learning_rate": 1.6042780748663103e-06, + "loss": 1.552, + "step": 15 + }, + { + "epoch": 0.0, + "grad_norm": 2.844408782956599, + "learning_rate": 1.7112299465240642e-06, + "loss": 1.4518, + "step": 16 + }, + { + "epoch": 0.0, + "grad_norm": 3.2620626062752733, + "learning_rate": 1.8181818181818183e-06, + "loss": 1.4307, + "step": 17 + }, + { + "epoch": 0.0, + "grad_norm": 3.722124968461394, + "learning_rate": 1.9251336898395724e-06, + "loss": 1.431, + "step": 18 + }, + { + "epoch": 0.0, + "grad_norm": 3.1641531001176832, + "learning_rate": 2.0320855614973265e-06, + "loss": 1.3461, + "step": 19 + }, + { + "epoch": 0.0, + "grad_norm": 3.6808611887113956, + "learning_rate": 2.1390374331550802e-06, + "loss": 1.583, + "step": 20 + }, + { + "epoch": 0.0, + "grad_norm": 2.298234965142126, + "learning_rate": 2.2459893048128343e-06, + "loss": 1.3332, + "step": 21 + }, + { + "epoch": 0.0, + "grad_norm": 3.4592974966390613, + "learning_rate": 2.3529411764705885e-06, + "loss": 1.4626, + "step": 22 + }, + { + "epoch": 0.0, + "grad_norm": 0.7742613771782929, + "learning_rate": 2.4598930481283426e-06, + "loss": 0.2514, + "step": 23 + }, + { + "epoch": 0.0, + "grad_norm": 3.038969709014357, + "learning_rate": 2.5668449197860967e-06, + "loss": 1.4968, + "step": 24 + }, + { + "epoch": 0.0, + "grad_norm": 2.2174413710426824, + "learning_rate": 2.673796791443851e-06, + "loss": 1.3723, + "step": 25 + }, + { + "epoch": 0.0, + "grad_norm": 0.7424892182432326, + "learning_rate": 2.7807486631016045e-06, + "loss": 0.3019, + "step": 26 + }, + { + "epoch": 0.0, + "grad_norm": 2.9059395919149864, + "learning_rate": 2.8877005347593586e-06, + "loss": 1.3785, + "step": 27 + }, + { + "epoch": 0.0, + "grad_norm": 2.2769614466398784, + "learning_rate": 2.9946524064171123e-06, + "loss": 1.3185, + "step": 28 + }, + { + "epoch": 0.0, + "grad_norm": 2.225934634441806, + "learning_rate": 3.101604278074867e-06, + "loss": 1.3484, + "step": 29 + }, + { + "epoch": 0.0, + "grad_norm": 2.2311105656532804, + "learning_rate": 3.2085561497326205e-06, + "loss": 1.2698, + "step": 30 + }, + { + "epoch": 0.0, + "grad_norm": 2.7313827859365953, + "learning_rate": 3.3155080213903747e-06, + "loss": 1.2595, + "step": 31 + }, + { + "epoch": 0.01, + "grad_norm": 2.4180644252133177, + "learning_rate": 3.4224598930481284e-06, + "loss": 1.2453, + "step": 32 + }, + { + "epoch": 0.01, + "grad_norm": 1.9719030566418552, + "learning_rate": 3.529411764705883e-06, + "loss": 1.2943, + "step": 33 + }, + { + "epoch": 0.01, + "grad_norm": 2.281268512474351, + "learning_rate": 3.6363636363636366e-06, + "loss": 1.2718, + "step": 34 + }, + { + "epoch": 0.01, + "grad_norm": 2.1712818913925984, + "learning_rate": 3.7433155080213907e-06, + "loss": 1.1555, + "step": 35 + }, + { + "epoch": 0.01, + "grad_norm": 2.1235626246995163, + "learning_rate": 3.850267379679145e-06, + "loss": 1.2219, + "step": 36 + }, + { + "epoch": 0.01, + "grad_norm": 2.222512457575872, + "learning_rate": 3.957219251336899e-06, + "loss": 1.27, + "step": 37 + }, + { + "epoch": 0.01, + "grad_norm": 1.9888253675939873, + "learning_rate": 4.064171122994653e-06, + "loss": 1.26, + "step": 38 + }, + { + "epoch": 0.01, + "grad_norm": 1.8742132473427955, + "learning_rate": 4.171122994652407e-06, + "loss": 1.2675, + "step": 39 + }, + { + "epoch": 0.01, + "grad_norm": 1.7767172264587079, + "learning_rate": 4.2780748663101604e-06, + "loss": 1.1135, + "step": 40 + }, + { + "epoch": 0.01, + "grad_norm": 2.194225907803369, + "learning_rate": 4.385026737967915e-06, + "loss": 1.2028, + "step": 41 + }, + { + "epoch": 0.01, + "grad_norm": 2.021768498379485, + "learning_rate": 4.491978609625669e-06, + "loss": 1.1143, + "step": 42 + }, + { + "epoch": 0.01, + "grad_norm": 1.951684289763473, + "learning_rate": 4.598930481283423e-06, + "loss": 1.137, + "step": 43 + }, + { + "epoch": 0.01, + "grad_norm": 2.3684429555053232, + "learning_rate": 4.705882352941177e-06, + "loss": 1.2509, + "step": 44 + }, + { + "epoch": 0.01, + "grad_norm": 2.0715721967418115, + "learning_rate": 4.812834224598931e-06, + "loss": 1.2642, + "step": 45 + }, + { + "epoch": 0.01, + "grad_norm": 1.9425322320312466, + "learning_rate": 4.919786096256685e-06, + "loss": 1.1564, + "step": 46 + }, + { + "epoch": 0.01, + "grad_norm": 1.860974752620202, + "learning_rate": 5.026737967914439e-06, + "loss": 1.1736, + "step": 47 + }, + { + "epoch": 0.01, + "grad_norm": 2.062549334896917, + "learning_rate": 5.133689839572193e-06, + "loss": 1.2111, + "step": 48 + }, + { + "epoch": 0.01, + "grad_norm": 1.925396788988329, + "learning_rate": 5.240641711229947e-06, + "loss": 1.2041, + "step": 49 + }, + { + "epoch": 0.01, + "grad_norm": 2.1103645440814014, + "learning_rate": 5.347593582887702e-06, + "loss": 1.1769, + "step": 50 + }, + { + "epoch": 0.01, + "grad_norm": 2.15020264679653, + "learning_rate": 5.4545454545454545e-06, + "loss": 1.3617, + "step": 51 + }, + { + "epoch": 0.01, + "grad_norm": 1.6850801586450304, + "learning_rate": 5.561497326203209e-06, + "loss": 1.1572, + "step": 52 + }, + { + "epoch": 0.01, + "grad_norm": 1.8924022333001187, + "learning_rate": 5.6684491978609635e-06, + "loss": 1.1743, + "step": 53 + }, + { + "epoch": 0.01, + "grad_norm": 2.485608070791642, + "learning_rate": 5.775401069518717e-06, + "loss": 1.1586, + "step": 54 + }, + { + "epoch": 0.01, + "grad_norm": 1.888615712905799, + "learning_rate": 5.882352941176471e-06, + "loss": 1.126, + "step": 55 + }, + { + "epoch": 0.01, + "grad_norm": 1.7415196492224712, + "learning_rate": 5.989304812834225e-06, + "loss": 1.1537, + "step": 56 + }, + { + "epoch": 0.01, + "grad_norm": 1.8693679209978158, + "learning_rate": 6.096256684491979e-06, + "loss": 1.2311, + "step": 57 + }, + { + "epoch": 0.01, + "grad_norm": 1.9081308771900998, + "learning_rate": 6.203208556149734e-06, + "loss": 1.1317, + "step": 58 + }, + { + "epoch": 0.01, + "grad_norm": 2.1073178503402636, + "learning_rate": 6.3101604278074865e-06, + "loss": 1.2236, + "step": 59 + }, + { + "epoch": 0.01, + "grad_norm": 1.893703437144612, + "learning_rate": 6.417112299465241e-06, + "loss": 1.0762, + "step": 60 + }, + { + "epoch": 0.01, + "grad_norm": 2.142937542267072, + "learning_rate": 6.524064171122996e-06, + "loss": 1.247, + "step": 61 + }, + { + "epoch": 0.01, + "grad_norm": 1.7095661502780057, + "learning_rate": 6.631016042780749e-06, + "loss": 1.1433, + "step": 62 + }, + { + "epoch": 0.01, + "grad_norm": 1.7205317814630048, + "learning_rate": 6.737967914438504e-06, + "loss": 1.2264, + "step": 63 + }, + { + "epoch": 0.01, + "grad_norm": 1.9025266939275545, + "learning_rate": 6.844919786096257e-06, + "loss": 1.2017, + "step": 64 + }, + { + "epoch": 0.01, + "grad_norm": 1.7742469406575907, + "learning_rate": 6.951871657754011e-06, + "loss": 0.9904, + "step": 65 + }, + { + "epoch": 0.01, + "grad_norm": 1.8717123434598966, + "learning_rate": 7.058823529411766e-06, + "loss": 1.0933, + "step": 66 + }, + { + "epoch": 0.01, + "grad_norm": 1.827996217336415, + "learning_rate": 7.1657754010695195e-06, + "loss": 1.168, + "step": 67 + }, + { + "epoch": 0.01, + "grad_norm": 2.2054898964695444, + "learning_rate": 7.272727272727273e-06, + "loss": 1.1252, + "step": 68 + }, + { + "epoch": 0.01, + "grad_norm": 1.8097542866352285, + "learning_rate": 7.379679144385027e-06, + "loss": 1.0983, + "step": 69 + }, + { + "epoch": 0.01, + "grad_norm": 1.6601209715292462, + "learning_rate": 7.486631016042781e-06, + "loss": 1.0822, + "step": 70 + }, + { + "epoch": 0.01, + "grad_norm": 1.9329582051308762, + "learning_rate": 7.593582887700536e-06, + "loss": 1.1667, + "step": 71 + }, + { + "epoch": 0.01, + "grad_norm": 1.8737046903488233, + "learning_rate": 7.70053475935829e-06, + "loss": 1.2134, + "step": 72 + }, + { + "epoch": 0.01, + "grad_norm": 1.8726308139765264, + "learning_rate": 7.807486631016043e-06, + "loss": 1.139, + "step": 73 + }, + { + "epoch": 0.01, + "grad_norm": 1.8082208279847156, + "learning_rate": 7.914438502673799e-06, + "loss": 1.1306, + "step": 74 + }, + { + "epoch": 0.01, + "grad_norm": 1.7999342511093201, + "learning_rate": 8.02139037433155e-06, + "loss": 1.0592, + "step": 75 + }, + { + "epoch": 0.01, + "grad_norm": 1.925622009789158, + "learning_rate": 8.128342245989306e-06, + "loss": 1.05, + "step": 76 + }, + { + "epoch": 0.01, + "grad_norm": 3.0099471147044214, + "learning_rate": 8.23529411764706e-06, + "loss": 1.1996, + "step": 77 + }, + { + "epoch": 0.01, + "grad_norm": 1.6907884755838152, + "learning_rate": 8.342245989304813e-06, + "loss": 1.1032, + "step": 78 + }, + { + "epoch": 0.01, + "grad_norm": 1.076613101666246, + "learning_rate": 8.449197860962567e-06, + "loss": 0.3197, + "step": 79 + }, + { + "epoch": 0.01, + "grad_norm": 2.045408360454748, + "learning_rate": 8.556149732620321e-06, + "loss": 1.1921, + "step": 80 + }, + { + "epoch": 0.01, + "grad_norm": 1.9764200920701724, + "learning_rate": 8.663101604278076e-06, + "loss": 1.133, + "step": 81 + }, + { + "epoch": 0.01, + "grad_norm": 2.024395421370614, + "learning_rate": 8.77005347593583e-06, + "loss": 1.1514, + "step": 82 + }, + { + "epoch": 0.01, + "grad_norm": 1.845005434362296, + "learning_rate": 8.877005347593584e-06, + "loss": 1.1382, + "step": 83 + }, + { + "epoch": 0.01, + "grad_norm": 1.9149695801567612, + "learning_rate": 8.983957219251337e-06, + "loss": 1.135, + "step": 84 + }, + { + "epoch": 0.01, + "grad_norm": 1.8194727956227852, + "learning_rate": 9.090909090909091e-06, + "loss": 1.1115, + "step": 85 + }, + { + "epoch": 0.01, + "grad_norm": 1.938362929715415, + "learning_rate": 9.197860962566846e-06, + "loss": 1.0397, + "step": 86 + }, + { + "epoch": 0.01, + "grad_norm": 1.7911833005517368, + "learning_rate": 9.3048128342246e-06, + "loss": 1.1121, + "step": 87 + }, + { + "epoch": 0.01, + "grad_norm": 1.661275803670195, + "learning_rate": 9.411764705882354e-06, + "loss": 1.1139, + "step": 88 + }, + { + "epoch": 0.01, + "grad_norm": 2.035726729042163, + "learning_rate": 9.518716577540108e-06, + "loss": 1.0997, + "step": 89 + }, + { + "epoch": 0.01, + "grad_norm": 1.866391987105394, + "learning_rate": 9.625668449197861e-06, + "loss": 1.0734, + "step": 90 + }, + { + "epoch": 0.01, + "grad_norm": 2.039139150589232, + "learning_rate": 9.732620320855617e-06, + "loss": 1.095, + "step": 91 + }, + { + "epoch": 0.01, + "grad_norm": 1.8804583453725374, + "learning_rate": 9.83957219251337e-06, + "loss": 1.1043, + "step": 92 + }, + { + "epoch": 0.01, + "grad_norm": 1.7630157929624992, + "learning_rate": 9.946524064171124e-06, + "loss": 1.1254, + "step": 93 + }, + { + "epoch": 0.02, + "grad_norm": 2.176711001947502, + "learning_rate": 1.0053475935828878e-05, + "loss": 1.1558, + "step": 94 + }, + { + "epoch": 0.02, + "grad_norm": 1.6176395648249458, + "learning_rate": 1.0160427807486633e-05, + "loss": 1.1931, + "step": 95 + }, + { + "epoch": 0.02, + "grad_norm": 1.663288501156598, + "learning_rate": 1.0267379679144387e-05, + "loss": 1.1311, + "step": 96 + }, + { + "epoch": 0.02, + "grad_norm": 1.7754888444428265, + "learning_rate": 1.0374331550802139e-05, + "loss": 1.1876, + "step": 97 + }, + { + "epoch": 0.02, + "grad_norm": 1.7860000367033118, + "learning_rate": 1.0481283422459894e-05, + "loss": 1.1278, + "step": 98 + }, + { + "epoch": 0.02, + "grad_norm": 1.0241960238016596, + "learning_rate": 1.0588235294117648e-05, + "loss": 0.312, + "step": 99 + }, + { + "epoch": 0.02, + "grad_norm": 1.8151561916827776, + "learning_rate": 1.0695187165775403e-05, + "loss": 1.1143, + "step": 100 + }, + { + "epoch": 0.02, + "grad_norm": 1.9800457173425494, + "learning_rate": 1.0802139037433157e-05, + "loss": 1.1773, + "step": 101 + }, + { + "epoch": 0.02, + "grad_norm": 1.7866228879808546, + "learning_rate": 1.0909090909090909e-05, + "loss": 1.1064, + "step": 102 + }, + { + "epoch": 0.02, + "grad_norm": 1.8476968790026995, + "learning_rate": 1.1016042780748664e-05, + "loss": 1.1745, + "step": 103 + }, + { + "epoch": 0.02, + "grad_norm": 0.9918767346656621, + "learning_rate": 1.1122994652406418e-05, + "loss": 0.2951, + "step": 104 + }, + { + "epoch": 0.02, + "grad_norm": 1.7922576744014986, + "learning_rate": 1.1229946524064172e-05, + "loss": 1.1567, + "step": 105 + }, + { + "epoch": 0.02, + "grad_norm": 1.9851329868909744, + "learning_rate": 1.1336898395721927e-05, + "loss": 1.0187, + "step": 106 + }, + { + "epoch": 0.02, + "grad_norm": 1.8704732102826962, + "learning_rate": 1.1443850267379679e-05, + "loss": 1.1149, + "step": 107 + }, + { + "epoch": 0.02, + "grad_norm": 2.3819123224989367, + "learning_rate": 1.1550802139037434e-05, + "loss": 1.1869, + "step": 108 + }, + { + "epoch": 0.02, + "grad_norm": 1.7911645791981734, + "learning_rate": 1.1657754010695188e-05, + "loss": 1.1098, + "step": 109 + }, + { + "epoch": 0.02, + "grad_norm": 1.8259111310069072, + "learning_rate": 1.1764705882352942e-05, + "loss": 1.1297, + "step": 110 + }, + { + "epoch": 0.02, + "grad_norm": 1.631045496479277, + "learning_rate": 1.1871657754010697e-05, + "loss": 1.1629, + "step": 111 + }, + { + "epoch": 0.02, + "grad_norm": 1.9198604255691587, + "learning_rate": 1.197860962566845e-05, + "loss": 1.0011, + "step": 112 + }, + { + "epoch": 0.02, + "grad_norm": 1.8552065076721744, + "learning_rate": 1.2085561497326203e-05, + "loss": 1.1413, + "step": 113 + }, + { + "epoch": 0.02, + "grad_norm": 1.5587142769860194, + "learning_rate": 1.2192513368983958e-05, + "loss": 0.9748, + "step": 114 + }, + { + "epoch": 0.02, + "grad_norm": 1.8968289196884078, + "learning_rate": 1.2299465240641712e-05, + "loss": 1.0986, + "step": 115 + }, + { + "epoch": 0.02, + "grad_norm": 1.73161972015209, + "learning_rate": 1.2406417112299467e-05, + "loss": 1.0496, + "step": 116 + }, + { + "epoch": 0.02, + "grad_norm": 1.993942806695572, + "learning_rate": 1.251336898395722e-05, + "loss": 1.2119, + "step": 117 + }, + { + "epoch": 0.02, + "grad_norm": 1.7452115127867245, + "learning_rate": 1.2620320855614973e-05, + "loss": 1.0531, + "step": 118 + }, + { + "epoch": 0.02, + "grad_norm": 1.8448890818407966, + "learning_rate": 1.2727272727272728e-05, + "loss": 1.0764, + "step": 119 + }, + { + "epoch": 0.02, + "grad_norm": 1.8865129672544962, + "learning_rate": 1.2834224598930482e-05, + "loss": 1.038, + "step": 120 + }, + { + "epoch": 0.02, + "grad_norm": 2.0099952740785687, + "learning_rate": 1.2941176470588238e-05, + "loss": 1.083, + "step": 121 + }, + { + "epoch": 0.02, + "grad_norm": 1.893496895626763, + "learning_rate": 1.3048128342245991e-05, + "loss": 1.1138, + "step": 122 + }, + { + "epoch": 0.02, + "grad_norm": 1.7035437843358887, + "learning_rate": 1.3155080213903743e-05, + "loss": 1.067, + "step": 123 + }, + { + "epoch": 0.02, + "grad_norm": 1.7522601174420176, + "learning_rate": 1.3262032085561499e-05, + "loss": 1.0795, + "step": 124 + }, + { + "epoch": 0.02, + "grad_norm": 2.2574278300025763, + "learning_rate": 1.3368983957219252e-05, + "loss": 1.2813, + "step": 125 + }, + { + "epoch": 0.02, + "grad_norm": 2.180472958805692, + "learning_rate": 1.3475935828877008e-05, + "loss": 1.1244, + "step": 126 + }, + { + "epoch": 0.02, + "grad_norm": 1.9639436649054822, + "learning_rate": 1.3582887700534761e-05, + "loss": 1.0696, + "step": 127 + }, + { + "epoch": 0.02, + "grad_norm": 1.8280256592829247, + "learning_rate": 1.3689839572192513e-05, + "loss": 1.1231, + "step": 128 + }, + { + "epoch": 0.02, + "grad_norm": 1.7034101581049972, + "learning_rate": 1.3796791443850269e-05, + "loss": 1.1078, + "step": 129 + }, + { + "epoch": 0.02, + "grad_norm": 1.8470609353073417, + "learning_rate": 1.3903743315508022e-05, + "loss": 1.0944, + "step": 130 + }, + { + "epoch": 0.02, + "grad_norm": 1.6983980246626533, + "learning_rate": 1.4010695187165778e-05, + "loss": 1.1211, + "step": 131 + }, + { + "epoch": 0.02, + "grad_norm": 0.8081912499390137, + "learning_rate": 1.4117647058823532e-05, + "loss": 0.2879, + "step": 132 + }, + { + "epoch": 0.02, + "grad_norm": 1.9252928439475747, + "learning_rate": 1.4224598930481284e-05, + "loss": 1.1393, + "step": 133 + }, + { + "epoch": 0.02, + "grad_norm": 2.0287409251251614, + "learning_rate": 1.4331550802139039e-05, + "loss": 1.1213, + "step": 134 + }, + { + "epoch": 0.02, + "grad_norm": 1.5189652455235887, + "learning_rate": 1.4438502673796793e-05, + "loss": 0.9858, + "step": 135 + }, + { + "epoch": 0.02, + "grad_norm": 1.8087314796323413, + "learning_rate": 1.4545454545454546e-05, + "loss": 1.0565, + "step": 136 + }, + { + "epoch": 0.02, + "grad_norm": 1.8380488024596233, + "learning_rate": 1.4652406417112302e-05, + "loss": 1.0444, + "step": 137 + }, + { + "epoch": 0.02, + "grad_norm": 1.842529761897001, + "learning_rate": 1.4759358288770054e-05, + "loss": 0.9803, + "step": 138 + }, + { + "epoch": 0.02, + "grad_norm": 1.695847818462051, + "learning_rate": 1.4866310160427807e-05, + "loss": 1.0971, + "step": 139 + }, + { + "epoch": 0.02, + "grad_norm": 1.658080892359949, + "learning_rate": 1.4973262032085563e-05, + "loss": 1.0766, + "step": 140 + }, + { + "epoch": 0.02, + "grad_norm": 1.534511371260837, + "learning_rate": 1.5080213903743316e-05, + "loss": 1.146, + "step": 141 + }, + { + "epoch": 0.02, + "grad_norm": 1.687273172525131, + "learning_rate": 1.5187165775401072e-05, + "loss": 1.0289, + "step": 142 + }, + { + "epoch": 0.02, + "grad_norm": 1.9564170273263932, + "learning_rate": 1.5294117647058822e-05, + "loss": 1.0891, + "step": 143 + }, + { + "epoch": 0.02, + "grad_norm": 2.016508270211656, + "learning_rate": 1.540106951871658e-05, + "loss": 1.1391, + "step": 144 + }, + { + "epoch": 0.02, + "grad_norm": 2.053799951976019, + "learning_rate": 1.5508021390374333e-05, + "loss": 0.9739, + "step": 145 + }, + { + "epoch": 0.02, + "grad_norm": 1.830280776187756, + "learning_rate": 1.5614973262032087e-05, + "loss": 1.1195, + "step": 146 + }, + { + "epoch": 0.02, + "grad_norm": 2.0163638993212434, + "learning_rate": 1.572192513368984e-05, + "loss": 1.1383, + "step": 147 + }, + { + "epoch": 0.02, + "grad_norm": 1.7667897747824024, + "learning_rate": 1.5828877005347597e-05, + "loss": 1.1388, + "step": 148 + }, + { + "epoch": 0.02, + "grad_norm": 1.6899698338271028, + "learning_rate": 1.5935828877005348e-05, + "loss": 1.0119, + "step": 149 + }, + { + "epoch": 0.02, + "grad_norm": 1.8653969329380322, + "learning_rate": 1.60427807486631e-05, + "loss": 1.0592, + "step": 150 + }, + { + "epoch": 0.02, + "grad_norm": 1.4913781661875962, + "learning_rate": 1.614973262032086e-05, + "loss": 0.9858, + "step": 151 + }, + { + "epoch": 0.02, + "grad_norm": 1.857071966913271, + "learning_rate": 1.6256684491978612e-05, + "loss": 1.0336, + "step": 152 + }, + { + "epoch": 0.02, + "grad_norm": 0.7681772791047755, + "learning_rate": 1.6363636363636366e-05, + "loss": 0.3091, + "step": 153 + }, + { + "epoch": 0.02, + "grad_norm": 1.7973483037596603, + "learning_rate": 1.647058823529412e-05, + "loss": 1.0741, + "step": 154 + }, + { + "epoch": 0.02, + "grad_norm": 1.7031188503206849, + "learning_rate": 1.6577540106951873e-05, + "loss": 1.1208, + "step": 155 + }, + { + "epoch": 0.03, + "grad_norm": 1.5884058815477005, + "learning_rate": 1.6684491978609627e-05, + "loss": 1.031, + "step": 156 + }, + { + "epoch": 0.03, + "grad_norm": 1.942411030314576, + "learning_rate": 1.679144385026738e-05, + "loss": 1.1414, + "step": 157 + }, + { + "epoch": 0.03, + "grad_norm": 1.8014056205268636, + "learning_rate": 1.6898395721925134e-05, + "loss": 1.0525, + "step": 158 + }, + { + "epoch": 0.03, + "grad_norm": 2.019216987094288, + "learning_rate": 1.7005347593582888e-05, + "loss": 1.0838, + "step": 159 + }, + { + "epoch": 0.03, + "grad_norm": 0.7706589097372759, + "learning_rate": 1.7112299465240642e-05, + "loss": 0.3062, + "step": 160 + }, + { + "epoch": 0.03, + "grad_norm": 2.008390867483698, + "learning_rate": 1.7219251336898395e-05, + "loss": 1.1059, + "step": 161 + }, + { + "epoch": 0.03, + "grad_norm": 1.5253240769849044, + "learning_rate": 1.7326203208556153e-05, + "loss": 0.9575, + "step": 162 + }, + { + "epoch": 0.03, + "grad_norm": 1.8312856767551149, + "learning_rate": 1.7433155080213906e-05, + "loss": 1.1092, + "step": 163 + }, + { + "epoch": 0.03, + "grad_norm": 1.728249129263797, + "learning_rate": 1.754010695187166e-05, + "loss": 1.1073, + "step": 164 + }, + { + "epoch": 0.03, + "grad_norm": 1.7704008841273098, + "learning_rate": 1.7647058823529414e-05, + "loss": 1.0628, + "step": 165 + }, + { + "epoch": 0.03, + "grad_norm": 1.6661351505952953, + "learning_rate": 1.7754010695187167e-05, + "loss": 1.0585, + "step": 166 + }, + { + "epoch": 0.03, + "grad_norm": 1.8485500470317957, + "learning_rate": 1.786096256684492e-05, + "loss": 1.0741, + "step": 167 + }, + { + "epoch": 0.03, + "grad_norm": 1.6544345091365844, + "learning_rate": 1.7967914438502675e-05, + "loss": 1.026, + "step": 168 + }, + { + "epoch": 0.03, + "grad_norm": 1.7496030966221727, + "learning_rate": 1.807486631016043e-05, + "loss": 1.095, + "step": 169 + }, + { + "epoch": 0.03, + "grad_norm": 1.664799656814396, + "learning_rate": 1.8181818181818182e-05, + "loss": 1.0104, + "step": 170 + }, + { + "epoch": 0.03, + "grad_norm": 1.591765179932876, + "learning_rate": 1.8288770053475936e-05, + "loss": 1.0212, + "step": 171 + }, + { + "epoch": 0.03, + "grad_norm": 1.6665906143097107, + "learning_rate": 1.8395721925133693e-05, + "loss": 1.0319, + "step": 172 + }, + { + "epoch": 0.03, + "grad_norm": 1.775528084120115, + "learning_rate": 1.8502673796791447e-05, + "loss": 1.115, + "step": 173 + }, + { + "epoch": 0.03, + "grad_norm": 1.6831446986970393, + "learning_rate": 1.86096256684492e-05, + "loss": 1.0538, + "step": 174 + }, + { + "epoch": 0.03, + "grad_norm": 1.7067504958551498, + "learning_rate": 1.8716577540106954e-05, + "loss": 1.0346, + "step": 175 + }, + { + "epoch": 0.03, + "grad_norm": 1.8000110915623775, + "learning_rate": 1.8823529411764708e-05, + "loss": 1.0941, + "step": 176 + }, + { + "epoch": 0.03, + "grad_norm": 1.631768825070943, + "learning_rate": 1.893048128342246e-05, + "loss": 1.086, + "step": 177 + }, + { + "epoch": 0.03, + "grad_norm": 1.591403918529108, + "learning_rate": 1.9037433155080215e-05, + "loss": 0.9815, + "step": 178 + }, + { + "epoch": 0.03, + "grad_norm": 1.8318516443947612, + "learning_rate": 1.9144385026737972e-05, + "loss": 1.033, + "step": 179 + }, + { + "epoch": 0.03, + "grad_norm": 1.8872040870712627, + "learning_rate": 1.9251336898395722e-05, + "loss": 1.1848, + "step": 180 + }, + { + "epoch": 0.03, + "grad_norm": 1.8544692188224614, + "learning_rate": 1.9358288770053476e-05, + "loss": 1.0957, + "step": 181 + }, + { + "epoch": 0.03, + "grad_norm": 0.8651969753044279, + "learning_rate": 1.9465240641711233e-05, + "loss": 0.2902, + "step": 182 + }, + { + "epoch": 0.03, + "grad_norm": 1.7281646587725787, + "learning_rate": 1.9572192513368987e-05, + "loss": 0.9997, + "step": 183 + }, + { + "epoch": 0.03, + "grad_norm": 1.748033940677634, + "learning_rate": 1.967914438502674e-05, + "loss": 1.1354, + "step": 184 + }, + { + "epoch": 0.03, + "grad_norm": 0.7433533045349984, + "learning_rate": 1.9786096256684494e-05, + "loss": 0.2895, + "step": 185 + }, + { + "epoch": 0.03, + "grad_norm": 1.9279365114715972, + "learning_rate": 1.9893048128342248e-05, + "loss": 1.1446, + "step": 186 + }, + { + "epoch": 0.03, + "grad_norm": 1.5954366961135809, + "learning_rate": 2e-05, + "loss": 1.0631, + "step": 187 + }, + { + "epoch": 0.03, + "grad_norm": 1.7074825859927216, + "learning_rate": 1.9999998637862175e-05, + "loss": 1.0039, + "step": 188 + }, + { + "epoch": 0.03, + "grad_norm": 1.86033997018962, + "learning_rate": 1.999999455144907e-05, + "loss": 1.1297, + "step": 189 + }, + { + "epoch": 0.03, + "grad_norm": 2.0091468258782172, + "learning_rate": 1.9999987740761794e-05, + "loss": 1.0665, + "step": 190 + }, + { + "epoch": 0.03, + "grad_norm": 1.7261058140821122, + "learning_rate": 1.999997820580221e-05, + "loss": 1.0258, + "step": 191 + }, + { + "epoch": 0.03, + "grad_norm": 1.9504220111566979, + "learning_rate": 1.999996594657291e-05, + "loss": 1.0149, + "step": 192 + }, + { + "epoch": 0.03, + "grad_norm": 2.004685784760631, + "learning_rate": 1.9999950963077235e-05, + "loss": 1.072, + "step": 193 + }, + { + "epoch": 0.03, + "grad_norm": 1.9309316979287292, + "learning_rate": 1.999993325531927e-05, + "loss": 1.0467, + "step": 194 + }, + { + "epoch": 0.03, + "grad_norm": 1.6356637207113613, + "learning_rate": 1.9999912823303832e-05, + "loss": 0.9829, + "step": 195 + }, + { + "epoch": 0.03, + "grad_norm": 2.045396696853703, + "learning_rate": 1.9999889667036496e-05, + "loss": 1.0942, + "step": 196 + }, + { + "epoch": 0.03, + "grad_norm": 1.9932046640345547, + "learning_rate": 1.9999863786523567e-05, + "loss": 1.1781, + "step": 197 + }, + { + "epoch": 0.03, + "grad_norm": 1.6717669015323524, + "learning_rate": 1.999983518177209e-05, + "loss": 1.0575, + "step": 198 + }, + { + "epoch": 0.03, + "grad_norm": 1.7365123932062148, + "learning_rate": 1.9999803852789864e-05, + "loss": 1.0566, + "step": 199 + }, + { + "epoch": 0.03, + "grad_norm": 2.062516656721653, + "learning_rate": 1.999976979958542e-05, + "loss": 1.027, + "step": 200 + }, + { + "epoch": 0.03, + "grad_norm": 1.7495776362860862, + "learning_rate": 1.9999733022168043e-05, + "loss": 1.054, + "step": 201 + }, + { + "epoch": 0.03, + "grad_norm": 1.6099227533297795, + "learning_rate": 1.9999693520547745e-05, + "loss": 1.0498, + "step": 202 + }, + { + "epoch": 0.03, + "grad_norm": 1.6277078827555203, + "learning_rate": 1.9999651294735285e-05, + "loss": 1.0575, + "step": 203 + }, + { + "epoch": 0.03, + "grad_norm": 1.6762913736664313, + "learning_rate": 1.9999606344742176e-05, + "loss": 1.1026, + "step": 204 + }, + { + "epoch": 0.03, + "grad_norm": 2.0819182123700086, + "learning_rate": 1.9999558670580656e-05, + "loss": 1.1238, + "step": 205 + }, + { + "epoch": 0.03, + "grad_norm": 0.933011872225037, + "learning_rate": 1.999950827226371e-05, + "loss": 0.3296, + "step": 206 + }, + { + "epoch": 0.03, + "grad_norm": 1.7318182671057278, + "learning_rate": 1.9999455149805076e-05, + "loss": 1.0823, + "step": 207 + }, + { + "epoch": 0.03, + "grad_norm": 1.4232312529086144, + "learning_rate": 1.9999399303219222e-05, + "loss": 0.9811, + "step": 208 + }, + { + "epoch": 0.03, + "grad_norm": 1.7046086876602637, + "learning_rate": 1.9999340732521363e-05, + "loss": 1.0608, + "step": 209 + }, + { + "epoch": 0.03, + "grad_norm": 1.793007597782919, + "learning_rate": 1.9999279437727456e-05, + "loss": 1.0736, + "step": 210 + }, + { + "epoch": 0.03, + "grad_norm": 1.5795068073409793, + "learning_rate": 1.99992154188542e-05, + "loss": 1.0054, + "step": 211 + }, + { + "epoch": 0.03, + "grad_norm": 1.751789952443813, + "learning_rate": 1.999914867591903e-05, + "loss": 1.0345, + "step": 212 + }, + { + "epoch": 0.03, + "grad_norm": 1.7243277880402155, + "learning_rate": 1.9999079208940137e-05, + "loss": 1.1349, + "step": 213 + }, + { + "epoch": 0.03, + "grad_norm": 1.6695632873035346, + "learning_rate": 1.9999007017936436e-05, + "loss": 1.0211, + "step": 214 + }, + { + "epoch": 0.03, + "grad_norm": 1.7941020451680676, + "learning_rate": 1.99989321029276e-05, + "loss": 0.982, + "step": 215 + }, + { + "epoch": 0.03, + "grad_norm": 1.9803340995881384, + "learning_rate": 1.999885446393404e-05, + "loss": 1.07, + "step": 216 + }, + { + "epoch": 0.03, + "grad_norm": 1.8949564240388055, + "learning_rate": 1.9998774100976903e-05, + "loss": 0.9726, + "step": 217 + }, + { + "epoch": 0.04, + "grad_norm": 1.6204232041265776, + "learning_rate": 1.999869101407808e-05, + "loss": 1.0177, + "step": 218 + }, + { + "epoch": 0.04, + "grad_norm": 1.437885558599507, + "learning_rate": 1.999860520326021e-05, + "loss": 1.0564, + "step": 219 + }, + { + "epoch": 0.04, + "grad_norm": 1.6407564383035635, + "learning_rate": 1.9998516668546675e-05, + "loss": 0.973, + "step": 220 + }, + { + "epoch": 0.04, + "grad_norm": 0.887030025916089, + "learning_rate": 1.9998425409961585e-05, + "loss": 0.2966, + "step": 221 + }, + { + "epoch": 0.04, + "grad_norm": 1.8936230193188892, + "learning_rate": 1.9998331427529803e-05, + "loss": 1.0326, + "step": 222 + }, + { + "epoch": 0.04, + "grad_norm": 1.682042121032617, + "learning_rate": 1.9998234721276938e-05, + "loss": 0.9614, + "step": 223 + }, + { + "epoch": 0.04, + "grad_norm": 1.6461869488164882, + "learning_rate": 1.999813529122933e-05, + "loss": 1.1369, + "step": 224 + }, + { + "epoch": 0.04, + "grad_norm": 1.6307905816513593, + "learning_rate": 1.999803313741407e-05, + "loss": 1.0643, + "step": 225 + }, + { + "epoch": 0.04, + "grad_norm": 0.7634064025143267, + "learning_rate": 1.9997928259858985e-05, + "loss": 0.314, + "step": 226 + }, + { + "epoch": 0.04, + "grad_norm": 1.7318951519404067, + "learning_rate": 1.9997820658592645e-05, + "loss": 1.094, + "step": 227 + }, + { + "epoch": 0.04, + "grad_norm": 1.4966028781453264, + "learning_rate": 1.999771033364437e-05, + "loss": 1.0218, + "step": 228 + }, + { + "epoch": 0.04, + "grad_norm": 1.8623967950584346, + "learning_rate": 1.999759728504421e-05, + "loss": 1.0368, + "step": 229 + }, + { + "epoch": 0.04, + "grad_norm": 1.8766268765177092, + "learning_rate": 1.9997481512822966e-05, + "loss": 1.1044, + "step": 230 + }, + { + "epoch": 0.04, + "grad_norm": 1.6312093604760378, + "learning_rate": 1.9997363017012174e-05, + "loss": 0.983, + "step": 231 + }, + { + "epoch": 0.04, + "grad_norm": 1.501956967595695, + "learning_rate": 1.9997241797644117e-05, + "loss": 0.9985, + "step": 232 + }, + { + "epoch": 0.04, + "grad_norm": 2.0000772545094674, + "learning_rate": 1.9997117854751818e-05, + "loss": 1.0484, + "step": 233 + }, + { + "epoch": 0.04, + "grad_norm": 1.607509648300964, + "learning_rate": 1.9996991188369045e-05, + "loss": 0.9913, + "step": 234 + }, + { + "epoch": 0.04, + "grad_norm": 1.5917588609708224, + "learning_rate": 1.9996861798530304e-05, + "loss": 0.9898, + "step": 235 + }, + { + "epoch": 0.04, + "grad_norm": 1.7948878807028363, + "learning_rate": 1.9996729685270844e-05, + "loss": 1.0135, + "step": 236 + }, + { + "epoch": 0.04, + "grad_norm": 1.5680855462110115, + "learning_rate": 1.9996594848626655e-05, + "loss": 1.0215, + "step": 237 + }, + { + "epoch": 0.04, + "grad_norm": 2.0745048954174523, + "learning_rate": 1.999645728863447e-05, + "loss": 1.0933, + "step": 238 + }, + { + "epoch": 0.04, + "grad_norm": 1.7530365154261676, + "learning_rate": 1.9996317005331768e-05, + "loss": 1.0567, + "step": 239 + }, + { + "epoch": 0.04, + "grad_norm": 1.6240767758637327, + "learning_rate": 1.999617399875676e-05, + "loss": 1.0015, + "step": 240 + }, + { + "epoch": 0.04, + "grad_norm": 1.8086770545196882, + "learning_rate": 1.9996028268948414e-05, + "loss": 1.0575, + "step": 241 + }, + { + "epoch": 0.04, + "grad_norm": 1.938130289662243, + "learning_rate": 1.9995879815946423e-05, + "loss": 1.0534, + "step": 242 + }, + { + "epoch": 0.04, + "grad_norm": 1.5878139852137902, + "learning_rate": 1.999572863979123e-05, + "loss": 1.0343, + "step": 243 + }, + { + "epoch": 0.04, + "grad_norm": 0.9034428631931116, + "learning_rate": 1.9995574740524024e-05, + "loss": 0.2794, + "step": 244 + }, + { + "epoch": 0.04, + "grad_norm": 0.8347427663141224, + "learning_rate": 1.9995418118186728e-05, + "loss": 0.259, + "step": 245 + }, + { + "epoch": 0.04, + "grad_norm": 1.7036518766055384, + "learning_rate": 1.9995258772822012e-05, + "loss": 1.0136, + "step": 246 + }, + { + "epoch": 0.04, + "grad_norm": 1.638416518183063, + "learning_rate": 1.9995096704473287e-05, + "loss": 1.0135, + "step": 247 + }, + { + "epoch": 0.04, + "grad_norm": 1.5603252338237057, + "learning_rate": 1.99949319131847e-05, + "loss": 1.0414, + "step": 248 + }, + { + "epoch": 0.04, + "grad_norm": 1.7747313662404303, + "learning_rate": 1.9994764399001153e-05, + "loss": 1.1135, + "step": 249 + }, + { + "epoch": 0.04, + "grad_norm": 1.5724468043882305, + "learning_rate": 1.999459416196827e-05, + "loss": 1.0623, + "step": 250 + }, + { + "epoch": 0.04, + "grad_norm": 1.8867847177373578, + "learning_rate": 1.9994421202132436e-05, + "loss": 0.9757, + "step": 251 + }, + { + "epoch": 0.04, + "grad_norm": 1.762573509551815, + "learning_rate": 1.9994245519540772e-05, + "loss": 1.0376, + "step": 252 + }, + { + "epoch": 0.04, + "grad_norm": 1.5064456518314533, + "learning_rate": 1.9994067114241135e-05, + "loss": 1.0614, + "step": 253 + }, + { + "epoch": 0.04, + "grad_norm": 1.7505906238725135, + "learning_rate": 1.9993885986282125e-05, + "loss": 1.0021, + "step": 254 + }, + { + "epoch": 0.04, + "grad_norm": 1.4459820987644794, + "learning_rate": 1.9993702135713093e-05, + "loss": 0.3405, + "step": 255 + }, + { + "epoch": 0.04, + "grad_norm": 1.3961162375339864, + "learning_rate": 1.9993515562584117e-05, + "loss": 1.0088, + "step": 256 + }, + { + "epoch": 0.04, + "grad_norm": 1.7188801748657532, + "learning_rate": 1.9993326266946033e-05, + "loss": 0.9858, + "step": 257 + }, + { + "epoch": 0.04, + "grad_norm": 1.65953326373045, + "learning_rate": 1.9993134248850402e-05, + "loss": 0.9896, + "step": 258 + }, + { + "epoch": 0.04, + "grad_norm": 1.7575140535464575, + "learning_rate": 1.9992939508349544e-05, + "loss": 1.037, + "step": 259 + }, + { + "epoch": 0.04, + "grad_norm": 1.6528061147711812, + "learning_rate": 1.9992742045496502e-05, + "loss": 0.9862, + "step": 260 + }, + { + "epoch": 0.04, + "grad_norm": 2.326822861609363, + "learning_rate": 1.999254186034508e-05, + "loss": 0.988, + "step": 261 + }, + { + "epoch": 0.04, + "grad_norm": 1.641734343088275, + "learning_rate": 1.9992338952949805e-05, + "loss": 1.0615, + "step": 262 + }, + { + "epoch": 0.04, + "grad_norm": 1.6920213770140218, + "learning_rate": 1.9992133323365963e-05, + "loss": 1.0513, + "step": 263 + }, + { + "epoch": 0.04, + "grad_norm": 1.74228701927351, + "learning_rate": 1.9991924971649566e-05, + "loss": 1.0678, + "step": 264 + }, + { + "epoch": 0.04, + "grad_norm": 1.548480057299955, + "learning_rate": 1.9991713897857376e-05, + "loss": 0.9962, + "step": 265 + }, + { + "epoch": 0.04, + "grad_norm": 2.508814252888038, + "learning_rate": 1.99915001020469e-05, + "loss": 0.9918, + "step": 266 + }, + { + "epoch": 0.04, + "grad_norm": 1.5128546957737885, + "learning_rate": 1.999128358427638e-05, + "loss": 1.0114, + "step": 267 + }, + { + "epoch": 0.04, + "grad_norm": 1.7263825649686189, + "learning_rate": 1.9991064344604798e-05, + "loss": 1.0112, + "step": 268 + }, + { + "epoch": 0.04, + "grad_norm": 1.8627291717203835, + "learning_rate": 1.9990842383091884e-05, + "loss": 0.9972, + "step": 269 + }, + { + "epoch": 0.04, + "grad_norm": 1.9641887063953032, + "learning_rate": 1.9990617699798104e-05, + "loss": 1.1887, + "step": 270 + }, + { + "epoch": 0.04, + "grad_norm": 1.671211556682236, + "learning_rate": 1.999039029478467e-05, + "loss": 1.0209, + "step": 271 + }, + { + "epoch": 0.04, + "grad_norm": 1.9177823850207563, + "learning_rate": 1.9990160168113534e-05, + "loss": 1.1049, + "step": 272 + }, + { + "epoch": 0.04, + "grad_norm": 1.7138496283911413, + "learning_rate": 1.998992731984739e-05, + "loss": 0.9027, + "step": 273 + }, + { + "epoch": 0.04, + "grad_norm": 1.68064719107991, + "learning_rate": 1.998969175004967e-05, + "loss": 1.0262, + "step": 274 + }, + { + "epoch": 0.04, + "grad_norm": 1.582945199395315, + "learning_rate": 1.9989453458784544e-05, + "loss": 1.0366, + "step": 275 + }, + { + "epoch": 0.04, + "grad_norm": 1.4032919568281048, + "learning_rate": 1.998921244611694e-05, + "loss": 0.3259, + "step": 276 + }, + { + "epoch": 0.04, + "grad_norm": 1.575045969364126, + "learning_rate": 1.9988968712112512e-05, + "loss": 1.0383, + "step": 277 + }, + { + "epoch": 0.04, + "grad_norm": 1.160030215253682, + "learning_rate": 1.9988722256837656e-05, + "loss": 0.3398, + "step": 278 + }, + { + "epoch": 0.04, + "grad_norm": 1.517575500859136, + "learning_rate": 1.998847308035952e-05, + "loss": 0.9764, + "step": 279 + }, + { + "epoch": 0.05, + "grad_norm": 1.735991152407227, + "learning_rate": 1.998822118274598e-05, + "loss": 1.1008, + "step": 280 + }, + { + "epoch": 0.05, + "grad_norm": 2.043477594962616, + "learning_rate": 1.9987966564065663e-05, + "loss": 1.108, + "step": 281 + }, + { + "epoch": 0.05, + "grad_norm": 2.0173223338875044, + "learning_rate": 1.9987709224387935e-05, + "loss": 1.0837, + "step": 282 + }, + { + "epoch": 0.05, + "grad_norm": 1.6558523476719926, + "learning_rate": 1.9987449163782902e-05, + "loss": 1.051, + "step": 283 + }, + { + "epoch": 0.05, + "grad_norm": 1.909934445923109, + "learning_rate": 1.9987186382321408e-05, + "loss": 1.1229, + "step": 284 + }, + { + "epoch": 0.05, + "grad_norm": 1.5780926862798184, + "learning_rate": 1.998692088007505e-05, + "loss": 1.0195, + "step": 285 + }, + { + "epoch": 0.05, + "grad_norm": 1.6962813639095509, + "learning_rate": 1.998665265711615e-05, + "loss": 1.0278, + "step": 286 + }, + { + "epoch": 0.05, + "grad_norm": 2.1093561242283343, + "learning_rate": 1.9986381713517783e-05, + "loss": 1.0229, + "step": 287 + }, + { + "epoch": 0.05, + "grad_norm": 1.4771685082044144, + "learning_rate": 1.998610804935376e-05, + "loss": 1.0514, + "step": 288 + }, + { + "epoch": 0.05, + "grad_norm": 2.3456627789091917, + "learning_rate": 1.998583166469864e-05, + "loss": 1.0326, + "step": 289 + }, + { + "epoch": 0.05, + "grad_norm": 1.6743407986083125, + "learning_rate": 1.9985552559627708e-05, + "loss": 1.0177, + "step": 290 + }, + { + "epoch": 0.05, + "grad_norm": 1.9978865622552715, + "learning_rate": 1.998527073421701e-05, + "loss": 1.0525, + "step": 291 + }, + { + "epoch": 0.05, + "grad_norm": 1.4441850756252455, + "learning_rate": 1.9984986188543314e-05, + "loss": 0.9472, + "step": 292 + }, + { + "epoch": 0.05, + "grad_norm": 1.8749097494418914, + "learning_rate": 1.9984698922684146e-05, + "loss": 1.045, + "step": 293 + }, + { + "epoch": 0.05, + "grad_norm": 1.656638989338678, + "learning_rate": 1.9984408936717758e-05, + "loss": 1.0225, + "step": 294 + }, + { + "epoch": 0.05, + "grad_norm": 1.6307851877299302, + "learning_rate": 1.9984116230723157e-05, + "loss": 1.0178, + "step": 295 + }, + { + "epoch": 0.05, + "grad_norm": 1.8527458952212414, + "learning_rate": 1.9983820804780082e-05, + "loss": 1.1196, + "step": 296 + }, + { + "epoch": 0.05, + "grad_norm": 1.50788174665385, + "learning_rate": 1.998352265896901e-05, + "loss": 0.9815, + "step": 297 + }, + { + "epoch": 0.05, + "grad_norm": 1.637605150546744, + "learning_rate": 1.9983221793371173e-05, + "loss": 0.995, + "step": 298 + }, + { + "epoch": 0.05, + "grad_norm": 1.6395445388038, + "learning_rate": 1.9982918208068525e-05, + "loss": 0.3293, + "step": 299 + }, + { + "epoch": 0.05, + "grad_norm": 2.0625504790977898, + "learning_rate": 1.9982611903143782e-05, + "loss": 1.1525, + "step": 300 + }, + { + "epoch": 0.05, + "grad_norm": 1.56115777539679, + "learning_rate": 1.998230287868038e-05, + "loss": 1.0189, + "step": 301 + }, + { + "epoch": 0.05, + "grad_norm": 1.5393848263201955, + "learning_rate": 1.998199113476251e-05, + "loss": 1.0209, + "step": 302 + }, + { + "epoch": 0.05, + "grad_norm": 1.939003223778072, + "learning_rate": 1.9981676671475103e-05, + "loss": 1.0647, + "step": 303 + }, + { + "epoch": 0.05, + "grad_norm": 1.7285409785421004, + "learning_rate": 1.9981359488903818e-05, + "loss": 0.9428, + "step": 304 + }, + { + "epoch": 0.05, + "grad_norm": 1.6750264212851753, + "learning_rate": 1.9981039587135078e-05, + "loss": 1.0533, + "step": 305 + }, + { + "epoch": 0.05, + "grad_norm": 1.650964779271924, + "learning_rate": 1.9980716966256023e-05, + "loss": 1.0834, + "step": 306 + }, + { + "epoch": 0.05, + "grad_norm": 1.9692782552394792, + "learning_rate": 1.9980391626354543e-05, + "loss": 1.0232, + "step": 307 + }, + { + "epoch": 0.05, + "grad_norm": 1.6910486944626006, + "learning_rate": 1.998006356751928e-05, + "loss": 1.0881, + "step": 308 + }, + { + "epoch": 0.05, + "grad_norm": 1.402373648300866, + "learning_rate": 1.997973278983959e-05, + "loss": 0.9751, + "step": 309 + }, + { + "epoch": 0.05, + "grad_norm": 1.6707533277844913, + "learning_rate": 1.99793992934056e-05, + "loss": 1.0244, + "step": 310 + }, + { + "epoch": 0.05, + "grad_norm": 1.3672112820464226, + "learning_rate": 1.997906307830816e-05, + "loss": 1.0024, + "step": 311 + }, + { + "epoch": 0.05, + "grad_norm": 1.677283721679435, + "learning_rate": 1.9978724144638863e-05, + "loss": 0.9786, + "step": 312 + }, + { + "epoch": 0.05, + "grad_norm": 1.5901802641775895, + "learning_rate": 1.997838249249004e-05, + "loss": 1.0446, + "step": 313 + }, + { + "epoch": 0.05, + "grad_norm": 1.9036247426850756, + "learning_rate": 1.9978038121954775e-05, + "loss": 1.0828, + "step": 314 + }, + { + "epoch": 0.05, + "grad_norm": 1.6039006692071103, + "learning_rate": 1.9977691033126875e-05, + "loss": 1.0634, + "step": 315 + }, + { + "epoch": 0.05, + "grad_norm": 1.5195992045174704, + "learning_rate": 1.9977341226100905e-05, + "loss": 1.0296, + "step": 316 + }, + { + "epoch": 0.05, + "grad_norm": 1.7189598502667076, + "learning_rate": 1.9976988700972154e-05, + "loss": 1.0893, + "step": 317 + }, + { + "epoch": 0.05, + "grad_norm": 1.7745096855871998, + "learning_rate": 1.9976633457836664e-05, + "loss": 1.0324, + "step": 318 + }, + { + "epoch": 0.05, + "grad_norm": 1.7552055662842703, + "learning_rate": 1.9976275496791216e-05, + "loss": 0.9983, + "step": 319 + }, + { + "epoch": 0.05, + "grad_norm": 1.8826269210673037, + "learning_rate": 1.9975914817933325e-05, + "loss": 1.1194, + "step": 320 + }, + { + "epoch": 0.05, + "grad_norm": 1.7413744756051104, + "learning_rate": 1.9975551421361244e-05, + "loss": 1.0757, + "step": 321 + }, + { + "epoch": 0.05, + "grad_norm": 1.6086285095942818, + "learning_rate": 1.9975185307173985e-05, + "loss": 1.0564, + "step": 322 + }, + { + "epoch": 0.05, + "grad_norm": 1.4523290535940954, + "learning_rate": 1.9974816475471277e-05, + "loss": 1.0273, + "step": 323 + }, + { + "epoch": 0.05, + "grad_norm": 1.4488141846005422, + "learning_rate": 1.9974444926353605e-05, + "loss": 0.9917, + "step": 324 + }, + { + "epoch": 0.05, + "grad_norm": 1.483359612904228, + "learning_rate": 1.9974070659922184e-05, + "loss": 1.0363, + "step": 325 + }, + { + "epoch": 0.05, + "grad_norm": 1.181937241172379, + "learning_rate": 1.9973693676278985e-05, + "loss": 0.3275, + "step": 326 + }, + { + "epoch": 0.05, + "grad_norm": 1.7873387330257697, + "learning_rate": 1.9973313975526696e-05, + "loss": 1.0662, + "step": 327 + }, + { + "epoch": 0.05, + "grad_norm": 1.4729436282844879, + "learning_rate": 1.9972931557768768e-05, + "loss": 1.0659, + "step": 328 + }, + { + "epoch": 0.05, + "grad_norm": 1.8189004703508875, + "learning_rate": 1.9972546423109378e-05, + "loss": 1.0371, + "step": 329 + }, + { + "epoch": 0.05, + "grad_norm": 1.8560641588276288, + "learning_rate": 1.9972158571653442e-05, + "loss": 1.062, + "step": 330 + }, + { + "epoch": 0.05, + "grad_norm": 1.7930752705947612, + "learning_rate": 1.9971768003506635e-05, + "loss": 1.0352, + "step": 331 + }, + { + "epoch": 0.05, + "grad_norm": 1.720750424864003, + "learning_rate": 1.9971374718775346e-05, + "loss": 0.9813, + "step": 332 + }, + { + "epoch": 0.05, + "grad_norm": 1.3731088142749825, + "learning_rate": 1.997097871756672e-05, + "loss": 1.0148, + "step": 333 + }, + { + "epoch": 0.05, + "grad_norm": 1.5444373873369466, + "learning_rate": 1.9970579999988643e-05, + "loss": 0.906, + "step": 334 + }, + { + "epoch": 0.05, + "grad_norm": 1.765355201379636, + "learning_rate": 1.9970178566149734e-05, + "loss": 1.0247, + "step": 335 + }, + { + "epoch": 0.05, + "grad_norm": 1.5319686220315032, + "learning_rate": 1.996977441615935e-05, + "loss": 1.0515, + "step": 336 + }, + { + "epoch": 0.05, + "grad_norm": 1.5419664326426117, + "learning_rate": 1.99693675501276e-05, + "loss": 1.0184, + "step": 337 + }, + { + "epoch": 0.05, + "grad_norm": 1.5518617205103495, + "learning_rate": 1.996895796816532e-05, + "loss": 0.9798, + "step": 338 + }, + { + "epoch": 0.05, + "grad_norm": 1.630601807773335, + "learning_rate": 1.9968545670384094e-05, + "loss": 0.9609, + "step": 339 + }, + { + "epoch": 0.05, + "grad_norm": 2.0892124042353273, + "learning_rate": 1.9968130656896244e-05, + "loss": 1.1176, + "step": 340 + }, + { + "epoch": 0.05, + "grad_norm": 1.8048389658180481, + "learning_rate": 1.996771292781483e-05, + "loss": 1.0576, + "step": 341 + }, + { + "epoch": 0.06, + "grad_norm": 1.6254413451284853, + "learning_rate": 1.996729248325365e-05, + "loss": 1.0423, + "step": 342 + }, + { + "epoch": 0.06, + "grad_norm": 1.5440792373104453, + "learning_rate": 1.996686932332725e-05, + "loss": 0.994, + "step": 343 + }, + { + "epoch": 0.06, + "grad_norm": 1.62651917679758, + "learning_rate": 1.9966443448150906e-05, + "loss": 1.0093, + "step": 344 + }, + { + "epoch": 0.06, + "grad_norm": 1.5464446222891104, + "learning_rate": 1.9966014857840644e-05, + "loss": 0.9879, + "step": 345 + }, + { + "epoch": 0.06, + "grad_norm": 1.6410926016242666, + "learning_rate": 1.9965583552513216e-05, + "loss": 0.9956, + "step": 346 + }, + { + "epoch": 0.06, + "grad_norm": 1.3818818202674275, + "learning_rate": 1.9965149532286126e-05, + "loss": 1.0365, + "step": 347 + }, + { + "epoch": 0.06, + "grad_norm": 1.6038971364616648, + "learning_rate": 1.9964712797277614e-05, + "loss": 1.0633, + "step": 348 + }, + { + "epoch": 0.06, + "grad_norm": 1.8897259166516216, + "learning_rate": 1.9964273347606656e-05, + "loss": 1.0792, + "step": 349 + }, + { + "epoch": 0.06, + "grad_norm": 1.4591939123300526, + "learning_rate": 1.996383118339297e-05, + "loss": 0.986, + "step": 350 + }, + { + "epoch": 0.06, + "grad_norm": 1.563494940726578, + "learning_rate": 1.9963386304757018e-05, + "loss": 1.0064, + "step": 351 + }, + { + "epoch": 0.06, + "grad_norm": 1.3998777043409887, + "learning_rate": 1.9962938711819993e-05, + "loss": 1.0882, + "step": 352 + }, + { + "epoch": 0.06, + "grad_norm": 1.8929350165109857, + "learning_rate": 1.9962488404703832e-05, + "loss": 1.139, + "step": 353 + }, + { + "epoch": 0.06, + "grad_norm": 1.4728290806799589, + "learning_rate": 1.996203538353121e-05, + "loss": 1.0492, + "step": 354 + }, + { + "epoch": 0.06, + "grad_norm": 1.639981639829432, + "learning_rate": 1.9961579648425552e-05, + "loss": 0.9444, + "step": 355 + }, + { + "epoch": 0.06, + "grad_norm": 1.7431463267388456, + "learning_rate": 1.9961121199510996e-05, + "loss": 1.0705, + "step": 356 + }, + { + "epoch": 0.06, + "grad_norm": 1.411548324313841, + "learning_rate": 1.9960660036912453e-05, + "loss": 1.0136, + "step": 357 + }, + { + "epoch": 0.06, + "grad_norm": 1.7627777735476993, + "learning_rate": 1.9960196160755542e-05, + "loss": 1.0714, + "step": 358 + }, + { + "epoch": 0.06, + "grad_norm": 1.3695222792709723, + "learning_rate": 1.995972957116665e-05, + "loss": 0.9472, + "step": 359 + }, + { + "epoch": 0.06, + "grad_norm": 1.8009094463650925, + "learning_rate": 1.9959260268272876e-05, + "loss": 1.0052, + "step": 360 + }, + { + "epoch": 0.06, + "grad_norm": 1.564560276938673, + "learning_rate": 1.9958788252202078e-05, + "loss": 1.1055, + "step": 361 + }, + { + "epoch": 0.06, + "grad_norm": 1.7573095566418, + "learning_rate": 1.9958313523082842e-05, + "loss": 1.0904, + "step": 362 + }, + { + "epoch": 0.06, + "grad_norm": 1.4765610903021726, + "learning_rate": 1.9957836081044498e-05, + "loss": 0.9864, + "step": 363 + }, + { + "epoch": 0.06, + "grad_norm": 1.68714992520169, + "learning_rate": 1.995735592621712e-05, + "loss": 1.0252, + "step": 364 + }, + { + "epoch": 0.06, + "grad_norm": 1.6182491697953183, + "learning_rate": 1.9956873058731514e-05, + "loss": 1.1404, + "step": 365 + }, + { + "epoch": 0.06, + "grad_norm": 1.7733148089161146, + "learning_rate": 1.9956387478719222e-05, + "loss": 1.0286, + "step": 366 + }, + { + "epoch": 0.06, + "grad_norm": 1.8230162778386978, + "learning_rate": 1.9955899186312527e-05, + "loss": 1.0189, + "step": 367 + }, + { + "epoch": 0.06, + "grad_norm": 1.5368141632441858, + "learning_rate": 1.9955408181644464e-05, + "loss": 1.0836, + "step": 368 + }, + { + "epoch": 0.06, + "grad_norm": 1.6499475064832012, + "learning_rate": 1.9954914464848787e-05, + "loss": 1.0254, + "step": 369 + }, + { + "epoch": 0.06, + "grad_norm": 1.7510700594464543, + "learning_rate": 1.995441803606e-05, + "loss": 1.0241, + "step": 370 + }, + { + "epoch": 0.06, + "grad_norm": 1.9730704557263021, + "learning_rate": 1.9953918895413346e-05, + "loss": 1.0729, + "step": 371 + }, + { + "epoch": 0.06, + "grad_norm": 1.614765088108355, + "learning_rate": 1.9953417043044806e-05, + "loss": 1.0195, + "step": 372 + }, + { + "epoch": 0.06, + "grad_norm": 1.888963527902822, + "learning_rate": 1.9952912479091094e-05, + "loss": 1.0155, + "step": 373 + }, + { + "epoch": 0.06, + "grad_norm": 1.7263120571016581, + "learning_rate": 1.9952405203689668e-05, + "loss": 1.0115, + "step": 374 + }, + { + "epoch": 0.06, + "grad_norm": 1.7561186608571346, + "learning_rate": 1.9951895216978725e-05, + "loss": 0.9807, + "step": 375 + }, + { + "epoch": 0.06, + "grad_norm": 1.5541686445435405, + "learning_rate": 1.9951382519097197e-05, + "loss": 1.0311, + "step": 376 + }, + { + "epoch": 0.06, + "grad_norm": 1.594049936242053, + "learning_rate": 1.9950867110184765e-05, + "loss": 1.0424, + "step": 377 + }, + { + "epoch": 0.06, + "grad_norm": 1.6909264110736675, + "learning_rate": 1.995034899038183e-05, + "loss": 0.9985, + "step": 378 + }, + { + "epoch": 0.06, + "grad_norm": 1.824913138309026, + "learning_rate": 1.994982815982955e-05, + "loss": 0.9497, + "step": 379 + }, + { + "epoch": 0.06, + "grad_norm": 1.6141206621915138, + "learning_rate": 1.994930461866981e-05, + "loss": 1.0562, + "step": 380 + }, + { + "epoch": 0.06, + "grad_norm": 1.455954081298042, + "learning_rate": 1.9948778367045235e-05, + "loss": 0.968, + "step": 381 + }, + { + "epoch": 0.06, + "grad_norm": 1.5789371527564335, + "learning_rate": 1.99482494050992e-05, + "loss": 1.0081, + "step": 382 + }, + { + "epoch": 0.06, + "grad_norm": 1.9240866879437077, + "learning_rate": 1.9947717732975795e-05, + "loss": 1.0633, + "step": 383 + }, + { + "epoch": 0.06, + "grad_norm": 1.686415790353978, + "learning_rate": 1.994718335081987e-05, + "loss": 1.0187, + "step": 384 + }, + { + "epoch": 0.06, + "grad_norm": 1.5507195584159947, + "learning_rate": 1.9946646258777008e-05, + "loss": 1.0076, + "step": 385 + }, + { + "epoch": 0.06, + "grad_norm": 1.4315745063907792, + "learning_rate": 1.994610645699352e-05, + "loss": 0.9508, + "step": 386 + }, + { + "epoch": 0.06, + "grad_norm": 1.3875781277566939, + "learning_rate": 1.994556394561647e-05, + "loss": 0.952, + "step": 387 + }, + { + "epoch": 0.06, + "grad_norm": 1.7184657533726364, + "learning_rate": 1.994501872479365e-05, + "loss": 1.0146, + "step": 388 + }, + { + "epoch": 0.06, + "grad_norm": 1.8936008587422144, + "learning_rate": 1.9944470794673592e-05, + "loss": 0.945, + "step": 389 + }, + { + "epoch": 0.06, + "grad_norm": 1.4647632761149083, + "learning_rate": 1.994392015540557e-05, + "loss": 1.0143, + "step": 390 + }, + { + "epoch": 0.06, + "grad_norm": 1.7073484018130995, + "learning_rate": 1.994336680713959e-05, + "loss": 1.0782, + "step": 391 + }, + { + "epoch": 0.06, + "grad_norm": 1.4760453400861175, + "learning_rate": 1.9942810750026403e-05, + "loss": 0.9985, + "step": 392 + }, + { + "epoch": 0.06, + "grad_norm": 1.8949264271527535, + "learning_rate": 1.9942251984217492e-05, + "loss": 0.9794, + "step": 393 + }, + { + "epoch": 0.06, + "grad_norm": 1.908689342272236, + "learning_rate": 1.994169050986508e-05, + "loss": 1.0248, + "step": 394 + }, + { + "epoch": 0.06, + "grad_norm": 1.3058824948504468, + "learning_rate": 1.9941126327122128e-05, + "loss": 0.9788, + "step": 395 + }, + { + "epoch": 0.06, + "grad_norm": 1.8176228099850074, + "learning_rate": 1.9940559436142338e-05, + "loss": 1.0306, + "step": 396 + }, + { + "epoch": 0.06, + "grad_norm": 1.476718204982747, + "learning_rate": 1.9939989837080143e-05, + "loss": 0.9584, + "step": 397 + }, + { + "epoch": 0.06, + "grad_norm": 2.060066210472057, + "learning_rate": 1.9939417530090722e-05, + "loss": 1.0027, + "step": 398 + }, + { + "epoch": 0.06, + "grad_norm": 1.702479137220705, + "learning_rate": 1.993884251532998e-05, + "loss": 1.0553, + "step": 399 + }, + { + "epoch": 0.06, + "grad_norm": 1.6689142645039947, + "learning_rate": 1.9938264792954573e-05, + "loss": 1.0583, + "step": 400 + }, + { + "epoch": 0.06, + "grad_norm": 1.524097043100204, + "learning_rate": 1.9937684363121886e-05, + "loss": 0.9643, + "step": 401 + }, + { + "epoch": 0.06, + "grad_norm": 2.0548985399170636, + "learning_rate": 1.9937101225990046e-05, + "loss": 0.9941, + "step": 402 + }, + { + "epoch": 0.06, + "grad_norm": 1.9681639215261673, + "learning_rate": 1.993651538171791e-05, + "loss": 1.046, + "step": 403 + }, + { + "epoch": 0.07, + "grad_norm": 1.5211915443206312, + "learning_rate": 1.993592683046509e-05, + "loss": 1.0181, + "step": 404 + }, + { + "epoch": 0.07, + "grad_norm": 1.8378544676180313, + "learning_rate": 1.993533557239191e-05, + "loss": 0.9782, + "step": 405 + }, + { + "epoch": 0.07, + "grad_norm": 1.525910033862189, + "learning_rate": 1.993474160765945e-05, + "loss": 0.9425, + "step": 406 + }, + { + "epoch": 0.07, + "grad_norm": 1.433443295434048, + "learning_rate": 1.9934144936429526e-05, + "loss": 0.9982, + "step": 407 + }, + { + "epoch": 0.07, + "grad_norm": 0.8243129854884879, + "learning_rate": 1.9933545558864686e-05, + "loss": 0.3481, + "step": 408 + }, + { + "epoch": 0.07, + "grad_norm": 1.7822853266813, + "learning_rate": 1.9932943475128215e-05, + "loss": 0.9145, + "step": 409 + }, + { + "epoch": 0.07, + "grad_norm": 1.609621489731567, + "learning_rate": 1.993233868538414e-05, + "loss": 1.0196, + "step": 410 + }, + { + "epoch": 0.07, + "grad_norm": 1.8076276784022587, + "learning_rate": 1.9931731189797216e-05, + "loss": 1.0013, + "step": 411 + }, + { + "epoch": 0.07, + "grad_norm": 1.3161724523931488, + "learning_rate": 1.993112098853295e-05, + "loss": 1.0404, + "step": 412 + }, + { + "epoch": 0.07, + "grad_norm": 1.7723900625807714, + "learning_rate": 1.9930508081757572e-05, + "loss": 1.0119, + "step": 413 + }, + { + "epoch": 0.07, + "grad_norm": 1.4812569338147326, + "learning_rate": 1.9929892469638056e-05, + "loss": 1.0798, + "step": 414 + }, + { + "epoch": 0.07, + "grad_norm": 1.6181961544448267, + "learning_rate": 1.9929274152342113e-05, + "loss": 1.0017, + "step": 415 + }, + { + "epoch": 0.07, + "grad_norm": 1.7371539248301142, + "learning_rate": 1.9928653130038188e-05, + "loss": 1.0232, + "step": 416 + }, + { + "epoch": 0.07, + "grad_norm": 1.4399499242924538, + "learning_rate": 1.9928029402895466e-05, + "loss": 1.0, + "step": 417 + }, + { + "epoch": 0.07, + "grad_norm": 0.8199510913530488, + "learning_rate": 1.992740297108387e-05, + "loss": 0.3309, + "step": 418 + }, + { + "epoch": 0.07, + "grad_norm": 1.5022318361924163, + "learning_rate": 1.992677383477405e-05, + "loss": 1.0469, + "step": 419 + }, + { + "epoch": 0.07, + "grad_norm": 1.4357756270239264, + "learning_rate": 1.9926141994137404e-05, + "loss": 1.0193, + "step": 420 + }, + { + "epoch": 0.07, + "grad_norm": 1.7176110254294175, + "learning_rate": 1.9925507449346066e-05, + "loss": 0.9908, + "step": 421 + }, + { + "epoch": 0.07, + "grad_norm": 1.5735301802556574, + "learning_rate": 1.99248702005729e-05, + "loss": 1.0161, + "step": 422 + }, + { + "epoch": 0.07, + "grad_norm": 1.965738007688637, + "learning_rate": 1.9924230247991508e-05, + "loss": 1.0884, + "step": 423 + }, + { + "epoch": 0.07, + "grad_norm": 1.7346010264901173, + "learning_rate": 1.9923587591776236e-05, + "loss": 0.9484, + "step": 424 + }, + { + "epoch": 0.07, + "grad_norm": 1.4528084007557365, + "learning_rate": 1.992294223210216e-05, + "loss": 0.9744, + "step": 425 + }, + { + "epoch": 0.07, + "grad_norm": 1.4905042816359426, + "learning_rate": 1.9922294169145088e-05, + "loss": 0.9992, + "step": 426 + }, + { + "epoch": 0.07, + "grad_norm": 1.3913613620721677, + "learning_rate": 1.992164340308158e-05, + "loss": 1.0361, + "step": 427 + }, + { + "epoch": 0.07, + "grad_norm": 1.375308021945723, + "learning_rate": 1.9920989934088914e-05, + "loss": 0.9669, + "step": 428 + }, + { + "epoch": 0.07, + "grad_norm": 1.5424953205182184, + "learning_rate": 1.9920333762345116e-05, + "loss": 0.8922, + "step": 429 + }, + { + "epoch": 0.07, + "grad_norm": 1.4799823823901987, + "learning_rate": 1.9919674888028946e-05, + "loss": 0.9667, + "step": 430 + }, + { + "epoch": 0.07, + "grad_norm": 1.7058697045041662, + "learning_rate": 1.99190133113199e-05, + "loss": 1.0567, + "step": 431 + }, + { + "epoch": 0.07, + "grad_norm": 1.644740411908266, + "learning_rate": 1.991834903239821e-05, + "loss": 0.9518, + "step": 432 + }, + { + "epoch": 0.07, + "grad_norm": 1.7007928600142808, + "learning_rate": 1.991768205144484e-05, + "loss": 1.0071, + "step": 433 + }, + { + "epoch": 0.07, + "grad_norm": 1.385453195392088, + "learning_rate": 1.9917012368641497e-05, + "loss": 0.9523, + "step": 434 + }, + { + "epoch": 0.07, + "grad_norm": 1.627875518812952, + "learning_rate": 1.991633998417062e-05, + "loss": 0.9832, + "step": 435 + }, + { + "epoch": 0.07, + "grad_norm": 0.9107051420111736, + "learning_rate": 1.991566489821539e-05, + "loss": 0.3308, + "step": 436 + }, + { + "epoch": 0.07, + "grad_norm": 1.4563285754252062, + "learning_rate": 1.9914987110959713e-05, + "loss": 0.9231, + "step": 437 + }, + { + "epoch": 0.07, + "grad_norm": 1.9196050106916664, + "learning_rate": 1.9914306622588237e-05, + "loss": 1.0093, + "step": 438 + }, + { + "epoch": 0.07, + "grad_norm": 1.7538293904496163, + "learning_rate": 1.9913623433286346e-05, + "loss": 1.0578, + "step": 439 + }, + { + "epoch": 0.07, + "grad_norm": 0.8652342337248711, + "learning_rate": 1.9912937543240164e-05, + "loss": 0.3396, + "step": 440 + }, + { + "epoch": 0.07, + "grad_norm": 1.6572881146556147, + "learning_rate": 1.9912248952636543e-05, + "loss": 1.0178, + "step": 441 + }, + { + "epoch": 0.07, + "grad_norm": 1.7109232068009135, + "learning_rate": 1.9911557661663073e-05, + "loss": 0.9887, + "step": 442 + }, + { + "epoch": 0.07, + "grad_norm": 1.595434251402399, + "learning_rate": 1.9910863670508088e-05, + "loss": 1.1132, + "step": 443 + }, + { + "epoch": 0.07, + "grad_norm": 1.5232395569267112, + "learning_rate": 1.991016697936064e-05, + "loss": 1.0143, + "step": 444 + }, + { + "epoch": 0.07, + "grad_norm": 1.651354793071197, + "learning_rate": 1.990946758841053e-05, + "loss": 1.0009, + "step": 445 + }, + { + "epoch": 0.07, + "grad_norm": 1.6310819331229927, + "learning_rate": 1.9908765497848296e-05, + "loss": 0.9179, + "step": 446 + }, + { + "epoch": 0.07, + "grad_norm": 1.7366726107695452, + "learning_rate": 1.99080607078652e-05, + "loss": 1.06, + "step": 447 + }, + { + "epoch": 0.07, + "grad_norm": 1.815619264362159, + "learning_rate": 1.9907353218653254e-05, + "loss": 1.0394, + "step": 448 + }, + { + "epoch": 0.07, + "grad_norm": 1.5390582350850919, + "learning_rate": 1.9906643030405194e-05, + "loss": 0.9819, + "step": 449 + }, + { + "epoch": 0.07, + "grad_norm": 1.5205153836874603, + "learning_rate": 1.990593014331449e-05, + "loss": 1.0491, + "step": 450 + }, + { + "epoch": 0.07, + "grad_norm": 1.6056494263108478, + "learning_rate": 1.9905214557575357e-05, + "loss": 1.0196, + "step": 451 + }, + { + "epoch": 0.07, + "grad_norm": 1.5559826127831224, + "learning_rate": 1.990449627338274e-05, + "loss": 0.9372, + "step": 452 + }, + { + "epoch": 0.07, + "grad_norm": 1.4761872388681696, + "learning_rate": 1.990377529093232e-05, + "loss": 1.0121, + "step": 453 + }, + { + "epoch": 0.07, + "grad_norm": 1.703525297670604, + "learning_rate": 1.9903051610420513e-05, + "loss": 1.0269, + "step": 454 + }, + { + "epoch": 0.07, + "grad_norm": 1.5054716713238074, + "learning_rate": 1.990232523204447e-05, + "loss": 1.0465, + "step": 455 + }, + { + "epoch": 0.07, + "grad_norm": 1.6121865869864453, + "learning_rate": 1.9901596156002068e-05, + "loss": 1.0375, + "step": 456 + }, + { + "epoch": 0.07, + "grad_norm": 1.7807917397564925, + "learning_rate": 1.9900864382491936e-05, + "loss": 1.0955, + "step": 457 + }, + { + "epoch": 0.07, + "grad_norm": 1.4097293434748794, + "learning_rate": 1.9900129911713432e-05, + "loss": 0.9582, + "step": 458 + }, + { + "epoch": 0.07, + "grad_norm": 1.7116592226250393, + "learning_rate": 1.9899392743866638e-05, + "loss": 1.04, + "step": 459 + }, + { + "epoch": 0.07, + "grad_norm": 1.7387317555884088, + "learning_rate": 1.989865287915238e-05, + "loss": 0.977, + "step": 460 + }, + { + "epoch": 0.07, + "grad_norm": 2.1775498709759833, + "learning_rate": 1.9897910317772225e-05, + "loss": 1.0005, + "step": 461 + }, + { + "epoch": 0.07, + "grad_norm": 1.8294968857577099, + "learning_rate": 1.989716505992846e-05, + "loss": 0.9827, + "step": 462 + }, + { + "epoch": 0.07, + "grad_norm": 1.8810275201714084, + "learning_rate": 1.9896417105824113e-05, + "loss": 0.9714, + "step": 463 + }, + { + "epoch": 0.07, + "grad_norm": 1.36651019008579, + "learning_rate": 1.9895666455662953e-05, + "loss": 0.9835, + "step": 464 + }, + { + "epoch": 0.07, + "grad_norm": 1.3585447617215967, + "learning_rate": 1.9894913109649473e-05, + "loss": 1.0176, + "step": 465 + }, + { + "epoch": 0.08, + "grad_norm": 1.6100825309998545, + "learning_rate": 1.9894157067988908e-05, + "loss": 0.9368, + "step": 466 + }, + { + "epoch": 0.08, + "grad_norm": 1.3114512239743978, + "learning_rate": 1.989339833088722e-05, + "loss": 1.0122, + "step": 467 + }, + { + "epoch": 0.08, + "grad_norm": 1.5010144995819839, + "learning_rate": 1.989263689855112e-05, + "loss": 0.9749, + "step": 468 + }, + { + "epoch": 0.08, + "grad_norm": 1.5059774695126955, + "learning_rate": 1.9891872771188033e-05, + "loss": 0.9067, + "step": 469 + }, + { + "epoch": 0.08, + "grad_norm": 1.6094562165127992, + "learning_rate": 1.989110594900613e-05, + "loss": 1.0768, + "step": 470 + }, + { + "epoch": 0.08, + "grad_norm": 1.4925052644007377, + "learning_rate": 1.989033643221432e-05, + "loss": 1.0608, + "step": 471 + }, + { + "epoch": 0.08, + "grad_norm": 1.544481324366754, + "learning_rate": 1.9889564221022238e-05, + "loss": 0.9554, + "step": 472 + }, + { + "epoch": 0.08, + "grad_norm": 1.5266762681060964, + "learning_rate": 1.9888789315640253e-05, + "loss": 1.0018, + "step": 473 + }, + { + "epoch": 0.08, + "grad_norm": 1.4751214603497247, + "learning_rate": 1.9888011716279473e-05, + "loss": 0.943, + "step": 474 + }, + { + "epoch": 0.08, + "grad_norm": 1.3785953866989327, + "learning_rate": 1.9887231423151734e-05, + "loss": 0.9564, + "step": 475 + }, + { + "epoch": 0.08, + "grad_norm": 1.3859412827864828, + "learning_rate": 1.9886448436469618e-05, + "loss": 0.9552, + "step": 476 + }, + { + "epoch": 0.08, + "grad_norm": 0.8289578636784593, + "learning_rate": 1.988566275644642e-05, + "loss": 0.3187, + "step": 477 + }, + { + "epoch": 0.08, + "grad_norm": 1.3707416308466214, + "learning_rate": 1.988487438329619e-05, + "loss": 0.9648, + "step": 478 + }, + { + "epoch": 0.08, + "grad_norm": 1.6446717801385633, + "learning_rate": 1.98840833172337e-05, + "loss": 0.9258, + "step": 479 + }, + { + "epoch": 0.08, + "grad_norm": 1.2461160662276098, + "learning_rate": 1.988328955847446e-05, + "loss": 0.9164, + "step": 480 + }, + { + "epoch": 0.08, + "grad_norm": 1.6122334555239015, + "learning_rate": 1.9882493107234706e-05, + "loss": 0.8933, + "step": 481 + }, + { + "epoch": 0.08, + "grad_norm": 1.4534250252179526, + "learning_rate": 1.9881693963731418e-05, + "loss": 1.0058, + "step": 482 + }, + { + "epoch": 0.08, + "grad_norm": 1.3428309716123157, + "learning_rate": 1.9880892128182302e-05, + "loss": 1.0055, + "step": 483 + }, + { + "epoch": 0.08, + "grad_norm": 1.4970362703567461, + "learning_rate": 1.9880087600805807e-05, + "loss": 0.981, + "step": 484 + }, + { + "epoch": 0.08, + "grad_norm": 1.4071957461062348, + "learning_rate": 1.9879280381821104e-05, + "loss": 0.9707, + "step": 485 + }, + { + "epoch": 0.08, + "grad_norm": 1.4701499465382966, + "learning_rate": 1.9878470471448094e-05, + "loss": 1.0165, + "step": 486 + }, + { + "epoch": 0.08, + "grad_norm": 1.6010401333283664, + "learning_rate": 1.987765786990743e-05, + "loss": 0.9935, + "step": 487 + }, + { + "epoch": 0.08, + "grad_norm": 1.3917991517985413, + "learning_rate": 1.9876842577420484e-05, + "loss": 1.069, + "step": 488 + }, + { + "epoch": 0.08, + "grad_norm": 1.4724889693126333, + "learning_rate": 1.987602459420936e-05, + "loss": 1.0009, + "step": 489 + }, + { + "epoch": 0.08, + "grad_norm": 0.8976833241275014, + "learning_rate": 1.9875203920496905e-05, + "loss": 0.3302, + "step": 490 + }, + { + "epoch": 0.08, + "grad_norm": 1.6949833141619641, + "learning_rate": 1.987438055650669e-05, + "loss": 1.0055, + "step": 491 + }, + { + "epoch": 0.08, + "grad_norm": 2.5229104062877368, + "learning_rate": 1.987355450246302e-05, + "loss": 1.038, + "step": 492 + }, + { + "epoch": 0.08, + "grad_norm": 1.4779530075832674, + "learning_rate": 1.9872725758590943e-05, + "loss": 0.9973, + "step": 493 + }, + { + "epoch": 0.08, + "grad_norm": 1.421245705880415, + "learning_rate": 1.987189432511622e-05, + "loss": 0.9712, + "step": 494 + }, + { + "epoch": 0.08, + "grad_norm": 1.469867318722636, + "learning_rate": 1.9871060202265367e-05, + "loss": 0.9934, + "step": 495 + }, + { + "epoch": 0.08, + "grad_norm": 1.7104782006206596, + "learning_rate": 1.9870223390265614e-05, + "loss": 1.0077, + "step": 496 + }, + { + "epoch": 0.08, + "grad_norm": 1.706796633268622, + "learning_rate": 1.9869383889344937e-05, + "loss": 1.0335, + "step": 497 + }, + { + "epoch": 0.08, + "grad_norm": 1.480755832863949, + "learning_rate": 1.9868541699732037e-05, + "loss": 0.9532, + "step": 498 + }, + { + "epoch": 0.08, + "grad_norm": 1.6526716957477727, + "learning_rate": 1.986769682165635e-05, + "loss": 0.9847, + "step": 499 + }, + { + "epoch": 0.08, + "grad_norm": 2.1786649935095883, + "learning_rate": 1.9866849255348045e-05, + "loss": 1.0877, + "step": 500 + }, + { + "epoch": 0.08, + "grad_norm": 1.4980741993228723, + "learning_rate": 1.9865999001038022e-05, + "loss": 0.951, + "step": 501 + }, + { + "epoch": 0.08, + "grad_norm": 1.575089665729132, + "learning_rate": 1.986514605895791e-05, + "loss": 1.0699, + "step": 502 + }, + { + "epoch": 0.08, + "grad_norm": 1.7843754122845494, + "learning_rate": 1.9864290429340084e-05, + "loss": 1.003, + "step": 503 + }, + { + "epoch": 0.08, + "grad_norm": 1.4165192622628238, + "learning_rate": 1.9863432112417628e-05, + "loss": 0.9954, + "step": 504 + }, + { + "epoch": 0.08, + "grad_norm": 1.593616671688619, + "learning_rate": 1.9862571108424377e-05, + "loss": 0.934, + "step": 505 + }, + { + "epoch": 0.08, + "grad_norm": 1.5011571931828283, + "learning_rate": 1.9861707417594896e-05, + "loss": 1.0165, + "step": 506 + }, + { + "epoch": 0.08, + "grad_norm": 1.7520585979918581, + "learning_rate": 1.9860841040164476e-05, + "loss": 0.9835, + "step": 507 + }, + { + "epoch": 0.08, + "grad_norm": 1.7396596527894448, + "learning_rate": 1.9859971976369136e-05, + "loss": 1.0211, + "step": 508 + }, + { + "epoch": 0.08, + "grad_norm": 1.7306438081944724, + "learning_rate": 1.9859100226445643e-05, + "loss": 1.1035, + "step": 509 + }, + { + "epoch": 0.08, + "grad_norm": 1.7422858229719738, + "learning_rate": 1.9858225790631477e-05, + "loss": 1.0117, + "step": 510 + }, + { + "epoch": 0.08, + "grad_norm": 1.7122130353245015, + "learning_rate": 1.9857348669164863e-05, + "loss": 1.0893, + "step": 511 + }, + { + "epoch": 0.08, + "grad_norm": 1.5510761829707955, + "learning_rate": 1.9856468862284752e-05, + "loss": 0.9938, + "step": 512 + }, + { + "epoch": 0.08, + "grad_norm": 1.600675357534195, + "learning_rate": 1.9855586370230832e-05, + "loss": 0.9905, + "step": 513 + }, + { + "epoch": 0.08, + "grad_norm": 0.8417931222500095, + "learning_rate": 1.9854701193243507e-05, + "loss": 0.3402, + "step": 514 + }, + { + "epoch": 0.08, + "grad_norm": 1.4444928667550752, + "learning_rate": 1.9853813331563934e-05, + "loss": 1.0186, + "step": 515 + }, + { + "epoch": 0.08, + "grad_norm": 1.8910820355383855, + "learning_rate": 1.9852922785433985e-05, + "loss": 1.0379, + "step": 516 + }, + { + "epoch": 0.08, + "grad_norm": 1.6887345136293097, + "learning_rate": 1.9852029555096278e-05, + "loss": 1.0209, + "step": 517 + }, + { + "epoch": 0.08, + "grad_norm": 1.5834507190227096, + "learning_rate": 1.985113364079414e-05, + "loss": 0.9821, + "step": 518 + }, + { + "epoch": 0.08, + "grad_norm": 1.2275269601842889, + "learning_rate": 1.9850235042771655e-05, + "loss": 0.9621, + "step": 519 + }, + { + "epoch": 0.08, + "grad_norm": 1.8640374742348473, + "learning_rate": 1.984933376127362e-05, + "loss": 1.0059, + "step": 520 + }, + { + "epoch": 0.08, + "grad_norm": 0.8888832694970744, + "learning_rate": 1.9848429796545566e-05, + "loss": 0.3355, + "step": 521 + }, + { + "epoch": 0.08, + "grad_norm": 1.814925186786731, + "learning_rate": 1.9847523148833767e-05, + "loss": 1.0618, + "step": 522 + }, + { + "epoch": 0.08, + "grad_norm": 1.549381263161665, + "learning_rate": 1.9846613818385215e-05, + "loss": 0.9817, + "step": 523 + }, + { + "epoch": 0.08, + "grad_norm": 0.7808394402329957, + "learning_rate": 1.984570180544763e-05, + "loss": 0.3423, + "step": 524 + }, + { + "epoch": 0.08, + "grad_norm": 1.530479577938019, + "learning_rate": 1.9844787110269478e-05, + "loss": 0.9335, + "step": 525 + }, + { + "epoch": 0.08, + "grad_norm": 1.5625682839731763, + "learning_rate": 1.984386973309994e-05, + "loss": 0.9963, + "step": 526 + }, + { + "epoch": 0.08, + "grad_norm": 1.5507215445144904, + "learning_rate": 1.9842949674188946e-05, + "loss": 0.9877, + "step": 527 + }, + { + "epoch": 0.09, + "grad_norm": 1.8920167261577927, + "learning_rate": 1.9842026933787134e-05, + "loss": 1.016, + "step": 528 + }, + { + "epoch": 0.09, + "grad_norm": 1.6458307679423931, + "learning_rate": 1.984110151214589e-05, + "loss": 0.9544, + "step": 529 + }, + { + "epoch": 0.09, + "grad_norm": 1.5886101119090976, + "learning_rate": 1.984017340951732e-05, + "loss": 1.0105, + "step": 530 + }, + { + "epoch": 0.09, + "grad_norm": 1.592595595085504, + "learning_rate": 1.983924262615427e-05, + "loss": 0.997, + "step": 531 + }, + { + "epoch": 0.09, + "grad_norm": 1.6143874314245261, + "learning_rate": 1.9838309162310304e-05, + "loss": 0.9618, + "step": 532 + }, + { + "epoch": 0.09, + "grad_norm": 1.6251791227176355, + "learning_rate": 1.9837373018239733e-05, + "loss": 1.0344, + "step": 533 + }, + { + "epoch": 0.09, + "grad_norm": 1.6484005955547039, + "learning_rate": 1.983643419419758e-05, + "loss": 1.0121, + "step": 534 + }, + { + "epoch": 0.09, + "grad_norm": 1.6276407380127966, + "learning_rate": 1.983549269043961e-05, + "loss": 1.0624, + "step": 535 + }, + { + "epoch": 0.09, + "grad_norm": 1.5911812042159124, + "learning_rate": 1.9834548507222312e-05, + "loss": 0.9862, + "step": 536 + }, + { + "epoch": 0.09, + "grad_norm": 1.5357598975532294, + "learning_rate": 1.9833601644802915e-05, + "loss": 0.9959, + "step": 537 + }, + { + "epoch": 0.09, + "grad_norm": 1.4626722669422325, + "learning_rate": 1.983265210343936e-05, + "loss": 1.0341, + "step": 538 + }, + { + "epoch": 0.09, + "grad_norm": 1.563432996072597, + "learning_rate": 1.9831699883390335e-05, + "loss": 0.926, + "step": 539 + }, + { + "epoch": 0.09, + "grad_norm": 1.541617320144639, + "learning_rate": 1.9830744984915247e-05, + "loss": 0.9362, + "step": 540 + }, + { + "epoch": 0.09, + "grad_norm": 1.6672714895545047, + "learning_rate": 1.9829787408274247e-05, + "loss": 0.9223, + "step": 541 + }, + { + "epoch": 0.09, + "grad_norm": 1.610094210069431, + "learning_rate": 1.982882715372819e-05, + "loss": 0.9344, + "step": 542 + }, + { + "epoch": 0.09, + "grad_norm": 1.7564744911497732, + "learning_rate": 1.9827864221538684e-05, + "loss": 1.0191, + "step": 543 + }, + { + "epoch": 0.09, + "grad_norm": 1.5489001948845225, + "learning_rate": 1.982689861196806e-05, + "loss": 0.9263, + "step": 544 + }, + { + "epoch": 0.09, + "grad_norm": 1.512339729347485, + "learning_rate": 1.9825930325279373e-05, + "loss": 0.9847, + "step": 545 + }, + { + "epoch": 0.09, + "grad_norm": 1.4154611172666631, + "learning_rate": 1.982495936173641e-05, + "loss": 1.0316, + "step": 546 + }, + { + "epoch": 0.09, + "grad_norm": 1.7988069400513216, + "learning_rate": 1.9823985721603693e-05, + "loss": 0.9575, + "step": 547 + }, + { + "epoch": 0.09, + "grad_norm": 1.5701253516142373, + "learning_rate": 1.9823009405146465e-05, + "loss": 1.0409, + "step": 548 + }, + { + "epoch": 0.09, + "grad_norm": 1.4442892215336347, + "learning_rate": 1.98220304126307e-05, + "loss": 0.8872, + "step": 549 + }, + { + "epoch": 0.09, + "grad_norm": 1.8217126021464274, + "learning_rate": 1.9821048744323108e-05, + "loss": 0.9411, + "step": 550 + }, + { + "epoch": 0.09, + "grad_norm": 1.5172658286588774, + "learning_rate": 1.9820064400491118e-05, + "loss": 0.9895, + "step": 551 + }, + { + "epoch": 0.09, + "grad_norm": 1.5446771221060194, + "learning_rate": 1.9819077381402895e-05, + "loss": 1.0491, + "step": 552 + }, + { + "epoch": 0.09, + "grad_norm": 1.7391716680339055, + "learning_rate": 1.9818087687327328e-05, + "loss": 1.058, + "step": 553 + }, + { + "epoch": 0.09, + "grad_norm": 1.4993407829935261, + "learning_rate": 1.9817095318534038e-05, + "loss": 0.9445, + "step": 554 + }, + { + "epoch": 0.09, + "grad_norm": 1.685212541974343, + "learning_rate": 1.981610027529337e-05, + "loss": 1.0597, + "step": 555 + }, + { + "epoch": 0.09, + "grad_norm": 1.4486640179510566, + "learning_rate": 1.9815102557876406e-05, + "loss": 0.9561, + "step": 556 + }, + { + "epoch": 0.09, + "grad_norm": 1.4420148642321307, + "learning_rate": 1.9814102166554954e-05, + "loss": 0.9668, + "step": 557 + }, + { + "epoch": 0.09, + "grad_norm": 1.5418215156501733, + "learning_rate": 1.981309910160154e-05, + "loss": 1.017, + "step": 558 + }, + { + "epoch": 0.09, + "grad_norm": 1.61033671252527, + "learning_rate": 1.9812093363289433e-05, + "loss": 1.0394, + "step": 559 + }, + { + "epoch": 0.09, + "grad_norm": 1.6622225675803144, + "learning_rate": 1.9811084951892625e-05, + "loss": 0.8988, + "step": 560 + }, + { + "epoch": 0.09, + "grad_norm": 1.6713045686798444, + "learning_rate": 1.9810073867685828e-05, + "loss": 0.9534, + "step": 561 + }, + { + "epoch": 0.09, + "grad_norm": 2.0464720757135413, + "learning_rate": 1.980906011094449e-05, + "loss": 1.0253, + "step": 562 + }, + { + "epoch": 0.09, + "grad_norm": 1.6212485078684926, + "learning_rate": 1.9808043681944794e-05, + "loss": 1.0683, + "step": 563 + }, + { + "epoch": 0.09, + "grad_norm": 1.5537987635669974, + "learning_rate": 1.980702458096364e-05, + "loss": 1.0626, + "step": 564 + }, + { + "epoch": 0.09, + "grad_norm": 1.4552820328435718, + "learning_rate": 1.9806002808278658e-05, + "loss": 0.9823, + "step": 565 + }, + { + "epoch": 0.09, + "grad_norm": 1.6850626278621434, + "learning_rate": 1.9804978364168203e-05, + "loss": 1.0844, + "step": 566 + }, + { + "epoch": 0.09, + "grad_norm": 1.3830419941974257, + "learning_rate": 1.980395124891137e-05, + "loss": 1.0151, + "step": 567 + }, + { + "epoch": 0.09, + "grad_norm": 1.6673025569701518, + "learning_rate": 1.9802921462787968e-05, + "loss": 0.9543, + "step": 568 + }, + { + "epoch": 0.09, + "grad_norm": 1.846863848915581, + "learning_rate": 1.980188900607854e-05, + "loss": 1.0018, + "step": 569 + }, + { + "epoch": 0.09, + "grad_norm": 1.7474502493652029, + "learning_rate": 1.9800853879064356e-05, + "loss": 0.9577, + "step": 570 + }, + { + "epoch": 0.09, + "grad_norm": 1.5046378300788403, + "learning_rate": 1.9799816082027413e-05, + "loss": 1.0407, + "step": 571 + }, + { + "epoch": 0.09, + "grad_norm": 1.5528864468175123, + "learning_rate": 1.9798775615250434e-05, + "loss": 0.9954, + "step": 572 + }, + { + "epoch": 0.09, + "grad_norm": 1.825977782272403, + "learning_rate": 1.9797732479016874e-05, + "loss": 1.0272, + "step": 573 + }, + { + "epoch": 0.09, + "grad_norm": 1.637205952713108, + "learning_rate": 1.979668667361091e-05, + "loss": 1.0378, + "step": 574 + }, + { + "epoch": 0.09, + "grad_norm": 1.5489515265026832, + "learning_rate": 1.9795638199317452e-05, + "loss": 0.9048, + "step": 575 + }, + { + "epoch": 0.09, + "grad_norm": 1.440307956013016, + "learning_rate": 1.9794587056422125e-05, + "loss": 0.9603, + "step": 576 + }, + { + "epoch": 0.09, + "grad_norm": 1.502972004265578, + "learning_rate": 1.9793533245211298e-05, + "loss": 0.9497, + "step": 577 + }, + { + "epoch": 0.09, + "grad_norm": 1.6354326991618577, + "learning_rate": 1.9792476765972055e-05, + "loss": 1.0276, + "step": 578 + }, + { + "epoch": 0.09, + "grad_norm": 1.4253277612656148, + "learning_rate": 1.979141761899221e-05, + "loss": 0.9528, + "step": 579 + }, + { + "epoch": 0.09, + "grad_norm": 0.9072753643670058, + "learning_rate": 1.9790355804560303e-05, + "loss": 0.3403, + "step": 580 + }, + { + "epoch": 0.09, + "grad_norm": 1.5730365640109556, + "learning_rate": 1.97892913229656e-05, + "loss": 1.0069, + "step": 581 + }, + { + "epoch": 0.09, + "grad_norm": 2.153647820009763, + "learning_rate": 1.9788224174498098e-05, + "loss": 1.0298, + "step": 582 + }, + { + "epoch": 0.09, + "grad_norm": 1.3392441269046116, + "learning_rate": 1.9787154359448518e-05, + "loss": 0.8837, + "step": 583 + }, + { + "epoch": 0.09, + "grad_norm": 1.749464771152944, + "learning_rate": 1.9786081878108304e-05, + "loss": 0.9364, + "step": 584 + }, + { + "epoch": 0.09, + "grad_norm": 1.5865814036377155, + "learning_rate": 1.9785006730769636e-05, + "loss": 0.96, + "step": 585 + }, + { + "epoch": 0.09, + "grad_norm": 1.5561946184480493, + "learning_rate": 1.9783928917725404e-05, + "loss": 1.0065, + "step": 586 + }, + { + "epoch": 0.09, + "grad_norm": 1.5148461346265685, + "learning_rate": 1.9782848439269244e-05, + "loss": 0.914, + "step": 587 + }, + { + "epoch": 0.09, + "grad_norm": 1.661197143623179, + "learning_rate": 1.97817652956955e-05, + "loss": 0.9168, + "step": 588 + }, + { + "epoch": 0.09, + "grad_norm": 0.7360151521247759, + "learning_rate": 1.9780679487299255e-05, + "loss": 0.337, + "step": 589 + }, + { + "epoch": 0.1, + "grad_norm": 1.526903974532986, + "learning_rate": 1.9779591014376312e-05, + "loss": 0.9939, + "step": 590 + }, + { + "epoch": 0.1, + "grad_norm": 1.3157470935434137, + "learning_rate": 1.9778499877223198e-05, + "loss": 0.9608, + "step": 591 + }, + { + "epoch": 0.1, + "grad_norm": 0.8192849490062599, + "learning_rate": 1.9777406076137174e-05, + "loss": 0.3163, + "step": 592 + }, + { + "epoch": 0.1, + "grad_norm": 1.4728078296169678, + "learning_rate": 1.977630961141622e-05, + "loss": 0.9406, + "step": 593 + }, + { + "epoch": 0.1, + "grad_norm": 1.7115210992278456, + "learning_rate": 1.9775210483359037e-05, + "loss": 1.017, + "step": 594 + }, + { + "epoch": 0.1, + "grad_norm": 1.387427566598701, + "learning_rate": 1.977410869226507e-05, + "loss": 1.0243, + "step": 595 + }, + { + "epoch": 0.1, + "grad_norm": 1.5437284941082634, + "learning_rate": 1.9773004238434465e-05, + "loss": 0.9284, + "step": 596 + }, + { + "epoch": 0.1, + "grad_norm": 1.4392059151420191, + "learning_rate": 1.9771897122168112e-05, + "loss": 0.9605, + "step": 597 + }, + { + "epoch": 0.1, + "grad_norm": 1.7448877099396356, + "learning_rate": 1.9770787343767622e-05, + "loss": 0.9585, + "step": 598 + }, + { + "epoch": 0.1, + "grad_norm": 1.6424659504728774, + "learning_rate": 1.9769674903535324e-05, + "loss": 0.9871, + "step": 599 + }, + { + "epoch": 0.1, + "grad_norm": 1.4184094557276807, + "learning_rate": 1.976855980177428e-05, + "loss": 0.9796, + "step": 600 + }, + { + "epoch": 0.1, + "grad_norm": 0.8428809416753293, + "learning_rate": 1.9767442038788273e-05, + "loss": 0.3396, + "step": 601 + }, + { + "epoch": 0.1, + "grad_norm": 1.4714204677621197, + "learning_rate": 1.9766321614881814e-05, + "loss": 0.9283, + "step": 602 + }, + { + "epoch": 0.1, + "grad_norm": 1.9803664690802016, + "learning_rate": 1.976519853036014e-05, + "loss": 0.9631, + "step": 603 + }, + { + "epoch": 0.1, + "grad_norm": 1.95296992926116, + "learning_rate": 1.9764072785529203e-05, + "loss": 1.034, + "step": 604 + }, + { + "epoch": 0.1, + "grad_norm": 1.67264133898393, + "learning_rate": 1.9762944380695692e-05, + "loss": 1.0424, + "step": 605 + }, + { + "epoch": 0.1, + "grad_norm": 1.5533802161818493, + "learning_rate": 1.9761813316167014e-05, + "loss": 0.9876, + "step": 606 + }, + { + "epoch": 0.1, + "grad_norm": 1.6730692427326996, + "learning_rate": 1.9760679592251306e-05, + "loss": 0.9586, + "step": 607 + }, + { + "epoch": 0.1, + "grad_norm": 1.834201012837671, + "learning_rate": 1.975954320925742e-05, + "loss": 0.9773, + "step": 608 + }, + { + "epoch": 0.1, + "grad_norm": 1.7643256712935411, + "learning_rate": 1.975840416749494e-05, + "loss": 1.0029, + "step": 609 + }, + { + "epoch": 0.1, + "grad_norm": 1.447756337232945, + "learning_rate": 1.9757262467274173e-05, + "loss": 0.9421, + "step": 610 + }, + { + "epoch": 0.1, + "grad_norm": 1.6828710792886568, + "learning_rate": 1.975611810890615e-05, + "loss": 0.863, + "step": 611 + }, + { + "epoch": 0.1, + "grad_norm": 1.5781019328338877, + "learning_rate": 1.9754971092702623e-05, + "loss": 0.9429, + "step": 612 + }, + { + "epoch": 0.1, + "grad_norm": 1.616119128613189, + "learning_rate": 1.9753821418976077e-05, + "loss": 0.9497, + "step": 613 + }, + { + "epoch": 0.1, + "grad_norm": 1.8413753957676127, + "learning_rate": 1.975266908803971e-05, + "loss": 1.0221, + "step": 614 + }, + { + "epoch": 0.1, + "grad_norm": 1.7576390000655862, + "learning_rate": 1.9751514100207444e-05, + "loss": 0.998, + "step": 615 + }, + { + "epoch": 0.1, + "grad_norm": 1.532347846351968, + "learning_rate": 1.975035645579394e-05, + "loss": 0.9582, + "step": 616 + }, + { + "epoch": 0.1, + "grad_norm": 1.7654309962669035, + "learning_rate": 1.9749196155114568e-05, + "loss": 0.9294, + "step": 617 + }, + { + "epoch": 0.1, + "grad_norm": 1.8417837389237854, + "learning_rate": 1.9748033198485422e-05, + "loss": 0.939, + "step": 618 + }, + { + "epoch": 0.1, + "grad_norm": 1.8983221332990239, + "learning_rate": 1.974686758622333e-05, + "loss": 0.9404, + "step": 619 + }, + { + "epoch": 0.1, + "grad_norm": 1.41671136601195, + "learning_rate": 1.9745699318645833e-05, + "loss": 0.9284, + "step": 620 + }, + { + "epoch": 0.1, + "grad_norm": 1.5361361670290725, + "learning_rate": 1.9744528396071197e-05, + "loss": 1.0013, + "step": 621 + }, + { + "epoch": 0.1, + "grad_norm": 1.4955227922733323, + "learning_rate": 1.9743354818818418e-05, + "loss": 1.071, + "step": 622 + }, + { + "epoch": 0.1, + "grad_norm": 1.9223903191922407, + "learning_rate": 1.974217858720721e-05, + "loss": 1.0495, + "step": 623 + }, + { + "epoch": 0.1, + "grad_norm": 1.6179983817389056, + "learning_rate": 1.9740999701558007e-05, + "loss": 0.97, + "step": 624 + }, + { + "epoch": 0.1, + "grad_norm": 1.39945336690626, + "learning_rate": 1.9739818162191976e-05, + "loss": 0.9472, + "step": 625 + }, + { + "epoch": 0.1, + "grad_norm": 1.5573252263780237, + "learning_rate": 1.9738633969431e-05, + "loss": 0.9569, + "step": 626 + }, + { + "epoch": 0.1, + "grad_norm": 1.8919730759552575, + "learning_rate": 1.973744712359768e-05, + "loss": 1.0172, + "step": 627 + }, + { + "epoch": 0.1, + "grad_norm": 1.6220258997211678, + "learning_rate": 1.973625762501535e-05, + "loss": 0.9754, + "step": 628 + }, + { + "epoch": 0.1, + "grad_norm": 1.4493628562627832, + "learning_rate": 1.973506547400806e-05, + "loss": 0.8455, + "step": 629 + }, + { + "epoch": 0.1, + "grad_norm": 1.4652016348754395, + "learning_rate": 1.9733870670900586e-05, + "loss": 1.0017, + "step": 630 + }, + { + "epoch": 0.1, + "grad_norm": 2.198366364166625, + "learning_rate": 1.973267321601843e-05, + "loss": 0.8924, + "step": 631 + }, + { + "epoch": 0.1, + "grad_norm": 1.539601500583414, + "learning_rate": 1.9731473109687807e-05, + "loss": 1.0371, + "step": 632 + }, + { + "epoch": 0.1, + "grad_norm": 1.4713246272224831, + "learning_rate": 1.9730270352235657e-05, + "loss": 1.0249, + "step": 633 + }, + { + "epoch": 0.1, + "grad_norm": 1.4570848983061602, + "learning_rate": 1.9729064943989646e-05, + "loss": 0.9311, + "step": 634 + }, + { + "epoch": 0.1, + "grad_norm": 1.470201882463915, + "learning_rate": 1.972785688527816e-05, + "loss": 0.9737, + "step": 635 + }, + { + "epoch": 0.1, + "grad_norm": 1.567526958052572, + "learning_rate": 1.972664617643031e-05, + "loss": 0.9863, + "step": 636 + }, + { + "epoch": 0.1, + "grad_norm": 1.4042792538000235, + "learning_rate": 1.9725432817775925e-05, + "loss": 0.9627, + "step": 637 + }, + { + "epoch": 0.1, + "grad_norm": 1.4887018003786663, + "learning_rate": 1.9724216809645557e-05, + "loss": 0.9179, + "step": 638 + }, + { + "epoch": 0.1, + "grad_norm": 1.4677996445915789, + "learning_rate": 1.9722998152370482e-05, + "loss": 0.9562, + "step": 639 + }, + { + "epoch": 0.1, + "grad_norm": 1.4153372724811504, + "learning_rate": 1.9721776846282692e-05, + "loss": 0.9961, + "step": 640 + }, + { + "epoch": 0.1, + "grad_norm": 1.7276058119433573, + "learning_rate": 1.9720552891714912e-05, + "loss": 1.0181, + "step": 641 + }, + { + "epoch": 0.1, + "grad_norm": 1.4730948988886279, + "learning_rate": 1.9719326289000568e-05, + "loss": 1.0313, + "step": 642 + }, + { + "epoch": 0.1, + "grad_norm": 1.750053732762348, + "learning_rate": 1.971809703847383e-05, + "loss": 0.9964, + "step": 643 + }, + { + "epoch": 0.1, + "grad_norm": 1.7496420279162306, + "learning_rate": 1.971686514046958e-05, + "loss": 0.9402, + "step": 644 + }, + { + "epoch": 0.1, + "grad_norm": 1.6719026819504315, + "learning_rate": 1.971563059532342e-05, + "loss": 0.9999, + "step": 645 + }, + { + "epoch": 0.1, + "grad_norm": 1.367447775861517, + "learning_rate": 1.971439340337167e-05, + "loss": 1.0441, + "step": 646 + }, + { + "epoch": 0.1, + "grad_norm": 1.7507052985343017, + "learning_rate": 1.971315356495138e-05, + "loss": 0.9752, + "step": 647 + }, + { + "epoch": 0.1, + "grad_norm": 1.5748322063049078, + "learning_rate": 1.9711911080400313e-05, + "loss": 0.9808, + "step": 648 + }, + { + "epoch": 0.1, + "grad_norm": 1.7030050679509858, + "learning_rate": 1.971066595005696e-05, + "loss": 1.05, + "step": 649 + }, + { + "epoch": 0.1, + "grad_norm": 1.4900469662127558, + "learning_rate": 1.9709418174260523e-05, + "loss": 1.0449, + "step": 650 + }, + { + "epoch": 0.1, + "grad_norm": 1.457156969787931, + "learning_rate": 1.9708167753350932e-05, + "loss": 0.9481, + "step": 651 + }, + { + "epoch": 0.11, + "grad_norm": 1.3808206009175872, + "learning_rate": 1.9706914687668842e-05, + "loss": 1.0495, + "step": 652 + }, + { + "epoch": 0.11, + "grad_norm": 0.8923930729142374, + "learning_rate": 1.9705658977555617e-05, + "loss": 0.3003, + "step": 653 + }, + { + "epoch": 0.11, + "grad_norm": 1.610521852316017, + "learning_rate": 1.970440062335335e-05, + "loss": 0.9797, + "step": 654 + }, + { + "epoch": 0.11, + "grad_norm": 1.399962063710708, + "learning_rate": 1.9703139625404847e-05, + "loss": 0.9548, + "step": 655 + }, + { + "epoch": 0.11, + "grad_norm": 1.463369223098361, + "learning_rate": 1.9701875984053642e-05, + "loss": 0.9197, + "step": 656 + }, + { + "epoch": 0.11, + "grad_norm": 2.2312646170813215, + "learning_rate": 1.9700609699643984e-05, + "loss": 1.0428, + "step": 657 + }, + { + "epoch": 0.11, + "grad_norm": 1.597291799915292, + "learning_rate": 1.9699340772520847e-05, + "loss": 1.016, + "step": 658 + }, + { + "epoch": 0.11, + "grad_norm": 1.7694565907785076, + "learning_rate": 1.969806920302992e-05, + "loss": 0.9757, + "step": 659 + }, + { + "epoch": 0.11, + "grad_norm": 1.7935228250027269, + "learning_rate": 1.9696794991517613e-05, + "loss": 0.9329, + "step": 660 + }, + { + "epoch": 0.11, + "grad_norm": 1.6505922019468575, + "learning_rate": 1.9695518138331055e-05, + "loss": 0.9741, + "step": 661 + }, + { + "epoch": 0.11, + "grad_norm": 1.5367578628034648, + "learning_rate": 1.9694238643818097e-05, + "loss": 0.9973, + "step": 662 + }, + { + "epoch": 0.11, + "grad_norm": 1.4502875305165264, + "learning_rate": 1.9692956508327313e-05, + "loss": 0.9538, + "step": 663 + }, + { + "epoch": 0.11, + "grad_norm": 1.4146351004012245, + "learning_rate": 1.9691671732207986e-05, + "loss": 0.9769, + "step": 664 + }, + { + "epoch": 0.11, + "grad_norm": 1.7567028951603778, + "learning_rate": 1.9690384315810126e-05, + "loss": 0.9966, + "step": 665 + }, + { + "epoch": 0.11, + "grad_norm": 1.2693071152830517, + "learning_rate": 1.9689094259484462e-05, + "loss": 0.9405, + "step": 666 + }, + { + "epoch": 0.11, + "grad_norm": 1.568509474729593, + "learning_rate": 1.968780156358244e-05, + "loss": 1.045, + "step": 667 + }, + { + "epoch": 0.11, + "grad_norm": 1.6338375811717487, + "learning_rate": 1.9686506228456226e-05, + "loss": 0.9617, + "step": 668 + }, + { + "epoch": 0.11, + "grad_norm": 1.8935986755243135, + "learning_rate": 1.9685208254458707e-05, + "loss": 1.0117, + "step": 669 + }, + { + "epoch": 0.11, + "grad_norm": 1.4740464405465192, + "learning_rate": 1.9683907641943484e-05, + "loss": 0.9596, + "step": 670 + }, + { + "epoch": 0.11, + "grad_norm": 1.7310962777025045, + "learning_rate": 1.9682604391264882e-05, + "loss": 1.0062, + "step": 671 + }, + { + "epoch": 0.11, + "grad_norm": 1.7158998786832855, + "learning_rate": 1.968129850277794e-05, + "loss": 0.9914, + "step": 672 + }, + { + "epoch": 0.11, + "grad_norm": 1.3717880158855513, + "learning_rate": 1.9679989976838417e-05, + "loss": 0.9737, + "step": 673 + }, + { + "epoch": 0.11, + "grad_norm": 1.3973638032947824, + "learning_rate": 1.9678678813802796e-05, + "loss": 0.9382, + "step": 674 + }, + { + "epoch": 0.11, + "grad_norm": 1.6606824500968425, + "learning_rate": 1.9677365014028275e-05, + "loss": 1.0297, + "step": 675 + }, + { + "epoch": 0.11, + "grad_norm": 0.8757940823127864, + "learning_rate": 1.9676048577872762e-05, + "loss": 0.3257, + "step": 676 + }, + { + "epoch": 0.11, + "grad_norm": 1.6892297122950004, + "learning_rate": 1.9674729505694894e-05, + "loss": 1.0191, + "step": 677 + }, + { + "epoch": 0.11, + "grad_norm": 1.3807853317373082, + "learning_rate": 1.9673407797854024e-05, + "loss": 0.9823, + "step": 678 + }, + { + "epoch": 0.11, + "grad_norm": 1.721324456665953, + "learning_rate": 1.967208345471022e-05, + "loss": 1.0126, + "step": 679 + }, + { + "epoch": 0.11, + "grad_norm": 1.5577845785428022, + "learning_rate": 1.967075647662427e-05, + "loss": 0.9194, + "step": 680 + }, + { + "epoch": 0.11, + "grad_norm": 1.727788862903958, + "learning_rate": 1.9669426863957685e-05, + "loss": 0.9571, + "step": 681 + }, + { + "epoch": 0.11, + "grad_norm": 0.7814221275968994, + "learning_rate": 1.9668094617072683e-05, + "loss": 0.3445, + "step": 682 + }, + { + "epoch": 0.11, + "grad_norm": 1.8131501957442242, + "learning_rate": 1.96667597363322e-05, + "loss": 1.0327, + "step": 683 + }, + { + "epoch": 0.11, + "grad_norm": 1.632343488391787, + "learning_rate": 1.96654222220999e-05, + "loss": 0.9829, + "step": 684 + }, + { + "epoch": 0.11, + "grad_norm": 1.4787056830991403, + "learning_rate": 1.966408207474016e-05, + "loss": 0.9649, + "step": 685 + }, + { + "epoch": 0.11, + "grad_norm": 1.3966674731337554, + "learning_rate": 1.9662739294618067e-05, + "loss": 0.9872, + "step": 686 + }, + { + "epoch": 0.11, + "grad_norm": 1.39018185391193, + "learning_rate": 1.9661393882099436e-05, + "loss": 0.9799, + "step": 687 + }, + { + "epoch": 0.11, + "grad_norm": 1.475626628726188, + "learning_rate": 1.9660045837550798e-05, + "loss": 0.9701, + "step": 688 + }, + { + "epoch": 0.11, + "grad_norm": 1.5287454080941583, + "learning_rate": 1.965869516133939e-05, + "loss": 0.9643, + "step": 689 + }, + { + "epoch": 0.11, + "grad_norm": 1.7054515661264253, + "learning_rate": 1.9657341853833176e-05, + "loss": 1.087, + "step": 690 + }, + { + "epoch": 0.11, + "grad_norm": 1.657946627428576, + "learning_rate": 1.9655985915400834e-05, + "loss": 0.9404, + "step": 691 + }, + { + "epoch": 0.11, + "grad_norm": 1.6907505621177301, + "learning_rate": 1.965462734641176e-05, + "loss": 0.9408, + "step": 692 + }, + { + "epoch": 0.11, + "grad_norm": 1.8749806015680115, + "learning_rate": 1.965326614723607e-05, + "loss": 0.9964, + "step": 693 + }, + { + "epoch": 0.11, + "grad_norm": 1.45616433608959, + "learning_rate": 1.9651902318244582e-05, + "loss": 0.9314, + "step": 694 + }, + { + "epoch": 0.11, + "grad_norm": 2.0767157063071906, + "learning_rate": 1.9650535859808847e-05, + "loss": 0.999, + "step": 695 + }, + { + "epoch": 0.11, + "grad_norm": 1.5365009800331833, + "learning_rate": 1.9649166772301126e-05, + "loss": 1.0027, + "step": 696 + }, + { + "epoch": 0.11, + "grad_norm": 1.441424532195454, + "learning_rate": 1.96477950560944e-05, + "loss": 0.9286, + "step": 697 + }, + { + "epoch": 0.11, + "grad_norm": 1.349589205879623, + "learning_rate": 1.9646420711562354e-05, + "loss": 0.9229, + "step": 698 + }, + { + "epoch": 0.11, + "grad_norm": 1.4226175611955258, + "learning_rate": 1.9645043739079398e-05, + "loss": 1.0558, + "step": 699 + }, + { + "epoch": 0.11, + "grad_norm": 1.3311118518059941, + "learning_rate": 1.9643664139020666e-05, + "loss": 0.9723, + "step": 700 + }, + { + "epoch": 0.11, + "grad_norm": 1.7226782157258724, + "learning_rate": 1.964228191176199e-05, + "loss": 0.9137, + "step": 701 + }, + { + "epoch": 0.11, + "grad_norm": 1.5117564480046164, + "learning_rate": 1.964089705767993e-05, + "loss": 1.0138, + "step": 702 + }, + { + "epoch": 0.11, + "grad_norm": 2.416227656352775, + "learning_rate": 1.963950957715176e-05, + "loss": 0.9068, + "step": 703 + }, + { + "epoch": 0.11, + "grad_norm": 1.3735480994446339, + "learning_rate": 1.9638119470555462e-05, + "loss": 0.9429, + "step": 704 + }, + { + "epoch": 0.11, + "grad_norm": 1.7373350161156125, + "learning_rate": 1.9636726738269752e-05, + "loss": 0.9379, + "step": 705 + }, + { + "epoch": 0.11, + "grad_norm": 1.7882266538742742, + "learning_rate": 1.9635331380674035e-05, + "loss": 0.9882, + "step": 706 + }, + { + "epoch": 0.11, + "grad_norm": 1.4977221056090533, + "learning_rate": 1.9633933398148452e-05, + "loss": 1.0493, + "step": 707 + }, + { + "epoch": 0.11, + "grad_norm": 1.4260295886797731, + "learning_rate": 1.9632532791073847e-05, + "loss": 0.895, + "step": 708 + }, + { + "epoch": 0.11, + "grad_norm": 1.5192417669480027, + "learning_rate": 1.963112955983179e-05, + "loss": 0.9807, + "step": 709 + }, + { + "epoch": 0.11, + "grad_norm": 2.1667505531833817, + "learning_rate": 1.962972370480456e-05, + "loss": 0.9853, + "step": 710 + }, + { + "epoch": 0.11, + "grad_norm": 1.6719722719505665, + "learning_rate": 1.9628315226375146e-05, + "loss": 0.9816, + "step": 711 + }, + { + "epoch": 0.11, + "grad_norm": 1.7255617700254204, + "learning_rate": 1.962690412492726e-05, + "loss": 1.0424, + "step": 712 + }, + { + "epoch": 0.11, + "grad_norm": 1.5500255732580126, + "learning_rate": 1.9625490400845318e-05, + "loss": 0.9391, + "step": 713 + }, + { + "epoch": 0.12, + "grad_norm": 1.6285249929417152, + "learning_rate": 1.9624074054514467e-05, + "loss": 0.9907, + "step": 714 + }, + { + "epoch": 0.12, + "grad_norm": 1.5597101601498649, + "learning_rate": 1.9622655086320553e-05, + "loss": 1.0135, + "step": 715 + }, + { + "epoch": 0.12, + "grad_norm": 1.5207713302428703, + "learning_rate": 1.9621233496650143e-05, + "loss": 0.9534, + "step": 716 + }, + { + "epoch": 0.12, + "grad_norm": 1.5493225881391293, + "learning_rate": 1.9619809285890522e-05, + "loss": 0.962, + "step": 717 + }, + { + "epoch": 0.12, + "grad_norm": 1.50810602955461, + "learning_rate": 1.9618382454429675e-05, + "loss": 0.9718, + "step": 718 + }, + { + "epoch": 0.12, + "grad_norm": 1.2724014690640397, + "learning_rate": 1.9616953002656316e-05, + "loss": 0.9657, + "step": 719 + }, + { + "epoch": 0.12, + "grad_norm": 0.9781501948064185, + "learning_rate": 1.961552093095987e-05, + "loss": 0.3045, + "step": 720 + }, + { + "epoch": 0.12, + "grad_norm": 1.8744044167605414, + "learning_rate": 1.9614086239730464e-05, + "loss": 1.0012, + "step": 721 + }, + { + "epoch": 0.12, + "grad_norm": 1.473644649642615, + "learning_rate": 1.9612648929358953e-05, + "loss": 0.9605, + "step": 722 + }, + { + "epoch": 0.12, + "grad_norm": 1.468918031100101, + "learning_rate": 1.96112090002369e-05, + "loss": 1.0273, + "step": 723 + }, + { + "epoch": 0.12, + "grad_norm": 1.6835241369738312, + "learning_rate": 1.960976645275658e-05, + "loss": 0.9245, + "step": 724 + }, + { + "epoch": 0.12, + "grad_norm": 1.560987440184807, + "learning_rate": 1.9608321287310988e-05, + "loss": 0.9835, + "step": 725 + }, + { + "epoch": 0.12, + "grad_norm": 1.7013735366972578, + "learning_rate": 1.9606873504293818e-05, + "loss": 0.9607, + "step": 726 + }, + { + "epoch": 0.12, + "grad_norm": 1.771160337932077, + "learning_rate": 1.9605423104099492e-05, + "loss": 0.9985, + "step": 727 + }, + { + "epoch": 0.12, + "grad_norm": 1.5019205620637528, + "learning_rate": 1.9603970087123138e-05, + "loss": 0.9475, + "step": 728 + }, + { + "epoch": 0.12, + "grad_norm": 1.365805532363745, + "learning_rate": 1.9602514453760594e-05, + "loss": 0.979, + "step": 729 + }, + { + "epoch": 0.12, + "grad_norm": 1.5650361804561912, + "learning_rate": 1.960105620440842e-05, + "loss": 0.9291, + "step": 730 + }, + { + "epoch": 0.12, + "grad_norm": 1.5180372000381994, + "learning_rate": 1.959959533946388e-05, + "loss": 1.0143, + "step": 731 + }, + { + "epoch": 0.12, + "grad_norm": 1.5059946484298883, + "learning_rate": 1.9598131859324957e-05, + "loss": 0.9948, + "step": 732 + }, + { + "epoch": 0.12, + "grad_norm": 1.3796741695019268, + "learning_rate": 1.9596665764390344e-05, + "loss": 0.9525, + "step": 733 + }, + { + "epoch": 0.12, + "grad_norm": 1.5811880931920612, + "learning_rate": 1.959519705505944e-05, + "loss": 0.9811, + "step": 734 + }, + { + "epoch": 0.12, + "grad_norm": 1.4000327628662808, + "learning_rate": 1.9593725731732366e-05, + "loss": 0.9445, + "step": 735 + }, + { + "epoch": 0.12, + "grad_norm": 1.541705488401343, + "learning_rate": 1.9592251794809945e-05, + "loss": 0.8818, + "step": 736 + }, + { + "epoch": 0.12, + "grad_norm": 1.546617510586694, + "learning_rate": 1.959077524469373e-05, + "loss": 0.9014, + "step": 737 + }, + { + "epoch": 0.12, + "grad_norm": 1.424102952254737, + "learning_rate": 1.9589296081785966e-05, + "loss": 0.9278, + "step": 738 + }, + { + "epoch": 0.12, + "grad_norm": 1.4411367291040114, + "learning_rate": 1.9587814306489616e-05, + "loss": 1.0, + "step": 739 + }, + { + "epoch": 0.12, + "grad_norm": 1.8873381590103104, + "learning_rate": 1.958632991920836e-05, + "loss": 0.9418, + "step": 740 + }, + { + "epoch": 0.12, + "grad_norm": 1.5136138763391123, + "learning_rate": 1.9584842920346587e-05, + "loss": 1.0045, + "step": 741 + }, + { + "epoch": 0.12, + "grad_norm": 1.8201100754642396, + "learning_rate": 1.9583353310309393e-05, + "loss": 0.9731, + "step": 742 + }, + { + "epoch": 0.12, + "grad_norm": 0.8966083357799485, + "learning_rate": 1.9581861089502593e-05, + "loss": 0.3393, + "step": 743 + }, + { + "epoch": 0.12, + "grad_norm": 1.5898697626015028, + "learning_rate": 1.9580366258332703e-05, + "loss": 0.9036, + "step": 744 + }, + { + "epoch": 0.12, + "grad_norm": 1.3720072633368825, + "learning_rate": 1.957886881720696e-05, + "loss": 1.0097, + "step": 745 + }, + { + "epoch": 0.12, + "grad_norm": 1.771442218308349, + "learning_rate": 1.957736876653331e-05, + "loss": 0.9831, + "step": 746 + }, + { + "epoch": 0.12, + "grad_norm": 1.3587493124522976, + "learning_rate": 1.9575866106720405e-05, + "loss": 0.9454, + "step": 747 + }, + { + "epoch": 0.12, + "grad_norm": 1.5740061828229877, + "learning_rate": 1.957436083817762e-05, + "loss": 0.9369, + "step": 748 + }, + { + "epoch": 0.12, + "grad_norm": 1.5381501859536975, + "learning_rate": 1.9572852961315014e-05, + "loss": 0.9945, + "step": 749 + }, + { + "epoch": 0.12, + "grad_norm": 1.7111310544553027, + "learning_rate": 1.9571342476543387e-05, + "loss": 0.9366, + "step": 750 + }, + { + "epoch": 0.12, + "grad_norm": 1.4992931831532665, + "learning_rate": 1.9569829384274236e-05, + "loss": 1.0093, + "step": 751 + }, + { + "epoch": 0.12, + "grad_norm": 1.5998138359943668, + "learning_rate": 1.9568313684919765e-05, + "loss": 1.0145, + "step": 752 + }, + { + "epoch": 0.12, + "grad_norm": 1.6760071210983765, + "learning_rate": 1.9566795378892894e-05, + "loss": 1.0251, + "step": 753 + }, + { + "epoch": 0.12, + "grad_norm": 1.506149623967082, + "learning_rate": 1.9565274466607248e-05, + "loss": 1.0222, + "step": 754 + }, + { + "epoch": 0.12, + "grad_norm": 1.4014235854015034, + "learning_rate": 1.9563750948477174e-05, + "loss": 0.951, + "step": 755 + }, + { + "epoch": 0.12, + "grad_norm": 1.5764428500224703, + "learning_rate": 1.9562224824917716e-05, + "loss": 0.9965, + "step": 756 + }, + { + "epoch": 0.12, + "grad_norm": 1.5551635121890086, + "learning_rate": 1.956069609634463e-05, + "loss": 0.9541, + "step": 757 + }, + { + "epoch": 0.12, + "grad_norm": 1.334932481145727, + "learning_rate": 1.9559164763174384e-05, + "loss": 0.9184, + "step": 758 + }, + { + "epoch": 0.12, + "grad_norm": 1.7232799136344867, + "learning_rate": 1.9557630825824156e-05, + "loss": 0.9821, + "step": 759 + }, + { + "epoch": 0.12, + "grad_norm": 1.4842009015631046, + "learning_rate": 1.9556094284711834e-05, + "loss": 0.9316, + "step": 760 + }, + { + "epoch": 0.12, + "grad_norm": 1.4021308001397177, + "learning_rate": 1.9554555140256016e-05, + "loss": 0.9371, + "step": 761 + }, + { + "epoch": 0.12, + "grad_norm": 1.6581601428245059, + "learning_rate": 1.9553013392876005e-05, + "loss": 1.0304, + "step": 762 + }, + { + "epoch": 0.12, + "grad_norm": 1.5336689939807528, + "learning_rate": 1.955146904299181e-05, + "loss": 0.9504, + "step": 763 + }, + { + "epoch": 0.12, + "grad_norm": 1.4200238023350953, + "learning_rate": 1.9549922091024164e-05, + "loss": 0.9908, + "step": 764 + }, + { + "epoch": 0.12, + "grad_norm": 1.600408378979318, + "learning_rate": 1.9548372537394494e-05, + "loss": 0.9249, + "step": 765 + }, + { + "epoch": 0.12, + "grad_norm": 1.3351951775922273, + "learning_rate": 1.9546820382524943e-05, + "loss": 0.9492, + "step": 766 + }, + { + "epoch": 0.12, + "grad_norm": 1.3801701760115814, + "learning_rate": 1.9545265626838362e-05, + "loss": 0.9564, + "step": 767 + }, + { + "epoch": 0.12, + "grad_norm": 1.4167002923448229, + "learning_rate": 1.9543708270758306e-05, + "loss": 1.0039, + "step": 768 + }, + { + "epoch": 0.12, + "grad_norm": 1.4814022330782992, + "learning_rate": 1.9542148314709042e-05, + "loss": 0.9401, + "step": 769 + }, + { + "epoch": 0.12, + "grad_norm": 1.7291027375883745, + "learning_rate": 1.9540585759115548e-05, + "loss": 0.9193, + "step": 770 + }, + { + "epoch": 0.12, + "grad_norm": 1.4139960830591987, + "learning_rate": 1.9539020604403502e-05, + "loss": 0.9399, + "step": 771 + }, + { + "epoch": 0.12, + "grad_norm": 1.560926802916431, + "learning_rate": 1.95374528509993e-05, + "loss": 1.0697, + "step": 772 + }, + { + "epoch": 0.12, + "grad_norm": 1.5753101347232217, + "learning_rate": 1.9535882499330044e-05, + "loss": 1.0426, + "step": 773 + }, + { + "epoch": 0.12, + "grad_norm": 1.6574101899872662, + "learning_rate": 1.9534309549823535e-05, + "loss": 0.8504, + "step": 774 + }, + { + "epoch": 0.12, + "grad_norm": 1.8636191659484618, + "learning_rate": 1.9532734002908287e-05, + "loss": 0.9566, + "step": 775 + }, + { + "epoch": 0.13, + "grad_norm": 1.4122755657238062, + "learning_rate": 1.9531155859013527e-05, + "loss": 0.9675, + "step": 776 + }, + { + "epoch": 0.13, + "grad_norm": 1.3879651781044995, + "learning_rate": 1.9529575118569185e-05, + "loss": 0.9536, + "step": 777 + }, + { + "epoch": 0.13, + "grad_norm": 1.429765313211883, + "learning_rate": 1.952799178200589e-05, + "loss": 0.9512, + "step": 778 + }, + { + "epoch": 0.13, + "grad_norm": 1.4069032618666792, + "learning_rate": 1.9526405849755003e-05, + "loss": 0.9536, + "step": 779 + }, + { + "epoch": 0.13, + "grad_norm": 1.387120905172671, + "learning_rate": 1.9524817322248558e-05, + "loss": 0.9271, + "step": 780 + }, + { + "epoch": 0.13, + "grad_norm": 1.5871068584995849, + "learning_rate": 1.9523226199919326e-05, + "loss": 1.0133, + "step": 781 + }, + { + "epoch": 0.13, + "grad_norm": 1.582381055564859, + "learning_rate": 1.9521632483200767e-05, + "loss": 0.9479, + "step": 782 + }, + { + "epoch": 0.13, + "grad_norm": 1.4830617390703464, + "learning_rate": 1.952003617252705e-05, + "loss": 0.9778, + "step": 783 + }, + { + "epoch": 0.13, + "grad_norm": 1.5597433652275832, + "learning_rate": 1.9518437268333062e-05, + "loss": 0.9918, + "step": 784 + }, + { + "epoch": 0.13, + "grad_norm": 1.4565578636407874, + "learning_rate": 1.951683577105438e-05, + "loss": 0.9601, + "step": 785 + }, + { + "epoch": 0.13, + "grad_norm": 1.512021029916176, + "learning_rate": 1.9515231681127308e-05, + "loss": 1.0207, + "step": 786 + }, + { + "epoch": 0.13, + "grad_norm": 1.9285301152705443, + "learning_rate": 1.9513624998988833e-05, + "loss": 0.9738, + "step": 787 + }, + { + "epoch": 0.13, + "grad_norm": 1.4349048573688477, + "learning_rate": 1.9512015725076666e-05, + "loss": 0.9433, + "step": 788 + }, + { + "epoch": 0.13, + "grad_norm": 1.3277804664104127, + "learning_rate": 1.9510403859829213e-05, + "loss": 1.018, + "step": 789 + }, + { + "epoch": 0.13, + "grad_norm": 1.4533672887046032, + "learning_rate": 1.9508789403685594e-05, + "loss": 0.9801, + "step": 790 + }, + { + "epoch": 0.13, + "grad_norm": 1.3128964734989959, + "learning_rate": 1.950717235708563e-05, + "loss": 0.8838, + "step": 791 + }, + { + "epoch": 0.13, + "grad_norm": 1.4273050085952863, + "learning_rate": 1.950555272046985e-05, + "loss": 1.0446, + "step": 792 + }, + { + "epoch": 0.13, + "grad_norm": 1.4562282311721848, + "learning_rate": 1.9503930494279487e-05, + "loss": 0.9752, + "step": 793 + }, + { + "epoch": 0.13, + "grad_norm": 1.434527031003438, + "learning_rate": 1.9502305678956476e-05, + "loss": 0.973, + "step": 794 + }, + { + "epoch": 0.13, + "grad_norm": 1.58974759528106, + "learning_rate": 1.9500678274943467e-05, + "loss": 0.9697, + "step": 795 + }, + { + "epoch": 0.13, + "grad_norm": 1.4101459900385027, + "learning_rate": 1.949904828268381e-05, + "loss": 0.9928, + "step": 796 + }, + { + "epoch": 0.13, + "grad_norm": 1.4357921883796063, + "learning_rate": 1.949741570262156e-05, + "loss": 0.9663, + "step": 797 + }, + { + "epoch": 0.13, + "grad_norm": 1.4032743727850312, + "learning_rate": 1.949578053520147e-05, + "loss": 1.0036, + "step": 798 + }, + { + "epoch": 0.13, + "grad_norm": 1.6426460589990182, + "learning_rate": 1.9494142780869013e-05, + "loss": 1.0142, + "step": 799 + }, + { + "epoch": 0.13, + "grad_norm": 1.56254976193343, + "learning_rate": 1.9492502440070353e-05, + "loss": 1.0101, + "step": 800 + }, + { + "epoch": 0.13, + "grad_norm": 1.5994100190275868, + "learning_rate": 1.949085951325237e-05, + "loss": 0.9953, + "step": 801 + }, + { + "epoch": 0.13, + "grad_norm": 1.6477139057709178, + "learning_rate": 1.9489214000862633e-05, + "loss": 0.9475, + "step": 802 + }, + { + "epoch": 0.13, + "grad_norm": 1.8477935190603303, + "learning_rate": 1.948756590334943e-05, + "loss": 1.0045, + "step": 803 + }, + { + "epoch": 0.13, + "grad_norm": 1.4038817494445788, + "learning_rate": 1.9485915221161753e-05, + "loss": 0.9542, + "step": 804 + }, + { + "epoch": 0.13, + "grad_norm": 1.4851164779123587, + "learning_rate": 1.9484261954749286e-05, + "loss": 0.9602, + "step": 805 + }, + { + "epoch": 0.13, + "grad_norm": 1.5985778223552407, + "learning_rate": 1.9482606104562428e-05, + "loss": 0.9253, + "step": 806 + }, + { + "epoch": 0.13, + "grad_norm": 1.270414929582952, + "learning_rate": 1.9480947671052278e-05, + "loss": 0.97, + "step": 807 + }, + { + "epoch": 0.13, + "grad_norm": 1.334368682506892, + "learning_rate": 1.947928665467064e-05, + "loss": 1.0086, + "step": 808 + }, + { + "epoch": 0.13, + "grad_norm": 1.6859505358997195, + "learning_rate": 1.9477623055870016e-05, + "loss": 0.9201, + "step": 809 + }, + { + "epoch": 0.13, + "grad_norm": 1.4899527532372088, + "learning_rate": 1.9475956875103623e-05, + "loss": 0.938, + "step": 810 + }, + { + "epoch": 0.13, + "grad_norm": 1.873172251629141, + "learning_rate": 1.947428811282537e-05, + "loss": 0.9749, + "step": 811 + }, + { + "epoch": 0.13, + "grad_norm": 1.3575691870472932, + "learning_rate": 1.947261676948987e-05, + "loss": 0.9536, + "step": 812 + }, + { + "epoch": 0.13, + "grad_norm": 1.412110665361651, + "learning_rate": 1.947094284555245e-05, + "loss": 0.909, + "step": 813 + }, + { + "epoch": 0.13, + "grad_norm": 1.4137851089996896, + "learning_rate": 1.9469266341469136e-05, + "loss": 0.9555, + "step": 814 + }, + { + "epoch": 0.13, + "grad_norm": 1.6931908495024213, + "learning_rate": 1.9467587257696646e-05, + "loss": 0.9518, + "step": 815 + }, + { + "epoch": 0.13, + "grad_norm": 1.681237618851009, + "learning_rate": 1.946590559469241e-05, + "loss": 0.9068, + "step": 816 + }, + { + "epoch": 0.13, + "grad_norm": 1.5138215101790222, + "learning_rate": 1.946422135291456e-05, + "loss": 0.9392, + "step": 817 + }, + { + "epoch": 0.13, + "grad_norm": 1.4173301987688298, + "learning_rate": 1.9462534532821934e-05, + "loss": 0.9422, + "step": 818 + }, + { + "epoch": 0.13, + "grad_norm": 1.412644124115625, + "learning_rate": 1.946084513487406e-05, + "loss": 0.8784, + "step": 819 + }, + { + "epoch": 0.13, + "grad_norm": 1.4431817851421995, + "learning_rate": 1.9459153159531188e-05, + "loss": 0.9321, + "step": 820 + }, + { + "epoch": 0.13, + "grad_norm": 1.3078977894146726, + "learning_rate": 1.9457458607254247e-05, + "loss": 0.9175, + "step": 821 + }, + { + "epoch": 0.13, + "grad_norm": 1.388140340608216, + "learning_rate": 1.9455761478504888e-05, + "loss": 0.9785, + "step": 822 + }, + { + "epoch": 0.13, + "grad_norm": 1.4435349513776867, + "learning_rate": 1.9454061773745448e-05, + "loss": 0.9672, + "step": 823 + }, + { + "epoch": 0.13, + "grad_norm": 1.6256350493697957, + "learning_rate": 1.9452359493438984e-05, + "loss": 1.0384, + "step": 824 + }, + { + "epoch": 0.13, + "grad_norm": 1.4608218298984064, + "learning_rate": 1.9450654638049235e-05, + "loss": 0.9847, + "step": 825 + }, + { + "epoch": 0.13, + "grad_norm": 1.5046955669192774, + "learning_rate": 1.9448947208040655e-05, + "loss": 0.9557, + "step": 826 + }, + { + "epoch": 0.13, + "grad_norm": 1.7634757541211445, + "learning_rate": 1.9447237203878395e-05, + "loss": 0.9673, + "step": 827 + }, + { + "epoch": 0.13, + "grad_norm": 1.7766129783464395, + "learning_rate": 1.9445524626028303e-05, + "loss": 0.9198, + "step": 828 + }, + { + "epoch": 0.13, + "grad_norm": 1.5245188305010635, + "learning_rate": 1.9443809474956937e-05, + "loss": 0.9896, + "step": 829 + }, + { + "epoch": 0.13, + "grad_norm": 1.3239636632659209, + "learning_rate": 1.944209175113155e-05, + "loss": 0.9924, + "step": 830 + }, + { + "epoch": 0.13, + "grad_norm": 1.3373024998275491, + "learning_rate": 1.9440371455020094e-05, + "loss": 0.9313, + "step": 831 + }, + { + "epoch": 0.13, + "grad_norm": 1.750510368048217, + "learning_rate": 1.943864858709123e-05, + "loss": 0.9856, + "step": 832 + }, + { + "epoch": 0.13, + "grad_norm": 1.518554602865235, + "learning_rate": 1.9436923147814317e-05, + "loss": 0.9882, + "step": 833 + }, + { + "epoch": 0.13, + "grad_norm": 1.5232333248206844, + "learning_rate": 1.9435195137659404e-05, + "loss": 0.9433, + "step": 834 + }, + { + "epoch": 0.13, + "grad_norm": 1.418079630664839, + "learning_rate": 1.9433464557097257e-05, + "loss": 0.9344, + "step": 835 + }, + { + "epoch": 0.13, + "grad_norm": 1.357032280868288, + "learning_rate": 1.9431731406599325e-05, + "loss": 0.9845, + "step": 836 + }, + { + "epoch": 0.13, + "grad_norm": 1.5619016741535259, + "learning_rate": 1.9429995686637774e-05, + "loss": 1.0308, + "step": 837 + }, + { + "epoch": 0.14, + "grad_norm": 1.932681854596408, + "learning_rate": 1.9428257397685456e-05, + "loss": 0.9207, + "step": 838 + }, + { + "epoch": 0.14, + "grad_norm": 1.4217283977659552, + "learning_rate": 1.942651654021593e-05, + "loss": 0.9916, + "step": 839 + }, + { + "epoch": 0.14, + "grad_norm": 1.4439132663912166, + "learning_rate": 1.942477311470346e-05, + "loss": 0.8862, + "step": 840 + }, + { + "epoch": 0.14, + "grad_norm": 1.4432247115822097, + "learning_rate": 1.9423027121622995e-05, + "loss": 1.0597, + "step": 841 + }, + { + "epoch": 0.14, + "grad_norm": 1.4411351480729526, + "learning_rate": 1.9421278561450195e-05, + "loss": 0.9145, + "step": 842 + }, + { + "epoch": 0.14, + "grad_norm": 1.721403304238875, + "learning_rate": 1.9419527434661418e-05, + "loss": 0.9384, + "step": 843 + }, + { + "epoch": 0.14, + "grad_norm": 1.6063558565178955, + "learning_rate": 1.9417773741733713e-05, + "loss": 0.9649, + "step": 844 + }, + { + "epoch": 0.14, + "grad_norm": 1.3479855763899746, + "learning_rate": 1.9416017483144844e-05, + "loss": 0.9472, + "step": 845 + }, + { + "epoch": 0.14, + "grad_norm": 1.400074300593139, + "learning_rate": 1.941425865937325e-05, + "loss": 0.9639, + "step": 846 + }, + { + "epoch": 0.14, + "grad_norm": 1.5260155309927952, + "learning_rate": 1.9412497270898097e-05, + "loss": 0.9922, + "step": 847 + }, + { + "epoch": 0.14, + "grad_norm": 1.4907098251675208, + "learning_rate": 1.9410733318199233e-05, + "loss": 0.9049, + "step": 848 + }, + { + "epoch": 0.14, + "grad_norm": 1.389156664020996, + "learning_rate": 1.9408966801757204e-05, + "loss": 0.9888, + "step": 849 + }, + { + "epoch": 0.14, + "grad_norm": 1.4590221786963395, + "learning_rate": 1.9407197722053254e-05, + "loss": 0.9458, + "step": 850 + }, + { + "epoch": 0.14, + "grad_norm": 1.483571110934844, + "learning_rate": 1.9405426079569337e-05, + "loss": 0.9709, + "step": 851 + }, + { + "epoch": 0.14, + "grad_norm": 1.4181297193802171, + "learning_rate": 1.940365187478809e-05, + "loss": 0.9759, + "step": 852 + }, + { + "epoch": 0.14, + "grad_norm": 1.1491552319143914, + "learning_rate": 1.9401875108192863e-05, + "loss": 0.3367, + "step": 853 + }, + { + "epoch": 0.14, + "grad_norm": 0.9316460771221197, + "learning_rate": 1.940009578026769e-05, + "loss": 0.3313, + "step": 854 + }, + { + "epoch": 0.14, + "grad_norm": 1.5256254580771318, + "learning_rate": 1.9398313891497314e-05, + "loss": 0.9626, + "step": 855 + }, + { + "epoch": 0.14, + "grad_norm": 1.4357723694764464, + "learning_rate": 1.9396529442367167e-05, + "loss": 0.9401, + "step": 856 + }, + { + "epoch": 0.14, + "grad_norm": 0.9344649260443684, + "learning_rate": 1.939474243336338e-05, + "loss": 0.3439, + "step": 857 + }, + { + "epoch": 0.14, + "grad_norm": 1.5587911020783471, + "learning_rate": 1.939295286497279e-05, + "loss": 0.9467, + "step": 858 + }, + { + "epoch": 0.14, + "grad_norm": 1.6964086798975246, + "learning_rate": 1.9391160737682924e-05, + "loss": 0.9435, + "step": 859 + }, + { + "epoch": 0.14, + "grad_norm": 1.7229682527366523, + "learning_rate": 1.9389366051982e-05, + "loss": 0.942, + "step": 860 + }, + { + "epoch": 0.14, + "grad_norm": 1.38940016981017, + "learning_rate": 1.9387568808358944e-05, + "loss": 1.0144, + "step": 861 + }, + { + "epoch": 0.14, + "grad_norm": 1.4658141574438126, + "learning_rate": 1.9385769007303374e-05, + "loss": 0.9858, + "step": 862 + }, + { + "epoch": 0.14, + "grad_norm": 1.8948778120606111, + "learning_rate": 1.9383966649305608e-05, + "loss": 0.8593, + "step": 863 + }, + { + "epoch": 0.14, + "grad_norm": 1.3086717877676333, + "learning_rate": 1.9382161734856656e-05, + "loss": 0.9415, + "step": 864 + }, + { + "epoch": 0.14, + "grad_norm": 1.478112034030081, + "learning_rate": 1.9380354264448225e-05, + "loss": 1.0231, + "step": 865 + }, + { + "epoch": 0.14, + "grad_norm": 1.759533252891076, + "learning_rate": 1.9378544238572723e-05, + "loss": 0.9382, + "step": 866 + }, + { + "epoch": 0.14, + "grad_norm": 1.52135385282774, + "learning_rate": 1.937673165772325e-05, + "loss": 1.0443, + "step": 867 + }, + { + "epoch": 0.14, + "grad_norm": 1.8633761281823003, + "learning_rate": 1.93749165223936e-05, + "loss": 0.9817, + "step": 868 + }, + { + "epoch": 0.14, + "grad_norm": 1.4416107043069595, + "learning_rate": 1.937309883307827e-05, + "loss": 0.9791, + "step": 869 + }, + { + "epoch": 0.14, + "grad_norm": 1.4080848353169404, + "learning_rate": 1.9371278590272448e-05, + "loss": 0.3452, + "step": 870 + }, + { + "epoch": 0.14, + "grad_norm": 1.6608887481352994, + "learning_rate": 1.9369455794472017e-05, + "loss": 0.9245, + "step": 871 + }, + { + "epoch": 0.14, + "grad_norm": 1.5834200708782555, + "learning_rate": 1.9367630446173554e-05, + "loss": 1.0297, + "step": 872 + }, + { + "epoch": 0.14, + "grad_norm": 0.78904610088039, + "learning_rate": 1.936580254587434e-05, + "loss": 0.3378, + "step": 873 + }, + { + "epoch": 0.14, + "grad_norm": 1.6835572798600182, + "learning_rate": 1.936397209407234e-05, + "loss": 0.9627, + "step": 874 + }, + { + "epoch": 0.14, + "grad_norm": 0.8137946136767582, + "learning_rate": 1.9362139091266227e-05, + "loss": 0.3578, + "step": 875 + }, + { + "epoch": 0.14, + "grad_norm": 1.3846981606506763, + "learning_rate": 1.936030353795535e-05, + "loss": 0.9149, + "step": 876 + }, + { + "epoch": 0.14, + "grad_norm": 1.5046467292640469, + "learning_rate": 1.9358465434639773e-05, + "loss": 0.9604, + "step": 877 + }, + { + "epoch": 0.14, + "grad_norm": 1.444163578471167, + "learning_rate": 1.9356624781820246e-05, + "loss": 0.9823, + "step": 878 + }, + { + "epoch": 0.14, + "grad_norm": 1.438098159762969, + "learning_rate": 1.9354781579998212e-05, + "loss": 1.0007, + "step": 879 + }, + { + "epoch": 0.14, + "grad_norm": 1.4544287499856805, + "learning_rate": 1.9352935829675807e-05, + "loss": 0.8822, + "step": 880 + }, + { + "epoch": 0.14, + "grad_norm": 1.0810065969506604, + "learning_rate": 1.935108753135587e-05, + "loss": 0.337, + "step": 881 + }, + { + "epoch": 0.14, + "grad_norm": 1.5879198519748934, + "learning_rate": 1.9349236685541924e-05, + "loss": 0.9747, + "step": 882 + }, + { + "epoch": 0.14, + "grad_norm": 1.4078565374720287, + "learning_rate": 1.934738329273819e-05, + "loss": 0.921, + "step": 883 + }, + { + "epoch": 0.14, + "grad_norm": 1.7054212936014457, + "learning_rate": 1.9345527353449583e-05, + "loss": 1.0073, + "step": 884 + }, + { + "epoch": 0.14, + "grad_norm": 1.4276576001963661, + "learning_rate": 1.9343668868181722e-05, + "loss": 0.9999, + "step": 885 + }, + { + "epoch": 0.14, + "grad_norm": 1.4928971857318731, + "learning_rate": 1.9341807837440893e-05, + "loss": 0.9131, + "step": 886 + }, + { + "epoch": 0.14, + "grad_norm": 1.6622502965391146, + "learning_rate": 1.9339944261734107e-05, + "loss": 0.9812, + "step": 887 + }, + { + "epoch": 0.14, + "grad_norm": 1.4709324898511884, + "learning_rate": 1.933807814156904e-05, + "loss": 0.8792, + "step": 888 + }, + { + "epoch": 0.14, + "grad_norm": 1.3444750009625994, + "learning_rate": 1.933620947745409e-05, + "loss": 1.0058, + "step": 889 + }, + { + "epoch": 0.14, + "grad_norm": 1.4711977429638057, + "learning_rate": 1.9334338269898317e-05, + "loss": 0.9403, + "step": 890 + }, + { + "epoch": 0.14, + "grad_norm": 1.6298787898064873, + "learning_rate": 1.9332464519411503e-05, + "loss": 0.9068, + "step": 891 + }, + { + "epoch": 0.14, + "grad_norm": 1.3491151887569088, + "learning_rate": 1.93305882265041e-05, + "loss": 1.0236, + "step": 892 + }, + { + "epoch": 0.14, + "grad_norm": 1.476311642161689, + "learning_rate": 1.9328709391687264e-05, + "loss": 0.8896, + "step": 893 + }, + { + "epoch": 0.14, + "grad_norm": 1.5163124967156196, + "learning_rate": 1.932682801547284e-05, + "loss": 0.8527, + "step": 894 + }, + { + "epoch": 0.14, + "grad_norm": 1.4905473797859936, + "learning_rate": 1.9324944098373375e-05, + "loss": 0.9881, + "step": 895 + }, + { + "epoch": 0.14, + "grad_norm": 1.3714950006997435, + "learning_rate": 1.932305764090209e-05, + "loss": 0.9674, + "step": 896 + }, + { + "epoch": 0.14, + "grad_norm": 1.6337608761359723, + "learning_rate": 1.9321168643572912e-05, + "loss": 0.9402, + "step": 897 + }, + { + "epoch": 0.14, + "grad_norm": 1.7127512545633239, + "learning_rate": 1.931927710690046e-05, + "loss": 0.9674, + "step": 898 + }, + { + "epoch": 0.14, + "grad_norm": 1.3594732975223536, + "learning_rate": 1.9317383031400032e-05, + "loss": 0.9658, + "step": 899 + }, + { + "epoch": 0.15, + "grad_norm": 1.7858834527983056, + "learning_rate": 1.9315486417587636e-05, + "loss": 0.9369, + "step": 900 + }, + { + "epoch": 0.15, + "grad_norm": 1.4115773850347189, + "learning_rate": 1.9313587265979952e-05, + "loss": 0.988, + "step": 901 + }, + { + "epoch": 0.15, + "grad_norm": 1.3976324350728033, + "learning_rate": 1.9311685577094368e-05, + "loss": 0.8704, + "step": 902 + }, + { + "epoch": 0.15, + "grad_norm": 1.4979896030892448, + "learning_rate": 1.9309781351448956e-05, + "loss": 0.9496, + "step": 903 + }, + { + "epoch": 0.15, + "grad_norm": 1.6238756141206292, + "learning_rate": 1.9307874589562474e-05, + "loss": 0.9588, + "step": 904 + }, + { + "epoch": 0.15, + "grad_norm": 1.6185265965473552, + "learning_rate": 1.9305965291954387e-05, + "loss": 1.0218, + "step": 905 + }, + { + "epoch": 0.15, + "grad_norm": 1.474152080416173, + "learning_rate": 1.930405345914483e-05, + "loss": 0.9648, + "step": 906 + }, + { + "epoch": 0.15, + "grad_norm": 1.4232492041804214, + "learning_rate": 1.9302139091654643e-05, + "loss": 0.9369, + "step": 907 + }, + { + "epoch": 0.15, + "grad_norm": 1.6567992984183608, + "learning_rate": 1.9300222190005352e-05, + "loss": 0.9155, + "step": 908 + }, + { + "epoch": 0.15, + "grad_norm": 1.3238791639395158, + "learning_rate": 1.9298302754719177e-05, + "loss": 0.9655, + "step": 909 + }, + { + "epoch": 0.15, + "grad_norm": 1.4660089223415227, + "learning_rate": 1.929638078631902e-05, + "loss": 0.9456, + "step": 910 + }, + { + "epoch": 0.15, + "grad_norm": 1.4514647036921116, + "learning_rate": 1.9294456285328482e-05, + "loss": 0.9856, + "step": 911 + }, + { + "epoch": 0.15, + "grad_norm": 1.6777324221314334, + "learning_rate": 1.9292529252271848e-05, + "loss": 1.0106, + "step": 912 + }, + { + "epoch": 0.15, + "grad_norm": 1.658859535255508, + "learning_rate": 1.9290599687674095e-05, + "loss": 0.9542, + "step": 913 + }, + { + "epoch": 0.15, + "grad_norm": 1.498134261097887, + "learning_rate": 1.928866759206089e-05, + "loss": 0.9072, + "step": 914 + }, + { + "epoch": 0.15, + "grad_norm": 1.2699501677904774, + "learning_rate": 1.9286732965958593e-05, + "loss": 0.9288, + "step": 915 + }, + { + "epoch": 0.15, + "grad_norm": 1.5443765235943152, + "learning_rate": 1.928479580989424e-05, + "loss": 0.9179, + "step": 916 + }, + { + "epoch": 0.15, + "grad_norm": 1.2770033624895287, + "learning_rate": 1.9282856124395578e-05, + "loss": 0.9704, + "step": 917 + }, + { + "epoch": 0.15, + "grad_norm": 1.5566634783150148, + "learning_rate": 1.9280913909991018e-05, + "loss": 0.9557, + "step": 918 + }, + { + "epoch": 0.15, + "grad_norm": 1.6633964988921692, + "learning_rate": 1.9278969167209682e-05, + "loss": 1.0389, + "step": 919 + }, + { + "epoch": 0.15, + "grad_norm": 1.5818093309050052, + "learning_rate": 1.927702189658137e-05, + "loss": 0.9543, + "step": 920 + }, + { + "epoch": 0.15, + "grad_norm": 1.2904364719504096, + "learning_rate": 1.9275072098636568e-05, + "loss": 0.949, + "step": 921 + }, + { + "epoch": 0.15, + "grad_norm": 1.2946359317252538, + "learning_rate": 1.927311977390646e-05, + "loss": 1.045, + "step": 922 + }, + { + "epoch": 0.15, + "grad_norm": 1.4565977904041179, + "learning_rate": 1.927116492292291e-05, + "loss": 0.9678, + "step": 923 + }, + { + "epoch": 0.15, + "grad_norm": 1.4590149663044147, + "learning_rate": 1.926920754621847e-05, + "loss": 0.9891, + "step": 924 + }, + { + "epoch": 0.15, + "grad_norm": 1.3077281024549297, + "learning_rate": 1.9267247644326392e-05, + "loss": 1.0065, + "step": 925 + }, + { + "epoch": 0.15, + "grad_norm": 1.2745035337413722, + "learning_rate": 1.92652852177806e-05, + "loss": 0.941, + "step": 926 + }, + { + "epoch": 0.15, + "grad_norm": 1.2482031700459941, + "learning_rate": 1.9263320267115716e-05, + "loss": 0.907, + "step": 927 + }, + { + "epoch": 0.15, + "grad_norm": 1.4784136639993546, + "learning_rate": 1.9261352792867047e-05, + "loss": 0.9325, + "step": 928 + }, + { + "epoch": 0.15, + "grad_norm": 1.4441549085819223, + "learning_rate": 1.9259382795570588e-05, + "loss": 0.9781, + "step": 929 + }, + { + "epoch": 0.15, + "grad_norm": 1.5463356248119329, + "learning_rate": 1.9257410275763018e-05, + "loss": 0.9718, + "step": 930 + }, + { + "epoch": 0.15, + "grad_norm": 1.709275710205316, + "learning_rate": 1.9255435233981706e-05, + "loss": 0.9764, + "step": 931 + }, + { + "epoch": 0.15, + "grad_norm": 1.4669741854822027, + "learning_rate": 1.9253457670764707e-05, + "loss": 1.0407, + "step": 932 + }, + { + "epoch": 0.15, + "grad_norm": 1.3359093635228503, + "learning_rate": 1.9251477586650768e-05, + "loss": 0.9561, + "step": 933 + }, + { + "epoch": 0.15, + "grad_norm": 1.542345499362906, + "learning_rate": 1.9249494982179313e-05, + "loss": 1.0087, + "step": 934 + }, + { + "epoch": 0.15, + "grad_norm": 1.8106102093891439, + "learning_rate": 1.9247509857890466e-05, + "loss": 0.9952, + "step": 935 + }, + { + "epoch": 0.15, + "grad_norm": 1.3860013372009408, + "learning_rate": 1.9245522214325022e-05, + "loss": 0.8171, + "step": 936 + }, + { + "epoch": 0.15, + "grad_norm": 1.9998644483906243, + "learning_rate": 1.9243532052024473e-05, + "loss": 0.9111, + "step": 937 + }, + { + "epoch": 0.15, + "grad_norm": 1.6482025652518015, + "learning_rate": 1.9241539371530994e-05, + "loss": 0.9801, + "step": 938 + }, + { + "epoch": 0.15, + "grad_norm": 1.4191058232833251, + "learning_rate": 1.9239544173387442e-05, + "loss": 0.9511, + "step": 939 + }, + { + "epoch": 0.15, + "grad_norm": 1.538327250245726, + "learning_rate": 1.923754645813737e-05, + "loss": 0.9404, + "step": 940 + }, + { + "epoch": 0.15, + "grad_norm": 1.829724023122164, + "learning_rate": 1.923554622632501e-05, + "loss": 0.9411, + "step": 941 + }, + { + "epoch": 0.15, + "grad_norm": 0.9249830622789615, + "learning_rate": 1.9233543478495277e-05, + "loss": 0.3461, + "step": 942 + }, + { + "epoch": 0.15, + "grad_norm": 1.4056394297922208, + "learning_rate": 1.9231538215193776e-05, + "loss": 0.9774, + "step": 943 + }, + { + "epoch": 0.15, + "grad_norm": 1.4900537377616232, + "learning_rate": 1.9229530436966796e-05, + "loss": 0.967, + "step": 944 + }, + { + "epoch": 0.15, + "grad_norm": 1.5022591166330743, + "learning_rate": 1.9227520144361312e-05, + "loss": 1.0246, + "step": 945 + }, + { + "epoch": 0.15, + "grad_norm": 1.3101190616331866, + "learning_rate": 1.9225507337924984e-05, + "loss": 0.9364, + "step": 946 + }, + { + "epoch": 0.15, + "grad_norm": 1.4243306740225017, + "learning_rate": 1.9223492018206155e-05, + "loss": 0.9121, + "step": 947 + }, + { + "epoch": 0.15, + "grad_norm": 1.3578198638492456, + "learning_rate": 1.922147418575385e-05, + "loss": 1.0123, + "step": 948 + }, + { + "epoch": 0.15, + "grad_norm": 1.3629551937544702, + "learning_rate": 1.9219453841117787e-05, + "loss": 0.8879, + "step": 949 + }, + { + "epoch": 0.15, + "grad_norm": 1.5465134730423564, + "learning_rate": 1.921743098484836e-05, + "loss": 0.8955, + "step": 950 + }, + { + "epoch": 0.15, + "grad_norm": 1.6151448459099016, + "learning_rate": 1.9215405617496657e-05, + "loss": 0.9913, + "step": 951 + }, + { + "epoch": 0.15, + "grad_norm": 1.35675638681422, + "learning_rate": 1.921337773961444e-05, + "loss": 0.8799, + "step": 952 + }, + { + "epoch": 0.15, + "grad_norm": 1.546682985265901, + "learning_rate": 1.9211347351754154e-05, + "loss": 0.9863, + "step": 953 + }, + { + "epoch": 0.15, + "grad_norm": 1.5544977521599055, + "learning_rate": 1.9209314454468935e-05, + "loss": 0.9633, + "step": 954 + }, + { + "epoch": 0.15, + "grad_norm": 1.7320375822654397, + "learning_rate": 1.9207279048312612e-05, + "loss": 0.9388, + "step": 955 + }, + { + "epoch": 0.15, + "grad_norm": 1.5608317684472344, + "learning_rate": 1.920524113383967e-05, + "loss": 0.9125, + "step": 956 + }, + { + "epoch": 0.15, + "grad_norm": 1.3108985978936933, + "learning_rate": 1.9203200711605297e-05, + "loss": 0.9269, + "step": 957 + }, + { + "epoch": 0.15, + "grad_norm": 1.3625666554468419, + "learning_rate": 1.9201157782165366e-05, + "loss": 0.9247, + "step": 958 + }, + { + "epoch": 0.15, + "grad_norm": 1.7816393639999777, + "learning_rate": 1.9199112346076422e-05, + "loss": 0.9125, + "step": 959 + }, + { + "epoch": 0.15, + "grad_norm": 1.6366891972097488, + "learning_rate": 1.91970644038957e-05, + "loss": 0.894, + "step": 960 + }, + { + "epoch": 0.15, + "grad_norm": 1.3552353751487338, + "learning_rate": 1.9195013956181115e-05, + "loss": 0.8064, + "step": 961 + }, + { + "epoch": 0.15, + "grad_norm": 1.3206030061198555, + "learning_rate": 1.919296100349127e-05, + "loss": 0.9903, + "step": 962 + }, + { + "epoch": 0.16, + "grad_norm": 1.8241746844048248, + "learning_rate": 1.919090554638544e-05, + "loss": 0.9028, + "step": 963 + }, + { + "epoch": 0.16, + "grad_norm": 1.5474018153517208, + "learning_rate": 1.9188847585423593e-05, + "loss": 1.0327, + "step": 964 + }, + { + "epoch": 0.16, + "grad_norm": 1.6163799059266604, + "learning_rate": 1.9186787121166367e-05, + "loss": 0.9783, + "step": 965 + }, + { + "epoch": 0.16, + "grad_norm": 1.3994849666886506, + "learning_rate": 1.9184724154175096e-05, + "loss": 0.9938, + "step": 966 + }, + { + "epoch": 0.16, + "grad_norm": 1.5046244695965882, + "learning_rate": 1.9182658685011787e-05, + "loss": 0.9087, + "step": 967 + }, + { + "epoch": 0.16, + "grad_norm": 1.5478609211154413, + "learning_rate": 1.918059071423913e-05, + "loss": 0.8732, + "step": 968 + }, + { + "epoch": 0.16, + "grad_norm": 1.4488964814973997, + "learning_rate": 1.9178520242420498e-05, + "loss": 1.0123, + "step": 969 + }, + { + "epoch": 0.16, + "grad_norm": 1.6020275813047413, + "learning_rate": 1.9176447270119945e-05, + "loss": 0.9082, + "step": 970 + }, + { + "epoch": 0.16, + "grad_norm": 1.7740844920167578, + "learning_rate": 1.917437179790221e-05, + "loss": 0.9443, + "step": 971 + }, + { + "epoch": 0.16, + "grad_norm": 1.4097687692149998, + "learning_rate": 1.91722938263327e-05, + "loss": 0.9326, + "step": 972 + }, + { + "epoch": 0.16, + "grad_norm": 1.3934202842818284, + "learning_rate": 1.9170213355977513e-05, + "loss": 0.8556, + "step": 973 + }, + { + "epoch": 0.16, + "grad_norm": 1.512458460730969, + "learning_rate": 1.9168130387403433e-05, + "loss": 0.8828, + "step": 974 + }, + { + "epoch": 0.16, + "grad_norm": 1.4849542071255044, + "learning_rate": 1.916604492117791e-05, + "loss": 1.0031, + "step": 975 + }, + { + "epoch": 0.16, + "grad_norm": 1.4000708016625028, + "learning_rate": 1.9163956957869093e-05, + "loss": 0.881, + "step": 976 + }, + { + "epoch": 0.16, + "grad_norm": 1.544050152794303, + "learning_rate": 1.9161866498045792e-05, + "loss": 0.9371, + "step": 977 + }, + { + "epoch": 0.16, + "grad_norm": 1.4302403444230285, + "learning_rate": 1.9159773542277508e-05, + "loss": 0.9499, + "step": 978 + }, + { + "epoch": 0.16, + "grad_norm": 1.5398397709811054, + "learning_rate": 1.9157678091134424e-05, + "loss": 0.9661, + "step": 979 + }, + { + "epoch": 0.16, + "grad_norm": 1.3243422520978139, + "learning_rate": 1.9155580145187392e-05, + "loss": 0.87, + "step": 980 + }, + { + "epoch": 0.16, + "grad_norm": 1.5311628737769714, + "learning_rate": 1.9153479705007953e-05, + "loss": 0.952, + "step": 981 + }, + { + "epoch": 0.16, + "grad_norm": 1.714219641784785, + "learning_rate": 1.9151376771168325e-05, + "loss": 0.9719, + "step": 982 + }, + { + "epoch": 0.16, + "grad_norm": 1.4822879185420312, + "learning_rate": 1.914927134424141e-05, + "loss": 0.8965, + "step": 983 + }, + { + "epoch": 0.16, + "grad_norm": 1.4169797535304103, + "learning_rate": 1.9147163424800773e-05, + "loss": 0.9007, + "step": 984 + }, + { + "epoch": 0.16, + "grad_norm": 1.5664067851039514, + "learning_rate": 1.914505301342068e-05, + "loss": 0.9321, + "step": 985 + }, + { + "epoch": 0.16, + "grad_norm": 1.7807317343703846, + "learning_rate": 1.914294011067606e-05, + "loss": 0.9584, + "step": 986 + }, + { + "epoch": 0.16, + "grad_norm": 1.4825237010517485, + "learning_rate": 1.914082471714253e-05, + "loss": 0.9751, + "step": 987 + }, + { + "epoch": 0.16, + "grad_norm": 1.5146127631427522, + "learning_rate": 1.9138706833396374e-05, + "loss": 1.0071, + "step": 988 + }, + { + "epoch": 0.16, + "grad_norm": 1.5015119805729173, + "learning_rate": 1.913658646001457e-05, + "loss": 0.9479, + "step": 989 + }, + { + "epoch": 0.16, + "grad_norm": 1.262381761804665, + "learning_rate": 1.913446359757476e-05, + "loss": 0.8898, + "step": 990 + }, + { + "epoch": 0.16, + "grad_norm": 1.3647375669537145, + "learning_rate": 1.9132338246655276e-05, + "loss": 0.9613, + "step": 991 + }, + { + "epoch": 0.16, + "grad_norm": 1.5864504468508505, + "learning_rate": 1.9130210407835118e-05, + "loss": 0.9821, + "step": 992 + }, + { + "epoch": 0.16, + "grad_norm": 1.4646656265043707, + "learning_rate": 1.912808008169397e-05, + "loss": 0.9183, + "step": 993 + }, + { + "epoch": 0.16, + "grad_norm": 1.3796063947437502, + "learning_rate": 1.9125947268812192e-05, + "loss": 0.8932, + "step": 994 + }, + { + "epoch": 0.16, + "grad_norm": 1.1961649585110579, + "learning_rate": 1.9123811969770815e-05, + "loss": 0.8055, + "step": 995 + }, + { + "epoch": 0.16, + "grad_norm": 1.5876943102089769, + "learning_rate": 1.9121674185151562e-05, + "loss": 0.9134, + "step": 996 + }, + { + "epoch": 0.16, + "grad_norm": 0.8498307290511353, + "learning_rate": 1.911953391553682e-05, + "loss": 0.3122, + "step": 997 + }, + { + "epoch": 0.16, + "grad_norm": 1.6238913915356858, + "learning_rate": 1.9117391161509657e-05, + "loss": 1.0385, + "step": 998 + }, + { + "epoch": 0.16, + "grad_norm": 1.4142241464608212, + "learning_rate": 1.911524592365382e-05, + "loss": 0.9436, + "step": 999 + }, + { + "epoch": 0.16, + "grad_norm": 1.434708617822933, + "learning_rate": 1.911309820255373e-05, + "loss": 0.931, + "step": 1000 + }, + { + "epoch": 0.16, + "grad_norm": 1.7810939842839393, + "learning_rate": 1.9110947998794484e-05, + "loss": 0.9253, + "step": 1001 + }, + { + "epoch": 0.16, + "grad_norm": 1.385218040639362, + "learning_rate": 1.910879531296186e-05, + "loss": 1.0058, + "step": 1002 + }, + { + "epoch": 0.16, + "grad_norm": 1.508366568832681, + "learning_rate": 1.9106640145642308e-05, + "loss": 1.0139, + "step": 1003 + }, + { + "epoch": 0.16, + "grad_norm": 1.6345031087695705, + "learning_rate": 1.9104482497422955e-05, + "loss": 0.9531, + "step": 1004 + }, + { + "epoch": 0.16, + "grad_norm": 1.599031381201754, + "learning_rate": 1.9102322368891602e-05, + "loss": 0.9309, + "step": 1005 + }, + { + "epoch": 0.16, + "grad_norm": 1.4184120177646085, + "learning_rate": 1.9100159760636727e-05, + "loss": 0.9132, + "step": 1006 + }, + { + "epoch": 0.16, + "grad_norm": 0.9947098543152184, + "learning_rate": 1.9097994673247488e-05, + "loss": 0.3257, + "step": 1007 + }, + { + "epoch": 0.16, + "grad_norm": 1.4528170629157062, + "learning_rate": 1.9095827107313713e-05, + "loss": 0.9402, + "step": 1008 + }, + { + "epoch": 0.16, + "grad_norm": 1.7316713373862707, + "learning_rate": 1.90936570634259e-05, + "loss": 0.9711, + "step": 1009 + }, + { + "epoch": 0.16, + "grad_norm": 1.6357637546313155, + "learning_rate": 1.9091484542175244e-05, + "loss": 1.0003, + "step": 1010 + }, + { + "epoch": 0.16, + "grad_norm": 1.4904615877707033, + "learning_rate": 1.908930954415358e-05, + "loss": 0.9172, + "step": 1011 + }, + { + "epoch": 0.16, + "grad_norm": 1.4452231598640641, + "learning_rate": 1.9087132069953456e-05, + "loss": 0.8875, + "step": 1012 + }, + { + "epoch": 0.16, + "grad_norm": 1.5082600984183923, + "learning_rate": 1.908495212016807e-05, + "loss": 0.9354, + "step": 1013 + }, + { + "epoch": 0.16, + "grad_norm": 1.707869940471913, + "learning_rate": 1.908276969539129e-05, + "loss": 0.968, + "step": 1014 + }, + { + "epoch": 0.16, + "grad_norm": 1.4684966343108745, + "learning_rate": 1.9080584796217683e-05, + "loss": 0.9227, + "step": 1015 + }, + { + "epoch": 0.16, + "grad_norm": 1.7016849778717056, + "learning_rate": 1.9078397423242467e-05, + "loss": 0.9422, + "step": 1016 + }, + { + "epoch": 0.16, + "grad_norm": 1.3604224817065067, + "learning_rate": 1.9076207577061546e-05, + "loss": 0.9189, + "step": 1017 + }, + { + "epoch": 0.16, + "grad_norm": 1.469361097578541, + "learning_rate": 1.907401525827149e-05, + "loss": 0.8269, + "step": 1018 + }, + { + "epoch": 0.16, + "grad_norm": 1.6235494558522054, + "learning_rate": 1.907182046746956e-05, + "loss": 0.9177, + "step": 1019 + }, + { + "epoch": 0.16, + "grad_norm": 1.7633523894576364, + "learning_rate": 1.906962320525366e-05, + "loss": 0.909, + "step": 1020 + }, + { + "epoch": 0.16, + "grad_norm": 1.5536554711473778, + "learning_rate": 1.90674234722224e-05, + "loss": 1.0051, + "step": 1021 + }, + { + "epoch": 0.16, + "grad_norm": 1.8125168598147263, + "learning_rate": 1.9065221268975037e-05, + "loss": 1.0399, + "step": 1022 + }, + { + "epoch": 0.16, + "grad_norm": 1.4833565304871992, + "learning_rate": 1.9063016596111516e-05, + "loss": 1.0658, + "step": 1023 + }, + { + "epoch": 0.16, + "grad_norm": 1.8551963631314097, + "learning_rate": 1.906080945423245e-05, + "loss": 1.0431, + "step": 1024 + }, + { + "epoch": 0.17, + "grad_norm": 0.9092081426066301, + "learning_rate": 1.9058599843939127e-05, + "loss": 0.3583, + "step": 1025 + }, + { + "epoch": 0.17, + "grad_norm": 1.5057059882795931, + "learning_rate": 1.9056387765833506e-05, + "loss": 0.9491, + "step": 1026 + }, + { + "epoch": 0.17, + "grad_norm": 1.4170223373528612, + "learning_rate": 1.905417322051822e-05, + "loss": 0.9248, + "step": 1027 + }, + { + "epoch": 0.17, + "grad_norm": 1.8438436951106543, + "learning_rate": 1.9051956208596564e-05, + "loss": 0.996, + "step": 1028 + }, + { + "epoch": 0.17, + "grad_norm": 1.2880478738362242, + "learning_rate": 1.9049736730672518e-05, + "loss": 0.9916, + "step": 1029 + }, + { + "epoch": 0.17, + "grad_norm": 1.4935558358995293, + "learning_rate": 1.904751478735073e-05, + "loss": 0.9485, + "step": 1030 + }, + { + "epoch": 0.17, + "grad_norm": 1.432688723639302, + "learning_rate": 1.9045290379236517e-05, + "loss": 0.8824, + "step": 1031 + }, + { + "epoch": 0.17, + "grad_norm": 1.536095838835383, + "learning_rate": 1.904306350693587e-05, + "loss": 0.9005, + "step": 1032 + }, + { + "epoch": 0.17, + "grad_norm": 1.4662415218860207, + "learning_rate": 1.904083417105545e-05, + "loss": 0.9713, + "step": 1033 + }, + { + "epoch": 0.17, + "grad_norm": 1.6070501435355622, + "learning_rate": 1.9038602372202594e-05, + "loss": 0.9201, + "step": 1034 + }, + { + "epoch": 0.17, + "grad_norm": 1.7527244777157707, + "learning_rate": 1.9036368110985296e-05, + "loss": 1.0307, + "step": 1035 + }, + { + "epoch": 0.17, + "grad_norm": 1.4700227711415736, + "learning_rate": 1.9034131388012237e-05, + "loss": 0.9484, + "step": 1036 + }, + { + "epoch": 0.17, + "grad_norm": 1.6424095235107115, + "learning_rate": 1.903189220389276e-05, + "loss": 0.9261, + "step": 1037 + }, + { + "epoch": 0.17, + "grad_norm": 0.7995022579233025, + "learning_rate": 1.9029650559236886e-05, + "loss": 0.3572, + "step": 1038 + }, + { + "epoch": 0.17, + "grad_norm": 1.5408550772297291, + "learning_rate": 1.9027406454655292e-05, + "loss": 0.9673, + "step": 1039 + }, + { + "epoch": 0.17, + "grad_norm": 1.5028551652067113, + "learning_rate": 1.9025159890759336e-05, + "loss": 0.9257, + "step": 1040 + }, + { + "epoch": 0.17, + "grad_norm": 1.84265488393321, + "learning_rate": 1.902291086816105e-05, + "loss": 0.9747, + "step": 1041 + }, + { + "epoch": 0.17, + "grad_norm": 1.697734685091799, + "learning_rate": 1.902065938747312e-05, + "loss": 0.9679, + "step": 1042 + }, + { + "epoch": 0.17, + "grad_norm": 1.3202983150764154, + "learning_rate": 1.901840544930892e-05, + "loss": 0.9037, + "step": 1043 + }, + { + "epoch": 0.17, + "grad_norm": 1.331114765876868, + "learning_rate": 1.9016149054282486e-05, + "loss": 0.9499, + "step": 1044 + }, + { + "epoch": 0.17, + "grad_norm": 1.3967544667940832, + "learning_rate": 1.9013890203008512e-05, + "loss": 0.943, + "step": 1045 + }, + { + "epoch": 0.17, + "grad_norm": 1.2908277576893867, + "learning_rate": 1.901162889610238e-05, + "loss": 0.9728, + "step": 1046 + }, + { + "epoch": 0.17, + "grad_norm": 0.8109643458551492, + "learning_rate": 1.9009365134180128e-05, + "loss": 0.327, + "step": 1047 + }, + { + "epoch": 0.17, + "grad_norm": 1.36900598908847, + "learning_rate": 1.9007098917858472e-05, + "loss": 0.9419, + "step": 1048 + }, + { + "epoch": 0.17, + "grad_norm": 1.5669325737452564, + "learning_rate": 1.9004830247754786e-05, + "loss": 0.9227, + "step": 1049 + }, + { + "epoch": 0.17, + "grad_norm": 1.7750426330114113, + "learning_rate": 1.9002559124487122e-05, + "loss": 1.0072, + "step": 1050 + }, + { + "epoch": 0.17, + "grad_norm": 1.1821655356787864, + "learning_rate": 1.9000285548674195e-05, + "loss": 0.9281, + "step": 1051 + }, + { + "epoch": 0.17, + "grad_norm": 1.6354043224475265, + "learning_rate": 1.8998009520935388e-05, + "loss": 0.8816, + "step": 1052 + }, + { + "epoch": 0.17, + "grad_norm": 1.5272528128613672, + "learning_rate": 1.899573104189076e-05, + "loss": 0.9118, + "step": 1053 + }, + { + "epoch": 0.17, + "grad_norm": 1.3850719391551656, + "learning_rate": 1.8993450112161023e-05, + "loss": 0.9142, + "step": 1054 + }, + { + "epoch": 0.17, + "grad_norm": 1.4922868585711018, + "learning_rate": 1.8991166732367575e-05, + "loss": 0.9923, + "step": 1055 + }, + { + "epoch": 0.17, + "grad_norm": 1.8381640173157345, + "learning_rate": 1.8988880903132464e-05, + "loss": 1.0449, + "step": 1056 + }, + { + "epoch": 0.17, + "grad_norm": 1.3052145866193885, + "learning_rate": 1.8986592625078413e-05, + "loss": 0.9661, + "step": 1057 + }, + { + "epoch": 0.17, + "grad_norm": 1.3947873200998628, + "learning_rate": 1.8984301898828815e-05, + "loss": 0.9036, + "step": 1058 + }, + { + "epoch": 0.17, + "grad_norm": 1.4275290221491295, + "learning_rate": 1.8982008725007725e-05, + "loss": 0.9159, + "step": 1059 + }, + { + "epoch": 0.17, + "grad_norm": 1.5488225845391752, + "learning_rate": 1.897971310423987e-05, + "loss": 0.9511, + "step": 1060 + }, + { + "epoch": 0.17, + "grad_norm": 1.5371142669658056, + "learning_rate": 1.8977415037150638e-05, + "loss": 0.9492, + "step": 1061 + }, + { + "epoch": 0.17, + "grad_norm": 1.5189254860002805, + "learning_rate": 1.8975114524366086e-05, + "loss": 0.9012, + "step": 1062 + }, + { + "epoch": 0.17, + "grad_norm": 1.6247801053067341, + "learning_rate": 1.8972811566512934e-05, + "loss": 0.9495, + "step": 1063 + }, + { + "epoch": 0.17, + "grad_norm": 1.3726842816594798, + "learning_rate": 1.8970506164218578e-05, + "loss": 0.9009, + "step": 1064 + }, + { + "epoch": 0.17, + "grad_norm": 1.6496971151751343, + "learning_rate": 1.8968198318111067e-05, + "loss": 0.9547, + "step": 1065 + }, + { + "epoch": 0.17, + "grad_norm": 1.3988349778562228, + "learning_rate": 1.8965888028819125e-05, + "loss": 0.9177, + "step": 1066 + }, + { + "epoch": 0.17, + "grad_norm": 1.3479779509105108, + "learning_rate": 1.8963575296972137e-05, + "loss": 0.9301, + "step": 1067 + }, + { + "epoch": 0.17, + "grad_norm": 1.3679008741276417, + "learning_rate": 1.8961260123200158e-05, + "loss": 0.9277, + "step": 1068 + }, + { + "epoch": 0.17, + "grad_norm": 1.6981100641923372, + "learning_rate": 1.89589425081339e-05, + "loss": 1.0307, + "step": 1069 + }, + { + "epoch": 0.17, + "grad_norm": 1.3960011012817106, + "learning_rate": 1.895662245240475e-05, + "loss": 0.9262, + "step": 1070 + }, + { + "epoch": 0.17, + "grad_norm": 1.4901549104374738, + "learning_rate": 1.8954299956644757e-05, + "loss": 0.9436, + "step": 1071 + }, + { + "epoch": 0.17, + "grad_norm": 1.6108017616592303, + "learning_rate": 1.8951975021486623e-05, + "loss": 0.9871, + "step": 1072 + }, + { + "epoch": 0.17, + "grad_norm": 1.4882282932251385, + "learning_rate": 1.8949647647563733e-05, + "loss": 1.0219, + "step": 1073 + }, + { + "epoch": 0.17, + "grad_norm": 1.3748520648280866, + "learning_rate": 1.8947317835510125e-05, + "loss": 0.9033, + "step": 1074 + }, + { + "epoch": 0.17, + "grad_norm": 1.3877001909281639, + "learning_rate": 1.8944985585960504e-05, + "loss": 0.9694, + "step": 1075 + }, + { + "epoch": 0.17, + "grad_norm": 1.5869913628164523, + "learning_rate": 1.8942650899550237e-05, + "loss": 0.8942, + "step": 1076 + }, + { + "epoch": 0.17, + "grad_norm": 1.8213825145605622, + "learning_rate": 1.8940313776915365e-05, + "loss": 0.9357, + "step": 1077 + }, + { + "epoch": 0.17, + "grad_norm": 1.3631246519186686, + "learning_rate": 1.8937974218692577e-05, + "loss": 0.946, + "step": 1078 + }, + { + "epoch": 0.17, + "grad_norm": 1.4662945169032926, + "learning_rate": 1.8935632225519232e-05, + "loss": 0.8852, + "step": 1079 + }, + { + "epoch": 0.17, + "grad_norm": 1.7868894178492483, + "learning_rate": 1.893328779803336e-05, + "loss": 0.886, + "step": 1080 + }, + { + "epoch": 0.17, + "grad_norm": 1.6125883006622244, + "learning_rate": 1.8930940936873644e-05, + "loss": 0.9522, + "step": 1081 + }, + { + "epoch": 0.17, + "grad_norm": 1.4192700614985427, + "learning_rate": 1.892859164267943e-05, + "loss": 0.962, + "step": 1082 + }, + { + "epoch": 0.17, + "grad_norm": 1.4507946436057837, + "learning_rate": 1.892623991609074e-05, + "loss": 0.8659, + "step": 1083 + }, + { + "epoch": 0.17, + "grad_norm": 1.452635378796897, + "learning_rate": 1.892388575774824e-05, + "loss": 0.9195, + "step": 1084 + }, + { + "epoch": 0.17, + "grad_norm": 1.490699023182195, + "learning_rate": 1.892152916829327e-05, + "loss": 0.9209, + "step": 1085 + }, + { + "epoch": 0.17, + "grad_norm": 1.31694368787653, + "learning_rate": 1.8919170148367837e-05, + "loss": 0.9767, + "step": 1086 + }, + { + "epoch": 0.18, + "grad_norm": 1.5114321942263294, + "learning_rate": 1.891680869861459e-05, + "loss": 0.9192, + "step": 1087 + }, + { + "epoch": 0.18, + "grad_norm": 0.8074507190035066, + "learning_rate": 1.8914444819676866e-05, + "loss": 0.322, + "step": 1088 + }, + { + "epoch": 0.18, + "grad_norm": 1.6575995763331053, + "learning_rate": 1.8912078512198642e-05, + "loss": 0.9489, + "step": 1089 + }, + { + "epoch": 0.18, + "grad_norm": 1.3927431590611632, + "learning_rate": 1.890970977682457e-05, + "loss": 0.9797, + "step": 1090 + }, + { + "epoch": 0.18, + "grad_norm": 1.200757713558766, + "learning_rate": 1.8907338614199956e-05, + "loss": 0.898, + "step": 1091 + }, + { + "epoch": 0.18, + "grad_norm": 1.234016828994756, + "learning_rate": 1.8904965024970772e-05, + "loss": 0.9421, + "step": 1092 + }, + { + "epoch": 0.18, + "grad_norm": 1.6698401157821126, + "learning_rate": 1.8902589009783648e-05, + "loss": 0.9541, + "step": 1093 + }, + { + "epoch": 0.18, + "grad_norm": 1.3479106849480442, + "learning_rate": 1.8900210569285877e-05, + "loss": 0.9344, + "step": 1094 + }, + { + "epoch": 0.18, + "grad_norm": 0.7299262482782509, + "learning_rate": 1.889782970412541e-05, + "loss": 0.3506, + "step": 1095 + }, + { + "epoch": 0.18, + "grad_norm": 1.562064848148622, + "learning_rate": 1.8895446414950864e-05, + "loss": 0.9812, + "step": 1096 + }, + { + "epoch": 0.18, + "grad_norm": 1.5229708843396277, + "learning_rate": 1.889306070241151e-05, + "loss": 0.9483, + "step": 1097 + }, + { + "epoch": 0.18, + "grad_norm": 1.285892721951612, + "learning_rate": 1.8890672567157278e-05, + "loss": 0.9576, + "step": 1098 + }, + { + "epoch": 0.18, + "grad_norm": 1.4881002836766306, + "learning_rate": 1.888828200983877e-05, + "loss": 0.9378, + "step": 1099 + }, + { + "epoch": 0.18, + "grad_norm": 1.6490733820784063, + "learning_rate": 1.8885889031107232e-05, + "loss": 0.9404, + "step": 1100 + }, + { + "epoch": 0.18, + "grad_norm": 1.5013302620219808, + "learning_rate": 1.8883493631614583e-05, + "loss": 1.0075, + "step": 1101 + }, + { + "epoch": 0.18, + "grad_norm": 1.3918298825132833, + "learning_rate": 1.8881095812013394e-05, + "loss": 0.9868, + "step": 1102 + }, + { + "epoch": 0.18, + "grad_norm": 1.5932968584698821, + "learning_rate": 1.8878695572956896e-05, + "loss": 1.0453, + "step": 1103 + }, + { + "epoch": 0.18, + "grad_norm": 1.53955576073458, + "learning_rate": 1.8876292915098982e-05, + "loss": 0.9656, + "step": 1104 + }, + { + "epoch": 0.18, + "grad_norm": 1.3119470894855925, + "learning_rate": 1.8873887839094202e-05, + "loss": 0.9613, + "step": 1105 + }, + { + "epoch": 0.18, + "grad_norm": 1.6138746611131145, + "learning_rate": 1.8871480345597763e-05, + "loss": 0.9622, + "step": 1106 + }, + { + "epoch": 0.18, + "grad_norm": 1.4441059876080875, + "learning_rate": 1.8869070435265535e-05, + "loss": 0.9152, + "step": 1107 + }, + { + "epoch": 0.18, + "grad_norm": 1.5065047857601885, + "learning_rate": 1.8866658108754045e-05, + "loss": 0.9526, + "step": 1108 + }, + { + "epoch": 0.18, + "grad_norm": 1.5239810994676692, + "learning_rate": 1.886424336672047e-05, + "loss": 0.9679, + "step": 1109 + }, + { + "epoch": 0.18, + "grad_norm": 1.5806650172817183, + "learning_rate": 1.8861826209822662e-05, + "loss": 0.9525, + "step": 1110 + }, + { + "epoch": 0.18, + "grad_norm": 1.3122113079226483, + "learning_rate": 1.8859406638719117e-05, + "loss": 0.9059, + "step": 1111 + }, + { + "epoch": 0.18, + "grad_norm": 1.3703785356113727, + "learning_rate": 1.885698465406899e-05, + "loss": 0.8664, + "step": 1112 + }, + { + "epoch": 0.18, + "grad_norm": 1.5994870426690446, + "learning_rate": 1.8854560256532098e-05, + "loss": 0.9265, + "step": 1113 + }, + { + "epoch": 0.18, + "grad_norm": 1.3044443439277875, + "learning_rate": 1.885213344676892e-05, + "loss": 0.9818, + "step": 1114 + }, + { + "epoch": 0.18, + "grad_norm": 1.529971318708502, + "learning_rate": 1.8849704225440578e-05, + "loss": 0.8763, + "step": 1115 + }, + { + "epoch": 0.18, + "grad_norm": 1.4893470818701977, + "learning_rate": 1.884727259320886e-05, + "loss": 0.9507, + "step": 1116 + }, + { + "epoch": 0.18, + "grad_norm": 1.42765741786646, + "learning_rate": 1.8844838550736215e-05, + "loss": 0.9311, + "step": 1117 + }, + { + "epoch": 0.18, + "grad_norm": 1.6135612099403698, + "learning_rate": 1.8842402098685735e-05, + "loss": 0.9908, + "step": 1118 + }, + { + "epoch": 0.18, + "grad_norm": 1.2903163360701284, + "learning_rate": 1.883996323772118e-05, + "loss": 0.9129, + "step": 1119 + }, + { + "epoch": 0.18, + "grad_norm": 1.737358077431292, + "learning_rate": 1.883752196850697e-05, + "loss": 0.9827, + "step": 1120 + }, + { + "epoch": 0.18, + "grad_norm": 1.4722235091884055, + "learning_rate": 1.8835078291708164e-05, + "loss": 0.9511, + "step": 1121 + }, + { + "epoch": 0.18, + "grad_norm": 1.714144716316613, + "learning_rate": 1.8832632207990493e-05, + "loss": 0.9142, + "step": 1122 + }, + { + "epoch": 0.18, + "grad_norm": 1.644107492176041, + "learning_rate": 1.883018371802033e-05, + "loss": 0.9779, + "step": 1123 + }, + { + "epoch": 0.18, + "grad_norm": 1.4983664772258791, + "learning_rate": 1.8827732822464723e-05, + "loss": 0.9729, + "step": 1124 + }, + { + "epoch": 0.18, + "grad_norm": 1.554168042305571, + "learning_rate": 1.882527952199136e-05, + "loss": 0.8862, + "step": 1125 + }, + { + "epoch": 0.18, + "grad_norm": 1.5179550541712927, + "learning_rate": 1.8822823817268577e-05, + "loss": 0.9066, + "step": 1126 + }, + { + "epoch": 0.18, + "grad_norm": 1.4758783734846341, + "learning_rate": 1.8820365708965385e-05, + "loss": 0.9422, + "step": 1127 + }, + { + "epoch": 0.18, + "grad_norm": 0.8641786711497633, + "learning_rate": 1.881790519775144e-05, + "loss": 0.3186, + "step": 1128 + }, + { + "epoch": 0.18, + "grad_norm": 1.5212905696695405, + "learning_rate": 1.8815442284297055e-05, + "loss": 0.9316, + "step": 1129 + }, + { + "epoch": 0.18, + "grad_norm": 1.2877008471260802, + "learning_rate": 1.881297696927319e-05, + "loss": 0.9422, + "step": 1130 + }, + { + "epoch": 0.18, + "grad_norm": 1.3505760607630661, + "learning_rate": 1.881050925335147e-05, + "loss": 1.0274, + "step": 1131 + }, + { + "epoch": 0.18, + "grad_norm": 1.3627921145090331, + "learning_rate": 1.8808039137204162e-05, + "loss": 0.9084, + "step": 1132 + }, + { + "epoch": 0.18, + "grad_norm": 1.2600794430873121, + "learning_rate": 1.8805566621504203e-05, + "loss": 0.9465, + "step": 1133 + }, + { + "epoch": 0.18, + "grad_norm": 1.6269075128543034, + "learning_rate": 1.8803091706925168e-05, + "loss": 0.9375, + "step": 1134 + }, + { + "epoch": 0.18, + "grad_norm": 1.6372073623197263, + "learning_rate": 1.8800614394141292e-05, + "loss": 0.9401, + "step": 1135 + }, + { + "epoch": 0.18, + "grad_norm": 1.5450912622652375, + "learning_rate": 1.8798134683827464e-05, + "loss": 0.9609, + "step": 1136 + }, + { + "epoch": 0.18, + "grad_norm": 1.287283467398738, + "learning_rate": 1.879565257665923e-05, + "loss": 0.9315, + "step": 1137 + }, + { + "epoch": 0.18, + "grad_norm": 0.7992698432162705, + "learning_rate": 1.8793168073312776e-05, + "loss": 0.301, + "step": 1138 + }, + { + "epoch": 0.18, + "grad_norm": 1.585304787051378, + "learning_rate": 1.8790681174464956e-05, + "loss": 0.933, + "step": 1139 + }, + { + "epoch": 0.18, + "grad_norm": 1.4715045648834797, + "learning_rate": 1.8788191880793264e-05, + "loss": 0.9508, + "step": 1140 + }, + { + "epoch": 0.18, + "grad_norm": 1.63923169191026, + "learning_rate": 1.878570019297586e-05, + "loss": 0.8732, + "step": 1141 + }, + { + "epoch": 0.18, + "grad_norm": 1.5074090349896567, + "learning_rate": 1.8783206111691543e-05, + "loss": 1.0033, + "step": 1142 + }, + { + "epoch": 0.18, + "grad_norm": 1.4349974021832033, + "learning_rate": 1.878070963761977e-05, + "loss": 0.9647, + "step": 1143 + }, + { + "epoch": 0.18, + "grad_norm": 1.5126603998596777, + "learning_rate": 1.877821077144065e-05, + "loss": 0.8579, + "step": 1144 + }, + { + "epoch": 0.18, + "grad_norm": 1.470026454555716, + "learning_rate": 1.8775709513834945e-05, + "loss": 0.9294, + "step": 1145 + }, + { + "epoch": 0.18, + "grad_norm": 1.3708458048630978, + "learning_rate": 1.8773205865484065e-05, + "loss": 0.9371, + "step": 1146 + }, + { + "epoch": 0.18, + "grad_norm": 1.4928609766205274, + "learning_rate": 1.8770699827070068e-05, + "loss": 0.9347, + "step": 1147 + }, + { + "epoch": 0.18, + "grad_norm": 2.062066932208199, + "learning_rate": 1.8768191399275678e-05, + "loss": 1.0131, + "step": 1148 + }, + { + "epoch": 0.19, + "grad_norm": 0.8561238595795774, + "learning_rate": 1.876568058278425e-05, + "loss": 0.3384, + "step": 1149 + }, + { + "epoch": 0.19, + "grad_norm": 1.301562715320867, + "learning_rate": 1.87631673782798e-05, + "loss": 0.955, + "step": 1150 + }, + { + "epoch": 0.19, + "grad_norm": 1.3476308675117146, + "learning_rate": 1.8760651786447006e-05, + "loss": 0.9473, + "step": 1151 + }, + { + "epoch": 0.19, + "grad_norm": 0.8243190040000618, + "learning_rate": 1.8758133807971175e-05, + "loss": 0.3618, + "step": 1152 + }, + { + "epoch": 0.19, + "grad_norm": 1.4619404497995054, + "learning_rate": 1.8755613443538275e-05, + "loss": 0.9, + "step": 1153 + }, + { + "epoch": 0.19, + "grad_norm": 1.5911175782046088, + "learning_rate": 1.875309069383492e-05, + "loss": 1.0168, + "step": 1154 + }, + { + "epoch": 0.19, + "grad_norm": 1.8607834101988006, + "learning_rate": 1.875056555954838e-05, + "loss": 0.9482, + "step": 1155 + }, + { + "epoch": 0.19, + "grad_norm": 1.508935010202807, + "learning_rate": 1.8748038041366573e-05, + "loss": 0.9337, + "step": 1156 + }, + { + "epoch": 0.19, + "grad_norm": 1.5174446446153425, + "learning_rate": 1.8745508139978063e-05, + "loss": 0.9258, + "step": 1157 + }, + { + "epoch": 0.19, + "grad_norm": 1.4569585081458498, + "learning_rate": 1.8742975856072064e-05, + "loss": 0.9465, + "step": 1158 + }, + { + "epoch": 0.19, + "grad_norm": 1.2930779101284477, + "learning_rate": 1.874044119033844e-05, + "loss": 0.8834, + "step": 1159 + }, + { + "epoch": 0.19, + "grad_norm": 1.3076541901097145, + "learning_rate": 1.8737904143467703e-05, + "loss": 0.9218, + "step": 1160 + }, + { + "epoch": 0.19, + "grad_norm": 1.4780577759497269, + "learning_rate": 1.8735364716151017e-05, + "loss": 0.9822, + "step": 1161 + }, + { + "epoch": 0.19, + "grad_norm": 1.3810395348523756, + "learning_rate": 1.873282290908019e-05, + "loss": 0.9556, + "step": 1162 + }, + { + "epoch": 0.19, + "grad_norm": 1.3310767547670135, + "learning_rate": 1.873027872294768e-05, + "loss": 0.8971, + "step": 1163 + }, + { + "epoch": 0.19, + "grad_norm": 1.5574735760628649, + "learning_rate": 1.8727732158446598e-05, + "loss": 0.9192, + "step": 1164 + }, + { + "epoch": 0.19, + "grad_norm": 1.4796583585930962, + "learning_rate": 1.872518321627069e-05, + "loss": 1.0039, + "step": 1165 + }, + { + "epoch": 0.19, + "grad_norm": 1.4270175746979632, + "learning_rate": 1.8722631897114364e-05, + "loss": 0.9247, + "step": 1166 + }, + { + "epoch": 0.19, + "grad_norm": 1.5334476959247347, + "learning_rate": 1.872007820167267e-05, + "loss": 0.9151, + "step": 1167 + }, + { + "epoch": 0.19, + "grad_norm": 1.355961266972235, + "learning_rate": 1.8717522130641305e-05, + "loss": 0.8912, + "step": 1168 + }, + { + "epoch": 0.19, + "grad_norm": 1.415079623023431, + "learning_rate": 1.8714963684716605e-05, + "loss": 0.9768, + "step": 1169 + }, + { + "epoch": 0.19, + "grad_norm": 1.346793709686554, + "learning_rate": 1.8712402864595575e-05, + "loss": 0.9084, + "step": 1170 + }, + { + "epoch": 0.19, + "grad_norm": 0.8871184574100671, + "learning_rate": 1.8709839670975843e-05, + "loss": 0.318, + "step": 1171 + }, + { + "epoch": 0.19, + "grad_norm": 1.3660321554956938, + "learning_rate": 1.8707274104555696e-05, + "loss": 0.9352, + "step": 1172 + }, + { + "epoch": 0.19, + "grad_norm": 1.6630718106841917, + "learning_rate": 1.8704706166034067e-05, + "loss": 0.9703, + "step": 1173 + }, + { + "epoch": 0.19, + "grad_norm": 1.4470234847244023, + "learning_rate": 1.870213585611053e-05, + "loss": 1.0173, + "step": 1174 + }, + { + "epoch": 0.19, + "grad_norm": 1.601384974154196, + "learning_rate": 1.869956317548531e-05, + "loss": 0.8566, + "step": 1175 + }, + { + "epoch": 0.19, + "grad_norm": 1.5269438632914512, + "learning_rate": 1.8696988124859278e-05, + "loss": 0.9693, + "step": 1176 + }, + { + "epoch": 0.19, + "grad_norm": 1.4813745327098924, + "learning_rate": 1.8694410704933944e-05, + "loss": 0.9225, + "step": 1177 + }, + { + "epoch": 0.19, + "grad_norm": 3.103600196574336, + "learning_rate": 1.8691830916411473e-05, + "loss": 0.9414, + "step": 1178 + }, + { + "epoch": 0.19, + "grad_norm": 1.414423166825318, + "learning_rate": 1.868924875999467e-05, + "loss": 0.8825, + "step": 1179 + }, + { + "epoch": 0.19, + "grad_norm": 1.5593897966454986, + "learning_rate": 1.8686664236386984e-05, + "loss": 0.9553, + "step": 1180 + }, + { + "epoch": 0.19, + "grad_norm": 1.4682804343702456, + "learning_rate": 1.8684077346292507e-05, + "loss": 0.9155, + "step": 1181 + }, + { + "epoch": 0.19, + "grad_norm": 1.3887671848376817, + "learning_rate": 1.8681488090415984e-05, + "loss": 0.946, + "step": 1182 + }, + { + "epoch": 0.19, + "grad_norm": 1.3184388157963884, + "learning_rate": 1.86788964694628e-05, + "loss": 0.853, + "step": 1183 + }, + { + "epoch": 0.19, + "grad_norm": 1.8966923215655658, + "learning_rate": 1.8676302484138982e-05, + "loss": 0.957, + "step": 1184 + }, + { + "epoch": 0.19, + "grad_norm": 1.2800339679243535, + "learning_rate": 1.8673706135151205e-05, + "loss": 0.8837, + "step": 1185 + }, + { + "epoch": 0.19, + "grad_norm": 1.421762006063665, + "learning_rate": 1.8671107423206785e-05, + "loss": 0.9173, + "step": 1186 + }, + { + "epoch": 0.19, + "grad_norm": 1.5622682411453128, + "learning_rate": 1.866850634901368e-05, + "loss": 0.9332, + "step": 1187 + }, + { + "epoch": 0.19, + "grad_norm": 1.6950343701530162, + "learning_rate": 1.8665902913280498e-05, + "loss": 0.9734, + "step": 1188 + }, + { + "epoch": 0.19, + "grad_norm": 1.4402419255614587, + "learning_rate": 1.8663297116716488e-05, + "loss": 0.9079, + "step": 1189 + }, + { + "epoch": 0.19, + "grad_norm": 1.837156569521358, + "learning_rate": 1.8660688960031533e-05, + "loss": 0.8776, + "step": 1190 + }, + { + "epoch": 0.19, + "grad_norm": 1.614721114037799, + "learning_rate": 1.8658078443936175e-05, + "loss": 0.9994, + "step": 1191 + }, + { + "epoch": 0.19, + "grad_norm": 0.9409368203318107, + "learning_rate": 1.865546556914159e-05, + "loss": 0.3557, + "step": 1192 + }, + { + "epoch": 0.19, + "grad_norm": 1.3149849926580364, + "learning_rate": 1.865285033635959e-05, + "loss": 0.8661, + "step": 1193 + }, + { + "epoch": 0.19, + "grad_norm": 1.3275920780445138, + "learning_rate": 1.8650232746302645e-05, + "loss": 0.9852, + "step": 1194 + }, + { + "epoch": 0.19, + "grad_norm": 1.503481201695558, + "learning_rate": 1.8647612799683853e-05, + "loss": 0.9394, + "step": 1195 + }, + { + "epoch": 0.19, + "grad_norm": 1.4441586334678203, + "learning_rate": 1.864499049721696e-05, + "loss": 0.9452, + "step": 1196 + }, + { + "epoch": 0.19, + "grad_norm": 1.6884608270762453, + "learning_rate": 1.8642365839616358e-05, + "loss": 0.917, + "step": 1197 + }, + { + "epoch": 0.19, + "grad_norm": 1.524430090986756, + "learning_rate": 1.863973882759707e-05, + "loss": 0.9422, + "step": 1198 + }, + { + "epoch": 0.19, + "grad_norm": 2.23203692617419, + "learning_rate": 1.863710946187477e-05, + "loss": 0.9752, + "step": 1199 + }, + { + "epoch": 0.19, + "grad_norm": 0.9073838691946678, + "learning_rate": 1.8634477743165772e-05, + "loss": 0.3052, + "step": 1200 + }, + { + "epoch": 0.19, + "grad_norm": 1.3363113751570177, + "learning_rate": 1.863184367218702e-05, + "loss": 0.8275, + "step": 1201 + }, + { + "epoch": 0.19, + "grad_norm": 1.667548573947656, + "learning_rate": 1.862920724965612e-05, + "loss": 0.9256, + "step": 1202 + }, + { + "epoch": 0.19, + "grad_norm": 1.4030477472137224, + "learning_rate": 1.8626568476291296e-05, + "loss": 0.9314, + "step": 1203 + }, + { + "epoch": 0.19, + "grad_norm": 1.549424248544921, + "learning_rate": 1.8623927352811432e-05, + "loss": 0.974, + "step": 1204 + }, + { + "epoch": 0.19, + "grad_norm": 1.3990736678480602, + "learning_rate": 1.862128387993603e-05, + "loss": 0.9349, + "step": 1205 + }, + { + "epoch": 0.19, + "grad_norm": 1.2637281899086543, + "learning_rate": 1.8618638058385255e-05, + "loss": 0.8247, + "step": 1206 + }, + { + "epoch": 0.19, + "grad_norm": 1.4868923305675628, + "learning_rate": 1.8615989888879898e-05, + "loss": 0.9991, + "step": 1207 + }, + { + "epoch": 0.19, + "grad_norm": 1.5020154901610467, + "learning_rate": 1.8613339372141395e-05, + "loss": 0.9165, + "step": 1208 + }, + { + "epoch": 0.19, + "grad_norm": 1.3037904800837983, + "learning_rate": 1.861068650889182e-05, + "loss": 0.9479, + "step": 1209 + }, + { + "epoch": 0.19, + "grad_norm": 0.8578471956940509, + "learning_rate": 1.8608031299853882e-05, + "loss": 0.3369, + "step": 1210 + }, + { + "epoch": 0.2, + "grad_norm": 1.4467898306177405, + "learning_rate": 1.860537374575094e-05, + "loss": 0.959, + "step": 1211 + }, + { + "epoch": 0.2, + "grad_norm": 1.5528269788130764, + "learning_rate": 1.860271384730698e-05, + "loss": 0.9586, + "step": 1212 + }, + { + "epoch": 0.2, + "grad_norm": 1.62331017333585, + "learning_rate": 1.860005160524663e-05, + "loss": 0.9701, + "step": 1213 + }, + { + "epoch": 0.2, + "grad_norm": 1.4502500889769396, + "learning_rate": 1.859738702029516e-05, + "loss": 0.8916, + "step": 1214 + }, + { + "epoch": 0.2, + "grad_norm": 0.8434507277450702, + "learning_rate": 1.859472009317848e-05, + "loss": 0.3309, + "step": 1215 + }, + { + "epoch": 0.2, + "grad_norm": 1.4457425366966303, + "learning_rate": 1.8592050824623133e-05, + "loss": 0.936, + "step": 1216 + }, + { + "epoch": 0.2, + "grad_norm": 1.5009382195871088, + "learning_rate": 1.8589379215356302e-05, + "loss": 0.9586, + "step": 1217 + }, + { + "epoch": 0.2, + "grad_norm": 1.788095185668963, + "learning_rate": 1.8586705266105802e-05, + "loss": 0.9034, + "step": 1218 + }, + { + "epoch": 0.2, + "grad_norm": 1.4663373345382535, + "learning_rate": 1.858402897760009e-05, + "loss": 0.9832, + "step": 1219 + }, + { + "epoch": 0.2, + "grad_norm": 1.0309724199353967, + "learning_rate": 1.858135035056827e-05, + "loss": 0.3465, + "step": 1220 + }, + { + "epoch": 0.2, + "grad_norm": 1.4003839865065622, + "learning_rate": 1.8578669385740065e-05, + "loss": 0.9917, + "step": 1221 + }, + { + "epoch": 0.2, + "grad_norm": 1.666548967241235, + "learning_rate": 1.8575986083845848e-05, + "loss": 0.9907, + "step": 1222 + }, + { + "epoch": 0.2, + "grad_norm": 1.3283679585661043, + "learning_rate": 1.8573300445616628e-05, + "loss": 0.9881, + "step": 1223 + }, + { + "epoch": 0.2, + "grad_norm": 1.6300443702930745, + "learning_rate": 1.8570612471784034e-05, + "loss": 0.945, + "step": 1224 + }, + { + "epoch": 0.2, + "grad_norm": 1.5459008424683574, + "learning_rate": 1.8567922163080357e-05, + "loss": 0.9585, + "step": 1225 + }, + { + "epoch": 0.2, + "grad_norm": 1.5981840343820413, + "learning_rate": 1.8565229520238504e-05, + "loss": 0.9809, + "step": 1226 + }, + { + "epoch": 0.2, + "grad_norm": 1.4133003246228455, + "learning_rate": 1.856253454399203e-05, + "loss": 1.0399, + "step": 1227 + }, + { + "epoch": 0.2, + "grad_norm": 1.439196440793315, + "learning_rate": 1.855983723507512e-05, + "loss": 0.9188, + "step": 1228 + }, + { + "epoch": 0.2, + "grad_norm": 1.7505249285619295, + "learning_rate": 1.8557137594222588e-05, + "loss": 0.9124, + "step": 1229 + }, + { + "epoch": 0.2, + "grad_norm": 1.5080343821931554, + "learning_rate": 1.8554435622169903e-05, + "loss": 0.8637, + "step": 1230 + }, + { + "epoch": 0.2, + "grad_norm": 1.5053126787500748, + "learning_rate": 1.8551731319653147e-05, + "loss": 0.9365, + "step": 1231 + }, + { + "epoch": 0.2, + "grad_norm": 1.3074386455559, + "learning_rate": 1.854902468740905e-05, + "loss": 0.8649, + "step": 1232 + }, + { + "epoch": 0.2, + "grad_norm": 1.8538496522426393, + "learning_rate": 1.8546315726174973e-05, + "loss": 0.9378, + "step": 1233 + }, + { + "epoch": 0.2, + "grad_norm": 1.422759533552292, + "learning_rate": 1.854360443668891e-05, + "loss": 0.9003, + "step": 1234 + }, + { + "epoch": 0.2, + "grad_norm": 1.505211989699172, + "learning_rate": 1.8540890819689497e-05, + "loss": 0.9358, + "step": 1235 + }, + { + "epoch": 0.2, + "grad_norm": 1.5046103513652982, + "learning_rate": 1.8538174875915992e-05, + "loss": 0.8674, + "step": 1236 + }, + { + "epoch": 0.2, + "grad_norm": 1.4898623205786867, + "learning_rate": 1.853545660610829e-05, + "loss": 0.9721, + "step": 1237 + }, + { + "epoch": 0.2, + "grad_norm": 1.5865120340360355, + "learning_rate": 1.853273601100693e-05, + "loss": 0.9269, + "step": 1238 + }, + { + "epoch": 0.2, + "grad_norm": 1.2998279337305358, + "learning_rate": 1.8530013091353075e-05, + "loss": 0.8891, + "step": 1239 + }, + { + "epoch": 0.2, + "grad_norm": 1.4829570924869244, + "learning_rate": 1.8527287847888524e-05, + "loss": 0.9925, + "step": 1240 + }, + { + "epoch": 0.2, + "grad_norm": 1.3359230267984405, + "learning_rate": 1.8524560281355705e-05, + "loss": 0.9579, + "step": 1241 + }, + { + "epoch": 0.2, + "grad_norm": 1.5641327362029283, + "learning_rate": 1.8521830392497685e-05, + "loss": 0.8456, + "step": 1242 + }, + { + "epoch": 0.2, + "grad_norm": 0.8720659777030559, + "learning_rate": 1.8519098182058163e-05, + "loss": 0.342, + "step": 1243 + }, + { + "epoch": 0.2, + "grad_norm": 1.3945502943052863, + "learning_rate": 1.8516363650781464e-05, + "loss": 0.9569, + "step": 1244 + }, + { + "epoch": 0.2, + "grad_norm": 1.5314563644398622, + "learning_rate": 1.851362679941255e-05, + "loss": 0.9513, + "step": 1245 + }, + { + "epoch": 0.2, + "grad_norm": 1.5244693914070613, + "learning_rate": 1.851088762869702e-05, + "loss": 0.9881, + "step": 1246 + }, + { + "epoch": 0.2, + "grad_norm": 1.4379823098439193, + "learning_rate": 1.850814613938109e-05, + "loss": 0.9068, + "step": 1247 + }, + { + "epoch": 0.2, + "grad_norm": 1.7768918065409423, + "learning_rate": 1.850540233221163e-05, + "loss": 0.9094, + "step": 1248 + }, + { + "epoch": 0.2, + "grad_norm": 1.486696633712154, + "learning_rate": 1.8502656207936117e-05, + "loss": 0.9771, + "step": 1249 + }, + { + "epoch": 0.2, + "grad_norm": 1.6675580912852332, + "learning_rate": 1.849990776730268e-05, + "loss": 0.8733, + "step": 1250 + }, + { + "epoch": 0.2, + "grad_norm": 1.4415213298603884, + "learning_rate": 1.849715701106006e-05, + "loss": 0.9142, + "step": 1251 + }, + { + "epoch": 0.2, + "grad_norm": 1.5622978049821856, + "learning_rate": 1.849440393995765e-05, + "loss": 0.9651, + "step": 1252 + }, + { + "epoch": 0.2, + "grad_norm": 1.6258274582781227, + "learning_rate": 1.8491648554745457e-05, + "loss": 0.9355, + "step": 1253 + }, + { + "epoch": 0.2, + "grad_norm": 1.4049389890520296, + "learning_rate": 1.848889085617412e-05, + "loss": 0.9626, + "step": 1254 + }, + { + "epoch": 0.2, + "grad_norm": 1.399927875510835, + "learning_rate": 1.848613084499492e-05, + "loss": 0.9147, + "step": 1255 + }, + { + "epoch": 0.2, + "grad_norm": 1.3914621889022578, + "learning_rate": 1.8483368521959753e-05, + "loss": 0.9508, + "step": 1256 + }, + { + "epoch": 0.2, + "grad_norm": 1.3611343501056727, + "learning_rate": 1.8480603887821157e-05, + "loss": 0.9264, + "step": 1257 + }, + { + "epoch": 0.2, + "grad_norm": 1.4065073539432384, + "learning_rate": 1.8477836943332295e-05, + "loss": 0.905, + "step": 1258 + }, + { + "epoch": 0.2, + "grad_norm": 1.8270361502400705, + "learning_rate": 1.8475067689246954e-05, + "loss": 0.9203, + "step": 1259 + }, + { + "epoch": 0.2, + "grad_norm": 1.2900585054698248, + "learning_rate": 1.8472296126319557e-05, + "loss": 0.9391, + "step": 1260 + }, + { + "epoch": 0.2, + "grad_norm": 1.7535506293630616, + "learning_rate": 1.8469522255305156e-05, + "loss": 0.858, + "step": 1261 + }, + { + "epoch": 0.2, + "grad_norm": 1.4980231308702827, + "learning_rate": 1.846674607695943e-05, + "loss": 0.9307, + "step": 1262 + }, + { + "epoch": 0.2, + "grad_norm": 1.5906151925582785, + "learning_rate": 1.8463967592038684e-05, + "loss": 0.9537, + "step": 1263 + }, + { + "epoch": 0.2, + "grad_norm": 1.4208189938808407, + "learning_rate": 1.8461186801299857e-05, + "loss": 0.9038, + "step": 1264 + }, + { + "epoch": 0.2, + "grad_norm": 1.2556586972880175, + "learning_rate": 1.845840370550051e-05, + "loss": 0.9429, + "step": 1265 + }, + { + "epoch": 0.2, + "grad_norm": 1.328714450291548, + "learning_rate": 1.8455618305398836e-05, + "loss": 0.9135, + "step": 1266 + }, + { + "epoch": 0.2, + "grad_norm": 1.462962714126474, + "learning_rate": 1.8452830601753655e-05, + "loss": 1.0109, + "step": 1267 + }, + { + "epoch": 0.2, + "grad_norm": 1.341869331469154, + "learning_rate": 1.8450040595324416e-05, + "loss": 0.992, + "step": 1268 + }, + { + "epoch": 0.2, + "grad_norm": 0.9963080811381916, + "learning_rate": 1.844724828687119e-05, + "loss": 0.3271, + "step": 1269 + }, + { + "epoch": 0.2, + "grad_norm": 1.6746499326513777, + "learning_rate": 1.8444453677154683e-05, + "loss": 0.8863, + "step": 1270 + }, + { + "epoch": 0.2, + "grad_norm": 1.5142722834084215, + "learning_rate": 1.844165676693622e-05, + "loss": 1.0185, + "step": 1271 + }, + { + "epoch": 0.2, + "grad_norm": 1.5738978614988803, + "learning_rate": 1.843885755697776e-05, + "loss": 0.9318, + "step": 1272 + }, + { + "epoch": 0.21, + "grad_norm": 1.3873979859341738, + "learning_rate": 1.843605604804188e-05, + "loss": 0.9643, + "step": 1273 + }, + { + "epoch": 0.21, + "grad_norm": 1.313944898302993, + "learning_rate": 1.8433252240891793e-05, + "loss": 0.8947, + "step": 1274 + }, + { + "epoch": 0.21, + "grad_norm": 1.4738669964597948, + "learning_rate": 1.8430446136291333e-05, + "loss": 0.9366, + "step": 1275 + }, + { + "epoch": 0.21, + "grad_norm": 1.6835590107823177, + "learning_rate": 1.8427637735004957e-05, + "loss": 0.9547, + "step": 1276 + }, + { + "epoch": 0.21, + "grad_norm": 1.295469361069176, + "learning_rate": 1.8424827037797755e-05, + "loss": 0.8369, + "step": 1277 + }, + { + "epoch": 0.21, + "grad_norm": 1.5508988789836728, + "learning_rate": 1.8422014045435433e-05, + "loss": 0.8412, + "step": 1278 + }, + { + "epoch": 0.21, + "grad_norm": 1.7754959084670252, + "learning_rate": 1.841919875868433e-05, + "loss": 0.9368, + "step": 1279 + }, + { + "epoch": 0.21, + "grad_norm": 1.5582288898203667, + "learning_rate": 1.841638117831141e-05, + "loss": 1.0028, + "step": 1280 + }, + { + "epoch": 0.21, + "grad_norm": 1.5337265354324514, + "learning_rate": 1.8413561305084262e-05, + "loss": 0.9136, + "step": 1281 + }, + { + "epoch": 0.21, + "grad_norm": 1.6309186878954023, + "learning_rate": 1.841073913977109e-05, + "loss": 0.9371, + "step": 1282 + }, + { + "epoch": 0.21, + "grad_norm": 1.4181033272892356, + "learning_rate": 1.8407914683140734e-05, + "loss": 0.884, + "step": 1283 + }, + { + "epoch": 0.21, + "grad_norm": 0.9395665092408731, + "learning_rate": 1.840508793596265e-05, + "loss": 0.3704, + "step": 1284 + }, + { + "epoch": 0.21, + "grad_norm": 1.5545111321326446, + "learning_rate": 1.8402258899006926e-05, + "loss": 1.0153, + "step": 1285 + }, + { + "epoch": 0.21, + "grad_norm": 1.3833091471668526, + "learning_rate": 1.839942757304427e-05, + "loss": 0.8784, + "step": 1286 + }, + { + "epoch": 0.21, + "grad_norm": 1.5223349775433335, + "learning_rate": 1.8396593958846013e-05, + "loss": 0.9926, + "step": 1287 + }, + { + "epoch": 0.21, + "grad_norm": 1.7004279027411002, + "learning_rate": 1.8393758057184104e-05, + "loss": 0.8789, + "step": 1288 + }, + { + "epoch": 0.21, + "grad_norm": 1.670071427302, + "learning_rate": 1.8390919868831126e-05, + "loss": 0.9313, + "step": 1289 + }, + { + "epoch": 0.21, + "grad_norm": 1.407897248691519, + "learning_rate": 1.838807939456028e-05, + "loss": 0.9167, + "step": 1290 + }, + { + "epoch": 0.21, + "grad_norm": 1.6662236125223822, + "learning_rate": 1.8385236635145385e-05, + "loss": 0.8904, + "step": 1291 + }, + { + "epoch": 0.21, + "grad_norm": 1.808553923874879, + "learning_rate": 1.8382391591360895e-05, + "loss": 0.8968, + "step": 1292 + }, + { + "epoch": 0.21, + "grad_norm": 1.6574027968636342, + "learning_rate": 1.837954426398187e-05, + "loss": 0.936, + "step": 1293 + }, + { + "epoch": 0.21, + "grad_norm": 1.4557466142668847, + "learning_rate": 1.8376694653784003e-05, + "loss": 0.9361, + "step": 1294 + }, + { + "epoch": 0.21, + "grad_norm": 1.287681993450042, + "learning_rate": 1.837384276154361e-05, + "loss": 0.9523, + "step": 1295 + }, + { + "epoch": 0.21, + "grad_norm": 1.8446314445396148, + "learning_rate": 1.8370988588037622e-05, + "loss": 0.9189, + "step": 1296 + }, + { + "epoch": 0.21, + "grad_norm": 1.4881626817826161, + "learning_rate": 1.8368132134043596e-05, + "loss": 0.9017, + "step": 1297 + }, + { + "epoch": 0.21, + "grad_norm": 1.357751210830098, + "learning_rate": 1.836527340033971e-05, + "loss": 0.9336, + "step": 1298 + }, + { + "epoch": 0.21, + "grad_norm": 0.8515179790880286, + "learning_rate": 1.8362412387704752e-05, + "loss": 0.3158, + "step": 1299 + }, + { + "epoch": 0.21, + "grad_norm": 1.238947413885612, + "learning_rate": 1.8359549096918154e-05, + "loss": 0.8944, + "step": 1300 + }, + { + "epoch": 0.21, + "grad_norm": 1.4450494488192385, + "learning_rate": 1.8356683528759948e-05, + "loss": 0.9333, + "step": 1301 + }, + { + "epoch": 0.21, + "grad_norm": 1.568313467914447, + "learning_rate": 1.8353815684010796e-05, + "loss": 0.9468, + "step": 1302 + }, + { + "epoch": 0.21, + "grad_norm": 1.4015008304418182, + "learning_rate": 1.835094556345198e-05, + "loss": 0.8447, + "step": 1303 + }, + { + "epoch": 0.21, + "grad_norm": 1.450318792332983, + "learning_rate": 1.834807316786539e-05, + "loss": 0.9758, + "step": 1304 + }, + { + "epoch": 0.21, + "grad_norm": 0.8496855208376316, + "learning_rate": 1.834519849803356e-05, + "loss": 0.3448, + "step": 1305 + }, + { + "epoch": 0.21, + "grad_norm": 1.266695364697053, + "learning_rate": 1.834232155473962e-05, + "loss": 0.963, + "step": 1306 + }, + { + "epoch": 0.21, + "grad_norm": 1.494139182644815, + "learning_rate": 1.833944233876733e-05, + "loss": 0.9205, + "step": 1307 + }, + { + "epoch": 0.21, + "grad_norm": 1.4495517673908238, + "learning_rate": 1.833656085090107e-05, + "loss": 0.9574, + "step": 1308 + }, + { + "epoch": 0.21, + "grad_norm": 1.601748799909306, + "learning_rate": 1.8333677091925834e-05, + "loss": 0.9027, + "step": 1309 + }, + { + "epoch": 0.21, + "grad_norm": 1.394548727241224, + "learning_rate": 1.8330791062627244e-05, + "loss": 0.9471, + "step": 1310 + }, + { + "epoch": 0.21, + "grad_norm": 1.3164122819054844, + "learning_rate": 1.8327902763791523e-05, + "loss": 0.9645, + "step": 1311 + }, + { + "epoch": 0.21, + "grad_norm": 1.3643497266778957, + "learning_rate": 1.832501219620553e-05, + "loss": 0.9542, + "step": 1312 + }, + { + "epoch": 0.21, + "grad_norm": 1.6410030389251806, + "learning_rate": 1.8322119360656736e-05, + "loss": 0.9107, + "step": 1313 + }, + { + "epoch": 0.21, + "grad_norm": 1.5315100858914268, + "learning_rate": 1.831922425793323e-05, + "loss": 0.8735, + "step": 1314 + }, + { + "epoch": 0.21, + "grad_norm": 1.404916554956228, + "learning_rate": 1.8316326888823714e-05, + "loss": 0.9312, + "step": 1315 + }, + { + "epoch": 0.21, + "grad_norm": 1.2819264250225257, + "learning_rate": 1.8313427254117508e-05, + "loss": 0.8504, + "step": 1316 + }, + { + "epoch": 0.21, + "grad_norm": 1.4039782439841957, + "learning_rate": 1.8310525354604562e-05, + "loss": 0.9725, + "step": 1317 + }, + { + "epoch": 0.21, + "grad_norm": 1.5163038220050726, + "learning_rate": 1.8307621191075425e-05, + "loss": 0.9238, + "step": 1318 + }, + { + "epoch": 0.21, + "grad_norm": 1.5340439562182835, + "learning_rate": 1.8304714764321277e-05, + "loss": 0.9368, + "step": 1319 + }, + { + "epoch": 0.21, + "grad_norm": 0.7682117268905015, + "learning_rate": 1.8301806075133907e-05, + "loss": 0.313, + "step": 1320 + }, + { + "epoch": 0.21, + "grad_norm": 1.7105857610583735, + "learning_rate": 1.829889512430572e-05, + "loss": 0.9423, + "step": 1321 + }, + { + "epoch": 0.21, + "grad_norm": 1.3772880714963722, + "learning_rate": 1.829598191262974e-05, + "loss": 0.933, + "step": 1322 + }, + { + "epoch": 0.21, + "grad_norm": 1.4981942135446906, + "learning_rate": 1.829306644089961e-05, + "loss": 0.8442, + "step": 1323 + }, + { + "epoch": 0.21, + "grad_norm": 2.1867719818237843, + "learning_rate": 1.829014870990958e-05, + "loss": 1.0171, + "step": 1324 + }, + { + "epoch": 0.21, + "grad_norm": 1.2114262763456343, + "learning_rate": 1.8287228720454522e-05, + "loss": 0.8926, + "step": 1325 + }, + { + "epoch": 0.21, + "grad_norm": 1.6088282219957097, + "learning_rate": 1.8284306473329922e-05, + "loss": 0.9661, + "step": 1326 + }, + { + "epoch": 0.21, + "grad_norm": 1.426200734444402, + "learning_rate": 1.828138196933188e-05, + "loss": 0.8712, + "step": 1327 + }, + { + "epoch": 0.21, + "grad_norm": 0.9511094050778167, + "learning_rate": 1.8278455209257113e-05, + "loss": 0.3699, + "step": 1328 + }, + { + "epoch": 0.21, + "grad_norm": 1.384390392094115, + "learning_rate": 1.8275526193902948e-05, + "loss": 0.9003, + "step": 1329 + }, + { + "epoch": 0.21, + "grad_norm": 1.3687000048610198, + "learning_rate": 1.8272594924067333e-05, + "loss": 0.9678, + "step": 1330 + }, + { + "epoch": 0.21, + "grad_norm": 1.6448160341735587, + "learning_rate": 1.8269661400548825e-05, + "loss": 0.9637, + "step": 1331 + }, + { + "epoch": 0.21, + "grad_norm": 1.374549829283314, + "learning_rate": 1.8266725624146596e-05, + "loss": 0.8922, + "step": 1332 + }, + { + "epoch": 0.21, + "grad_norm": 1.3330331359805405, + "learning_rate": 1.8263787595660437e-05, + "loss": 0.9667, + "step": 1333 + }, + { + "epoch": 0.21, + "grad_norm": 1.582131879455361, + "learning_rate": 1.8260847315890738e-05, + "loss": 0.8867, + "step": 1334 + }, + { + "epoch": 0.22, + "grad_norm": 1.5382919600425355, + "learning_rate": 1.8257904785638523e-05, + "loss": 0.8802, + "step": 1335 + }, + { + "epoch": 0.22, + "grad_norm": 1.4028260207321301, + "learning_rate": 1.8254960005705412e-05, + "loss": 0.9602, + "step": 1336 + }, + { + "epoch": 0.22, + "grad_norm": 1.5333344542997693, + "learning_rate": 1.825201297689365e-05, + "loss": 0.9527, + "step": 1337 + }, + { + "epoch": 0.22, + "grad_norm": 1.3300307632337933, + "learning_rate": 1.824906370000608e-05, + "loss": 0.9405, + "step": 1338 + }, + { + "epoch": 0.22, + "grad_norm": 1.5890640126327098, + "learning_rate": 1.8246112175846175e-05, + "loss": 0.9841, + "step": 1339 + }, + { + "epoch": 0.22, + "grad_norm": 1.4043446958461396, + "learning_rate": 1.8243158405218008e-05, + "loss": 0.9513, + "step": 1340 + }, + { + "epoch": 0.22, + "grad_norm": 0.7984676981980211, + "learning_rate": 1.824020238892626e-05, + "loss": 0.3612, + "step": 1341 + }, + { + "epoch": 0.22, + "grad_norm": 0.9291316161668783, + "learning_rate": 1.8237244127776247e-05, + "loss": 0.3711, + "step": 1342 + }, + { + "epoch": 0.22, + "grad_norm": 1.4498200775608228, + "learning_rate": 1.823428362257387e-05, + "loss": 0.9034, + "step": 1343 + }, + { + "epoch": 0.22, + "grad_norm": 1.5454830914803754, + "learning_rate": 1.8231320874125656e-05, + "loss": 0.9306, + "step": 1344 + }, + { + "epoch": 0.22, + "grad_norm": 1.5392866957701152, + "learning_rate": 1.8228355883238737e-05, + "loss": 0.9445, + "step": 1345 + }, + { + "epoch": 0.22, + "grad_norm": 1.4680372091249896, + "learning_rate": 1.822538865072086e-05, + "loss": 0.9346, + "step": 1346 + }, + { + "epoch": 0.22, + "grad_norm": 1.445781957574018, + "learning_rate": 1.822241917738038e-05, + "loss": 0.9471, + "step": 1347 + }, + { + "epoch": 0.22, + "grad_norm": 0.9040202862023211, + "learning_rate": 1.8219447464026264e-05, + "loss": 0.3414, + "step": 1348 + }, + { + "epoch": 0.22, + "grad_norm": 1.402399291131391, + "learning_rate": 1.821647351146809e-05, + "loss": 0.9158, + "step": 1349 + }, + { + "epoch": 0.22, + "grad_norm": 1.452664364463535, + "learning_rate": 1.821349732051604e-05, + "loss": 0.9418, + "step": 1350 + }, + { + "epoch": 0.22, + "grad_norm": 1.4286752449943367, + "learning_rate": 1.8210518891980914e-05, + "loss": 0.9238, + "step": 1351 + }, + { + "epoch": 0.22, + "grad_norm": 1.4395874276519312, + "learning_rate": 1.820753822667412e-05, + "loss": 0.8863, + "step": 1352 + }, + { + "epoch": 0.22, + "grad_norm": 1.4336554191796156, + "learning_rate": 1.8204555325407668e-05, + "loss": 0.9091, + "step": 1353 + }, + { + "epoch": 0.22, + "grad_norm": 1.4948375622308352, + "learning_rate": 1.8201570188994186e-05, + "loss": 0.8974, + "step": 1354 + }, + { + "epoch": 0.22, + "grad_norm": 1.429506926910762, + "learning_rate": 1.8198582818246908e-05, + "loss": 1.0089, + "step": 1355 + }, + { + "epoch": 0.22, + "grad_norm": 1.3747967290722465, + "learning_rate": 1.8195593213979676e-05, + "loss": 0.9066, + "step": 1356 + }, + { + "epoch": 0.22, + "grad_norm": 1.4261110348968973, + "learning_rate": 1.819260137700694e-05, + "loss": 0.937, + "step": 1357 + }, + { + "epoch": 0.22, + "grad_norm": 1.5329262586740509, + "learning_rate": 1.8189607308143754e-05, + "loss": 0.9663, + "step": 1358 + }, + { + "epoch": 0.22, + "grad_norm": 1.929523001543622, + "learning_rate": 1.8186611008205794e-05, + "loss": 0.8981, + "step": 1359 + }, + { + "epoch": 0.22, + "grad_norm": 1.5368894237284176, + "learning_rate": 1.8183612478009328e-05, + "loss": 0.9369, + "step": 1360 + }, + { + "epoch": 0.22, + "grad_norm": 0.9326727181524269, + "learning_rate": 1.8180611718371238e-05, + "loss": 0.3465, + "step": 1361 + }, + { + "epoch": 0.22, + "grad_norm": 1.4762773774636195, + "learning_rate": 1.8177608730109023e-05, + "loss": 0.9596, + "step": 1362 + }, + { + "epoch": 0.22, + "grad_norm": 1.371631321983861, + "learning_rate": 1.8174603514040767e-05, + "loss": 0.9181, + "step": 1363 + }, + { + "epoch": 0.22, + "grad_norm": 1.3013884012347359, + "learning_rate": 1.817159607098518e-05, + "loss": 0.9082, + "step": 1364 + }, + { + "epoch": 0.22, + "grad_norm": 1.6731856603192323, + "learning_rate": 1.8168586401761573e-05, + "loss": 0.9699, + "step": 1365 + }, + { + "epoch": 0.22, + "grad_norm": 1.391615561030067, + "learning_rate": 1.8165574507189864e-05, + "loss": 0.8855, + "step": 1366 + }, + { + "epoch": 0.22, + "grad_norm": 1.3874372256395486, + "learning_rate": 1.8162560388090573e-05, + "loss": 0.8623, + "step": 1367 + }, + { + "epoch": 0.22, + "grad_norm": 1.5773904585193066, + "learning_rate": 1.8159544045284826e-05, + "loss": 1.0286, + "step": 1368 + }, + { + "epoch": 0.22, + "grad_norm": 1.2803037197567553, + "learning_rate": 1.8156525479594368e-05, + "loss": 0.8752, + "step": 1369 + }, + { + "epoch": 0.22, + "grad_norm": 1.3160383657747867, + "learning_rate": 1.8153504691841528e-05, + "loss": 0.9005, + "step": 1370 + }, + { + "epoch": 0.22, + "grad_norm": 1.2903427164433185, + "learning_rate": 1.8150481682849258e-05, + "loss": 0.8687, + "step": 1371 + }, + { + "epoch": 0.22, + "grad_norm": 1.673302462193393, + "learning_rate": 1.814745645344111e-05, + "loss": 0.9552, + "step": 1372 + }, + { + "epoch": 0.22, + "grad_norm": 1.4645751369878004, + "learning_rate": 1.814442900444124e-05, + "loss": 0.9533, + "step": 1373 + }, + { + "epoch": 0.22, + "grad_norm": 1.305852153899607, + "learning_rate": 1.8141399336674403e-05, + "loss": 0.8746, + "step": 1374 + }, + { + "epoch": 0.22, + "grad_norm": 1.6297915154835378, + "learning_rate": 1.8138367450965968e-05, + "loss": 0.9031, + "step": 1375 + }, + { + "epoch": 0.22, + "grad_norm": 1.333331367853201, + "learning_rate": 1.8135333348141904e-05, + "loss": 0.8936, + "step": 1376 + }, + { + "epoch": 0.22, + "grad_norm": 1.3548416039044349, + "learning_rate": 1.813229702902879e-05, + "loss": 0.9389, + "step": 1377 + }, + { + "epoch": 0.22, + "grad_norm": 1.4322877165711003, + "learning_rate": 1.812925849445379e-05, + "loss": 0.9287, + "step": 1378 + }, + { + "epoch": 0.22, + "grad_norm": 1.5219319441841863, + "learning_rate": 1.8126217745244696e-05, + "loss": 0.9363, + "step": 1379 + }, + { + "epoch": 0.22, + "grad_norm": 1.2925971666289797, + "learning_rate": 1.8123174782229884e-05, + "loss": 0.992, + "step": 1380 + }, + { + "epoch": 0.22, + "grad_norm": 1.6749370789541775, + "learning_rate": 1.8120129606238345e-05, + "loss": 0.9655, + "step": 1381 + }, + { + "epoch": 0.22, + "grad_norm": 1.3194344241684788, + "learning_rate": 1.811708221809967e-05, + "loss": 0.9033, + "step": 1382 + }, + { + "epoch": 0.22, + "grad_norm": 1.1825220465879462, + "learning_rate": 1.8114032618644053e-05, + "loss": 1.007, + "step": 1383 + }, + { + "epoch": 0.22, + "grad_norm": 1.5392617091277325, + "learning_rate": 1.8110980808702282e-05, + "loss": 0.923, + "step": 1384 + }, + { + "epoch": 0.22, + "grad_norm": 1.8354536494358633, + "learning_rate": 1.810792678910576e-05, + "loss": 0.9855, + "step": 1385 + }, + { + "epoch": 0.22, + "grad_norm": 1.5192886406475323, + "learning_rate": 1.810487056068648e-05, + "loss": 0.9802, + "step": 1386 + }, + { + "epoch": 0.22, + "grad_norm": 1.2407133105067525, + "learning_rate": 1.8101812124277053e-05, + "loss": 0.9633, + "step": 1387 + }, + { + "epoch": 0.22, + "grad_norm": 1.2591648872515129, + "learning_rate": 1.8098751480710675e-05, + "loss": 0.9289, + "step": 1388 + }, + { + "epoch": 0.22, + "grad_norm": 1.2967450187360643, + "learning_rate": 1.8095688630821147e-05, + "loss": 0.9089, + "step": 1389 + }, + { + "epoch": 0.22, + "grad_norm": 1.7139866105338972, + "learning_rate": 1.809262357544288e-05, + "loss": 0.9372, + "step": 1390 + }, + { + "epoch": 0.22, + "grad_norm": 1.578663188925344, + "learning_rate": 1.8089556315410875e-05, + "loss": 1.001, + "step": 1391 + }, + { + "epoch": 0.22, + "grad_norm": 1.5414224895548923, + "learning_rate": 1.808648685156074e-05, + "loss": 0.8921, + "step": 1392 + }, + { + "epoch": 0.22, + "grad_norm": 1.2827503437579575, + "learning_rate": 1.808341518472868e-05, + "loss": 0.8932, + "step": 1393 + }, + { + "epoch": 0.22, + "grad_norm": 1.406253161691714, + "learning_rate": 1.8080341315751507e-05, + "loss": 0.9096, + "step": 1394 + }, + { + "epoch": 0.22, + "grad_norm": 1.4180396393324481, + "learning_rate": 1.8077265245466623e-05, + "loss": 0.8994, + "step": 1395 + }, + { + "epoch": 0.22, + "grad_norm": 1.5591949137685248, + "learning_rate": 1.8074186974712033e-05, + "loss": 1.0144, + "step": 1396 + }, + { + "epoch": 0.23, + "grad_norm": 1.5073974608262897, + "learning_rate": 1.8071106504326344e-05, + "loss": 0.9537, + "step": 1397 + }, + { + "epoch": 0.23, + "grad_norm": 1.320117331503544, + "learning_rate": 1.8068023835148763e-05, + "loss": 0.9297, + "step": 1398 + }, + { + "epoch": 0.23, + "grad_norm": 1.4934937816007674, + "learning_rate": 1.8064938968019096e-05, + "loss": 0.9413, + "step": 1399 + }, + { + "epoch": 0.23, + "grad_norm": 1.3676624808163538, + "learning_rate": 1.8061851903777738e-05, + "loss": 0.9321, + "step": 1400 + }, + { + "epoch": 0.23, + "grad_norm": 1.3483786664595354, + "learning_rate": 1.8058762643265697e-05, + "loss": 0.9168, + "step": 1401 + }, + { + "epoch": 0.23, + "grad_norm": 1.3761018353542291, + "learning_rate": 1.805567118732457e-05, + "loss": 0.8949, + "step": 1402 + }, + { + "epoch": 0.23, + "grad_norm": 1.4594460539500764, + "learning_rate": 1.8052577536796557e-05, + "loss": 0.9182, + "step": 1403 + }, + { + "epoch": 0.23, + "grad_norm": 1.2984743489764452, + "learning_rate": 1.804948169252445e-05, + "loss": 0.844, + "step": 1404 + }, + { + "epoch": 0.23, + "grad_norm": 1.5494533536000397, + "learning_rate": 1.804638365535165e-05, + "loss": 0.9278, + "step": 1405 + }, + { + "epoch": 0.23, + "grad_norm": 1.4022473724527305, + "learning_rate": 1.8043283426122137e-05, + "loss": 0.9191, + "step": 1406 + }, + { + "epoch": 0.23, + "grad_norm": 1.3148604664502062, + "learning_rate": 1.8040181005680506e-05, + "loss": 0.9259, + "step": 1407 + }, + { + "epoch": 0.23, + "grad_norm": 1.4626493958565367, + "learning_rate": 1.803707639487194e-05, + "loss": 0.9294, + "step": 1408 + }, + { + "epoch": 0.23, + "grad_norm": 1.406032283141865, + "learning_rate": 1.8033969594542223e-05, + "loss": 0.9313, + "step": 1409 + }, + { + "epoch": 0.23, + "grad_norm": 1.477756492755267, + "learning_rate": 1.803086060553773e-05, + "loss": 0.8774, + "step": 1410 + }, + { + "epoch": 0.23, + "grad_norm": 1.4543686743755668, + "learning_rate": 1.8027749428705432e-05, + "loss": 1.002, + "step": 1411 + }, + { + "epoch": 0.23, + "grad_norm": 1.2492416700457738, + "learning_rate": 1.8024636064892907e-05, + "loss": 0.8866, + "step": 1412 + }, + { + "epoch": 0.23, + "grad_norm": 1.2880484233538472, + "learning_rate": 1.8021520514948315e-05, + "loss": 0.8685, + "step": 1413 + }, + { + "epoch": 0.23, + "grad_norm": 1.506585854441499, + "learning_rate": 1.8018402779720424e-05, + "loss": 0.9914, + "step": 1414 + }, + { + "epoch": 0.23, + "grad_norm": 1.6478668966571188, + "learning_rate": 1.8015282860058584e-05, + "loss": 0.9536, + "step": 1415 + }, + { + "epoch": 0.23, + "grad_norm": 1.4633846760159714, + "learning_rate": 1.8012160756812755e-05, + "loss": 0.8998, + "step": 1416 + }, + { + "epoch": 0.23, + "grad_norm": 1.4804786427652983, + "learning_rate": 1.8009036470833474e-05, + "loss": 0.9834, + "step": 1417 + }, + { + "epoch": 0.23, + "grad_norm": 1.38792622743611, + "learning_rate": 1.800591000297189e-05, + "loss": 0.8847, + "step": 1418 + }, + { + "epoch": 0.23, + "grad_norm": 1.3723432371266042, + "learning_rate": 1.8002781354079736e-05, + "loss": 0.9696, + "step": 1419 + }, + { + "epoch": 0.23, + "grad_norm": 1.5180262756319356, + "learning_rate": 1.7999650525009343e-05, + "loss": 0.9637, + "step": 1420 + }, + { + "epoch": 0.23, + "grad_norm": 1.3724392798698994, + "learning_rate": 1.7996517516613637e-05, + "loss": 0.8766, + "step": 1421 + }, + { + "epoch": 0.23, + "grad_norm": 1.3903772148104654, + "learning_rate": 1.7993382329746135e-05, + "loss": 0.9759, + "step": 1422 + }, + { + "epoch": 0.23, + "grad_norm": 1.520317534911786, + "learning_rate": 1.7990244965260945e-05, + "loss": 0.9438, + "step": 1423 + }, + { + "epoch": 0.23, + "grad_norm": 1.4104427432317215, + "learning_rate": 1.7987105424012777e-05, + "loss": 0.9377, + "step": 1424 + }, + { + "epoch": 0.23, + "grad_norm": 1.4114118946859544, + "learning_rate": 1.798396370685692e-05, + "loss": 0.8924, + "step": 1425 + }, + { + "epoch": 0.23, + "grad_norm": 1.6551549976026878, + "learning_rate": 1.7980819814649276e-05, + "loss": 0.8985, + "step": 1426 + }, + { + "epoch": 0.23, + "grad_norm": 1.3457685338416856, + "learning_rate": 1.797767374824632e-05, + "loss": 0.9271, + "step": 1427 + }, + { + "epoch": 0.23, + "grad_norm": 1.4759906886496073, + "learning_rate": 1.797452550850513e-05, + "loss": 0.9423, + "step": 1428 + }, + { + "epoch": 0.23, + "grad_norm": 1.6250092054528142, + "learning_rate": 1.797137509628337e-05, + "loss": 0.964, + "step": 1429 + }, + { + "epoch": 0.23, + "grad_norm": 1.4859354525571047, + "learning_rate": 1.7968222512439302e-05, + "loss": 0.9189, + "step": 1430 + }, + { + "epoch": 0.23, + "grad_norm": 1.377310656400429, + "learning_rate": 1.7965067757831777e-05, + "loss": 0.8937, + "step": 1431 + }, + { + "epoch": 0.23, + "grad_norm": 1.3970063281142926, + "learning_rate": 1.796191083332024e-05, + "loss": 0.9112, + "step": 1432 + }, + { + "epoch": 0.23, + "grad_norm": 1.3131897864979747, + "learning_rate": 1.7958751739764716e-05, + "loss": 0.8635, + "step": 1433 + }, + { + "epoch": 0.23, + "grad_norm": 1.5551914256220765, + "learning_rate": 1.7955590478025833e-05, + "loss": 0.9073, + "step": 1434 + }, + { + "epoch": 0.23, + "grad_norm": 1.3823252940333748, + "learning_rate": 1.7952427048964808e-05, + "loss": 0.9434, + "step": 1435 + }, + { + "epoch": 0.23, + "grad_norm": 1.5950864372965423, + "learning_rate": 1.7949261453443446e-05, + "loss": 1.0352, + "step": 1436 + }, + { + "epoch": 0.23, + "grad_norm": 1.3557532414566502, + "learning_rate": 1.794609369232414e-05, + "loss": 0.9306, + "step": 1437 + }, + { + "epoch": 0.23, + "grad_norm": 1.2719863048853584, + "learning_rate": 1.794292376646988e-05, + "loss": 0.9049, + "step": 1438 + }, + { + "epoch": 0.23, + "grad_norm": 1.5215359331388483, + "learning_rate": 1.7939751676744234e-05, + "loss": 0.9462, + "step": 1439 + }, + { + "epoch": 0.23, + "grad_norm": 1.4729257154951143, + "learning_rate": 1.7936577424011375e-05, + "loss": 0.902, + "step": 1440 + }, + { + "epoch": 0.23, + "grad_norm": 1.3584938619130646, + "learning_rate": 1.793340100913605e-05, + "loss": 0.9406, + "step": 1441 + }, + { + "epoch": 0.23, + "grad_norm": 1.5936746941801514, + "learning_rate": 1.7930222432983605e-05, + "loss": 0.919, + "step": 1442 + }, + { + "epoch": 0.23, + "grad_norm": 1.5405521711232046, + "learning_rate": 1.792704169641997e-05, + "loss": 0.859, + "step": 1443 + }, + { + "epoch": 0.23, + "grad_norm": 1.4349608230642699, + "learning_rate": 1.7923858800311665e-05, + "loss": 0.9123, + "step": 1444 + }, + { + "epoch": 0.23, + "grad_norm": 1.6949011467323114, + "learning_rate": 1.7920673745525805e-05, + "loss": 0.9873, + "step": 1445 + }, + { + "epoch": 0.23, + "grad_norm": 1.3507833943314131, + "learning_rate": 1.7917486532930082e-05, + "loss": 0.9045, + "step": 1446 + }, + { + "epoch": 0.23, + "grad_norm": 1.4297559349169837, + "learning_rate": 1.7914297163392778e-05, + "loss": 0.8762, + "step": 1447 + }, + { + "epoch": 0.23, + "grad_norm": 1.3909769205628935, + "learning_rate": 1.7911105637782767e-05, + "loss": 0.9199, + "step": 1448 + }, + { + "epoch": 0.23, + "grad_norm": 1.4665736747352947, + "learning_rate": 1.790791195696951e-05, + "loss": 0.9659, + "step": 1449 + }, + { + "epoch": 0.23, + "grad_norm": 1.8178568375054327, + "learning_rate": 1.790471612182306e-05, + "loss": 0.8868, + "step": 1450 + }, + { + "epoch": 0.23, + "grad_norm": 1.3791811401591183, + "learning_rate": 1.7901518133214034e-05, + "loss": 0.9306, + "step": 1451 + }, + { + "epoch": 0.23, + "grad_norm": 1.2551781384772345, + "learning_rate": 1.789831799201367e-05, + "loss": 0.956, + "step": 1452 + }, + { + "epoch": 0.23, + "grad_norm": 1.5692239067382618, + "learning_rate": 1.789511569909376e-05, + "loss": 0.9627, + "step": 1453 + }, + { + "epoch": 0.23, + "grad_norm": 1.4792992135526286, + "learning_rate": 1.7891911255326705e-05, + "loss": 0.844, + "step": 1454 + }, + { + "epoch": 0.23, + "grad_norm": 1.609924411278348, + "learning_rate": 1.7888704661585483e-05, + "loss": 0.8855, + "step": 1455 + }, + { + "epoch": 0.23, + "grad_norm": 1.4508613431306197, + "learning_rate": 1.7885495918743658e-05, + "loss": 0.9068, + "step": 1456 + }, + { + "epoch": 0.23, + "grad_norm": 1.4527909988731622, + "learning_rate": 1.7882285027675377e-05, + "loss": 0.8861, + "step": 1457 + }, + { + "epoch": 0.23, + "grad_norm": 1.516929346022876, + "learning_rate": 1.7879071989255383e-05, + "loss": 0.9063, + "step": 1458 + }, + { + "epoch": 0.24, + "grad_norm": 2.0928756601677976, + "learning_rate": 1.787585680435899e-05, + "loss": 1.0475, + "step": 1459 + }, + { + "epoch": 0.24, + "grad_norm": 1.7781670277544546, + "learning_rate": 1.7872639473862102e-05, + "loss": 0.8542, + "step": 1460 + }, + { + "epoch": 0.24, + "grad_norm": 1.3886237598888043, + "learning_rate": 1.786941999864121e-05, + "loss": 0.9471, + "step": 1461 + }, + { + "epoch": 0.24, + "grad_norm": 1.2231879327501474, + "learning_rate": 1.7866198379573393e-05, + "loss": 0.9136, + "step": 1462 + }, + { + "epoch": 0.24, + "grad_norm": 1.524376787491173, + "learning_rate": 1.78629746175363e-05, + "loss": 0.9295, + "step": 1463 + }, + { + "epoch": 0.24, + "grad_norm": 1.664827037970853, + "learning_rate": 1.785974871340818e-05, + "loss": 0.914, + "step": 1464 + }, + { + "epoch": 0.24, + "grad_norm": 1.3099370111612605, + "learning_rate": 1.7856520668067854e-05, + "loss": 0.92, + "step": 1465 + }, + { + "epoch": 0.24, + "grad_norm": 1.2264179456625537, + "learning_rate": 1.785329048239473e-05, + "loss": 0.9225, + "step": 1466 + }, + { + "epoch": 0.24, + "grad_norm": 1.457765702204839, + "learning_rate": 1.7850058157268803e-05, + "loss": 0.9343, + "step": 1467 + }, + { + "epoch": 0.24, + "grad_norm": 1.7026262898367779, + "learning_rate": 1.7846823693570645e-05, + "loss": 0.9294, + "step": 1468 + }, + { + "epoch": 0.24, + "grad_norm": 1.5711821904416883, + "learning_rate": 1.7843587092181417e-05, + "loss": 0.9419, + "step": 1469 + }, + { + "epoch": 0.24, + "grad_norm": 1.7589592954528017, + "learning_rate": 1.7840348353982852e-05, + "loss": 0.9577, + "step": 1470 + }, + { + "epoch": 0.24, + "grad_norm": 1.5320146437216193, + "learning_rate": 1.7837107479857275e-05, + "loss": 0.9488, + "step": 1471 + }, + { + "epoch": 0.24, + "grad_norm": 1.7398896491356037, + "learning_rate": 1.7833864470687593e-05, + "loss": 0.8914, + "step": 1472 + }, + { + "epoch": 0.24, + "grad_norm": 1.8224640300758423, + "learning_rate": 1.7830619327357282e-05, + "loss": 0.9223, + "step": 1473 + }, + { + "epoch": 0.24, + "grad_norm": 0.8512694112557286, + "learning_rate": 1.7827372050750414e-05, + "loss": 0.3502, + "step": 1474 + }, + { + "epoch": 0.24, + "grad_norm": 1.3786712015918483, + "learning_rate": 1.782412264175164e-05, + "loss": 0.9233, + "step": 1475 + }, + { + "epoch": 0.24, + "grad_norm": 1.6252409817637843, + "learning_rate": 1.7820871101246185e-05, + "loss": 0.9699, + "step": 1476 + }, + { + "epoch": 0.24, + "grad_norm": 1.3838162513732415, + "learning_rate": 1.7817617430119858e-05, + "loss": 0.9232, + "step": 1477 + }, + { + "epoch": 0.24, + "grad_norm": 1.3881408927702015, + "learning_rate": 1.7814361629259052e-05, + "loss": 0.8212, + "step": 1478 + }, + { + "epoch": 0.24, + "grad_norm": 1.696564368734246, + "learning_rate": 1.781110369955073e-05, + "loss": 0.8954, + "step": 1479 + }, + { + "epoch": 0.24, + "grad_norm": 1.588319268970133, + "learning_rate": 1.780784364188245e-05, + "loss": 0.9834, + "step": 1480 + }, + { + "epoch": 0.24, + "grad_norm": 1.776178921479375, + "learning_rate": 1.780458145714233e-05, + "loss": 0.8975, + "step": 1481 + }, + { + "epoch": 0.24, + "grad_norm": 1.4175030019600514, + "learning_rate": 1.780131714621909e-05, + "loss": 0.884, + "step": 1482 + }, + { + "epoch": 0.24, + "grad_norm": 1.3667280586609718, + "learning_rate": 1.779805071000202e-05, + "loss": 0.8617, + "step": 1483 + }, + { + "epoch": 0.24, + "grad_norm": 1.4043583107085242, + "learning_rate": 1.7794782149380977e-05, + "loss": 0.9746, + "step": 1484 + }, + { + "epoch": 0.24, + "grad_norm": 1.2685824560777859, + "learning_rate": 1.7791511465246413e-05, + "loss": 0.9546, + "step": 1485 + }, + { + "epoch": 0.24, + "grad_norm": 1.243090851419248, + "learning_rate": 1.778823865848935e-05, + "loss": 0.9619, + "step": 1486 + }, + { + "epoch": 0.24, + "grad_norm": 1.3205551038843497, + "learning_rate": 1.7784963730001395e-05, + "loss": 0.8851, + "step": 1487 + }, + { + "epoch": 0.24, + "grad_norm": 1.4629639230300375, + "learning_rate": 1.7781686680674726e-05, + "loss": 0.9435, + "step": 1488 + }, + { + "epoch": 0.24, + "grad_norm": 1.5142142437793134, + "learning_rate": 1.77784075114021e-05, + "loss": 0.8821, + "step": 1489 + }, + { + "epoch": 0.24, + "grad_norm": 1.5504916747098654, + "learning_rate": 1.7775126223076857e-05, + "loss": 0.925, + "step": 1490 + }, + { + "epoch": 0.24, + "grad_norm": 1.7386160142418696, + "learning_rate": 1.7771842816592907e-05, + "loss": 0.952, + "step": 1491 + }, + { + "epoch": 0.24, + "grad_norm": 1.5010075854469, + "learning_rate": 1.776855729284474e-05, + "loss": 0.9483, + "step": 1492 + }, + { + "epoch": 0.24, + "grad_norm": 1.322186607129727, + "learning_rate": 1.7765269652727427e-05, + "loss": 0.878, + "step": 1493 + }, + { + "epoch": 0.24, + "grad_norm": 1.3781340249516472, + "learning_rate": 1.7761979897136606e-05, + "loss": 0.9551, + "step": 1494 + }, + { + "epoch": 0.24, + "grad_norm": 1.3560533939197716, + "learning_rate": 1.7758688026968506e-05, + "loss": 0.8735, + "step": 1495 + }, + { + "epoch": 0.24, + "grad_norm": 1.3132814478689676, + "learning_rate": 1.7755394043119916e-05, + "loss": 0.9825, + "step": 1496 + }, + { + "epoch": 0.24, + "grad_norm": 1.3836426559886001, + "learning_rate": 1.775209794648821e-05, + "loss": 0.9209, + "step": 1497 + }, + { + "epoch": 0.24, + "grad_norm": 1.6041134441255716, + "learning_rate": 1.7748799737971335e-05, + "loss": 0.831, + "step": 1498 + }, + { + "epoch": 0.24, + "grad_norm": 1.3286040869005302, + "learning_rate": 1.7745499418467814e-05, + "loss": 0.9287, + "step": 1499 + }, + { + "epoch": 0.24, + "grad_norm": 1.6941438648508664, + "learning_rate": 1.774219698887675e-05, + "loss": 0.9639, + "step": 1500 + }, + { + "epoch": 0.24, + "grad_norm": 1.3842512965905442, + "learning_rate": 1.7738892450097806e-05, + "loss": 0.9151, + "step": 1501 + }, + { + "epoch": 0.24, + "grad_norm": 1.5450894256477892, + "learning_rate": 1.7735585803031235e-05, + "loss": 0.9361, + "step": 1502 + }, + { + "epoch": 0.24, + "grad_norm": 1.4557121249343312, + "learning_rate": 1.7732277048577858e-05, + "loss": 0.9788, + "step": 1503 + }, + { + "epoch": 0.24, + "grad_norm": 1.4652782279042018, + "learning_rate": 1.7728966187639075e-05, + "loss": 0.9639, + "step": 1504 + }, + { + "epoch": 0.24, + "grad_norm": 1.5597329428143736, + "learning_rate": 1.7725653221116853e-05, + "loss": 0.9454, + "step": 1505 + }, + { + "epoch": 0.24, + "grad_norm": 1.5876108459151903, + "learning_rate": 1.772233814991373e-05, + "loss": 0.8667, + "step": 1506 + }, + { + "epoch": 0.24, + "grad_norm": 1.403979079799209, + "learning_rate": 1.771902097493283e-05, + "loss": 1.0105, + "step": 1507 + }, + { + "epoch": 0.24, + "grad_norm": 1.34899227080926, + "learning_rate": 1.771570169707784e-05, + "loss": 0.9293, + "step": 1508 + }, + { + "epoch": 0.24, + "grad_norm": 1.3286380211749882, + "learning_rate": 1.7712380317253025e-05, + "loss": 0.9212, + "step": 1509 + }, + { + "epoch": 0.24, + "grad_norm": 1.5016283073453969, + "learning_rate": 1.7709056836363218e-05, + "loss": 0.9574, + "step": 1510 + }, + { + "epoch": 0.24, + "grad_norm": 1.3886239999923242, + "learning_rate": 1.7705731255313823e-05, + "loss": 0.8998, + "step": 1511 + }, + { + "epoch": 0.24, + "grad_norm": 1.4753339200059699, + "learning_rate": 1.770240357501083e-05, + "loss": 0.9191, + "step": 1512 + }, + { + "epoch": 0.24, + "grad_norm": 1.3819384385229503, + "learning_rate": 1.769907379636078e-05, + "loss": 0.9306, + "step": 1513 + }, + { + "epoch": 0.24, + "grad_norm": 1.2355884445510876, + "learning_rate": 1.7695741920270807e-05, + "loss": 0.8063, + "step": 1514 + }, + { + "epoch": 0.24, + "grad_norm": 1.2513299743575326, + "learning_rate": 1.76924079476486e-05, + "loss": 0.9152, + "step": 1515 + }, + { + "epoch": 0.24, + "grad_norm": 1.3067684952074006, + "learning_rate": 1.768907187940242e-05, + "loss": 0.9547, + "step": 1516 + }, + { + "epoch": 0.24, + "grad_norm": 1.303801981286533, + "learning_rate": 1.7685733716441117e-05, + "loss": 0.9201, + "step": 1517 + }, + { + "epoch": 0.24, + "grad_norm": 1.6604463368708118, + "learning_rate": 1.7682393459674087e-05, + "loss": 0.8578, + "step": 1518 + }, + { + "epoch": 0.24, + "grad_norm": 1.3768999718491584, + "learning_rate": 1.7679051110011312e-05, + "loss": 0.9712, + "step": 1519 + }, + { + "epoch": 0.24, + "grad_norm": 1.5061142318623844, + "learning_rate": 1.7675706668363338e-05, + "loss": 1.0097, + "step": 1520 + }, + { + "epoch": 0.25, + "grad_norm": 1.3707112868406561, + "learning_rate": 1.7672360135641292e-05, + "loss": 0.8881, + "step": 1521 + }, + { + "epoch": 0.25, + "grad_norm": 1.6107150275469724, + "learning_rate": 1.766901151275685e-05, + "loss": 0.9494, + "step": 1522 + }, + { + "epoch": 0.25, + "grad_norm": 1.6488736460415707, + "learning_rate": 1.7665660800622274e-05, + "loss": 0.9439, + "step": 1523 + }, + { + "epoch": 0.25, + "grad_norm": 1.3736304139249622, + "learning_rate": 1.7662308000150393e-05, + "loss": 0.9085, + "step": 1524 + }, + { + "epoch": 0.25, + "grad_norm": 1.998250365499198, + "learning_rate": 1.7658953112254603e-05, + "loss": 0.9154, + "step": 1525 + }, + { + "epoch": 0.25, + "grad_norm": 1.3879826360340926, + "learning_rate": 1.765559613784886e-05, + "loss": 0.8934, + "step": 1526 + }, + { + "epoch": 0.25, + "grad_norm": 1.4338861006249786, + "learning_rate": 1.7652237077847703e-05, + "loss": 0.874, + "step": 1527 + }, + { + "epoch": 0.25, + "grad_norm": 1.265560115369577, + "learning_rate": 1.7648875933166228e-05, + "loss": 0.9352, + "step": 1528 + }, + { + "epoch": 0.25, + "grad_norm": 1.3934951587732518, + "learning_rate": 1.764551270472011e-05, + "loss": 0.9258, + "step": 1529 + }, + { + "epoch": 0.25, + "grad_norm": 1.4487740189011102, + "learning_rate": 1.764214739342558e-05, + "loss": 0.9284, + "step": 1530 + }, + { + "epoch": 0.25, + "grad_norm": 1.3826910598011186, + "learning_rate": 1.7638780000199446e-05, + "loss": 0.9548, + "step": 1531 + }, + { + "epoch": 0.25, + "grad_norm": 1.5046801446079305, + "learning_rate": 1.7635410525959072e-05, + "loss": 0.8838, + "step": 1532 + }, + { + "epoch": 0.25, + "grad_norm": 1.3354668527167677, + "learning_rate": 1.76320389716224e-05, + "loss": 0.8897, + "step": 1533 + }, + { + "epoch": 0.25, + "grad_norm": 1.5934494801093575, + "learning_rate": 1.7628665338107936e-05, + "loss": 0.8925, + "step": 1534 + }, + { + "epoch": 0.25, + "grad_norm": 1.626466152725449, + "learning_rate": 1.7625289626334744e-05, + "loss": 0.913, + "step": 1535 + }, + { + "epoch": 0.25, + "grad_norm": 1.3699017271698222, + "learning_rate": 1.762191183722247e-05, + "loss": 0.8034, + "step": 1536 + }, + { + "epoch": 0.25, + "grad_norm": 1.566841406321332, + "learning_rate": 1.761853197169131e-05, + "loss": 0.8913, + "step": 1537 + }, + { + "epoch": 0.25, + "grad_norm": 0.822387670398342, + "learning_rate": 1.7615150030662037e-05, + "loss": 0.3522, + "step": 1538 + }, + { + "epoch": 0.25, + "grad_norm": 1.5330349052378704, + "learning_rate": 1.7611766015055984e-05, + "loss": 0.9569, + "step": 1539 + }, + { + "epoch": 0.25, + "grad_norm": 1.6057219439079193, + "learning_rate": 1.7608379925795043e-05, + "loss": 0.9539, + "step": 1540 + }, + { + "epoch": 0.25, + "grad_norm": 1.51380736575235, + "learning_rate": 1.7604991763801688e-05, + "loss": 0.9078, + "step": 1541 + }, + { + "epoch": 0.25, + "grad_norm": 1.2881288145331344, + "learning_rate": 1.760160152999894e-05, + "loss": 0.8744, + "step": 1542 + }, + { + "epoch": 0.25, + "grad_norm": 1.4834039843708575, + "learning_rate": 1.75982092253104e-05, + "loss": 0.8857, + "step": 1543 + }, + { + "epoch": 0.25, + "grad_norm": 1.4069587395081684, + "learning_rate": 1.759481485066022e-05, + "loss": 0.9222, + "step": 1544 + }, + { + "epoch": 0.25, + "grad_norm": 1.4591545008345639, + "learning_rate": 1.7591418406973126e-05, + "loss": 0.9346, + "step": 1545 + }, + { + "epoch": 0.25, + "grad_norm": 1.4680304717831996, + "learning_rate": 1.7588019895174394e-05, + "loss": 0.8929, + "step": 1546 + }, + { + "epoch": 0.25, + "grad_norm": 1.5366527807449342, + "learning_rate": 1.758461931618988e-05, + "loss": 0.9313, + "step": 1547 + }, + { + "epoch": 0.25, + "grad_norm": 1.537533306872229, + "learning_rate": 1.758121667094599e-05, + "loss": 0.9283, + "step": 1548 + }, + { + "epoch": 0.25, + "grad_norm": 1.6314236367191264, + "learning_rate": 1.7577811960369703e-05, + "loss": 0.9175, + "step": 1549 + }, + { + "epoch": 0.25, + "grad_norm": 1.2702649948843467, + "learning_rate": 1.7574405185388556e-05, + "loss": 0.8699, + "step": 1550 + }, + { + "epoch": 0.25, + "grad_norm": 0.7877521658956605, + "learning_rate": 1.7570996346930645e-05, + "loss": 0.334, + "step": 1551 + }, + { + "epoch": 0.25, + "grad_norm": 1.6333969863361506, + "learning_rate": 1.7567585445924632e-05, + "loss": 0.9768, + "step": 1552 + }, + { + "epoch": 0.25, + "grad_norm": 1.6370386810166844, + "learning_rate": 1.7564172483299748e-05, + "loss": 0.8915, + "step": 1553 + }, + { + "epoch": 0.25, + "grad_norm": 1.3711045983420596, + "learning_rate": 1.7560757459985767e-05, + "loss": 0.9273, + "step": 1554 + }, + { + "epoch": 0.25, + "grad_norm": 1.485050782919194, + "learning_rate": 1.7557340376913038e-05, + "loss": 0.8857, + "step": 1555 + }, + { + "epoch": 0.25, + "grad_norm": 0.8168810212271668, + "learning_rate": 1.7553921235012475e-05, + "loss": 0.3767, + "step": 1556 + }, + { + "epoch": 0.25, + "grad_norm": 1.616051608411855, + "learning_rate": 1.755050003521554e-05, + "loss": 0.8764, + "step": 1557 + }, + { + "epoch": 0.25, + "grad_norm": 1.3313738771877215, + "learning_rate": 1.7547076778454268e-05, + "loss": 0.9767, + "step": 1558 + }, + { + "epoch": 0.25, + "grad_norm": 1.3897301433064186, + "learning_rate": 1.7543651465661244e-05, + "loss": 0.9189, + "step": 1559 + }, + { + "epoch": 0.25, + "grad_norm": 1.4656128619022848, + "learning_rate": 1.754022409776962e-05, + "loss": 0.8615, + "step": 1560 + }, + { + "epoch": 0.25, + "grad_norm": 1.2790980766983986, + "learning_rate": 1.75367946757131e-05, + "loss": 0.8541, + "step": 1561 + }, + { + "epoch": 0.25, + "grad_norm": 1.6955185866016322, + "learning_rate": 1.7533363200425962e-05, + "loss": 0.925, + "step": 1562 + }, + { + "epoch": 0.25, + "grad_norm": 1.476534364447397, + "learning_rate": 1.752992967284303e-05, + "loss": 0.9457, + "step": 1563 + }, + { + "epoch": 0.25, + "grad_norm": 1.235884520191518, + "learning_rate": 1.7526494093899687e-05, + "loss": 0.8745, + "step": 1564 + }, + { + "epoch": 0.25, + "grad_norm": 1.4026909693198406, + "learning_rate": 1.7523056464531887e-05, + "loss": 0.8942, + "step": 1565 + }, + { + "epoch": 0.25, + "grad_norm": 1.3107086400169938, + "learning_rate": 1.751961678567613e-05, + "loss": 0.8997, + "step": 1566 + }, + { + "epoch": 0.25, + "grad_norm": 1.3116070336565369, + "learning_rate": 1.751617505826948e-05, + "loss": 0.8633, + "step": 1567 + }, + { + "epoch": 0.25, + "grad_norm": 1.2509419085030418, + "learning_rate": 1.7512731283249563e-05, + "loss": 0.8663, + "step": 1568 + }, + { + "epoch": 0.25, + "grad_norm": 1.3406504796458962, + "learning_rate": 1.7509285461554552e-05, + "loss": 0.9598, + "step": 1569 + }, + { + "epoch": 0.25, + "grad_norm": 1.4142874629654651, + "learning_rate": 1.7505837594123186e-05, + "loss": 0.8582, + "step": 1570 + }, + { + "epoch": 0.25, + "grad_norm": 1.6180474890259768, + "learning_rate": 1.7502387681894765e-05, + "loss": 0.9012, + "step": 1571 + }, + { + "epoch": 0.25, + "grad_norm": 1.2981787219717944, + "learning_rate": 1.749893572580913e-05, + "loss": 0.901, + "step": 1572 + }, + { + "epoch": 0.25, + "grad_norm": 1.6972041695095221, + "learning_rate": 1.7495481726806697e-05, + "loss": 0.9211, + "step": 1573 + }, + { + "epoch": 0.25, + "grad_norm": 1.4548929762334823, + "learning_rate": 1.7492025685828428e-05, + "loss": 0.8066, + "step": 1574 + }, + { + "epoch": 0.25, + "grad_norm": 1.2501479299750213, + "learning_rate": 1.748856760381584e-05, + "loss": 0.9318, + "step": 1575 + }, + { + "epoch": 0.25, + "grad_norm": 1.8617111814603586, + "learning_rate": 1.7485107481711014e-05, + "loss": 0.9229, + "step": 1576 + }, + { + "epoch": 0.25, + "grad_norm": 1.2702661954527332, + "learning_rate": 1.748164532045658e-05, + "loss": 0.9123, + "step": 1577 + }, + { + "epoch": 0.25, + "grad_norm": 1.3505731528250098, + "learning_rate": 1.747818112099573e-05, + "loss": 0.9757, + "step": 1578 + }, + { + "epoch": 0.25, + "grad_norm": 1.433443666418462, + "learning_rate": 1.7474714884272208e-05, + "loss": 0.8932, + "step": 1579 + }, + { + "epoch": 0.25, + "grad_norm": 1.351289736092245, + "learning_rate": 1.7471246611230307e-05, + "loss": 0.9298, + "step": 1580 + }, + { + "epoch": 0.25, + "grad_norm": 1.331558394963455, + "learning_rate": 1.7467776302814882e-05, + "loss": 0.9135, + "step": 1581 + }, + { + "epoch": 0.25, + "grad_norm": 1.271593672585788, + "learning_rate": 1.7464303959971343e-05, + "loss": 0.9564, + "step": 1582 + }, + { + "epoch": 0.26, + "grad_norm": 1.35125491881556, + "learning_rate": 1.746082958364565e-05, + "loss": 0.8665, + "step": 1583 + }, + { + "epoch": 0.26, + "grad_norm": 1.6404041289200506, + "learning_rate": 1.745735317478432e-05, + "loss": 0.86, + "step": 1584 + }, + { + "epoch": 0.26, + "grad_norm": 1.3675904075400056, + "learning_rate": 1.7453874734334422e-05, + "loss": 0.928, + "step": 1585 + }, + { + "epoch": 0.26, + "grad_norm": 1.3558293036200917, + "learning_rate": 1.745039426324358e-05, + "loss": 0.8523, + "step": 1586 + }, + { + "epoch": 0.26, + "grad_norm": 1.4274944868394677, + "learning_rate": 1.7446911762459968e-05, + "loss": 0.9072, + "step": 1587 + }, + { + "epoch": 0.26, + "grad_norm": 1.6804319883126988, + "learning_rate": 1.7443427232932316e-05, + "loss": 0.913, + "step": 1588 + }, + { + "epoch": 0.26, + "grad_norm": 1.467127302441046, + "learning_rate": 1.7439940675609906e-05, + "loss": 0.8291, + "step": 1589 + }, + { + "epoch": 0.26, + "grad_norm": 1.3284174232563115, + "learning_rate": 1.7436452091442574e-05, + "loss": 0.9267, + "step": 1590 + }, + { + "epoch": 0.26, + "grad_norm": 1.360130199282049, + "learning_rate": 1.7432961481380707e-05, + "loss": 0.9188, + "step": 1591 + }, + { + "epoch": 0.26, + "grad_norm": 1.5272952655013883, + "learning_rate": 1.742946884637524e-05, + "loss": 0.9278, + "step": 1592 + }, + { + "epoch": 0.26, + "grad_norm": 1.5232543040435773, + "learning_rate": 1.7425974187377665e-05, + "loss": 0.9895, + "step": 1593 + }, + { + "epoch": 0.26, + "grad_norm": 1.6917296798562969, + "learning_rate": 1.7422477505340024e-05, + "loss": 0.8411, + "step": 1594 + }, + { + "epoch": 0.26, + "grad_norm": 1.68951994592901, + "learning_rate": 1.7418978801214906e-05, + "loss": 0.9481, + "step": 1595 + }, + { + "epoch": 0.26, + "grad_norm": 1.3740161056431388, + "learning_rate": 1.741547807595546e-05, + "loss": 0.9881, + "step": 1596 + }, + { + "epoch": 0.26, + "grad_norm": 1.537902430531926, + "learning_rate": 1.7411975330515377e-05, + "loss": 0.9062, + "step": 1597 + }, + { + "epoch": 0.26, + "grad_norm": 1.2796198641016825, + "learning_rate": 1.7408470565848897e-05, + "loss": 0.8651, + "step": 1598 + }, + { + "epoch": 0.26, + "grad_norm": 1.4728954940453995, + "learning_rate": 1.7404963782910828e-05, + "loss": 0.9563, + "step": 1599 + }, + { + "epoch": 0.26, + "grad_norm": 1.42905022926618, + "learning_rate": 1.74014549826565e-05, + "loss": 0.8388, + "step": 1600 + }, + { + "epoch": 0.26, + "grad_norm": 1.528822518367215, + "learning_rate": 1.739794416604181e-05, + "loss": 0.9046, + "step": 1601 + }, + { + "epoch": 0.26, + "grad_norm": 0.9858454618629265, + "learning_rate": 1.739443133402321e-05, + "loss": 0.3389, + "step": 1602 + }, + { + "epoch": 0.26, + "grad_norm": 1.3971105378849025, + "learning_rate": 1.739091648755768e-05, + "loss": 0.897, + "step": 1603 + }, + { + "epoch": 0.26, + "grad_norm": 1.687847479723696, + "learning_rate": 1.7387399627602773e-05, + "loss": 0.9807, + "step": 1604 + }, + { + "epoch": 0.26, + "grad_norm": 1.2858939003245111, + "learning_rate": 1.7383880755116567e-05, + "loss": 0.9248, + "step": 1605 + }, + { + "epoch": 0.26, + "grad_norm": 1.4310881685870116, + "learning_rate": 1.7380359871057707e-05, + "loss": 0.9417, + "step": 1606 + }, + { + "epoch": 0.26, + "grad_norm": 1.3008869968709962, + "learning_rate": 1.737683697638538e-05, + "loss": 0.8745, + "step": 1607 + }, + { + "epoch": 0.26, + "grad_norm": 1.399665773944484, + "learning_rate": 1.7373312072059313e-05, + "loss": 0.9181, + "step": 1608 + }, + { + "epoch": 0.26, + "grad_norm": 1.4466663489248912, + "learning_rate": 1.7369785159039794e-05, + "loss": 0.8917, + "step": 1609 + }, + { + "epoch": 0.26, + "grad_norm": 1.3981887760023952, + "learning_rate": 1.7366256238287647e-05, + "loss": 0.8774, + "step": 1610 + }, + { + "epoch": 0.26, + "grad_norm": 1.2575035011838098, + "learning_rate": 1.736272531076425e-05, + "loss": 0.9025, + "step": 1611 + }, + { + "epoch": 0.26, + "grad_norm": 1.6762845343994273, + "learning_rate": 1.735919237743152e-05, + "loss": 1.0239, + "step": 1612 + }, + { + "epoch": 0.26, + "grad_norm": 1.3580818355800202, + "learning_rate": 1.7355657439251933e-05, + "loss": 0.9134, + "step": 1613 + }, + { + "epoch": 0.26, + "grad_norm": 1.5195426033400008, + "learning_rate": 1.7352120497188497e-05, + "loss": 0.8867, + "step": 1614 + }, + { + "epoch": 0.26, + "grad_norm": 1.6766742514404924, + "learning_rate": 1.7348581552204776e-05, + "loss": 0.9361, + "step": 1615 + }, + { + "epoch": 0.26, + "grad_norm": 1.597199248728031, + "learning_rate": 1.734504060526488e-05, + "loss": 0.897, + "step": 1616 + }, + { + "epoch": 0.26, + "grad_norm": 1.453109387985574, + "learning_rate": 1.734149765733345e-05, + "loss": 0.9055, + "step": 1617 + }, + { + "epoch": 0.26, + "grad_norm": 1.3422999320495346, + "learning_rate": 1.7337952709375688e-05, + "loss": 0.8093, + "step": 1618 + }, + { + "epoch": 0.26, + "grad_norm": 1.4037057344651518, + "learning_rate": 1.733440576235734e-05, + "loss": 1.0066, + "step": 1619 + }, + { + "epoch": 0.26, + "grad_norm": 1.2437857068479816, + "learning_rate": 1.7330856817244686e-05, + "loss": 0.9456, + "step": 1620 + }, + { + "epoch": 0.26, + "grad_norm": 0.763271014075135, + "learning_rate": 1.732730587500456e-05, + "loss": 0.3637, + "step": 1621 + }, + { + "epoch": 0.26, + "grad_norm": 1.2821856490011891, + "learning_rate": 1.7323752936604334e-05, + "loss": 0.827, + "step": 1622 + }, + { + "epoch": 0.26, + "grad_norm": 4.456608199721232, + "learning_rate": 1.732019800301193e-05, + "loss": 0.9262, + "step": 1623 + }, + { + "epoch": 0.26, + "grad_norm": 1.379231485503792, + "learning_rate": 1.7316641075195802e-05, + "loss": 0.8474, + "step": 1624 + }, + { + "epoch": 0.26, + "grad_norm": 1.4459960873865894, + "learning_rate": 1.7313082154124966e-05, + "loss": 0.8392, + "step": 1625 + }, + { + "epoch": 0.26, + "grad_norm": 1.3425809548875087, + "learning_rate": 1.7309521240768964e-05, + "loss": 0.8642, + "step": 1626 + }, + { + "epoch": 0.26, + "grad_norm": 1.6765583699540803, + "learning_rate": 1.7305958336097888e-05, + "loss": 0.9333, + "step": 1627 + }, + { + "epoch": 0.26, + "grad_norm": 1.560699147516626, + "learning_rate": 1.730239344108237e-05, + "loss": 1.0107, + "step": 1628 + }, + { + "epoch": 0.26, + "grad_norm": 1.348318388580259, + "learning_rate": 1.729882655669359e-05, + "loss": 0.9269, + "step": 1629 + }, + { + "epoch": 0.26, + "grad_norm": 1.5533882833493289, + "learning_rate": 1.729525768390326e-05, + "loss": 0.9577, + "step": 1630 + }, + { + "epoch": 0.26, + "grad_norm": 1.5295036938521527, + "learning_rate": 1.729168682368364e-05, + "loss": 0.9564, + "step": 1631 + }, + { + "epoch": 0.26, + "grad_norm": 1.5305043425469185, + "learning_rate": 1.7288113977007538e-05, + "loss": 0.9332, + "step": 1632 + }, + { + "epoch": 0.26, + "grad_norm": 1.5423458888365909, + "learning_rate": 1.7284539144848286e-05, + "loss": 0.9306, + "step": 1633 + }, + { + "epoch": 0.26, + "grad_norm": 1.717440114691662, + "learning_rate": 1.7280962328179773e-05, + "loss": 0.9688, + "step": 1634 + }, + { + "epoch": 0.26, + "grad_norm": 1.409033535775763, + "learning_rate": 1.727738352797642e-05, + "loss": 0.9159, + "step": 1635 + }, + { + "epoch": 0.26, + "grad_norm": 1.5232554736574733, + "learning_rate": 1.7273802745213193e-05, + "loss": 0.9008, + "step": 1636 + }, + { + "epoch": 0.26, + "grad_norm": 1.731236077805256, + "learning_rate": 1.7270219980865594e-05, + "loss": 0.9496, + "step": 1637 + }, + { + "epoch": 0.26, + "grad_norm": 1.671215276759986, + "learning_rate": 1.7266635235909664e-05, + "loss": 0.8994, + "step": 1638 + }, + { + "epoch": 0.26, + "grad_norm": 1.4765817468135582, + "learning_rate": 1.7263048511321995e-05, + "loss": 0.9116, + "step": 1639 + }, + { + "epoch": 0.26, + "grad_norm": 1.7645839010152913, + "learning_rate": 1.7259459808079705e-05, + "loss": 0.9373, + "step": 1640 + }, + { + "epoch": 0.26, + "grad_norm": 0.947990431203616, + "learning_rate": 1.7255869127160452e-05, + "loss": 0.3403, + "step": 1641 + }, + { + "epoch": 0.26, + "grad_norm": 1.3994209365249972, + "learning_rate": 1.725227646954244e-05, + "loss": 0.8872, + "step": 1642 + }, + { + "epoch": 0.26, + "grad_norm": 1.5450412966921077, + "learning_rate": 1.724868183620441e-05, + "loss": 0.8651, + "step": 1643 + }, + { + "epoch": 0.26, + "grad_norm": 1.527420349956401, + "learning_rate": 1.724508522812563e-05, + "loss": 0.8958, + "step": 1644 + }, + { + "epoch": 0.27, + "grad_norm": 1.3164689133300786, + "learning_rate": 1.724148664628593e-05, + "loss": 0.9359, + "step": 1645 + }, + { + "epoch": 0.27, + "grad_norm": 0.9120700529554698, + "learning_rate": 1.7237886091665653e-05, + "loss": 0.3342, + "step": 1646 + }, + { + "epoch": 0.27, + "grad_norm": 1.4503632528504864, + "learning_rate": 1.7234283565245688e-05, + "loss": 1.0021, + "step": 1647 + }, + { + "epoch": 0.27, + "grad_norm": 1.7308955925501437, + "learning_rate": 1.723067906800747e-05, + "loss": 0.9014, + "step": 1648 + }, + { + "epoch": 0.27, + "grad_norm": 1.2958425856213454, + "learning_rate": 1.7227072600932952e-05, + "loss": 0.9942, + "step": 1649 + }, + { + "epoch": 0.27, + "grad_norm": 1.4800440259555905, + "learning_rate": 1.7223464165004648e-05, + "loss": 0.9659, + "step": 1650 + }, + { + "epoch": 0.27, + "grad_norm": 1.5539254964093996, + "learning_rate": 1.7219853761205588e-05, + "loss": 0.8367, + "step": 1651 + }, + { + "epoch": 0.27, + "grad_norm": 1.289624835671746, + "learning_rate": 1.7216241390519348e-05, + "loss": 0.8484, + "step": 1652 + }, + { + "epoch": 0.27, + "grad_norm": 1.421199623775637, + "learning_rate": 1.7212627053930034e-05, + "loss": 0.9456, + "step": 1653 + }, + { + "epoch": 0.27, + "grad_norm": 1.2468958742383034, + "learning_rate": 1.7209010752422296e-05, + "loss": 0.8961, + "step": 1654 + }, + { + "epoch": 0.27, + "grad_norm": 1.4575552758669268, + "learning_rate": 1.7205392486981306e-05, + "loss": 0.873, + "step": 1655 + }, + { + "epoch": 0.27, + "grad_norm": 1.6644534627810683, + "learning_rate": 1.720177225859279e-05, + "loss": 0.8779, + "step": 1656 + }, + { + "epoch": 0.27, + "grad_norm": 1.4490642028207916, + "learning_rate": 1.7198150068242992e-05, + "loss": 0.8806, + "step": 1657 + }, + { + "epoch": 0.27, + "grad_norm": 1.2586906074984874, + "learning_rate": 1.71945259169187e-05, + "loss": 0.9563, + "step": 1658 + }, + { + "epoch": 0.27, + "grad_norm": 1.5917832239419862, + "learning_rate": 1.7190899805607224e-05, + "loss": 0.875, + "step": 1659 + }, + { + "epoch": 0.27, + "grad_norm": 1.5328061385610867, + "learning_rate": 1.7187271735296428e-05, + "loss": 0.878, + "step": 1660 + }, + { + "epoch": 0.27, + "grad_norm": 1.3410265014545164, + "learning_rate": 1.718364170697469e-05, + "loss": 0.9706, + "step": 1661 + }, + { + "epoch": 0.27, + "grad_norm": 1.509223176791907, + "learning_rate": 1.7180009721630932e-05, + "loss": 0.8493, + "step": 1662 + }, + { + "epoch": 0.27, + "grad_norm": 1.2803901623517842, + "learning_rate": 1.717637578025461e-05, + "loss": 0.8431, + "step": 1663 + }, + { + "epoch": 0.27, + "grad_norm": 1.4815940241647154, + "learning_rate": 1.7172739883835705e-05, + "loss": 0.8787, + "step": 1664 + }, + { + "epoch": 0.27, + "grad_norm": 1.456881659271399, + "learning_rate": 1.716910203336474e-05, + "loss": 0.9107, + "step": 1665 + }, + { + "epoch": 0.27, + "grad_norm": 1.5197804057946243, + "learning_rate": 1.716546222983276e-05, + "loss": 0.9026, + "step": 1666 + }, + { + "epoch": 0.27, + "grad_norm": 1.1046689574632567, + "learning_rate": 1.7161820474231355e-05, + "loss": 0.8771, + "step": 1667 + }, + { + "epoch": 0.27, + "grad_norm": 1.4683710941746164, + "learning_rate": 1.7158176767552636e-05, + "loss": 0.9889, + "step": 1668 + }, + { + "epoch": 0.27, + "grad_norm": 1.2468308201649616, + "learning_rate": 1.715453111078925e-05, + "loss": 0.8368, + "step": 1669 + }, + { + "epoch": 0.27, + "grad_norm": 1.2475229433643198, + "learning_rate": 1.715088350493437e-05, + "loss": 0.9047, + "step": 1670 + }, + { + "epoch": 0.27, + "grad_norm": 1.378033428626109, + "learning_rate": 1.714723395098171e-05, + "loss": 0.9149, + "step": 1671 + }, + { + "epoch": 0.27, + "grad_norm": 1.3639837916870199, + "learning_rate": 1.7143582449925508e-05, + "loss": 0.8805, + "step": 1672 + }, + { + "epoch": 0.27, + "grad_norm": 1.3696970441600163, + "learning_rate": 1.7139929002760532e-05, + "loss": 0.9298, + "step": 1673 + }, + { + "epoch": 0.27, + "grad_norm": 1.6390529912303684, + "learning_rate": 1.713627361048208e-05, + "loss": 0.8627, + "step": 1674 + }, + { + "epoch": 0.27, + "grad_norm": 1.5105147754660921, + "learning_rate": 1.713261627408599e-05, + "loss": 0.9467, + "step": 1675 + }, + { + "epoch": 0.27, + "grad_norm": 1.5657000376656307, + "learning_rate": 1.7128956994568612e-05, + "loss": 0.9782, + "step": 1676 + }, + { + "epoch": 0.27, + "grad_norm": 1.4419605008802805, + "learning_rate": 1.7125295772926834e-05, + "loss": 0.8904, + "step": 1677 + }, + { + "epoch": 0.27, + "grad_norm": 1.4045057207775635, + "learning_rate": 1.7121632610158083e-05, + "loss": 0.8312, + "step": 1678 + }, + { + "epoch": 0.27, + "grad_norm": 1.6328565691973866, + "learning_rate": 1.7117967507260298e-05, + "loss": 0.9271, + "step": 1679 + }, + { + "epoch": 0.27, + "grad_norm": 0.8940965067668736, + "learning_rate": 1.7114300465231955e-05, + "loss": 0.37, + "step": 1680 + }, + { + "epoch": 0.27, + "grad_norm": 1.6241324922025069, + "learning_rate": 1.711063148507206e-05, + "loss": 1.0052, + "step": 1681 + }, + { + "epoch": 0.27, + "grad_norm": 1.2804788669084108, + "learning_rate": 1.710696056778014e-05, + "loss": 0.891, + "step": 1682 + }, + { + "epoch": 0.27, + "grad_norm": 1.4692550105482822, + "learning_rate": 1.710328771435626e-05, + "loss": 0.9188, + "step": 1683 + }, + { + "epoch": 0.27, + "grad_norm": 1.317863019117824, + "learning_rate": 1.7099612925801005e-05, + "loss": 0.976, + "step": 1684 + }, + { + "epoch": 0.27, + "grad_norm": 1.3359038985917746, + "learning_rate": 1.7095936203115484e-05, + "loss": 0.8763, + "step": 1685 + }, + { + "epoch": 0.27, + "grad_norm": 1.3203848256102904, + "learning_rate": 1.709225754730134e-05, + "loss": 0.9167, + "step": 1686 + }, + { + "epoch": 0.27, + "grad_norm": 1.4316388214094764, + "learning_rate": 1.7088576959360743e-05, + "loss": 0.8574, + "step": 1687 + }, + { + "epoch": 0.27, + "grad_norm": 1.2602223115586864, + "learning_rate": 1.7084894440296383e-05, + "loss": 0.9142, + "step": 1688 + }, + { + "epoch": 0.27, + "grad_norm": 1.6806280530123008, + "learning_rate": 1.7081209991111483e-05, + "loss": 0.9115, + "step": 1689 + }, + { + "epoch": 0.27, + "grad_norm": 1.4235209282766383, + "learning_rate": 1.7077523612809784e-05, + "loss": 0.8984, + "step": 1690 + }, + { + "epoch": 0.27, + "grad_norm": 0.8117235008262265, + "learning_rate": 1.707383530639556e-05, + "loss": 0.3523, + "step": 1691 + }, + { + "epoch": 0.27, + "grad_norm": 1.493211719258882, + "learning_rate": 1.707014507287361e-05, + "loss": 0.921, + "step": 1692 + }, + { + "epoch": 0.27, + "grad_norm": 1.369873990521049, + "learning_rate": 1.706645291324925e-05, + "loss": 0.8266, + "step": 1693 + }, + { + "epoch": 0.27, + "grad_norm": 1.8139715171475574, + "learning_rate": 1.706275882852833e-05, + "loss": 0.8617, + "step": 1694 + }, + { + "epoch": 0.27, + "grad_norm": 1.479826783595611, + "learning_rate": 1.705906281971722e-05, + "loss": 1.0706, + "step": 1695 + }, + { + "epoch": 0.27, + "grad_norm": 1.366313854782972, + "learning_rate": 1.705536488782281e-05, + "loss": 0.8836, + "step": 1696 + }, + { + "epoch": 0.27, + "grad_norm": 1.338671068202973, + "learning_rate": 1.7051665033852525e-05, + "loss": 1.0078, + "step": 1697 + }, + { + "epoch": 0.27, + "grad_norm": 1.7228660693594156, + "learning_rate": 1.70479632588143e-05, + "loss": 0.8365, + "step": 1698 + }, + { + "epoch": 0.27, + "grad_norm": 1.375946305851306, + "learning_rate": 1.704425956371661e-05, + "loss": 0.9553, + "step": 1699 + }, + { + "epoch": 0.27, + "grad_norm": 1.6316573948782636, + "learning_rate": 1.704055394956844e-05, + "loss": 0.8432, + "step": 1700 + }, + { + "epoch": 0.27, + "grad_norm": 1.3826319400179754, + "learning_rate": 1.7036846417379295e-05, + "loss": 0.861, + "step": 1701 + }, + { + "epoch": 0.27, + "grad_norm": 1.3587403439616919, + "learning_rate": 1.7033136968159218e-05, + "loss": 0.9736, + "step": 1702 + }, + { + "epoch": 0.27, + "grad_norm": 1.4372309797131653, + "learning_rate": 1.7029425602918758e-05, + "loss": 0.933, + "step": 1703 + }, + { + "epoch": 0.27, + "grad_norm": 1.5215591013214864, + "learning_rate": 1.7025712322669e-05, + "loss": 0.9169, + "step": 1704 + }, + { + "epoch": 0.27, + "grad_norm": 1.5997183032553017, + "learning_rate": 1.7021997128421537e-05, + "loss": 0.8882, + "step": 1705 + }, + { + "epoch": 0.27, + "grad_norm": 1.7303226411417902, + "learning_rate": 1.7018280021188497e-05, + "loss": 0.8889, + "step": 1706 + }, + { + "epoch": 0.28, + "grad_norm": 1.6925961084332686, + "learning_rate": 1.701456100198252e-05, + "loss": 0.9085, + "step": 1707 + }, + { + "epoch": 0.28, + "grad_norm": 1.4651990612170933, + "learning_rate": 1.7010840071816765e-05, + "loss": 0.9634, + "step": 1708 + }, + { + "epoch": 0.28, + "grad_norm": 1.3180207093019016, + "learning_rate": 1.7007117231704923e-05, + "loss": 0.9305, + "step": 1709 + }, + { + "epoch": 0.28, + "grad_norm": 1.4493006910566164, + "learning_rate": 1.7003392482661188e-05, + "loss": 0.9591, + "step": 1710 + }, + { + "epoch": 0.28, + "grad_norm": 1.273046617838578, + "learning_rate": 1.69996658257003e-05, + "loss": 0.8742, + "step": 1711 + }, + { + "epoch": 0.28, + "grad_norm": 1.3709116051260666, + "learning_rate": 1.6995937261837487e-05, + "loss": 0.9056, + "step": 1712 + }, + { + "epoch": 0.28, + "grad_norm": 1.5000023810794925, + "learning_rate": 1.6992206792088525e-05, + "loss": 0.9522, + "step": 1713 + }, + { + "epoch": 0.28, + "grad_norm": 1.4590540419903482, + "learning_rate": 1.6988474417469686e-05, + "loss": 0.8774, + "step": 1714 + }, + { + "epoch": 0.28, + "grad_norm": 1.3638290203817247, + "learning_rate": 1.6984740138997784e-05, + "loss": 0.8818, + "step": 1715 + }, + { + "epoch": 0.28, + "grad_norm": 1.440169521694651, + "learning_rate": 1.6981003957690128e-05, + "loss": 0.8865, + "step": 1716 + }, + { + "epoch": 0.28, + "grad_norm": 1.5270008015551286, + "learning_rate": 1.6977265874564562e-05, + "loss": 0.9415, + "step": 1717 + }, + { + "epoch": 0.28, + "grad_norm": 1.4467885309534196, + "learning_rate": 1.6973525890639445e-05, + "loss": 0.9579, + "step": 1718 + }, + { + "epoch": 0.28, + "grad_norm": 1.4793824705981617, + "learning_rate": 1.6969784006933647e-05, + "loss": 0.8845, + "step": 1719 + }, + { + "epoch": 0.28, + "grad_norm": 1.6671657851770463, + "learning_rate": 1.696604022446656e-05, + "loss": 0.8528, + "step": 1720 + }, + { + "epoch": 0.28, + "grad_norm": 1.6090800729202437, + "learning_rate": 1.69622945442581e-05, + "loss": 0.9804, + "step": 1721 + }, + { + "epoch": 0.28, + "grad_norm": 1.4576746009878208, + "learning_rate": 1.6958546967328688e-05, + "loss": 0.936, + "step": 1722 + }, + { + "epoch": 0.28, + "grad_norm": 1.6419589455276276, + "learning_rate": 1.6954797494699263e-05, + "loss": 0.9106, + "step": 1723 + }, + { + "epoch": 0.28, + "grad_norm": 1.4258216995876707, + "learning_rate": 1.69510461273913e-05, + "loss": 0.9607, + "step": 1724 + }, + { + "epoch": 0.28, + "grad_norm": 1.3383984722612516, + "learning_rate": 1.6947292866426757e-05, + "loss": 0.8822, + "step": 1725 + }, + { + "epoch": 0.28, + "grad_norm": 1.3589109568707913, + "learning_rate": 1.6943537712828138e-05, + "loss": 0.8985, + "step": 1726 + }, + { + "epoch": 0.28, + "grad_norm": 1.5050136860770933, + "learning_rate": 1.6939780667618445e-05, + "loss": 0.9289, + "step": 1727 + }, + { + "epoch": 0.28, + "grad_norm": 1.2657478943957379, + "learning_rate": 1.6936021731821202e-05, + "loss": 0.9021, + "step": 1728 + }, + { + "epoch": 0.28, + "grad_norm": 1.4600311020031127, + "learning_rate": 1.6932260906460448e-05, + "loss": 0.9809, + "step": 1729 + }, + { + "epoch": 0.28, + "grad_norm": 1.3254060669012642, + "learning_rate": 1.6928498192560737e-05, + "loss": 0.8399, + "step": 1730 + }, + { + "epoch": 0.28, + "grad_norm": 1.4588964019562303, + "learning_rate": 1.6924733591147127e-05, + "loss": 0.8732, + "step": 1731 + }, + { + "epoch": 0.28, + "grad_norm": 1.4744808115714747, + "learning_rate": 1.692096710324521e-05, + "loss": 0.9501, + "step": 1732 + }, + { + "epoch": 0.28, + "grad_norm": 1.2299454007482096, + "learning_rate": 1.6917198729881073e-05, + "loss": 0.9078, + "step": 1733 + }, + { + "epoch": 0.28, + "grad_norm": 1.536771252995333, + "learning_rate": 1.6913428472081328e-05, + "loss": 0.9818, + "step": 1734 + }, + { + "epoch": 0.28, + "grad_norm": 1.2879916489907295, + "learning_rate": 1.69096563308731e-05, + "loss": 0.9526, + "step": 1735 + }, + { + "epoch": 0.28, + "grad_norm": 1.2548301381436988, + "learning_rate": 1.690588230728402e-05, + "loss": 0.858, + "step": 1736 + }, + { + "epoch": 0.28, + "grad_norm": 1.339787731584422, + "learning_rate": 1.690210640234224e-05, + "loss": 0.9282, + "step": 1737 + }, + { + "epoch": 0.28, + "grad_norm": 1.2911566774123973, + "learning_rate": 1.6898328617076418e-05, + "loss": 0.9327, + "step": 1738 + }, + { + "epoch": 0.28, + "grad_norm": 1.2147070686689085, + "learning_rate": 1.6894548952515726e-05, + "loss": 0.8746, + "step": 1739 + }, + { + "epoch": 0.28, + "grad_norm": 1.0206226631039683, + "learning_rate": 1.689076740968985e-05, + "loss": 0.3319, + "step": 1740 + }, + { + "epoch": 0.28, + "grad_norm": 1.392396382674935, + "learning_rate": 1.6886983989628985e-05, + "loss": 0.9386, + "step": 1741 + }, + { + "epoch": 0.28, + "grad_norm": 1.450032399078668, + "learning_rate": 1.6883198693363843e-05, + "loss": 0.9075, + "step": 1742 + }, + { + "epoch": 0.28, + "grad_norm": 1.2890749237155443, + "learning_rate": 1.687941152192564e-05, + "loss": 0.851, + "step": 1743 + }, + { + "epoch": 0.28, + "grad_norm": 1.3037717708581953, + "learning_rate": 1.6875622476346107e-05, + "loss": 0.9362, + "step": 1744 + }, + { + "epoch": 0.28, + "grad_norm": 1.7497504761034086, + "learning_rate": 1.687183155765748e-05, + "loss": 0.8896, + "step": 1745 + }, + { + "epoch": 0.28, + "grad_norm": 1.5210663407270555, + "learning_rate": 1.686803876689252e-05, + "loss": 0.8484, + "step": 1746 + }, + { + "epoch": 0.28, + "grad_norm": 1.6886943072210736, + "learning_rate": 1.6864244105084477e-05, + "loss": 0.8443, + "step": 1747 + }, + { + "epoch": 0.28, + "grad_norm": 1.545226204577288, + "learning_rate": 1.6860447573267125e-05, + "loss": 0.882, + "step": 1748 + }, + { + "epoch": 0.28, + "grad_norm": 1.3567707695719973, + "learning_rate": 1.6856649172474747e-05, + "loss": 0.9174, + "step": 1749 + }, + { + "epoch": 0.28, + "grad_norm": 1.2927242405128132, + "learning_rate": 1.6852848903742125e-05, + "loss": 0.9099, + "step": 1750 + }, + { + "epoch": 0.28, + "grad_norm": 1.975215208612929, + "learning_rate": 1.6849046768104567e-05, + "loss": 0.9642, + "step": 1751 + }, + { + "epoch": 0.28, + "grad_norm": 1.326490441208533, + "learning_rate": 1.6845242766597873e-05, + "loss": 0.8566, + "step": 1752 + }, + { + "epoch": 0.28, + "grad_norm": 1.439425723101024, + "learning_rate": 1.684143690025836e-05, + "loss": 0.9197, + "step": 1753 + }, + { + "epoch": 0.28, + "grad_norm": 1.5118173430325141, + "learning_rate": 1.6837629170122846e-05, + "loss": 0.8348, + "step": 1754 + }, + { + "epoch": 0.28, + "grad_norm": 1.564935230117965, + "learning_rate": 1.683381957722867e-05, + "loss": 0.8856, + "step": 1755 + }, + { + "epoch": 0.28, + "grad_norm": 1.349188491954696, + "learning_rate": 1.6830008122613665e-05, + "loss": 0.9681, + "step": 1756 + }, + { + "epoch": 0.28, + "grad_norm": 1.5095425239835352, + "learning_rate": 1.6826194807316177e-05, + "loss": 0.9068, + "step": 1757 + }, + { + "epoch": 0.28, + "grad_norm": 1.242826540332664, + "learning_rate": 1.6822379632375055e-05, + "loss": 0.9144, + "step": 1758 + }, + { + "epoch": 0.28, + "grad_norm": 1.5488815647156526, + "learning_rate": 1.6818562598829666e-05, + "loss": 0.8395, + "step": 1759 + }, + { + "epoch": 0.28, + "grad_norm": 1.563083006453182, + "learning_rate": 1.681474370771987e-05, + "loss": 0.9642, + "step": 1760 + }, + { + "epoch": 0.28, + "grad_norm": 1.4054693068311859, + "learning_rate": 1.6810922960086037e-05, + "loss": 0.8713, + "step": 1761 + }, + { + "epoch": 0.28, + "grad_norm": 1.5935769881277606, + "learning_rate": 1.6807100356969043e-05, + "loss": 0.8644, + "step": 1762 + }, + { + "epoch": 0.28, + "grad_norm": 0.9061703708622056, + "learning_rate": 1.6803275899410277e-05, + "loss": 0.379, + "step": 1763 + }, + { + "epoch": 0.28, + "grad_norm": 1.7638195315045784, + "learning_rate": 1.6799449588451618e-05, + "loss": 0.9026, + "step": 1764 + }, + { + "epoch": 0.28, + "grad_norm": 2.172764518794516, + "learning_rate": 1.6795621425135465e-05, + "loss": 0.9395, + "step": 1765 + }, + { + "epoch": 0.28, + "grad_norm": 1.251709454980728, + "learning_rate": 1.679179141050472e-05, + "loss": 0.9014, + "step": 1766 + }, + { + "epoch": 0.28, + "grad_norm": 1.3237932585005006, + "learning_rate": 1.678795954560277e-05, + "loss": 0.8873, + "step": 1767 + }, + { + "epoch": 0.28, + "grad_norm": 1.5650318780179766, + "learning_rate": 1.678412583147353e-05, + "loss": 0.8584, + "step": 1768 + }, + { + "epoch": 0.29, + "grad_norm": 1.4289068594858778, + "learning_rate": 1.678029026916141e-05, + "loss": 0.8444, + "step": 1769 + }, + { + "epoch": 0.29, + "grad_norm": 1.343947804940275, + "learning_rate": 1.677645285971132e-05, + "loss": 0.917, + "step": 1770 + }, + { + "epoch": 0.29, + "grad_norm": 1.3209994408204184, + "learning_rate": 1.6772613604168677e-05, + "loss": 0.9182, + "step": 1771 + }, + { + "epoch": 0.29, + "grad_norm": 1.4723952565519836, + "learning_rate": 1.67687725035794e-05, + "loss": 0.8935, + "step": 1772 + }, + { + "epoch": 0.29, + "grad_norm": 1.46843043678263, + "learning_rate": 1.676492955898991e-05, + "loss": 0.8259, + "step": 1773 + }, + { + "epoch": 0.29, + "grad_norm": 1.2072424121359966, + "learning_rate": 1.6761084771447133e-05, + "loss": 0.8722, + "step": 1774 + }, + { + "epoch": 0.29, + "grad_norm": 1.3663046738573317, + "learning_rate": 1.675723814199849e-05, + "loss": 0.9134, + "step": 1775 + }, + { + "epoch": 0.29, + "grad_norm": 0.9270701188670847, + "learning_rate": 1.675338967169192e-05, + "loss": 0.3224, + "step": 1776 + }, + { + "epoch": 0.29, + "grad_norm": 1.3612592869926525, + "learning_rate": 1.674953936157584e-05, + "loss": 0.9547, + "step": 1777 + }, + { + "epoch": 0.29, + "grad_norm": 1.3710154622433415, + "learning_rate": 1.6745687212699188e-05, + "loss": 0.8766, + "step": 1778 + }, + { + "epoch": 0.29, + "grad_norm": 1.5105537637336963, + "learning_rate": 1.674183322611139e-05, + "loss": 0.9358, + "step": 1779 + }, + { + "epoch": 0.29, + "grad_norm": 1.5378724994613515, + "learning_rate": 1.6737977402862383e-05, + "loss": 0.9574, + "step": 1780 + }, + { + "epoch": 0.29, + "grad_norm": 1.4201469741444492, + "learning_rate": 1.67341197440026e-05, + "loss": 0.9326, + "step": 1781 + }, + { + "epoch": 0.29, + "grad_norm": 1.3079506233156866, + "learning_rate": 1.673026025058297e-05, + "loss": 0.9078, + "step": 1782 + }, + { + "epoch": 0.29, + "grad_norm": 1.3163909932449473, + "learning_rate": 1.672639892365493e-05, + "loss": 0.9849, + "step": 1783 + }, + { + "epoch": 0.29, + "grad_norm": 1.4954216115828285, + "learning_rate": 1.6722535764270405e-05, + "loss": 0.8567, + "step": 1784 + }, + { + "epoch": 0.29, + "grad_norm": 1.440579610231591, + "learning_rate": 1.671867077348183e-05, + "loss": 0.864, + "step": 1785 + }, + { + "epoch": 0.29, + "grad_norm": 1.592551097168017, + "learning_rate": 1.6714803952342137e-05, + "loss": 0.8657, + "step": 1786 + }, + { + "epoch": 0.29, + "grad_norm": 1.5722606410071949, + "learning_rate": 1.671093530190475e-05, + "loss": 0.8906, + "step": 1787 + }, + { + "epoch": 0.29, + "grad_norm": 1.2369225219673405, + "learning_rate": 1.6707064823223603e-05, + "loss": 0.9543, + "step": 1788 + }, + { + "epoch": 0.29, + "grad_norm": 1.366212061594568, + "learning_rate": 1.6703192517353116e-05, + "loss": 0.856, + "step": 1789 + }, + { + "epoch": 0.29, + "grad_norm": 1.556054955203911, + "learning_rate": 1.6699318385348207e-05, + "loss": 0.9443, + "step": 1790 + }, + { + "epoch": 0.29, + "grad_norm": 1.7602980408446298, + "learning_rate": 1.6695442428264308e-05, + "loss": 0.9147, + "step": 1791 + }, + { + "epoch": 0.29, + "grad_norm": 1.625324947508096, + "learning_rate": 1.669156464715733e-05, + "loss": 0.9261, + "step": 1792 + }, + { + "epoch": 0.29, + "grad_norm": 1.2799900914019162, + "learning_rate": 1.668768504308369e-05, + "loss": 0.9036, + "step": 1793 + }, + { + "epoch": 0.29, + "grad_norm": 1.3627750302606503, + "learning_rate": 1.6683803617100293e-05, + "loss": 0.9227, + "step": 1794 + }, + { + "epoch": 0.29, + "grad_norm": 1.3568214625493475, + "learning_rate": 1.667992037026455e-05, + "loss": 0.9658, + "step": 1795 + }, + { + "epoch": 0.29, + "grad_norm": 1.3787841634763032, + "learning_rate": 1.6676035303634366e-05, + "loss": 0.9085, + "step": 1796 + }, + { + "epoch": 0.29, + "grad_norm": 1.0827242408634352, + "learning_rate": 1.6672148418268143e-05, + "loss": 0.3393, + "step": 1797 + }, + { + "epoch": 0.29, + "grad_norm": 1.457818621390555, + "learning_rate": 1.6668259715224767e-05, + "loss": 0.9083, + "step": 1798 + }, + { + "epoch": 0.29, + "grad_norm": 1.6112687007465818, + "learning_rate": 1.6664369195563635e-05, + "loss": 0.8646, + "step": 1799 + }, + { + "epoch": 0.29, + "grad_norm": 1.3988407342167224, + "learning_rate": 1.666047686034463e-05, + "loss": 0.8479, + "step": 1800 + }, + { + "epoch": 0.29, + "grad_norm": 1.3395026123014104, + "learning_rate": 1.6656582710628133e-05, + "loss": 0.8968, + "step": 1801 + }, + { + "epoch": 0.29, + "grad_norm": 1.7639819492570241, + "learning_rate": 1.665268674747501e-05, + "loss": 0.92, + "step": 1802 + }, + { + "epoch": 0.29, + "grad_norm": 1.6330661154637676, + "learning_rate": 1.664878897194664e-05, + "loss": 0.924, + "step": 1803 + }, + { + "epoch": 0.29, + "grad_norm": 1.532684934724698, + "learning_rate": 1.6644889385104875e-05, + "loss": 0.9683, + "step": 1804 + }, + { + "epoch": 0.29, + "grad_norm": 1.4137379889089718, + "learning_rate": 1.6640987988012077e-05, + "loss": 0.9706, + "step": 1805 + }, + { + "epoch": 0.29, + "grad_norm": 1.5235202864625528, + "learning_rate": 1.663708478173109e-05, + "loss": 0.9588, + "step": 1806 + }, + { + "epoch": 0.29, + "grad_norm": 1.2330784779080506, + "learning_rate": 1.6633179767325258e-05, + "loss": 0.9011, + "step": 1807 + }, + { + "epoch": 0.29, + "grad_norm": 1.6098673221397422, + "learning_rate": 1.6629272945858413e-05, + "loss": 0.8826, + "step": 1808 + }, + { + "epoch": 0.29, + "grad_norm": 1.3560816714758392, + "learning_rate": 1.662536431839488e-05, + "loss": 0.9486, + "step": 1809 + }, + { + "epoch": 0.29, + "grad_norm": 1.443297722961885, + "learning_rate": 1.6621453885999473e-05, + "loss": 0.9282, + "step": 1810 + }, + { + "epoch": 0.29, + "grad_norm": 0.812590067369242, + "learning_rate": 1.6617541649737514e-05, + "loss": 0.3494, + "step": 1811 + }, + { + "epoch": 0.29, + "grad_norm": 1.257451338776845, + "learning_rate": 1.661362761067479e-05, + "loss": 0.9186, + "step": 1812 + }, + { + "epoch": 0.29, + "grad_norm": 1.3689316292121216, + "learning_rate": 1.66097117698776e-05, + "loss": 0.9328, + "step": 1813 + }, + { + "epoch": 0.29, + "grad_norm": 1.2479203138370454, + "learning_rate": 1.6605794128412725e-05, + "loss": 0.8719, + "step": 1814 + }, + { + "epoch": 0.29, + "grad_norm": 1.450862103151999, + "learning_rate": 1.6601874687347443e-05, + "loss": 0.8834, + "step": 1815 + }, + { + "epoch": 0.29, + "grad_norm": 1.7379808434210793, + "learning_rate": 1.6597953447749514e-05, + "loss": 0.8571, + "step": 1816 + }, + { + "epoch": 0.29, + "grad_norm": 1.484289441654545, + "learning_rate": 1.659403041068719e-05, + "loss": 0.8145, + "step": 1817 + }, + { + "epoch": 0.29, + "grad_norm": 1.3237566141538748, + "learning_rate": 1.6590105577229216e-05, + "loss": 0.8666, + "step": 1818 + }, + { + "epoch": 0.29, + "grad_norm": 1.3234371492121928, + "learning_rate": 1.6586178948444828e-05, + "loss": 0.8594, + "step": 1819 + }, + { + "epoch": 0.29, + "grad_norm": 1.6171035312884443, + "learning_rate": 1.6582250525403748e-05, + "loss": 1.0314, + "step": 1820 + }, + { + "epoch": 0.29, + "grad_norm": 1.3116288007876589, + "learning_rate": 1.6578320309176175e-05, + "loss": 0.9505, + "step": 1821 + }, + { + "epoch": 0.29, + "grad_norm": 1.4404083353819983, + "learning_rate": 1.6574388300832827e-05, + "loss": 0.9358, + "step": 1822 + }, + { + "epoch": 0.29, + "grad_norm": 1.4140924574381648, + "learning_rate": 1.6570454501444878e-05, + "loss": 0.9739, + "step": 1823 + }, + { + "epoch": 0.29, + "grad_norm": 1.3663781024951993, + "learning_rate": 1.6566518912084e-05, + "loss": 0.8941, + "step": 1824 + }, + { + "epoch": 0.29, + "grad_norm": 1.586553682341651, + "learning_rate": 1.6562581533822374e-05, + "loss": 0.8944, + "step": 1825 + }, + { + "epoch": 0.29, + "grad_norm": 1.2136709794298102, + "learning_rate": 1.6558642367732635e-05, + "loss": 0.8994, + "step": 1826 + }, + { + "epoch": 0.29, + "grad_norm": 1.3946488842382352, + "learning_rate": 1.6554701414887927e-05, + "loss": 0.9143, + "step": 1827 + }, + { + "epoch": 0.29, + "grad_norm": 1.7669819326937006, + "learning_rate": 1.6550758676361872e-05, + "loss": 0.9683, + "step": 1828 + }, + { + "epoch": 0.29, + "grad_norm": 1.5094245979500365, + "learning_rate": 1.6546814153228576e-05, + "loss": 0.9029, + "step": 1829 + }, + { + "epoch": 0.29, + "grad_norm": 1.464662752413458, + "learning_rate": 1.6542867846562644e-05, + "loss": 0.9318, + "step": 1830 + }, + { + "epoch": 0.3, + "grad_norm": 1.4866145556006098, + "learning_rate": 1.653891975743916e-05, + "loss": 0.9326, + "step": 1831 + }, + { + "epoch": 0.3, + "grad_norm": 1.2524207026088447, + "learning_rate": 1.653496988693368e-05, + "loss": 0.8729, + "step": 1832 + }, + { + "epoch": 0.3, + "grad_norm": 1.4601656154701645, + "learning_rate": 1.653101823612227e-05, + "loss": 0.9251, + "step": 1833 + }, + { + "epoch": 0.3, + "grad_norm": 1.5345831443120987, + "learning_rate": 1.652706480608146e-05, + "loss": 0.9465, + "step": 1834 + }, + { + "epoch": 0.3, + "grad_norm": 1.3847654292882612, + "learning_rate": 1.652310959788828e-05, + "loss": 0.9148, + "step": 1835 + }, + { + "epoch": 0.3, + "grad_norm": 1.5969414738454522, + "learning_rate": 1.6519152612620236e-05, + "loss": 0.9995, + "step": 1836 + }, + { + "epoch": 0.3, + "grad_norm": 1.3906857096889536, + "learning_rate": 1.6515193851355314e-05, + "loss": 0.8599, + "step": 1837 + }, + { + "epoch": 0.3, + "grad_norm": 1.2850762316249045, + "learning_rate": 1.6511233315172e-05, + "loss": 0.8718, + "step": 1838 + }, + { + "epoch": 0.3, + "grad_norm": 1.278397945351855, + "learning_rate": 1.6507271005149246e-05, + "loss": 0.8751, + "step": 1839 + }, + { + "epoch": 0.3, + "grad_norm": 1.5388183433366884, + "learning_rate": 1.6503306922366497e-05, + "loss": 0.8374, + "step": 1840 + }, + { + "epoch": 0.3, + "grad_norm": 1.6401831122625574, + "learning_rate": 1.6499341067903675e-05, + "loss": 0.9508, + "step": 1841 + }, + { + "epoch": 0.3, + "grad_norm": 1.2463090928773701, + "learning_rate": 1.6495373442841192e-05, + "loss": 0.9153, + "step": 1842 + }, + { + "epoch": 0.3, + "grad_norm": 1.5078476620067778, + "learning_rate": 1.649140404825994e-05, + "loss": 0.9205, + "step": 1843 + }, + { + "epoch": 0.3, + "grad_norm": 1.0778705756279008, + "learning_rate": 1.6487432885241287e-05, + "loss": 0.3591, + "step": 1844 + }, + { + "epoch": 0.3, + "grad_norm": 1.422654447863389, + "learning_rate": 1.6483459954867086e-05, + "loss": 0.9037, + "step": 1845 + }, + { + "epoch": 0.3, + "grad_norm": 1.4647929971167872, + "learning_rate": 1.6479485258219677e-05, + "loss": 0.9241, + "step": 1846 + }, + { + "epoch": 0.3, + "grad_norm": 1.5676452815064608, + "learning_rate": 1.6475508796381875e-05, + "loss": 0.9066, + "step": 1847 + }, + { + "epoch": 0.3, + "grad_norm": 1.3706296769828599, + "learning_rate": 1.6471530570436983e-05, + "loss": 0.9679, + "step": 1848 + }, + { + "epoch": 0.3, + "grad_norm": 1.3047505723247015, + "learning_rate": 1.646755058146877e-05, + "loss": 0.846, + "step": 1849 + }, + { + "epoch": 0.3, + "grad_norm": 1.1552555747114357, + "learning_rate": 1.64635688305615e-05, + "loss": 0.3565, + "step": 1850 + }, + { + "epoch": 0.3, + "grad_norm": 1.5758281791556659, + "learning_rate": 1.6459585318799914e-05, + "loss": 0.8125, + "step": 1851 + }, + { + "epoch": 0.3, + "grad_norm": 1.4557076404517675, + "learning_rate": 1.6455600047269228e-05, + "loss": 0.888, + "step": 1852 + }, + { + "epoch": 0.3, + "grad_norm": 0.8531464666593211, + "learning_rate": 1.6451613017055136e-05, + "loss": 0.366, + "step": 1853 + }, + { + "epoch": 0.3, + "grad_norm": 1.236892314965465, + "learning_rate": 1.644762422924382e-05, + "loss": 0.9813, + "step": 1854 + }, + { + "epoch": 0.3, + "grad_norm": 0.8437537352161382, + "learning_rate": 1.6443633684921934e-05, + "loss": 0.3252, + "step": 1855 + }, + { + "epoch": 0.3, + "grad_norm": 1.5837266561820837, + "learning_rate": 1.643964138517661e-05, + "loss": 0.9894, + "step": 1856 + }, + { + "epoch": 0.3, + "grad_norm": 1.4148796484051305, + "learning_rate": 1.6435647331095466e-05, + "loss": 0.9045, + "step": 1857 + }, + { + "epoch": 0.3, + "grad_norm": 1.2883450361044577, + "learning_rate": 1.643165152376659e-05, + "loss": 0.9638, + "step": 1858 + }, + { + "epoch": 0.3, + "grad_norm": 1.4272516180682604, + "learning_rate": 1.6427653964278547e-05, + "loss": 0.9388, + "step": 1859 + }, + { + "epoch": 0.3, + "grad_norm": 1.2096146687684552, + "learning_rate": 1.6423654653720385e-05, + "loss": 0.9594, + "step": 1860 + }, + { + "epoch": 0.3, + "grad_norm": 1.489937051433043, + "learning_rate": 1.6419653593181628e-05, + "loss": 0.7749, + "step": 1861 + }, + { + "epoch": 0.3, + "grad_norm": 1.355225912322674, + "learning_rate": 1.6415650783752274e-05, + "loss": 0.9044, + "step": 1862 + }, + { + "epoch": 0.3, + "grad_norm": 1.4914100891211741, + "learning_rate": 1.6411646226522793e-05, + "loss": 0.9411, + "step": 1863 + }, + { + "epoch": 0.3, + "grad_norm": 1.4348033731294667, + "learning_rate": 1.6407639922584148e-05, + "loss": 0.9198, + "step": 1864 + }, + { + "epoch": 0.3, + "grad_norm": 1.6303689002119117, + "learning_rate": 1.640363187302776e-05, + "loss": 0.9531, + "step": 1865 + }, + { + "epoch": 0.3, + "grad_norm": 1.4725151215731733, + "learning_rate": 1.639962207894553e-05, + "loss": 0.9768, + "step": 1866 + }, + { + "epoch": 0.3, + "grad_norm": 1.468409529243215, + "learning_rate": 1.6395610541429836e-05, + "loss": 0.8686, + "step": 1867 + }, + { + "epoch": 0.3, + "grad_norm": 1.3630561403657215, + "learning_rate": 1.639159726157354e-05, + "loss": 0.8506, + "step": 1868 + }, + { + "epoch": 0.3, + "grad_norm": 1.5142797676875221, + "learning_rate": 1.6387582240469963e-05, + "loss": 0.9296, + "step": 1869 + }, + { + "epoch": 0.3, + "grad_norm": 1.298882509111199, + "learning_rate": 1.638356547921291e-05, + "loss": 0.914, + "step": 1870 + }, + { + "epoch": 0.3, + "grad_norm": 0.8269400359416621, + "learning_rate": 1.6379546978896655e-05, + "loss": 0.3392, + "step": 1871 + }, + { + "epoch": 0.3, + "grad_norm": 1.5039602492905442, + "learning_rate": 1.637552674061595e-05, + "loss": 0.8839, + "step": 1872 + }, + { + "epoch": 0.3, + "grad_norm": 0.8194078554819518, + "learning_rate": 1.6371504765466014e-05, + "loss": 0.2919, + "step": 1873 + }, + { + "epoch": 0.3, + "grad_norm": 1.499692720862973, + "learning_rate": 1.6367481054542556e-05, + "loss": 0.9457, + "step": 1874 + }, + { + "epoch": 0.3, + "grad_norm": 1.4548253787151557, + "learning_rate": 1.6363455608941734e-05, + "loss": 0.9164, + "step": 1875 + }, + { + "epoch": 0.3, + "grad_norm": 1.277285653653831, + "learning_rate": 1.6359428429760192e-05, + "loss": 0.9211, + "step": 1876 + }, + { + "epoch": 0.3, + "grad_norm": 1.4171023572477432, + "learning_rate": 1.635539951809505e-05, + "loss": 0.9129, + "step": 1877 + }, + { + "epoch": 0.3, + "grad_norm": 1.3930811201001108, + "learning_rate": 1.6351368875043893e-05, + "loss": 0.8563, + "step": 1878 + }, + { + "epoch": 0.3, + "grad_norm": 1.4159797104852054, + "learning_rate": 1.6347336501704777e-05, + "loss": 0.9508, + "step": 1879 + }, + { + "epoch": 0.3, + "grad_norm": 1.4981195296681808, + "learning_rate": 1.6343302399176235e-05, + "loss": 0.9181, + "step": 1880 + }, + { + "epoch": 0.3, + "grad_norm": 0.842047480068682, + "learning_rate": 1.633926656855726e-05, + "loss": 0.3538, + "step": 1881 + }, + { + "epoch": 0.3, + "grad_norm": 1.4269311709507366, + "learning_rate": 1.6335229010947333e-05, + "loss": 0.9183, + "step": 1882 + }, + { + "epoch": 0.3, + "grad_norm": 1.4151447097429506, + "learning_rate": 1.6331189727446393e-05, + "loss": 0.9394, + "step": 1883 + }, + { + "epoch": 0.3, + "grad_norm": 1.4266576503345458, + "learning_rate": 1.6327148719154845e-05, + "loss": 0.9376, + "step": 1884 + }, + { + "epoch": 0.3, + "grad_norm": 1.3298951960062875, + "learning_rate": 1.632310598717358e-05, + "loss": 0.9058, + "step": 1885 + }, + { + "epoch": 0.3, + "grad_norm": 1.5969633252954751, + "learning_rate": 1.6319061532603947e-05, + "loss": 0.8349, + "step": 1886 + }, + { + "epoch": 0.3, + "grad_norm": 1.517028094613631, + "learning_rate": 1.631501535654777e-05, + "loss": 0.8948, + "step": 1887 + }, + { + "epoch": 0.3, + "grad_norm": 1.2892759789358808, + "learning_rate": 1.6310967460107328e-05, + "loss": 0.9436, + "step": 1888 + }, + { + "epoch": 0.3, + "grad_norm": 1.637203354209914, + "learning_rate": 1.630691784438539e-05, + "loss": 0.8824, + "step": 1889 + }, + { + "epoch": 0.3, + "grad_norm": 1.0759367610257575, + "learning_rate": 1.630286651048518e-05, + "loss": 0.3086, + "step": 1890 + }, + { + "epoch": 0.3, + "grad_norm": 1.5267048253037414, + "learning_rate": 1.6298813459510396e-05, + "loss": 1.0233, + "step": 1891 + }, + { + "epoch": 0.3, + "grad_norm": 1.2437257157224155, + "learning_rate": 1.6294758692565197e-05, + "loss": 0.959, + "step": 1892 + }, + { + "epoch": 0.31, + "grad_norm": 1.6027426872158737, + "learning_rate": 1.6290702210754213e-05, + "loss": 0.9224, + "step": 1893 + }, + { + "epoch": 0.31, + "grad_norm": 1.4511738580764633, + "learning_rate": 1.628664401518254e-05, + "loss": 0.8946, + "step": 1894 + }, + { + "epoch": 0.31, + "grad_norm": 1.245691753286502, + "learning_rate": 1.6282584106955747e-05, + "loss": 0.8767, + "step": 1895 + }, + { + "epoch": 0.31, + "grad_norm": 1.442912440344975, + "learning_rate": 1.6278522487179867e-05, + "loss": 0.9146, + "step": 1896 + }, + { + "epoch": 0.31, + "grad_norm": 1.5031029190555176, + "learning_rate": 1.6274459156961388e-05, + "loss": 0.8824, + "step": 1897 + }, + { + "epoch": 0.31, + "grad_norm": 1.3614520676014332, + "learning_rate": 1.6270394117407277e-05, + "loss": 0.8791, + "step": 1898 + }, + { + "epoch": 0.31, + "grad_norm": 1.279253632054858, + "learning_rate": 1.626632736962497e-05, + "loss": 0.9005, + "step": 1899 + }, + { + "epoch": 0.31, + "grad_norm": 1.4344592431206322, + "learning_rate": 1.6262258914722352e-05, + "loss": 0.9292, + "step": 1900 + }, + { + "epoch": 0.31, + "grad_norm": 1.6104961160353812, + "learning_rate": 1.6258188753807783e-05, + "loss": 0.916, + "step": 1901 + }, + { + "epoch": 0.31, + "grad_norm": 1.3992711005489513, + "learning_rate": 1.625411688799009e-05, + "loss": 0.873, + "step": 1902 + }, + { + "epoch": 0.31, + "grad_norm": 1.5572392235306647, + "learning_rate": 1.6250043318378563e-05, + "loss": 0.8436, + "step": 1903 + }, + { + "epoch": 0.31, + "grad_norm": 1.313176324273467, + "learning_rate": 1.6245968046082952e-05, + "loss": 0.8471, + "step": 1904 + }, + { + "epoch": 0.31, + "grad_norm": 1.1381759790585577, + "learning_rate": 1.6241891072213475e-05, + "loss": 0.3717, + "step": 1905 + }, + { + "epoch": 0.31, + "grad_norm": 1.3214802402811656, + "learning_rate": 1.6237812397880806e-05, + "loss": 0.9464, + "step": 1906 + }, + { + "epoch": 0.31, + "grad_norm": 1.763875373979312, + "learning_rate": 1.62337320241961e-05, + "loss": 0.8829, + "step": 1907 + }, + { + "epoch": 0.31, + "grad_norm": 0.8674387589304035, + "learning_rate": 1.6229649952270955e-05, + "loss": 0.3582, + "step": 1908 + }, + { + "epoch": 0.31, + "grad_norm": 1.3696017114723968, + "learning_rate": 1.622556618321744e-05, + "loss": 0.8835, + "step": 1909 + }, + { + "epoch": 0.31, + "grad_norm": 1.456235674170605, + "learning_rate": 1.622148071814809e-05, + "loss": 1.0319, + "step": 1910 + }, + { + "epoch": 0.31, + "grad_norm": 1.2798709935201869, + "learning_rate": 1.6217393558175897e-05, + "loss": 0.8944, + "step": 1911 + }, + { + "epoch": 0.31, + "grad_norm": 1.393940428120887, + "learning_rate": 1.6213304704414314e-05, + "loss": 0.9134, + "step": 1912 + }, + { + "epoch": 0.31, + "grad_norm": 1.4409895238945154, + "learning_rate": 1.620921415797726e-05, + "loss": 0.766, + "step": 1913 + }, + { + "epoch": 0.31, + "grad_norm": 1.6585663289863886, + "learning_rate": 1.620512191997911e-05, + "loss": 0.8966, + "step": 1914 + }, + { + "epoch": 0.31, + "grad_norm": 1.2650912662057499, + "learning_rate": 1.6201027991534705e-05, + "loss": 0.8632, + "step": 1915 + }, + { + "epoch": 0.31, + "grad_norm": 1.4393435136270214, + "learning_rate": 1.6196932373759337e-05, + "loss": 0.9465, + "step": 1916 + }, + { + "epoch": 0.31, + "grad_norm": 1.334636792244241, + "learning_rate": 1.6192835067768776e-05, + "loss": 0.8987, + "step": 1917 + }, + { + "epoch": 0.31, + "grad_norm": 1.2041960116127077, + "learning_rate": 1.6188736074679237e-05, + "loss": 0.8552, + "step": 1918 + }, + { + "epoch": 0.31, + "grad_norm": 0.8990943559996785, + "learning_rate": 1.6184635395607395e-05, + "loss": 0.3548, + "step": 1919 + }, + { + "epoch": 0.31, + "grad_norm": 2.1427925275564728, + "learning_rate": 1.6180533031670395e-05, + "loss": 0.9072, + "step": 1920 + }, + { + "epoch": 0.31, + "grad_norm": 1.4672609933396465, + "learning_rate": 1.6176428983985825e-05, + "loss": 0.9374, + "step": 1921 + }, + { + "epoch": 0.31, + "grad_norm": 1.4538621301839003, + "learning_rate": 1.6172323253671745e-05, + "loss": 0.8838, + "step": 1922 + }, + { + "epoch": 0.31, + "grad_norm": 1.5457096936498687, + "learning_rate": 1.6168215841846673e-05, + "loss": 0.9056, + "step": 1923 + }, + { + "epoch": 0.31, + "grad_norm": 1.3066342115601206, + "learning_rate": 1.616410674962958e-05, + "loss": 0.923, + "step": 1924 + }, + { + "epoch": 0.31, + "grad_norm": 1.3231451878766702, + "learning_rate": 1.615999597813989e-05, + "loss": 0.9614, + "step": 1925 + }, + { + "epoch": 0.31, + "grad_norm": 1.4278456217976934, + "learning_rate": 1.6155883528497492e-05, + "loss": 0.8681, + "step": 1926 + }, + { + "epoch": 0.31, + "grad_norm": 1.4635257055446242, + "learning_rate": 1.6151769401822735e-05, + "loss": 0.9694, + "step": 1927 + }, + { + "epoch": 0.31, + "grad_norm": 1.4383715502548073, + "learning_rate": 1.614765359923642e-05, + "loss": 0.8516, + "step": 1928 + }, + { + "epoch": 0.31, + "grad_norm": 1.4921943030277178, + "learning_rate": 1.6143536121859805e-05, + "loss": 0.8864, + "step": 1929 + }, + { + "epoch": 0.31, + "grad_norm": 1.3852743255540962, + "learning_rate": 1.61394169708146e-05, + "loss": 0.8516, + "step": 1930 + }, + { + "epoch": 0.31, + "grad_norm": 1.460610669980455, + "learning_rate": 1.613529614722298e-05, + "loss": 0.8586, + "step": 1931 + }, + { + "epoch": 0.31, + "grad_norm": 1.2469697667531376, + "learning_rate": 1.6131173652207565e-05, + "loss": 0.8835, + "step": 1932 + }, + { + "epoch": 0.31, + "grad_norm": 1.551797209732303, + "learning_rate": 1.6127049486891442e-05, + "loss": 0.8796, + "step": 1933 + }, + { + "epoch": 0.31, + "grad_norm": 1.4913047406285018, + "learning_rate": 1.612292365239815e-05, + "loss": 0.9068, + "step": 1934 + }, + { + "epoch": 0.31, + "grad_norm": 1.4759843441186518, + "learning_rate": 1.611879614985167e-05, + "loss": 0.9129, + "step": 1935 + }, + { + "epoch": 0.31, + "grad_norm": 1.7478676805340232, + "learning_rate": 1.6114666980376455e-05, + "loss": 0.9263, + "step": 1936 + }, + { + "epoch": 0.31, + "grad_norm": 1.3796207127763283, + "learning_rate": 1.6110536145097407e-05, + "loss": 0.8591, + "step": 1937 + }, + { + "epoch": 0.31, + "grad_norm": 1.4062992464820359, + "learning_rate": 1.6106403645139866e-05, + "loss": 0.8777, + "step": 1938 + }, + { + "epoch": 0.31, + "grad_norm": 1.3156829192342445, + "learning_rate": 1.6102269481629654e-05, + "loss": 0.8738, + "step": 1939 + }, + { + "epoch": 0.31, + "grad_norm": 1.524495203912162, + "learning_rate": 1.6098133655693027e-05, + "loss": 0.9024, + "step": 1940 + }, + { + "epoch": 0.31, + "grad_norm": 1.371689653367193, + "learning_rate": 1.6093996168456694e-05, + "loss": 0.9143, + "step": 1941 + }, + { + "epoch": 0.31, + "grad_norm": 1.2592867894907427, + "learning_rate": 1.6089857021047822e-05, + "loss": 0.8772, + "step": 1942 + }, + { + "epoch": 0.31, + "grad_norm": 1.5089332826443895, + "learning_rate": 1.608571621459403e-05, + "loss": 0.914, + "step": 1943 + }, + { + "epoch": 0.31, + "grad_norm": 1.6946082399306537, + "learning_rate": 1.6081573750223388e-05, + "loss": 0.9012, + "step": 1944 + }, + { + "epoch": 0.31, + "grad_norm": 1.4515750916543633, + "learning_rate": 1.607742962906442e-05, + "loss": 0.915, + "step": 1945 + }, + { + "epoch": 0.31, + "grad_norm": 1.4306076097254299, + "learning_rate": 1.6073283852246087e-05, + "loss": 0.9183, + "step": 1946 + }, + { + "epoch": 0.31, + "grad_norm": 1.2846640967756937, + "learning_rate": 1.6069136420897827e-05, + "loss": 0.8739, + "step": 1947 + }, + { + "epoch": 0.31, + "grad_norm": 1.3388333354596353, + "learning_rate": 1.606498733614951e-05, + "loss": 0.93, + "step": 1948 + }, + { + "epoch": 0.31, + "grad_norm": 1.4542791790978111, + "learning_rate": 1.606083659913146e-05, + "loss": 0.9383, + "step": 1949 + }, + { + "epoch": 0.31, + "grad_norm": 1.257295988562237, + "learning_rate": 1.605668421097445e-05, + "loss": 0.9528, + "step": 1950 + }, + { + "epoch": 0.31, + "grad_norm": 1.6888426714181268, + "learning_rate": 1.605253017280971e-05, + "loss": 0.9161, + "step": 1951 + }, + { + "epoch": 0.31, + "grad_norm": 1.3883795970516037, + "learning_rate": 1.6048374485768912e-05, + "loss": 0.9688, + "step": 1952 + }, + { + "epoch": 0.31, + "grad_norm": 1.5040407064946615, + "learning_rate": 1.604421715098418e-05, + "loss": 0.9332, + "step": 1953 + }, + { + "epoch": 0.31, + "grad_norm": 1.346406664153787, + "learning_rate": 1.6040058169588086e-05, + "loss": 0.9112, + "step": 1954 + }, + { + "epoch": 0.31, + "grad_norm": 1.6776534994702716, + "learning_rate": 1.603589754271365e-05, + "loss": 0.9081, + "step": 1955 + }, + { + "epoch": 0.32, + "grad_norm": 1.330353867940767, + "learning_rate": 1.603173527149434e-05, + "loss": 0.8344, + "step": 1956 + }, + { + "epoch": 0.32, + "grad_norm": 1.2980454377577682, + "learning_rate": 1.602757135706408e-05, + "loss": 0.8686, + "step": 1957 + }, + { + "epoch": 0.32, + "grad_norm": 1.4192877721796435, + "learning_rate": 1.602340580055723e-05, + "loss": 0.9823, + "step": 1958 + }, + { + "epoch": 0.32, + "grad_norm": 1.3824046780890138, + "learning_rate": 1.6019238603108605e-05, + "loss": 0.8438, + "step": 1959 + }, + { + "epoch": 0.32, + "grad_norm": 1.5052048253862187, + "learning_rate": 1.6015069765853462e-05, + "loss": 0.8439, + "step": 1960 + }, + { + "epoch": 0.32, + "grad_norm": 1.7751854084455048, + "learning_rate": 1.6010899289927513e-05, + "loss": 0.9124, + "step": 1961 + }, + { + "epoch": 0.32, + "grad_norm": 1.6650943998610765, + "learning_rate": 1.60067271764669e-05, + "loss": 0.8793, + "step": 1962 + }, + { + "epoch": 0.32, + "grad_norm": 1.224744920058577, + "learning_rate": 1.600255342660823e-05, + "loss": 0.8953, + "step": 1963 + }, + { + "epoch": 0.32, + "grad_norm": 1.7133515574027438, + "learning_rate": 1.5998378041488547e-05, + "loss": 0.9532, + "step": 1964 + }, + { + "epoch": 0.32, + "grad_norm": 1.2283299082852681, + "learning_rate": 1.5994201022245338e-05, + "loss": 0.8681, + "step": 1965 + }, + { + "epoch": 0.32, + "grad_norm": 1.1344981198319442, + "learning_rate": 1.599002237001654e-05, + "loss": 0.9084, + "step": 1966 + }, + { + "epoch": 0.32, + "grad_norm": 1.3177238794130328, + "learning_rate": 1.598584208594053e-05, + "loss": 0.8337, + "step": 1967 + }, + { + "epoch": 0.32, + "grad_norm": 1.397189333184374, + "learning_rate": 1.5981660171156136e-05, + "loss": 0.8724, + "step": 1968 + }, + { + "epoch": 0.32, + "grad_norm": 1.3013798400330248, + "learning_rate": 1.5977476626802624e-05, + "loss": 0.9118, + "step": 1969 + }, + { + "epoch": 0.32, + "grad_norm": 1.2816872980786425, + "learning_rate": 1.5973291454019713e-05, + "loss": 0.9026, + "step": 1970 + }, + { + "epoch": 0.32, + "grad_norm": 1.388597238254469, + "learning_rate": 1.5969104653947552e-05, + "loss": 0.9196, + "step": 1971 + }, + { + "epoch": 0.32, + "grad_norm": 1.4010803068574738, + "learning_rate": 1.5964916227726747e-05, + "loss": 0.8973, + "step": 1972 + }, + { + "epoch": 0.32, + "grad_norm": 1.320947376654923, + "learning_rate": 1.5960726176498334e-05, + "loss": 0.8945, + "step": 1973 + }, + { + "epoch": 0.32, + "grad_norm": 1.3197637685973724, + "learning_rate": 1.5956534501403808e-05, + "loss": 0.8162, + "step": 1974 + }, + { + "epoch": 0.32, + "grad_norm": 1.1748864370586247, + "learning_rate": 1.5952341203585086e-05, + "loss": 0.9614, + "step": 1975 + }, + { + "epoch": 0.32, + "grad_norm": 1.427350448593724, + "learning_rate": 1.594814628418454e-05, + "loss": 0.9338, + "step": 1976 + }, + { + "epoch": 0.32, + "grad_norm": 1.4361101187369865, + "learning_rate": 1.5943949744344994e-05, + "loss": 0.8075, + "step": 1977 + }, + { + "epoch": 0.32, + "grad_norm": 1.135290864732535, + "learning_rate": 1.5939751585209685e-05, + "loss": 0.8992, + "step": 1978 + }, + { + "epoch": 0.32, + "grad_norm": 1.4497168782491758, + "learning_rate": 1.5935551807922315e-05, + "loss": 0.8677, + "step": 1979 + }, + { + "epoch": 0.32, + "grad_norm": 1.3081699111973126, + "learning_rate": 1.593135041362702e-05, + "loss": 0.8933, + "step": 1980 + }, + { + "epoch": 0.32, + "grad_norm": 1.6592582405629155, + "learning_rate": 1.592714740346837e-05, + "loss": 0.8719, + "step": 1981 + }, + { + "epoch": 0.32, + "grad_norm": 1.4928880914272047, + "learning_rate": 1.592294277859139e-05, + "loss": 0.8737, + "step": 1982 + }, + { + "epoch": 0.32, + "grad_norm": 1.6162576501555364, + "learning_rate": 1.5918736540141525e-05, + "loss": 0.917, + "step": 1983 + }, + { + "epoch": 0.32, + "grad_norm": 1.5205918670432066, + "learning_rate": 1.5914528689264677e-05, + "loss": 0.8819, + "step": 1984 + }, + { + "epoch": 0.32, + "grad_norm": 1.209985270959786, + "learning_rate": 1.591031922710718e-05, + "loss": 0.8604, + "step": 1985 + }, + { + "epoch": 0.32, + "grad_norm": 1.3971386917149347, + "learning_rate": 1.5906108154815805e-05, + "loss": 0.9026, + "step": 1986 + }, + { + "epoch": 0.32, + "grad_norm": 1.4232462202789518, + "learning_rate": 1.5901895473537768e-05, + "loss": 0.8754, + "step": 1987 + }, + { + "epoch": 0.32, + "grad_norm": 1.3081648116482805, + "learning_rate": 1.5897681184420716e-05, + "loss": 0.9134, + "step": 1988 + }, + { + "epoch": 0.32, + "grad_norm": 1.4026718921257686, + "learning_rate": 1.589346528861274e-05, + "loss": 0.8925, + "step": 1989 + }, + { + "epoch": 0.32, + "grad_norm": 1.4225940104700663, + "learning_rate": 1.588924778726236e-05, + "loss": 0.904, + "step": 1990 + }, + { + "epoch": 0.32, + "grad_norm": 1.3410152465859781, + "learning_rate": 1.588502868151855e-05, + "loss": 0.9033, + "step": 1991 + }, + { + "epoch": 0.32, + "grad_norm": 0.8958031573961167, + "learning_rate": 1.5880807972530705e-05, + "loss": 0.3577, + "step": 1992 + }, + { + "epoch": 0.32, + "grad_norm": 1.531633992486013, + "learning_rate": 1.587658566144866e-05, + "loss": 0.8827, + "step": 1993 + }, + { + "epoch": 0.32, + "grad_norm": 1.375815286653979, + "learning_rate": 1.5872361749422694e-05, + "loss": 0.9018, + "step": 1994 + }, + { + "epoch": 0.32, + "grad_norm": 1.8890854995027901, + "learning_rate": 1.5868136237603516e-05, + "loss": 0.9174, + "step": 1995 + }, + { + "epoch": 0.32, + "grad_norm": 1.3732840450533814, + "learning_rate": 1.5863909127142268e-05, + "loss": 0.9104, + "step": 1996 + }, + { + "epoch": 0.32, + "grad_norm": 1.4159055114551702, + "learning_rate": 1.585968041919054e-05, + "loss": 0.9233, + "step": 1997 + }, + { + "epoch": 0.32, + "grad_norm": 1.4254468532238582, + "learning_rate": 1.585545011490034e-05, + "loss": 0.8491, + "step": 1998 + }, + { + "epoch": 0.32, + "grad_norm": 1.3744619905952045, + "learning_rate": 1.5851218215424115e-05, + "loss": 0.7947, + "step": 1999 + }, + { + "epoch": 0.32, + "grad_norm": 1.2951859119407894, + "learning_rate": 1.5846984721914765e-05, + "loss": 0.9306, + "step": 2000 + }, + { + "epoch": 0.32, + "grad_norm": 1.2681900657433085, + "learning_rate": 1.5842749635525602e-05, + "loss": 0.9174, + "step": 2001 + }, + { + "epoch": 0.32, + "grad_norm": 1.4291576663572392, + "learning_rate": 1.5838512957410384e-05, + "loss": 0.9311, + "step": 2002 + }, + { + "epoch": 0.32, + "grad_norm": 1.618749993314614, + "learning_rate": 1.5834274688723293e-05, + "loss": 0.8007, + "step": 2003 + }, + { + "epoch": 0.32, + "grad_norm": 1.349696689271039, + "learning_rate": 1.583003483061896e-05, + "loss": 0.9781, + "step": 2004 + }, + { + "epoch": 0.32, + "grad_norm": 1.5430955086319913, + "learning_rate": 1.5825793384252432e-05, + "loss": 0.93, + "step": 2005 + }, + { + "epoch": 0.32, + "grad_norm": 1.3214809647712604, + "learning_rate": 1.582155035077919e-05, + "loss": 0.8707, + "step": 2006 + }, + { + "epoch": 0.32, + "grad_norm": 1.3105344398664698, + "learning_rate": 1.5817305731355168e-05, + "loss": 0.96, + "step": 2007 + }, + { + "epoch": 0.32, + "grad_norm": 1.526164995153077, + "learning_rate": 1.5813059527136708e-05, + "loss": 0.8891, + "step": 2008 + }, + { + "epoch": 0.32, + "grad_norm": 1.4207678017094645, + "learning_rate": 1.5808811739280592e-05, + "loss": 0.8665, + "step": 2009 + }, + { + "epoch": 0.32, + "grad_norm": 1.6024967056664947, + "learning_rate": 1.5804562368944042e-05, + "loss": 0.9544, + "step": 2010 + }, + { + "epoch": 0.32, + "grad_norm": 1.300661571971281, + "learning_rate": 1.5800311417284695e-05, + "loss": 0.9215, + "step": 2011 + }, + { + "epoch": 0.32, + "grad_norm": 1.5023343404831306, + "learning_rate": 1.579605888546063e-05, + "loss": 0.976, + "step": 2012 + }, + { + "epoch": 0.32, + "grad_norm": 1.366843390756666, + "learning_rate": 1.579180477463036e-05, + "loss": 0.9197, + "step": 2013 + }, + { + "epoch": 0.32, + "grad_norm": 1.637873498302596, + "learning_rate": 1.5787549085952816e-05, + "loss": 0.9639, + "step": 2014 + }, + { + "epoch": 0.32, + "grad_norm": 1.2562467710254221, + "learning_rate": 1.5783291820587366e-05, + "loss": 0.889, + "step": 2015 + }, + { + "epoch": 0.32, + "grad_norm": 1.4917293164702217, + "learning_rate": 1.5779032979693808e-05, + "loss": 0.9411, + "step": 2016 + }, + { + "epoch": 0.32, + "grad_norm": 1.3295242648530172, + "learning_rate": 1.5774772564432365e-05, + "loss": 0.8543, + "step": 2017 + }, + { + "epoch": 0.33, + "grad_norm": 1.3157751964905036, + "learning_rate": 1.577051057596369e-05, + "loss": 0.792, + "step": 2018 + }, + { + "epoch": 0.33, + "grad_norm": 1.1483011026868462, + "learning_rate": 1.5766247015448875e-05, + "loss": 0.9328, + "step": 2019 + }, + { + "epoch": 0.33, + "grad_norm": 0.9090602234923024, + "learning_rate": 1.576198188404942e-05, + "loss": 0.3851, + "step": 2020 + }, + { + "epoch": 0.33, + "grad_norm": 1.417335480499871, + "learning_rate": 1.5757715182927273e-05, + "loss": 0.8626, + "step": 2021 + }, + { + "epoch": 0.33, + "grad_norm": 1.3146688523759742, + "learning_rate": 1.5753446913244798e-05, + "loss": 0.8823, + "step": 2022 + }, + { + "epoch": 0.33, + "grad_norm": 1.4287771501666913, + "learning_rate": 1.5749177076164788e-05, + "loss": 0.8827, + "step": 2023 + }, + { + "epoch": 0.33, + "grad_norm": 1.4310137266084093, + "learning_rate": 1.5744905672850467e-05, + "loss": 0.875, + "step": 2024 + }, + { + "epoch": 0.33, + "grad_norm": 1.315271893080312, + "learning_rate": 1.5740632704465478e-05, + "loss": 0.853, + "step": 2025 + }, + { + "epoch": 0.33, + "grad_norm": 1.9589015709291007, + "learning_rate": 1.5736358172173903e-05, + "loss": 0.9599, + "step": 2026 + }, + { + "epoch": 0.33, + "grad_norm": 1.461253692329211, + "learning_rate": 1.5732082077140235e-05, + "loss": 0.9402, + "step": 2027 + }, + { + "epoch": 0.33, + "grad_norm": 0.8454234299989747, + "learning_rate": 1.5727804420529405e-05, + "loss": 0.3236, + "step": 2028 + }, + { + "epoch": 0.33, + "grad_norm": 1.5518179378300554, + "learning_rate": 1.5723525203506758e-05, + "loss": 0.8517, + "step": 2029 + }, + { + "epoch": 0.33, + "grad_norm": 1.4448462481686997, + "learning_rate": 1.5719244427238086e-05, + "loss": 0.8893, + "step": 2030 + }, + { + "epoch": 0.33, + "grad_norm": 1.4342007410924213, + "learning_rate": 1.571496209288957e-05, + "loss": 0.895, + "step": 2031 + }, + { + "epoch": 0.33, + "grad_norm": 1.3268758677759624, + "learning_rate": 1.571067820162785e-05, + "loss": 0.868, + "step": 2032 + }, + { + "epoch": 0.33, + "grad_norm": 1.3988853478704937, + "learning_rate": 1.5706392754619973e-05, + "loss": 0.9087, + "step": 2033 + }, + { + "epoch": 0.33, + "grad_norm": 1.379079422194281, + "learning_rate": 1.5702105753033415e-05, + "loss": 0.8396, + "step": 2034 + }, + { + "epoch": 0.33, + "grad_norm": 1.2855426445157951, + "learning_rate": 1.5697817198036066e-05, + "loss": 0.8341, + "step": 2035 + }, + { + "epoch": 0.33, + "grad_norm": 1.3832603203605502, + "learning_rate": 1.569352709079625e-05, + "loss": 0.8867, + "step": 2036 + }, + { + "epoch": 0.33, + "grad_norm": 1.3716137618580329, + "learning_rate": 1.5689235432482715e-05, + "loss": 0.8929, + "step": 2037 + }, + { + "epoch": 0.33, + "grad_norm": 1.83297503466964, + "learning_rate": 1.5684942224264622e-05, + "loss": 0.8686, + "step": 2038 + }, + { + "epoch": 0.33, + "grad_norm": 1.5586894365686241, + "learning_rate": 1.568064746731156e-05, + "loss": 0.9573, + "step": 2039 + }, + { + "epoch": 0.33, + "grad_norm": 1.5354239397855813, + "learning_rate": 1.567635116279354e-05, + "loss": 0.8918, + "step": 2040 + }, + { + "epoch": 0.33, + "grad_norm": 1.64151972204166, + "learning_rate": 1.5672053311880994e-05, + "loss": 0.9438, + "step": 2041 + }, + { + "epoch": 0.33, + "grad_norm": 1.3953362713846793, + "learning_rate": 1.5667753915744776e-05, + "loss": 0.938, + "step": 2042 + }, + { + "epoch": 0.33, + "grad_norm": 1.3405617411122808, + "learning_rate": 1.566345297555616e-05, + "loss": 0.811, + "step": 2043 + }, + { + "epoch": 0.33, + "grad_norm": 1.359147699579635, + "learning_rate": 1.5659150492486833e-05, + "loss": 0.8493, + "step": 2044 + }, + { + "epoch": 0.33, + "grad_norm": 1.620516542717152, + "learning_rate": 1.565484646770892e-05, + "loss": 0.795, + "step": 2045 + }, + { + "epoch": 0.33, + "grad_norm": 1.4572061792913213, + "learning_rate": 1.5650540902394954e-05, + "loss": 0.8851, + "step": 2046 + }, + { + "epoch": 0.33, + "grad_norm": 1.8338082003450566, + "learning_rate": 1.564623379771789e-05, + "loss": 0.9402, + "step": 2047 + }, + { + "epoch": 0.33, + "grad_norm": 1.5077881514604323, + "learning_rate": 1.5641925154851096e-05, + "loss": 0.8017, + "step": 2048 + }, + { + "epoch": 0.33, + "grad_norm": 1.5976108419825408, + "learning_rate": 1.5637614974968372e-05, + "loss": 0.9932, + "step": 2049 + }, + { + "epoch": 0.33, + "grad_norm": 1.3321729798580437, + "learning_rate": 1.5633303259243927e-05, + "loss": 0.9171, + "step": 2050 + }, + { + "epoch": 0.33, + "grad_norm": 1.123374686289018, + "learning_rate": 1.562899000885239e-05, + "loss": 0.8792, + "step": 2051 + }, + { + "epoch": 0.33, + "grad_norm": 1.6622765319182398, + "learning_rate": 1.5624675224968808e-05, + "loss": 0.8477, + "step": 2052 + }, + { + "epoch": 0.33, + "grad_norm": 1.386391314278079, + "learning_rate": 1.562035890876865e-05, + "loss": 0.867, + "step": 2053 + }, + { + "epoch": 0.33, + "grad_norm": 1.3491757996520053, + "learning_rate": 1.5616041061427805e-05, + "loss": 0.8796, + "step": 2054 + }, + { + "epoch": 0.33, + "grad_norm": 1.496533532780325, + "learning_rate": 1.5611721684122564e-05, + "loss": 0.8806, + "step": 2055 + }, + { + "epoch": 0.33, + "grad_norm": 1.5834088320571502, + "learning_rate": 1.5607400778029648e-05, + "loss": 0.8817, + "step": 2056 + }, + { + "epoch": 0.33, + "grad_norm": 1.3630594247955332, + "learning_rate": 1.5603078344326194e-05, + "loss": 0.8974, + "step": 2057 + }, + { + "epoch": 0.33, + "grad_norm": 1.3372602362896633, + "learning_rate": 1.5598754384189745e-05, + "loss": 0.9163, + "step": 2058 + }, + { + "epoch": 0.33, + "grad_norm": 1.4580252199864865, + "learning_rate": 1.5594428898798272e-05, + "loss": 0.9119, + "step": 2059 + }, + { + "epoch": 0.33, + "grad_norm": 1.4017634001141213, + "learning_rate": 1.559010188933016e-05, + "loss": 0.8673, + "step": 2060 + }, + { + "epoch": 0.33, + "grad_norm": 1.6540001126748292, + "learning_rate": 1.5585773356964192e-05, + "loss": 0.876, + "step": 2061 + }, + { + "epoch": 0.33, + "grad_norm": 1.3125592754807214, + "learning_rate": 1.55814433028796e-05, + "loss": 0.866, + "step": 2062 + }, + { + "epoch": 0.33, + "grad_norm": 1.4900815461663042, + "learning_rate": 1.557711172825599e-05, + "loss": 0.8275, + "step": 2063 + }, + { + "epoch": 0.33, + "grad_norm": 1.3004657746564787, + "learning_rate": 1.5572778634273417e-05, + "loss": 0.8604, + "step": 2064 + }, + { + "epoch": 0.33, + "grad_norm": 1.3775636131276412, + "learning_rate": 1.556844402211233e-05, + "loss": 0.9643, + "step": 2065 + }, + { + "epoch": 0.33, + "grad_norm": 1.4253117676737015, + "learning_rate": 1.556410789295359e-05, + "loss": 0.9308, + "step": 2066 + }, + { + "epoch": 0.33, + "grad_norm": 1.4748849308158234, + "learning_rate": 1.5559770247978488e-05, + "loss": 0.8501, + "step": 2067 + }, + { + "epoch": 0.33, + "grad_norm": 1.5464628867544787, + "learning_rate": 1.5555431088368716e-05, + "loss": 0.9305, + "step": 2068 + }, + { + "epoch": 0.33, + "grad_norm": 1.4544607798623925, + "learning_rate": 1.5551090415306377e-05, + "loss": 0.9296, + "step": 2069 + }, + { + "epoch": 0.33, + "grad_norm": 1.2954031861049777, + "learning_rate": 1.554674822997399e-05, + "loss": 0.8785, + "step": 2070 + }, + { + "epoch": 0.33, + "grad_norm": 1.4661568230617419, + "learning_rate": 1.5542404533554493e-05, + "loss": 0.9481, + "step": 2071 + }, + { + "epoch": 0.33, + "grad_norm": 1.3282663683825553, + "learning_rate": 1.553805932723122e-05, + "loss": 0.8221, + "step": 2072 + }, + { + "epoch": 0.33, + "grad_norm": 1.6164971504445893, + "learning_rate": 1.5533712612187933e-05, + "loss": 0.892, + "step": 2073 + }, + { + "epoch": 0.33, + "grad_norm": 1.3477981043530414, + "learning_rate": 1.5529364389608788e-05, + "loss": 0.8857, + "step": 2074 + }, + { + "epoch": 0.33, + "grad_norm": 1.2017522890499357, + "learning_rate": 1.552501466067837e-05, + "loss": 0.8632, + "step": 2075 + }, + { + "epoch": 0.33, + "grad_norm": 1.3419983933224382, + "learning_rate": 1.552066342658166e-05, + "loss": 0.871, + "step": 2076 + }, + { + "epoch": 0.33, + "grad_norm": 1.3984443235497255, + "learning_rate": 1.551631068850405e-05, + "loss": 0.8985, + "step": 2077 + }, + { + "epoch": 0.33, + "grad_norm": 1.3442374526304328, + "learning_rate": 1.5511956447631355e-05, + "loss": 0.899, + "step": 2078 + }, + { + "epoch": 0.33, + "grad_norm": 1.4140398118533117, + "learning_rate": 1.5507600705149782e-05, + "loss": 0.838, + "step": 2079 + }, + { + "epoch": 0.34, + "grad_norm": 1.40675016819898, + "learning_rate": 1.5503243462245963e-05, + "loss": 0.7899, + "step": 2080 + }, + { + "epoch": 0.34, + "grad_norm": 1.4594952960282812, + "learning_rate": 1.5498884720106925e-05, + "loss": 0.8555, + "step": 2081 + }, + { + "epoch": 0.34, + "grad_norm": 1.3959283942411733, + "learning_rate": 1.549452447992011e-05, + "loss": 0.9066, + "step": 2082 + }, + { + "epoch": 0.34, + "grad_norm": 1.3857328335296446, + "learning_rate": 1.5490162742873372e-05, + "loss": 0.8954, + "step": 2083 + }, + { + "epoch": 0.34, + "grad_norm": 1.498538007015675, + "learning_rate": 1.5485799510154965e-05, + "loss": 0.9087, + "step": 2084 + }, + { + "epoch": 0.34, + "grad_norm": 1.4580167360026324, + "learning_rate": 1.5481434782953557e-05, + "loss": 0.8489, + "step": 2085 + }, + { + "epoch": 0.34, + "grad_norm": 0.8358803422050336, + "learning_rate": 1.5477068562458212e-05, + "loss": 0.3442, + "step": 2086 + }, + { + "epoch": 0.34, + "grad_norm": 1.482331657051731, + "learning_rate": 1.5472700849858417e-05, + "loss": 0.8924, + "step": 2087 + }, + { + "epoch": 0.34, + "grad_norm": 1.3434217899117815, + "learning_rate": 1.5468331646344056e-05, + "loss": 0.9145, + "step": 2088 + }, + { + "epoch": 0.34, + "grad_norm": 1.3673883474505215, + "learning_rate": 1.5463960953105416e-05, + "loss": 0.8984, + "step": 2089 + }, + { + "epoch": 0.34, + "grad_norm": 1.3328144128814778, + "learning_rate": 1.54595887713332e-05, + "loss": 0.8554, + "step": 2090 + }, + { + "epoch": 0.34, + "grad_norm": 0.8019379037164139, + "learning_rate": 1.5455215102218505e-05, + "loss": 0.3203, + "step": 2091 + }, + { + "epoch": 0.34, + "grad_norm": 1.2745206079486524, + "learning_rate": 1.5450839946952845e-05, + "loss": 0.918, + "step": 2092 + }, + { + "epoch": 0.34, + "grad_norm": 1.7349121136265553, + "learning_rate": 1.544646330672813e-05, + "loss": 0.8086, + "step": 2093 + }, + { + "epoch": 0.34, + "grad_norm": 1.3974543204107575, + "learning_rate": 1.544208518273668e-05, + "loss": 0.9076, + "step": 2094 + }, + { + "epoch": 0.34, + "grad_norm": 1.388588513851327, + "learning_rate": 1.5437705576171208e-05, + "loss": 0.9572, + "step": 2095 + }, + { + "epoch": 0.34, + "grad_norm": 1.3782447857493447, + "learning_rate": 1.543332448822485e-05, + "loss": 0.881, + "step": 2096 + }, + { + "epoch": 0.34, + "grad_norm": 1.206195458602207, + "learning_rate": 1.542894192009113e-05, + "loss": 0.8796, + "step": 2097 + }, + { + "epoch": 0.34, + "grad_norm": 0.8772987717871875, + "learning_rate": 1.5424557872963984e-05, + "loss": 0.3326, + "step": 2098 + }, + { + "epoch": 0.34, + "grad_norm": 1.5106064008610238, + "learning_rate": 1.5420172348037742e-05, + "loss": 0.8839, + "step": 2099 + }, + { + "epoch": 0.34, + "grad_norm": 1.3602666095789804, + "learning_rate": 1.5415785346507143e-05, + "loss": 0.8538, + "step": 2100 + }, + { + "epoch": 0.34, + "grad_norm": 1.655156785798686, + "learning_rate": 1.5411396869567332e-05, + "loss": 0.8745, + "step": 2101 + }, + { + "epoch": 0.34, + "grad_norm": 1.2886400446280795, + "learning_rate": 1.5407006918413843e-05, + "loss": 0.9232, + "step": 2102 + }, + { + "epoch": 0.34, + "grad_norm": 1.3390862875503675, + "learning_rate": 1.540261549424263e-05, + "loss": 0.8931, + "step": 2103 + }, + { + "epoch": 0.34, + "grad_norm": 1.323889844833962, + "learning_rate": 1.539822259825003e-05, + "loss": 0.9047, + "step": 2104 + }, + { + "epoch": 0.34, + "grad_norm": 1.5010173639424311, + "learning_rate": 1.539382823163279e-05, + "loss": 0.9984, + "step": 2105 + }, + { + "epoch": 0.34, + "grad_norm": 1.5083787773814015, + "learning_rate": 1.538943239558806e-05, + "loss": 0.8741, + "step": 2106 + }, + { + "epoch": 0.34, + "grad_norm": 1.3395065711909675, + "learning_rate": 1.5385035091313382e-05, + "loss": 0.8379, + "step": 2107 + }, + { + "epoch": 0.34, + "grad_norm": 1.4325983485703977, + "learning_rate": 1.538063632000671e-05, + "loss": 0.9036, + "step": 2108 + }, + { + "epoch": 0.34, + "grad_norm": 1.4259969260769814, + "learning_rate": 1.5376236082866384e-05, + "loss": 0.8324, + "step": 2109 + }, + { + "epoch": 0.34, + "grad_norm": 1.424697088188185, + "learning_rate": 1.5371834381091152e-05, + "loss": 0.8851, + "step": 2110 + }, + { + "epoch": 0.34, + "grad_norm": 1.4600805597194675, + "learning_rate": 1.5367431215880156e-05, + "loss": 0.9204, + "step": 2111 + }, + { + "epoch": 0.34, + "grad_norm": 1.582653588464009, + "learning_rate": 1.536302658843295e-05, + "loss": 0.9067, + "step": 2112 + }, + { + "epoch": 0.34, + "grad_norm": 1.4688398563260845, + "learning_rate": 1.5358620499949464e-05, + "loss": 0.8702, + "step": 2113 + }, + { + "epoch": 0.34, + "grad_norm": 1.5876682871882286, + "learning_rate": 1.5354212951630043e-05, + "loss": 0.9259, + "step": 2114 + }, + { + "epoch": 0.34, + "grad_norm": 1.5684207245714736, + "learning_rate": 1.5349803944675424e-05, + "loss": 0.9044, + "step": 2115 + }, + { + "epoch": 0.34, + "grad_norm": 1.2026772594859434, + "learning_rate": 1.5345393480286744e-05, + "loss": 0.8738, + "step": 2116 + }, + { + "epoch": 0.34, + "grad_norm": 1.4195604887655733, + "learning_rate": 1.5340981559665533e-05, + "loss": 0.9376, + "step": 2117 + }, + { + "epoch": 0.34, + "grad_norm": 1.7199205043335877, + "learning_rate": 1.5336568184013717e-05, + "loss": 0.9292, + "step": 2118 + }, + { + "epoch": 0.34, + "grad_norm": 1.323290673384635, + "learning_rate": 1.5332153354533626e-05, + "loss": 0.8559, + "step": 2119 + }, + { + "epoch": 0.34, + "grad_norm": 1.3758325301729055, + "learning_rate": 1.532773707242798e-05, + "loss": 0.9043, + "step": 2120 + }, + { + "epoch": 0.34, + "grad_norm": 1.3444443266614672, + "learning_rate": 1.5323319338899896e-05, + "loss": 0.8863, + "step": 2121 + }, + { + "epoch": 0.34, + "grad_norm": 1.5512818974478426, + "learning_rate": 1.531890015515289e-05, + "loss": 0.8728, + "step": 2122 + }, + { + "epoch": 0.34, + "grad_norm": 1.323649199554473, + "learning_rate": 1.5314479522390856e-05, + "loss": 0.9153, + "step": 2123 + }, + { + "epoch": 0.34, + "grad_norm": 1.3178959215559662, + "learning_rate": 1.5310057441818115e-05, + "loss": 0.858, + "step": 2124 + }, + { + "epoch": 0.34, + "grad_norm": 1.4805177275374284, + "learning_rate": 1.5305633914639348e-05, + "loss": 0.9368, + "step": 2125 + }, + { + "epoch": 0.34, + "grad_norm": 1.3399720380773832, + "learning_rate": 1.530120894205965e-05, + "loss": 0.8256, + "step": 2126 + }, + { + "epoch": 0.34, + "grad_norm": 1.403337365048835, + "learning_rate": 1.5296782525284514e-05, + "loss": 0.8855, + "step": 2127 + }, + { + "epoch": 0.34, + "grad_norm": 1.4219346361439533, + "learning_rate": 1.529235466551981e-05, + "loss": 0.8182, + "step": 2128 + }, + { + "epoch": 0.34, + "grad_norm": 1.5538172892263111, + "learning_rate": 1.5287925363971807e-05, + "loss": 0.958, + "step": 2129 + }, + { + "epoch": 0.34, + "grad_norm": 1.3767689629085758, + "learning_rate": 1.5283494621847175e-05, + "loss": 0.8653, + "step": 2130 + }, + { + "epoch": 0.34, + "grad_norm": 1.38441744672304, + "learning_rate": 1.5279062440352968e-05, + "loss": 0.8887, + "step": 2131 + }, + { + "epoch": 0.34, + "grad_norm": 1.5654703847101017, + "learning_rate": 1.527462882069663e-05, + "loss": 0.8576, + "step": 2132 + }, + { + "epoch": 0.34, + "grad_norm": 1.3513552714943207, + "learning_rate": 1.5270193764086012e-05, + "loss": 0.9431, + "step": 2133 + }, + { + "epoch": 0.34, + "grad_norm": 1.2289987316939581, + "learning_rate": 1.5265757271729333e-05, + "loss": 0.906, + "step": 2134 + }, + { + "epoch": 0.34, + "grad_norm": 1.3670286481398486, + "learning_rate": 1.5261319344835225e-05, + "loss": 0.8459, + "step": 2135 + }, + { + "epoch": 0.34, + "grad_norm": 1.4874394819727386, + "learning_rate": 1.5256879984612698e-05, + "loss": 0.9315, + "step": 2136 + }, + { + "epoch": 0.34, + "grad_norm": 1.6014259978018532, + "learning_rate": 1.5252439192271156e-05, + "loss": 0.8426, + "step": 2137 + }, + { + "epoch": 0.34, + "grad_norm": 1.438900216382297, + "learning_rate": 1.5247996969020394e-05, + "loss": 0.8941, + "step": 2138 + }, + { + "epoch": 0.34, + "grad_norm": 1.3001680503945106, + "learning_rate": 1.5243553316070596e-05, + "loss": 0.9218, + "step": 2139 + }, + { + "epoch": 0.34, + "grad_norm": 1.3216152292556937, + "learning_rate": 1.523910823463233e-05, + "loss": 0.911, + "step": 2140 + }, + { + "epoch": 0.34, + "grad_norm": 1.42414910619103, + "learning_rate": 1.5234661725916573e-05, + "loss": 0.9348, + "step": 2141 + }, + { + "epoch": 0.35, + "grad_norm": 1.3936181311276843, + "learning_rate": 1.5230213791134662e-05, + "loss": 0.9034, + "step": 2142 + }, + { + "epoch": 0.35, + "grad_norm": 1.3051830574115397, + "learning_rate": 1.5225764431498344e-05, + "loss": 0.8586, + "step": 2143 + }, + { + "epoch": 0.35, + "grad_norm": 1.462065254425266, + "learning_rate": 1.5221313648219749e-05, + "loss": 0.9315, + "step": 2144 + }, + { + "epoch": 0.35, + "grad_norm": 1.255502092313941, + "learning_rate": 1.5216861442511382e-05, + "loss": 0.8246, + "step": 2145 + }, + { + "epoch": 0.35, + "grad_norm": 1.3723939520722246, + "learning_rate": 1.5212407815586162e-05, + "loss": 0.8995, + "step": 2146 + }, + { + "epoch": 0.35, + "grad_norm": 1.2710295646032514, + "learning_rate": 1.5207952768657368e-05, + "loss": 0.8931, + "step": 2147 + }, + { + "epoch": 0.35, + "grad_norm": 0.9181240153388553, + "learning_rate": 1.5203496302938682e-05, + "loss": 0.3409, + "step": 2148 + }, + { + "epoch": 0.35, + "grad_norm": 1.5664433875640127, + "learning_rate": 1.519903841964417e-05, + "loss": 0.885, + "step": 2149 + }, + { + "epoch": 0.35, + "grad_norm": 1.441685924043669, + "learning_rate": 1.519457911998828e-05, + "loss": 0.8769, + "step": 2150 + }, + { + "epoch": 0.35, + "grad_norm": 1.435474668360641, + "learning_rate": 1.5190118405185845e-05, + "loss": 0.8977, + "step": 2151 + }, + { + "epoch": 0.35, + "grad_norm": 1.1582444251562716, + "learning_rate": 1.5185656276452095e-05, + "loss": 0.8118, + "step": 2152 + }, + { + "epoch": 0.35, + "grad_norm": 1.6594681368388973, + "learning_rate": 1.5181192735002628e-05, + "loss": 0.9349, + "step": 2153 + }, + { + "epoch": 0.35, + "grad_norm": 1.3490758688792044, + "learning_rate": 1.517672778205344e-05, + "loss": 0.848, + "step": 2154 + }, + { + "epoch": 0.35, + "grad_norm": 1.372940632520386, + "learning_rate": 1.5172261418820908e-05, + "loss": 0.9409, + "step": 2155 + }, + { + "epoch": 0.35, + "grad_norm": 1.1506409724841307, + "learning_rate": 1.5167793646521788e-05, + "loss": 0.7848, + "step": 2156 + }, + { + "epoch": 0.35, + "grad_norm": 1.265659140491366, + "learning_rate": 1.5163324466373236e-05, + "loss": 0.8734, + "step": 2157 + }, + { + "epoch": 0.35, + "grad_norm": 1.5096840410779218, + "learning_rate": 1.5158853879592763e-05, + "loss": 0.9241, + "step": 2158 + }, + { + "epoch": 0.35, + "grad_norm": 1.4449940781756598, + "learning_rate": 1.515438188739829e-05, + "loss": 0.8552, + "step": 2159 + }, + { + "epoch": 0.35, + "grad_norm": 1.41538473215533, + "learning_rate": 1.5149908491008112e-05, + "loss": 0.8302, + "step": 2160 + }, + { + "epoch": 0.35, + "grad_norm": 1.2742926506876147, + "learning_rate": 1.5145433691640903e-05, + "loss": 0.8659, + "step": 2161 + }, + { + "epoch": 0.35, + "grad_norm": 1.2053077593342505, + "learning_rate": 1.514095749051572e-05, + "loss": 0.8499, + "step": 2162 + }, + { + "epoch": 0.35, + "grad_norm": 1.407376910059663, + "learning_rate": 1.5136479888852006e-05, + "loss": 0.8892, + "step": 2163 + }, + { + "epoch": 0.35, + "grad_norm": 1.3727099051272096, + "learning_rate": 1.5132000887869583e-05, + "loss": 0.9336, + "step": 2164 + }, + { + "epoch": 0.35, + "grad_norm": 1.4988829668066508, + "learning_rate": 1.512752048878865e-05, + "loss": 0.8863, + "step": 2165 + }, + { + "epoch": 0.35, + "grad_norm": 1.5059552310611017, + "learning_rate": 1.5123038692829801e-05, + "loss": 0.8937, + "step": 2166 + }, + { + "epoch": 0.35, + "grad_norm": 1.4733453859609438, + "learning_rate": 1.5118555501213989e-05, + "loss": 0.9127, + "step": 2167 + }, + { + "epoch": 0.35, + "grad_norm": 1.3304119401916974, + "learning_rate": 1.5114070915162568e-05, + "loss": 0.9368, + "step": 2168 + }, + { + "epoch": 0.35, + "grad_norm": 1.285562472797957, + "learning_rate": 1.5109584935897259e-05, + "loss": 0.8714, + "step": 2169 + }, + { + "epoch": 0.35, + "grad_norm": 1.5548034869111933, + "learning_rate": 1.5105097564640168e-05, + "loss": 0.8717, + "step": 2170 + }, + { + "epoch": 0.35, + "grad_norm": 1.5448864946039587, + "learning_rate": 1.5100608802613775e-05, + "loss": 0.9049, + "step": 2171 + }, + { + "epoch": 0.35, + "grad_norm": 1.564177162557625, + "learning_rate": 1.5096118651040945e-05, + "loss": 0.8878, + "step": 2172 + }, + { + "epoch": 0.35, + "grad_norm": 1.6108031189204868, + "learning_rate": 1.5091627111144923e-05, + "loss": 0.8345, + "step": 2173 + }, + { + "epoch": 0.35, + "grad_norm": 1.4910460795049896, + "learning_rate": 1.508713418414932e-05, + "loss": 0.8902, + "step": 2174 + }, + { + "epoch": 0.35, + "grad_norm": 1.385847744271693, + "learning_rate": 1.5082639871278139e-05, + "loss": 0.8166, + "step": 2175 + }, + { + "epoch": 0.35, + "grad_norm": 1.6720076702188817, + "learning_rate": 1.5078144173755754e-05, + "loss": 0.8664, + "step": 2176 + }, + { + "epoch": 0.35, + "grad_norm": 1.4218040804940049, + "learning_rate": 1.5073647092806916e-05, + "loss": 0.8736, + "step": 2177 + }, + { + "epoch": 0.35, + "grad_norm": 1.4466330153398057, + "learning_rate": 1.5069148629656752e-05, + "loss": 0.8521, + "step": 2178 + }, + { + "epoch": 0.35, + "grad_norm": 1.439371853996637, + "learning_rate": 1.5064648785530774e-05, + "loss": 0.8977, + "step": 2179 + }, + { + "epoch": 0.35, + "grad_norm": 1.2355846765580558, + "learning_rate": 1.5060147561654854e-05, + "loss": 0.8855, + "step": 2180 + }, + { + "epoch": 0.35, + "grad_norm": 1.663069990958584, + "learning_rate": 1.5055644959255257e-05, + "loss": 0.9327, + "step": 2181 + }, + { + "epoch": 0.35, + "grad_norm": 1.4364188024558948, + "learning_rate": 1.5051140979558614e-05, + "loss": 0.9091, + "step": 2182 + }, + { + "epoch": 0.35, + "grad_norm": 1.3251886880662678, + "learning_rate": 1.504663562379193e-05, + "loss": 0.8782, + "step": 2183 + }, + { + "epoch": 0.35, + "grad_norm": 1.4595920147654875, + "learning_rate": 1.5042128893182595e-05, + "loss": 0.8864, + "step": 2184 + }, + { + "epoch": 0.35, + "grad_norm": 1.387895235482913, + "learning_rate": 1.5037620788958359e-05, + "loss": 0.9167, + "step": 2185 + }, + { + "epoch": 0.35, + "grad_norm": 1.237258308477773, + "learning_rate": 1.5033111312347357e-05, + "loss": 0.8547, + "step": 2186 + }, + { + "epoch": 0.35, + "grad_norm": 1.0173283103746193, + "learning_rate": 1.5028600464578099e-05, + "loss": 0.3934, + "step": 2187 + }, + { + "epoch": 0.35, + "grad_norm": 1.695130830690747, + "learning_rate": 1.5024088246879456e-05, + "loss": 0.8903, + "step": 2188 + }, + { + "epoch": 0.35, + "grad_norm": 1.4066679062034422, + "learning_rate": 1.5019574660480685e-05, + "loss": 0.8846, + "step": 2189 + }, + { + "epoch": 0.35, + "grad_norm": 1.5432713984385202, + "learning_rate": 1.5015059706611413e-05, + "loss": 0.8517, + "step": 2190 + }, + { + "epoch": 0.35, + "grad_norm": 1.3007086473686038, + "learning_rate": 1.5010543386501634e-05, + "loss": 0.8822, + "step": 2191 + }, + { + "epoch": 0.35, + "grad_norm": 1.4091574322799574, + "learning_rate": 1.500602570138172e-05, + "loss": 0.913, + "step": 2192 + }, + { + "epoch": 0.35, + "grad_norm": 1.5937616848049787, + "learning_rate": 1.5001506652482415e-05, + "loss": 0.9059, + "step": 2193 + }, + { + "epoch": 0.35, + "grad_norm": 1.6838849755100995, + "learning_rate": 1.499698624103483e-05, + "loss": 0.9309, + "step": 2194 + }, + { + "epoch": 0.35, + "grad_norm": 1.7340667833071515, + "learning_rate": 1.4992464468270451e-05, + "loss": 0.9185, + "step": 2195 + }, + { + "epoch": 0.35, + "grad_norm": 1.401545246123321, + "learning_rate": 1.4987941335421132e-05, + "loss": 0.901, + "step": 2196 + }, + { + "epoch": 0.35, + "grad_norm": 1.6470834823130729, + "learning_rate": 1.4983416843719099e-05, + "loss": 0.8706, + "step": 2197 + }, + { + "epoch": 0.35, + "grad_norm": 1.3348927613006705, + "learning_rate": 1.497889099439695e-05, + "loss": 0.8099, + "step": 2198 + }, + { + "epoch": 0.35, + "grad_norm": 1.3441023905137717, + "learning_rate": 1.4974363788687651e-05, + "loss": 0.8681, + "step": 2199 + }, + { + "epoch": 0.35, + "grad_norm": 1.4593763048433683, + "learning_rate": 1.4969835227824533e-05, + "loss": 0.9, + "step": 2200 + }, + { + "epoch": 0.35, + "grad_norm": 1.4980166347017778, + "learning_rate": 1.496530531304131e-05, + "loss": 0.9216, + "step": 2201 + }, + { + "epoch": 0.35, + "grad_norm": 1.3940221367396826, + "learning_rate": 1.4960774045572046e-05, + "loss": 0.8919, + "step": 2202 + }, + { + "epoch": 0.35, + "grad_norm": 1.3829439506175594, + "learning_rate": 1.495624142665119e-05, + "loss": 0.9305, + "step": 2203 + }, + { + "epoch": 0.36, + "grad_norm": 1.4953974947545636, + "learning_rate": 1.4951707457513549e-05, + "loss": 0.874, + "step": 2204 + }, + { + "epoch": 0.36, + "grad_norm": 1.3430045144049578, + "learning_rate": 1.4947172139394301e-05, + "loss": 0.8792, + "step": 2205 + }, + { + "epoch": 0.36, + "grad_norm": 1.3363199634631722, + "learning_rate": 1.4942635473528994e-05, + "loss": 0.9126, + "step": 2206 + }, + { + "epoch": 0.36, + "grad_norm": 1.5989883915243177, + "learning_rate": 1.493809746115354e-05, + "loss": 0.9173, + "step": 2207 + }, + { + "epoch": 0.36, + "grad_norm": 1.4735093693888386, + "learning_rate": 1.4933558103504215e-05, + "loss": 0.8741, + "step": 2208 + }, + { + "epoch": 0.36, + "grad_norm": 1.3664428420044636, + "learning_rate": 1.4929017401817672e-05, + "loss": 0.8533, + "step": 2209 + }, + { + "epoch": 0.36, + "grad_norm": 1.33577014546894, + "learning_rate": 1.4924475357330919e-05, + "loss": 0.8291, + "step": 2210 + }, + { + "epoch": 0.36, + "grad_norm": 1.5534383257712345, + "learning_rate": 1.491993197128133e-05, + "loss": 0.9107, + "step": 2211 + }, + { + "epoch": 0.36, + "grad_norm": 1.5562728766929625, + "learning_rate": 1.4915387244906658e-05, + "loss": 0.8227, + "step": 2212 + }, + { + "epoch": 0.36, + "grad_norm": 1.3764733689427457, + "learning_rate": 1.4910841179445007e-05, + "loss": 0.9447, + "step": 2213 + }, + { + "epoch": 0.36, + "grad_norm": 0.9009180427465397, + "learning_rate": 1.4906293776134849e-05, + "loss": 0.3532, + "step": 2214 + }, + { + "epoch": 0.36, + "grad_norm": 1.3928709491375821, + "learning_rate": 1.4901745036215022e-05, + "loss": 0.9205, + "step": 2215 + }, + { + "epoch": 0.36, + "grad_norm": 1.4288269772864715, + "learning_rate": 1.4897194960924732e-05, + "loss": 0.9116, + "step": 2216 + }, + { + "epoch": 0.36, + "grad_norm": 1.5364191948760364, + "learning_rate": 1.4892643551503545e-05, + "loss": 0.9371, + "step": 2217 + }, + { + "epoch": 0.36, + "grad_norm": 1.2509108428606786, + "learning_rate": 1.4888090809191384e-05, + "loss": 0.7942, + "step": 2218 + }, + { + "epoch": 0.36, + "grad_norm": 1.2718800392039336, + "learning_rate": 1.4883536735228548e-05, + "loss": 0.8238, + "step": 2219 + }, + { + "epoch": 0.36, + "grad_norm": 1.3849004137136909, + "learning_rate": 1.4878981330855688e-05, + "loss": 0.865, + "step": 2220 + }, + { + "epoch": 0.36, + "grad_norm": 1.5411500956725745, + "learning_rate": 1.4874424597313828e-05, + "loss": 0.9407, + "step": 2221 + }, + { + "epoch": 0.36, + "grad_norm": 1.3353674265555382, + "learning_rate": 1.4869866535844337e-05, + "loss": 0.8845, + "step": 2222 + }, + { + "epoch": 0.36, + "grad_norm": 1.3698484887338873, + "learning_rate": 1.4865307147688967e-05, + "loss": 0.9461, + "step": 2223 + }, + { + "epoch": 0.36, + "grad_norm": 1.2663180747978635, + "learning_rate": 1.4860746434089817e-05, + "loss": 0.8473, + "step": 2224 + }, + { + "epoch": 0.36, + "grad_norm": 1.5572281952642122, + "learning_rate": 1.4856184396289348e-05, + "loss": 0.8364, + "step": 2225 + }, + { + "epoch": 0.36, + "grad_norm": 0.8211036301561949, + "learning_rate": 1.4851621035530392e-05, + "loss": 0.3428, + "step": 2226 + }, + { + "epoch": 0.36, + "grad_norm": 1.203748414150844, + "learning_rate": 1.4847056353056126e-05, + "loss": 0.8329, + "step": 2227 + }, + { + "epoch": 0.36, + "grad_norm": 1.4945221208212804, + "learning_rate": 1.4842490350110103e-05, + "loss": 0.9543, + "step": 2228 + }, + { + "epoch": 0.36, + "grad_norm": 1.19980046983959, + "learning_rate": 1.4837923027936223e-05, + "loss": 0.9187, + "step": 2229 + }, + { + "epoch": 0.36, + "grad_norm": 1.3635322311079516, + "learning_rate": 1.4833354387778753e-05, + "loss": 0.8704, + "step": 2230 + }, + { + "epoch": 0.36, + "grad_norm": 1.3559404330255946, + "learning_rate": 1.4828784430882315e-05, + "loss": 0.8784, + "step": 2231 + }, + { + "epoch": 0.36, + "grad_norm": 1.2887310757991217, + "learning_rate": 1.482421315849189e-05, + "loss": 0.9186, + "step": 2232 + }, + { + "epoch": 0.36, + "grad_norm": 1.1409952296876684, + "learning_rate": 1.4819640571852823e-05, + "loss": 0.8771, + "step": 2233 + }, + { + "epoch": 0.36, + "grad_norm": 1.3235816442735124, + "learning_rate": 1.4815066672210809e-05, + "loss": 0.8059, + "step": 2234 + }, + { + "epoch": 0.36, + "grad_norm": 1.5422495270297572, + "learning_rate": 1.4810491460811907e-05, + "loss": 0.8857, + "step": 2235 + }, + { + "epoch": 0.36, + "grad_norm": 1.3866930482389916, + "learning_rate": 1.4805914938902525e-05, + "loss": 0.8656, + "step": 2236 + }, + { + "epoch": 0.36, + "grad_norm": 1.3360330313272575, + "learning_rate": 1.4801337107729443e-05, + "loss": 0.9133, + "step": 2237 + }, + { + "epoch": 0.36, + "grad_norm": 1.4473589031826841, + "learning_rate": 1.4796757968539779e-05, + "loss": 0.8265, + "step": 2238 + }, + { + "epoch": 0.36, + "grad_norm": 1.429804220021035, + "learning_rate": 1.4792177522581023e-05, + "loss": 0.9466, + "step": 2239 + }, + { + "epoch": 0.36, + "grad_norm": 1.4044861407329503, + "learning_rate": 1.4787595771101013e-05, + "loss": 0.8658, + "step": 2240 + }, + { + "epoch": 0.36, + "grad_norm": 1.4843499934449191, + "learning_rate": 1.4783012715347944e-05, + "loss": 0.9163, + "step": 2241 + }, + { + "epoch": 0.36, + "grad_norm": 1.4298231986793588, + "learning_rate": 1.4778428356570365e-05, + "loss": 0.8582, + "step": 2242 + }, + { + "epoch": 0.36, + "grad_norm": 1.6599928702575104, + "learning_rate": 1.4773842696017184e-05, + "loss": 0.8379, + "step": 2243 + }, + { + "epoch": 0.36, + "grad_norm": 1.4407991885818552, + "learning_rate": 1.4769255734937662e-05, + "loss": 0.8623, + "step": 2244 + }, + { + "epoch": 0.36, + "grad_norm": 1.5203572362848454, + "learning_rate": 1.4764667474581416e-05, + "loss": 0.8996, + "step": 2245 + }, + { + "epoch": 0.36, + "grad_norm": 1.5506097689160783, + "learning_rate": 1.4760077916198405e-05, + "loss": 0.9055, + "step": 2246 + }, + { + "epoch": 0.36, + "grad_norm": 1.2026901202285056, + "learning_rate": 1.475548706103896e-05, + "loss": 0.8403, + "step": 2247 + }, + { + "epoch": 0.36, + "grad_norm": 1.466505080049432, + "learning_rate": 1.4750894910353754e-05, + "loss": 0.8992, + "step": 2248 + }, + { + "epoch": 0.36, + "grad_norm": 1.377555539535488, + "learning_rate": 1.4746301465393814e-05, + "loss": 0.9135, + "step": 2249 + }, + { + "epoch": 0.36, + "grad_norm": 1.4315536753647504, + "learning_rate": 1.4741706727410522e-05, + "loss": 0.8818, + "step": 2250 + }, + { + "epoch": 0.36, + "grad_norm": 1.595508009048657, + "learning_rate": 1.4737110697655613e-05, + "loss": 0.9461, + "step": 2251 + }, + { + "epoch": 0.36, + "grad_norm": 1.4742230168145904, + "learning_rate": 1.473251337738117e-05, + "loss": 0.812, + "step": 2252 + }, + { + "epoch": 0.36, + "grad_norm": 1.5138875149358872, + "learning_rate": 1.472791476783963e-05, + "loss": 0.8835, + "step": 2253 + }, + { + "epoch": 0.36, + "grad_norm": 1.5286564667514386, + "learning_rate": 1.4723314870283783e-05, + "loss": 0.9118, + "step": 2254 + }, + { + "epoch": 0.36, + "grad_norm": 1.3020136164041431, + "learning_rate": 1.4718713685966765e-05, + "loss": 0.9112, + "step": 2255 + }, + { + "epoch": 0.36, + "grad_norm": 1.3832504311772398, + "learning_rate": 1.4714111216142068e-05, + "loss": 0.8596, + "step": 2256 + }, + { + "epoch": 0.36, + "grad_norm": 1.409251480230001, + "learning_rate": 1.470950746206353e-05, + "loss": 0.917, + "step": 2257 + }, + { + "epoch": 0.36, + "grad_norm": 1.3172843194993251, + "learning_rate": 1.4704902424985341e-05, + "loss": 0.9592, + "step": 2258 + }, + { + "epoch": 0.36, + "grad_norm": 1.3252559611442432, + "learning_rate": 1.4700296106162042e-05, + "loss": 0.8878, + "step": 2259 + }, + { + "epoch": 0.36, + "grad_norm": 1.8388986964673923, + "learning_rate": 1.4695688506848513e-05, + "loss": 0.8958, + "step": 2260 + }, + { + "epoch": 0.36, + "grad_norm": 1.2903790062134557, + "learning_rate": 1.4691079628300004e-05, + "loss": 0.9224, + "step": 2261 + }, + { + "epoch": 0.36, + "grad_norm": 1.2773115207969163, + "learning_rate": 1.4686469471772089e-05, + "loss": 0.9451, + "step": 2262 + }, + { + "epoch": 0.36, + "grad_norm": 1.4205982795733245, + "learning_rate": 1.4681858038520711e-05, + "loss": 0.8737, + "step": 2263 + }, + { + "epoch": 0.36, + "grad_norm": 1.5601666910035128, + "learning_rate": 1.4677245329802146e-05, + "loss": 0.8408, + "step": 2264 + }, + { + "epoch": 0.36, + "grad_norm": 1.4217344426803438, + "learning_rate": 1.4672631346873023e-05, + "loss": 0.8615, + "step": 2265 + }, + { + "epoch": 0.37, + "grad_norm": 1.350937697527269, + "learning_rate": 1.466801609099032e-05, + "loss": 0.8693, + "step": 2266 + }, + { + "epoch": 0.37, + "grad_norm": 1.3570689840401682, + "learning_rate": 1.4663399563411358e-05, + "loss": 0.8811, + "step": 2267 + }, + { + "epoch": 0.37, + "grad_norm": 1.3496948672624813, + "learning_rate": 1.4658781765393808e-05, + "loss": 0.8953, + "step": 2268 + }, + { + "epoch": 0.37, + "grad_norm": 1.327053535442286, + "learning_rate": 1.4654162698195684e-05, + "loss": 0.8904, + "step": 2269 + }, + { + "epoch": 0.37, + "grad_norm": 1.5174920065021593, + "learning_rate": 1.4649542363075353e-05, + "loss": 0.8481, + "step": 2270 + }, + { + "epoch": 0.37, + "grad_norm": 1.4116288497959797, + "learning_rate": 1.464492076129151e-05, + "loss": 0.9165, + "step": 2271 + }, + { + "epoch": 0.37, + "grad_norm": 1.8629959785761128, + "learning_rate": 1.464029789410322e-05, + "loss": 0.9483, + "step": 2272 + }, + { + "epoch": 0.37, + "grad_norm": 0.93831840397108, + "learning_rate": 1.4635673762769868e-05, + "loss": 0.345, + "step": 2273 + }, + { + "epoch": 0.37, + "grad_norm": 1.1893920115413457, + "learning_rate": 1.4631048368551204e-05, + "loss": 0.8606, + "step": 2274 + }, + { + "epoch": 0.37, + "grad_norm": 1.3871992796453383, + "learning_rate": 1.4626421712707307e-05, + "loss": 0.8842, + "step": 2275 + }, + { + "epoch": 0.37, + "grad_norm": 1.6366211405716014, + "learning_rate": 1.4621793796498606e-05, + "loss": 0.8938, + "step": 2276 + }, + { + "epoch": 0.37, + "grad_norm": 1.801098204003515, + "learning_rate": 1.4617164621185877e-05, + "loss": 0.9431, + "step": 2277 + }, + { + "epoch": 0.37, + "grad_norm": 1.644681898723982, + "learning_rate": 1.4612534188030233e-05, + "loss": 0.8997, + "step": 2278 + }, + { + "epoch": 0.37, + "grad_norm": 1.3780727954251806, + "learning_rate": 1.4607902498293127e-05, + "loss": 0.8016, + "step": 2279 + }, + { + "epoch": 0.37, + "grad_norm": 1.3223759282589622, + "learning_rate": 1.4603269553236366e-05, + "loss": 0.9506, + "step": 2280 + }, + { + "epoch": 0.37, + "grad_norm": 1.3953561376735486, + "learning_rate": 1.4598635354122087e-05, + "loss": 0.864, + "step": 2281 + }, + { + "epoch": 0.37, + "grad_norm": 1.7313769424374013, + "learning_rate": 1.4593999902212775e-05, + "loss": 0.869, + "step": 2282 + }, + { + "epoch": 0.37, + "grad_norm": 1.274980748635463, + "learning_rate": 1.4589363198771258e-05, + "loss": 0.8288, + "step": 2283 + }, + { + "epoch": 0.37, + "grad_norm": 1.4108575136592878, + "learning_rate": 1.4584725245060694e-05, + "loss": 0.8715, + "step": 2284 + }, + { + "epoch": 0.37, + "grad_norm": 1.5195966822331568, + "learning_rate": 1.45800860423446e-05, + "loss": 0.9155, + "step": 2285 + }, + { + "epoch": 0.37, + "grad_norm": 1.3887607298563331, + "learning_rate": 1.4575445591886814e-05, + "loss": 0.9505, + "step": 2286 + }, + { + "epoch": 0.37, + "grad_norm": 1.3024717728254744, + "learning_rate": 1.4570803894951528e-05, + "loss": 0.863, + "step": 2287 + }, + { + "epoch": 0.37, + "grad_norm": 1.4590078840953333, + "learning_rate": 1.4566160952803268e-05, + "loss": 0.8935, + "step": 2288 + }, + { + "epoch": 0.37, + "grad_norm": 1.3611168327986414, + "learning_rate": 1.4561516766706893e-05, + "loss": 0.9488, + "step": 2289 + }, + { + "epoch": 0.37, + "grad_norm": 1.4840563394419044, + "learning_rate": 1.4556871337927615e-05, + "loss": 0.8351, + "step": 2290 + }, + { + "epoch": 0.37, + "grad_norm": 1.4736371182479293, + "learning_rate": 1.455222466773097e-05, + "loss": 0.8934, + "step": 2291 + }, + { + "epoch": 0.37, + "grad_norm": 1.3172184737020978, + "learning_rate": 1.4547576757382843e-05, + "loss": 0.8857, + "step": 2292 + }, + { + "epoch": 0.37, + "grad_norm": 1.0850948715462614, + "learning_rate": 1.4542927608149456e-05, + "loss": 0.3364, + "step": 2293 + }, + { + "epoch": 0.37, + "grad_norm": 1.3167077909124218, + "learning_rate": 1.453827722129736e-05, + "loss": 0.8587, + "step": 2294 + }, + { + "epoch": 0.37, + "grad_norm": 1.4148094617542475, + "learning_rate": 1.4533625598093453e-05, + "loss": 0.9536, + "step": 2295 + }, + { + "epoch": 0.37, + "grad_norm": 1.3690713852694407, + "learning_rate": 1.452897273980496e-05, + "loss": 0.9188, + "step": 2296 + }, + { + "epoch": 0.37, + "grad_norm": 1.815478259601864, + "learning_rate": 1.452431864769945e-05, + "loss": 0.9335, + "step": 2297 + }, + { + "epoch": 0.37, + "grad_norm": 1.1742590840678162, + "learning_rate": 1.451966332304483e-05, + "loss": 0.9484, + "step": 2298 + }, + { + "epoch": 0.37, + "grad_norm": 0.8761479848988268, + "learning_rate": 1.4515006767109336e-05, + "loss": 0.353, + "step": 2299 + }, + { + "epoch": 0.37, + "grad_norm": 1.4187932664831986, + "learning_rate": 1.451034898116154e-05, + "loss": 0.9211, + "step": 2300 + }, + { + "epoch": 0.37, + "grad_norm": 1.4367702150280137, + "learning_rate": 1.4505689966470353e-05, + "loss": 0.8344, + "step": 2301 + }, + { + "epoch": 0.37, + "grad_norm": 1.4237007246332254, + "learning_rate": 1.4501029724305019e-05, + "loss": 0.8346, + "step": 2302 + }, + { + "epoch": 0.37, + "grad_norm": 1.3004173440643338, + "learning_rate": 1.4496368255935115e-05, + "loss": 0.8892, + "step": 2303 + }, + { + "epoch": 0.37, + "grad_norm": 1.3140514441636295, + "learning_rate": 1.4491705562630555e-05, + "loss": 0.8881, + "step": 2304 + }, + { + "epoch": 0.37, + "grad_norm": 1.7634660784521374, + "learning_rate": 1.4487041645661588e-05, + "loss": 0.9144, + "step": 2305 + }, + { + "epoch": 0.37, + "grad_norm": 1.428872698037481, + "learning_rate": 1.448237650629879e-05, + "loss": 0.866, + "step": 2306 + }, + { + "epoch": 0.37, + "grad_norm": 1.2516594124180394, + "learning_rate": 1.4477710145813074e-05, + "loss": 0.945, + "step": 2307 + }, + { + "epoch": 0.37, + "grad_norm": 0.8378396013147641, + "learning_rate": 1.4473042565475684e-05, + "loss": 0.3709, + "step": 2308 + }, + { + "epoch": 0.37, + "grad_norm": 1.2823692124034947, + "learning_rate": 1.44683737665582e-05, + "loss": 0.885, + "step": 2309 + }, + { + "epoch": 0.37, + "grad_norm": 1.335757176459738, + "learning_rate": 1.4463703750332532e-05, + "loss": 0.8193, + "step": 2310 + }, + { + "epoch": 0.37, + "grad_norm": 1.3921597575946034, + "learning_rate": 1.4459032518070917e-05, + "loss": 0.8593, + "step": 2311 + }, + { + "epoch": 0.37, + "grad_norm": 1.8924746913222323, + "learning_rate": 1.4454360071045933e-05, + "loss": 0.9466, + "step": 2312 + }, + { + "epoch": 0.37, + "grad_norm": 1.3528466628879237, + "learning_rate": 1.4449686410530478e-05, + "loss": 0.8469, + "step": 2313 + }, + { + "epoch": 0.37, + "grad_norm": 1.3656002572194696, + "learning_rate": 1.4445011537797788e-05, + "loss": 0.9335, + "step": 2314 + }, + { + "epoch": 0.37, + "grad_norm": 1.0229486770446363, + "learning_rate": 1.4440335454121428e-05, + "loss": 0.3387, + "step": 2315 + }, + { + "epoch": 0.37, + "grad_norm": 1.2594263647563648, + "learning_rate": 1.4435658160775296e-05, + "loss": 0.8384, + "step": 2316 + }, + { + "epoch": 0.37, + "grad_norm": 1.349953835192534, + "learning_rate": 1.4430979659033609e-05, + "loss": 0.8855, + "step": 2317 + }, + { + "epoch": 0.37, + "grad_norm": 1.2380791925715058, + "learning_rate": 1.442629995017092e-05, + "loss": 0.8738, + "step": 2318 + }, + { + "epoch": 0.37, + "grad_norm": 1.3517152046749763, + "learning_rate": 1.4421619035462115e-05, + "loss": 0.8757, + "step": 2319 + }, + { + "epoch": 0.37, + "grad_norm": 1.4407721445358312, + "learning_rate": 1.44169369161824e-05, + "loss": 0.7921, + "step": 2320 + }, + { + "epoch": 0.37, + "grad_norm": 1.4268828757590253, + "learning_rate": 1.4412253593607317e-05, + "loss": 0.9313, + "step": 2321 + }, + { + "epoch": 0.37, + "grad_norm": 1.3541678774045764, + "learning_rate": 1.4407569069012729e-05, + "loss": 0.8978, + "step": 2322 + }, + { + "epoch": 0.37, + "grad_norm": 1.4713251981724855, + "learning_rate": 1.4402883343674834e-05, + "loss": 0.9266, + "step": 2323 + }, + { + "epoch": 0.37, + "grad_norm": 1.3447001931812363, + "learning_rate": 1.4398196418870147e-05, + "loss": 0.8623, + "step": 2324 + }, + { + "epoch": 0.37, + "grad_norm": 0.8588017581060287, + "learning_rate": 1.439350829587552e-05, + "loss": 0.3442, + "step": 2325 + }, + { + "epoch": 0.37, + "grad_norm": 1.4449285448265363, + "learning_rate": 1.4388818975968126e-05, + "loss": 0.8218, + "step": 2326 + }, + { + "epoch": 0.37, + "grad_norm": 1.66227848501601, + "learning_rate": 1.4384128460425467e-05, + "loss": 0.8696, + "step": 2327 + }, + { + "epoch": 0.38, + "grad_norm": 1.6301332124844432, + "learning_rate": 1.4379436750525362e-05, + "loss": 0.9313, + "step": 2328 + }, + { + "epoch": 0.38, + "grad_norm": 1.4705068538346993, + "learning_rate": 1.4374743847545967e-05, + "loss": 0.9439, + "step": 2329 + }, + { + "epoch": 0.38, + "grad_norm": 1.513450974373518, + "learning_rate": 1.437004975276576e-05, + "loss": 0.9266, + "step": 2330 + }, + { + "epoch": 0.38, + "grad_norm": 1.308554240954074, + "learning_rate": 1.4365354467463535e-05, + "loss": 0.913, + "step": 2331 + }, + { + "epoch": 0.38, + "grad_norm": 1.5428335746270734, + "learning_rate": 1.4360657992918423e-05, + "loss": 0.9383, + "step": 2332 + }, + { + "epoch": 0.38, + "grad_norm": 1.5027962142639386, + "learning_rate": 1.435596033040987e-05, + "loss": 0.8901, + "step": 2333 + }, + { + "epoch": 0.38, + "grad_norm": 1.335328444994548, + "learning_rate": 1.4351261481217655e-05, + "loss": 0.9365, + "step": 2334 + }, + { + "epoch": 0.38, + "grad_norm": 1.3755082876140545, + "learning_rate": 1.4346561446621865e-05, + "loss": 0.9054, + "step": 2335 + }, + { + "epoch": 0.38, + "grad_norm": 1.5176193140888927, + "learning_rate": 1.4341860227902923e-05, + "loss": 0.9517, + "step": 2336 + }, + { + "epoch": 0.38, + "grad_norm": 1.3000791712698776, + "learning_rate": 1.4337157826341575e-05, + "loss": 0.9027, + "step": 2337 + }, + { + "epoch": 0.38, + "grad_norm": 1.3134990640577662, + "learning_rate": 1.4332454243218878e-05, + "loss": 0.8481, + "step": 2338 + }, + { + "epoch": 0.38, + "grad_norm": 1.3828734664624016, + "learning_rate": 1.432774947981622e-05, + "loss": 0.8451, + "step": 2339 + }, + { + "epoch": 0.38, + "grad_norm": 1.1983359660795525, + "learning_rate": 1.4323043537415311e-05, + "loss": 0.8485, + "step": 2340 + }, + { + "epoch": 0.38, + "grad_norm": 1.2515336478047323, + "learning_rate": 1.4318336417298173e-05, + "loss": 0.8891, + "step": 2341 + }, + { + "epoch": 0.38, + "grad_norm": 1.4104676180271158, + "learning_rate": 1.431362812074716e-05, + "loss": 0.8303, + "step": 2342 + }, + { + "epoch": 0.38, + "grad_norm": 1.3026118771381907, + "learning_rate": 1.4308918649044947e-05, + "loss": 0.912, + "step": 2343 + }, + { + "epoch": 0.38, + "grad_norm": 1.298827873316002, + "learning_rate": 1.4304208003474508e-05, + "loss": 0.8445, + "step": 2344 + }, + { + "epoch": 0.38, + "grad_norm": 1.3275492065140901, + "learning_rate": 1.429949618531917e-05, + "loss": 0.945, + "step": 2345 + }, + { + "epoch": 0.38, + "grad_norm": 1.7085651428451163, + "learning_rate": 1.4294783195862553e-05, + "loss": 0.8325, + "step": 2346 + }, + { + "epoch": 0.38, + "grad_norm": 0.8666770819402482, + "learning_rate": 1.4290069036388607e-05, + "loss": 0.3645, + "step": 2347 + }, + { + "epoch": 0.38, + "grad_norm": 0.9054608973959942, + "learning_rate": 1.42853537081816e-05, + "loss": 0.3374, + "step": 2348 + }, + { + "epoch": 0.38, + "grad_norm": 1.6988285263369804, + "learning_rate": 1.4280637212526116e-05, + "loss": 0.9316, + "step": 2349 + }, + { + "epoch": 0.38, + "grad_norm": 1.5271065947375175, + "learning_rate": 1.4275919550707058e-05, + "loss": 0.8365, + "step": 2350 + }, + { + "epoch": 0.38, + "grad_norm": 1.3399314205247148, + "learning_rate": 1.4271200724009648e-05, + "loss": 0.9861, + "step": 2351 + }, + { + "epoch": 0.38, + "grad_norm": 1.8278897604834503, + "learning_rate": 1.4266480733719426e-05, + "loss": 0.9477, + "step": 2352 + }, + { + "epoch": 0.38, + "grad_norm": 1.4272361713264188, + "learning_rate": 1.4261759581122243e-05, + "loss": 0.9376, + "step": 2353 + }, + { + "epoch": 0.38, + "grad_norm": 0.904939267805562, + "learning_rate": 1.4257037267504277e-05, + "loss": 0.341, + "step": 2354 + }, + { + "epoch": 0.38, + "grad_norm": 1.2982544040578603, + "learning_rate": 1.425231379415201e-05, + "loss": 0.8394, + "step": 2355 + }, + { + "epoch": 0.38, + "grad_norm": 1.29295131723459, + "learning_rate": 1.4247589162352254e-05, + "loss": 0.8835, + "step": 2356 + }, + { + "epoch": 0.38, + "grad_norm": 0.8530824722049526, + "learning_rate": 1.4242863373392123e-05, + "loss": 0.3404, + "step": 2357 + }, + { + "epoch": 0.38, + "grad_norm": 1.3336248101738803, + "learning_rate": 1.423813642855905e-05, + "loss": 0.8602, + "step": 2358 + }, + { + "epoch": 0.38, + "grad_norm": 1.4707176947410265, + "learning_rate": 1.4233408329140796e-05, + "loss": 0.9193, + "step": 2359 + }, + { + "epoch": 0.38, + "grad_norm": 1.2490026980804068, + "learning_rate": 1.4228679076425414e-05, + "loss": 0.8729, + "step": 2360 + }, + { + "epoch": 0.38, + "grad_norm": 1.3143610475663956, + "learning_rate": 1.4223948671701289e-05, + "loss": 0.7777, + "step": 2361 + }, + { + "epoch": 0.38, + "grad_norm": 0.7865738916708855, + "learning_rate": 1.4219217116257111e-05, + "loss": 0.3279, + "step": 2362 + }, + { + "epoch": 0.38, + "grad_norm": 1.5862865392761671, + "learning_rate": 1.4214484411381885e-05, + "loss": 0.8773, + "step": 2363 + }, + { + "epoch": 0.38, + "grad_norm": 1.248686163702446, + "learning_rate": 1.4209750558364936e-05, + "loss": 0.9352, + "step": 2364 + }, + { + "epoch": 0.38, + "grad_norm": 1.45230269430469, + "learning_rate": 1.4205015558495893e-05, + "loss": 0.8383, + "step": 2365 + }, + { + "epoch": 0.38, + "grad_norm": 1.5173336486325004, + "learning_rate": 1.4200279413064695e-05, + "loss": 0.9778, + "step": 2366 + }, + { + "epoch": 0.38, + "grad_norm": 1.5354128355012306, + "learning_rate": 1.419554212336161e-05, + "loss": 0.9282, + "step": 2367 + }, + { + "epoch": 0.38, + "grad_norm": 1.644502211380055, + "learning_rate": 1.4190803690677195e-05, + "loss": 0.9084, + "step": 2368 + }, + { + "epoch": 0.38, + "grad_norm": 1.3611401088758242, + "learning_rate": 1.4186064116302336e-05, + "loss": 0.8821, + "step": 2369 + }, + { + "epoch": 0.38, + "grad_norm": 0.9827002423409152, + "learning_rate": 1.4181323401528224e-05, + "loss": 0.3581, + "step": 2370 + }, + { + "epoch": 0.38, + "grad_norm": 1.3561769471292597, + "learning_rate": 1.4176581547646354e-05, + "loss": 0.8966, + "step": 2371 + }, + { + "epoch": 0.38, + "grad_norm": 1.4878033254855738, + "learning_rate": 1.4171838555948548e-05, + "loss": 0.8066, + "step": 2372 + }, + { + "epoch": 0.38, + "grad_norm": 1.1602231999387538, + "learning_rate": 1.4167094427726916e-05, + "loss": 0.896, + "step": 2373 + }, + { + "epoch": 0.38, + "grad_norm": 1.5066576986406188, + "learning_rate": 1.4162349164273899e-05, + "loss": 0.9404, + "step": 2374 + }, + { + "epoch": 0.38, + "grad_norm": 1.5156690852891446, + "learning_rate": 1.4157602766882233e-05, + "loss": 0.8483, + "step": 2375 + }, + { + "epoch": 0.38, + "grad_norm": 1.8639810755389616, + "learning_rate": 1.4152855236844969e-05, + "loss": 0.8858, + "step": 2376 + }, + { + "epoch": 0.38, + "grad_norm": 1.357957809638119, + "learning_rate": 1.414810657545546e-05, + "loss": 0.8565, + "step": 2377 + }, + { + "epoch": 0.38, + "grad_norm": 1.3921808322111036, + "learning_rate": 1.4143356784007383e-05, + "loss": 0.9132, + "step": 2378 + }, + { + "epoch": 0.38, + "grad_norm": 1.438989282458107, + "learning_rate": 1.4138605863794703e-05, + "loss": 0.9174, + "step": 2379 + }, + { + "epoch": 0.38, + "grad_norm": 1.5333557052984321, + "learning_rate": 1.4133853816111703e-05, + "loss": 0.8386, + "step": 2380 + }, + { + "epoch": 0.38, + "grad_norm": 1.4108096957277927, + "learning_rate": 1.4129100642252978e-05, + "loss": 0.8925, + "step": 2381 + }, + { + "epoch": 0.38, + "grad_norm": 0.8295845735459987, + "learning_rate": 1.4124346343513411e-05, + "loss": 0.3456, + "step": 2382 + }, + { + "epoch": 0.38, + "grad_norm": 1.4893170667023041, + "learning_rate": 1.4119590921188217e-05, + "loss": 0.9038, + "step": 2383 + }, + { + "epoch": 0.38, + "grad_norm": 1.1956011756265903, + "learning_rate": 1.4114834376572898e-05, + "loss": 0.8457, + "step": 2384 + }, + { + "epoch": 0.38, + "grad_norm": 1.3722441597081454, + "learning_rate": 1.4110076710963269e-05, + "loss": 0.937, + "step": 2385 + }, + { + "epoch": 0.38, + "grad_norm": 1.3862584189526093, + "learning_rate": 1.4105317925655448e-05, + "loss": 0.8846, + "step": 2386 + }, + { + "epoch": 0.38, + "grad_norm": 1.3173203082346663, + "learning_rate": 1.4100558021945863e-05, + "loss": 0.8444, + "step": 2387 + }, + { + "epoch": 0.38, + "grad_norm": 1.4414543877810566, + "learning_rate": 1.4095797001131238e-05, + "loss": 0.8533, + "step": 2388 + }, + { + "epoch": 0.38, + "grad_norm": 1.0625404006625905, + "learning_rate": 1.4091034864508608e-05, + "loss": 0.3078, + "step": 2389 + }, + { + "epoch": 0.39, + "grad_norm": 1.2559781670933106, + "learning_rate": 1.408627161337531e-05, + "loss": 0.8449, + "step": 2390 + }, + { + "epoch": 0.39, + "grad_norm": 1.301360216970839, + "learning_rate": 1.4081507249028987e-05, + "loss": 0.9401, + "step": 2391 + }, + { + "epoch": 0.39, + "grad_norm": 1.4230749835856906, + "learning_rate": 1.4076741772767586e-05, + "loss": 0.8865, + "step": 2392 + }, + { + "epoch": 0.39, + "grad_norm": 1.3386574935029263, + "learning_rate": 1.4071975185889344e-05, + "loss": 0.8756, + "step": 2393 + }, + { + "epoch": 0.39, + "grad_norm": 1.223114574469079, + "learning_rate": 1.406720748969282e-05, + "loss": 0.863, + "step": 2394 + }, + { + "epoch": 0.39, + "grad_norm": 1.4618617980472217, + "learning_rate": 1.4062438685476862e-05, + "loss": 0.9081, + "step": 2395 + }, + { + "epoch": 0.39, + "grad_norm": 1.7333851021267135, + "learning_rate": 1.4057668774540622e-05, + "loss": 0.8528, + "step": 2396 + }, + { + "epoch": 0.39, + "grad_norm": 1.3268877774134125, + "learning_rate": 1.405289775818356e-05, + "loss": 0.8836, + "step": 2397 + }, + { + "epoch": 0.39, + "grad_norm": 1.4578891937130678, + "learning_rate": 1.4048125637705433e-05, + "loss": 0.8496, + "step": 2398 + }, + { + "epoch": 0.39, + "grad_norm": 1.2941225995663115, + "learning_rate": 1.404335241440629e-05, + "loss": 0.8377, + "step": 2399 + }, + { + "epoch": 0.39, + "grad_norm": 1.4734029704413851, + "learning_rate": 1.4038578089586493e-05, + "loss": 0.8771, + "step": 2400 + }, + { + "epoch": 0.39, + "grad_norm": 1.2483468792100927, + "learning_rate": 1.4033802664546704e-05, + "loss": 0.9133, + "step": 2401 + }, + { + "epoch": 0.39, + "grad_norm": 1.591754700973777, + "learning_rate": 1.4029026140587876e-05, + "loss": 0.8767, + "step": 2402 + }, + { + "epoch": 0.39, + "grad_norm": 1.371542106718214, + "learning_rate": 1.4024248519011266e-05, + "loss": 0.8427, + "step": 2403 + }, + { + "epoch": 0.39, + "grad_norm": 1.5535073899273129, + "learning_rate": 1.4019469801118429e-05, + "loss": 0.8505, + "step": 2404 + }, + { + "epoch": 0.39, + "grad_norm": 1.8314805494354032, + "learning_rate": 1.4014689988211223e-05, + "loss": 0.8411, + "step": 2405 + }, + { + "epoch": 0.39, + "grad_norm": 1.3193028006073544, + "learning_rate": 1.4009909081591798e-05, + "loss": 0.9397, + "step": 2406 + }, + { + "epoch": 0.39, + "grad_norm": 1.3300142608972412, + "learning_rate": 1.4005127082562603e-05, + "loss": 0.8583, + "step": 2407 + }, + { + "epoch": 0.39, + "grad_norm": 1.375453318841157, + "learning_rate": 1.4000343992426391e-05, + "loss": 0.8603, + "step": 2408 + }, + { + "epoch": 0.39, + "grad_norm": 1.5195993834766162, + "learning_rate": 1.3995559812486205e-05, + "loss": 0.85, + "step": 2409 + }, + { + "epoch": 0.39, + "grad_norm": 1.7163499569419223, + "learning_rate": 1.399077454404539e-05, + "loss": 0.896, + "step": 2410 + }, + { + "epoch": 0.39, + "grad_norm": 1.1600600380932284, + "learning_rate": 1.398598818840758e-05, + "loss": 0.8758, + "step": 2411 + }, + { + "epoch": 0.39, + "grad_norm": 1.2530655527075543, + "learning_rate": 1.3981200746876713e-05, + "loss": 0.8189, + "step": 2412 + }, + { + "epoch": 0.39, + "grad_norm": 1.1740726570895015, + "learning_rate": 1.397641222075702e-05, + "loss": 0.9119, + "step": 2413 + }, + { + "epoch": 0.39, + "grad_norm": 1.3950908263470454, + "learning_rate": 1.397162261135303e-05, + "loss": 0.8324, + "step": 2414 + }, + { + "epoch": 0.39, + "grad_norm": 1.4170086989173616, + "learning_rate": 1.396683191996956e-05, + "loss": 0.854, + "step": 2415 + }, + { + "epoch": 0.39, + "grad_norm": 1.3969800062896214, + "learning_rate": 1.3962040147911731e-05, + "loss": 0.7899, + "step": 2416 + }, + { + "epoch": 0.39, + "grad_norm": 1.3521077625202005, + "learning_rate": 1.3957247296484948e-05, + "loss": 0.8687, + "step": 2417 + }, + { + "epoch": 0.39, + "grad_norm": 1.445349802360639, + "learning_rate": 1.3952453366994921e-05, + "loss": 0.9665, + "step": 2418 + }, + { + "epoch": 0.39, + "grad_norm": 1.5941380245820356, + "learning_rate": 1.3947658360747646e-05, + "loss": 0.898, + "step": 2419 + }, + { + "epoch": 0.39, + "grad_norm": 1.4172281598521912, + "learning_rate": 1.3942862279049418e-05, + "loss": 0.8759, + "step": 2420 + }, + { + "epoch": 0.39, + "grad_norm": 1.5887946157405524, + "learning_rate": 1.393806512320682e-05, + "loss": 0.8981, + "step": 2421 + }, + { + "epoch": 0.39, + "grad_norm": 1.2385421441988698, + "learning_rate": 1.3933266894526725e-05, + "loss": 0.8691, + "step": 2422 + }, + { + "epoch": 0.39, + "grad_norm": 1.4540322654254652, + "learning_rate": 1.392846759431631e-05, + "loss": 0.8785, + "step": 2423 + }, + { + "epoch": 0.39, + "grad_norm": 1.5417677430309058, + "learning_rate": 1.392366722388303e-05, + "loss": 0.8963, + "step": 2424 + }, + { + "epoch": 0.39, + "grad_norm": 1.5877710202933286, + "learning_rate": 1.3918865784534647e-05, + "loss": 0.9224, + "step": 2425 + }, + { + "epoch": 0.39, + "grad_norm": 1.3737331753464996, + "learning_rate": 1.3914063277579195e-05, + "loss": 0.8323, + "step": 2426 + }, + { + "epoch": 0.39, + "grad_norm": 0.9291795657350221, + "learning_rate": 1.3909259704325018e-05, + "loss": 0.3265, + "step": 2427 + }, + { + "epoch": 0.39, + "grad_norm": 1.3074598267548723, + "learning_rate": 1.3904455066080737e-05, + "loss": 0.8468, + "step": 2428 + }, + { + "epoch": 0.39, + "grad_norm": 1.4620030964665989, + "learning_rate": 1.389964936415527e-05, + "loss": 0.8903, + "step": 2429 + }, + { + "epoch": 0.39, + "grad_norm": 1.3884231087030878, + "learning_rate": 1.3894842599857818e-05, + "loss": 0.8725, + "step": 2430 + }, + { + "epoch": 0.39, + "grad_norm": 1.4230503618887171, + "learning_rate": 1.3890034774497884e-05, + "loss": 0.8849, + "step": 2431 + }, + { + "epoch": 0.39, + "grad_norm": 1.2181851640090477, + "learning_rate": 1.3885225889385247e-05, + "loss": 0.8085, + "step": 2432 + }, + { + "epoch": 0.39, + "grad_norm": 1.5692719800790655, + "learning_rate": 1.3880415945829979e-05, + "loss": 0.8339, + "step": 2433 + }, + { + "epoch": 0.39, + "grad_norm": 1.4769324004599105, + "learning_rate": 1.3875604945142445e-05, + "loss": 0.8536, + "step": 2434 + }, + { + "epoch": 0.39, + "grad_norm": 1.5356410646626897, + "learning_rate": 1.3870792888633293e-05, + "loss": 0.836, + "step": 2435 + }, + { + "epoch": 0.39, + "grad_norm": 0.8496642812986422, + "learning_rate": 1.3865979777613459e-05, + "loss": 0.3421, + "step": 2436 + }, + { + "epoch": 0.39, + "grad_norm": 0.902998542697636, + "learning_rate": 1.3861165613394163e-05, + "loss": 0.3262, + "step": 2437 + }, + { + "epoch": 0.39, + "grad_norm": 1.4805731183004274, + "learning_rate": 1.3856350397286926e-05, + "loss": 0.8918, + "step": 2438 + }, + { + "epoch": 0.39, + "grad_norm": 1.4858714328182152, + "learning_rate": 1.3851534130603535e-05, + "loss": 0.898, + "step": 2439 + }, + { + "epoch": 0.39, + "grad_norm": 1.5888001422534668, + "learning_rate": 1.384671681465608e-05, + "loss": 0.871, + "step": 2440 + }, + { + "epoch": 0.39, + "grad_norm": 1.431369314077371, + "learning_rate": 1.3841898450756933e-05, + "loss": 0.899, + "step": 2441 + }, + { + "epoch": 0.39, + "grad_norm": 1.4392971953326603, + "learning_rate": 1.383707904021874e-05, + "loss": 0.8709, + "step": 2442 + }, + { + "epoch": 0.39, + "grad_norm": 1.2756894288830853, + "learning_rate": 1.383225858435445e-05, + "loss": 0.9531, + "step": 2443 + }, + { + "epoch": 0.39, + "grad_norm": 1.2519655137780676, + "learning_rate": 1.3827437084477285e-05, + "loss": 0.8775, + "step": 2444 + }, + { + "epoch": 0.39, + "grad_norm": 1.57830987510728, + "learning_rate": 1.3822614541900751e-05, + "loss": 0.8995, + "step": 2445 + }, + { + "epoch": 0.39, + "grad_norm": 1.2262534736298292, + "learning_rate": 1.3817790957938648e-05, + "loss": 0.8664, + "step": 2446 + }, + { + "epoch": 0.39, + "grad_norm": 1.4233709655814089, + "learning_rate": 1.3812966333905052e-05, + "loss": 0.9363, + "step": 2447 + }, + { + "epoch": 0.39, + "grad_norm": 1.4578438543380297, + "learning_rate": 1.3808140671114316e-05, + "loss": 0.8649, + "step": 2448 + }, + { + "epoch": 0.39, + "grad_norm": 1.649130293725084, + "learning_rate": 1.3803313970881093e-05, + "loss": 0.8275, + "step": 2449 + }, + { + "epoch": 0.39, + "grad_norm": 1.4220256227843642, + "learning_rate": 1.3798486234520306e-05, + "loss": 0.8266, + "step": 2450 + }, + { + "epoch": 0.39, + "grad_norm": 1.3347601554698256, + "learning_rate": 1.3793657463347158e-05, + "loss": 0.8999, + "step": 2451 + }, + { + "epoch": 0.4, + "grad_norm": 1.2935304682078652, + "learning_rate": 1.3788827658677151e-05, + "loss": 0.8632, + "step": 2452 + }, + { + "epoch": 0.4, + "grad_norm": 1.2701500851569192, + "learning_rate": 1.3783996821826043e-05, + "loss": 0.8942, + "step": 2453 + }, + { + "epoch": 0.4, + "grad_norm": 1.2948400189633782, + "learning_rate": 1.37791649541099e-05, + "loss": 0.8906, + "step": 2454 + }, + { + "epoch": 0.4, + "grad_norm": 1.6108081221311763, + "learning_rate": 1.3774332056845047e-05, + "loss": 0.8447, + "step": 2455 + }, + { + "epoch": 0.4, + "grad_norm": 1.3971923514038544, + "learning_rate": 1.3769498131348102e-05, + "loss": 0.8839, + "step": 2456 + }, + { + "epoch": 0.4, + "grad_norm": 1.4268500254530283, + "learning_rate": 1.376466317893596e-05, + "loss": 0.8898, + "step": 2457 + }, + { + "epoch": 0.4, + "grad_norm": 1.3137124363855885, + "learning_rate": 1.3759827200925796e-05, + "loss": 0.8794, + "step": 2458 + }, + { + "epoch": 0.4, + "grad_norm": 1.6602536184143386, + "learning_rate": 1.375499019863506e-05, + "loss": 0.913, + "step": 2459 + }, + { + "epoch": 0.4, + "grad_norm": 1.3080705799663142, + "learning_rate": 1.3750152173381488e-05, + "loss": 0.917, + "step": 2460 + }, + { + "epoch": 0.4, + "grad_norm": 1.5480241173913074, + "learning_rate": 1.3745313126483089e-05, + "loss": 0.8507, + "step": 2461 + }, + { + "epoch": 0.4, + "grad_norm": 1.272820031210293, + "learning_rate": 1.3740473059258155e-05, + "loss": 0.8996, + "step": 2462 + }, + { + "epoch": 0.4, + "grad_norm": 1.5763211859091717, + "learning_rate": 1.3735631973025254e-05, + "loss": 0.8086, + "step": 2463 + }, + { + "epoch": 0.4, + "grad_norm": 1.477612631630901, + "learning_rate": 1.3730789869103227e-05, + "loss": 0.9521, + "step": 2464 + }, + { + "epoch": 0.4, + "grad_norm": 1.396855974774549, + "learning_rate": 1.3725946748811203e-05, + "loss": 0.846, + "step": 2465 + }, + { + "epoch": 0.4, + "grad_norm": 1.3262585287197404, + "learning_rate": 1.3721102613468578e-05, + "loss": 0.8947, + "step": 2466 + }, + { + "epoch": 0.4, + "grad_norm": 1.2660739251142574, + "learning_rate": 1.3716257464395026e-05, + "loss": 0.9009, + "step": 2467 + }, + { + "epoch": 0.4, + "grad_norm": 1.2530867373363663, + "learning_rate": 1.3711411302910504e-05, + "loss": 0.8496, + "step": 2468 + }, + { + "epoch": 0.4, + "grad_norm": 1.4175445103279811, + "learning_rate": 1.3706564130335236e-05, + "loss": 0.8775, + "step": 2469 + }, + { + "epoch": 0.4, + "grad_norm": 1.3125592279410676, + "learning_rate": 1.370171594798973e-05, + "loss": 0.9216, + "step": 2470 + }, + { + "epoch": 0.4, + "grad_norm": 1.1922923357579691, + "learning_rate": 1.3696866757194757e-05, + "loss": 0.8413, + "step": 2471 + }, + { + "epoch": 0.4, + "grad_norm": 1.4614569191972286, + "learning_rate": 1.3692016559271377e-05, + "loss": 0.9114, + "step": 2472 + }, + { + "epoch": 0.4, + "grad_norm": 1.3003547587881557, + "learning_rate": 1.3687165355540915e-05, + "loss": 0.8758, + "step": 2473 + }, + { + "epoch": 0.4, + "grad_norm": 1.425584269333321, + "learning_rate": 1.3682313147324972e-05, + "loss": 0.8967, + "step": 2474 + }, + { + "epoch": 0.4, + "grad_norm": 1.3289298928392963, + "learning_rate": 1.3677459935945425e-05, + "loss": 0.8635, + "step": 2475 + }, + { + "epoch": 0.4, + "grad_norm": 1.4773843489489369, + "learning_rate": 1.3672605722724422e-05, + "loss": 0.9136, + "step": 2476 + }, + { + "epoch": 0.4, + "grad_norm": 1.4183727091611855, + "learning_rate": 1.3667750508984383e-05, + "loss": 0.8227, + "step": 2477 + }, + { + "epoch": 0.4, + "grad_norm": 1.5000880663818352, + "learning_rate": 1.3662894296048004e-05, + "loss": 0.9126, + "step": 2478 + }, + { + "epoch": 0.4, + "grad_norm": 1.4049797652567548, + "learning_rate": 1.365803708523825e-05, + "loss": 0.9545, + "step": 2479 + }, + { + "epoch": 0.4, + "grad_norm": 1.6061232681736757, + "learning_rate": 1.365317887787836e-05, + "loss": 0.8535, + "step": 2480 + }, + { + "epoch": 0.4, + "grad_norm": 1.7455157869149789, + "learning_rate": 1.3648319675291842e-05, + "loss": 0.9012, + "step": 2481 + }, + { + "epoch": 0.4, + "grad_norm": 1.4901183284407453, + "learning_rate": 1.3643459478802479e-05, + "loss": 0.9315, + "step": 2482 + }, + { + "epoch": 0.4, + "grad_norm": 1.4792642505927425, + "learning_rate": 1.3638598289734321e-05, + "loss": 0.8216, + "step": 2483 + }, + { + "epoch": 0.4, + "grad_norm": 1.6928427346986987, + "learning_rate": 1.3633736109411691e-05, + "loss": 0.8351, + "step": 2484 + }, + { + "epoch": 0.4, + "grad_norm": 0.864568784231787, + "learning_rate": 1.362887293915918e-05, + "loss": 0.3327, + "step": 2485 + }, + { + "epoch": 0.4, + "grad_norm": 1.3418242106089469, + "learning_rate": 1.362400878030165e-05, + "loss": 0.9269, + "step": 2486 + }, + { + "epoch": 0.4, + "grad_norm": 1.7172047756495814, + "learning_rate": 1.3619143634164234e-05, + "loss": 0.8961, + "step": 2487 + }, + { + "epoch": 0.4, + "grad_norm": 1.3602502741351972, + "learning_rate": 1.3614277502072327e-05, + "loss": 0.8704, + "step": 2488 + }, + { + "epoch": 0.4, + "grad_norm": 1.6566686438538933, + "learning_rate": 1.3609410385351598e-05, + "loss": 0.9263, + "step": 2489 + }, + { + "epoch": 0.4, + "grad_norm": 1.4854773688224303, + "learning_rate": 1.3604542285327988e-05, + "loss": 0.7858, + "step": 2490 + }, + { + "epoch": 0.4, + "grad_norm": 1.3336200041943738, + "learning_rate": 1.3599673203327702e-05, + "loss": 0.8677, + "step": 2491 + }, + { + "epoch": 0.4, + "grad_norm": 1.4551009893221094, + "learning_rate": 1.3594803140677208e-05, + "loss": 0.8108, + "step": 2492 + }, + { + "epoch": 0.4, + "grad_norm": 1.274539173313676, + "learning_rate": 1.358993209870325e-05, + "loss": 0.8279, + "step": 2493 + }, + { + "epoch": 0.4, + "grad_norm": 1.3199988964748557, + "learning_rate": 1.3585060078732827e-05, + "loss": 0.9708, + "step": 2494 + }, + { + "epoch": 0.4, + "grad_norm": 1.4590130321849322, + "learning_rate": 1.3580187082093217e-05, + "loss": 0.8396, + "step": 2495 + }, + { + "epoch": 0.4, + "grad_norm": 1.4208027758300172, + "learning_rate": 1.3575313110111958e-05, + "loss": 0.849, + "step": 2496 + }, + { + "epoch": 0.4, + "grad_norm": 1.366423111174138, + "learning_rate": 1.3570438164116852e-05, + "loss": 0.8966, + "step": 2497 + }, + { + "epoch": 0.4, + "grad_norm": 1.4501265557683753, + "learning_rate": 1.3565562245435974e-05, + "loss": 0.8714, + "step": 2498 + }, + { + "epoch": 0.4, + "grad_norm": 1.464461936145771, + "learning_rate": 1.3560685355397651e-05, + "loss": 0.9147, + "step": 2499 + }, + { + "epoch": 0.4, + "grad_norm": 1.4975417005284506, + "learning_rate": 1.355580749533049e-05, + "loss": 0.8207, + "step": 2500 + }, + { + "epoch": 0.4, + "grad_norm": 1.2715553988632624, + "learning_rate": 1.3550928666563348e-05, + "loss": 0.9026, + "step": 2501 + }, + { + "epoch": 0.4, + "grad_norm": 1.4911337778180596, + "learning_rate": 1.3546048870425356e-05, + "loss": 0.9768, + "step": 2502 + }, + { + "epoch": 0.4, + "grad_norm": 1.5085502435504217, + "learning_rate": 1.3541168108245907e-05, + "loss": 0.803, + "step": 2503 + }, + { + "epoch": 0.4, + "grad_norm": 1.233563560100716, + "learning_rate": 1.3536286381354651e-05, + "loss": 0.8709, + "step": 2504 + }, + { + "epoch": 0.4, + "grad_norm": 1.5068261819302413, + "learning_rate": 1.3531403691081505e-05, + "loss": 0.869, + "step": 2505 + }, + { + "epoch": 0.4, + "grad_norm": 1.165559616409158, + "learning_rate": 1.352652003875665e-05, + "loss": 0.8803, + "step": 2506 + }, + { + "epoch": 0.4, + "grad_norm": 1.4923555756324929, + "learning_rate": 1.3521635425710531e-05, + "loss": 0.7875, + "step": 2507 + }, + { + "epoch": 0.4, + "grad_norm": 1.2477350539435565, + "learning_rate": 1.351674985327384e-05, + "loss": 0.8019, + "step": 2508 + }, + { + "epoch": 0.4, + "grad_norm": 1.446911170607255, + "learning_rate": 1.3511863322777557e-05, + "loss": 0.7207, + "step": 2509 + }, + { + "epoch": 0.4, + "grad_norm": 1.5000339245933587, + "learning_rate": 1.3506975835552894e-05, + "loss": 0.8816, + "step": 2510 + }, + { + "epoch": 0.4, + "grad_norm": 1.214076603374283, + "learning_rate": 1.3502087392931347e-05, + "loss": 0.877, + "step": 2511 + }, + { + "epoch": 0.4, + "grad_norm": 1.2646272252487343, + "learning_rate": 1.3497197996244655e-05, + "loss": 0.8897, + "step": 2512 + }, + { + "epoch": 0.4, + "grad_norm": 0.7950867053199877, + "learning_rate": 1.3492307646824832e-05, + "loss": 0.3627, + "step": 2513 + }, + { + "epoch": 0.41, + "grad_norm": 1.3758026138988586, + "learning_rate": 1.3487416346004139e-05, + "loss": 0.8413, + "step": 2514 + }, + { + "epoch": 0.41, + "grad_norm": 1.3099062003135604, + "learning_rate": 1.3482524095115099e-05, + "loss": 0.9307, + "step": 2515 + }, + { + "epoch": 0.41, + "grad_norm": 1.5371718000171308, + "learning_rate": 1.3477630895490501e-05, + "loss": 0.8679, + "step": 2516 + }, + { + "epoch": 0.41, + "grad_norm": 1.3586745279810586, + "learning_rate": 1.3472736748463387e-05, + "loss": 0.897, + "step": 2517 + }, + { + "epoch": 0.41, + "grad_norm": 1.484723316283319, + "learning_rate": 1.3467841655367058e-05, + "loss": 0.8904, + "step": 2518 + }, + { + "epoch": 0.41, + "grad_norm": 1.4734172145098396, + "learning_rate": 1.3462945617535063e-05, + "loss": 0.9074, + "step": 2519 + }, + { + "epoch": 0.41, + "grad_norm": 1.1588953805588347, + "learning_rate": 1.3458048636301233e-05, + "loss": 0.9002, + "step": 2520 + }, + { + "epoch": 0.41, + "grad_norm": 1.34901673158023, + "learning_rate": 1.3453150712999628e-05, + "loss": 0.8588, + "step": 2521 + }, + { + "epoch": 0.41, + "grad_norm": 1.6542838111866047, + "learning_rate": 1.3448251848964584e-05, + "loss": 0.812, + "step": 2522 + }, + { + "epoch": 0.41, + "grad_norm": 1.505706654435004, + "learning_rate": 1.3443352045530685e-05, + "loss": 0.8031, + "step": 2523 + }, + { + "epoch": 0.41, + "grad_norm": 1.6319209658227931, + "learning_rate": 1.3438451304032769e-05, + "loss": 0.8741, + "step": 2524 + }, + { + "epoch": 0.41, + "grad_norm": 1.177064904628516, + "learning_rate": 1.3433549625805941e-05, + "loss": 0.7971, + "step": 2525 + }, + { + "epoch": 0.41, + "grad_norm": 1.2198736714450558, + "learning_rate": 1.3428647012185545e-05, + "loss": 0.9194, + "step": 2526 + }, + { + "epoch": 0.41, + "grad_norm": 1.5395112085504292, + "learning_rate": 1.3423743464507192e-05, + "loss": 0.8614, + "step": 2527 + }, + { + "epoch": 0.41, + "grad_norm": 1.3943315998848314, + "learning_rate": 1.3418838984106746e-05, + "loss": 0.8335, + "step": 2528 + }, + { + "epoch": 0.41, + "grad_norm": 1.2866153910544904, + "learning_rate": 1.3413933572320317e-05, + "loss": 0.9011, + "step": 2529 + }, + { + "epoch": 0.41, + "grad_norm": 1.3563332109700361, + "learning_rate": 1.3409027230484279e-05, + "loss": 0.8919, + "step": 2530 + }, + { + "epoch": 0.41, + "grad_norm": 1.3613020663182294, + "learning_rate": 1.3404119959935254e-05, + "loss": 0.7869, + "step": 2531 + }, + { + "epoch": 0.41, + "grad_norm": 1.417774532615658, + "learning_rate": 1.3399211762010117e-05, + "loss": 0.8858, + "step": 2532 + }, + { + "epoch": 0.41, + "grad_norm": 1.4025287240532758, + "learning_rate": 1.3394302638045992e-05, + "loss": 0.8782, + "step": 2533 + }, + { + "epoch": 0.41, + "grad_norm": 1.3302872168883468, + "learning_rate": 1.3389392589380265e-05, + "loss": 0.8702, + "step": 2534 + }, + { + "epoch": 0.41, + "grad_norm": 1.3075191782999955, + "learning_rate": 1.3384481617350572e-05, + "loss": 0.9001, + "step": 2535 + }, + { + "epoch": 0.41, + "grad_norm": 1.3280330170306085, + "learning_rate": 1.337956972329479e-05, + "loss": 0.8165, + "step": 2536 + }, + { + "epoch": 0.41, + "grad_norm": 0.9618068286745403, + "learning_rate": 1.3374656908551055e-05, + "loss": 0.3383, + "step": 2537 + }, + { + "epoch": 0.41, + "grad_norm": 1.9820078736957742, + "learning_rate": 1.3369743174457756e-05, + "loss": 0.8589, + "step": 2538 + }, + { + "epoch": 0.41, + "grad_norm": 1.4487974127617518, + "learning_rate": 1.336482852235353e-05, + "loss": 0.8904, + "step": 2539 + }, + { + "epoch": 0.41, + "grad_norm": 1.4180925580559416, + "learning_rate": 1.3359912953577261e-05, + "loss": 0.8851, + "step": 2540 + }, + { + "epoch": 0.41, + "grad_norm": 1.4103470947909094, + "learning_rate": 1.335499646946809e-05, + "loss": 0.9073, + "step": 2541 + }, + { + "epoch": 0.41, + "grad_norm": 1.1586714010203598, + "learning_rate": 1.3350079071365398e-05, + "loss": 0.806, + "step": 2542 + }, + { + "epoch": 0.41, + "grad_norm": 1.2868201490887554, + "learning_rate": 1.334516076060882e-05, + "loss": 0.8684, + "step": 2543 + }, + { + "epoch": 0.41, + "grad_norm": 1.3294892863206522, + "learning_rate": 1.334024153853824e-05, + "loss": 0.8603, + "step": 2544 + }, + { + "epoch": 0.41, + "grad_norm": 1.4313667566894113, + "learning_rate": 1.3335321406493795e-05, + "loss": 0.8601, + "step": 2545 + }, + { + "epoch": 0.41, + "grad_norm": 1.3237249911795446, + "learning_rate": 1.3330400365815856e-05, + "loss": 0.8671, + "step": 2546 + }, + { + "epoch": 0.41, + "grad_norm": 1.4722250369067422, + "learning_rate": 1.3325478417845057e-05, + "loss": 0.8923, + "step": 2547 + }, + { + "epoch": 0.41, + "grad_norm": 1.3990681850392614, + "learning_rate": 1.332055556392227e-05, + "loss": 0.9051, + "step": 2548 + }, + { + "epoch": 0.41, + "grad_norm": 1.3288438955049955, + "learning_rate": 1.3315631805388613e-05, + "loss": 0.8843, + "step": 2549 + }, + { + "epoch": 0.41, + "grad_norm": 1.4187240664384753, + "learning_rate": 1.3310707143585458e-05, + "loss": 0.8413, + "step": 2550 + }, + { + "epoch": 0.41, + "grad_norm": 1.3800066280227417, + "learning_rate": 1.3305781579854417e-05, + "loss": 0.9358, + "step": 2551 + }, + { + "epoch": 0.41, + "grad_norm": 1.3792045334128331, + "learning_rate": 1.3300855115537347e-05, + "loss": 0.9177, + "step": 2552 + }, + { + "epoch": 0.41, + "grad_norm": 1.491787464884296, + "learning_rate": 1.3295927751976358e-05, + "loss": 0.923, + "step": 2553 + }, + { + "epoch": 0.41, + "grad_norm": 1.3572395593347804, + "learning_rate": 1.3290999490513796e-05, + "loss": 0.8991, + "step": 2554 + }, + { + "epoch": 0.41, + "grad_norm": 1.2575532908232072, + "learning_rate": 1.3286070332492256e-05, + "loss": 0.9602, + "step": 2555 + }, + { + "epoch": 0.41, + "grad_norm": 1.2735467283233441, + "learning_rate": 1.3281140279254576e-05, + "loss": 0.8785, + "step": 2556 + }, + { + "epoch": 0.41, + "grad_norm": 1.2458656675547382, + "learning_rate": 1.3276209332143836e-05, + "loss": 0.8074, + "step": 2557 + }, + { + "epoch": 0.41, + "grad_norm": 1.2705728300202341, + "learning_rate": 1.327127749250337e-05, + "loss": 0.874, + "step": 2558 + }, + { + "epoch": 0.41, + "grad_norm": 1.416496249220626, + "learning_rate": 1.3266344761676735e-05, + "loss": 0.8076, + "step": 2559 + }, + { + "epoch": 0.41, + "grad_norm": 1.5030210800590615, + "learning_rate": 1.3261411141007757e-05, + "loss": 0.8775, + "step": 2560 + }, + { + "epoch": 0.41, + "grad_norm": 1.5277822418731677, + "learning_rate": 1.3256476631840478e-05, + "loss": 0.8721, + "step": 2561 + }, + { + "epoch": 0.41, + "grad_norm": 1.4173015888312606, + "learning_rate": 1.3251541235519199e-05, + "loss": 0.9598, + "step": 2562 + }, + { + "epoch": 0.41, + "grad_norm": 0.8482713780177396, + "learning_rate": 1.324660495338846e-05, + "loss": 0.3593, + "step": 2563 + }, + { + "epoch": 0.41, + "grad_norm": 1.455733430756094, + "learning_rate": 1.324166778679304e-05, + "loss": 0.8761, + "step": 2564 + }, + { + "epoch": 0.41, + "grad_norm": 1.3243371592669817, + "learning_rate": 1.3236729737077956e-05, + "loss": 0.8653, + "step": 2565 + }, + { + "epoch": 0.41, + "grad_norm": 1.3623856523853173, + "learning_rate": 1.3231790805588469e-05, + "loss": 0.9093, + "step": 2566 + }, + { + "epoch": 0.41, + "grad_norm": 1.3138929309820877, + "learning_rate": 1.3226850993670087e-05, + "loss": 0.842, + "step": 2567 + }, + { + "epoch": 0.41, + "grad_norm": 1.3399232077601675, + "learning_rate": 1.3221910302668538e-05, + "loss": 0.8972, + "step": 2568 + }, + { + "epoch": 0.41, + "grad_norm": 1.4822591389479125, + "learning_rate": 1.3216968733929817e-05, + "loss": 0.7926, + "step": 2569 + }, + { + "epoch": 0.41, + "grad_norm": 1.5428916936054882, + "learning_rate": 1.321202628880013e-05, + "loss": 0.8091, + "step": 2570 + }, + { + "epoch": 0.41, + "grad_norm": 1.722716451531505, + "learning_rate": 1.3207082968625948e-05, + "loss": 0.8542, + "step": 2571 + }, + { + "epoch": 0.41, + "grad_norm": 1.3941513146353774, + "learning_rate": 1.3202138774753958e-05, + "loss": 0.8951, + "step": 2572 + }, + { + "epoch": 0.41, + "grad_norm": 1.4681943694672512, + "learning_rate": 1.3197193708531099e-05, + "loss": 0.8389, + "step": 2573 + }, + { + "epoch": 0.41, + "grad_norm": 1.1803899575802237, + "learning_rate": 1.3192247771304543e-05, + "loss": 0.8591, + "step": 2574 + }, + { + "epoch": 0.41, + "grad_norm": 1.1760188984152213, + "learning_rate": 1.3187300964421702e-05, + "loss": 0.853, + "step": 2575 + }, + { + "epoch": 0.42, + "grad_norm": 1.6497706524127025, + "learning_rate": 1.3182353289230216e-05, + "loss": 0.9069, + "step": 2576 + }, + { + "epoch": 0.42, + "grad_norm": 1.4897179977583297, + "learning_rate": 1.3177404747077973e-05, + "loss": 0.843, + "step": 2577 + }, + { + "epoch": 0.42, + "grad_norm": 1.2520766554365843, + "learning_rate": 1.3172455339313091e-05, + "loss": 0.8505, + "step": 2578 + }, + { + "epoch": 0.42, + "grad_norm": 1.3593661832797572, + "learning_rate": 1.3167505067283926e-05, + "loss": 0.897, + "step": 2579 + }, + { + "epoch": 0.42, + "grad_norm": 1.4025352083011648, + "learning_rate": 1.3162553932339068e-05, + "loss": 0.9526, + "step": 2580 + }, + { + "epoch": 0.42, + "grad_norm": 1.4569447360036825, + "learning_rate": 1.315760193582734e-05, + "loss": 0.7865, + "step": 2581 + }, + { + "epoch": 0.42, + "grad_norm": 1.4870978178352274, + "learning_rate": 1.3152649079097808e-05, + "loss": 0.9068, + "step": 2582 + }, + { + "epoch": 0.42, + "grad_norm": 1.4880688420605621, + "learning_rate": 1.3147695363499762e-05, + "loss": 0.7927, + "step": 2583 + }, + { + "epoch": 0.42, + "grad_norm": 1.2447239161220391, + "learning_rate": 1.3142740790382733e-05, + "loss": 0.8211, + "step": 2584 + }, + { + "epoch": 0.42, + "grad_norm": 1.3306044742843068, + "learning_rate": 1.3137785361096485e-05, + "loss": 0.8269, + "step": 2585 + }, + { + "epoch": 0.42, + "grad_norm": 1.3099201931183027, + "learning_rate": 1.3132829076991007e-05, + "loss": 0.8618, + "step": 2586 + }, + { + "epoch": 0.42, + "grad_norm": 0.9106767858361681, + "learning_rate": 1.3127871939416533e-05, + "loss": 0.31, + "step": 2587 + }, + { + "epoch": 0.42, + "grad_norm": 1.2283520978232152, + "learning_rate": 1.3122913949723522e-05, + "loss": 0.9179, + "step": 2588 + }, + { + "epoch": 0.42, + "grad_norm": 1.2005677580368528, + "learning_rate": 1.3117955109262668e-05, + "loss": 0.9024, + "step": 2589 + }, + { + "epoch": 0.42, + "grad_norm": 1.263292792296421, + "learning_rate": 1.3112995419384894e-05, + "loss": 0.89, + "step": 2590 + }, + { + "epoch": 0.42, + "grad_norm": 1.2336260932079128, + "learning_rate": 1.3108034881441359e-05, + "loss": 0.9273, + "step": 2591 + }, + { + "epoch": 0.42, + "grad_norm": 1.4880816877789842, + "learning_rate": 1.3103073496783447e-05, + "loss": 0.8324, + "step": 2592 + }, + { + "epoch": 0.42, + "grad_norm": 1.6044059133523552, + "learning_rate": 1.309811126676278e-05, + "loss": 0.8335, + "step": 2593 + }, + { + "epoch": 0.42, + "grad_norm": 1.4301598164770153, + "learning_rate": 1.3093148192731202e-05, + "loss": 0.9194, + "step": 2594 + }, + { + "epoch": 0.42, + "grad_norm": 1.252018050535171, + "learning_rate": 1.3088184276040794e-05, + "loss": 0.8713, + "step": 2595 + }, + { + "epoch": 0.42, + "grad_norm": 1.5921118664489209, + "learning_rate": 1.3083219518043866e-05, + "loss": 0.9194, + "step": 2596 + }, + { + "epoch": 0.42, + "grad_norm": 1.3405381890124524, + "learning_rate": 1.3078253920092949e-05, + "loss": 0.84, + "step": 2597 + }, + { + "epoch": 0.42, + "grad_norm": 1.4458761276096601, + "learning_rate": 1.3073287483540811e-05, + "loss": 0.9318, + "step": 2598 + }, + { + "epoch": 0.42, + "grad_norm": 1.338476422419945, + "learning_rate": 1.3068320209740448e-05, + "loss": 0.9278, + "step": 2599 + }, + { + "epoch": 0.42, + "grad_norm": 1.4647763974282297, + "learning_rate": 1.3063352100045079e-05, + "loss": 0.9142, + "step": 2600 + }, + { + "epoch": 0.42, + "grad_norm": 1.5151063680605492, + "learning_rate": 1.3058383155808159e-05, + "loss": 0.923, + "step": 2601 + }, + { + "epoch": 0.42, + "grad_norm": 1.512645373372797, + "learning_rate": 1.3053413378383361e-05, + "loss": 0.9433, + "step": 2602 + }, + { + "epoch": 0.42, + "grad_norm": 1.2796770914001458, + "learning_rate": 1.304844276912459e-05, + "loss": 0.8676, + "step": 2603 + }, + { + "epoch": 0.42, + "grad_norm": 0.8416558820272357, + "learning_rate": 1.3043471329385979e-05, + "loss": 0.3275, + "step": 2604 + }, + { + "epoch": 0.42, + "grad_norm": 1.256285641055514, + "learning_rate": 1.3038499060521886e-05, + "loss": 0.834, + "step": 2605 + }, + { + "epoch": 0.42, + "grad_norm": 1.6829836455198537, + "learning_rate": 1.3033525963886888e-05, + "loss": 0.9554, + "step": 2606 + }, + { + "epoch": 0.42, + "grad_norm": 1.2218507346624357, + "learning_rate": 1.3028552040835802e-05, + "loss": 0.8236, + "step": 2607 + }, + { + "epoch": 0.42, + "grad_norm": 1.3481418108057788, + "learning_rate": 1.3023577292723655e-05, + "loss": 0.8983, + "step": 2608 + }, + { + "epoch": 0.42, + "grad_norm": 1.3004050423185876, + "learning_rate": 1.3018601720905708e-05, + "loss": 0.9056, + "step": 2609 + }, + { + "epoch": 0.42, + "grad_norm": 1.318733892226603, + "learning_rate": 1.3013625326737444e-05, + "loss": 0.8386, + "step": 2610 + }, + { + "epoch": 0.42, + "grad_norm": 1.452227004798366, + "learning_rate": 1.300864811157457e-05, + "loss": 0.9305, + "step": 2611 + }, + { + "epoch": 0.42, + "grad_norm": 1.4335034092494865, + "learning_rate": 1.3003670076773018e-05, + "loss": 0.8872, + "step": 2612 + }, + { + "epoch": 0.42, + "grad_norm": 1.4721167149011813, + "learning_rate": 1.2998691223688942e-05, + "loss": 0.8891, + "step": 2613 + }, + { + "epoch": 0.42, + "grad_norm": 1.1829638425891258, + "learning_rate": 1.2993711553678714e-05, + "loss": 0.7439, + "step": 2614 + }, + { + "epoch": 0.42, + "grad_norm": 1.7452711641535947, + "learning_rate": 1.2988731068098938e-05, + "loss": 0.8013, + "step": 2615 + }, + { + "epoch": 0.42, + "grad_norm": 1.6827409845951422, + "learning_rate": 1.2983749768306434e-05, + "loss": 0.8366, + "step": 2616 + }, + { + "epoch": 0.42, + "grad_norm": 1.1555665656819616, + "learning_rate": 1.2978767655658245e-05, + "loss": 0.9188, + "step": 2617 + }, + { + "epoch": 0.42, + "grad_norm": 1.41296337416045, + "learning_rate": 1.2973784731511638e-05, + "loss": 0.8922, + "step": 2618 + }, + { + "epoch": 0.42, + "grad_norm": 1.300809667322695, + "learning_rate": 1.2968800997224093e-05, + "loss": 0.8401, + "step": 2619 + }, + { + "epoch": 0.42, + "grad_norm": 1.4572320945418977, + "learning_rate": 1.2963816454153327e-05, + "loss": 0.884, + "step": 2620 + }, + { + "epoch": 0.42, + "grad_norm": 1.366138714019059, + "learning_rate": 1.2958831103657255e-05, + "loss": 0.8169, + "step": 2621 + }, + { + "epoch": 0.42, + "grad_norm": 2.074652625537922, + "learning_rate": 1.2953844947094032e-05, + "loss": 0.9005, + "step": 2622 + }, + { + "epoch": 0.42, + "grad_norm": 1.5331221097449668, + "learning_rate": 1.2948857985822023e-05, + "loss": 0.8465, + "step": 2623 + }, + { + "epoch": 0.42, + "grad_norm": 1.9361767961179173, + "learning_rate": 1.2943870221199814e-05, + "loss": 0.8314, + "step": 2624 + }, + { + "epoch": 0.42, + "grad_norm": 1.4216558384457334, + "learning_rate": 1.2938881654586207e-05, + "loss": 0.884, + "step": 2625 + }, + { + "epoch": 0.42, + "grad_norm": 1.300658750064086, + "learning_rate": 1.2933892287340226e-05, + "loss": 0.8771, + "step": 2626 + }, + { + "epoch": 0.42, + "grad_norm": 1.5011052614431577, + "learning_rate": 1.2928902120821111e-05, + "loss": 0.8303, + "step": 2627 + }, + { + "epoch": 0.42, + "grad_norm": 1.4151939566629255, + "learning_rate": 1.2923911156388327e-05, + "loss": 0.8905, + "step": 2628 + }, + { + "epoch": 0.42, + "grad_norm": 1.286310763637221, + "learning_rate": 1.2918919395401544e-05, + "loss": 0.8189, + "step": 2629 + }, + { + "epoch": 0.42, + "grad_norm": 1.473585937303035, + "learning_rate": 1.2913926839220654e-05, + "loss": 0.8695, + "step": 2630 + }, + { + "epoch": 0.42, + "grad_norm": 1.56640376533159, + "learning_rate": 1.2908933489205776e-05, + "loss": 0.8711, + "step": 2631 + }, + { + "epoch": 0.42, + "grad_norm": 1.7078866446708545, + "learning_rate": 1.2903939346717226e-05, + "loss": 0.8863, + "step": 2632 + }, + { + "epoch": 0.42, + "grad_norm": 1.529668886969601, + "learning_rate": 1.2898944413115553e-05, + "loss": 0.9429, + "step": 2633 + }, + { + "epoch": 0.42, + "grad_norm": 1.4737030123806616, + "learning_rate": 1.2893948689761509e-05, + "loss": 0.8652, + "step": 2634 + }, + { + "epoch": 0.42, + "grad_norm": 1.4191085290993761, + "learning_rate": 1.2888952178016075e-05, + "loss": 0.8948, + "step": 2635 + }, + { + "epoch": 0.42, + "grad_norm": 1.5087838614167377, + "learning_rate": 1.2883954879240428e-05, + "loss": 0.8405, + "step": 2636 + }, + { + "epoch": 0.42, + "grad_norm": 1.5296946467445982, + "learning_rate": 1.2878956794795978e-05, + "loss": 0.8923, + "step": 2637 + }, + { + "epoch": 0.43, + "grad_norm": 1.6408508661878922, + "learning_rate": 1.2873957926044336e-05, + "loss": 0.9277, + "step": 2638 + }, + { + "epoch": 0.43, + "grad_norm": 0.8587017266729103, + "learning_rate": 1.2868958274347334e-05, + "loss": 0.3524, + "step": 2639 + }, + { + "epoch": 0.43, + "grad_norm": 1.504428968991861, + "learning_rate": 1.2863957841067018e-05, + "loss": 0.8191, + "step": 2640 + }, + { + "epoch": 0.43, + "grad_norm": 1.2794169966525712, + "learning_rate": 1.2858956627565635e-05, + "loss": 0.8926, + "step": 2641 + }, + { + "epoch": 0.43, + "grad_norm": 1.469706023189959, + "learning_rate": 1.2853954635205665e-05, + "loss": 0.8518, + "step": 2642 + }, + { + "epoch": 0.43, + "grad_norm": 1.3393238326733468, + "learning_rate": 1.2848951865349778e-05, + "loss": 0.8402, + "step": 2643 + }, + { + "epoch": 0.43, + "grad_norm": 1.3353973824938152, + "learning_rate": 1.2843948319360875e-05, + "loss": 0.9042, + "step": 2644 + }, + { + "epoch": 0.43, + "grad_norm": 1.3159054424701762, + "learning_rate": 1.2838943998602054e-05, + "loss": 0.8658, + "step": 2645 + }, + { + "epoch": 0.43, + "grad_norm": 1.2605262869922957, + "learning_rate": 1.2833938904436634e-05, + "loss": 0.8761, + "step": 2646 + }, + { + "epoch": 0.43, + "grad_norm": 1.4916405761479787, + "learning_rate": 1.2828933038228136e-05, + "loss": 0.8372, + "step": 2647 + }, + { + "epoch": 0.43, + "grad_norm": 1.3571887701209984, + "learning_rate": 1.2823926401340296e-05, + "loss": 0.8497, + "step": 2648 + }, + { + "epoch": 0.43, + "grad_norm": 1.5049692453395742, + "learning_rate": 1.2818918995137066e-05, + "loss": 0.9612, + "step": 2649 + }, + { + "epoch": 0.43, + "grad_norm": 1.2500086750534154, + "learning_rate": 1.2813910820982596e-05, + "loss": 0.844, + "step": 2650 + }, + { + "epoch": 0.43, + "grad_norm": 1.6582192924529107, + "learning_rate": 1.2808901880241257e-05, + "loss": 0.7362, + "step": 2651 + }, + { + "epoch": 0.43, + "grad_norm": 1.5266517482514566, + "learning_rate": 1.2803892174277611e-05, + "loss": 0.7828, + "step": 2652 + }, + { + "epoch": 0.43, + "grad_norm": 1.4871359483325157, + "learning_rate": 1.2798881704456452e-05, + "loss": 0.8531, + "step": 2653 + }, + { + "epoch": 0.43, + "grad_norm": 1.1410052652780025, + "learning_rate": 1.2793870472142762e-05, + "loss": 0.8159, + "step": 2654 + }, + { + "epoch": 0.43, + "grad_norm": 1.3196910744423755, + "learning_rate": 1.2788858478701745e-05, + "loss": 0.8573, + "step": 2655 + }, + { + "epoch": 0.43, + "grad_norm": 1.2802450488250747, + "learning_rate": 1.27838457254988e-05, + "loss": 0.8645, + "step": 2656 + }, + { + "epoch": 0.43, + "grad_norm": 1.4046888308471044, + "learning_rate": 1.277883221389954e-05, + "loss": 0.8484, + "step": 2657 + }, + { + "epoch": 0.43, + "grad_norm": 1.28613889152579, + "learning_rate": 1.2773817945269793e-05, + "loss": 0.852, + "step": 2658 + }, + { + "epoch": 0.43, + "grad_norm": 1.4472431504155767, + "learning_rate": 1.2768802920975574e-05, + "loss": 0.8969, + "step": 2659 + }, + { + "epoch": 0.43, + "grad_norm": 1.2695417653161836, + "learning_rate": 1.2763787142383115e-05, + "loss": 0.8396, + "step": 2660 + }, + { + "epoch": 0.43, + "grad_norm": 1.5010635596473825, + "learning_rate": 1.2758770610858855e-05, + "loss": 0.8211, + "step": 2661 + }, + { + "epoch": 0.43, + "grad_norm": 1.3933435749974046, + "learning_rate": 1.2753753327769437e-05, + "loss": 0.8946, + "step": 2662 + }, + { + "epoch": 0.43, + "grad_norm": 1.2241554065242581, + "learning_rate": 1.2748735294481702e-05, + "loss": 0.868, + "step": 2663 + }, + { + "epoch": 0.43, + "grad_norm": 1.3454995838810169, + "learning_rate": 1.2743716512362705e-05, + "loss": 0.905, + "step": 2664 + }, + { + "epoch": 0.43, + "grad_norm": 1.7739553656448708, + "learning_rate": 1.2738696982779699e-05, + "loss": 0.9268, + "step": 2665 + }, + { + "epoch": 0.43, + "grad_norm": 1.474368299268508, + "learning_rate": 1.273367670710014e-05, + "loss": 0.8748, + "step": 2666 + }, + { + "epoch": 0.43, + "grad_norm": 1.7043444807384016, + "learning_rate": 1.2728655686691693e-05, + "loss": 0.9116, + "step": 2667 + }, + { + "epoch": 0.43, + "grad_norm": 1.467698117504435, + "learning_rate": 1.2723633922922222e-05, + "loss": 0.8484, + "step": 2668 + }, + { + "epoch": 0.43, + "grad_norm": 1.4596530449307805, + "learning_rate": 1.2718611417159793e-05, + "loss": 0.7909, + "step": 2669 + }, + { + "epoch": 0.43, + "grad_norm": 1.2975092684782483, + "learning_rate": 1.2713588170772674e-05, + "loss": 0.92, + "step": 2670 + }, + { + "epoch": 0.43, + "grad_norm": 1.467569241180824, + "learning_rate": 1.2708564185129339e-05, + "loss": 0.7674, + "step": 2671 + }, + { + "epoch": 0.43, + "grad_norm": 1.5221612175260488, + "learning_rate": 1.2703539461598455e-05, + "loss": 0.9167, + "step": 2672 + }, + { + "epoch": 0.43, + "grad_norm": 1.254653860424454, + "learning_rate": 1.2698514001548904e-05, + "loss": 0.8457, + "step": 2673 + }, + { + "epoch": 0.43, + "grad_norm": 1.2960087155045183, + "learning_rate": 1.2693487806349744e-05, + "loss": 0.8458, + "step": 2674 + }, + { + "epoch": 0.43, + "grad_norm": 1.3460064173888369, + "learning_rate": 1.2688460877370269e-05, + "loss": 0.8943, + "step": 2675 + }, + { + "epoch": 0.43, + "grad_norm": 1.4337478179414087, + "learning_rate": 1.2683433215979937e-05, + "loss": 0.9368, + "step": 2676 + }, + { + "epoch": 0.43, + "grad_norm": 1.47673147486063, + "learning_rate": 1.267840482354843e-05, + "loss": 0.8816, + "step": 2677 + }, + { + "epoch": 0.43, + "grad_norm": 1.5435719412580633, + "learning_rate": 1.2673375701445618e-05, + "loss": 0.8417, + "step": 2678 + }, + { + "epoch": 0.43, + "grad_norm": 1.2359780603931836, + "learning_rate": 1.266834585104157e-05, + "loss": 0.8785, + "step": 2679 + }, + { + "epoch": 0.43, + "grad_norm": 1.876629276004696, + "learning_rate": 1.2663315273706563e-05, + "loss": 0.8788, + "step": 2680 + }, + { + "epoch": 0.43, + "grad_norm": 1.5086605628393848, + "learning_rate": 1.2658283970811058e-05, + "loss": 0.8967, + "step": 2681 + }, + { + "epoch": 0.43, + "grad_norm": 1.403250572420167, + "learning_rate": 1.2653251943725726e-05, + "loss": 0.8932, + "step": 2682 + }, + { + "epoch": 0.43, + "grad_norm": 1.4400827170982606, + "learning_rate": 1.2648219193821425e-05, + "loss": 0.949, + "step": 2683 + }, + { + "epoch": 0.43, + "grad_norm": 1.2243750905546347, + "learning_rate": 1.264318572246922e-05, + "loss": 0.8674, + "step": 2684 + }, + { + "epoch": 0.43, + "grad_norm": 1.3373017818195951, + "learning_rate": 1.2638151531040359e-05, + "loss": 0.9306, + "step": 2685 + }, + { + "epoch": 0.43, + "grad_norm": 1.416649525260536, + "learning_rate": 1.2633116620906306e-05, + "loss": 0.91, + "step": 2686 + }, + { + "epoch": 0.43, + "grad_norm": 1.2881412103809202, + "learning_rate": 1.2628080993438698e-05, + "loss": 0.8594, + "step": 2687 + }, + { + "epoch": 0.43, + "grad_norm": 1.4876763968249898, + "learning_rate": 1.2623044650009387e-05, + "loss": 0.8607, + "step": 2688 + }, + { + "epoch": 0.43, + "grad_norm": 1.5588882258928356, + "learning_rate": 1.261800759199041e-05, + "loss": 0.8418, + "step": 2689 + }, + { + "epoch": 0.43, + "grad_norm": 1.3516225980616319, + "learning_rate": 1.2612969820753995e-05, + "loss": 0.8497, + "step": 2690 + }, + { + "epoch": 0.43, + "grad_norm": 1.8270304329310074, + "learning_rate": 1.2607931337672576e-05, + "loss": 0.8566, + "step": 2691 + }, + { + "epoch": 0.43, + "grad_norm": 1.5207189984133747, + "learning_rate": 1.2602892144118772e-05, + "loss": 0.8888, + "step": 2692 + }, + { + "epoch": 0.43, + "grad_norm": 1.3797761311635153, + "learning_rate": 1.25978522414654e-05, + "loss": 0.8888, + "step": 2693 + }, + { + "epoch": 0.43, + "grad_norm": 1.240589306573735, + "learning_rate": 1.2592811631085464e-05, + "loss": 0.8878, + "step": 2694 + }, + { + "epoch": 0.43, + "grad_norm": 1.3866993012877769, + "learning_rate": 1.2587770314352175e-05, + "loss": 0.8413, + "step": 2695 + }, + { + "epoch": 0.43, + "grad_norm": 1.3706973554348998, + "learning_rate": 1.2582728292638913e-05, + "loss": 0.894, + "step": 2696 + }, + { + "epoch": 0.43, + "grad_norm": 1.1299601180585, + "learning_rate": 1.2577685567319275e-05, + "loss": 0.9091, + "step": 2697 + }, + { + "epoch": 0.43, + "grad_norm": 1.218787378416545, + "learning_rate": 1.2572642139767033e-05, + "loss": 0.9018, + "step": 2698 + }, + { + "epoch": 0.43, + "grad_norm": 1.4909468981019491, + "learning_rate": 1.2567598011356155e-05, + "loss": 0.8873, + "step": 2699 + }, + { + "epoch": 0.44, + "grad_norm": 1.2850144349352668, + "learning_rate": 1.2562553183460806e-05, + "loss": 0.9555, + "step": 2700 + }, + { + "epoch": 0.44, + "grad_norm": 1.7531809528452016, + "learning_rate": 1.2557507657455327e-05, + "loss": 0.8853, + "step": 2701 + }, + { + "epoch": 0.44, + "grad_norm": 1.453046608163963, + "learning_rate": 1.2552461434714272e-05, + "loss": 0.8997, + "step": 2702 + }, + { + "epoch": 0.44, + "grad_norm": 1.3922361506418268, + "learning_rate": 1.2547414516612357e-05, + "loss": 0.9205, + "step": 2703 + }, + { + "epoch": 0.44, + "grad_norm": 1.2445651435008374, + "learning_rate": 1.254236690452451e-05, + "loss": 0.947, + "step": 2704 + }, + { + "epoch": 0.44, + "grad_norm": 1.815467976170979, + "learning_rate": 1.2537318599825836e-05, + "loss": 0.916, + "step": 2705 + }, + { + "epoch": 0.44, + "grad_norm": 1.3661757724372827, + "learning_rate": 1.2532269603891639e-05, + "loss": 0.9325, + "step": 2706 + }, + { + "epoch": 0.44, + "grad_norm": 1.2308492163139353, + "learning_rate": 1.2527219918097392e-05, + "loss": 0.8579, + "step": 2707 + }, + { + "epoch": 0.44, + "grad_norm": 1.4671141113770734, + "learning_rate": 1.2522169543818781e-05, + "loss": 0.9104, + "step": 2708 + }, + { + "epoch": 0.44, + "grad_norm": 1.4326169950749035, + "learning_rate": 1.251711848243166e-05, + "loss": 0.8886, + "step": 2709 + }, + { + "epoch": 0.44, + "grad_norm": 1.4063801135692475, + "learning_rate": 1.251206673531208e-05, + "loss": 0.9006, + "step": 2710 + }, + { + "epoch": 0.44, + "grad_norm": 1.566053466120827, + "learning_rate": 1.2507014303836277e-05, + "loss": 0.8439, + "step": 2711 + }, + { + "epoch": 0.44, + "grad_norm": 1.3609889194834883, + "learning_rate": 1.250196118938067e-05, + "loss": 0.8834, + "step": 2712 + }, + { + "epoch": 0.44, + "grad_norm": 1.4800275343934521, + "learning_rate": 1.2496907393321868e-05, + "loss": 0.8553, + "step": 2713 + }, + { + "epoch": 0.44, + "grad_norm": 1.5141479770941888, + "learning_rate": 1.2491852917036666e-05, + "loss": 0.8835, + "step": 2714 + }, + { + "epoch": 0.44, + "grad_norm": 1.483565736694982, + "learning_rate": 1.2486797761902039e-05, + "loss": 0.7845, + "step": 2715 + }, + { + "epoch": 0.44, + "grad_norm": 1.467990221231459, + "learning_rate": 1.2481741929295154e-05, + "loss": 0.9628, + "step": 2716 + }, + { + "epoch": 0.44, + "grad_norm": 1.268176929238814, + "learning_rate": 1.2476685420593357e-05, + "loss": 0.9058, + "step": 2717 + }, + { + "epoch": 0.44, + "grad_norm": 1.420534497611871, + "learning_rate": 1.2471628237174183e-05, + "loss": 0.8494, + "step": 2718 + }, + { + "epoch": 0.44, + "grad_norm": 1.4818996343853743, + "learning_rate": 1.2466570380415346e-05, + "loss": 0.845, + "step": 2719 + }, + { + "epoch": 0.44, + "grad_norm": 1.3036555217723786, + "learning_rate": 1.2461511851694743e-05, + "loss": 0.8744, + "step": 2720 + }, + { + "epoch": 0.44, + "grad_norm": 1.34724370612865, + "learning_rate": 1.2456452652390463e-05, + "loss": 0.949, + "step": 2721 + }, + { + "epoch": 0.44, + "grad_norm": 1.326086301388255, + "learning_rate": 1.2451392783880767e-05, + "loss": 0.8408, + "step": 2722 + }, + { + "epoch": 0.44, + "grad_norm": 1.3848436942913445, + "learning_rate": 1.2446332247544102e-05, + "loss": 0.866, + "step": 2723 + }, + { + "epoch": 0.44, + "grad_norm": 1.3696698310164106, + "learning_rate": 1.2441271044759103e-05, + "loss": 0.9182, + "step": 2724 + }, + { + "epoch": 0.44, + "grad_norm": 0.8835694625073143, + "learning_rate": 1.2436209176904575e-05, + "loss": 0.3511, + "step": 2725 + }, + { + "epoch": 0.44, + "grad_norm": 1.4675972909617614, + "learning_rate": 1.2431146645359511e-05, + "loss": 0.8711, + "step": 2726 + }, + { + "epoch": 0.44, + "grad_norm": 1.514163643108094, + "learning_rate": 1.2426083451503086e-05, + "loss": 0.8261, + "step": 2727 + }, + { + "epoch": 0.44, + "grad_norm": 1.546920932824742, + "learning_rate": 1.2421019596714656e-05, + "loss": 0.8742, + "step": 2728 + }, + { + "epoch": 0.44, + "grad_norm": 1.3920859996109693, + "learning_rate": 1.2415955082373752e-05, + "loss": 0.8765, + "step": 2729 + }, + { + "epoch": 0.44, + "grad_norm": 1.5382782046930634, + "learning_rate": 1.2410889909860086e-05, + "loss": 0.8548, + "step": 2730 + }, + { + "epoch": 0.44, + "grad_norm": 1.2657100721012842, + "learning_rate": 1.2405824080553553e-05, + "loss": 0.7863, + "step": 2731 + }, + { + "epoch": 0.44, + "grad_norm": 1.474252277992671, + "learning_rate": 1.2400757595834221e-05, + "loss": 0.842, + "step": 2732 + }, + { + "epoch": 0.44, + "grad_norm": 1.3477498049673966, + "learning_rate": 1.2395690457082348e-05, + "loss": 0.8355, + "step": 2733 + }, + { + "epoch": 0.44, + "grad_norm": 1.5706843177942593, + "learning_rate": 1.239062266567835e-05, + "loss": 0.7975, + "step": 2734 + }, + { + "epoch": 0.44, + "grad_norm": 1.48539112785759, + "learning_rate": 1.2385554223002844e-05, + "loss": 0.9006, + "step": 2735 + }, + { + "epoch": 0.44, + "grad_norm": 1.7228948985765387, + "learning_rate": 1.238048513043661e-05, + "loss": 0.8607, + "step": 2736 + }, + { + "epoch": 0.44, + "grad_norm": 1.257356729550077, + "learning_rate": 1.2375415389360605e-05, + "loss": 0.8735, + "step": 2737 + }, + { + "epoch": 0.44, + "grad_norm": 1.207074651431655, + "learning_rate": 1.2370345001155972e-05, + "loss": 0.877, + "step": 2738 + }, + { + "epoch": 0.44, + "grad_norm": 1.3810815457071217, + "learning_rate": 1.2365273967204018e-05, + "loss": 0.8835, + "step": 2739 + }, + { + "epoch": 0.44, + "grad_norm": 1.4066106876767055, + "learning_rate": 1.2360202288886243e-05, + "loss": 0.8312, + "step": 2740 + }, + { + "epoch": 0.44, + "grad_norm": 1.4209303540974425, + "learning_rate": 1.23551299675843e-05, + "loss": 0.8382, + "step": 2741 + }, + { + "epoch": 0.44, + "grad_norm": 1.408524447485463, + "learning_rate": 1.2350057004680036e-05, + "loss": 0.9103, + "step": 2742 + }, + { + "epoch": 0.44, + "grad_norm": 1.6038285773618228, + "learning_rate": 1.2344983401555464e-05, + "loss": 0.9332, + "step": 2743 + }, + { + "epoch": 0.44, + "grad_norm": 1.6167742067468192, + "learning_rate": 1.2339909159592774e-05, + "loss": 0.8909, + "step": 2744 + }, + { + "epoch": 0.44, + "grad_norm": 1.4133871135843694, + "learning_rate": 1.2334834280174325e-05, + "loss": 0.8257, + "step": 2745 + }, + { + "epoch": 0.44, + "grad_norm": 1.499130896900979, + "learning_rate": 1.2329758764682663e-05, + "loss": 0.8813, + "step": 2746 + }, + { + "epoch": 0.44, + "grad_norm": 1.2326498160881572, + "learning_rate": 1.2324682614500492e-05, + "loss": 0.8458, + "step": 2747 + }, + { + "epoch": 0.44, + "grad_norm": 1.4242312451974273, + "learning_rate": 1.2319605831010694e-05, + "loss": 0.8885, + "step": 2748 + }, + { + "epoch": 0.44, + "grad_norm": 1.408793049059259, + "learning_rate": 1.231452841559633e-05, + "loss": 0.8716, + "step": 2749 + }, + { + "epoch": 0.44, + "grad_norm": 1.455639623306039, + "learning_rate": 1.2309450369640622e-05, + "loss": 0.8793, + "step": 2750 + }, + { + "epoch": 0.44, + "grad_norm": 1.4140109090983228, + "learning_rate": 1.230437169452698e-05, + "loss": 0.9245, + "step": 2751 + }, + { + "epoch": 0.44, + "grad_norm": 1.4044579193397113, + "learning_rate": 1.2299292391638962e-05, + "loss": 0.8144, + "step": 2752 + }, + { + "epoch": 0.44, + "grad_norm": 1.444665904007464, + "learning_rate": 1.2294212462360318e-05, + "loss": 0.8437, + "step": 2753 + }, + { + "epoch": 0.44, + "grad_norm": 1.3885806036247759, + "learning_rate": 1.2289131908074958e-05, + "loss": 0.8473, + "step": 2754 + }, + { + "epoch": 0.44, + "grad_norm": 1.1797751668856538, + "learning_rate": 1.2284050730166968e-05, + "loss": 0.8338, + "step": 2755 + }, + { + "epoch": 0.44, + "grad_norm": 1.4853253296410984, + "learning_rate": 1.2278968930020597e-05, + "loss": 0.8942, + "step": 2756 + }, + { + "epoch": 0.44, + "grad_norm": 1.5549913617412394, + "learning_rate": 1.227388650902027e-05, + "loss": 0.8534, + "step": 2757 + }, + { + "epoch": 0.44, + "grad_norm": 1.4667515157195492, + "learning_rate": 1.2268803468550576e-05, + "loss": 0.9162, + "step": 2758 + }, + { + "epoch": 0.44, + "grad_norm": 1.307285162699302, + "learning_rate": 1.226371980999628e-05, + "loss": 0.8589, + "step": 2759 + }, + { + "epoch": 0.44, + "grad_norm": 1.2235047744533885, + "learning_rate": 1.2258635534742307e-05, + "loss": 0.8867, + "step": 2760 + }, + { + "epoch": 0.44, + "grad_norm": 1.4302356823990086, + "learning_rate": 1.2253550644173753e-05, + "loss": 0.9036, + "step": 2761 + }, + { + "epoch": 0.45, + "grad_norm": 1.3079626262402047, + "learning_rate": 1.2248465139675886e-05, + "loss": 0.8023, + "step": 2762 + }, + { + "epoch": 0.45, + "grad_norm": 1.5588515332610038, + "learning_rate": 1.2243379022634136e-05, + "loss": 0.8995, + "step": 2763 + }, + { + "epoch": 0.45, + "grad_norm": 1.40071126527148, + "learning_rate": 1.22382922944341e-05, + "loss": 0.8275, + "step": 2764 + }, + { + "epoch": 0.45, + "grad_norm": 1.883921169199151, + "learning_rate": 1.2233204956461545e-05, + "loss": 0.8628, + "step": 2765 + }, + { + "epoch": 0.45, + "grad_norm": 1.366785348602125, + "learning_rate": 1.22281170101024e-05, + "loss": 0.8359, + "step": 2766 + }, + { + "epoch": 0.45, + "grad_norm": 1.4794186690291042, + "learning_rate": 1.2223028456742762e-05, + "loss": 0.8399, + "step": 2767 + }, + { + "epoch": 0.45, + "grad_norm": 1.2888013828125635, + "learning_rate": 1.2217939297768898e-05, + "loss": 0.3306, + "step": 2768 + }, + { + "epoch": 0.45, + "grad_norm": 1.584539053808563, + "learning_rate": 1.221284953456723e-05, + "loss": 0.7839, + "step": 2769 + }, + { + "epoch": 0.45, + "grad_norm": 0.8881681733473118, + "learning_rate": 1.2207759168524352e-05, + "loss": 0.3687, + "step": 2770 + }, + { + "epoch": 0.45, + "grad_norm": 1.354175990367544, + "learning_rate": 1.2202668201027016e-05, + "loss": 0.8596, + "step": 2771 + }, + { + "epoch": 0.45, + "grad_norm": 1.3194218486878357, + "learning_rate": 1.2197576633462147e-05, + "loss": 0.8138, + "step": 2772 + }, + { + "epoch": 0.45, + "grad_norm": 1.3180311779953826, + "learning_rate": 1.2192484467216828e-05, + "loss": 0.9242, + "step": 2773 + }, + { + "epoch": 0.45, + "grad_norm": 1.4526055706114724, + "learning_rate": 1.2187391703678301e-05, + "loss": 0.8586, + "step": 2774 + }, + { + "epoch": 0.45, + "grad_norm": 0.7478634320520393, + "learning_rate": 1.2182298344233981e-05, + "loss": 0.3684, + "step": 2775 + }, + { + "epoch": 0.45, + "grad_norm": 1.5846749574800176, + "learning_rate": 1.2177204390271434e-05, + "loss": 0.8321, + "step": 2776 + }, + { + "epoch": 0.45, + "grad_norm": 1.3337479796613283, + "learning_rate": 1.2172109843178396e-05, + "loss": 0.8608, + "step": 2777 + }, + { + "epoch": 0.45, + "grad_norm": 1.4888490935247887, + "learning_rate": 1.2167014704342763e-05, + "loss": 0.861, + "step": 2778 + }, + { + "epoch": 0.45, + "grad_norm": 1.3817411599158573, + "learning_rate": 1.2161918975152593e-05, + "loss": 0.8004, + "step": 2779 + }, + { + "epoch": 0.45, + "grad_norm": 1.3615329573280277, + "learning_rate": 1.2156822656996098e-05, + "loss": 0.8266, + "step": 2780 + }, + { + "epoch": 0.45, + "grad_norm": 1.3344758620295176, + "learning_rate": 1.2151725751261659e-05, + "loss": 0.8528, + "step": 2781 + }, + { + "epoch": 0.45, + "grad_norm": 1.4694329294799107, + "learning_rate": 1.2146628259337814e-05, + "loss": 0.8931, + "step": 2782 + }, + { + "epoch": 0.45, + "grad_norm": 1.4841911848345541, + "learning_rate": 1.2141530182613255e-05, + "loss": 0.8026, + "step": 2783 + }, + { + "epoch": 0.45, + "grad_norm": 1.2637370164993806, + "learning_rate": 1.2136431522476847e-05, + "loss": 0.8731, + "step": 2784 + }, + { + "epoch": 0.45, + "grad_norm": 1.368588599531654, + "learning_rate": 1.2131332280317598e-05, + "loss": 0.8911, + "step": 2785 + }, + { + "epoch": 0.45, + "grad_norm": 1.359812150332946, + "learning_rate": 1.2126232457524686e-05, + "loss": 0.9365, + "step": 2786 + }, + { + "epoch": 0.45, + "grad_norm": 0.7891008998006404, + "learning_rate": 1.2121132055487442e-05, + "loss": 0.3736, + "step": 2787 + }, + { + "epoch": 0.45, + "grad_norm": 1.3053082435655867, + "learning_rate": 1.2116031075595356e-05, + "loss": 0.8867, + "step": 2788 + }, + { + "epoch": 0.45, + "grad_norm": 1.3282828366933468, + "learning_rate": 1.2110929519238077e-05, + "loss": 0.8157, + "step": 2789 + }, + { + "epoch": 0.45, + "grad_norm": 1.2593385208966215, + "learning_rate": 1.2105827387805413e-05, + "loss": 0.8379, + "step": 2790 + }, + { + "epoch": 0.45, + "grad_norm": 0.7885035167213441, + "learning_rate": 1.2100724682687317e-05, + "loss": 0.3385, + "step": 2791 + }, + { + "epoch": 0.45, + "grad_norm": 1.3351968701184085, + "learning_rate": 1.2095621405273912e-05, + "loss": 0.7924, + "step": 2792 + }, + { + "epoch": 0.45, + "grad_norm": 1.4256215528645497, + "learning_rate": 1.209051755695547e-05, + "loss": 0.8995, + "step": 2793 + }, + { + "epoch": 0.45, + "grad_norm": 1.3211608988087316, + "learning_rate": 1.2085413139122417e-05, + "loss": 0.8325, + "step": 2794 + }, + { + "epoch": 0.45, + "grad_norm": 1.890842614378777, + "learning_rate": 1.2080308153165345e-05, + "loss": 0.9104, + "step": 2795 + }, + { + "epoch": 0.45, + "grad_norm": 1.4438560525587463, + "learning_rate": 1.2075202600474985e-05, + "loss": 0.833, + "step": 2796 + }, + { + "epoch": 0.45, + "grad_norm": 1.4223889251345907, + "learning_rate": 1.2070096482442235e-05, + "loss": 0.9033, + "step": 2797 + }, + { + "epoch": 0.45, + "grad_norm": 1.2045743103943218, + "learning_rate": 1.2064989800458138e-05, + "loss": 0.8629, + "step": 2798 + }, + { + "epoch": 0.45, + "grad_norm": 1.2210693271663275, + "learning_rate": 1.20598825559139e-05, + "loss": 0.8976, + "step": 2799 + }, + { + "epoch": 0.45, + "grad_norm": 0.988542389387899, + "learning_rate": 1.205477475020087e-05, + "loss": 0.3307, + "step": 2800 + }, + { + "epoch": 0.45, + "grad_norm": 1.3625822133231007, + "learning_rate": 1.2049666384710563e-05, + "loss": 0.8764, + "step": 2801 + }, + { + "epoch": 0.45, + "grad_norm": 1.297299460670785, + "learning_rate": 1.204455746083463e-05, + "loss": 0.855, + "step": 2802 + }, + { + "epoch": 0.45, + "grad_norm": 1.7781711622658598, + "learning_rate": 1.2039447979964887e-05, + "loss": 0.8376, + "step": 2803 + }, + { + "epoch": 0.45, + "grad_norm": 1.5087258501606984, + "learning_rate": 1.2034337943493297e-05, + "loss": 0.8322, + "step": 2804 + }, + { + "epoch": 0.45, + "grad_norm": 1.3860452436860717, + "learning_rate": 1.202922735281197e-05, + "loss": 0.8896, + "step": 2805 + }, + { + "epoch": 0.45, + "grad_norm": 1.3409551752701763, + "learning_rate": 1.202411620931318e-05, + "loss": 0.8719, + "step": 2806 + }, + { + "epoch": 0.45, + "grad_norm": 1.5286285808560125, + "learning_rate": 1.2019004514389338e-05, + "loss": 0.9403, + "step": 2807 + }, + { + "epoch": 0.45, + "grad_norm": 1.3945856784969757, + "learning_rate": 1.2013892269433017e-05, + "loss": 0.843, + "step": 2808 + }, + { + "epoch": 0.45, + "grad_norm": 1.261472924616925, + "learning_rate": 1.2008779475836923e-05, + "loss": 0.8935, + "step": 2809 + }, + { + "epoch": 0.45, + "grad_norm": 0.8824725498730404, + "learning_rate": 1.2003666134993928e-05, + "loss": 0.3443, + "step": 2810 + }, + { + "epoch": 0.45, + "grad_norm": 1.4178017427283562, + "learning_rate": 1.199855224829705e-05, + "loss": 0.8828, + "step": 2811 + }, + { + "epoch": 0.45, + "grad_norm": 1.3129133658495373, + "learning_rate": 1.1993437817139446e-05, + "loss": 0.8583, + "step": 2812 + }, + { + "epoch": 0.45, + "grad_norm": 0.7714852622787589, + "learning_rate": 1.1988322842914431e-05, + "loss": 0.3175, + "step": 2813 + }, + { + "epoch": 0.45, + "grad_norm": 1.4551300257188933, + "learning_rate": 1.1983207327015465e-05, + "loss": 0.8628, + "step": 2814 + }, + { + "epoch": 0.45, + "grad_norm": 0.8596732356313029, + "learning_rate": 1.197809127083616e-05, + "loss": 0.3527, + "step": 2815 + }, + { + "epoch": 0.45, + "grad_norm": 1.3962911552192607, + "learning_rate": 1.1972974675770259e-05, + "loss": 0.845, + "step": 2816 + }, + { + "epoch": 0.45, + "grad_norm": 1.355512136677426, + "learning_rate": 1.1967857543211679e-05, + "loss": 0.8325, + "step": 2817 + }, + { + "epoch": 0.45, + "grad_norm": 1.3582930191457832, + "learning_rate": 1.1962739874554452e-05, + "loss": 0.8088, + "step": 2818 + }, + { + "epoch": 0.45, + "grad_norm": 1.3228360555774272, + "learning_rate": 1.1957621671192785e-05, + "loss": 0.7442, + "step": 2819 + }, + { + "epoch": 0.45, + "grad_norm": 1.4636040082620687, + "learning_rate": 1.195250293452101e-05, + "loss": 0.8396, + "step": 2820 + }, + { + "epoch": 0.45, + "grad_norm": 1.3374091590743906, + "learning_rate": 1.1947383665933619e-05, + "loss": 0.8866, + "step": 2821 + }, + { + "epoch": 0.45, + "grad_norm": 1.368567384824542, + "learning_rate": 1.1942263866825234e-05, + "loss": 0.8447, + "step": 2822 + }, + { + "epoch": 0.45, + "grad_norm": 1.4597333832809862, + "learning_rate": 1.1937143538590635e-05, + "loss": 0.9666, + "step": 2823 + }, + { + "epoch": 0.46, + "grad_norm": 1.5084731126107642, + "learning_rate": 1.1932022682624735e-05, + "loss": 0.88, + "step": 2824 + }, + { + "epoch": 0.46, + "grad_norm": 1.2923528531161748, + "learning_rate": 1.1926901300322601e-05, + "loss": 0.9191, + "step": 2825 + }, + { + "epoch": 0.46, + "grad_norm": 1.366419643305925, + "learning_rate": 1.1921779393079438e-05, + "loss": 0.8654, + "step": 2826 + }, + { + "epoch": 0.46, + "grad_norm": 1.2065926290347666, + "learning_rate": 1.1916656962290594e-05, + "loss": 0.8036, + "step": 2827 + }, + { + "epoch": 0.46, + "grad_norm": 1.450801029286185, + "learning_rate": 1.1911534009351561e-05, + "loss": 0.8347, + "step": 2828 + }, + { + "epoch": 0.46, + "grad_norm": 1.3182389218440302, + "learning_rate": 1.190641053565797e-05, + "loss": 0.8147, + "step": 2829 + }, + { + "epoch": 0.46, + "grad_norm": 1.4333985076189057, + "learning_rate": 1.1901286542605604e-05, + "loss": 0.9358, + "step": 2830 + }, + { + "epoch": 0.46, + "grad_norm": 1.41093501648415, + "learning_rate": 1.1896162031590367e-05, + "loss": 0.8542, + "step": 2831 + }, + { + "epoch": 0.46, + "grad_norm": 1.408105363375565, + "learning_rate": 1.189103700400833e-05, + "loss": 0.9133, + "step": 2832 + }, + { + "epoch": 0.46, + "grad_norm": 1.505468807705461, + "learning_rate": 1.1885911461255685e-05, + "loss": 0.8975, + "step": 2833 + }, + { + "epoch": 0.46, + "grad_norm": 0.8165674962965024, + "learning_rate": 1.1880785404728773e-05, + "loss": 0.3573, + "step": 2834 + }, + { + "epoch": 0.46, + "grad_norm": 1.3166676524944276, + "learning_rate": 1.1875658835824071e-05, + "loss": 0.8793, + "step": 2835 + }, + { + "epoch": 0.46, + "grad_norm": 1.3976809040627696, + "learning_rate": 1.1870531755938202e-05, + "loss": 0.9186, + "step": 2836 + }, + { + "epoch": 0.46, + "grad_norm": 0.7698839741982167, + "learning_rate": 1.1865404166467918e-05, + "loss": 0.3131, + "step": 2837 + }, + { + "epoch": 0.46, + "grad_norm": 1.3597761677588358, + "learning_rate": 1.1860276068810119e-05, + "loss": 0.8338, + "step": 2838 + }, + { + "epoch": 0.46, + "grad_norm": 1.2666371332437028, + "learning_rate": 1.1855147464361845e-05, + "loss": 0.8526, + "step": 2839 + }, + { + "epoch": 0.46, + "grad_norm": 1.3296651941438953, + "learning_rate": 1.1850018354520256e-05, + "loss": 0.8487, + "step": 2840 + }, + { + "epoch": 0.46, + "grad_norm": 1.2792481340339643, + "learning_rate": 1.1844888740682678e-05, + "loss": 0.871, + "step": 2841 + }, + { + "epoch": 0.46, + "grad_norm": 1.310151728484075, + "learning_rate": 1.1839758624246551e-05, + "loss": 0.9137, + "step": 2842 + }, + { + "epoch": 0.46, + "grad_norm": 1.3247685553482373, + "learning_rate": 1.183462800660946e-05, + "loss": 0.8951, + "step": 2843 + }, + { + "epoch": 0.46, + "grad_norm": 0.997448782121293, + "learning_rate": 1.182949688916913e-05, + "loss": 0.3656, + "step": 2844 + }, + { + "epoch": 0.46, + "grad_norm": 1.2133007811142893, + "learning_rate": 1.1824365273323414e-05, + "loss": 0.8791, + "step": 2845 + }, + { + "epoch": 0.46, + "grad_norm": 1.260885700668015, + "learning_rate": 1.1819233160470311e-05, + "loss": 0.9284, + "step": 2846 + }, + { + "epoch": 0.46, + "grad_norm": 1.3161530895062066, + "learning_rate": 1.1814100552007947e-05, + "loss": 0.8815, + "step": 2847 + }, + { + "epoch": 0.46, + "grad_norm": 1.5194529645606776, + "learning_rate": 1.1808967449334588e-05, + "loss": 0.8993, + "step": 2848 + }, + { + "epoch": 0.46, + "grad_norm": 1.4777492410263449, + "learning_rate": 1.1803833853848627e-05, + "loss": 0.8779, + "step": 2849 + }, + { + "epoch": 0.46, + "grad_norm": 1.4856803186622674, + "learning_rate": 1.1798699766948606e-05, + "loss": 0.9052, + "step": 2850 + }, + { + "epoch": 0.46, + "grad_norm": 1.6142362736247533, + "learning_rate": 1.1793565190033188e-05, + "loss": 0.8999, + "step": 2851 + }, + { + "epoch": 0.46, + "grad_norm": 1.2031496141442488, + "learning_rate": 1.1788430124501167e-05, + "loss": 0.8131, + "step": 2852 + }, + { + "epoch": 0.46, + "grad_norm": 1.4405244382309563, + "learning_rate": 1.1783294571751484e-05, + "loss": 0.8967, + "step": 2853 + }, + { + "epoch": 0.46, + "grad_norm": 1.2509649917936982, + "learning_rate": 1.1778158533183203e-05, + "loss": 0.8103, + "step": 2854 + }, + { + "epoch": 0.46, + "grad_norm": 0.9366071601971184, + "learning_rate": 1.1773022010195525e-05, + "loss": 0.3502, + "step": 2855 + }, + { + "epoch": 0.46, + "grad_norm": 1.4953320471563223, + "learning_rate": 1.1767885004187773e-05, + "loss": 0.972, + "step": 2856 + }, + { + "epoch": 0.46, + "grad_norm": 1.2945888171521325, + "learning_rate": 1.1762747516559418e-05, + "loss": 0.8736, + "step": 2857 + }, + { + "epoch": 0.46, + "grad_norm": 1.5723759488085398, + "learning_rate": 1.1757609548710048e-05, + "loss": 0.8158, + "step": 2858 + }, + { + "epoch": 0.46, + "grad_norm": 1.371234215499491, + "learning_rate": 1.1752471102039385e-05, + "loss": 0.8133, + "step": 2859 + }, + { + "epoch": 0.46, + "grad_norm": 1.2716717003390021, + "learning_rate": 1.1747332177947288e-05, + "loss": 0.8464, + "step": 2860 + }, + { + "epoch": 0.46, + "grad_norm": 1.4914683723700202, + "learning_rate": 1.1742192777833746e-05, + "loss": 0.8277, + "step": 2861 + }, + { + "epoch": 0.46, + "grad_norm": 1.6590621473272844, + "learning_rate": 1.173705290309886e-05, + "loss": 0.9288, + "step": 2862 + }, + { + "epoch": 0.46, + "grad_norm": 1.2368147097401632, + "learning_rate": 1.1731912555142883e-05, + "loss": 0.8414, + "step": 2863 + }, + { + "epoch": 0.46, + "grad_norm": 0.7567428039344742, + "learning_rate": 1.1726771735366186e-05, + "loss": 0.3492, + "step": 2864 + }, + { + "epoch": 0.46, + "grad_norm": 1.4984309774600924, + "learning_rate": 1.172163044516927e-05, + "loss": 0.8697, + "step": 2865 + }, + { + "epoch": 0.46, + "grad_norm": 1.7209098206817808, + "learning_rate": 1.1716488685952765e-05, + "loss": 0.801, + "step": 2866 + }, + { + "epoch": 0.46, + "grad_norm": 1.2571964336013137, + "learning_rate": 1.1711346459117423e-05, + "loss": 0.8538, + "step": 2867 + }, + { + "epoch": 0.46, + "grad_norm": 1.3619064877090303, + "learning_rate": 1.1706203766064137e-05, + "loss": 0.8653, + "step": 2868 + }, + { + "epoch": 0.46, + "grad_norm": 1.3813489568063029, + "learning_rate": 1.1701060608193911e-05, + "loss": 0.8036, + "step": 2869 + }, + { + "epoch": 0.46, + "grad_norm": 1.3611597609702055, + "learning_rate": 1.1695916986907882e-05, + "loss": 0.8683, + "step": 2870 + }, + { + "epoch": 0.46, + "grad_norm": 1.3006412427892529, + "learning_rate": 1.1690772903607321e-05, + "loss": 0.9069, + "step": 2871 + }, + { + "epoch": 0.46, + "grad_norm": 1.4821328935671958, + "learning_rate": 1.1685628359693617e-05, + "loss": 0.8532, + "step": 2872 + }, + { + "epoch": 0.46, + "grad_norm": 1.3597263010366636, + "learning_rate": 1.168048335656828e-05, + "loss": 0.8981, + "step": 2873 + }, + { + "epoch": 0.46, + "grad_norm": 1.5102332009229082, + "learning_rate": 1.1675337895632953e-05, + "loss": 0.8841, + "step": 2874 + }, + { + "epoch": 0.46, + "grad_norm": 1.4767862117676795, + "learning_rate": 1.1670191978289402e-05, + "loss": 0.7675, + "step": 2875 + }, + { + "epoch": 0.46, + "grad_norm": 1.3012494652866793, + "learning_rate": 1.166504560593952e-05, + "loss": 0.8775, + "step": 2876 + }, + { + "epoch": 0.46, + "grad_norm": 1.2737512871081325, + "learning_rate": 1.1659898779985317e-05, + "loss": 0.8733, + "step": 2877 + }, + { + "epoch": 0.46, + "grad_norm": 1.7411447706075702, + "learning_rate": 1.1654751501828927e-05, + "loss": 0.8581, + "step": 2878 + }, + { + "epoch": 0.46, + "grad_norm": 1.2367130907312387, + "learning_rate": 1.164960377287262e-05, + "loss": 0.7849, + "step": 2879 + }, + { + "epoch": 0.46, + "grad_norm": 1.0886292963218578, + "learning_rate": 1.164445559451877e-05, + "loss": 0.8253, + "step": 2880 + }, + { + "epoch": 0.46, + "grad_norm": 1.420603886123083, + "learning_rate": 1.1639306968169887e-05, + "loss": 0.9012, + "step": 2881 + }, + { + "epoch": 0.46, + "grad_norm": 1.323879612056229, + "learning_rate": 1.1634157895228599e-05, + "loss": 0.9256, + "step": 2882 + }, + { + "epoch": 0.46, + "grad_norm": 1.4298347211039335, + "learning_rate": 1.1629008377097655e-05, + "loss": 0.8795, + "step": 2883 + }, + { + "epoch": 0.46, + "grad_norm": 1.524826035680562, + "learning_rate": 1.1623858415179925e-05, + "loss": 0.9387, + "step": 2884 + }, + { + "epoch": 0.46, + "grad_norm": 1.5203140211244757, + "learning_rate": 1.16187080108784e-05, + "loss": 0.8467, + "step": 2885 + }, + { + "epoch": 0.46, + "grad_norm": 1.440477101466983, + "learning_rate": 1.1613557165596192e-05, + "loss": 0.8511, + "step": 2886 + }, + { + "epoch": 0.47, + "grad_norm": 1.1502314657429593, + "learning_rate": 1.1608405880736535e-05, + "loss": 0.847, + "step": 2887 + }, + { + "epoch": 0.47, + "grad_norm": 1.5765633492803035, + "learning_rate": 1.1603254157702782e-05, + "loss": 0.751, + "step": 2888 + }, + { + "epoch": 0.47, + "grad_norm": 0.8804045822620761, + "learning_rate": 1.1598101997898398e-05, + "loss": 0.3381, + "step": 2889 + }, + { + "epoch": 0.47, + "grad_norm": 1.3894535364982075, + "learning_rate": 1.1592949402726983e-05, + "loss": 0.9192, + "step": 2890 + }, + { + "epoch": 0.47, + "grad_norm": 1.2866985029933335, + "learning_rate": 1.1587796373592237e-05, + "loss": 0.7986, + "step": 2891 + }, + { + "epoch": 0.47, + "grad_norm": 1.276677159318935, + "learning_rate": 1.1582642911897991e-05, + "loss": 0.7888, + "step": 2892 + }, + { + "epoch": 0.47, + "grad_norm": 1.2928498871123901, + "learning_rate": 1.1577489019048191e-05, + "loss": 0.931, + "step": 2893 + }, + { + "epoch": 0.47, + "grad_norm": 1.583216413982874, + "learning_rate": 1.15723346964469e-05, + "loss": 0.8236, + "step": 2894 + }, + { + "epoch": 0.47, + "grad_norm": 1.347128758995366, + "learning_rate": 1.1567179945498297e-05, + "loss": 0.8581, + "step": 2895 + }, + { + "epoch": 0.47, + "grad_norm": 0.9166941366586149, + "learning_rate": 1.1562024767606674e-05, + "loss": 0.3567, + "step": 2896 + }, + { + "epoch": 0.47, + "grad_norm": 1.3206274084508376, + "learning_rate": 1.1556869164176447e-05, + "loss": 0.904, + "step": 2897 + }, + { + "epoch": 0.47, + "grad_norm": 1.5217995589546103, + "learning_rate": 1.1551713136612146e-05, + "loss": 0.9046, + "step": 2898 + }, + { + "epoch": 0.47, + "grad_norm": 1.1762235738597384, + "learning_rate": 1.1546556686318414e-05, + "loss": 0.8637, + "step": 2899 + }, + { + "epoch": 0.47, + "grad_norm": 1.452796972715515, + "learning_rate": 1.1541399814700006e-05, + "loss": 0.9107, + "step": 2900 + }, + { + "epoch": 0.47, + "grad_norm": 1.2723111930384068, + "learning_rate": 1.1536242523161802e-05, + "loss": 0.8783, + "step": 2901 + }, + { + "epoch": 0.47, + "grad_norm": 1.6607513741072362, + "learning_rate": 1.1531084813108788e-05, + "loss": 0.8718, + "step": 2902 + }, + { + "epoch": 0.47, + "grad_norm": 0.7620576453091051, + "learning_rate": 1.1525926685946067e-05, + "loss": 0.3074, + "step": 2903 + }, + { + "epoch": 0.47, + "grad_norm": 1.2063257040316104, + "learning_rate": 1.1520768143078853e-05, + "loss": 0.8267, + "step": 2904 + }, + { + "epoch": 0.47, + "grad_norm": 1.465755465024223, + "learning_rate": 1.1515609185912475e-05, + "loss": 0.839, + "step": 2905 + }, + { + "epoch": 0.47, + "grad_norm": 1.2918454337012537, + "learning_rate": 1.151044981585238e-05, + "loss": 0.8004, + "step": 2906 + }, + { + "epoch": 0.47, + "grad_norm": 1.305521007082058, + "learning_rate": 1.1505290034304117e-05, + "loss": 0.8529, + "step": 2907 + }, + { + "epoch": 0.47, + "grad_norm": 1.6960730395904198, + "learning_rate": 1.1500129842673358e-05, + "loss": 0.8841, + "step": 2908 + }, + { + "epoch": 0.47, + "grad_norm": 1.4101361044768252, + "learning_rate": 1.1494969242365875e-05, + "loss": 0.8296, + "step": 2909 + }, + { + "epoch": 0.47, + "grad_norm": 1.4833911822746206, + "learning_rate": 1.1489808234787565e-05, + "loss": 0.8285, + "step": 2910 + }, + { + "epoch": 0.47, + "grad_norm": 1.2565317749191307, + "learning_rate": 1.1484646821344421e-05, + "loss": 0.8008, + "step": 2911 + }, + { + "epoch": 0.47, + "grad_norm": 1.2667514264572968, + "learning_rate": 1.1479485003442564e-05, + "loss": 0.8586, + "step": 2912 + }, + { + "epoch": 0.47, + "grad_norm": 1.2456004069011246, + "learning_rate": 1.1474322782488203e-05, + "loss": 0.8173, + "step": 2913 + }, + { + "epoch": 0.47, + "grad_norm": 1.2875052640631384, + "learning_rate": 1.146916015988768e-05, + "loss": 0.7914, + "step": 2914 + }, + { + "epoch": 0.47, + "grad_norm": 1.4086299677893321, + "learning_rate": 1.1463997137047431e-05, + "loss": 0.8541, + "step": 2915 + }, + { + "epoch": 0.47, + "grad_norm": 1.5384610829695098, + "learning_rate": 1.1458833715374005e-05, + "loss": 0.8569, + "step": 2916 + }, + { + "epoch": 0.47, + "grad_norm": 1.3547685433381567, + "learning_rate": 1.1453669896274066e-05, + "loss": 0.8702, + "step": 2917 + }, + { + "epoch": 0.47, + "grad_norm": 0.8553072622370183, + "learning_rate": 1.1448505681154373e-05, + "loss": 0.3674, + "step": 2918 + }, + { + "epoch": 0.47, + "grad_norm": 1.3995818901317785, + "learning_rate": 1.1443341071421804e-05, + "loss": 0.8661, + "step": 2919 + }, + { + "epoch": 0.47, + "grad_norm": 1.7528879357256242, + "learning_rate": 1.1438176068483345e-05, + "loss": 0.8364, + "step": 2920 + }, + { + "epoch": 0.47, + "grad_norm": 1.4409132323644518, + "learning_rate": 1.1433010673746079e-05, + "loss": 0.8208, + "step": 2921 + }, + { + "epoch": 0.47, + "grad_norm": 1.2489985142145699, + "learning_rate": 1.1427844888617203e-05, + "loss": 0.8914, + "step": 2922 + }, + { + "epoch": 0.47, + "grad_norm": 1.445665078717719, + "learning_rate": 1.1422678714504022e-05, + "loss": 0.8446, + "step": 2923 + }, + { + "epoch": 0.47, + "grad_norm": 1.2702705746766858, + "learning_rate": 1.1417512152813944e-05, + "loss": 0.9024, + "step": 2924 + }, + { + "epoch": 0.47, + "grad_norm": 1.6022261279739576, + "learning_rate": 1.1412345204954477e-05, + "loss": 0.8765, + "step": 2925 + }, + { + "epoch": 0.47, + "grad_norm": 1.2438470854978412, + "learning_rate": 1.140717787233325e-05, + "loss": 0.8555, + "step": 2926 + }, + { + "epoch": 0.47, + "grad_norm": 1.5090866838503327, + "learning_rate": 1.1402010156357978e-05, + "loss": 0.9202, + "step": 2927 + }, + { + "epoch": 0.47, + "grad_norm": 1.2470917696141193, + "learning_rate": 1.1396842058436497e-05, + "loss": 0.9295, + "step": 2928 + }, + { + "epoch": 0.47, + "grad_norm": 0.8692983746689249, + "learning_rate": 1.139167357997673e-05, + "loss": 0.3746, + "step": 2929 + }, + { + "epoch": 0.47, + "grad_norm": 1.4676548941045635, + "learning_rate": 1.1386504722386719e-05, + "loss": 0.8075, + "step": 2930 + }, + { + "epoch": 0.47, + "grad_norm": 1.3342653470134176, + "learning_rate": 1.1381335487074605e-05, + "loss": 0.9185, + "step": 2931 + }, + { + "epoch": 0.47, + "grad_norm": 1.408590592325525, + "learning_rate": 1.1376165875448628e-05, + "loss": 0.8616, + "step": 2932 + }, + { + "epoch": 0.47, + "grad_norm": 1.3362973144132835, + "learning_rate": 1.1370995888917129e-05, + "loss": 0.8452, + "step": 2933 + }, + { + "epoch": 0.47, + "grad_norm": 1.3735837687091863, + "learning_rate": 1.136582552888856e-05, + "loss": 0.9264, + "step": 2934 + }, + { + "epoch": 0.47, + "grad_norm": 1.4534493015251444, + "learning_rate": 1.136065479677147e-05, + "loss": 0.9038, + "step": 2935 + }, + { + "epoch": 0.47, + "grad_norm": 1.4388074740284413, + "learning_rate": 1.1355483693974502e-05, + "loss": 0.8554, + "step": 2936 + }, + { + "epoch": 0.47, + "grad_norm": 1.438160769054234, + "learning_rate": 1.1350312221906416e-05, + "loss": 0.8509, + "step": 2937 + }, + { + "epoch": 0.47, + "grad_norm": 1.4130747747870107, + "learning_rate": 1.1345140381976054e-05, + "loss": 0.851, + "step": 2938 + }, + { + "epoch": 0.47, + "grad_norm": 1.3303468400572487, + "learning_rate": 1.1339968175592377e-05, + "loss": 0.8818, + "step": 2939 + }, + { + "epoch": 0.47, + "grad_norm": 1.2382521904204054, + "learning_rate": 1.1334795604164432e-05, + "loss": 0.8361, + "step": 2940 + }, + { + "epoch": 0.47, + "grad_norm": 0.8692931512876478, + "learning_rate": 1.132962266910137e-05, + "loss": 0.3447, + "step": 2941 + }, + { + "epoch": 0.47, + "grad_norm": 1.3881043563485997, + "learning_rate": 1.1324449371812441e-05, + "loss": 0.8167, + "step": 2942 + }, + { + "epoch": 0.47, + "grad_norm": 1.5907361310975605, + "learning_rate": 1.1319275713706997e-05, + "loss": 0.8553, + "step": 2943 + }, + { + "epoch": 0.47, + "grad_norm": 1.3510955861245566, + "learning_rate": 1.1314101696194479e-05, + "loss": 0.8525, + "step": 2944 + }, + { + "epoch": 0.47, + "grad_norm": 1.6029627081359954, + "learning_rate": 1.1308927320684438e-05, + "loss": 0.8692, + "step": 2945 + }, + { + "epoch": 0.47, + "grad_norm": 1.4410630563192721, + "learning_rate": 1.1303752588586512e-05, + "loss": 0.813, + "step": 2946 + }, + { + "epoch": 0.47, + "grad_norm": 1.619421401947289, + "learning_rate": 1.1298577501310444e-05, + "loss": 0.8304, + "step": 2947 + }, + { + "epoch": 0.47, + "grad_norm": 1.22101609719287, + "learning_rate": 1.1293402060266072e-05, + "loss": 0.8797, + "step": 2948 + }, + { + "epoch": 0.48, + "grad_norm": 1.3283268859233963, + "learning_rate": 1.128822626686332e-05, + "loss": 0.8684, + "step": 2949 + }, + { + "epoch": 0.48, + "grad_norm": 1.3459190436272004, + "learning_rate": 1.1283050122512227e-05, + "loss": 0.8993, + "step": 2950 + }, + { + "epoch": 0.48, + "grad_norm": 1.5610107572699294, + "learning_rate": 1.127787362862291e-05, + "loss": 0.8363, + "step": 2951 + }, + { + "epoch": 0.48, + "grad_norm": 1.5798116684447117, + "learning_rate": 1.1272696786605595e-05, + "loss": 0.9311, + "step": 2952 + }, + { + "epoch": 0.48, + "grad_norm": 1.2809505868761126, + "learning_rate": 1.1267519597870591e-05, + "loss": 0.8554, + "step": 2953 + }, + { + "epoch": 0.48, + "grad_norm": 1.3269061119256682, + "learning_rate": 1.126234206382831e-05, + "loss": 0.9096, + "step": 2954 + }, + { + "epoch": 0.48, + "grad_norm": 1.4759068111794678, + "learning_rate": 1.1257164185889255e-05, + "loss": 0.8255, + "step": 2955 + }, + { + "epoch": 0.48, + "grad_norm": 1.2955341089289438, + "learning_rate": 1.1251985965464022e-05, + "loss": 0.8678, + "step": 2956 + }, + { + "epoch": 0.48, + "grad_norm": 1.5321555367340742, + "learning_rate": 1.12468074039633e-05, + "loss": 0.8853, + "step": 2957 + }, + { + "epoch": 0.48, + "grad_norm": 1.468687954058805, + "learning_rate": 1.1241628502797872e-05, + "loss": 0.8495, + "step": 2958 + }, + { + "epoch": 0.48, + "grad_norm": 1.2278376172975625, + "learning_rate": 1.1236449263378617e-05, + "loss": 0.8643, + "step": 2959 + }, + { + "epoch": 0.48, + "grad_norm": 1.4976612381760017, + "learning_rate": 1.1231269687116495e-05, + "loss": 0.8688, + "step": 2960 + }, + { + "epoch": 0.48, + "grad_norm": 1.3245385024466307, + "learning_rate": 1.1226089775422575e-05, + "loss": 0.8985, + "step": 2961 + }, + { + "epoch": 0.48, + "grad_norm": 1.343736999886859, + "learning_rate": 1.1220909529707999e-05, + "loss": 0.952, + "step": 2962 + }, + { + "epoch": 0.48, + "grad_norm": 1.3539183404231865, + "learning_rate": 1.1215728951384013e-05, + "loss": 0.8531, + "step": 2963 + }, + { + "epoch": 0.48, + "grad_norm": 1.297294036975661, + "learning_rate": 1.121054804186195e-05, + "loss": 0.8817, + "step": 2964 + }, + { + "epoch": 0.48, + "grad_norm": 1.3092166144211002, + "learning_rate": 1.1205366802553231e-05, + "loss": 0.8348, + "step": 2965 + }, + { + "epoch": 0.48, + "grad_norm": 1.630493325138277, + "learning_rate": 1.1200185234869372e-05, + "loss": 0.8637, + "step": 2966 + }, + { + "epoch": 0.48, + "grad_norm": 1.3499751368248618, + "learning_rate": 1.1195003340221968e-05, + "loss": 0.8746, + "step": 2967 + }, + { + "epoch": 0.48, + "grad_norm": 1.4052956925963385, + "learning_rate": 1.1189821120022712e-05, + "loss": 0.8543, + "step": 2968 + }, + { + "epoch": 0.48, + "grad_norm": 1.4099089380534122, + "learning_rate": 1.1184638575683388e-05, + "loss": 0.8621, + "step": 2969 + }, + { + "epoch": 0.48, + "grad_norm": 1.4528705851044637, + "learning_rate": 1.1179455708615863e-05, + "loss": 0.8594, + "step": 2970 + }, + { + "epoch": 0.48, + "grad_norm": 1.496986509989778, + "learning_rate": 1.1174272520232087e-05, + "loss": 0.8853, + "step": 2971 + }, + { + "epoch": 0.48, + "grad_norm": 1.3505102772183077, + "learning_rate": 1.1169089011944113e-05, + "loss": 0.8235, + "step": 2972 + }, + { + "epoch": 0.48, + "grad_norm": 1.4212502276750465, + "learning_rate": 1.116390518516406e-05, + "loss": 0.8558, + "step": 2973 + }, + { + "epoch": 0.48, + "grad_norm": 1.7726388038106262, + "learning_rate": 1.1158721041304155e-05, + "loss": 0.8839, + "step": 2974 + }, + { + "epoch": 0.48, + "grad_norm": 1.4986555163641022, + "learning_rate": 1.1153536581776697e-05, + "loss": 0.8677, + "step": 2975 + }, + { + "epoch": 0.48, + "grad_norm": 1.3681541104877117, + "learning_rate": 1.1148351807994075e-05, + "loss": 0.8653, + "step": 2976 + }, + { + "epoch": 0.48, + "grad_norm": 1.2996499989130925, + "learning_rate": 1.114316672136877e-05, + "loss": 0.8279, + "step": 2977 + }, + { + "epoch": 0.48, + "grad_norm": 1.3546121801295268, + "learning_rate": 1.1137981323313335e-05, + "loss": 0.8361, + "step": 2978 + }, + { + "epoch": 0.48, + "grad_norm": 1.4983547583141477, + "learning_rate": 1.1132795615240419e-05, + "loss": 0.8882, + "step": 2979 + }, + { + "epoch": 0.48, + "grad_norm": 1.303788976441012, + "learning_rate": 1.1127609598562754e-05, + "loss": 0.8003, + "step": 2980 + }, + { + "epoch": 0.48, + "grad_norm": 1.7365332612079538, + "learning_rate": 1.1122423274693152e-05, + "loss": 0.8311, + "step": 2981 + }, + { + "epoch": 0.48, + "grad_norm": 1.4166608405928396, + "learning_rate": 1.1117236645044506e-05, + "loss": 0.8556, + "step": 2982 + }, + { + "epoch": 0.48, + "grad_norm": 1.6934813265242583, + "learning_rate": 1.1112049711029807e-05, + "loss": 0.9303, + "step": 2983 + }, + { + "epoch": 0.48, + "grad_norm": 1.5709518855600877, + "learning_rate": 1.1106862474062107e-05, + "loss": 0.9316, + "step": 2984 + }, + { + "epoch": 0.48, + "grad_norm": 0.857608599957514, + "learning_rate": 1.1101674935554561e-05, + "loss": 0.3559, + "step": 2985 + }, + { + "epoch": 0.48, + "grad_norm": 1.4075655975510553, + "learning_rate": 1.1096487096920394e-05, + "loss": 0.8147, + "step": 2986 + }, + { + "epoch": 0.48, + "grad_norm": 1.2997814902937594, + "learning_rate": 1.1091298959572919e-05, + "loss": 0.7365, + "step": 2987 + }, + { + "epoch": 0.48, + "grad_norm": 1.40202356909792, + "learning_rate": 1.1086110524925524e-05, + "loss": 0.8973, + "step": 2988 + }, + { + "epoch": 0.48, + "grad_norm": 1.5691812103090126, + "learning_rate": 1.1080921794391684e-05, + "loss": 0.8815, + "step": 2989 + }, + { + "epoch": 0.48, + "grad_norm": 1.2316265260256627, + "learning_rate": 1.1075732769384948e-05, + "loss": 0.8537, + "step": 2990 + }, + { + "epoch": 0.48, + "grad_norm": 1.4167117295449394, + "learning_rate": 1.1070543451318956e-05, + "loss": 0.8762, + "step": 2991 + }, + { + "epoch": 0.48, + "grad_norm": 1.5047278896943974, + "learning_rate": 1.106535384160742e-05, + "loss": 0.8648, + "step": 2992 + }, + { + "epoch": 0.48, + "grad_norm": 1.3259441288324516, + "learning_rate": 1.1060163941664125e-05, + "loss": 0.8455, + "step": 2993 + }, + { + "epoch": 0.48, + "grad_norm": 0.8233852058613227, + "learning_rate": 1.1054973752902956e-05, + "loss": 0.3473, + "step": 2994 + }, + { + "epoch": 0.48, + "grad_norm": 1.607580340513171, + "learning_rate": 1.1049783276737852e-05, + "loss": 0.8417, + "step": 2995 + }, + { + "epoch": 0.48, + "grad_norm": 1.4068987224253178, + "learning_rate": 1.1044592514582843e-05, + "loss": 0.859, + "step": 2996 + }, + { + "epoch": 0.48, + "grad_norm": 1.7232248174610592, + "learning_rate": 1.1039401467852044e-05, + "loss": 0.8699, + "step": 2997 + }, + { + "epoch": 0.48, + "grad_norm": 0.8293732241875293, + "learning_rate": 1.1034210137959632e-05, + "loss": 0.344, + "step": 2998 + }, + { + "epoch": 0.48, + "grad_norm": 1.3692938185626327, + "learning_rate": 1.102901852631987e-05, + "loss": 0.8968, + "step": 2999 + }, + { + "epoch": 0.48, + "grad_norm": 1.4805334050018335, + "learning_rate": 1.1023826634347096e-05, + "loss": 0.8902, + "step": 3000 + }, + { + "epoch": 0.48, + "grad_norm": 1.5018954613210944, + "learning_rate": 1.1018634463455727e-05, + "loss": 0.7854, + "step": 3001 + }, + { + "epoch": 0.48, + "grad_norm": 1.8211642920579347, + "learning_rate": 1.101344201506025e-05, + "loss": 0.8621, + "step": 3002 + }, + { + "epoch": 0.48, + "grad_norm": 1.312300602349726, + "learning_rate": 1.1008249290575232e-05, + "loss": 0.8464, + "step": 3003 + }, + { + "epoch": 0.48, + "grad_norm": 1.5266064494775065, + "learning_rate": 1.1003056291415313e-05, + "loss": 0.8481, + "step": 3004 + }, + { + "epoch": 0.48, + "grad_norm": 1.424324660723558, + "learning_rate": 1.0997863018995214e-05, + "loss": 0.9076, + "step": 3005 + }, + { + "epoch": 0.48, + "grad_norm": 1.3320157303283566, + "learning_rate": 1.0992669474729723e-05, + "loss": 0.7934, + "step": 3006 + }, + { + "epoch": 0.48, + "grad_norm": 1.3626655538184695, + "learning_rate": 1.09874756600337e-05, + "loss": 0.8851, + "step": 3007 + }, + { + "epoch": 0.48, + "grad_norm": 1.2323640459914589, + "learning_rate": 1.0982281576322091e-05, + "loss": 0.8774, + "step": 3008 + }, + { + "epoch": 0.48, + "grad_norm": 1.2549033753309693, + "learning_rate": 1.09770872250099e-05, + "loss": 0.839, + "step": 3009 + }, + { + "epoch": 0.48, + "grad_norm": 1.3773540758523743, + "learning_rate": 1.097189260751222e-05, + "loss": 0.8842, + "step": 3010 + }, + { + "epoch": 0.49, + "grad_norm": 1.4174115272284709, + "learning_rate": 1.09666977252442e-05, + "loss": 0.7974, + "step": 3011 + }, + { + "epoch": 0.49, + "grad_norm": 0.7896154745205465, + "learning_rate": 1.0961502579621073e-05, + "loss": 0.3597, + "step": 3012 + }, + { + "epoch": 0.49, + "grad_norm": 1.5416169286745272, + "learning_rate": 1.0956307172058138e-05, + "loss": 0.917, + "step": 3013 + }, + { + "epoch": 0.49, + "grad_norm": 1.2237705760529862, + "learning_rate": 1.0951111503970771e-05, + "loss": 0.8209, + "step": 3014 + }, + { + "epoch": 0.49, + "grad_norm": 1.2416210967527441, + "learning_rate": 1.0945915576774411e-05, + "loss": 0.8677, + "step": 3015 + }, + { + "epoch": 0.49, + "grad_norm": 1.411229555978256, + "learning_rate": 1.0940719391884578e-05, + "loss": 0.8517, + "step": 3016 + }, + { + "epoch": 0.49, + "grad_norm": 1.2529346528404601, + "learning_rate": 1.0935522950716847e-05, + "loss": 0.9088, + "step": 3017 + }, + { + "epoch": 0.49, + "grad_norm": 1.3785350836290926, + "learning_rate": 1.0930326254686877e-05, + "loss": 0.8228, + "step": 3018 + }, + { + "epoch": 0.49, + "grad_norm": 1.4542644593947307, + "learning_rate": 1.0925129305210393e-05, + "loss": 0.8685, + "step": 3019 + }, + { + "epoch": 0.49, + "grad_norm": 1.6380241157234026, + "learning_rate": 1.0919932103703178e-05, + "loss": 0.8182, + "step": 3020 + }, + { + "epoch": 0.49, + "grad_norm": 1.2486095574133431, + "learning_rate": 1.091473465158111e-05, + "loss": 0.8835, + "step": 3021 + }, + { + "epoch": 0.49, + "grad_norm": 1.381331825300475, + "learning_rate": 1.0909536950260102e-05, + "loss": 0.9239, + "step": 3022 + }, + { + "epoch": 0.49, + "grad_norm": 1.4046452213582474, + "learning_rate": 1.0904339001156158e-05, + "loss": 0.9335, + "step": 3023 + }, + { + "epoch": 0.49, + "grad_norm": 1.2256137441999007, + "learning_rate": 1.0899140805685342e-05, + "loss": 0.8673, + "step": 3024 + }, + { + "epoch": 0.49, + "grad_norm": 1.2910098722823145, + "learning_rate": 1.0893942365263786e-05, + "loss": 0.8909, + "step": 3025 + }, + { + "epoch": 0.49, + "grad_norm": 1.7420881813974156, + "learning_rate": 1.0888743681307686e-05, + "loss": 0.9179, + "step": 3026 + }, + { + "epoch": 0.49, + "grad_norm": 1.333959480098031, + "learning_rate": 1.0883544755233313e-05, + "loss": 0.8248, + "step": 3027 + }, + { + "epoch": 0.49, + "grad_norm": 1.586648918889774, + "learning_rate": 1.087834558845699e-05, + "loss": 0.8658, + "step": 3028 + }, + { + "epoch": 0.49, + "grad_norm": 1.284939605633235, + "learning_rate": 1.0873146182395118e-05, + "loss": 0.8498, + "step": 3029 + }, + { + "epoch": 0.49, + "grad_norm": 1.5124325674912475, + "learning_rate": 1.086794653846416e-05, + "loss": 0.8567, + "step": 3030 + }, + { + "epoch": 0.49, + "grad_norm": 1.4019648420583464, + "learning_rate": 1.0862746658080635e-05, + "loss": 0.8788, + "step": 3031 + }, + { + "epoch": 0.49, + "grad_norm": 1.580635005674901, + "learning_rate": 1.085754654266114e-05, + "loss": 0.7656, + "step": 3032 + }, + { + "epoch": 0.49, + "grad_norm": 1.166470229364124, + "learning_rate": 1.0852346193622332e-05, + "loss": 0.8374, + "step": 3033 + }, + { + "epoch": 0.49, + "grad_norm": 1.5701394851701387, + "learning_rate": 1.0847145612380922e-05, + "loss": 0.9083, + "step": 3034 + }, + { + "epoch": 0.49, + "grad_norm": 1.3089514385850622, + "learning_rate": 1.0841944800353696e-05, + "loss": 0.8856, + "step": 3035 + }, + { + "epoch": 0.49, + "grad_norm": 1.444316081077573, + "learning_rate": 1.08367437589575e-05, + "loss": 0.8077, + "step": 3036 + }, + { + "epoch": 0.49, + "grad_norm": 0.9166723969354483, + "learning_rate": 1.083154248960924e-05, + "loss": 0.3726, + "step": 3037 + }, + { + "epoch": 0.49, + "grad_norm": 1.5668749227036867, + "learning_rate": 1.0826340993725882e-05, + "loss": 0.8868, + "step": 3038 + }, + { + "epoch": 0.49, + "grad_norm": 1.461915218120926, + "learning_rate": 1.0821139272724457e-05, + "loss": 0.8095, + "step": 3039 + }, + { + "epoch": 0.49, + "grad_norm": 1.3578788523270395, + "learning_rate": 1.0815937328022061e-05, + "loss": 0.9192, + "step": 3040 + }, + { + "epoch": 0.49, + "grad_norm": 1.1894804610006675, + "learning_rate": 1.0810735161035848e-05, + "loss": 0.8428, + "step": 3041 + }, + { + "epoch": 0.49, + "grad_norm": 1.7331824111646426, + "learning_rate": 1.0805532773183022e-05, + "loss": 0.8325, + "step": 3042 + }, + { + "epoch": 0.49, + "grad_norm": 1.4538021086733113, + "learning_rate": 1.0800330165880872e-05, + "loss": 0.8545, + "step": 3043 + }, + { + "epoch": 0.49, + "grad_norm": 1.3360087497015196, + "learning_rate": 1.0795127340546718e-05, + "loss": 0.8988, + "step": 3044 + }, + { + "epoch": 0.49, + "grad_norm": 1.5260827250349465, + "learning_rate": 1.0789924298597963e-05, + "loss": 0.8645, + "step": 3045 + }, + { + "epoch": 0.49, + "grad_norm": 1.235726106354246, + "learning_rate": 1.0784721041452054e-05, + "loss": 0.8812, + "step": 3046 + }, + { + "epoch": 0.49, + "grad_norm": 1.4751178029164163, + "learning_rate": 1.07795175705265e-05, + "loss": 0.8403, + "step": 3047 + }, + { + "epoch": 0.49, + "grad_norm": 1.553100274117524, + "learning_rate": 1.0774313887238874e-05, + "loss": 0.832, + "step": 3048 + }, + { + "epoch": 0.49, + "grad_norm": 1.7974727771364543, + "learning_rate": 1.0769109993006802e-05, + "loss": 0.7906, + "step": 3049 + }, + { + "epoch": 0.49, + "grad_norm": 1.3385744331266738, + "learning_rate": 1.0763905889247964e-05, + "loss": 0.8528, + "step": 3050 + }, + { + "epoch": 0.49, + "grad_norm": 1.2120850666797633, + "learning_rate": 1.0758701577380107e-05, + "loss": 0.8312, + "step": 3051 + }, + { + "epoch": 0.49, + "grad_norm": 1.2644943600030492, + "learning_rate": 1.0753497058821027e-05, + "loss": 0.8489, + "step": 3052 + }, + { + "epoch": 0.49, + "grad_norm": 1.52135022910106, + "learning_rate": 1.0748292334988573e-05, + "loss": 0.86, + "step": 3053 + }, + { + "epoch": 0.49, + "grad_norm": 1.2692066026951876, + "learning_rate": 1.0743087407300665e-05, + "loss": 0.8497, + "step": 3054 + }, + { + "epoch": 0.49, + "grad_norm": 1.2573178233752513, + "learning_rate": 1.0737882277175262e-05, + "loss": 0.7725, + "step": 3055 + }, + { + "epoch": 0.49, + "grad_norm": 1.3798735119505534, + "learning_rate": 1.073267694603039e-05, + "loss": 0.9252, + "step": 3056 + }, + { + "epoch": 0.49, + "grad_norm": 1.603363245926989, + "learning_rate": 1.0727471415284119e-05, + "loss": 0.927, + "step": 3057 + }, + { + "epoch": 0.49, + "grad_norm": 1.4765345751175483, + "learning_rate": 1.072226568635458e-05, + "loss": 0.8639, + "step": 3058 + }, + { + "epoch": 0.49, + "grad_norm": 1.6139136049533478, + "learning_rate": 1.0717059760659963e-05, + "loss": 0.8781, + "step": 3059 + }, + { + "epoch": 0.49, + "grad_norm": 1.5698912907298164, + "learning_rate": 1.0711853639618497e-05, + "loss": 0.8643, + "step": 3060 + }, + { + "epoch": 0.49, + "grad_norm": 1.3785351653753097, + "learning_rate": 1.0706647324648481e-05, + "loss": 0.796, + "step": 3061 + }, + { + "epoch": 0.49, + "grad_norm": 1.4642440445685065, + "learning_rate": 1.0701440817168251e-05, + "loss": 0.8572, + "step": 3062 + }, + { + "epoch": 0.49, + "grad_norm": 1.219279837177049, + "learning_rate": 1.069623411859621e-05, + "loss": 0.8526, + "step": 3063 + }, + { + "epoch": 0.49, + "grad_norm": 1.185054528347692, + "learning_rate": 1.0691027230350802e-05, + "loss": 0.8926, + "step": 3064 + }, + { + "epoch": 0.49, + "grad_norm": 1.2715295089234997, + "learning_rate": 1.0685820153850528e-05, + "loss": 0.8697, + "step": 3065 + }, + { + "epoch": 0.49, + "grad_norm": 1.523485082714657, + "learning_rate": 1.0680612890513937e-05, + "loss": 0.9524, + "step": 3066 + }, + { + "epoch": 0.49, + "grad_norm": 1.4532688374753664, + "learning_rate": 1.0675405441759639e-05, + "loss": 0.858, + "step": 3067 + }, + { + "epoch": 0.49, + "grad_norm": 0.8122004899145338, + "learning_rate": 1.0670197809006277e-05, + "loss": 0.3511, + "step": 3068 + }, + { + "epoch": 0.49, + "grad_norm": 1.3301169183930275, + "learning_rate": 1.0664989993672559e-05, + "loss": 0.8247, + "step": 3069 + }, + { + "epoch": 0.49, + "grad_norm": 1.5584257986647938, + "learning_rate": 1.0659781997177239e-05, + "loss": 0.8551, + "step": 3070 + }, + { + "epoch": 0.49, + "grad_norm": 1.309450681713823, + "learning_rate": 1.0654573820939112e-05, + "loss": 0.8529, + "step": 3071 + }, + { + "epoch": 0.49, + "grad_norm": 1.272414766529403, + "learning_rate": 1.0649365466377033e-05, + "loss": 0.8431, + "step": 3072 + }, + { + "epoch": 0.5, + "grad_norm": 1.5019277346018816, + "learning_rate": 1.0644156934909902e-05, + "loss": 0.8873, + "step": 3073 + }, + { + "epoch": 0.5, + "grad_norm": 1.3930023571737786, + "learning_rate": 1.0638948227956665e-05, + "loss": 0.892, + "step": 3074 + }, + { + "epoch": 0.5, + "grad_norm": 1.420408759341007, + "learning_rate": 1.0633739346936318e-05, + "loss": 0.8424, + "step": 3075 + }, + { + "epoch": 0.5, + "grad_norm": 1.5070115411704652, + "learning_rate": 1.0628530293267908e-05, + "loss": 0.8681, + "step": 3076 + }, + { + "epoch": 0.5, + "grad_norm": 1.5695336531245214, + "learning_rate": 1.0623321068370515e-05, + "loss": 0.8613, + "step": 3077 + }, + { + "epoch": 0.5, + "grad_norm": 1.3391037550427791, + "learning_rate": 1.0618111673663283e-05, + "loss": 0.9225, + "step": 3078 + }, + { + "epoch": 0.5, + "grad_norm": 1.2236615160097064, + "learning_rate": 1.0612902110565393e-05, + "loss": 0.9126, + "step": 3079 + }, + { + "epoch": 0.5, + "grad_norm": 1.469619887780148, + "learning_rate": 1.060769238049607e-05, + "loss": 0.9166, + "step": 3080 + }, + { + "epoch": 0.5, + "grad_norm": 1.3387055323788486, + "learning_rate": 1.0602482484874598e-05, + "loss": 0.9164, + "step": 3081 + }, + { + "epoch": 0.5, + "grad_norm": 1.2345736610072682, + "learning_rate": 1.0597272425120286e-05, + "loss": 0.9115, + "step": 3082 + }, + { + "epoch": 0.5, + "grad_norm": 1.2540792441688027, + "learning_rate": 1.05920622026525e-05, + "loss": 0.8786, + "step": 3083 + }, + { + "epoch": 0.5, + "grad_norm": 1.4995320379929469, + "learning_rate": 1.0586851818890651e-05, + "loss": 0.8833, + "step": 3084 + }, + { + "epoch": 0.5, + "grad_norm": 1.3356992198137094, + "learning_rate": 1.058164127525419e-05, + "loss": 0.8919, + "step": 3085 + }, + { + "epoch": 0.5, + "grad_norm": 1.2781278831127638, + "learning_rate": 1.0576430573162612e-05, + "loss": 0.8647, + "step": 3086 + }, + { + "epoch": 0.5, + "grad_norm": 1.4111650351043021, + "learning_rate": 1.057121971403546e-05, + "loss": 0.8607, + "step": 3087 + }, + { + "epoch": 0.5, + "grad_norm": 1.7572311404275531, + "learning_rate": 1.0566008699292307e-05, + "loss": 0.8936, + "step": 3088 + }, + { + "epoch": 0.5, + "grad_norm": 1.1955921291687412, + "learning_rate": 1.0560797530352784e-05, + "loss": 0.8447, + "step": 3089 + }, + { + "epoch": 0.5, + "grad_norm": 1.2750046395703591, + "learning_rate": 1.0555586208636557e-05, + "loss": 0.8708, + "step": 3090 + }, + { + "epoch": 0.5, + "grad_norm": 1.3163310122850227, + "learning_rate": 1.0550374735563329e-05, + "loss": 0.8144, + "step": 3091 + }, + { + "epoch": 0.5, + "grad_norm": 1.3811984043927703, + "learning_rate": 1.0545163112552856e-05, + "loss": 0.8669, + "step": 3092 + }, + { + "epoch": 0.5, + "grad_norm": 1.5005440675225892, + "learning_rate": 1.053995134102492e-05, + "loss": 0.8513, + "step": 3093 + }, + { + "epoch": 0.5, + "grad_norm": 1.1969314959951984, + "learning_rate": 1.0534739422399357e-05, + "loss": 0.9069, + "step": 3094 + }, + { + "epoch": 0.5, + "grad_norm": 1.3382550662063022, + "learning_rate": 1.0529527358096035e-05, + "loss": 0.8344, + "step": 3095 + }, + { + "epoch": 0.5, + "grad_norm": 1.2396128741786128, + "learning_rate": 1.052431514953486e-05, + "loss": 0.893, + "step": 3096 + }, + { + "epoch": 0.5, + "grad_norm": 1.4395120911444843, + "learning_rate": 1.051910279813579e-05, + "loss": 0.8691, + "step": 3097 + }, + { + "epoch": 0.5, + "grad_norm": 1.5309220462779471, + "learning_rate": 1.0513890305318808e-05, + "loss": 0.825, + "step": 3098 + }, + { + "epoch": 0.5, + "grad_norm": 1.477627941894886, + "learning_rate": 1.0508677672503942e-05, + "loss": 0.8173, + "step": 3099 + }, + { + "epoch": 0.5, + "grad_norm": 1.3164822474426208, + "learning_rate": 1.0503464901111253e-05, + "loss": 0.9002, + "step": 3100 + }, + { + "epoch": 0.5, + "grad_norm": 1.558687143348059, + "learning_rate": 1.0498251992560851e-05, + "loss": 0.8533, + "step": 3101 + }, + { + "epoch": 0.5, + "grad_norm": 1.3918851898004079, + "learning_rate": 1.0493038948272866e-05, + "loss": 0.899, + "step": 3102 + }, + { + "epoch": 0.5, + "grad_norm": 1.4231218704380626, + "learning_rate": 1.0487825769667489e-05, + "loss": 0.8851, + "step": 3103 + }, + { + "epoch": 0.5, + "grad_norm": 1.21505158185986, + "learning_rate": 1.0482612458164918e-05, + "loss": 0.8337, + "step": 3104 + }, + { + "epoch": 0.5, + "grad_norm": 1.2429226860475722, + "learning_rate": 1.0477399015185415e-05, + "loss": 0.8279, + "step": 3105 + }, + { + "epoch": 0.5, + "grad_norm": 0.9001127551768887, + "learning_rate": 1.0472185442149257e-05, + "loss": 0.3617, + "step": 3106 + }, + { + "epoch": 0.5, + "grad_norm": 1.3572454488804095, + "learning_rate": 1.046697174047677e-05, + "loss": 0.8517, + "step": 3107 + }, + { + "epoch": 0.5, + "grad_norm": 1.3877297510575701, + "learning_rate": 1.0461757911588309e-05, + "loss": 0.8241, + "step": 3108 + }, + { + "epoch": 0.5, + "grad_norm": 1.4030912606573163, + "learning_rate": 1.0456543956904266e-05, + "loss": 0.8886, + "step": 3109 + }, + { + "epoch": 0.5, + "grad_norm": 1.497197921500087, + "learning_rate": 1.0451329877845062e-05, + "loss": 0.8048, + "step": 3110 + }, + { + "epoch": 0.5, + "grad_norm": 1.489961032871929, + "learning_rate": 1.0446115675831159e-05, + "loss": 0.7885, + "step": 3111 + }, + { + "epoch": 0.5, + "grad_norm": 1.4814037120387153, + "learning_rate": 1.044090135228305e-05, + "loss": 0.829, + "step": 3112 + }, + { + "epoch": 0.5, + "grad_norm": 1.2720968906994399, + "learning_rate": 1.0435686908621256e-05, + "loss": 0.8484, + "step": 3113 + }, + { + "epoch": 0.5, + "grad_norm": 1.3388453620227982, + "learning_rate": 1.0430472346266343e-05, + "loss": 0.831, + "step": 3114 + }, + { + "epoch": 0.5, + "grad_norm": 1.36215736864177, + "learning_rate": 1.0425257666638891e-05, + "loss": 0.8744, + "step": 3115 + }, + { + "epoch": 0.5, + "grad_norm": 1.3694964305707824, + "learning_rate": 1.0420042871159532e-05, + "loss": 0.8057, + "step": 3116 + }, + { + "epoch": 0.5, + "grad_norm": 1.6054553500658117, + "learning_rate": 1.0414827961248917e-05, + "loss": 0.9106, + "step": 3117 + }, + { + "epoch": 0.5, + "grad_norm": 1.1922218942899219, + "learning_rate": 1.040961293832773e-05, + "loss": 0.8907, + "step": 3118 + }, + { + "epoch": 0.5, + "grad_norm": 1.2974136717474913, + "learning_rate": 1.0404397803816686e-05, + "loss": 0.8941, + "step": 3119 + }, + { + "epoch": 0.5, + "grad_norm": 1.55441616256837, + "learning_rate": 1.0399182559136536e-05, + "loss": 0.8465, + "step": 3120 + }, + { + "epoch": 0.5, + "grad_norm": 1.2929329906802793, + "learning_rate": 1.0393967205708049e-05, + "loss": 0.7521, + "step": 3121 + }, + { + "epoch": 0.5, + "grad_norm": 1.470199296020881, + "learning_rate": 1.0388751744952037e-05, + "loss": 0.8495, + "step": 3122 + }, + { + "epoch": 0.5, + "grad_norm": 1.6260963166730618, + "learning_rate": 1.0383536178289336e-05, + "loss": 0.876, + "step": 3123 + }, + { + "epoch": 0.5, + "grad_norm": 1.4356338325425162, + "learning_rate": 1.0378320507140804e-05, + "loss": 0.7491, + "step": 3124 + }, + { + "epoch": 0.5, + "grad_norm": 1.0090143629780062, + "learning_rate": 1.0373104732927341e-05, + "loss": 0.3394, + "step": 3125 + }, + { + "epoch": 0.5, + "grad_norm": 0.8160227891444077, + "learning_rate": 1.036788885706986e-05, + "loss": 0.3514, + "step": 3126 + }, + { + "epoch": 0.5, + "grad_norm": 1.996273928458851, + "learning_rate": 1.0362672880989317e-05, + "loss": 0.9005, + "step": 3127 + }, + { + "epoch": 0.5, + "grad_norm": 1.2449916088130744, + "learning_rate": 1.0357456806106684e-05, + "loss": 0.9654, + "step": 3128 + }, + { + "epoch": 0.5, + "grad_norm": 1.3419406548694788, + "learning_rate": 1.0352240633842962e-05, + "loss": 0.8573, + "step": 3129 + }, + { + "epoch": 0.5, + "grad_norm": 1.2650574674411519, + "learning_rate": 1.0347024365619183e-05, + "loss": 0.8852, + "step": 3130 + }, + { + "epoch": 0.5, + "grad_norm": 1.6728382321621054, + "learning_rate": 1.0341808002856402e-05, + "loss": 0.8419, + "step": 3131 + }, + { + "epoch": 0.5, + "grad_norm": 1.399009813244773, + "learning_rate": 1.0336591546975695e-05, + "loss": 0.8571, + "step": 3132 + }, + { + "epoch": 0.5, + "grad_norm": 1.3006703809938753, + "learning_rate": 1.0331374999398175e-05, + "loss": 0.8225, + "step": 3133 + }, + { + "epoch": 0.5, + "grad_norm": 1.3331187596803233, + "learning_rate": 1.0326158361544971e-05, + "loss": 0.8225, + "step": 3134 + }, + { + "epoch": 0.51, + "grad_norm": 1.3008551275907787, + "learning_rate": 1.0320941634837238e-05, + "loss": 0.7968, + "step": 3135 + }, + { + "epoch": 0.51, + "grad_norm": 1.18041499865668, + "learning_rate": 1.031572482069616e-05, + "loss": 0.9243, + "step": 3136 + }, + { + "epoch": 0.51, + "grad_norm": 1.3027629863451, + "learning_rate": 1.0310507920542932e-05, + "loss": 0.886, + "step": 3137 + }, + { + "epoch": 0.51, + "grad_norm": 1.23192960668184, + "learning_rate": 1.0305290935798794e-05, + "loss": 0.7809, + "step": 3138 + }, + { + "epoch": 0.51, + "grad_norm": 1.2739663324156287, + "learning_rate": 1.0300073867884983e-05, + "loss": 0.978, + "step": 3139 + }, + { + "epoch": 0.51, + "grad_norm": 1.291261864464104, + "learning_rate": 1.0294856718222782e-05, + "loss": 0.8875, + "step": 3140 + }, + { + "epoch": 0.51, + "grad_norm": 1.2803623989107642, + "learning_rate": 1.0289639488233484e-05, + "loss": 0.8812, + "step": 3141 + }, + { + "epoch": 0.51, + "grad_norm": 1.2952654326714508, + "learning_rate": 1.0284422179338404e-05, + "loss": 0.798, + "step": 3142 + }, + { + "epoch": 0.51, + "grad_norm": 1.460529664750074, + "learning_rate": 1.0279204792958883e-05, + "loss": 0.8852, + "step": 3143 + }, + { + "epoch": 0.51, + "grad_norm": 1.4179800016046116, + "learning_rate": 1.0273987330516279e-05, + "loss": 0.7886, + "step": 3144 + }, + { + "epoch": 0.51, + "grad_norm": 1.1754818904342017, + "learning_rate": 1.0268769793431975e-05, + "loss": 0.3502, + "step": 3145 + }, + { + "epoch": 0.51, + "grad_norm": 1.3339787136812287, + "learning_rate": 1.026355218312737e-05, + "loss": 0.815, + "step": 3146 + }, + { + "epoch": 0.51, + "grad_norm": 1.2073814148820599, + "learning_rate": 1.0258334501023886e-05, + "loss": 0.822, + "step": 3147 + }, + { + "epoch": 0.51, + "grad_norm": 1.2465137839546458, + "learning_rate": 1.025311674854296e-05, + "loss": 0.8075, + "step": 3148 + }, + { + "epoch": 0.51, + "grad_norm": 1.2636057749445282, + "learning_rate": 1.0247898927106058e-05, + "loss": 0.802, + "step": 3149 + }, + { + "epoch": 0.51, + "grad_norm": 1.4012301015759687, + "learning_rate": 1.0242681038134654e-05, + "loss": 0.8936, + "step": 3150 + }, + { + "epoch": 0.51, + "grad_norm": 1.3593550686087572, + "learning_rate": 1.0237463083050246e-05, + "loss": 0.8381, + "step": 3151 + }, + { + "epoch": 0.51, + "grad_norm": 1.2268692771142444, + "learning_rate": 1.0232245063274349e-05, + "loss": 0.8783, + "step": 3152 + }, + { + "epoch": 0.51, + "grad_norm": 1.5626147728728654, + "learning_rate": 1.0227026980228492e-05, + "loss": 0.9065, + "step": 3153 + }, + { + "epoch": 0.51, + "grad_norm": 1.149314831118315, + "learning_rate": 1.022180883533423e-05, + "loss": 0.8502, + "step": 3154 + }, + { + "epoch": 0.51, + "grad_norm": 1.3129715856368063, + "learning_rate": 1.0216590630013125e-05, + "loss": 0.8691, + "step": 3155 + }, + { + "epoch": 0.51, + "grad_norm": 1.2882633282297973, + "learning_rate": 1.0211372365686763e-05, + "loss": 0.8097, + "step": 3156 + }, + { + "epoch": 0.51, + "grad_norm": 1.2599390612837662, + "learning_rate": 1.0206154043776741e-05, + "loss": 0.8594, + "step": 3157 + }, + { + "epoch": 0.51, + "grad_norm": 1.3250497275593773, + "learning_rate": 1.0200935665704679e-05, + "loss": 0.8597, + "step": 3158 + }, + { + "epoch": 0.51, + "grad_norm": 1.3176112630834818, + "learning_rate": 1.0195717232892198e-05, + "loss": 0.8682, + "step": 3159 + }, + { + "epoch": 0.51, + "grad_norm": 1.4073732519366353, + "learning_rate": 1.0190498746760951e-05, + "loss": 0.8688, + "step": 3160 + }, + { + "epoch": 0.51, + "grad_norm": 1.6235617189457237, + "learning_rate": 1.0185280208732594e-05, + "loss": 0.8239, + "step": 3161 + }, + { + "epoch": 0.51, + "grad_norm": 1.4806289296824688, + "learning_rate": 1.0180061620228799e-05, + "loss": 0.8933, + "step": 3162 + }, + { + "epoch": 0.51, + "grad_norm": 1.359189705305406, + "learning_rate": 1.0174842982671258e-05, + "loss": 0.8886, + "step": 3163 + }, + { + "epoch": 0.51, + "grad_norm": 1.6146281729645078, + "learning_rate": 1.0169624297481663e-05, + "loss": 0.8727, + "step": 3164 + }, + { + "epoch": 0.51, + "grad_norm": 1.3418625794000443, + "learning_rate": 1.016440556608174e-05, + "loss": 0.8618, + "step": 3165 + }, + { + "epoch": 0.51, + "grad_norm": 1.533684622729243, + "learning_rate": 1.0159186789893207e-05, + "loss": 0.824, + "step": 3166 + }, + { + "epoch": 0.51, + "grad_norm": 1.4036490732862157, + "learning_rate": 1.0153967970337803e-05, + "loss": 0.875, + "step": 3167 + }, + { + "epoch": 0.51, + "grad_norm": 1.2541933057919359, + "learning_rate": 1.0148749108837282e-05, + "loss": 0.8879, + "step": 3168 + }, + { + "epoch": 0.51, + "grad_norm": 1.4148761880622192, + "learning_rate": 1.0143530206813403e-05, + "loss": 0.8635, + "step": 3169 + }, + { + "epoch": 0.51, + "grad_norm": 1.4013081823945985, + "learning_rate": 1.013831126568794e-05, + "loss": 0.8664, + "step": 3170 + }, + { + "epoch": 0.51, + "grad_norm": 1.460869006601472, + "learning_rate": 1.0133092286882672e-05, + "loss": 0.8606, + "step": 3171 + }, + { + "epoch": 0.51, + "grad_norm": 1.4625063813001031, + "learning_rate": 1.01278732718194e-05, + "loss": 0.8937, + "step": 3172 + }, + { + "epoch": 0.51, + "grad_norm": 1.409921717755494, + "learning_rate": 1.0122654221919921e-05, + "loss": 0.8269, + "step": 3173 + }, + { + "epoch": 0.51, + "grad_norm": 1.3715806114322515, + "learning_rate": 1.0117435138606054e-05, + "loss": 0.861, + "step": 3174 + }, + { + "epoch": 0.51, + "grad_norm": 1.4017339106958044, + "learning_rate": 1.0112216023299615e-05, + "loss": 0.8726, + "step": 3175 + }, + { + "epoch": 0.51, + "grad_norm": 1.2811030857536037, + "learning_rate": 1.0106996877422442e-05, + "loss": 0.8857, + "step": 3176 + }, + { + "epoch": 0.51, + "grad_norm": 1.3252947516299698, + "learning_rate": 1.0101777702396367e-05, + "loss": 0.8071, + "step": 3177 + }, + { + "epoch": 0.51, + "grad_norm": 1.4938043702178325, + "learning_rate": 1.0096558499643239e-05, + "loss": 0.8945, + "step": 3178 + }, + { + "epoch": 0.51, + "grad_norm": 1.3124407559034774, + "learning_rate": 1.0091339270584917e-05, + "loss": 0.9057, + "step": 3179 + }, + { + "epoch": 0.51, + "grad_norm": 1.3544737306477355, + "learning_rate": 1.008612001664326e-05, + "loss": 0.8952, + "step": 3180 + }, + { + "epoch": 0.51, + "grad_norm": 1.6120500993195108, + "learning_rate": 1.0080900739240136e-05, + "loss": 0.8759, + "step": 3181 + }, + { + "epoch": 0.51, + "grad_norm": 1.6284267917148112, + "learning_rate": 1.007568143979742e-05, + "loss": 0.8224, + "step": 3182 + }, + { + "epoch": 0.51, + "grad_norm": 1.5481018417614492, + "learning_rate": 1.0070462119736993e-05, + "loss": 0.7742, + "step": 3183 + }, + { + "epoch": 0.51, + "grad_norm": 1.3091158252467596, + "learning_rate": 1.0065242780480742e-05, + "loss": 0.8715, + "step": 3184 + }, + { + "epoch": 0.51, + "grad_norm": 1.3840233958393622, + "learning_rate": 1.0060023423450562e-05, + "loss": 0.8413, + "step": 3185 + }, + { + "epoch": 0.51, + "grad_norm": 1.2993453401413937, + "learning_rate": 1.0054804050068343e-05, + "loss": 0.8938, + "step": 3186 + }, + { + "epoch": 0.51, + "grad_norm": 1.4014582454857538, + "learning_rate": 1.0049584661755993e-05, + "loss": 0.9122, + "step": 3187 + }, + { + "epoch": 0.51, + "grad_norm": 1.3691916755540423, + "learning_rate": 1.0044365259935413e-05, + "loss": 0.8346, + "step": 3188 + }, + { + "epoch": 0.51, + "grad_norm": 1.5906937041227802, + "learning_rate": 1.0039145846028515e-05, + "loss": 0.8847, + "step": 3189 + }, + { + "epoch": 0.51, + "grad_norm": 1.4258281974379026, + "learning_rate": 1.0033926421457208e-05, + "loss": 0.8351, + "step": 3190 + }, + { + "epoch": 0.51, + "grad_norm": 1.5022129440543395, + "learning_rate": 1.002870698764341e-05, + "loss": 0.7858, + "step": 3191 + }, + { + "epoch": 0.51, + "grad_norm": 1.2733292248395236, + "learning_rate": 1.0023487546009036e-05, + "loss": 0.8423, + "step": 3192 + }, + { + "epoch": 0.51, + "grad_norm": 1.3213549433296046, + "learning_rate": 1.0018268097976007e-05, + "loss": 0.8121, + "step": 3193 + }, + { + "epoch": 0.51, + "grad_norm": 1.4159834154346203, + "learning_rate": 1.0013048644966246e-05, + "loss": 0.8386, + "step": 3194 + }, + { + "epoch": 0.51, + "grad_norm": 1.4920112720501866, + "learning_rate": 1.0007829188401673e-05, + "loss": 0.8227, + "step": 3195 + }, + { + "epoch": 0.51, + "grad_norm": 1.4170278371195022, + "learning_rate": 1.0002609729704213e-05, + "loss": 0.9145, + "step": 3196 + }, + { + "epoch": 0.52, + "grad_norm": 1.6737666419231418, + "learning_rate": 9.997390270295792e-06, + "loss": 0.8728, + "step": 3197 + }, + { + "epoch": 0.52, + "grad_norm": 1.5829280813189386, + "learning_rate": 9.992170811598332e-06, + "loss": 0.8802, + "step": 3198 + }, + { + "epoch": 0.52, + "grad_norm": 1.93074717171541, + "learning_rate": 9.986951355033755e-06, + "loss": 0.9011, + "step": 3199 + }, + { + "epoch": 0.52, + "grad_norm": 1.414408144937566, + "learning_rate": 9.981731902023998e-06, + "loss": 0.8614, + "step": 3200 + }, + { + "epoch": 0.52, + "grad_norm": 1.3214095022706858, + "learning_rate": 9.976512453990967e-06, + "loss": 0.8053, + "step": 3201 + }, + { + "epoch": 0.52, + "grad_norm": 1.279115182772325, + "learning_rate": 9.971293012356593e-06, + "loss": 0.8094, + "step": 3202 + }, + { + "epoch": 0.52, + "grad_norm": 1.3226841759986698, + "learning_rate": 9.966073578542795e-06, + "loss": 0.8266, + "step": 3203 + }, + { + "epoch": 0.52, + "grad_norm": 1.2711641476789992, + "learning_rate": 9.960854153971488e-06, + "loss": 0.8602, + "step": 3204 + }, + { + "epoch": 0.52, + "grad_norm": 1.26318410505499, + "learning_rate": 9.955634740064588e-06, + "loss": 0.8106, + "step": 3205 + }, + { + "epoch": 0.52, + "grad_norm": 1.3253027282787253, + "learning_rate": 9.95041533824401e-06, + "loss": 0.8908, + "step": 3206 + }, + { + "epoch": 0.52, + "grad_norm": 1.4176140941351614, + "learning_rate": 9.94519594993166e-06, + "loss": 0.8178, + "step": 3207 + }, + { + "epoch": 0.52, + "grad_norm": 1.3476219552970083, + "learning_rate": 9.939976576549441e-06, + "loss": 0.8324, + "step": 3208 + }, + { + "epoch": 0.52, + "grad_norm": 1.2259315580913956, + "learning_rate": 9.934757219519258e-06, + "loss": 0.8626, + "step": 3209 + }, + { + "epoch": 0.52, + "grad_norm": 1.451424786514718, + "learning_rate": 9.92953788026301e-06, + "loss": 0.874, + "step": 3210 + }, + { + "epoch": 0.52, + "grad_norm": 1.129632753390078, + "learning_rate": 9.924318560202584e-06, + "loss": 0.3413, + "step": 3211 + }, + { + "epoch": 0.52, + "grad_norm": 1.7329740719416427, + "learning_rate": 9.919099260759867e-06, + "loss": 0.8547, + "step": 3212 + }, + { + "epoch": 0.52, + "grad_norm": 1.39038526653231, + "learning_rate": 9.913879983356745e-06, + "loss": 0.8298, + "step": 3213 + }, + { + "epoch": 0.52, + "grad_norm": 0.8740425097126813, + "learning_rate": 9.908660729415087e-06, + "loss": 0.3335, + "step": 3214 + }, + { + "epoch": 0.52, + "grad_norm": 1.2656311716411959, + "learning_rate": 9.903441500356761e-06, + "loss": 0.8184, + "step": 3215 + }, + { + "epoch": 0.52, + "grad_norm": 1.2894256860447826, + "learning_rate": 9.898222297603638e-06, + "loss": 0.8612, + "step": 3216 + }, + { + "epoch": 0.52, + "grad_norm": 1.436470852517631, + "learning_rate": 9.893003122577563e-06, + "loss": 0.8561, + "step": 3217 + }, + { + "epoch": 0.52, + "grad_norm": 1.2168111019619272, + "learning_rate": 9.887783976700387e-06, + "loss": 0.9145, + "step": 3218 + }, + { + "epoch": 0.52, + "grad_norm": 1.415597640203667, + "learning_rate": 9.882564861393951e-06, + "loss": 0.9051, + "step": 3219 + }, + { + "epoch": 0.52, + "grad_norm": 1.2835811257104337, + "learning_rate": 9.877345778080082e-06, + "loss": 0.802, + "step": 3220 + }, + { + "epoch": 0.52, + "grad_norm": 1.3501525481448895, + "learning_rate": 9.872126728180604e-06, + "loss": 0.8079, + "step": 3221 + }, + { + "epoch": 0.52, + "grad_norm": 1.2296740796569845, + "learning_rate": 9.866907713117333e-06, + "loss": 0.9071, + "step": 3222 + }, + { + "epoch": 0.52, + "grad_norm": 1.387880318018517, + "learning_rate": 9.861688734312064e-06, + "loss": 0.8981, + "step": 3223 + }, + { + "epoch": 0.52, + "grad_norm": 0.8682626382927793, + "learning_rate": 9.8564697931866e-06, + "loss": 0.3573, + "step": 3224 + }, + { + "epoch": 0.52, + "grad_norm": 0.8786518491679626, + "learning_rate": 9.851250891162722e-06, + "loss": 0.3454, + "step": 3225 + }, + { + "epoch": 0.52, + "grad_norm": 1.57749481590159, + "learning_rate": 9.846032029662199e-06, + "loss": 0.8204, + "step": 3226 + }, + { + "epoch": 0.52, + "grad_norm": 1.4232237112521977, + "learning_rate": 9.840813210106795e-06, + "loss": 0.7425, + "step": 3227 + }, + { + "epoch": 0.52, + "grad_norm": 1.6960350026446973, + "learning_rate": 9.83559443391826e-06, + "loss": 0.7671, + "step": 3228 + }, + { + "epoch": 0.52, + "grad_norm": 1.5915257082530343, + "learning_rate": 9.830375702518339e-06, + "loss": 0.7468, + "step": 3229 + }, + { + "epoch": 0.52, + "grad_norm": 1.485632043549182, + "learning_rate": 9.825157017328745e-06, + "loss": 0.8364, + "step": 3230 + }, + { + "epoch": 0.52, + "grad_norm": 1.3955019829950641, + "learning_rate": 9.8199383797712e-06, + "loss": 0.9204, + "step": 3231 + }, + { + "epoch": 0.52, + "grad_norm": 0.8736183677006223, + "learning_rate": 9.81471979126741e-06, + "loss": 0.3491, + "step": 3232 + }, + { + "epoch": 0.52, + "grad_norm": 1.5361389092132198, + "learning_rate": 9.80950125323905e-06, + "loss": 0.8685, + "step": 3233 + }, + { + "epoch": 0.52, + "grad_norm": 1.248868662749267, + "learning_rate": 9.804282767107802e-06, + "loss": 0.8397, + "step": 3234 + }, + { + "epoch": 0.52, + "grad_norm": 1.497041755498462, + "learning_rate": 9.799064334295324e-06, + "loss": 0.84, + "step": 3235 + }, + { + "epoch": 0.52, + "grad_norm": 1.5284794014709355, + "learning_rate": 9.79384595622326e-06, + "loss": 0.8886, + "step": 3236 + }, + { + "epoch": 0.52, + "grad_norm": 1.5436049519912511, + "learning_rate": 9.788627634313237e-06, + "loss": 0.8126, + "step": 3237 + }, + { + "epoch": 0.52, + "grad_norm": 1.4701211013224909, + "learning_rate": 9.78340936998688e-06, + "loss": 0.8344, + "step": 3238 + }, + { + "epoch": 0.52, + "grad_norm": 1.3373669675011004, + "learning_rate": 9.778191164665774e-06, + "loss": 0.8491, + "step": 3239 + }, + { + "epoch": 0.52, + "grad_norm": 1.3913500381169384, + "learning_rate": 9.77297301977151e-06, + "loss": 0.7915, + "step": 3240 + }, + { + "epoch": 0.52, + "grad_norm": 1.2765407987784612, + "learning_rate": 9.767754936725656e-06, + "loss": 0.908, + "step": 3241 + }, + { + "epoch": 0.52, + "grad_norm": 1.4764627405304673, + "learning_rate": 9.76253691694976e-06, + "loss": 0.8667, + "step": 3242 + }, + { + "epoch": 0.52, + "grad_norm": 1.2246526356102798, + "learning_rate": 9.757318961865348e-06, + "loss": 0.7937, + "step": 3243 + }, + { + "epoch": 0.52, + "grad_norm": 1.3768426282659891, + "learning_rate": 9.752101072893947e-06, + "loss": 0.8324, + "step": 3244 + }, + { + "epoch": 0.52, + "grad_norm": 1.441890983305899, + "learning_rate": 9.746883251457043e-06, + "loss": 0.911, + "step": 3245 + }, + { + "epoch": 0.52, + "grad_norm": 1.4113821159232731, + "learning_rate": 9.741665498976116e-06, + "loss": 0.8244, + "step": 3246 + }, + { + "epoch": 0.52, + "grad_norm": 1.7333876751847688, + "learning_rate": 9.736447816872632e-06, + "loss": 0.905, + "step": 3247 + }, + { + "epoch": 0.52, + "grad_norm": 1.4742538440390462, + "learning_rate": 9.731230206568029e-06, + "loss": 0.9185, + "step": 3248 + }, + { + "epoch": 0.52, + "grad_norm": 1.3126379831761994, + "learning_rate": 9.726012669483723e-06, + "loss": 0.818, + "step": 3249 + }, + { + "epoch": 0.52, + "grad_norm": 1.6227053094655446, + "learning_rate": 9.720795207041119e-06, + "loss": 0.8947, + "step": 3250 + }, + { + "epoch": 0.52, + "grad_norm": 1.397636898326866, + "learning_rate": 9.7155778206616e-06, + "loss": 0.7957, + "step": 3251 + }, + { + "epoch": 0.52, + "grad_norm": 1.2708570482693884, + "learning_rate": 9.710360511766517e-06, + "loss": 0.8147, + "step": 3252 + }, + { + "epoch": 0.52, + "grad_norm": 1.3150074407837038, + "learning_rate": 9.705143281777218e-06, + "loss": 0.8171, + "step": 3253 + }, + { + "epoch": 0.52, + "grad_norm": 1.2447325295569098, + "learning_rate": 9.69992613211502e-06, + "loss": 0.8669, + "step": 3254 + }, + { + "epoch": 0.52, + "grad_norm": 1.3215230627426713, + "learning_rate": 9.694709064201211e-06, + "loss": 0.9005, + "step": 3255 + }, + { + "epoch": 0.52, + "grad_norm": 1.231424330730008, + "learning_rate": 9.68949207945707e-06, + "loss": 0.869, + "step": 3256 + }, + { + "epoch": 0.52, + "grad_norm": 1.3507630472738346, + "learning_rate": 9.684275179303846e-06, + "loss": 0.8506, + "step": 3257 + }, + { + "epoch": 0.52, + "grad_norm": 1.3667070327922466, + "learning_rate": 9.679058365162765e-06, + "loss": 0.8005, + "step": 3258 + }, + { + "epoch": 0.53, + "grad_norm": 1.3234353216630736, + "learning_rate": 9.673841638455029e-06, + "loss": 0.8896, + "step": 3259 + }, + { + "epoch": 0.53, + "grad_norm": 1.267091078564698, + "learning_rate": 9.66862500060183e-06, + "loss": 0.7921, + "step": 3260 + }, + { + "epoch": 0.53, + "grad_norm": 1.4947600691905483, + "learning_rate": 9.663408453024306e-06, + "loss": 0.8229, + "step": 3261 + }, + { + "epoch": 0.53, + "grad_norm": 1.4072061255279407, + "learning_rate": 9.658191997143602e-06, + "loss": 0.8935, + "step": 3262 + }, + { + "epoch": 0.53, + "grad_norm": 1.2321752537233202, + "learning_rate": 9.652975634380822e-06, + "loss": 0.8934, + "step": 3263 + }, + { + "epoch": 0.53, + "grad_norm": 1.5781972362254921, + "learning_rate": 9.647759366157041e-06, + "loss": 0.8501, + "step": 3264 + }, + { + "epoch": 0.53, + "grad_norm": 1.4062687031508807, + "learning_rate": 9.642543193893318e-06, + "loss": 0.8765, + "step": 3265 + }, + { + "epoch": 0.53, + "grad_norm": 1.4509175130300187, + "learning_rate": 9.637327119010683e-06, + "loss": 0.8286, + "step": 3266 + }, + { + "epoch": 0.53, + "grad_norm": 1.3186831599456152, + "learning_rate": 9.632111142930143e-06, + "loss": 0.8702, + "step": 3267 + }, + { + "epoch": 0.53, + "grad_norm": 1.3842937185943751, + "learning_rate": 9.626895267072662e-06, + "loss": 0.845, + "step": 3268 + }, + { + "epoch": 0.53, + "grad_norm": 0.7024665259244136, + "learning_rate": 9.621679492859196e-06, + "loss": 0.3666, + "step": 3269 + }, + { + "epoch": 0.53, + "grad_norm": 1.4368949917907894, + "learning_rate": 9.61646382171067e-06, + "loss": 0.8622, + "step": 3270 + }, + { + "epoch": 0.53, + "grad_norm": 1.345945094989235, + "learning_rate": 9.611248255047965e-06, + "loss": 0.8835, + "step": 3271 + }, + { + "epoch": 0.53, + "grad_norm": 1.3956786545648319, + "learning_rate": 9.606032794291953e-06, + "loss": 0.8034, + "step": 3272 + }, + { + "epoch": 0.53, + "grad_norm": 1.3842278321980457, + "learning_rate": 9.60081744086347e-06, + "loss": 0.7945, + "step": 3273 + }, + { + "epoch": 0.53, + "grad_norm": 1.1970881444119787, + "learning_rate": 9.595602196183317e-06, + "loss": 0.8815, + "step": 3274 + }, + { + "epoch": 0.53, + "grad_norm": 1.2455472054008807, + "learning_rate": 9.590387061672272e-06, + "loss": 0.8893, + "step": 3275 + }, + { + "epoch": 0.53, + "grad_norm": 1.246811117660622, + "learning_rate": 9.585172038751086e-06, + "loss": 0.8052, + "step": 3276 + }, + { + "epoch": 0.53, + "grad_norm": 1.3471413596257102, + "learning_rate": 9.57995712884047e-06, + "loss": 0.7828, + "step": 3277 + }, + { + "epoch": 0.53, + "grad_norm": 1.7177921839564563, + "learning_rate": 9.57474233336111e-06, + "loss": 0.8415, + "step": 3278 + }, + { + "epoch": 0.53, + "grad_norm": 1.3380361617071423, + "learning_rate": 9.569527653733662e-06, + "loss": 0.8182, + "step": 3279 + }, + { + "epoch": 0.53, + "grad_norm": 1.302063352352058, + "learning_rate": 9.564313091378746e-06, + "loss": 0.842, + "step": 3280 + }, + { + "epoch": 0.53, + "grad_norm": 1.405020572426469, + "learning_rate": 9.559098647716953e-06, + "loss": 0.8852, + "step": 3281 + }, + { + "epoch": 0.53, + "grad_norm": 1.259735057519102, + "learning_rate": 9.553884324168846e-06, + "loss": 0.7663, + "step": 3282 + }, + { + "epoch": 0.53, + "grad_norm": 1.2974090539382952, + "learning_rate": 9.54867012215494e-06, + "loss": 0.8928, + "step": 3283 + }, + { + "epoch": 0.53, + "grad_norm": 2.24188124661922, + "learning_rate": 9.543456043095736e-06, + "loss": 0.845, + "step": 3284 + }, + { + "epoch": 0.53, + "grad_norm": 1.4228821792397963, + "learning_rate": 9.538242088411694e-06, + "loss": 0.8464, + "step": 3285 + }, + { + "epoch": 0.53, + "grad_norm": 1.3606431413064488, + "learning_rate": 9.533028259523233e-06, + "loss": 0.8039, + "step": 3286 + }, + { + "epoch": 0.53, + "grad_norm": 1.3872950417549692, + "learning_rate": 9.527814557850744e-06, + "loss": 0.8638, + "step": 3287 + }, + { + "epoch": 0.53, + "grad_norm": 1.3396394673976084, + "learning_rate": 9.522600984814587e-06, + "loss": 0.8705, + "step": 3288 + }, + { + "epoch": 0.53, + "grad_norm": 1.1874340624917628, + "learning_rate": 9.517387541835087e-06, + "loss": 0.8346, + "step": 3289 + }, + { + "epoch": 0.53, + "grad_norm": 1.3985226041855046, + "learning_rate": 9.512174230332515e-06, + "loss": 0.8497, + "step": 3290 + }, + { + "epoch": 0.53, + "grad_norm": 1.3690590874993591, + "learning_rate": 9.506961051727132e-06, + "loss": 0.8275, + "step": 3291 + }, + { + "epoch": 0.53, + "grad_norm": 1.3371064616792028, + "learning_rate": 9.501748007439154e-06, + "loss": 0.8681, + "step": 3292 + }, + { + "epoch": 0.53, + "grad_norm": 1.4895092273553918, + "learning_rate": 9.496535098888749e-06, + "loss": 0.8545, + "step": 3293 + }, + { + "epoch": 0.53, + "grad_norm": 1.286697821527991, + "learning_rate": 9.491322327496062e-06, + "loss": 0.8229, + "step": 3294 + }, + { + "epoch": 0.53, + "grad_norm": 1.7047885919870527, + "learning_rate": 9.486109694681195e-06, + "loss": 0.8583, + "step": 3295 + }, + { + "epoch": 0.53, + "grad_norm": 1.4133797757255386, + "learning_rate": 9.480897201864214e-06, + "loss": 0.7621, + "step": 3296 + }, + { + "epoch": 0.53, + "grad_norm": 1.302556461127079, + "learning_rate": 9.47568485046514e-06, + "loss": 0.887, + "step": 3297 + }, + { + "epoch": 0.53, + "grad_norm": 1.2363819973855787, + "learning_rate": 9.470472641903972e-06, + "loss": 0.836, + "step": 3298 + }, + { + "epoch": 0.53, + "grad_norm": 1.2457326430085816, + "learning_rate": 9.465260577600645e-06, + "loss": 0.8685, + "step": 3299 + }, + { + "epoch": 0.53, + "grad_norm": 1.3753553430532974, + "learning_rate": 9.460048658975082e-06, + "loss": 0.8869, + "step": 3300 + }, + { + "epoch": 0.53, + "grad_norm": 1.5568293585092103, + "learning_rate": 9.454836887447149e-06, + "loss": 0.9146, + "step": 3301 + }, + { + "epoch": 0.53, + "grad_norm": 1.9679666599415533, + "learning_rate": 9.449625264436673e-06, + "loss": 0.8247, + "step": 3302 + }, + { + "epoch": 0.53, + "grad_norm": 1.7636427787163398, + "learning_rate": 9.444413791363446e-06, + "loss": 0.8434, + "step": 3303 + }, + { + "epoch": 0.53, + "grad_norm": 1.2428006077634597, + "learning_rate": 9.43920246964722e-06, + "loss": 0.8759, + "step": 3304 + }, + { + "epoch": 0.53, + "grad_norm": 1.366094269314453, + "learning_rate": 9.433991300707694e-06, + "loss": 0.908, + "step": 3305 + }, + { + "epoch": 0.53, + "grad_norm": 1.1758315702930813, + "learning_rate": 9.428780285964544e-06, + "loss": 0.8394, + "step": 3306 + }, + { + "epoch": 0.53, + "grad_norm": 1.33915486017233, + "learning_rate": 9.423569426837387e-06, + "loss": 0.9365, + "step": 3307 + }, + { + "epoch": 0.53, + "grad_norm": 1.4125424402958784, + "learning_rate": 9.418358724745813e-06, + "loss": 0.8071, + "step": 3308 + }, + { + "epoch": 0.53, + "grad_norm": 1.2092331035734156, + "learning_rate": 9.413148181109352e-06, + "loss": 0.8836, + "step": 3309 + }, + { + "epoch": 0.53, + "grad_norm": 1.458978251427767, + "learning_rate": 9.4079377973475e-06, + "loss": 0.8039, + "step": 3310 + }, + { + "epoch": 0.53, + "grad_norm": 1.2949103913006559, + "learning_rate": 9.40272757487972e-06, + "loss": 0.9294, + "step": 3311 + }, + { + "epoch": 0.53, + "grad_norm": 1.6459365410726812, + "learning_rate": 9.397517515125405e-06, + "loss": 0.8069, + "step": 3312 + }, + { + "epoch": 0.53, + "grad_norm": 1.1752430513381584, + "learning_rate": 9.392307619503928e-06, + "loss": 0.781, + "step": 3313 + }, + { + "epoch": 0.53, + "grad_norm": 0.9929648249208682, + "learning_rate": 9.38709788943461e-06, + "loss": 0.3537, + "step": 3314 + }, + { + "epoch": 0.53, + "grad_norm": 1.1559951024426096, + "learning_rate": 9.38188832633672e-06, + "loss": 0.8594, + "step": 3315 + }, + { + "epoch": 0.53, + "grad_norm": 1.4160601483941857, + "learning_rate": 9.376678931629488e-06, + "loss": 0.8624, + "step": 3316 + }, + { + "epoch": 0.53, + "grad_norm": 1.4955144363182897, + "learning_rate": 9.371469706732097e-06, + "loss": 0.8767, + "step": 3317 + }, + { + "epoch": 0.53, + "grad_norm": 1.2999167934941769, + "learning_rate": 9.366260653063684e-06, + "loss": 0.8151, + "step": 3318 + }, + { + "epoch": 0.53, + "grad_norm": 1.7342737137155968, + "learning_rate": 9.361051772043335e-06, + "loss": 0.8603, + "step": 3319 + }, + { + "epoch": 0.53, + "grad_norm": 1.2798142107853252, + "learning_rate": 9.355843065090103e-06, + "loss": 0.8476, + "step": 3320 + }, + { + "epoch": 0.54, + "grad_norm": 1.2164646593732578, + "learning_rate": 9.35063453362297e-06, + "loss": 0.8029, + "step": 3321 + }, + { + "epoch": 0.54, + "grad_norm": 1.593603336023194, + "learning_rate": 9.34542617906089e-06, + "loss": 0.8905, + "step": 3322 + }, + { + "epoch": 0.54, + "grad_norm": 1.3817291548754649, + "learning_rate": 9.340218002822766e-06, + "loss": 0.8649, + "step": 3323 + }, + { + "epoch": 0.54, + "grad_norm": 1.2888652484426582, + "learning_rate": 9.335010006327443e-06, + "loss": 0.887, + "step": 3324 + }, + { + "epoch": 0.54, + "grad_norm": 1.221618575344612, + "learning_rate": 9.329802190993726e-06, + "loss": 0.8655, + "step": 3325 + }, + { + "epoch": 0.54, + "grad_norm": 1.447332046698887, + "learning_rate": 9.324594558240361e-06, + "loss": 0.8638, + "step": 3326 + }, + { + "epoch": 0.54, + "grad_norm": 1.1928868560124382, + "learning_rate": 9.319387109486065e-06, + "loss": 0.8639, + "step": 3327 + }, + { + "epoch": 0.54, + "grad_norm": 1.4015089554802516, + "learning_rate": 9.314179846149474e-06, + "loss": 0.8247, + "step": 3328 + }, + { + "epoch": 0.54, + "grad_norm": 1.4683099065801253, + "learning_rate": 9.3089727696492e-06, + "loss": 0.7507, + "step": 3329 + }, + { + "epoch": 0.54, + "grad_norm": 1.3350791048660984, + "learning_rate": 9.303765881403794e-06, + "loss": 0.7957, + "step": 3330 + }, + { + "epoch": 0.54, + "grad_norm": 1.3386527055915662, + "learning_rate": 9.298559182831752e-06, + "loss": 0.8944, + "step": 3331 + }, + { + "epoch": 0.54, + "grad_norm": 1.2582893112009188, + "learning_rate": 9.293352675351524e-06, + "loss": 0.8958, + "step": 3332 + }, + { + "epoch": 0.54, + "grad_norm": 1.0103044464980298, + "learning_rate": 9.288146360381507e-06, + "loss": 0.3316, + "step": 3333 + }, + { + "epoch": 0.54, + "grad_norm": 1.4801365446923147, + "learning_rate": 9.28294023934004e-06, + "loss": 0.8254, + "step": 3334 + }, + { + "epoch": 0.54, + "grad_norm": 1.6335760982185497, + "learning_rate": 9.277734313645422e-06, + "loss": 0.7876, + "step": 3335 + }, + { + "epoch": 0.54, + "grad_norm": 1.4513177105310808, + "learning_rate": 9.272528584715886e-06, + "loss": 0.8784, + "step": 3336 + }, + { + "epoch": 0.54, + "grad_norm": 1.4233294602762923, + "learning_rate": 9.267323053969615e-06, + "loss": 0.9514, + "step": 3337 + }, + { + "epoch": 0.54, + "grad_norm": 1.438712034744723, + "learning_rate": 9.26211772282474e-06, + "loss": 0.8255, + "step": 3338 + }, + { + "epoch": 0.54, + "grad_norm": 1.1731702949554745, + "learning_rate": 9.256912592699339e-06, + "loss": 0.8511, + "step": 3339 + }, + { + "epoch": 0.54, + "grad_norm": 1.3856830570866687, + "learning_rate": 9.251707665011429e-06, + "loss": 0.844, + "step": 3340 + }, + { + "epoch": 0.54, + "grad_norm": 1.3257599350250424, + "learning_rate": 9.246502941178976e-06, + "loss": 0.8203, + "step": 3341 + }, + { + "epoch": 0.54, + "grad_norm": 1.3296645350495884, + "learning_rate": 9.2412984226199e-06, + "loss": 0.8484, + "step": 3342 + }, + { + "epoch": 0.54, + "grad_norm": 1.3817864978694165, + "learning_rate": 9.236094110752038e-06, + "loss": 0.8188, + "step": 3343 + }, + { + "epoch": 0.54, + "grad_norm": 1.1829318203702317, + "learning_rate": 9.230890006993203e-06, + "loss": 0.7452, + "step": 3344 + }, + { + "epoch": 0.54, + "grad_norm": 1.3846450478768437, + "learning_rate": 9.225686112761128e-06, + "loss": 0.8478, + "step": 3345 + }, + { + "epoch": 0.54, + "grad_norm": 1.6448235104840225, + "learning_rate": 9.220482429473502e-06, + "loss": 0.7519, + "step": 3346 + }, + { + "epoch": 0.54, + "grad_norm": 1.8019647195171369, + "learning_rate": 9.215278958547951e-06, + "loss": 0.8446, + "step": 3347 + }, + { + "epoch": 0.54, + "grad_norm": 1.400620540063252, + "learning_rate": 9.210075701402037e-06, + "loss": 0.8517, + "step": 3348 + }, + { + "epoch": 0.54, + "grad_norm": 1.3389476630901973, + "learning_rate": 9.204872659453285e-06, + "loss": 0.8865, + "step": 3349 + }, + { + "epoch": 0.54, + "grad_norm": 1.177703387294026, + "learning_rate": 9.19966983411913e-06, + "loss": 0.8552, + "step": 3350 + }, + { + "epoch": 0.54, + "grad_norm": 1.4277524627343439, + "learning_rate": 9.194467226816976e-06, + "loss": 0.8006, + "step": 3351 + }, + { + "epoch": 0.54, + "grad_norm": 1.2796502211401775, + "learning_rate": 9.189264838964159e-06, + "loss": 0.8755, + "step": 3352 + }, + { + "epoch": 0.54, + "grad_norm": 1.285876972064867, + "learning_rate": 9.184062671977942e-06, + "loss": 0.8977, + "step": 3353 + }, + { + "epoch": 0.54, + "grad_norm": 1.3819258050710548, + "learning_rate": 9.178860727275546e-06, + "loss": 0.8324, + "step": 3354 + }, + { + "epoch": 0.54, + "grad_norm": 1.3038578165017511, + "learning_rate": 9.173659006274125e-06, + "loss": 0.8412, + "step": 3355 + }, + { + "epoch": 0.54, + "grad_norm": 1.4912254186650333, + "learning_rate": 9.168457510390764e-06, + "loss": 0.8058, + "step": 3356 + }, + { + "epoch": 0.54, + "grad_norm": 1.298473348421068, + "learning_rate": 9.163256241042502e-06, + "loss": 0.8442, + "step": 3357 + }, + { + "epoch": 0.54, + "grad_norm": 1.4440505382225683, + "learning_rate": 9.158055199646306e-06, + "loss": 0.8951, + "step": 3358 + }, + { + "epoch": 0.54, + "grad_norm": 1.403539087713613, + "learning_rate": 9.152854387619081e-06, + "loss": 0.8092, + "step": 3359 + }, + { + "epoch": 0.54, + "grad_norm": 1.2930896088853676, + "learning_rate": 9.147653806377672e-06, + "loss": 0.8721, + "step": 3360 + }, + { + "epoch": 0.54, + "grad_norm": 0.7208774789943176, + "learning_rate": 9.142453457338864e-06, + "loss": 0.3398, + "step": 3361 + }, + { + "epoch": 0.54, + "grad_norm": 1.4569638680276722, + "learning_rate": 9.137253341919369e-06, + "loss": 0.8183, + "step": 3362 + }, + { + "epoch": 0.54, + "grad_norm": 1.341705779352439, + "learning_rate": 9.132053461535844e-06, + "loss": 0.8964, + "step": 3363 + }, + { + "epoch": 0.54, + "grad_norm": 1.3918776147995133, + "learning_rate": 9.126853817604887e-06, + "loss": 0.9255, + "step": 3364 + }, + { + "epoch": 0.54, + "grad_norm": 1.362497436396348, + "learning_rate": 9.121654411543013e-06, + "loss": 0.8283, + "step": 3365 + }, + { + "epoch": 0.54, + "grad_norm": 1.669726853387932, + "learning_rate": 9.11645524476669e-06, + "loss": 0.8362, + "step": 3366 + }, + { + "epoch": 0.54, + "grad_norm": 1.317623392170619, + "learning_rate": 9.111256318692316e-06, + "loss": 0.8533, + "step": 3367 + }, + { + "epoch": 0.54, + "grad_norm": 1.4399511816211037, + "learning_rate": 9.10605763473622e-06, + "loss": 0.8598, + "step": 3368 + }, + { + "epoch": 0.54, + "grad_norm": 1.3776198389559628, + "learning_rate": 9.100859194314661e-06, + "loss": 0.859, + "step": 3369 + }, + { + "epoch": 0.54, + "grad_norm": 1.3374845870134853, + "learning_rate": 9.095660998843842e-06, + "loss": 0.7726, + "step": 3370 + }, + { + "epoch": 0.54, + "grad_norm": 1.4050668938426265, + "learning_rate": 9.090463049739903e-06, + "loss": 0.8178, + "step": 3371 + }, + { + "epoch": 0.54, + "grad_norm": 1.2354048619438749, + "learning_rate": 9.085265348418894e-06, + "loss": 0.803, + "step": 3372 + }, + { + "epoch": 0.54, + "grad_norm": 1.4202285196741902, + "learning_rate": 9.08006789629682e-06, + "loss": 0.7299, + "step": 3373 + }, + { + "epoch": 0.54, + "grad_norm": 1.291248405869618, + "learning_rate": 9.074870694789613e-06, + "loss": 0.8047, + "step": 3374 + }, + { + "epoch": 0.54, + "grad_norm": 1.309811337370424, + "learning_rate": 9.069673745313127e-06, + "loss": 0.817, + "step": 3375 + }, + { + "epoch": 0.54, + "grad_norm": 1.2550397976969294, + "learning_rate": 9.064477049283157e-06, + "loss": 0.7978, + "step": 3376 + }, + { + "epoch": 0.54, + "grad_norm": 1.8764698714408057, + "learning_rate": 9.059280608115427e-06, + "loss": 0.8226, + "step": 3377 + }, + { + "epoch": 0.54, + "grad_norm": 1.278061927344398, + "learning_rate": 9.054084423225592e-06, + "loss": 0.8699, + "step": 3378 + }, + { + "epoch": 0.54, + "grad_norm": 1.5728104985929459, + "learning_rate": 9.04888849602923e-06, + "loss": 0.8941, + "step": 3379 + }, + { + "epoch": 0.54, + "grad_norm": 1.4322838587259112, + "learning_rate": 9.043692827941864e-06, + "loss": 0.8641, + "step": 3380 + }, + { + "epoch": 0.54, + "grad_norm": 1.4555162701422832, + "learning_rate": 9.03849742037893e-06, + "loss": 0.9123, + "step": 3381 + }, + { + "epoch": 0.54, + "grad_norm": 0.977047434567082, + "learning_rate": 9.033302274755803e-06, + "loss": 0.3407, + "step": 3382 + }, + { + "epoch": 0.55, + "grad_norm": 1.174017810728722, + "learning_rate": 9.028107392487787e-06, + "loss": 0.8131, + "step": 3383 + }, + { + "epoch": 0.55, + "grad_norm": 1.280065650576498, + "learning_rate": 9.022912774990104e-06, + "loss": 0.8552, + "step": 3384 + }, + { + "epoch": 0.55, + "grad_norm": 1.5807153998649524, + "learning_rate": 9.01771842367791e-06, + "loss": 0.8253, + "step": 3385 + }, + { + "epoch": 0.55, + "grad_norm": 1.67972839106089, + "learning_rate": 9.0125243399663e-06, + "loss": 0.8966, + "step": 3386 + }, + { + "epoch": 0.55, + "grad_norm": 1.3247030836722151, + "learning_rate": 9.007330525270282e-06, + "loss": 0.7648, + "step": 3387 + }, + { + "epoch": 0.55, + "grad_norm": 1.410290600724773, + "learning_rate": 9.002136981004787e-06, + "loss": 0.8495, + "step": 3388 + }, + { + "epoch": 0.55, + "grad_norm": 1.2745015983919232, + "learning_rate": 8.996943708584688e-06, + "loss": 0.7973, + "step": 3389 + }, + { + "epoch": 0.55, + "grad_norm": 1.2123064920434417, + "learning_rate": 8.991750709424772e-06, + "loss": 0.851, + "step": 3390 + }, + { + "epoch": 0.55, + "grad_norm": 1.1945824333649506, + "learning_rate": 8.986557984939754e-06, + "loss": 0.9181, + "step": 3391 + }, + { + "epoch": 0.55, + "grad_norm": 1.343760625525537, + "learning_rate": 8.981365536544275e-06, + "loss": 0.8441, + "step": 3392 + }, + { + "epoch": 0.55, + "grad_norm": 1.4729414867280886, + "learning_rate": 8.976173365652909e-06, + "loss": 0.9232, + "step": 3393 + }, + { + "epoch": 0.55, + "grad_norm": 1.307193615565311, + "learning_rate": 8.970981473680132e-06, + "loss": 0.841, + "step": 3394 + }, + { + "epoch": 0.55, + "grad_norm": 0.8529591022657905, + "learning_rate": 8.96578986204037e-06, + "loss": 0.3418, + "step": 3395 + }, + { + "epoch": 0.55, + "grad_norm": 1.315675205634184, + "learning_rate": 8.96059853214796e-06, + "loss": 0.919, + "step": 3396 + }, + { + "epoch": 0.55, + "grad_norm": 1.343064447946701, + "learning_rate": 8.955407485417158e-06, + "loss": 0.8906, + "step": 3397 + }, + { + "epoch": 0.55, + "grad_norm": 1.6882801410498414, + "learning_rate": 8.950216723262152e-06, + "loss": 0.8458, + "step": 3398 + }, + { + "epoch": 0.55, + "grad_norm": 1.1444624540008776, + "learning_rate": 8.94502624709705e-06, + "loss": 0.7972, + "step": 3399 + }, + { + "epoch": 0.55, + "grad_norm": 1.3552666556065922, + "learning_rate": 8.939836058335878e-06, + "loss": 0.8917, + "step": 3400 + }, + { + "epoch": 0.55, + "grad_norm": 1.2170402232237352, + "learning_rate": 8.934646158392584e-06, + "loss": 0.8539, + "step": 3401 + }, + { + "epoch": 0.55, + "grad_norm": 1.3320480105109125, + "learning_rate": 8.92945654868105e-06, + "loss": 0.7902, + "step": 3402 + }, + { + "epoch": 0.55, + "grad_norm": 1.4974063389174643, + "learning_rate": 8.924267230615054e-06, + "loss": 0.8757, + "step": 3403 + }, + { + "epoch": 0.55, + "grad_norm": 1.3847964556426955, + "learning_rate": 8.91907820560832e-06, + "loss": 0.8244, + "step": 3404 + }, + { + "epoch": 0.55, + "grad_norm": 1.6634492785240853, + "learning_rate": 8.913889475074479e-06, + "loss": 0.855, + "step": 3405 + }, + { + "epoch": 0.55, + "grad_norm": 1.2321780094987667, + "learning_rate": 8.908701040427086e-06, + "loss": 0.8548, + "step": 3406 + }, + { + "epoch": 0.55, + "grad_norm": 0.8889174803166736, + "learning_rate": 8.903512903079608e-06, + "loss": 0.3522, + "step": 3407 + }, + { + "epoch": 0.55, + "grad_norm": 1.3483419339438767, + "learning_rate": 8.89832506444544e-06, + "loss": 0.7946, + "step": 3408 + }, + { + "epoch": 0.55, + "grad_norm": 1.492575287984684, + "learning_rate": 8.893137525937894e-06, + "loss": 0.8886, + "step": 3409 + }, + { + "epoch": 0.55, + "grad_norm": 1.4546793452038402, + "learning_rate": 8.887950288970198e-06, + "loss": 0.8573, + "step": 3410 + }, + { + "epoch": 0.55, + "grad_norm": 1.6755418484195712, + "learning_rate": 8.882763354955495e-06, + "loss": 0.8698, + "step": 3411 + }, + { + "epoch": 0.55, + "grad_norm": 1.404483284734382, + "learning_rate": 8.877576725306853e-06, + "loss": 0.8878, + "step": 3412 + }, + { + "epoch": 0.55, + "grad_norm": 1.3739944867064433, + "learning_rate": 8.87239040143725e-06, + "loss": 0.8147, + "step": 3413 + }, + { + "epoch": 0.55, + "grad_norm": 1.4173808389697002, + "learning_rate": 8.86720438475958e-06, + "loss": 0.8596, + "step": 3414 + }, + { + "epoch": 0.55, + "grad_norm": 1.409818225063227, + "learning_rate": 8.86201867668667e-06, + "loss": 0.8626, + "step": 3415 + }, + { + "epoch": 0.55, + "grad_norm": 1.3166725401568788, + "learning_rate": 8.856833278631232e-06, + "loss": 0.8806, + "step": 3416 + }, + { + "epoch": 0.55, + "grad_norm": 1.3113517627568532, + "learning_rate": 8.851648192005925e-06, + "loss": 0.8049, + "step": 3417 + }, + { + "epoch": 0.55, + "grad_norm": 1.2117255915024996, + "learning_rate": 8.846463418223307e-06, + "loss": 0.8432, + "step": 3418 + }, + { + "epoch": 0.55, + "grad_norm": 1.3200814680546045, + "learning_rate": 8.841278958695848e-06, + "loss": 0.8171, + "step": 3419 + }, + { + "epoch": 0.55, + "grad_norm": 1.3658590691897836, + "learning_rate": 8.836094814835941e-06, + "loss": 0.8924, + "step": 3420 + }, + { + "epoch": 0.55, + "grad_norm": 1.446069199066689, + "learning_rate": 8.830910988055892e-06, + "loss": 0.8487, + "step": 3421 + }, + { + "epoch": 0.55, + "grad_norm": 1.5483252453363934, + "learning_rate": 8.825727479767916e-06, + "loss": 0.9023, + "step": 3422 + }, + { + "epoch": 0.55, + "grad_norm": 1.3674131084671775, + "learning_rate": 8.820544291384138e-06, + "loss": 0.8237, + "step": 3423 + }, + { + "epoch": 0.55, + "grad_norm": 1.4079866745582212, + "learning_rate": 8.815361424316617e-06, + "loss": 0.9318, + "step": 3424 + }, + { + "epoch": 0.55, + "grad_norm": 0.8796023489180789, + "learning_rate": 8.81017887997729e-06, + "loss": 0.3483, + "step": 3425 + }, + { + "epoch": 0.55, + "grad_norm": 1.3621521539646435, + "learning_rate": 8.804996659778036e-06, + "loss": 0.8007, + "step": 3426 + }, + { + "epoch": 0.55, + "grad_norm": 1.361617328513014, + "learning_rate": 8.799814765130631e-06, + "loss": 0.8484, + "step": 3427 + }, + { + "epoch": 0.55, + "grad_norm": 1.2074680078283484, + "learning_rate": 8.79463319744677e-06, + "loss": 0.8332, + "step": 3428 + }, + { + "epoch": 0.55, + "grad_norm": 1.6675587637128741, + "learning_rate": 8.789451958138053e-06, + "loss": 0.8813, + "step": 3429 + }, + { + "epoch": 0.55, + "grad_norm": 1.524230130943539, + "learning_rate": 8.784271048615987e-06, + "loss": 0.8003, + "step": 3430 + }, + { + "epoch": 0.55, + "grad_norm": 1.4790775334899682, + "learning_rate": 8.779090470292006e-06, + "loss": 0.8543, + "step": 3431 + }, + { + "epoch": 0.55, + "grad_norm": 1.4189167971545105, + "learning_rate": 8.773910224577428e-06, + "loss": 0.8354, + "step": 3432 + }, + { + "epoch": 0.55, + "grad_norm": 1.4887925668887927, + "learning_rate": 8.768730312883505e-06, + "loss": 0.8881, + "step": 3433 + }, + { + "epoch": 0.55, + "grad_norm": 1.475849120567027, + "learning_rate": 8.763550736621388e-06, + "loss": 0.8337, + "step": 3434 + }, + { + "epoch": 0.55, + "grad_norm": 1.6214498519438747, + "learning_rate": 8.758371497202131e-06, + "loss": 0.8568, + "step": 3435 + }, + { + "epoch": 0.55, + "grad_norm": 1.4446566595112706, + "learning_rate": 8.753192596036703e-06, + "loss": 0.9014, + "step": 3436 + }, + { + "epoch": 0.55, + "grad_norm": 1.3820789318959346, + "learning_rate": 8.748014034535983e-06, + "loss": 0.8282, + "step": 3437 + }, + { + "epoch": 0.55, + "grad_norm": 1.464002998037563, + "learning_rate": 8.742835814110746e-06, + "loss": 0.8003, + "step": 3438 + }, + { + "epoch": 0.55, + "grad_norm": 1.2255985445536814, + "learning_rate": 8.737657936171691e-06, + "loss": 0.8896, + "step": 3439 + }, + { + "epoch": 0.55, + "grad_norm": 1.5406771887737536, + "learning_rate": 8.73248040212941e-06, + "loss": 0.8551, + "step": 3440 + }, + { + "epoch": 0.55, + "grad_norm": 1.3434126896329739, + "learning_rate": 8.727303213394408e-06, + "loss": 0.8004, + "step": 3441 + }, + { + "epoch": 0.55, + "grad_norm": 1.4663454896525807, + "learning_rate": 8.722126371377091e-06, + "loss": 0.8203, + "step": 3442 + }, + { + "epoch": 0.55, + "grad_norm": 1.3687337898084877, + "learning_rate": 8.716949877487778e-06, + "loss": 0.8606, + "step": 3443 + }, + { + "epoch": 0.55, + "grad_norm": 1.2787642891846192, + "learning_rate": 8.711773733136684e-06, + "loss": 0.9052, + "step": 3444 + }, + { + "epoch": 0.56, + "grad_norm": 1.301175002767422, + "learning_rate": 8.706597939733931e-06, + "loss": 0.7972, + "step": 3445 + }, + { + "epoch": 0.56, + "grad_norm": 1.3148539698583814, + "learning_rate": 8.701422498689556e-06, + "loss": 0.8045, + "step": 3446 + }, + { + "epoch": 0.56, + "grad_norm": 1.4242140520999744, + "learning_rate": 8.69624741141349e-06, + "loss": 0.8506, + "step": 3447 + }, + { + "epoch": 0.56, + "grad_norm": 1.2213792469591873, + "learning_rate": 8.691072679315566e-06, + "loss": 0.8421, + "step": 3448 + }, + { + "epoch": 0.56, + "grad_norm": 1.3107817765505518, + "learning_rate": 8.685898303805523e-06, + "loss": 0.8007, + "step": 3449 + }, + { + "epoch": 0.56, + "grad_norm": 1.5206204643977579, + "learning_rate": 8.680724286293008e-06, + "loss": 0.8206, + "step": 3450 + }, + { + "epoch": 0.56, + "grad_norm": 1.3542190852626406, + "learning_rate": 8.675550628187562e-06, + "loss": 0.8753, + "step": 3451 + }, + { + "epoch": 0.56, + "grad_norm": 1.4947709050686147, + "learning_rate": 8.670377330898631e-06, + "loss": 0.8927, + "step": 3452 + }, + { + "epoch": 0.56, + "grad_norm": 1.3518749479535428, + "learning_rate": 8.665204395835573e-06, + "loss": 0.8229, + "step": 3453 + }, + { + "epoch": 0.56, + "grad_norm": 1.3676865102993336, + "learning_rate": 8.660031824407625e-06, + "loss": 0.7903, + "step": 3454 + }, + { + "epoch": 0.56, + "grad_norm": 1.544069456481627, + "learning_rate": 8.654859618023946e-06, + "loss": 0.9192, + "step": 3455 + }, + { + "epoch": 0.56, + "grad_norm": 1.2918752392455823, + "learning_rate": 8.64968777809359e-06, + "loss": 0.8777, + "step": 3456 + }, + { + "epoch": 0.56, + "grad_norm": 1.6232854791696851, + "learning_rate": 8.644516306025501e-06, + "loss": 0.8583, + "step": 3457 + }, + { + "epoch": 0.56, + "grad_norm": 1.3632774762863789, + "learning_rate": 8.639345203228536e-06, + "loss": 0.8936, + "step": 3458 + }, + { + "epoch": 0.56, + "grad_norm": 1.3708328409095023, + "learning_rate": 8.634174471111445e-06, + "loss": 0.8736, + "step": 3459 + }, + { + "epoch": 0.56, + "grad_norm": 1.4050686483564696, + "learning_rate": 8.629004111082875e-06, + "loss": 0.797, + "step": 3460 + }, + { + "epoch": 0.56, + "grad_norm": 1.3173011813877678, + "learning_rate": 8.623834124551375e-06, + "loss": 0.8666, + "step": 3461 + }, + { + "epoch": 0.56, + "grad_norm": 1.4803703169698292, + "learning_rate": 8.618664512925398e-06, + "loss": 0.8583, + "step": 3462 + }, + { + "epoch": 0.56, + "grad_norm": 1.4517891390610331, + "learning_rate": 8.613495277613283e-06, + "loss": 0.8668, + "step": 3463 + }, + { + "epoch": 0.56, + "grad_norm": 1.2888484589287883, + "learning_rate": 8.608326420023272e-06, + "loss": 0.9131, + "step": 3464 + }, + { + "epoch": 0.56, + "grad_norm": 1.5421020379296073, + "learning_rate": 8.603157941563506e-06, + "loss": 0.8667, + "step": 3465 + }, + { + "epoch": 0.56, + "grad_norm": 1.6044311604703163, + "learning_rate": 8.597989843642025e-06, + "loss": 0.8045, + "step": 3466 + }, + { + "epoch": 0.56, + "grad_norm": 1.3475238617540806, + "learning_rate": 8.592822127666751e-06, + "loss": 0.819, + "step": 3467 + }, + { + "epoch": 0.56, + "grad_norm": 1.3603085992386053, + "learning_rate": 8.58765479504552e-06, + "loss": 0.8988, + "step": 3468 + }, + { + "epoch": 0.56, + "grad_norm": 1.4890242177649533, + "learning_rate": 8.582487847186061e-06, + "loss": 0.8094, + "step": 3469 + }, + { + "epoch": 0.56, + "grad_norm": 1.502264165481318, + "learning_rate": 8.577321285495981e-06, + "loss": 0.9264, + "step": 3470 + }, + { + "epoch": 0.56, + "grad_norm": 1.3146033877802417, + "learning_rate": 8.572155111382799e-06, + "loss": 0.8494, + "step": 3471 + }, + { + "epoch": 0.56, + "grad_norm": 1.419184554974385, + "learning_rate": 8.566989326253924e-06, + "loss": 0.8773, + "step": 3472 + }, + { + "epoch": 0.56, + "grad_norm": 1.7184860406104847, + "learning_rate": 8.56182393151666e-06, + "loss": 0.845, + "step": 3473 + }, + { + "epoch": 0.56, + "grad_norm": 1.3167459116216893, + "learning_rate": 8.556658928578196e-06, + "loss": 0.856, + "step": 3474 + }, + { + "epoch": 0.56, + "grad_norm": 1.3286731241047276, + "learning_rate": 8.55149431884563e-06, + "loss": 0.7825, + "step": 3475 + }, + { + "epoch": 0.56, + "grad_norm": 1.5192447149487844, + "learning_rate": 8.546330103725937e-06, + "loss": 0.847, + "step": 3476 + }, + { + "epoch": 0.56, + "grad_norm": 1.3691414671172901, + "learning_rate": 8.541166284625995e-06, + "loss": 0.9192, + "step": 3477 + }, + { + "epoch": 0.56, + "grad_norm": 1.29155082833734, + "learning_rate": 8.536002862952572e-06, + "loss": 0.8558, + "step": 3478 + }, + { + "epoch": 0.56, + "grad_norm": 1.2134733631509982, + "learning_rate": 8.530839840112324e-06, + "loss": 0.8644, + "step": 3479 + }, + { + "epoch": 0.56, + "grad_norm": 1.3060670001232697, + "learning_rate": 8.525677217511799e-06, + "loss": 0.8198, + "step": 3480 + }, + { + "epoch": 0.56, + "grad_norm": 1.4957118686162425, + "learning_rate": 8.520514996557443e-06, + "loss": 0.8626, + "step": 3481 + }, + { + "epoch": 0.56, + "grad_norm": 1.4513779708200936, + "learning_rate": 8.515353178655582e-06, + "loss": 0.7608, + "step": 3482 + }, + { + "epoch": 0.56, + "grad_norm": 1.369259684905322, + "learning_rate": 8.510191765212438e-06, + "loss": 0.8428, + "step": 3483 + }, + { + "epoch": 0.56, + "grad_norm": 1.3502303387664847, + "learning_rate": 8.505030757634125e-06, + "loss": 0.846, + "step": 3484 + }, + { + "epoch": 0.56, + "grad_norm": 1.6171504652233177, + "learning_rate": 8.499870157326647e-06, + "loss": 0.8608, + "step": 3485 + }, + { + "epoch": 0.56, + "grad_norm": 1.4247616518880055, + "learning_rate": 8.494709965695885e-06, + "loss": 0.866, + "step": 3486 + }, + { + "epoch": 0.56, + "grad_norm": 1.6919014846636853, + "learning_rate": 8.489550184147621e-06, + "loss": 0.7106, + "step": 3487 + }, + { + "epoch": 0.56, + "grad_norm": 1.1573675685396114, + "learning_rate": 8.484390814087527e-06, + "loss": 0.8998, + "step": 3488 + }, + { + "epoch": 0.56, + "grad_norm": 1.3370441414586165, + "learning_rate": 8.479231856921149e-06, + "loss": 0.7731, + "step": 3489 + }, + { + "epoch": 0.56, + "grad_norm": 0.7838595308023129, + "learning_rate": 8.474073314053935e-06, + "loss": 0.379, + "step": 3490 + }, + { + "epoch": 0.56, + "grad_norm": 1.878643165676291, + "learning_rate": 8.468915186891215e-06, + "loss": 0.8244, + "step": 3491 + }, + { + "epoch": 0.56, + "grad_norm": 1.466068582584559, + "learning_rate": 8.4637574768382e-06, + "loss": 0.8915, + "step": 3492 + }, + { + "epoch": 0.56, + "grad_norm": 1.245145688198243, + "learning_rate": 8.458600185299994e-06, + "loss": 0.8539, + "step": 3493 + }, + { + "epoch": 0.56, + "grad_norm": 1.2236216747054087, + "learning_rate": 8.453443313681591e-06, + "loss": 0.8388, + "step": 3494 + }, + { + "epoch": 0.56, + "grad_norm": 0.7984914379088176, + "learning_rate": 8.448286863387858e-06, + "loss": 0.3502, + "step": 3495 + }, + { + "epoch": 0.56, + "grad_norm": 1.2871219535988674, + "learning_rate": 8.443130835823553e-06, + "loss": 0.8831, + "step": 3496 + }, + { + "epoch": 0.56, + "grad_norm": 1.4578350473236452, + "learning_rate": 8.437975232393331e-06, + "loss": 0.8541, + "step": 3497 + }, + { + "epoch": 0.56, + "grad_norm": 1.630661494990603, + "learning_rate": 8.432820054501706e-06, + "loss": 0.8386, + "step": 3498 + }, + { + "epoch": 0.56, + "grad_norm": 1.19345090861553, + "learning_rate": 8.427665303553101e-06, + "loss": 0.9186, + "step": 3499 + }, + { + "epoch": 0.56, + "grad_norm": 1.3315620739233816, + "learning_rate": 8.42251098095181e-06, + "loss": 0.8752, + "step": 3500 + }, + { + "epoch": 0.56, + "grad_norm": 1.5573633794369377, + "learning_rate": 8.41735708810201e-06, + "loss": 0.926, + "step": 3501 + }, + { + "epoch": 0.56, + "grad_norm": 1.4227882769144127, + "learning_rate": 8.412203626407765e-06, + "loss": 0.8436, + "step": 3502 + }, + { + "epoch": 0.56, + "grad_norm": 1.358640902124898, + "learning_rate": 8.407050597273024e-06, + "loss": 0.8363, + "step": 3503 + }, + { + "epoch": 0.56, + "grad_norm": 1.4649772487843364, + "learning_rate": 8.401898002101605e-06, + "loss": 0.8284, + "step": 3504 + }, + { + "epoch": 0.56, + "grad_norm": 1.5623891767300726, + "learning_rate": 8.39674584229722e-06, + "loss": 0.8587, + "step": 3505 + }, + { + "epoch": 0.56, + "grad_norm": 1.4618227152293575, + "learning_rate": 8.391594119263467e-06, + "loss": 0.8058, + "step": 3506 + }, + { + "epoch": 0.57, + "grad_norm": 1.3704500303880773, + "learning_rate": 8.386442834403811e-06, + "loss": 0.8871, + "step": 3507 + }, + { + "epoch": 0.57, + "grad_norm": 1.3659837818879494, + "learning_rate": 8.381291989121604e-06, + "loss": 0.914, + "step": 3508 + }, + { + "epoch": 0.57, + "grad_norm": 1.4738154680470426, + "learning_rate": 8.376141584820078e-06, + "loss": 0.8292, + "step": 3509 + }, + { + "epoch": 0.57, + "grad_norm": 1.4942203251145167, + "learning_rate": 8.37099162290235e-06, + "loss": 0.9181, + "step": 3510 + }, + { + "epoch": 0.57, + "grad_norm": 1.5220244413835853, + "learning_rate": 8.365842104771405e-06, + "loss": 0.7563, + "step": 3511 + }, + { + "epoch": 0.57, + "grad_norm": 1.3785797095867351, + "learning_rate": 8.360693031830114e-06, + "loss": 0.7912, + "step": 3512 + }, + { + "epoch": 0.57, + "grad_norm": 1.387485150658345, + "learning_rate": 8.355544405481233e-06, + "loss": 0.8284, + "step": 3513 + }, + { + "epoch": 0.57, + "grad_norm": 1.308382793759505, + "learning_rate": 8.350396227127383e-06, + "loss": 0.8941, + "step": 3514 + }, + { + "epoch": 0.57, + "grad_norm": 1.7194145651799102, + "learning_rate": 8.345248498171073e-06, + "loss": 0.8431, + "step": 3515 + }, + { + "epoch": 0.57, + "grad_norm": 1.4640116820681126, + "learning_rate": 8.340101220014688e-06, + "loss": 0.8107, + "step": 3516 + }, + { + "epoch": 0.57, + "grad_norm": 1.468349522666423, + "learning_rate": 8.334954394060484e-06, + "loss": 0.8209, + "step": 3517 + }, + { + "epoch": 0.57, + "grad_norm": 1.41981655829562, + "learning_rate": 8.329808021710598e-06, + "loss": 0.8364, + "step": 3518 + }, + { + "epoch": 0.57, + "grad_norm": 1.5125132507195005, + "learning_rate": 8.324662104367052e-06, + "loss": 0.8311, + "step": 3519 + }, + { + "epoch": 0.57, + "grad_norm": 1.5000362149724618, + "learning_rate": 8.319516643431723e-06, + "loss": 0.8127, + "step": 3520 + }, + { + "epoch": 0.57, + "grad_norm": 1.4911642523192152, + "learning_rate": 8.314371640306386e-06, + "loss": 0.8827, + "step": 3521 + }, + { + "epoch": 0.57, + "grad_norm": 1.2776137534712464, + "learning_rate": 8.309227096392682e-06, + "loss": 0.8137, + "step": 3522 + }, + { + "epoch": 0.57, + "grad_norm": 1.2842261001409296, + "learning_rate": 8.30408301309212e-06, + "loss": 0.8031, + "step": 3523 + }, + { + "epoch": 0.57, + "grad_norm": 2.3034707187119876, + "learning_rate": 8.298939391806094e-06, + "loss": 0.8687, + "step": 3524 + }, + { + "epoch": 0.57, + "grad_norm": 1.5299966016422737, + "learning_rate": 8.293796233935864e-06, + "loss": 0.8133, + "step": 3525 + }, + { + "epoch": 0.57, + "grad_norm": 1.597913794173523, + "learning_rate": 8.288653540882579e-06, + "loss": 0.8824, + "step": 3526 + }, + { + "epoch": 0.57, + "grad_norm": 1.3503685477694625, + "learning_rate": 8.283511314047236e-06, + "loss": 0.794, + "step": 3527 + }, + { + "epoch": 0.57, + "grad_norm": 1.4766486870624465, + "learning_rate": 8.27836955483073e-06, + "loss": 0.7188, + "step": 3528 + }, + { + "epoch": 0.57, + "grad_norm": 1.217303663435195, + "learning_rate": 8.273228264633815e-06, + "loss": 0.8129, + "step": 3529 + }, + { + "epoch": 0.57, + "grad_norm": 1.4726487926021135, + "learning_rate": 8.268087444857119e-06, + "loss": 0.7939, + "step": 3530 + }, + { + "epoch": 0.57, + "grad_norm": 1.327405187544465, + "learning_rate": 8.262947096901142e-06, + "loss": 0.834, + "step": 3531 + }, + { + "epoch": 0.57, + "grad_norm": 1.420776379642049, + "learning_rate": 8.25780722216626e-06, + "loss": 0.8278, + "step": 3532 + }, + { + "epoch": 0.57, + "grad_norm": 1.4610885513979301, + "learning_rate": 8.252667822052714e-06, + "loss": 0.8712, + "step": 3533 + }, + { + "epoch": 0.57, + "grad_norm": 1.440062648756464, + "learning_rate": 8.247528897960615e-06, + "loss": 0.8614, + "step": 3534 + }, + { + "epoch": 0.57, + "grad_norm": 1.7676202136766617, + "learning_rate": 8.242390451289959e-06, + "loss": 0.8857, + "step": 3535 + }, + { + "epoch": 0.57, + "grad_norm": 1.8676428199801571, + "learning_rate": 8.237252483440585e-06, + "loss": 0.8328, + "step": 3536 + }, + { + "epoch": 0.57, + "grad_norm": 1.4970313115025857, + "learning_rate": 8.232114995812228e-06, + "loss": 0.87, + "step": 3537 + }, + { + "epoch": 0.57, + "grad_norm": 1.2127092063754226, + "learning_rate": 8.22697798980448e-06, + "loss": 0.8517, + "step": 3538 + }, + { + "epoch": 0.57, + "grad_norm": 1.488339550648583, + "learning_rate": 8.2218414668168e-06, + "loss": 0.8198, + "step": 3539 + }, + { + "epoch": 0.57, + "grad_norm": 1.4571245210928145, + "learning_rate": 8.216705428248519e-06, + "loss": 0.7956, + "step": 3540 + }, + { + "epoch": 0.57, + "grad_norm": 1.303849842229644, + "learning_rate": 8.211569875498838e-06, + "loss": 0.7641, + "step": 3541 + }, + { + "epoch": 0.57, + "grad_norm": 0.9496303370479445, + "learning_rate": 8.206434809966817e-06, + "loss": 0.3606, + "step": 3542 + }, + { + "epoch": 0.57, + "grad_norm": 1.7477419033087054, + "learning_rate": 8.201300233051395e-06, + "loss": 0.8629, + "step": 3543 + }, + { + "epoch": 0.57, + "grad_norm": 1.6319190893897733, + "learning_rate": 8.196166146151373e-06, + "loss": 0.8791, + "step": 3544 + }, + { + "epoch": 0.57, + "grad_norm": 1.6224880966589676, + "learning_rate": 8.191032550665417e-06, + "loss": 0.7813, + "step": 3545 + }, + { + "epoch": 0.57, + "grad_norm": 1.3197628632177116, + "learning_rate": 8.185899447992056e-06, + "loss": 0.8836, + "step": 3546 + }, + { + "epoch": 0.57, + "grad_norm": 1.3899934518654704, + "learning_rate": 8.180766839529689e-06, + "loss": 0.8408, + "step": 3547 + }, + { + "epoch": 0.57, + "grad_norm": 1.3495105018502578, + "learning_rate": 8.175634726676589e-06, + "loss": 0.8034, + "step": 3548 + }, + { + "epoch": 0.57, + "grad_norm": 1.3971576308208387, + "learning_rate": 8.170503110830874e-06, + "loss": 0.814, + "step": 3549 + }, + { + "epoch": 0.57, + "grad_norm": 1.5229597449053567, + "learning_rate": 8.16537199339054e-06, + "loss": 0.9228, + "step": 3550 + }, + { + "epoch": 0.57, + "grad_norm": 1.375893798813591, + "learning_rate": 8.160241375753452e-06, + "loss": 0.8735, + "step": 3551 + }, + { + "epoch": 0.57, + "grad_norm": 1.5051167251845792, + "learning_rate": 8.155111259317323e-06, + "loss": 0.8423, + "step": 3552 + }, + { + "epoch": 0.57, + "grad_norm": 1.383524382805488, + "learning_rate": 8.149981645479743e-06, + "loss": 0.8562, + "step": 3553 + }, + { + "epoch": 0.57, + "grad_norm": 1.3398200181049318, + "learning_rate": 8.144852535638161e-06, + "loss": 0.789, + "step": 3554 + }, + { + "epoch": 0.57, + "grad_norm": 1.341858163486133, + "learning_rate": 8.139723931189883e-06, + "loss": 0.8419, + "step": 3555 + }, + { + "epoch": 0.57, + "grad_norm": 1.38262743304159, + "learning_rate": 8.134595833532084e-06, + "loss": 0.8639, + "step": 3556 + }, + { + "epoch": 0.57, + "grad_norm": 1.4483461349131526, + "learning_rate": 8.129468244061805e-06, + "loss": 0.8975, + "step": 3557 + }, + { + "epoch": 0.57, + "grad_norm": 1.3566356879621455, + "learning_rate": 8.12434116417593e-06, + "loss": 0.8168, + "step": 3558 + }, + { + "epoch": 0.57, + "grad_norm": 1.4431632473915308, + "learning_rate": 8.11921459527123e-06, + "loss": 0.8434, + "step": 3559 + }, + { + "epoch": 0.57, + "grad_norm": 1.4308074462698728, + "learning_rate": 8.114088538744318e-06, + "loss": 0.8442, + "step": 3560 + }, + { + "epoch": 0.57, + "grad_norm": 1.4400251884185313, + "learning_rate": 8.108962995991673e-06, + "loss": 0.8847, + "step": 3561 + }, + { + "epoch": 0.57, + "grad_norm": 1.3285095590735039, + "learning_rate": 8.103837968409634e-06, + "loss": 0.8069, + "step": 3562 + }, + { + "epoch": 0.57, + "grad_norm": 1.208650873849463, + "learning_rate": 8.098713457394398e-06, + "loss": 0.8725, + "step": 3563 + }, + { + "epoch": 0.57, + "grad_norm": 0.8345193319732066, + "learning_rate": 8.093589464342032e-06, + "loss": 0.3239, + "step": 3564 + }, + { + "epoch": 0.57, + "grad_norm": 1.355158976575549, + "learning_rate": 8.08846599064844e-06, + "loss": 0.8725, + "step": 3565 + }, + { + "epoch": 0.57, + "grad_norm": 1.1012514377760785, + "learning_rate": 8.083343037709407e-06, + "loss": 0.8556, + "step": 3566 + }, + { + "epoch": 0.57, + "grad_norm": 1.594654231115963, + "learning_rate": 8.078220606920565e-06, + "loss": 0.8012, + "step": 3567 + }, + { + "epoch": 0.57, + "grad_norm": 1.2731365304554907, + "learning_rate": 8.073098699677402e-06, + "loss": 0.8872, + "step": 3568 + }, + { + "epoch": 0.58, + "grad_norm": 1.4688180371261643, + "learning_rate": 8.067977317375268e-06, + "loss": 0.7722, + "step": 3569 + }, + { + "epoch": 0.58, + "grad_norm": 1.4432357837120806, + "learning_rate": 8.062856461409372e-06, + "loss": 0.8392, + "step": 3570 + }, + { + "epoch": 0.58, + "grad_norm": 1.2710278294954116, + "learning_rate": 8.057736133174768e-06, + "loss": 0.8285, + "step": 3571 + }, + { + "epoch": 0.58, + "grad_norm": 1.430426906028895, + "learning_rate": 8.052616334066383e-06, + "loss": 0.8659, + "step": 3572 + }, + { + "epoch": 0.58, + "grad_norm": 0.7644353730504525, + "learning_rate": 8.047497065478991e-06, + "loss": 0.3111, + "step": 3573 + }, + { + "epoch": 0.58, + "grad_norm": 1.3325014165567486, + "learning_rate": 8.042378328807217e-06, + "loss": 0.937, + "step": 3574 + }, + { + "epoch": 0.58, + "grad_norm": 0.862486585200898, + "learning_rate": 8.037260125445548e-06, + "loss": 0.3652, + "step": 3575 + }, + { + "epoch": 0.58, + "grad_norm": 1.56205962174118, + "learning_rate": 8.032142456788328e-06, + "loss": 0.8393, + "step": 3576 + }, + { + "epoch": 0.58, + "grad_norm": 0.8122837530815264, + "learning_rate": 8.027025324229743e-06, + "loss": 0.3588, + "step": 3577 + }, + { + "epoch": 0.58, + "grad_norm": 1.2216100756772725, + "learning_rate": 8.021908729163842e-06, + "loss": 0.8717, + "step": 3578 + }, + { + "epoch": 0.58, + "grad_norm": 1.2831629169092844, + "learning_rate": 8.016792672984538e-06, + "loss": 0.9045, + "step": 3579 + }, + { + "epoch": 0.58, + "grad_norm": 1.3243025855867936, + "learning_rate": 8.01167715708557e-06, + "loss": 0.7933, + "step": 3580 + }, + { + "epoch": 0.58, + "grad_norm": 1.6597515678842947, + "learning_rate": 8.006562182860557e-06, + "loss": 0.9037, + "step": 3581 + }, + { + "epoch": 0.58, + "grad_norm": 1.2053052844233856, + "learning_rate": 8.001447751702955e-06, + "loss": 0.8249, + "step": 3582 + }, + { + "epoch": 0.58, + "grad_norm": 1.4288555106012812, + "learning_rate": 7.996333865006074e-06, + "loss": 0.8947, + "step": 3583 + }, + { + "epoch": 0.58, + "grad_norm": 1.2804017500952973, + "learning_rate": 7.99122052416308e-06, + "loss": 0.8257, + "step": 3584 + }, + { + "epoch": 0.58, + "grad_norm": 1.4580941241778993, + "learning_rate": 7.986107730566985e-06, + "loss": 0.9537, + "step": 3585 + }, + { + "epoch": 0.58, + "grad_norm": 1.8079095235166265, + "learning_rate": 7.980995485610665e-06, + "loss": 0.7771, + "step": 3586 + }, + { + "epoch": 0.58, + "grad_norm": 1.5024538458060308, + "learning_rate": 7.975883790686821e-06, + "loss": 0.8335, + "step": 3587 + }, + { + "epoch": 0.58, + "grad_norm": 1.326399495699327, + "learning_rate": 7.970772647188029e-06, + "loss": 0.862, + "step": 3588 + }, + { + "epoch": 0.58, + "grad_norm": 1.4773395451526725, + "learning_rate": 7.965662056506708e-06, + "loss": 0.8297, + "step": 3589 + }, + { + "epoch": 0.58, + "grad_norm": 1.3607165366632843, + "learning_rate": 7.960552020035118e-06, + "loss": 0.7825, + "step": 3590 + }, + { + "epoch": 0.58, + "grad_norm": 1.4867744361578705, + "learning_rate": 7.955442539165372e-06, + "loss": 0.756, + "step": 3591 + }, + { + "epoch": 0.58, + "grad_norm": 1.5535544862747301, + "learning_rate": 7.950333615289442e-06, + "loss": 0.9193, + "step": 3592 + }, + { + "epoch": 0.58, + "grad_norm": 1.3525258239208373, + "learning_rate": 7.945225249799132e-06, + "loss": 0.8587, + "step": 3593 + }, + { + "epoch": 0.58, + "grad_norm": 0.843060194938453, + "learning_rate": 7.9401174440861e-06, + "loss": 0.3293, + "step": 3594 + }, + { + "epoch": 0.58, + "grad_norm": 1.2834715816406845, + "learning_rate": 7.935010199541864e-06, + "loss": 0.8056, + "step": 3595 + }, + { + "epoch": 0.58, + "grad_norm": 1.2828446690292952, + "learning_rate": 7.92990351755777e-06, + "loss": 0.8823, + "step": 3596 + }, + { + "epoch": 0.58, + "grad_norm": 1.195200621910457, + "learning_rate": 7.924797399525017e-06, + "loss": 0.894, + "step": 3597 + }, + { + "epoch": 0.58, + "grad_norm": 0.8263470709706975, + "learning_rate": 7.91969184683466e-06, + "loss": 0.33, + "step": 3598 + }, + { + "epoch": 0.58, + "grad_norm": 1.448683042042446, + "learning_rate": 7.914586860877584e-06, + "loss": 0.7512, + "step": 3599 + }, + { + "epoch": 0.58, + "grad_norm": 1.2014069451228253, + "learning_rate": 7.909482443044532e-06, + "loss": 0.8261, + "step": 3600 + }, + { + "epoch": 0.58, + "grad_norm": 1.5463862483870354, + "learning_rate": 7.904378594726095e-06, + "loss": 0.9134, + "step": 3601 + }, + { + "epoch": 0.58, + "grad_norm": 1.3698324144096066, + "learning_rate": 7.899275317312686e-06, + "loss": 0.9085, + "step": 3602 + }, + { + "epoch": 0.58, + "grad_norm": 1.3879407481755344, + "learning_rate": 7.89417261219459e-06, + "loss": 0.9213, + "step": 3603 + }, + { + "epoch": 0.58, + "grad_norm": 1.5156555897781723, + "learning_rate": 7.889070480761921e-06, + "loss": 0.8249, + "step": 3604 + }, + { + "epoch": 0.58, + "grad_norm": 1.2609308982106695, + "learning_rate": 7.883968924404645e-06, + "loss": 0.8533, + "step": 3605 + }, + { + "epoch": 0.58, + "grad_norm": 1.245514164639684, + "learning_rate": 7.878867944512561e-06, + "loss": 0.8531, + "step": 3606 + }, + { + "epoch": 0.58, + "grad_norm": 1.4669113158981286, + "learning_rate": 7.873767542475316e-06, + "loss": 0.7977, + "step": 3607 + }, + { + "epoch": 0.58, + "grad_norm": 1.2839468008717088, + "learning_rate": 7.868667719682409e-06, + "loss": 0.8916, + "step": 3608 + }, + { + "epoch": 0.58, + "grad_norm": 1.2653930515538772, + "learning_rate": 7.863568477523158e-06, + "loss": 0.8507, + "step": 3609 + }, + { + "epoch": 0.58, + "grad_norm": 1.4853824095623467, + "learning_rate": 7.858469817386746e-06, + "loss": 0.8579, + "step": 3610 + }, + { + "epoch": 0.58, + "grad_norm": 1.2878997511275698, + "learning_rate": 7.853371740662193e-06, + "loss": 0.7886, + "step": 3611 + }, + { + "epoch": 0.58, + "grad_norm": 1.4375878286482502, + "learning_rate": 7.848274248738345e-06, + "loss": 0.8143, + "step": 3612 + }, + { + "epoch": 0.58, + "grad_norm": 1.3883573629308774, + "learning_rate": 7.843177343003905e-06, + "loss": 0.8321, + "step": 3613 + }, + { + "epoch": 0.58, + "grad_norm": 1.8584630196235041, + "learning_rate": 7.838081024847412e-06, + "loss": 0.9148, + "step": 3614 + }, + { + "epoch": 0.58, + "grad_norm": 1.5934805548791853, + "learning_rate": 7.83298529565724e-06, + "loss": 0.8588, + "step": 3615 + }, + { + "epoch": 0.58, + "grad_norm": 1.4429527517793959, + "learning_rate": 7.827890156821604e-06, + "loss": 0.8875, + "step": 3616 + }, + { + "epoch": 0.58, + "grad_norm": 1.486885162855852, + "learning_rate": 7.82279560972857e-06, + "loss": 0.888, + "step": 3617 + }, + { + "epoch": 0.58, + "grad_norm": 1.4873931473983737, + "learning_rate": 7.817701655766024e-06, + "loss": 0.8558, + "step": 3618 + }, + { + "epoch": 0.58, + "grad_norm": 1.4075738847533912, + "learning_rate": 7.8126082963217e-06, + "loss": 0.8897, + "step": 3619 + }, + { + "epoch": 0.58, + "grad_norm": 0.9865273708747688, + "learning_rate": 7.807515532783177e-06, + "loss": 0.3497, + "step": 3620 + }, + { + "epoch": 0.58, + "grad_norm": 1.4310453499215643, + "learning_rate": 7.802423366537856e-06, + "loss": 0.9124, + "step": 3621 + }, + { + "epoch": 0.58, + "grad_norm": 1.763220100267875, + "learning_rate": 7.797331798972986e-06, + "loss": 0.7677, + "step": 3622 + }, + { + "epoch": 0.58, + "grad_norm": 1.2520234695227572, + "learning_rate": 7.79224083147565e-06, + "loss": 0.8828, + "step": 3623 + }, + { + "epoch": 0.58, + "grad_norm": 1.278512043121873, + "learning_rate": 7.787150465432774e-06, + "loss": 0.8728, + "step": 3624 + }, + { + "epoch": 0.58, + "grad_norm": 1.2522640838913428, + "learning_rate": 7.782060702231103e-06, + "loss": 0.8438, + "step": 3625 + }, + { + "epoch": 0.58, + "grad_norm": 1.4182873156551168, + "learning_rate": 7.776971543257236e-06, + "loss": 0.8658, + "step": 3626 + }, + { + "epoch": 0.58, + "grad_norm": 1.4030868147537157, + "learning_rate": 7.771882989897603e-06, + "loss": 0.8811, + "step": 3627 + }, + { + "epoch": 0.58, + "grad_norm": 1.3024417600039881, + "learning_rate": 7.766795043538457e-06, + "loss": 0.8329, + "step": 3628 + }, + { + "epoch": 0.58, + "grad_norm": 1.3326810055412666, + "learning_rate": 7.7617077055659e-06, + "loss": 0.8909, + "step": 3629 + }, + { + "epoch": 0.58, + "grad_norm": 1.317713482489219, + "learning_rate": 7.756620977365869e-06, + "loss": 0.9012, + "step": 3630 + }, + { + "epoch": 0.59, + "grad_norm": 1.7085090234969864, + "learning_rate": 7.751534860324116e-06, + "loss": 0.8379, + "step": 3631 + }, + { + "epoch": 0.59, + "grad_norm": 1.2171472845485551, + "learning_rate": 7.746449355826247e-06, + "loss": 0.8305, + "step": 3632 + }, + { + "epoch": 0.59, + "grad_norm": 1.5023959862430087, + "learning_rate": 7.741364465257697e-06, + "loss": 0.8494, + "step": 3633 + }, + { + "epoch": 0.59, + "grad_norm": 1.5182640140933463, + "learning_rate": 7.736280190003723e-06, + "loss": 0.8885, + "step": 3634 + }, + { + "epoch": 0.59, + "grad_norm": 1.4046572732113116, + "learning_rate": 7.731196531449426e-06, + "loss": 0.8659, + "step": 3635 + }, + { + "epoch": 0.59, + "grad_norm": 1.4395281249374392, + "learning_rate": 7.726113490979735e-06, + "loss": 0.82, + "step": 3636 + }, + { + "epoch": 0.59, + "grad_norm": 1.4571951653779784, + "learning_rate": 7.721031069979408e-06, + "loss": 0.7843, + "step": 3637 + }, + { + "epoch": 0.59, + "grad_norm": 1.3917649872505053, + "learning_rate": 7.715949269833034e-06, + "loss": 0.8384, + "step": 3638 + }, + { + "epoch": 0.59, + "grad_norm": 1.3893693682607504, + "learning_rate": 7.710868091925047e-06, + "loss": 0.8921, + "step": 3639 + }, + { + "epoch": 0.59, + "grad_norm": 1.4430888907167914, + "learning_rate": 7.705787537639685e-06, + "loss": 0.8301, + "step": 3640 + }, + { + "epoch": 0.59, + "grad_norm": 1.5921294315463197, + "learning_rate": 7.70070760836104e-06, + "loss": 0.8365, + "step": 3641 + }, + { + "epoch": 0.59, + "grad_norm": 1.4446459186733032, + "learning_rate": 7.695628305473025e-06, + "loss": 0.7485, + "step": 3642 + }, + { + "epoch": 0.59, + "grad_norm": 1.4433217192139707, + "learning_rate": 7.69054963035938e-06, + "loss": 0.8406, + "step": 3643 + }, + { + "epoch": 0.59, + "grad_norm": 1.2885077297812098, + "learning_rate": 7.685471584403674e-06, + "loss": 0.8194, + "step": 3644 + }, + { + "epoch": 0.59, + "grad_norm": 1.4000970364447052, + "learning_rate": 7.680394168989306e-06, + "loss": 0.8436, + "step": 3645 + }, + { + "epoch": 0.59, + "grad_norm": 1.4945662328620992, + "learning_rate": 7.675317385499513e-06, + "loss": 0.8877, + "step": 3646 + }, + { + "epoch": 0.59, + "grad_norm": 1.5682556574102344, + "learning_rate": 7.670241235317339e-06, + "loss": 0.8562, + "step": 3647 + }, + { + "epoch": 0.59, + "grad_norm": 0.841921139991204, + "learning_rate": 7.665165719825676e-06, + "loss": 0.3711, + "step": 3648 + }, + { + "epoch": 0.59, + "grad_norm": 1.6112753923052603, + "learning_rate": 7.660090840407231e-06, + "loss": 0.8236, + "step": 3649 + }, + { + "epoch": 0.59, + "grad_norm": 1.546960588697949, + "learning_rate": 7.65501659844454e-06, + "loss": 0.8742, + "step": 3650 + }, + { + "epoch": 0.59, + "grad_norm": 1.4390734031949686, + "learning_rate": 7.649942995319965e-06, + "loss": 0.892, + "step": 3651 + }, + { + "epoch": 0.59, + "grad_norm": 1.276461270534326, + "learning_rate": 7.644870032415705e-06, + "loss": 0.8085, + "step": 3652 + }, + { + "epoch": 0.59, + "grad_norm": 1.3511085423175608, + "learning_rate": 7.63979771111376e-06, + "loss": 0.8323, + "step": 3653 + }, + { + "epoch": 0.59, + "grad_norm": 1.5268784153174397, + "learning_rate": 7.63472603279598e-06, + "loss": 0.8848, + "step": 3654 + }, + { + "epoch": 0.59, + "grad_norm": 1.6635770766914166, + "learning_rate": 7.629654998844031e-06, + "loss": 0.8249, + "step": 3655 + }, + { + "epoch": 0.59, + "grad_norm": 1.3344818618099579, + "learning_rate": 7.624584610639397e-06, + "loss": 0.8231, + "step": 3656 + }, + { + "epoch": 0.59, + "grad_norm": 1.308149194289193, + "learning_rate": 7.619514869563394e-06, + "loss": 0.9221, + "step": 3657 + }, + { + "epoch": 0.59, + "grad_norm": 1.4663931383675721, + "learning_rate": 7.6144457769971606e-06, + "loss": 0.8183, + "step": 3658 + }, + { + "epoch": 0.59, + "grad_norm": 1.4681913119841432, + "learning_rate": 7.609377334321653e-06, + "loss": 0.8503, + "step": 3659 + }, + { + "epoch": 0.59, + "grad_norm": 1.2563910463848558, + "learning_rate": 7.604309542917656e-06, + "loss": 0.8484, + "step": 3660 + }, + { + "epoch": 0.59, + "grad_norm": 1.4152445101026292, + "learning_rate": 7.599242404165783e-06, + "loss": 0.9022, + "step": 3661 + }, + { + "epoch": 0.59, + "grad_norm": 1.4840868224688974, + "learning_rate": 7.59417591944645e-06, + "loss": 0.8138, + "step": 3662 + }, + { + "epoch": 0.59, + "grad_norm": 1.4477249515719366, + "learning_rate": 7.589110090139917e-06, + "loss": 0.8842, + "step": 3663 + }, + { + "epoch": 0.59, + "grad_norm": 0.7909386467859402, + "learning_rate": 7.584044917626251e-06, + "loss": 0.3137, + "step": 3664 + }, + { + "epoch": 0.59, + "grad_norm": 1.212369966017015, + "learning_rate": 7.5789804032853476e-06, + "loss": 0.8048, + "step": 3665 + }, + { + "epoch": 0.59, + "grad_norm": 1.4096588779284505, + "learning_rate": 7.573916548496916e-06, + "loss": 0.8737, + "step": 3666 + }, + { + "epoch": 0.59, + "grad_norm": 1.4305288869382986, + "learning_rate": 7.5688533546404895e-06, + "loss": 0.7828, + "step": 3667 + }, + { + "epoch": 0.59, + "grad_norm": 1.5185095041305432, + "learning_rate": 7.5637908230954316e-06, + "loss": 0.8334, + "step": 3668 + }, + { + "epoch": 0.59, + "grad_norm": 1.1437209378635254, + "learning_rate": 7.558728955240901e-06, + "loss": 0.781, + "step": 3669 + }, + { + "epoch": 0.59, + "grad_norm": 1.3076771839186683, + "learning_rate": 7.553667752455899e-06, + "loss": 0.8256, + "step": 3670 + }, + { + "epoch": 0.59, + "grad_norm": 1.1032935318851078, + "learning_rate": 7.548607216119237e-06, + "loss": 0.8056, + "step": 3671 + }, + { + "epoch": 0.59, + "grad_norm": 1.2407737955590747, + "learning_rate": 7.54354734760954e-06, + "loss": 0.887, + "step": 3672 + }, + { + "epoch": 0.59, + "grad_norm": 1.3579257105512947, + "learning_rate": 7.5384881483052585e-06, + "loss": 0.9177, + "step": 3673 + }, + { + "epoch": 0.59, + "grad_norm": 1.4128993609144886, + "learning_rate": 7.53342961958466e-06, + "loss": 0.8702, + "step": 3674 + }, + { + "epoch": 0.59, + "grad_norm": 1.7175463428207238, + "learning_rate": 7.528371762825819e-06, + "loss": 0.8744, + "step": 3675 + }, + { + "epoch": 0.59, + "grad_norm": 1.408134601634781, + "learning_rate": 7.5233145794066445e-06, + "loss": 0.8289, + "step": 3676 + }, + { + "epoch": 0.59, + "grad_norm": 1.59831714193309, + "learning_rate": 7.518258070704849e-06, + "loss": 0.7477, + "step": 3677 + }, + { + "epoch": 0.59, + "grad_norm": 1.3482466130223394, + "learning_rate": 7.513202238097963e-06, + "loss": 0.7927, + "step": 3678 + }, + { + "epoch": 0.59, + "grad_norm": 1.3275480728326299, + "learning_rate": 7.508147082963337e-06, + "loss": 0.8057, + "step": 3679 + }, + { + "epoch": 0.59, + "grad_norm": 1.3447884661042737, + "learning_rate": 7.503092606678135e-06, + "loss": 0.7839, + "step": 3680 + }, + { + "epoch": 0.59, + "grad_norm": 1.5359912256563613, + "learning_rate": 7.4980388106193336e-06, + "loss": 0.7742, + "step": 3681 + }, + { + "epoch": 0.59, + "grad_norm": 1.3523769505296812, + "learning_rate": 7.4929856961637246e-06, + "loss": 0.7711, + "step": 3682 + }, + { + "epoch": 0.59, + "grad_norm": 1.169135366651676, + "learning_rate": 7.487933264687921e-06, + "loss": 0.8801, + "step": 3683 + }, + { + "epoch": 0.59, + "grad_norm": 1.3607897525132546, + "learning_rate": 7.482881517568344e-06, + "loss": 0.8532, + "step": 3684 + }, + { + "epoch": 0.59, + "grad_norm": 1.2490016608766912, + "learning_rate": 7.477830456181222e-06, + "loss": 0.8673, + "step": 3685 + }, + { + "epoch": 0.59, + "grad_norm": 1.2592078201194958, + "learning_rate": 7.472780081902608e-06, + "loss": 0.83, + "step": 3686 + }, + { + "epoch": 0.59, + "grad_norm": 1.5756763668503362, + "learning_rate": 7.467730396108368e-06, + "loss": 0.9209, + "step": 3687 + }, + { + "epoch": 0.59, + "grad_norm": 1.620636344631634, + "learning_rate": 7.462681400174165e-06, + "loss": 0.8684, + "step": 3688 + }, + { + "epoch": 0.59, + "grad_norm": 1.3609203373673189, + "learning_rate": 7.45763309547549e-06, + "loss": 0.8377, + "step": 3689 + }, + { + "epoch": 0.59, + "grad_norm": 1.3793255814055534, + "learning_rate": 7.452585483387647e-06, + "loss": 0.8438, + "step": 3690 + }, + { + "epoch": 0.59, + "grad_norm": 1.3528625032692254, + "learning_rate": 7.4475385652857325e-06, + "loss": 0.8305, + "step": 3691 + }, + { + "epoch": 0.59, + "grad_norm": 1.290223113928149, + "learning_rate": 7.442492342544672e-06, + "loss": 0.8887, + "step": 3692 + }, + { + "epoch": 0.6, + "grad_norm": 1.238010273805996, + "learning_rate": 7.437446816539198e-06, + "loss": 0.8474, + "step": 3693 + }, + { + "epoch": 0.6, + "grad_norm": 1.3766904553341708, + "learning_rate": 7.432401988643847e-06, + "loss": 0.8001, + "step": 3694 + }, + { + "epoch": 0.6, + "grad_norm": 1.3583595981982763, + "learning_rate": 7.427357860232971e-06, + "loss": 0.7952, + "step": 3695 + }, + { + "epoch": 0.6, + "grad_norm": 1.437833247870376, + "learning_rate": 7.422314432680731e-06, + "loss": 0.8208, + "step": 3696 + }, + { + "epoch": 0.6, + "grad_norm": 1.2220461240194171, + "learning_rate": 7.417271707361091e-06, + "loss": 0.8127, + "step": 3697 + }, + { + "epoch": 0.6, + "grad_norm": 1.3941381966421966, + "learning_rate": 7.412229685647829e-06, + "loss": 0.8268, + "step": 3698 + }, + { + "epoch": 0.6, + "grad_norm": 1.2628702159097358, + "learning_rate": 7.407188368914537e-06, + "loss": 0.7658, + "step": 3699 + }, + { + "epoch": 0.6, + "grad_norm": 1.364032088743096, + "learning_rate": 7.402147758534604e-06, + "loss": 0.8644, + "step": 3700 + }, + { + "epoch": 0.6, + "grad_norm": 1.4716586267778038, + "learning_rate": 7.39710785588123e-06, + "loss": 0.8216, + "step": 3701 + }, + { + "epoch": 0.6, + "grad_norm": 1.3870052216362971, + "learning_rate": 7.3920686623274265e-06, + "loss": 0.8654, + "step": 3702 + }, + { + "epoch": 0.6, + "grad_norm": 1.5895279622266585, + "learning_rate": 7.387030179246009e-06, + "loss": 0.8984, + "step": 3703 + }, + { + "epoch": 0.6, + "grad_norm": 1.3119306351167366, + "learning_rate": 7.381992408009593e-06, + "loss": 0.9005, + "step": 3704 + }, + { + "epoch": 0.6, + "grad_norm": 1.4103566526630942, + "learning_rate": 7.376955349990613e-06, + "loss": 0.8906, + "step": 3705 + }, + { + "epoch": 0.6, + "grad_norm": 1.208304368040526, + "learning_rate": 7.3719190065613035e-06, + "loss": 0.8328, + "step": 3706 + }, + { + "epoch": 0.6, + "grad_norm": 1.3522814116222333, + "learning_rate": 7.366883379093698e-06, + "loss": 0.8812, + "step": 3707 + }, + { + "epoch": 0.6, + "grad_norm": 1.350054844044675, + "learning_rate": 7.361848468959641e-06, + "loss": 0.7613, + "step": 3708 + }, + { + "epoch": 0.6, + "grad_norm": 1.3140572026743578, + "learning_rate": 7.356814277530785e-06, + "loss": 0.8696, + "step": 3709 + }, + { + "epoch": 0.6, + "grad_norm": 1.4975234047132615, + "learning_rate": 7.351780806178578e-06, + "loss": 0.7983, + "step": 3710 + }, + { + "epoch": 0.6, + "grad_norm": 1.865545402678677, + "learning_rate": 7.346748056274275e-06, + "loss": 0.8764, + "step": 3711 + }, + { + "epoch": 0.6, + "grad_norm": 1.203837494152397, + "learning_rate": 7.341716029188946e-06, + "loss": 0.852, + "step": 3712 + }, + { + "epoch": 0.6, + "grad_norm": 1.627818836106272, + "learning_rate": 7.33668472629344e-06, + "loss": 0.7911, + "step": 3713 + }, + { + "epoch": 0.6, + "grad_norm": 1.261271191056609, + "learning_rate": 7.33165414895843e-06, + "loss": 0.8796, + "step": 3714 + }, + { + "epoch": 0.6, + "grad_norm": 1.3952529319423257, + "learning_rate": 7.326624298554387e-06, + "loss": 0.8235, + "step": 3715 + }, + { + "epoch": 0.6, + "grad_norm": 0.979079099511998, + "learning_rate": 7.321595176451575e-06, + "loss": 0.3373, + "step": 3716 + }, + { + "epoch": 0.6, + "grad_norm": 1.343273951794725, + "learning_rate": 7.316566784020067e-06, + "loss": 0.8397, + "step": 3717 + }, + { + "epoch": 0.6, + "grad_norm": 1.353298017010148, + "learning_rate": 7.311539122629738e-06, + "loss": 0.8213, + "step": 3718 + }, + { + "epoch": 0.6, + "grad_norm": 1.2965598563691165, + "learning_rate": 7.306512193650258e-06, + "loss": 0.7882, + "step": 3719 + }, + { + "epoch": 0.6, + "grad_norm": 1.4715317017003005, + "learning_rate": 7.301485998451101e-06, + "loss": 0.8659, + "step": 3720 + }, + { + "epoch": 0.6, + "grad_norm": 1.3947613303265236, + "learning_rate": 7.29646053840155e-06, + "loss": 0.8331, + "step": 3721 + }, + { + "epoch": 0.6, + "grad_norm": 1.4954970429784855, + "learning_rate": 7.291435814870664e-06, + "loss": 0.8322, + "step": 3722 + }, + { + "epoch": 0.6, + "grad_norm": 1.5425309948496622, + "learning_rate": 7.2864118292273265e-06, + "loss": 0.8248, + "step": 3723 + }, + { + "epoch": 0.6, + "grad_norm": 1.4963361955142336, + "learning_rate": 7.281388582840209e-06, + "loss": 0.8453, + "step": 3724 + }, + { + "epoch": 0.6, + "grad_norm": 1.564836836012575, + "learning_rate": 7.276366077077781e-06, + "loss": 0.8985, + "step": 3725 + }, + { + "epoch": 0.6, + "grad_norm": 1.6190949043564615, + "learning_rate": 7.271344313308308e-06, + "loss": 0.8961, + "step": 3726 + }, + { + "epoch": 0.6, + "grad_norm": 1.3215569959662545, + "learning_rate": 7.2663232928998594e-06, + "loss": 0.8095, + "step": 3727 + }, + { + "epoch": 0.6, + "grad_norm": 1.5356228497889537, + "learning_rate": 7.2613030172203045e-06, + "loss": 0.8943, + "step": 3728 + }, + { + "epoch": 0.6, + "grad_norm": 1.7928921749658995, + "learning_rate": 7.256283487637297e-06, + "loss": 0.7233, + "step": 3729 + }, + { + "epoch": 0.6, + "grad_norm": 1.3531569876865392, + "learning_rate": 7.251264705518299e-06, + "loss": 0.7773, + "step": 3730 + }, + { + "epoch": 0.6, + "grad_norm": 1.2383020050282971, + "learning_rate": 7.246246672230568e-06, + "loss": 0.8637, + "step": 3731 + }, + { + "epoch": 0.6, + "grad_norm": 1.4276006084091848, + "learning_rate": 7.241229389141147e-06, + "loss": 0.8728, + "step": 3732 + }, + { + "epoch": 0.6, + "grad_norm": 1.3888262201532078, + "learning_rate": 7.236212857616885e-06, + "loss": 0.8606, + "step": 3733 + }, + { + "epoch": 0.6, + "grad_norm": 1.2961132026831341, + "learning_rate": 7.231197079024431e-06, + "loss": 0.7709, + "step": 3734 + }, + { + "epoch": 0.6, + "grad_norm": 1.5803558567625329, + "learning_rate": 7.226182054730208e-06, + "loss": 0.8432, + "step": 3735 + }, + { + "epoch": 0.6, + "grad_norm": 1.1878801873635094, + "learning_rate": 7.221167786100458e-06, + "loss": 0.8552, + "step": 3736 + }, + { + "epoch": 0.6, + "grad_norm": 1.8382884891155093, + "learning_rate": 7.216154274501203e-06, + "loss": 0.8576, + "step": 3737 + }, + { + "epoch": 0.6, + "grad_norm": 1.2841162966903286, + "learning_rate": 7.211141521298259e-06, + "loss": 0.8328, + "step": 3738 + }, + { + "epoch": 0.6, + "grad_norm": 1.6146652805902946, + "learning_rate": 7.206129527857239e-06, + "loss": 0.8061, + "step": 3739 + }, + { + "epoch": 0.6, + "grad_norm": 1.6124781683342146, + "learning_rate": 7.201118295543553e-06, + "loss": 0.8315, + "step": 3740 + }, + { + "epoch": 0.6, + "grad_norm": 1.281805271396234, + "learning_rate": 7.196107825722392e-06, + "loss": 0.8275, + "step": 3741 + }, + { + "epoch": 0.6, + "grad_norm": 1.5154533805229058, + "learning_rate": 7.191098119758747e-06, + "loss": 0.8083, + "step": 3742 + }, + { + "epoch": 0.6, + "grad_norm": 1.3711137562973996, + "learning_rate": 7.1860891790174035e-06, + "loss": 0.8973, + "step": 3743 + }, + { + "epoch": 0.6, + "grad_norm": 0.9492173057513767, + "learning_rate": 7.181081004862937e-06, + "loss": 0.3528, + "step": 3744 + }, + { + "epoch": 0.6, + "grad_norm": 1.5164724081279757, + "learning_rate": 7.176073598659706e-06, + "loss": 0.8187, + "step": 3745 + }, + { + "epoch": 0.6, + "grad_norm": 1.509988591902235, + "learning_rate": 7.171066961771868e-06, + "loss": 0.8555, + "step": 3746 + }, + { + "epoch": 0.6, + "grad_norm": 1.2078658412163263, + "learning_rate": 7.1660610955633725e-06, + "loss": 0.8119, + "step": 3747 + }, + { + "epoch": 0.6, + "grad_norm": 1.2320890559119315, + "learning_rate": 7.16105600139795e-06, + "loss": 0.8192, + "step": 3748 + }, + { + "epoch": 0.6, + "grad_norm": 0.9052482208476379, + "learning_rate": 7.156051680639127e-06, + "loss": 0.3438, + "step": 3749 + }, + { + "epoch": 0.6, + "grad_norm": 1.2174561874258307, + "learning_rate": 7.151048134650225e-06, + "loss": 0.8238, + "step": 3750 + }, + { + "epoch": 0.6, + "grad_norm": 1.4608942525245714, + "learning_rate": 7.146045364794339e-06, + "loss": 0.8616, + "step": 3751 + }, + { + "epoch": 0.6, + "grad_norm": 1.446425289200998, + "learning_rate": 7.1410433724343645e-06, + "loss": 0.8532, + "step": 3752 + }, + { + "epoch": 0.6, + "grad_norm": 1.4059920375705637, + "learning_rate": 7.136042158932988e-06, + "loss": 0.8724, + "step": 3753 + }, + { + "epoch": 0.6, + "grad_norm": 1.4354468434054175, + "learning_rate": 7.131041725652669e-06, + "loss": 0.8495, + "step": 3754 + }, + { + "epoch": 0.61, + "grad_norm": 1.4425730208578538, + "learning_rate": 7.126042073955668e-06, + "loss": 0.7817, + "step": 3755 + }, + { + "epoch": 0.61, + "grad_norm": 2.1004703393267685, + "learning_rate": 7.121043205204028e-06, + "loss": 0.8513, + "step": 3756 + }, + { + "epoch": 0.61, + "grad_norm": 1.4016663062631076, + "learning_rate": 7.116045120759575e-06, + "loss": 0.8705, + "step": 3757 + }, + { + "epoch": 0.61, + "grad_norm": 1.5700019937475267, + "learning_rate": 7.1110478219839295e-06, + "loss": 0.8473, + "step": 3758 + }, + { + "epoch": 0.61, + "grad_norm": 1.3322647259274711, + "learning_rate": 7.106051310238493e-06, + "loss": 0.8125, + "step": 3759 + }, + { + "epoch": 0.61, + "grad_norm": 1.3092225784385094, + "learning_rate": 7.101055586884451e-06, + "loss": 0.7825, + "step": 3760 + }, + { + "epoch": 0.61, + "grad_norm": 1.6500487803072337, + "learning_rate": 7.096060653282777e-06, + "loss": 0.7645, + "step": 3761 + }, + { + "epoch": 0.61, + "grad_norm": 1.3108016873552462, + "learning_rate": 7.091066510794224e-06, + "loss": 0.7239, + "step": 3762 + }, + { + "epoch": 0.61, + "grad_norm": 1.539918395293901, + "learning_rate": 7.086073160779347e-06, + "loss": 0.8589, + "step": 3763 + }, + { + "epoch": 0.61, + "grad_norm": 1.6536426308868717, + "learning_rate": 7.081080604598458e-06, + "loss": 0.889, + "step": 3764 + }, + { + "epoch": 0.61, + "grad_norm": 1.3652522422920201, + "learning_rate": 7.076088843611675e-06, + "loss": 0.8722, + "step": 3765 + }, + { + "epoch": 0.61, + "grad_norm": 1.7119311230120018, + "learning_rate": 7.0710978791788895e-06, + "loss": 0.899, + "step": 3766 + }, + { + "epoch": 0.61, + "grad_norm": 1.5380290132918233, + "learning_rate": 7.066107712659778e-06, + "loss": 0.8224, + "step": 3767 + }, + { + "epoch": 0.61, + "grad_norm": 1.3341886872178037, + "learning_rate": 7.061118345413797e-06, + "loss": 0.9101, + "step": 3768 + }, + { + "epoch": 0.61, + "grad_norm": 1.5923699897690586, + "learning_rate": 7.0561297788001915e-06, + "loss": 0.835, + "step": 3769 + }, + { + "epoch": 0.61, + "grad_norm": 1.7803896443895229, + "learning_rate": 7.0511420141779805e-06, + "loss": 0.8906, + "step": 3770 + }, + { + "epoch": 0.61, + "grad_norm": 0.8546563285124402, + "learning_rate": 7.046155052905967e-06, + "loss": 0.3626, + "step": 3771 + }, + { + "epoch": 0.61, + "grad_norm": 1.4560847416826381, + "learning_rate": 7.04116889634275e-06, + "loss": 0.7829, + "step": 3772 + }, + { + "epoch": 0.61, + "grad_norm": 1.052536155966014, + "learning_rate": 7.036183545846678e-06, + "loss": 0.3147, + "step": 3773 + }, + { + "epoch": 0.61, + "grad_norm": 1.286655588429566, + "learning_rate": 7.031199002775908e-06, + "loss": 0.8641, + "step": 3774 + }, + { + "epoch": 0.61, + "grad_norm": 1.2671804303843404, + "learning_rate": 7.0262152684883675e-06, + "loss": 0.8589, + "step": 3775 + }, + { + "epoch": 0.61, + "grad_norm": 0.8664162219447997, + "learning_rate": 7.021232344341759e-06, + "loss": 0.3383, + "step": 3776 + }, + { + "epoch": 0.61, + "grad_norm": 0.8998993016117565, + "learning_rate": 7.01625023169357e-06, + "loss": 0.3481, + "step": 3777 + }, + { + "epoch": 0.61, + "grad_norm": 1.507431383589791, + "learning_rate": 7.011268931901067e-06, + "loss": 0.8369, + "step": 3778 + }, + { + "epoch": 0.61, + "grad_norm": 1.4175251892267902, + "learning_rate": 7.006288446321288e-06, + "loss": 0.8372, + "step": 3779 + }, + { + "epoch": 0.61, + "grad_norm": 1.3982933032692515, + "learning_rate": 7.001308776311061e-06, + "loss": 0.7768, + "step": 3780 + }, + { + "epoch": 0.61, + "grad_norm": 1.3049032556142626, + "learning_rate": 6.9963299232269824e-06, + "loss": 0.9424, + "step": 3781 + }, + { + "epoch": 0.61, + "grad_norm": 1.5430626623329047, + "learning_rate": 6.99135188842543e-06, + "loss": 0.8321, + "step": 3782 + }, + { + "epoch": 0.61, + "grad_norm": 1.4836573723222872, + "learning_rate": 6.986374673262557e-06, + "loss": 0.8695, + "step": 3783 + }, + { + "epoch": 0.61, + "grad_norm": 1.2659571191274577, + "learning_rate": 6.981398279094292e-06, + "loss": 0.7818, + "step": 3784 + }, + { + "epoch": 0.61, + "grad_norm": 1.3177329242212288, + "learning_rate": 6.976422707276349e-06, + "loss": 0.8151, + "step": 3785 + }, + { + "epoch": 0.61, + "grad_norm": 1.3344675141454254, + "learning_rate": 6.971447959164201e-06, + "loss": 0.8755, + "step": 3786 + }, + { + "epoch": 0.61, + "grad_norm": 1.4076364874492573, + "learning_rate": 6.966474036113112e-06, + "loss": 0.7421, + "step": 3787 + }, + { + "epoch": 0.61, + "grad_norm": 1.2840531014831673, + "learning_rate": 6.961500939478118e-06, + "loss": 0.8807, + "step": 3788 + }, + { + "epoch": 0.61, + "grad_norm": 1.5947739698624976, + "learning_rate": 6.956528670614022e-06, + "loss": 0.9341, + "step": 3789 + }, + { + "epoch": 0.61, + "grad_norm": 1.4517645588223715, + "learning_rate": 6.951557230875412e-06, + "loss": 0.8489, + "step": 3790 + }, + { + "epoch": 0.61, + "grad_norm": 1.334367908480157, + "learning_rate": 6.9465866216166436e-06, + "loss": 0.8091, + "step": 3791 + }, + { + "epoch": 0.61, + "grad_norm": 1.2659517472773645, + "learning_rate": 6.941616844191846e-06, + "loss": 0.9292, + "step": 3792 + }, + { + "epoch": 0.61, + "grad_norm": 1.587963199951103, + "learning_rate": 6.936647899954921e-06, + "loss": 0.8121, + "step": 3793 + }, + { + "epoch": 0.61, + "grad_norm": 1.3562906354209097, + "learning_rate": 6.931679790259558e-06, + "loss": 0.7597, + "step": 3794 + }, + { + "epoch": 0.61, + "grad_norm": 1.0654614327558893, + "learning_rate": 6.926712516459192e-06, + "loss": 0.8222, + "step": 3795 + }, + { + "epoch": 0.61, + "grad_norm": 1.7444124362424673, + "learning_rate": 6.921746079907055e-06, + "loss": 0.8627, + "step": 3796 + }, + { + "epoch": 0.61, + "grad_norm": 1.5263978097178368, + "learning_rate": 6.916780481956139e-06, + "loss": 0.8418, + "step": 3797 + }, + { + "epoch": 0.61, + "grad_norm": 1.2133475708321293, + "learning_rate": 6.911815723959207e-06, + "loss": 0.8363, + "step": 3798 + }, + { + "epoch": 0.61, + "grad_norm": 0.8835223338140338, + "learning_rate": 6.906851807268799e-06, + "loss": 0.3636, + "step": 3799 + }, + { + "epoch": 0.61, + "grad_norm": 1.5157267914833537, + "learning_rate": 6.901888733237226e-06, + "loss": 0.8091, + "step": 3800 + }, + { + "epoch": 0.61, + "grad_norm": 1.1870753774516376, + "learning_rate": 6.896926503216557e-06, + "loss": 0.8568, + "step": 3801 + }, + { + "epoch": 0.61, + "grad_norm": 1.4105007868198842, + "learning_rate": 6.891965118558644e-06, + "loss": 0.8421, + "step": 3802 + }, + { + "epoch": 0.61, + "grad_norm": 1.5210852221831315, + "learning_rate": 6.887004580615107e-06, + "loss": 0.8451, + "step": 3803 + }, + { + "epoch": 0.61, + "grad_norm": 1.3933183497037784, + "learning_rate": 6.882044890737336e-06, + "loss": 0.8844, + "step": 3804 + }, + { + "epoch": 0.61, + "grad_norm": 1.269943716095466, + "learning_rate": 6.8770860502764815e-06, + "loss": 0.8371, + "step": 3805 + }, + { + "epoch": 0.61, + "grad_norm": 1.4708111667589325, + "learning_rate": 6.8721280605834706e-06, + "loss": 0.8056, + "step": 3806 + }, + { + "epoch": 0.61, + "grad_norm": 1.739870615658819, + "learning_rate": 6.867170923008998e-06, + "loss": 0.8203, + "step": 3807 + }, + { + "epoch": 0.61, + "grad_norm": 1.455825154027949, + "learning_rate": 6.862214638903519e-06, + "loss": 0.8448, + "step": 3808 + }, + { + "epoch": 0.61, + "grad_norm": 1.348078657376387, + "learning_rate": 6.857259209617269e-06, + "loss": 0.8221, + "step": 3809 + }, + { + "epoch": 0.61, + "grad_norm": 1.4379066975220443, + "learning_rate": 6.85230463650024e-06, + "loss": 0.9047, + "step": 3810 + }, + { + "epoch": 0.61, + "grad_norm": 1.2474798450139029, + "learning_rate": 6.847350920902194e-06, + "loss": 0.8018, + "step": 3811 + }, + { + "epoch": 0.61, + "grad_norm": 1.3708274034449286, + "learning_rate": 6.842398064172661e-06, + "loss": 0.8674, + "step": 3812 + }, + { + "epoch": 0.61, + "grad_norm": 1.503655790553231, + "learning_rate": 6.837446067660937e-06, + "loss": 0.8215, + "step": 3813 + }, + { + "epoch": 0.61, + "grad_norm": 1.1957143126908043, + "learning_rate": 6.832494932716078e-06, + "loss": 0.8852, + "step": 3814 + }, + { + "epoch": 0.61, + "grad_norm": 1.3507048629675866, + "learning_rate": 6.82754466068691e-06, + "loss": 0.7933, + "step": 3815 + }, + { + "epoch": 0.61, + "grad_norm": 1.5629797401659629, + "learning_rate": 6.822595252922033e-06, + "loss": 0.7818, + "step": 3816 + }, + { + "epoch": 0.62, + "grad_norm": 1.2167692097110474, + "learning_rate": 6.817646710769788e-06, + "loss": 0.9114, + "step": 3817 + }, + { + "epoch": 0.62, + "grad_norm": 1.3911668828061785, + "learning_rate": 6.812699035578302e-06, + "loss": 0.8691, + "step": 3818 + }, + { + "epoch": 0.62, + "grad_norm": 1.50195036101614, + "learning_rate": 6.807752228695459e-06, + "loss": 0.9178, + "step": 3819 + }, + { + "epoch": 0.62, + "grad_norm": 1.3963145929777998, + "learning_rate": 6.802806291468903e-06, + "loss": 0.8161, + "step": 3820 + }, + { + "epoch": 0.62, + "grad_norm": 1.5922067781428673, + "learning_rate": 6.797861225246045e-06, + "loss": 0.882, + "step": 3821 + }, + { + "epoch": 0.62, + "grad_norm": 1.2555064110249468, + "learning_rate": 6.792917031374053e-06, + "loss": 0.8769, + "step": 3822 + }, + { + "epoch": 0.62, + "grad_norm": 1.5183302532560468, + "learning_rate": 6.787973711199872e-06, + "loss": 0.8616, + "step": 3823 + }, + { + "epoch": 0.62, + "grad_norm": 1.3071454118007868, + "learning_rate": 6.783031266070186e-06, + "loss": 0.8477, + "step": 3824 + }, + { + "epoch": 0.62, + "grad_norm": 1.364322909970862, + "learning_rate": 6.778089697331462e-06, + "loss": 0.8059, + "step": 3825 + }, + { + "epoch": 0.62, + "grad_norm": 1.3547880534425194, + "learning_rate": 6.773149006329919e-06, + "loss": 0.8548, + "step": 3826 + }, + { + "epoch": 0.62, + "grad_norm": 1.2762865702876471, + "learning_rate": 6.768209194411533e-06, + "loss": 0.8439, + "step": 3827 + }, + { + "epoch": 0.62, + "grad_norm": 0.9905165024925404, + "learning_rate": 6.763270262922047e-06, + "loss": 0.3443, + "step": 3828 + }, + { + "epoch": 0.62, + "grad_norm": 1.2404020406374674, + "learning_rate": 6.758332213206964e-06, + "loss": 0.899, + "step": 3829 + }, + { + "epoch": 0.62, + "grad_norm": 1.2130910789160914, + "learning_rate": 6.753395046611543e-06, + "loss": 0.8132, + "step": 3830 + }, + { + "epoch": 0.62, + "grad_norm": 1.2329448891459043, + "learning_rate": 6.748458764480801e-06, + "loss": 0.8692, + "step": 3831 + }, + { + "epoch": 0.62, + "grad_norm": 1.2778135636525336, + "learning_rate": 6.743523368159526e-06, + "loss": 0.7959, + "step": 3832 + }, + { + "epoch": 0.62, + "grad_norm": 1.2908200131654213, + "learning_rate": 6.738588858992248e-06, + "loss": 0.8436, + "step": 3833 + }, + { + "epoch": 0.62, + "grad_norm": 1.6573904852195145, + "learning_rate": 6.733655238323264e-06, + "loss": 0.7732, + "step": 3834 + }, + { + "epoch": 0.62, + "grad_norm": 1.3587608717757185, + "learning_rate": 6.728722507496636e-06, + "loss": 0.8557, + "step": 3835 + }, + { + "epoch": 0.62, + "grad_norm": 1.3749184577507532, + "learning_rate": 6.723790667856166e-06, + "loss": 0.8109, + "step": 3836 + }, + { + "epoch": 0.62, + "grad_norm": 1.2553640719276358, + "learning_rate": 6.718859720745425e-06, + "loss": 0.8166, + "step": 3837 + }, + { + "epoch": 0.62, + "grad_norm": 1.4501132363877314, + "learning_rate": 6.71392966750775e-06, + "loss": 0.8743, + "step": 3838 + }, + { + "epoch": 0.62, + "grad_norm": 1.4810316813464302, + "learning_rate": 6.709000509486207e-06, + "loss": 0.8756, + "step": 3839 + }, + { + "epoch": 0.62, + "grad_norm": 1.4574098120818715, + "learning_rate": 6.704072248023643e-06, + "loss": 0.8671, + "step": 3840 + }, + { + "epoch": 0.62, + "grad_norm": 1.366405744481634, + "learning_rate": 6.6991448844626525e-06, + "loss": 0.7989, + "step": 3841 + }, + { + "epoch": 0.62, + "grad_norm": 1.3041910391193647, + "learning_rate": 6.694218420145586e-06, + "loss": 0.8102, + "step": 3842 + }, + { + "epoch": 0.62, + "grad_norm": 1.3346756083941864, + "learning_rate": 6.689292856414545e-06, + "loss": 0.8424, + "step": 3843 + }, + { + "epoch": 0.62, + "grad_norm": 1.286790194971462, + "learning_rate": 6.684368194611387e-06, + "loss": 0.8577, + "step": 3844 + }, + { + "epoch": 0.62, + "grad_norm": 1.4012391141524787, + "learning_rate": 6.679444436077735e-06, + "loss": 0.8411, + "step": 3845 + }, + { + "epoch": 0.62, + "grad_norm": 1.3709297292637812, + "learning_rate": 6.674521582154945e-06, + "loss": 0.846, + "step": 3846 + }, + { + "epoch": 0.62, + "grad_norm": 1.3359358059721798, + "learning_rate": 6.669599634184145e-06, + "loss": 0.8867, + "step": 3847 + }, + { + "epoch": 0.62, + "grad_norm": 1.351903414177435, + "learning_rate": 6.6646785935062086e-06, + "loss": 0.8423, + "step": 3848 + }, + { + "epoch": 0.62, + "grad_norm": 1.5836209975158235, + "learning_rate": 6.65975846146176e-06, + "loss": 0.8498, + "step": 3849 + }, + { + "epoch": 0.62, + "grad_norm": 1.5405361012060257, + "learning_rate": 6.654839239391182e-06, + "loss": 0.8913, + "step": 3850 + }, + { + "epoch": 0.62, + "grad_norm": 1.292856445650205, + "learning_rate": 6.649920928634607e-06, + "loss": 0.8889, + "step": 3851 + }, + { + "epoch": 0.62, + "grad_norm": 1.510325321293648, + "learning_rate": 6.645003530531915e-06, + "loss": 0.753, + "step": 3852 + }, + { + "epoch": 0.62, + "grad_norm": 1.455399105332562, + "learning_rate": 6.640087046422739e-06, + "loss": 0.8586, + "step": 3853 + }, + { + "epoch": 0.62, + "grad_norm": 1.5319229369754304, + "learning_rate": 6.635171477646475e-06, + "loss": 0.8262, + "step": 3854 + }, + { + "epoch": 0.62, + "grad_norm": 1.3414757558103527, + "learning_rate": 6.630256825542245e-06, + "loss": 0.8045, + "step": 3855 + }, + { + "epoch": 0.62, + "grad_norm": 1.457036105293591, + "learning_rate": 6.625343091448948e-06, + "loss": 0.8217, + "step": 3856 + }, + { + "epoch": 0.62, + "grad_norm": 1.8820600391053, + "learning_rate": 6.620430276705215e-06, + "loss": 0.891, + "step": 3857 + }, + { + "epoch": 0.62, + "grad_norm": 1.3313039782711484, + "learning_rate": 6.615518382649433e-06, + "loss": 0.8003, + "step": 3858 + }, + { + "epoch": 0.62, + "grad_norm": 1.3311225820621155, + "learning_rate": 6.610607410619736e-06, + "loss": 0.8075, + "step": 3859 + }, + { + "epoch": 0.62, + "grad_norm": 1.2785665087039633, + "learning_rate": 6.605697361954009e-06, + "loss": 0.8465, + "step": 3860 + }, + { + "epoch": 0.62, + "grad_norm": 1.3221310916172697, + "learning_rate": 6.600788237989888e-06, + "loss": 0.8124, + "step": 3861 + }, + { + "epoch": 0.62, + "grad_norm": 1.5172563295142003, + "learning_rate": 6.595880040064748e-06, + "loss": 0.8953, + "step": 3862 + }, + { + "epoch": 0.62, + "grad_norm": 0.8194082430556378, + "learning_rate": 6.590972769515722e-06, + "loss": 0.3225, + "step": 3863 + }, + { + "epoch": 0.62, + "grad_norm": 1.4939210063369914, + "learning_rate": 6.5860664276796844e-06, + "loss": 0.8443, + "step": 3864 + }, + { + "epoch": 0.62, + "grad_norm": 1.3179444486619654, + "learning_rate": 6.581161015893257e-06, + "loss": 0.8405, + "step": 3865 + }, + { + "epoch": 0.62, + "grad_norm": 1.2136823585580507, + "learning_rate": 6.576256535492807e-06, + "loss": 0.8095, + "step": 3866 + }, + { + "epoch": 0.62, + "grad_norm": 1.3083798710561638, + "learning_rate": 6.571352987814459e-06, + "loss": 0.8156, + "step": 3867 + }, + { + "epoch": 0.62, + "grad_norm": 0.8883186901333191, + "learning_rate": 6.566450374194062e-06, + "loss": 0.3456, + "step": 3868 + }, + { + "epoch": 0.62, + "grad_norm": 1.639756726795095, + "learning_rate": 6.561548695967231e-06, + "loss": 0.8333, + "step": 3869 + }, + { + "epoch": 0.62, + "grad_norm": 1.3956511205683717, + "learning_rate": 6.55664795446932e-06, + "loss": 0.7575, + "step": 3870 + }, + { + "epoch": 0.62, + "grad_norm": 1.5620315189185812, + "learning_rate": 6.55174815103542e-06, + "loss": 0.8712, + "step": 3871 + }, + { + "epoch": 0.62, + "grad_norm": 1.3727895046453484, + "learning_rate": 6.546849287000374e-06, + "loss": 0.7996, + "step": 3872 + }, + { + "epoch": 0.62, + "grad_norm": 1.3549539133854809, + "learning_rate": 6.5419513636987735e-06, + "loss": 0.8423, + "step": 3873 + }, + { + "epoch": 0.62, + "grad_norm": 1.56441936027866, + "learning_rate": 6.53705438246494e-06, + "loss": 0.8715, + "step": 3874 + }, + { + "epoch": 0.62, + "grad_norm": 1.4146641938289164, + "learning_rate": 6.532158344632946e-06, + "loss": 0.7786, + "step": 3875 + }, + { + "epoch": 0.62, + "grad_norm": 1.539858001292727, + "learning_rate": 6.527263251536618e-06, + "loss": 0.8717, + "step": 3876 + }, + { + "epoch": 0.62, + "grad_norm": 1.2531133748999301, + "learning_rate": 6.522369104509501e-06, + "loss": 0.748, + "step": 3877 + }, + { + "epoch": 0.62, + "grad_norm": 1.3791909342711943, + "learning_rate": 6.517475904884903e-06, + "loss": 0.8501, + "step": 3878 + }, + { + "epoch": 0.62, + "grad_norm": 1.2987783329246365, + "learning_rate": 6.512583653995867e-06, + "loss": 0.8373, + "step": 3879 + }, + { + "epoch": 0.63, + "grad_norm": 1.1898795238177797, + "learning_rate": 6.507692353175172e-06, + "loss": 0.8104, + "step": 3880 + }, + { + "epoch": 0.63, + "grad_norm": 1.2514483978762605, + "learning_rate": 6.502802003755346e-06, + "loss": 0.8873, + "step": 3881 + }, + { + "epoch": 0.63, + "grad_norm": 1.023076055488548, + "learning_rate": 6.497912607068655e-06, + "loss": 0.3298, + "step": 3882 + }, + { + "epoch": 0.63, + "grad_norm": 1.3154740687149613, + "learning_rate": 6.49302416444711e-06, + "loss": 0.8374, + "step": 3883 + }, + { + "epoch": 0.63, + "grad_norm": 1.5074828681952899, + "learning_rate": 6.4881366772224475e-06, + "loss": 0.8782, + "step": 3884 + }, + { + "epoch": 0.63, + "grad_norm": 1.4126928220273647, + "learning_rate": 6.48325014672616e-06, + "loss": 0.8281, + "step": 3885 + }, + { + "epoch": 0.63, + "grad_norm": 1.4133387736838636, + "learning_rate": 6.478364574289475e-06, + "loss": 0.8034, + "step": 3886 + }, + { + "epoch": 0.63, + "grad_norm": 1.490236299846001, + "learning_rate": 6.473479961243353e-06, + "loss": 0.8885, + "step": 3887 + }, + { + "epoch": 0.63, + "grad_norm": 0.7767728045855768, + "learning_rate": 6.468596308918498e-06, + "loss": 0.3206, + "step": 3888 + }, + { + "epoch": 0.63, + "grad_norm": 1.4011839264988704, + "learning_rate": 6.463713618645354e-06, + "loss": 0.7966, + "step": 3889 + }, + { + "epoch": 0.63, + "grad_norm": 1.6773287629211742, + "learning_rate": 6.458831891754096e-06, + "loss": 0.7919, + "step": 3890 + }, + { + "epoch": 0.63, + "grad_norm": 1.4943671784022743, + "learning_rate": 6.453951129574644e-06, + "loss": 0.8574, + "step": 3891 + }, + { + "epoch": 0.63, + "grad_norm": 1.541524229015405, + "learning_rate": 6.449071333436654e-06, + "loss": 0.879, + "step": 3892 + }, + { + "epoch": 0.63, + "grad_norm": 1.4072891310124973, + "learning_rate": 6.4441925046695135e-06, + "loss": 0.7966, + "step": 3893 + }, + { + "epoch": 0.63, + "grad_norm": 1.4710360311103452, + "learning_rate": 6.43931464460235e-06, + "loss": 0.8769, + "step": 3894 + }, + { + "epoch": 0.63, + "grad_norm": 1.5640893952661052, + "learning_rate": 6.434437754564031e-06, + "loss": 0.8641, + "step": 3895 + }, + { + "epoch": 0.63, + "grad_norm": 1.460537627518122, + "learning_rate": 6.42956183588315e-06, + "loss": 0.7523, + "step": 3896 + }, + { + "epoch": 0.63, + "grad_norm": 1.6129954271919813, + "learning_rate": 6.424686889888044e-06, + "loss": 0.8699, + "step": 3897 + }, + { + "epoch": 0.63, + "grad_norm": 1.358066823454367, + "learning_rate": 6.419812917906788e-06, + "loss": 0.9098, + "step": 3898 + }, + { + "epoch": 0.63, + "grad_norm": 1.4520916308776415, + "learning_rate": 6.414939921267176e-06, + "loss": 0.8435, + "step": 3899 + }, + { + "epoch": 0.63, + "grad_norm": 1.5296102753201162, + "learning_rate": 6.410067901296755e-06, + "loss": 0.7561, + "step": 3900 + }, + { + "epoch": 0.63, + "grad_norm": 1.0983438001473078, + "learning_rate": 6.405196859322793e-06, + "loss": 0.7667, + "step": 3901 + }, + { + "epoch": 0.63, + "grad_norm": 1.4200359407545446, + "learning_rate": 6.400326796672302e-06, + "loss": 0.8492, + "step": 3902 + }, + { + "epoch": 0.63, + "grad_norm": 1.4481202966194264, + "learning_rate": 6.395457714672013e-06, + "loss": 0.7654, + "step": 3903 + }, + { + "epoch": 0.63, + "grad_norm": 1.3408678508980298, + "learning_rate": 6.390589614648401e-06, + "loss": 0.7966, + "step": 3904 + }, + { + "epoch": 0.63, + "grad_norm": 1.1945157426497406, + "learning_rate": 6.38572249792768e-06, + "loss": 0.8496, + "step": 3905 + }, + { + "epoch": 0.63, + "grad_norm": 1.3099271969298356, + "learning_rate": 6.38085636583577e-06, + "loss": 0.7965, + "step": 3906 + }, + { + "epoch": 0.63, + "grad_norm": 1.378734434732506, + "learning_rate": 6.375991219698352e-06, + "loss": 0.8123, + "step": 3907 + }, + { + "epoch": 0.63, + "grad_norm": 1.267391995693451, + "learning_rate": 6.371127060840823e-06, + "loss": 0.7871, + "step": 3908 + }, + { + "epoch": 0.63, + "grad_norm": 1.3450140451845496, + "learning_rate": 6.366263890588311e-06, + "loss": 0.8619, + "step": 3909 + }, + { + "epoch": 0.63, + "grad_norm": 1.3117907195656504, + "learning_rate": 6.3614017102656815e-06, + "loss": 0.8353, + "step": 3910 + }, + { + "epoch": 0.63, + "grad_norm": 1.6675054411463346, + "learning_rate": 6.356540521197525e-06, + "loss": 0.8298, + "step": 3911 + }, + { + "epoch": 0.63, + "grad_norm": 1.198745127664038, + "learning_rate": 6.351680324708162e-06, + "loss": 0.8509, + "step": 3912 + }, + { + "epoch": 0.63, + "grad_norm": 1.5782425991233708, + "learning_rate": 6.3468211221216425e-06, + "loss": 0.8921, + "step": 3913 + }, + { + "epoch": 0.63, + "grad_norm": 1.584570323596617, + "learning_rate": 6.341962914761753e-06, + "loss": 0.8397, + "step": 3914 + }, + { + "epoch": 0.63, + "grad_norm": 1.4637526065856552, + "learning_rate": 6.337105703952e-06, + "loss": 0.8771, + "step": 3915 + }, + { + "epoch": 0.63, + "grad_norm": 1.366055254760888, + "learning_rate": 6.3322494910156195e-06, + "loss": 0.8808, + "step": 3916 + }, + { + "epoch": 0.63, + "grad_norm": 1.373528994815268, + "learning_rate": 6.327394277275582e-06, + "loss": 0.8317, + "step": 3917 + }, + { + "epoch": 0.63, + "grad_norm": 1.2917610669601896, + "learning_rate": 6.322540064054578e-06, + "loss": 0.8568, + "step": 3918 + }, + { + "epoch": 0.63, + "grad_norm": 1.494186823884887, + "learning_rate": 6.317686852675029e-06, + "loss": 0.9097, + "step": 3919 + }, + { + "epoch": 0.63, + "grad_norm": 0.8268577324928732, + "learning_rate": 6.312834644459086e-06, + "loss": 0.3386, + "step": 3920 + }, + { + "epoch": 0.63, + "grad_norm": 1.6323789552576708, + "learning_rate": 6.307983440728626e-06, + "loss": 0.7836, + "step": 3921 + }, + { + "epoch": 0.63, + "grad_norm": 1.5594724936195266, + "learning_rate": 6.303133242805244e-06, + "loss": 0.8737, + "step": 3922 + }, + { + "epoch": 0.63, + "grad_norm": 1.094742007491149, + "learning_rate": 6.2982840520102725e-06, + "loss": 0.7931, + "step": 3923 + }, + { + "epoch": 0.63, + "grad_norm": 1.3402914607399476, + "learning_rate": 6.293435869664766e-06, + "loss": 0.7525, + "step": 3924 + }, + { + "epoch": 0.63, + "grad_norm": 1.4621459366784988, + "learning_rate": 6.2885886970894986e-06, + "loss": 0.8691, + "step": 3925 + }, + { + "epoch": 0.63, + "grad_norm": 1.3047944409834007, + "learning_rate": 6.283742535604973e-06, + "loss": 0.8786, + "step": 3926 + }, + { + "epoch": 0.63, + "grad_norm": 1.465103721437772, + "learning_rate": 6.2788973865314265e-06, + "loss": 0.8092, + "step": 3927 + }, + { + "epoch": 0.63, + "grad_norm": 0.7942771827985942, + "learning_rate": 6.274053251188798e-06, + "loss": 0.3504, + "step": 3928 + }, + { + "epoch": 0.63, + "grad_norm": 1.597226951149117, + "learning_rate": 6.269210130896773e-06, + "loss": 0.7935, + "step": 3929 + }, + { + "epoch": 0.63, + "grad_norm": 1.4703019859691049, + "learning_rate": 6.26436802697475e-06, + "loss": 0.8158, + "step": 3930 + }, + { + "epoch": 0.63, + "grad_norm": 1.5427191665246727, + "learning_rate": 6.259526940741848e-06, + "loss": 0.7648, + "step": 3931 + }, + { + "epoch": 0.63, + "grad_norm": 1.2875201629071407, + "learning_rate": 6.2546868735169134e-06, + "loss": 0.8319, + "step": 3932 + }, + { + "epoch": 0.63, + "grad_norm": 1.3976696369725188, + "learning_rate": 6.249847826618518e-06, + "loss": 0.881, + "step": 3933 + }, + { + "epoch": 0.63, + "grad_norm": 1.3154673383347013, + "learning_rate": 6.245009801364945e-06, + "loss": 0.8783, + "step": 3934 + }, + { + "epoch": 0.63, + "grad_norm": 1.5589811326473262, + "learning_rate": 6.240172799074207e-06, + "loss": 0.8259, + "step": 3935 + }, + { + "epoch": 0.63, + "grad_norm": 1.3403328129500314, + "learning_rate": 6.2353368210640415e-06, + "loss": 0.7911, + "step": 3936 + }, + { + "epoch": 0.63, + "grad_norm": 1.4445005552611252, + "learning_rate": 6.230501868651899e-06, + "loss": 0.7871, + "step": 3937 + }, + { + "epoch": 0.63, + "grad_norm": 1.3519677742509113, + "learning_rate": 6.225667943154956e-06, + "loss": 0.8888, + "step": 3938 + }, + { + "epoch": 0.63, + "grad_norm": 1.3556500692872318, + "learning_rate": 6.2208350458901035e-06, + "loss": 0.8431, + "step": 3939 + }, + { + "epoch": 0.63, + "grad_norm": 1.3622933561612653, + "learning_rate": 6.216003178173959e-06, + "loss": 0.8037, + "step": 3940 + }, + { + "epoch": 0.63, + "grad_norm": 1.6732516247414286, + "learning_rate": 6.211172341322853e-06, + "loss": 0.8262, + "step": 3941 + }, + { + "epoch": 0.64, + "grad_norm": 1.5471596227198163, + "learning_rate": 6.206342536652841e-06, + "loss": 0.8002, + "step": 3942 + }, + { + "epoch": 0.64, + "grad_norm": 1.3813001362247834, + "learning_rate": 6.201513765479699e-06, + "loss": 0.7342, + "step": 3943 + }, + { + "epoch": 0.64, + "grad_norm": 1.2193124669871471, + "learning_rate": 6.196686029118909e-06, + "loss": 0.7827, + "step": 3944 + }, + { + "epoch": 0.64, + "grad_norm": 1.3880225036788776, + "learning_rate": 6.191859328885685e-06, + "loss": 0.8141, + "step": 3945 + }, + { + "epoch": 0.64, + "grad_norm": 1.409709568420695, + "learning_rate": 6.1870336660949526e-06, + "loss": 0.8803, + "step": 3946 + }, + { + "epoch": 0.64, + "grad_norm": 1.2217121973751148, + "learning_rate": 6.182209042061353e-06, + "loss": 0.7302, + "step": 3947 + }, + { + "epoch": 0.64, + "grad_norm": 1.1732205224295202, + "learning_rate": 6.177385458099248e-06, + "loss": 0.8009, + "step": 3948 + }, + { + "epoch": 0.64, + "grad_norm": 0.7438549960378901, + "learning_rate": 6.1725629155227195e-06, + "loss": 0.3495, + "step": 3949 + }, + { + "epoch": 0.64, + "grad_norm": 0.8350387263731213, + "learning_rate": 6.167741415645552e-06, + "loss": 0.3581, + "step": 3950 + }, + { + "epoch": 0.64, + "grad_norm": 1.39328924804151, + "learning_rate": 6.16292095978126e-06, + "loss": 0.7978, + "step": 3951 + }, + { + "epoch": 0.64, + "grad_norm": 1.3524587106420576, + "learning_rate": 6.158101549243072e-06, + "loss": 0.9403, + "step": 3952 + }, + { + "epoch": 0.64, + "grad_norm": 1.29334385046575, + "learning_rate": 6.153283185343921e-06, + "loss": 0.8599, + "step": 3953 + }, + { + "epoch": 0.64, + "grad_norm": 1.310218875874768, + "learning_rate": 6.148465869396468e-06, + "loss": 0.8455, + "step": 3954 + }, + { + "epoch": 0.64, + "grad_norm": 1.342882829545087, + "learning_rate": 6.14364960271308e-06, + "loss": 0.8817, + "step": 3955 + }, + { + "epoch": 0.64, + "grad_norm": 1.3753952730236263, + "learning_rate": 6.13883438660584e-06, + "loss": 0.8258, + "step": 3956 + }, + { + "epoch": 0.64, + "grad_norm": 1.2785663105760143, + "learning_rate": 6.134020222386544e-06, + "loss": 0.9149, + "step": 3957 + }, + { + "epoch": 0.64, + "grad_norm": 1.4914870552984836, + "learning_rate": 6.1292071113667125e-06, + "loss": 0.8149, + "step": 3958 + }, + { + "epoch": 0.64, + "grad_norm": 1.5993073717486268, + "learning_rate": 6.124395054857557e-06, + "loss": 0.84, + "step": 3959 + }, + { + "epoch": 0.64, + "grad_norm": 1.9809127005303395, + "learning_rate": 6.119584054170022e-06, + "loss": 0.8733, + "step": 3960 + }, + { + "epoch": 0.64, + "grad_norm": 1.5134555243648176, + "learning_rate": 6.114774110614755e-06, + "loss": 0.8816, + "step": 3961 + }, + { + "epoch": 0.64, + "grad_norm": 1.3341747472038046, + "learning_rate": 6.109965225502119e-06, + "loss": 0.8165, + "step": 3962 + }, + { + "epoch": 0.64, + "grad_norm": 1.2948750412643426, + "learning_rate": 6.105157400142183e-06, + "loss": 0.737, + "step": 3963 + }, + { + "epoch": 0.64, + "grad_norm": 1.6163825528863194, + "learning_rate": 6.100350635844731e-06, + "loss": 0.8599, + "step": 3964 + }, + { + "epoch": 0.64, + "grad_norm": 1.530450086494435, + "learning_rate": 6.095544933919265e-06, + "loss": 0.9085, + "step": 3965 + }, + { + "epoch": 0.64, + "grad_norm": 1.1907440205530424, + "learning_rate": 6.090740295674984e-06, + "loss": 0.8493, + "step": 3966 + }, + { + "epoch": 0.64, + "grad_norm": 1.329265067296999, + "learning_rate": 6.085936722420806e-06, + "loss": 0.7813, + "step": 3967 + }, + { + "epoch": 0.64, + "grad_norm": 1.4042984182537657, + "learning_rate": 6.081134215465358e-06, + "loss": 0.9183, + "step": 3968 + }, + { + "epoch": 0.64, + "grad_norm": 1.3320858488331586, + "learning_rate": 6.076332776116971e-06, + "loss": 0.7692, + "step": 3969 + }, + { + "epoch": 0.64, + "grad_norm": 1.3569285439681797, + "learning_rate": 6.071532405683691e-06, + "loss": 0.8311, + "step": 3970 + }, + { + "epoch": 0.64, + "grad_norm": 1.6704253146128583, + "learning_rate": 6.066733105473279e-06, + "loss": 0.8487, + "step": 3971 + }, + { + "epoch": 0.64, + "grad_norm": 1.4449896039349552, + "learning_rate": 6.061934876793183e-06, + "loss": 0.8463, + "step": 3972 + }, + { + "epoch": 0.64, + "grad_norm": 1.6424342794403093, + "learning_rate": 6.057137720950583e-06, + "loss": 0.8732, + "step": 3973 + }, + { + "epoch": 0.64, + "grad_norm": 0.878657248510857, + "learning_rate": 6.052341639252356e-06, + "loss": 0.317, + "step": 3974 + }, + { + "epoch": 0.64, + "grad_norm": 1.270700163065299, + "learning_rate": 6.0475466330050815e-06, + "loss": 0.8394, + "step": 3975 + }, + { + "epoch": 0.64, + "grad_norm": 1.2124278507168977, + "learning_rate": 6.042752703515054e-06, + "loss": 0.8106, + "step": 3976 + }, + { + "epoch": 0.64, + "grad_norm": 1.4378982470423594, + "learning_rate": 6.037959852088275e-06, + "loss": 0.8534, + "step": 3977 + }, + { + "epoch": 0.64, + "grad_norm": 1.7260439037806536, + "learning_rate": 6.0331680800304436e-06, + "loss": 0.9067, + "step": 3978 + }, + { + "epoch": 0.64, + "grad_norm": 1.2420710315624883, + "learning_rate": 6.028377388646972e-06, + "loss": 0.8034, + "step": 3979 + }, + { + "epoch": 0.64, + "grad_norm": 1.262116983473999, + "learning_rate": 6.023587779242981e-06, + "loss": 0.8314, + "step": 3980 + }, + { + "epoch": 0.64, + "grad_norm": 1.6167055312494005, + "learning_rate": 6.0187992531232895e-06, + "loss": 0.8078, + "step": 3981 + }, + { + "epoch": 0.64, + "grad_norm": 1.4401756062530737, + "learning_rate": 6.014011811592424e-06, + "loss": 0.8451, + "step": 3982 + }, + { + "epoch": 0.64, + "grad_norm": 1.1415996845187493, + "learning_rate": 6.0092254559546134e-06, + "loss": 0.8364, + "step": 3983 + }, + { + "epoch": 0.64, + "grad_norm": 1.2672171012464313, + "learning_rate": 6.004440187513798e-06, + "loss": 0.8644, + "step": 3984 + }, + { + "epoch": 0.64, + "grad_norm": 1.3558756309695346, + "learning_rate": 5.999656007573612e-06, + "loss": 0.8497, + "step": 3985 + }, + { + "epoch": 0.64, + "grad_norm": 1.0870451959470988, + "learning_rate": 5.9948729174373975e-06, + "loss": 0.8393, + "step": 3986 + }, + { + "epoch": 0.64, + "grad_norm": 1.3579787635137754, + "learning_rate": 5.990090918408208e-06, + "loss": 0.8521, + "step": 3987 + }, + { + "epoch": 0.64, + "grad_norm": 1.4161645140145918, + "learning_rate": 5.985310011788781e-06, + "loss": 0.8378, + "step": 3988 + }, + { + "epoch": 0.64, + "grad_norm": 1.1578293920261569, + "learning_rate": 5.9805301988815735e-06, + "loss": 0.8489, + "step": 3989 + }, + { + "epoch": 0.64, + "grad_norm": 1.4376929182196696, + "learning_rate": 5.975751480988739e-06, + "loss": 0.8539, + "step": 3990 + }, + { + "epoch": 0.64, + "grad_norm": 1.3919597358876348, + "learning_rate": 5.970973859412128e-06, + "loss": 0.8758, + "step": 3991 + }, + { + "epoch": 0.64, + "grad_norm": 1.5679502648364925, + "learning_rate": 5.966197335453299e-06, + "loss": 0.7203, + "step": 3992 + }, + { + "epoch": 0.64, + "grad_norm": 1.5038221224254285, + "learning_rate": 5.96142191041351e-06, + "loss": 0.8686, + "step": 3993 + }, + { + "epoch": 0.64, + "grad_norm": 1.259228608163467, + "learning_rate": 5.956647585593713e-06, + "loss": 0.7934, + "step": 3994 + }, + { + "epoch": 0.64, + "grad_norm": 1.4696726586351705, + "learning_rate": 5.9518743622945715e-06, + "loss": 0.9047, + "step": 3995 + }, + { + "epoch": 0.64, + "grad_norm": 1.6539421607638625, + "learning_rate": 5.9471022418164425e-06, + "loss": 0.9049, + "step": 3996 + }, + { + "epoch": 0.64, + "grad_norm": 1.798190730860522, + "learning_rate": 5.942331225459379e-06, + "loss": 0.8001, + "step": 3997 + }, + { + "epoch": 0.64, + "grad_norm": 1.4203691466788804, + "learning_rate": 5.937561314523142e-06, + "loss": 0.8261, + "step": 3998 + }, + { + "epoch": 0.64, + "grad_norm": 1.3277820200394561, + "learning_rate": 5.93279251030718e-06, + "loss": 0.8414, + "step": 3999 + }, + { + "epoch": 0.64, + "grad_norm": 1.2097297398310745, + "learning_rate": 5.928024814110659e-06, + "loss": 0.8371, + "step": 4000 + }, + { + "epoch": 0.64, + "grad_norm": 1.4452443989612773, + "learning_rate": 5.923258227232418e-06, + "loss": 0.8248, + "step": 4001 + }, + { + "epoch": 0.64, + "grad_norm": 1.6710395480287648, + "learning_rate": 5.918492750971012e-06, + "loss": 0.7991, + "step": 4002 + }, + { + "epoch": 0.64, + "grad_norm": 1.2142557982160682, + "learning_rate": 5.913728386624691e-06, + "loss": 0.8768, + "step": 4003 + }, + { + "epoch": 0.65, + "grad_norm": 1.4379956360193322, + "learning_rate": 5.908965135491394e-06, + "loss": 0.8802, + "step": 4004 + }, + { + "epoch": 0.65, + "grad_norm": 1.462085253898475, + "learning_rate": 5.904202998868765e-06, + "loss": 0.8475, + "step": 4005 + }, + { + "epoch": 0.65, + "grad_norm": 1.5163560774254725, + "learning_rate": 5.899441978054141e-06, + "loss": 0.8627, + "step": 4006 + }, + { + "epoch": 0.65, + "grad_norm": 1.3948609192177073, + "learning_rate": 5.894682074344554e-06, + "loss": 0.8076, + "step": 4007 + }, + { + "epoch": 0.65, + "grad_norm": 1.3766523736177794, + "learning_rate": 5.889923289036732e-06, + "loss": 0.862, + "step": 4008 + }, + { + "epoch": 0.65, + "grad_norm": 1.3971974033074275, + "learning_rate": 5.885165623427106e-06, + "loss": 0.8892, + "step": 4009 + }, + { + "epoch": 0.65, + "grad_norm": 1.2614007769122986, + "learning_rate": 5.880409078811784e-06, + "loss": 0.8506, + "step": 4010 + }, + { + "epoch": 0.65, + "grad_norm": 1.4371098515000917, + "learning_rate": 5.875653656486588e-06, + "loss": 0.8635, + "step": 4011 + }, + { + "epoch": 0.65, + "grad_norm": 1.5648057066453844, + "learning_rate": 5.870899357747029e-06, + "loss": 0.842, + "step": 4012 + }, + { + "epoch": 0.65, + "grad_norm": 1.5073605580300786, + "learning_rate": 5.8661461838882995e-06, + "loss": 0.795, + "step": 4013 + }, + { + "epoch": 0.65, + "grad_norm": 1.2816969417989925, + "learning_rate": 5.861394136205301e-06, + "loss": 0.8809, + "step": 4014 + }, + { + "epoch": 0.65, + "grad_norm": 1.4620882018500523, + "learning_rate": 5.8566432159926215e-06, + "loss": 0.817, + "step": 4015 + }, + { + "epoch": 0.65, + "grad_norm": 1.7464170625974282, + "learning_rate": 5.851893424544541e-06, + "loss": 0.8155, + "step": 4016 + }, + { + "epoch": 0.65, + "grad_norm": 1.5602237458784107, + "learning_rate": 5.8471447631550335e-06, + "loss": 0.9038, + "step": 4017 + }, + { + "epoch": 0.65, + "grad_norm": 1.4702517664139347, + "learning_rate": 5.842397233117771e-06, + "loss": 0.9115, + "step": 4018 + }, + { + "epoch": 0.65, + "grad_norm": 1.3053117054077699, + "learning_rate": 5.837650835726105e-06, + "loss": 0.8173, + "step": 4019 + }, + { + "epoch": 0.65, + "grad_norm": 1.247199273666786, + "learning_rate": 5.832905572273084e-06, + "loss": 0.8227, + "step": 4020 + }, + { + "epoch": 0.65, + "grad_norm": 1.5306365270694602, + "learning_rate": 5.828161444051456e-06, + "loss": 0.9002, + "step": 4021 + }, + { + "epoch": 0.65, + "grad_norm": 1.471143086801859, + "learning_rate": 5.823418452353649e-06, + "loss": 0.8279, + "step": 4022 + }, + { + "epoch": 0.65, + "grad_norm": 1.6208905104119133, + "learning_rate": 5.8186765984717795e-06, + "loss": 0.8232, + "step": 4023 + }, + { + "epoch": 0.65, + "grad_norm": 1.456329880005253, + "learning_rate": 5.813935883697668e-06, + "loss": 0.8014, + "step": 4024 + }, + { + "epoch": 0.65, + "grad_norm": 1.2398354316725633, + "learning_rate": 5.809196309322811e-06, + "loss": 0.8212, + "step": 4025 + }, + { + "epoch": 0.65, + "grad_norm": 1.2797954235807563, + "learning_rate": 5.8044578766383945e-06, + "loss": 0.8271, + "step": 4026 + }, + { + "epoch": 0.65, + "grad_norm": 1.5748992152078602, + "learning_rate": 5.799720586935304e-06, + "loss": 0.6803, + "step": 4027 + }, + { + "epoch": 0.65, + "grad_norm": 1.2863578740672688, + "learning_rate": 5.794984441504114e-06, + "loss": 0.8957, + "step": 4028 + }, + { + "epoch": 0.65, + "grad_norm": 0.8837200664754753, + "learning_rate": 5.790249441635067e-06, + "loss": 0.3432, + "step": 4029 + }, + { + "epoch": 0.65, + "grad_norm": 1.4133217541661536, + "learning_rate": 5.785515588618115e-06, + "loss": 0.8378, + "step": 4030 + }, + { + "epoch": 0.65, + "grad_norm": 1.3577327838758657, + "learning_rate": 5.780782883742892e-06, + "loss": 0.7748, + "step": 4031 + }, + { + "epoch": 0.65, + "grad_norm": 1.4261684936968158, + "learning_rate": 5.7760513282987164e-06, + "loss": 0.7836, + "step": 4032 + }, + { + "epoch": 0.65, + "grad_norm": 1.3547482066373593, + "learning_rate": 5.771320923574588e-06, + "loss": 0.8069, + "step": 4033 + }, + { + "epoch": 0.65, + "grad_norm": 1.4813321648114053, + "learning_rate": 5.7665916708592095e-06, + "loss": 0.8125, + "step": 4034 + }, + { + "epoch": 0.65, + "grad_norm": 1.4367805856328193, + "learning_rate": 5.761863571440948e-06, + "loss": 0.8009, + "step": 4035 + }, + { + "epoch": 0.65, + "grad_norm": 1.2375233595472444, + "learning_rate": 5.757136626607881e-06, + "loss": 0.7962, + "step": 4036 + }, + { + "epoch": 0.65, + "grad_norm": 1.458239181209716, + "learning_rate": 5.75241083764775e-06, + "loss": 0.8251, + "step": 4037 + }, + { + "epoch": 0.65, + "grad_norm": 1.382116358714955, + "learning_rate": 5.747686205847991e-06, + "loss": 0.8765, + "step": 4038 + }, + { + "epoch": 0.65, + "grad_norm": 1.3325681946599095, + "learning_rate": 5.742962732495727e-06, + "loss": 0.8136, + "step": 4039 + }, + { + "epoch": 0.65, + "grad_norm": 1.4997090663499903, + "learning_rate": 5.738240418877758e-06, + "loss": 0.8496, + "step": 4040 + }, + { + "epoch": 0.65, + "grad_norm": 1.217586496867139, + "learning_rate": 5.7335192662805784e-06, + "loss": 0.8843, + "step": 4041 + }, + { + "epoch": 0.65, + "grad_norm": 1.4073555680070253, + "learning_rate": 5.728799275990352e-06, + "loss": 0.7038, + "step": 4042 + }, + { + "epoch": 0.65, + "grad_norm": 0.8884718559821404, + "learning_rate": 5.7240804492929435e-06, + "loss": 0.3639, + "step": 4043 + }, + { + "epoch": 0.65, + "grad_norm": 1.5819973259102533, + "learning_rate": 5.7193627874738874e-06, + "loss": 0.8256, + "step": 4044 + }, + { + "epoch": 0.65, + "grad_norm": 1.870378071687995, + "learning_rate": 5.714646291818401e-06, + "loss": 0.8135, + "step": 4045 + }, + { + "epoch": 0.65, + "grad_norm": 0.9817251565621882, + "learning_rate": 5.709930963611394e-06, + "loss": 0.345, + "step": 4046 + }, + { + "epoch": 0.65, + "grad_norm": 1.3398380418439002, + "learning_rate": 5.70521680413745e-06, + "loss": 0.8668, + "step": 4047 + }, + { + "epoch": 0.65, + "grad_norm": 1.6440664391106998, + "learning_rate": 5.700503814680831e-06, + "loss": 0.7886, + "step": 4048 + }, + { + "epoch": 0.65, + "grad_norm": 1.3966817399587208, + "learning_rate": 5.695791996525488e-06, + "loss": 0.8683, + "step": 4049 + }, + { + "epoch": 0.65, + "grad_norm": 1.3726707841476058, + "learning_rate": 5.691081350955061e-06, + "loss": 0.8752, + "step": 4050 + }, + { + "epoch": 0.65, + "grad_norm": 1.361613889444106, + "learning_rate": 5.686371879252841e-06, + "loss": 0.7818, + "step": 4051 + }, + { + "epoch": 0.65, + "grad_norm": 1.7504538730873453, + "learning_rate": 5.681663582701827e-06, + "loss": 0.7762, + "step": 4052 + }, + { + "epoch": 0.65, + "grad_norm": 1.3889290347522594, + "learning_rate": 5.676956462584693e-06, + "loss": 0.8072, + "step": 4053 + }, + { + "epoch": 0.65, + "grad_norm": 1.3255283767148927, + "learning_rate": 5.672250520183784e-06, + "loss": 0.8584, + "step": 4054 + }, + { + "epoch": 0.65, + "grad_norm": 1.3036688094172493, + "learning_rate": 5.6675457567811235e-06, + "loss": 0.8133, + "step": 4055 + }, + { + "epoch": 0.65, + "grad_norm": 1.361300981954292, + "learning_rate": 5.662842173658429e-06, + "loss": 0.8778, + "step": 4056 + }, + { + "epoch": 0.65, + "grad_norm": 1.4547585962194465, + "learning_rate": 5.65813977209708e-06, + "loss": 0.7704, + "step": 4057 + }, + { + "epoch": 0.65, + "grad_norm": 1.448547585736928, + "learning_rate": 5.653438553378137e-06, + "loss": 0.8397, + "step": 4058 + }, + { + "epoch": 0.65, + "grad_norm": 1.4918102416633083, + "learning_rate": 5.648738518782346e-06, + "loss": 0.8288, + "step": 4059 + }, + { + "epoch": 0.65, + "grad_norm": 0.8084724301854964, + "learning_rate": 5.64403966959013e-06, + "loss": 0.3354, + "step": 4060 + }, + { + "epoch": 0.65, + "grad_norm": 1.5034145269698331, + "learning_rate": 5.639342007081581e-06, + "loss": 0.8158, + "step": 4061 + }, + { + "epoch": 0.65, + "grad_norm": 1.493635524194898, + "learning_rate": 5.6346455325364664e-06, + "loss": 0.8768, + "step": 4062 + }, + { + "epoch": 0.65, + "grad_norm": 1.4314812939754407, + "learning_rate": 5.629950247234246e-06, + "loss": 0.8369, + "step": 4063 + }, + { + "epoch": 0.65, + "grad_norm": 1.2395893619862246, + "learning_rate": 5.625256152454035e-06, + "loss": 0.857, + "step": 4064 + }, + { + "epoch": 0.65, + "grad_norm": 1.5027821643797372, + "learning_rate": 5.620563249474642e-06, + "loss": 0.8198, + "step": 4065 + }, + { + "epoch": 0.66, + "grad_norm": 1.3521031176355667, + "learning_rate": 5.615871539574539e-06, + "loss": 0.8384, + "step": 4066 + }, + { + "epoch": 0.66, + "grad_norm": 1.3222561811957922, + "learning_rate": 5.611181024031875e-06, + "loss": 0.8643, + "step": 4067 + }, + { + "epoch": 0.66, + "grad_norm": 1.359295431636533, + "learning_rate": 5.606491704124482e-06, + "loss": 0.8021, + "step": 4068 + }, + { + "epoch": 0.66, + "grad_norm": 1.3018932214868242, + "learning_rate": 5.6018035811298566e-06, + "loss": 0.7823, + "step": 4069 + }, + { + "epoch": 0.66, + "grad_norm": 1.309949127057004, + "learning_rate": 5.597116656325169e-06, + "loss": 0.8076, + "step": 4070 + }, + { + "epoch": 0.66, + "grad_norm": 1.735967448999417, + "learning_rate": 5.5924309309872705e-06, + "loss": 0.8342, + "step": 4071 + }, + { + "epoch": 0.66, + "grad_norm": 1.3373301623399914, + "learning_rate": 5.587746406392689e-06, + "loss": 0.8414, + "step": 4072 + }, + { + "epoch": 0.66, + "grad_norm": 1.3232984126205791, + "learning_rate": 5.583063083817604e-06, + "loss": 0.8589, + "step": 4073 + }, + { + "epoch": 0.66, + "grad_norm": 1.3086289593244262, + "learning_rate": 5.578380964537889e-06, + "loss": 0.8777, + "step": 4074 + }, + { + "epoch": 0.66, + "grad_norm": 1.6483532694227965, + "learning_rate": 5.573700049829083e-06, + "loss": 0.7961, + "step": 4075 + }, + { + "epoch": 0.66, + "grad_norm": 1.4501485643654826, + "learning_rate": 5.569020340966398e-06, + "loss": 0.8308, + "step": 4076 + }, + { + "epoch": 0.66, + "grad_norm": 1.4840859501892305, + "learning_rate": 5.564341839224707e-06, + "loss": 0.8698, + "step": 4077 + }, + { + "epoch": 0.66, + "grad_norm": 1.3117336579066952, + "learning_rate": 5.55966454587857e-06, + "loss": 0.8803, + "step": 4078 + }, + { + "epoch": 0.66, + "grad_norm": 1.3251758410963688, + "learning_rate": 5.5549884622022165e-06, + "loss": 0.7994, + "step": 4079 + }, + { + "epoch": 0.66, + "grad_norm": 1.4255004727189236, + "learning_rate": 5.550313589469525e-06, + "loss": 0.8774, + "step": 4080 + }, + { + "epoch": 0.66, + "grad_norm": 1.5354747247739804, + "learning_rate": 5.54563992895407e-06, + "loss": 0.8786, + "step": 4081 + }, + { + "epoch": 0.66, + "grad_norm": 1.4491303743422195, + "learning_rate": 5.540967481929085e-06, + "loss": 0.8136, + "step": 4082 + }, + { + "epoch": 0.66, + "grad_norm": 1.3099310955432473, + "learning_rate": 5.536296249667472e-06, + "loss": 0.8844, + "step": 4083 + }, + { + "epoch": 0.66, + "grad_norm": 1.368648701206005, + "learning_rate": 5.531626233441802e-06, + "loss": 0.8422, + "step": 4084 + }, + { + "epoch": 0.66, + "grad_norm": 1.1280260924222627, + "learning_rate": 5.526957434524319e-06, + "loss": 0.7933, + "step": 4085 + }, + { + "epoch": 0.66, + "grad_norm": 1.3166774369964485, + "learning_rate": 5.522289854186928e-06, + "loss": 0.8385, + "step": 4086 + }, + { + "epoch": 0.66, + "grad_norm": 1.3558668121006436, + "learning_rate": 5.517623493701213e-06, + "loss": 0.8111, + "step": 4087 + }, + { + "epoch": 0.66, + "grad_norm": 1.524344565489292, + "learning_rate": 5.512958354338416e-06, + "loss": 0.8015, + "step": 4088 + }, + { + "epoch": 0.66, + "grad_norm": 1.2776905516453796, + "learning_rate": 5.508294437369445e-06, + "loss": 0.8844, + "step": 4089 + }, + { + "epoch": 0.66, + "grad_norm": 1.465207931388753, + "learning_rate": 5.503631744064888e-06, + "loss": 0.8045, + "step": 4090 + }, + { + "epoch": 0.66, + "grad_norm": 1.3721645713141295, + "learning_rate": 5.498970275694987e-06, + "loss": 0.9016, + "step": 4091 + }, + { + "epoch": 0.66, + "grad_norm": 1.5834572369998123, + "learning_rate": 5.494310033529651e-06, + "loss": 0.8416, + "step": 4092 + }, + { + "epoch": 0.66, + "grad_norm": 1.2643175833799456, + "learning_rate": 5.489651018838462e-06, + "loss": 0.8088, + "step": 4093 + }, + { + "epoch": 0.66, + "grad_norm": 1.2772354625449334, + "learning_rate": 5.48499323289067e-06, + "loss": 0.8256, + "step": 4094 + }, + { + "epoch": 0.66, + "grad_norm": 1.5651003288236798, + "learning_rate": 5.480336676955174e-06, + "loss": 0.8689, + "step": 4095 + }, + { + "epoch": 0.66, + "grad_norm": 1.4412919686663415, + "learning_rate": 5.475681352300549e-06, + "loss": 0.8293, + "step": 4096 + }, + { + "epoch": 0.66, + "grad_norm": 1.9230907750547812, + "learning_rate": 5.471027260195043e-06, + "loss": 0.8567, + "step": 4097 + }, + { + "epoch": 0.66, + "grad_norm": 1.2018629020863378, + "learning_rate": 5.466374401906552e-06, + "loss": 0.9062, + "step": 4098 + }, + { + "epoch": 0.66, + "grad_norm": 1.3295145899573564, + "learning_rate": 5.461722778702641e-06, + "loss": 0.8589, + "step": 4099 + }, + { + "epoch": 0.66, + "grad_norm": 0.855081309368814, + "learning_rate": 5.457072391850543e-06, + "loss": 0.3295, + "step": 4100 + }, + { + "epoch": 0.66, + "grad_norm": 1.405920672266167, + "learning_rate": 5.45242324261716e-06, + "loss": 0.8583, + "step": 4101 + }, + { + "epoch": 0.66, + "grad_norm": 1.4551644114057882, + "learning_rate": 5.447775332269032e-06, + "loss": 0.796, + "step": 4102 + }, + { + "epoch": 0.66, + "grad_norm": 1.8446735729888617, + "learning_rate": 5.443128662072388e-06, + "loss": 0.8019, + "step": 4103 + }, + { + "epoch": 0.66, + "grad_norm": 1.4738961187421096, + "learning_rate": 5.43848323329311e-06, + "loss": 0.8213, + "step": 4104 + }, + { + "epoch": 0.66, + "grad_norm": 1.565910716407285, + "learning_rate": 5.433839047196738e-06, + "loss": 0.7797, + "step": 4105 + }, + { + "epoch": 0.66, + "grad_norm": 1.6286963314989293, + "learning_rate": 5.429196105048473e-06, + "loss": 0.8268, + "step": 4106 + }, + { + "epoch": 0.66, + "grad_norm": 1.2586299811996409, + "learning_rate": 5.424554408113188e-06, + "loss": 0.8165, + "step": 4107 + }, + { + "epoch": 0.66, + "grad_norm": 1.3218611626176995, + "learning_rate": 5.4199139576554046e-06, + "loss": 0.8007, + "step": 4108 + }, + { + "epoch": 0.66, + "grad_norm": 0.8196061249231192, + "learning_rate": 5.4152747549393055e-06, + "loss": 0.3346, + "step": 4109 + }, + { + "epoch": 0.66, + "grad_norm": 1.2826753355403062, + "learning_rate": 5.410636801228748e-06, + "loss": 0.9353, + "step": 4110 + }, + { + "epoch": 0.66, + "grad_norm": 1.3633835161211914, + "learning_rate": 5.406000097787226e-06, + "loss": 0.8477, + "step": 4111 + }, + { + "epoch": 0.66, + "grad_norm": 1.1942060595688722, + "learning_rate": 5.4013646458779175e-06, + "loss": 0.7308, + "step": 4112 + }, + { + "epoch": 0.66, + "grad_norm": 1.504643966514461, + "learning_rate": 5.396730446763641e-06, + "loss": 0.8846, + "step": 4113 + }, + { + "epoch": 0.66, + "grad_norm": 1.3896823833337928, + "learning_rate": 5.392097501706877e-06, + "loss": 0.8687, + "step": 4114 + }, + { + "epoch": 0.66, + "grad_norm": 1.3155873132696965, + "learning_rate": 5.38746581196977e-06, + "loss": 0.8437, + "step": 4115 + }, + { + "epoch": 0.66, + "grad_norm": 1.4463019772918246, + "learning_rate": 5.382835378814129e-06, + "loss": 0.8511, + "step": 4116 + }, + { + "epoch": 0.66, + "grad_norm": 1.704193711726875, + "learning_rate": 5.378206203501397e-06, + "loss": 0.8727, + "step": 4117 + }, + { + "epoch": 0.66, + "grad_norm": 1.3219415945216757, + "learning_rate": 5.373578287292694e-06, + "loss": 0.8379, + "step": 4118 + }, + { + "epoch": 0.66, + "grad_norm": 1.3201682146648561, + "learning_rate": 5.3689516314488e-06, + "loss": 0.7444, + "step": 4119 + }, + { + "epoch": 0.66, + "grad_norm": 1.4252850701223332, + "learning_rate": 5.364326237230135e-06, + "loss": 0.8247, + "step": 4120 + }, + { + "epoch": 0.66, + "grad_norm": 1.536274852478914, + "learning_rate": 5.359702105896783e-06, + "loss": 0.7844, + "step": 4121 + }, + { + "epoch": 0.66, + "grad_norm": 1.4154694233808633, + "learning_rate": 5.355079238708487e-06, + "loss": 0.8705, + "step": 4122 + }, + { + "epoch": 0.66, + "grad_norm": 1.3561580606470416, + "learning_rate": 5.350457636924654e-06, + "loss": 0.8456, + "step": 4123 + }, + { + "epoch": 0.66, + "grad_norm": 1.3336741022385572, + "learning_rate": 5.345837301804317e-06, + "loss": 0.8787, + "step": 4124 + }, + { + "epoch": 0.66, + "grad_norm": 1.3689592869889338, + "learning_rate": 5.341218234606192e-06, + "loss": 0.8937, + "step": 4125 + }, + { + "epoch": 0.66, + "grad_norm": 1.7517368444692838, + "learning_rate": 5.336600436588644e-06, + "loss": 0.8158, + "step": 4126 + }, + { + "epoch": 0.66, + "grad_norm": 1.1664041462136399, + "learning_rate": 5.331983909009685e-06, + "loss": 0.8189, + "step": 4127 + }, + { + "epoch": 0.67, + "grad_norm": 1.4077388134204758, + "learning_rate": 5.327368653126978e-06, + "loss": 0.8371, + "step": 4128 + }, + { + "epoch": 0.67, + "grad_norm": 1.3650620502512707, + "learning_rate": 5.322754670197859e-06, + "loss": 0.894, + "step": 4129 + }, + { + "epoch": 0.67, + "grad_norm": 1.716730906376721, + "learning_rate": 5.318141961479293e-06, + "loss": 0.866, + "step": 4130 + }, + { + "epoch": 0.67, + "grad_norm": 1.2337296110005247, + "learning_rate": 5.31353052822791e-06, + "loss": 0.828, + "step": 4131 + }, + { + "epoch": 0.67, + "grad_norm": 1.3861016246877065, + "learning_rate": 5.3089203717e-06, + "loss": 0.8222, + "step": 4132 + }, + { + "epoch": 0.67, + "grad_norm": 1.4938032816617672, + "learning_rate": 5.304311493151486e-06, + "loss": 0.888, + "step": 4133 + }, + { + "epoch": 0.67, + "grad_norm": 1.71518135983593, + "learning_rate": 5.299703893837963e-06, + "loss": 0.8306, + "step": 4134 + }, + { + "epoch": 0.67, + "grad_norm": 1.2921420447993095, + "learning_rate": 5.2950975750146635e-06, + "loss": 0.8228, + "step": 4135 + }, + { + "epoch": 0.67, + "grad_norm": 1.4195008302976238, + "learning_rate": 5.290492537936473e-06, + "loss": 0.835, + "step": 4136 + }, + { + "epoch": 0.67, + "grad_norm": 1.3045007352867344, + "learning_rate": 5.285888783857935e-06, + "loss": 0.8481, + "step": 4137 + }, + { + "epoch": 0.67, + "grad_norm": 1.4921538888431292, + "learning_rate": 5.281286314033236e-06, + "loss": 0.7515, + "step": 4138 + }, + { + "epoch": 0.67, + "grad_norm": 1.3416922138380263, + "learning_rate": 5.27668512971622e-06, + "loss": 0.9012, + "step": 4139 + }, + { + "epoch": 0.67, + "grad_norm": 1.5248734889505318, + "learning_rate": 5.27208523216037e-06, + "loss": 0.8502, + "step": 4140 + }, + { + "epoch": 0.67, + "grad_norm": 1.4403566128573408, + "learning_rate": 5.267486622618833e-06, + "loss": 0.8313, + "step": 4141 + }, + { + "epoch": 0.67, + "grad_norm": 1.188131326529266, + "learning_rate": 5.262889302344391e-06, + "loss": 0.8126, + "step": 4142 + }, + { + "epoch": 0.67, + "grad_norm": 1.6992531400247863, + "learning_rate": 5.2582932725894785e-06, + "loss": 0.7718, + "step": 4143 + }, + { + "epoch": 0.67, + "grad_norm": 1.5632700810564857, + "learning_rate": 5.253698534606186e-06, + "loss": 0.8667, + "step": 4144 + }, + { + "epoch": 0.67, + "grad_norm": 1.3871500425841599, + "learning_rate": 5.249105089646252e-06, + "loss": 0.8133, + "step": 4145 + }, + { + "epoch": 0.67, + "grad_norm": 1.3471871057017366, + "learning_rate": 5.244512938961044e-06, + "loss": 0.8264, + "step": 4146 + }, + { + "epoch": 0.67, + "grad_norm": 1.4019634779215433, + "learning_rate": 5.239922083801597e-06, + "loss": 0.8421, + "step": 4147 + }, + { + "epoch": 0.67, + "grad_norm": 1.8824573214978029, + "learning_rate": 5.235332525418588e-06, + "loss": 0.8294, + "step": 4148 + }, + { + "epoch": 0.67, + "grad_norm": 1.3021465681139015, + "learning_rate": 5.23074426506234e-06, + "loss": 0.8288, + "step": 4149 + }, + { + "epoch": 0.67, + "grad_norm": 1.5319928172697292, + "learning_rate": 5.226157303982815e-06, + "loss": 0.8887, + "step": 4150 + }, + { + "epoch": 0.67, + "grad_norm": 1.5424103245449776, + "learning_rate": 5.221571643429637e-06, + "loss": 0.8237, + "step": 4151 + }, + { + "epoch": 0.67, + "grad_norm": 1.5012499275533069, + "learning_rate": 5.216987284652061e-06, + "loss": 0.8309, + "step": 4152 + }, + { + "epoch": 0.67, + "grad_norm": 1.4417295603772156, + "learning_rate": 5.21240422889899e-06, + "loss": 0.8205, + "step": 4153 + }, + { + "epoch": 0.67, + "grad_norm": 1.4890165621355274, + "learning_rate": 5.20782247741898e-06, + "loss": 0.8327, + "step": 4154 + }, + { + "epoch": 0.67, + "grad_norm": 1.428936557855059, + "learning_rate": 5.203242031460222e-06, + "loss": 0.7972, + "step": 4155 + }, + { + "epoch": 0.67, + "grad_norm": 1.4052990042221027, + "learning_rate": 5.1986628922705605e-06, + "loss": 0.8333, + "step": 4156 + }, + { + "epoch": 0.67, + "grad_norm": 1.444153480792433, + "learning_rate": 5.194085061097474e-06, + "loss": 0.863, + "step": 4157 + }, + { + "epoch": 0.67, + "grad_norm": 1.5130336474227828, + "learning_rate": 5.189508539188097e-06, + "loss": 0.8071, + "step": 4158 + }, + { + "epoch": 0.67, + "grad_norm": 1.3586003892488454, + "learning_rate": 5.1849333277891946e-06, + "loss": 0.8332, + "step": 4159 + }, + { + "epoch": 0.67, + "grad_norm": 1.3918935703624078, + "learning_rate": 5.180359428147179e-06, + "loss": 0.8001, + "step": 4160 + }, + { + "epoch": 0.67, + "grad_norm": 1.4827910555383252, + "learning_rate": 5.175786841508113e-06, + "loss": 0.8029, + "step": 4161 + }, + { + "epoch": 0.67, + "grad_norm": 1.430665932091022, + "learning_rate": 5.1712155691176865e-06, + "loss": 0.8622, + "step": 4162 + }, + { + "epoch": 0.67, + "grad_norm": 1.931053049112487, + "learning_rate": 5.166645612221251e-06, + "loss": 0.8721, + "step": 4163 + }, + { + "epoch": 0.67, + "grad_norm": 1.5829165783618364, + "learning_rate": 5.162076972063781e-06, + "loss": 0.7307, + "step": 4164 + }, + { + "epoch": 0.67, + "grad_norm": 1.6689632168161572, + "learning_rate": 5.1575096498899e-06, + "loss": 0.8096, + "step": 4165 + }, + { + "epoch": 0.67, + "grad_norm": 1.4894216003403309, + "learning_rate": 5.152943646943876e-06, + "loss": 0.8247, + "step": 4166 + }, + { + "epoch": 0.67, + "grad_norm": 1.2528460319343926, + "learning_rate": 5.148378964469615e-06, + "loss": 0.812, + "step": 4167 + }, + { + "epoch": 0.67, + "grad_norm": 1.1767225782572486, + "learning_rate": 5.143815603710654e-06, + "loss": 0.801, + "step": 4168 + }, + { + "epoch": 0.67, + "grad_norm": 1.5673012875659664, + "learning_rate": 5.139253565910185e-06, + "loss": 0.7431, + "step": 4169 + }, + { + "epoch": 0.67, + "grad_norm": 1.370067746445637, + "learning_rate": 5.134692852311035e-06, + "loss": 0.7779, + "step": 4170 + }, + { + "epoch": 0.67, + "grad_norm": 1.4659902834969942, + "learning_rate": 5.1301334641556665e-06, + "loss": 0.8194, + "step": 4171 + }, + { + "epoch": 0.67, + "grad_norm": 1.3976580402838306, + "learning_rate": 5.125575402686176e-06, + "loss": 0.9117, + "step": 4172 + }, + { + "epoch": 0.67, + "grad_norm": 1.2469693862235234, + "learning_rate": 5.121018669144313e-06, + "loss": 0.8586, + "step": 4173 + }, + { + "epoch": 0.67, + "grad_norm": 1.691663335992051, + "learning_rate": 5.116463264771456e-06, + "loss": 0.8383, + "step": 4174 + }, + { + "epoch": 0.67, + "grad_norm": 1.3874890649344223, + "learning_rate": 5.111909190808617e-06, + "loss": 0.7871, + "step": 4175 + }, + { + "epoch": 0.67, + "grad_norm": 1.7953108033055187, + "learning_rate": 5.107356448496459e-06, + "loss": 0.7561, + "step": 4176 + }, + { + "epoch": 0.67, + "grad_norm": 1.6476175565051514, + "learning_rate": 5.102805039075267e-06, + "loss": 0.806, + "step": 4177 + }, + { + "epoch": 0.67, + "grad_norm": 1.2533106065041095, + "learning_rate": 5.098254963784979e-06, + "loss": 0.8747, + "step": 4178 + }, + { + "epoch": 0.67, + "grad_norm": 1.530157907089299, + "learning_rate": 5.093706223865151e-06, + "loss": 0.8378, + "step": 4179 + }, + { + "epoch": 0.67, + "grad_norm": 1.4607250740016877, + "learning_rate": 5.089158820554996e-06, + "loss": 0.8273, + "step": 4180 + }, + { + "epoch": 0.67, + "grad_norm": 0.767049993576241, + "learning_rate": 5.084612755093346e-06, + "loss": 0.3893, + "step": 4181 + }, + { + "epoch": 0.67, + "grad_norm": 1.9875126919626056, + "learning_rate": 5.08006802871867e-06, + "loss": 0.8459, + "step": 4182 + }, + { + "epoch": 0.67, + "grad_norm": 1.404159340204978, + "learning_rate": 5.075524642669086e-06, + "loss": 0.8032, + "step": 4183 + }, + { + "epoch": 0.67, + "grad_norm": 0.9055791336657648, + "learning_rate": 5.07098259818233e-06, + "loss": 0.3658, + "step": 4184 + }, + { + "epoch": 0.67, + "grad_norm": 1.3789108751775963, + "learning_rate": 5.066441896495786e-06, + "loss": 0.8222, + "step": 4185 + }, + { + "epoch": 0.67, + "grad_norm": 1.7762124794798146, + "learning_rate": 5.061902538846466e-06, + "loss": 0.7915, + "step": 4186 + }, + { + "epoch": 0.67, + "grad_norm": 1.215253719383097, + "learning_rate": 5.057364526471008e-06, + "loss": 0.8712, + "step": 4187 + }, + { + "epoch": 0.67, + "grad_norm": 1.2735209839901582, + "learning_rate": 5.052827860605702e-06, + "loss": 0.8714, + "step": 4188 + }, + { + "epoch": 0.67, + "grad_norm": 1.3704509736368349, + "learning_rate": 5.048292542486457e-06, + "loss": 0.8347, + "step": 4189 + }, + { + "epoch": 0.68, + "grad_norm": 1.2014146102093959, + "learning_rate": 5.0437585733488135e-06, + "loss": 0.7817, + "step": 4190 + }, + { + "epoch": 0.68, + "grad_norm": 1.1074998254606334, + "learning_rate": 5.039225954427953e-06, + "loss": 0.8352, + "step": 4191 + }, + { + "epoch": 0.68, + "grad_norm": 1.6573681869190582, + "learning_rate": 5.034694686958692e-06, + "loss": 0.8487, + "step": 4192 + }, + { + "epoch": 0.68, + "grad_norm": 1.3367422835041438, + "learning_rate": 5.030164772175469e-06, + "loss": 0.7788, + "step": 4193 + }, + { + "epoch": 0.68, + "grad_norm": 1.6694863247751166, + "learning_rate": 5.025636211312351e-06, + "loss": 0.8542, + "step": 4194 + }, + { + "epoch": 0.68, + "grad_norm": 1.5172708831212016, + "learning_rate": 5.021109005603053e-06, + "loss": 0.8969, + "step": 4195 + }, + { + "epoch": 0.68, + "grad_norm": 1.5531020047176003, + "learning_rate": 5.016583156280906e-06, + "loss": 0.8551, + "step": 4196 + }, + { + "epoch": 0.68, + "grad_norm": 1.597165706869662, + "learning_rate": 5.012058664578871e-06, + "loss": 0.8361, + "step": 4197 + }, + { + "epoch": 0.68, + "grad_norm": 1.565971783366627, + "learning_rate": 5.007535531729548e-06, + "loss": 0.8746, + "step": 4198 + }, + { + "epoch": 0.68, + "grad_norm": 1.2939333594867741, + "learning_rate": 5.003013758965171e-06, + "loss": 0.8803, + "step": 4199 + }, + { + "epoch": 0.68, + "grad_norm": 1.735253318920913, + "learning_rate": 4.9984933475175865e-06, + "loss": 0.7513, + "step": 4200 + }, + { + "epoch": 0.68, + "grad_norm": 1.3502436733096745, + "learning_rate": 4.9939742986182795e-06, + "loss": 0.8239, + "step": 4201 + }, + { + "epoch": 0.68, + "grad_norm": 1.423550296751868, + "learning_rate": 4.989456613498368e-06, + "loss": 0.7908, + "step": 4202 + }, + { + "epoch": 0.68, + "grad_norm": 1.344387202080036, + "learning_rate": 4.9849402933885915e-06, + "loss": 0.868, + "step": 4203 + }, + { + "epoch": 0.68, + "grad_norm": 1.3923112873098311, + "learning_rate": 4.980425339519316e-06, + "loss": 0.8074, + "step": 4204 + }, + { + "epoch": 0.68, + "grad_norm": 1.4965230360740287, + "learning_rate": 4.975911753120548e-06, + "loss": 0.8724, + "step": 4205 + }, + { + "epoch": 0.68, + "grad_norm": 1.4713633702983342, + "learning_rate": 4.971399535421904e-06, + "loss": 0.8581, + "step": 4206 + }, + { + "epoch": 0.68, + "grad_norm": 1.7559516270412578, + "learning_rate": 4.966888687652645e-06, + "loss": 0.8259, + "step": 4207 + }, + { + "epoch": 0.68, + "grad_norm": 1.5932226488967949, + "learning_rate": 4.9623792110416454e-06, + "loss": 0.8088, + "step": 4208 + }, + { + "epoch": 0.68, + "grad_norm": 1.3742652573856473, + "learning_rate": 4.9578711068174076e-06, + "loss": 0.8385, + "step": 4209 + }, + { + "epoch": 0.68, + "grad_norm": 1.450995553909466, + "learning_rate": 4.953364376208072e-06, + "loss": 0.805, + "step": 4210 + }, + { + "epoch": 0.68, + "grad_norm": 1.3650553644614798, + "learning_rate": 4.948859020441391e-06, + "loss": 0.7591, + "step": 4211 + }, + { + "epoch": 0.68, + "grad_norm": 1.3585927709068117, + "learning_rate": 4.944355040744745e-06, + "loss": 0.8074, + "step": 4212 + }, + { + "epoch": 0.68, + "grad_norm": 1.566671540443173, + "learning_rate": 4.939852438345145e-06, + "loss": 0.8423, + "step": 4213 + }, + { + "epoch": 0.68, + "grad_norm": 0.947401078361531, + "learning_rate": 4.93535121446923e-06, + "loss": 0.3548, + "step": 4214 + }, + { + "epoch": 0.68, + "grad_norm": 1.2614752070132242, + "learning_rate": 4.93085137034325e-06, + "loss": 0.8239, + "step": 4215 + }, + { + "epoch": 0.68, + "grad_norm": 1.377570802384327, + "learning_rate": 4.926352907193086e-06, + "loss": 0.8483, + "step": 4216 + }, + { + "epoch": 0.68, + "grad_norm": 1.579806181819675, + "learning_rate": 4.921855826244249e-06, + "loss": 0.9109, + "step": 4217 + }, + { + "epoch": 0.68, + "grad_norm": 1.2732063435434646, + "learning_rate": 4.917360128721865e-06, + "loss": 0.8378, + "step": 4218 + }, + { + "epoch": 0.68, + "grad_norm": 1.4535796117038635, + "learning_rate": 4.912865815850682e-06, + "loss": 0.8732, + "step": 4219 + }, + { + "epoch": 0.68, + "grad_norm": 1.4451757224547972, + "learning_rate": 4.908372888855078e-06, + "loss": 0.7574, + "step": 4220 + }, + { + "epoch": 0.68, + "grad_norm": 1.1897169937793317, + "learning_rate": 4.903881348959055e-06, + "loss": 0.8207, + "step": 4221 + }, + { + "epoch": 0.68, + "grad_norm": 1.5176649737698933, + "learning_rate": 4.899391197386229e-06, + "loss": 0.8761, + "step": 4222 + }, + { + "epoch": 0.68, + "grad_norm": 0.8269583901484135, + "learning_rate": 4.894902435359834e-06, + "loss": 0.3339, + "step": 4223 + }, + { + "epoch": 0.68, + "grad_norm": 0.9487676254904482, + "learning_rate": 4.890415064102744e-06, + "loss": 0.3254, + "step": 4224 + }, + { + "epoch": 0.68, + "grad_norm": 1.433907412820224, + "learning_rate": 4.885929084837436e-06, + "loss": 0.8873, + "step": 4225 + }, + { + "epoch": 0.68, + "grad_norm": 1.4160452477875352, + "learning_rate": 4.8814444987860125e-06, + "loss": 0.8446, + "step": 4226 + }, + { + "epoch": 0.68, + "grad_norm": 1.3603272562532365, + "learning_rate": 4.876961307170204e-06, + "loss": 0.8355, + "step": 4227 + }, + { + "epoch": 0.68, + "grad_norm": 1.2530137748926036, + "learning_rate": 4.87247951121135e-06, + "loss": 0.8811, + "step": 4228 + }, + { + "epoch": 0.68, + "grad_norm": 1.5037231705437166, + "learning_rate": 4.867999112130422e-06, + "loss": 0.7822, + "step": 4229 + }, + { + "epoch": 0.68, + "grad_norm": 1.310163035230178, + "learning_rate": 4.863520111147999e-06, + "loss": 0.8287, + "step": 4230 + }, + { + "epoch": 0.68, + "grad_norm": 1.315104518707782, + "learning_rate": 4.859042509484283e-06, + "loss": 0.8107, + "step": 4231 + }, + { + "epoch": 0.68, + "grad_norm": 1.6038638490701458, + "learning_rate": 4.854566308359102e-06, + "loss": 0.8739, + "step": 4232 + }, + { + "epoch": 0.68, + "grad_norm": 1.3270457756469651, + "learning_rate": 4.850091508991893e-06, + "loss": 0.7879, + "step": 4233 + }, + { + "epoch": 0.68, + "grad_norm": 1.6159304107428625, + "learning_rate": 4.845618112601712e-06, + "loss": 0.7815, + "step": 4234 + }, + { + "epoch": 0.68, + "grad_norm": 1.5436037631042647, + "learning_rate": 4.841146120407239e-06, + "loss": 0.7474, + "step": 4235 + }, + { + "epoch": 0.68, + "grad_norm": 1.353050811299745, + "learning_rate": 4.836675533626769e-06, + "loss": 0.8678, + "step": 4236 + }, + { + "epoch": 0.68, + "grad_norm": 1.5235886462132675, + "learning_rate": 4.832206353478213e-06, + "loss": 0.8524, + "step": 4237 + }, + { + "epoch": 0.68, + "grad_norm": 1.357256117655531, + "learning_rate": 4.8277385811790946e-06, + "loss": 0.7329, + "step": 4238 + }, + { + "epoch": 0.68, + "grad_norm": 1.5672266047468137, + "learning_rate": 4.823272217946563e-06, + "loss": 0.8705, + "step": 4239 + }, + { + "epoch": 0.68, + "grad_norm": 1.5066919035439938, + "learning_rate": 4.8188072649973775e-06, + "loss": 0.8568, + "step": 4240 + }, + { + "epoch": 0.68, + "grad_norm": 1.414692065142792, + "learning_rate": 4.8143437235479085e-06, + "loss": 0.7798, + "step": 4241 + }, + { + "epoch": 0.68, + "grad_norm": 1.3251039428528075, + "learning_rate": 4.809881594814154e-06, + "loss": 0.7959, + "step": 4242 + }, + { + "epoch": 0.68, + "grad_norm": 1.3910431541261903, + "learning_rate": 4.805420880011723e-06, + "loss": 0.8846, + "step": 4243 + }, + { + "epoch": 0.68, + "grad_norm": 1.383795968377024, + "learning_rate": 4.800961580355833e-06, + "loss": 0.8464, + "step": 4244 + }, + { + "epoch": 0.68, + "grad_norm": 1.2411846282655783, + "learning_rate": 4.7965036970613175e-06, + "loss": 0.8007, + "step": 4245 + }, + { + "epoch": 0.68, + "grad_norm": 1.2800983124410865, + "learning_rate": 4.792047231342635e-06, + "loss": 0.8502, + "step": 4246 + }, + { + "epoch": 0.68, + "grad_norm": 1.5499003115829288, + "learning_rate": 4.787592184413843e-06, + "loss": 0.8434, + "step": 4247 + }, + { + "epoch": 0.68, + "grad_norm": 1.471188831042224, + "learning_rate": 4.783138557488618e-06, + "loss": 0.8329, + "step": 4248 + }, + { + "epoch": 0.68, + "grad_norm": 1.3411880469545743, + "learning_rate": 4.778686351780257e-06, + "loss": 0.8156, + "step": 4249 + }, + { + "epoch": 0.68, + "grad_norm": 1.4559445157192166, + "learning_rate": 4.774235568501656e-06, + "loss": 0.8704, + "step": 4250 + }, + { + "epoch": 0.68, + "grad_norm": 1.4099827842008033, + "learning_rate": 4.76978620886534e-06, + "loss": 0.8314, + "step": 4251 + }, + { + "epoch": 0.69, + "grad_norm": 1.271745015969375, + "learning_rate": 4.765338274083432e-06, + "loss": 0.8672, + "step": 4252 + }, + { + "epoch": 0.69, + "grad_norm": 1.3600439886427995, + "learning_rate": 4.7608917653676675e-06, + "loss": 0.7909, + "step": 4253 + }, + { + "epoch": 0.69, + "grad_norm": 1.3058487355685737, + "learning_rate": 4.7564466839294085e-06, + "loss": 0.8075, + "step": 4254 + }, + { + "epoch": 0.69, + "grad_norm": 1.5523066327390431, + "learning_rate": 4.752003030979612e-06, + "loss": 0.8041, + "step": 4255 + }, + { + "epoch": 0.69, + "grad_norm": 1.6513959068900588, + "learning_rate": 4.747560807728847e-06, + "loss": 0.7848, + "step": 4256 + }, + { + "epoch": 0.69, + "grad_norm": 0.7969387609787683, + "learning_rate": 4.743120015387302e-06, + "loss": 0.3539, + "step": 4257 + }, + { + "epoch": 0.69, + "grad_norm": 1.6098973638266458, + "learning_rate": 4.7386806551647766e-06, + "loss": 0.8377, + "step": 4258 + }, + { + "epoch": 0.69, + "grad_norm": 1.5441044008255504, + "learning_rate": 4.73424272827067e-06, + "loss": 0.905, + "step": 4259 + }, + { + "epoch": 0.69, + "grad_norm": 1.3169301070293704, + "learning_rate": 4.729806235913991e-06, + "loss": 0.8537, + "step": 4260 + }, + { + "epoch": 0.69, + "grad_norm": 1.4602267913147935, + "learning_rate": 4.725371179303371e-06, + "loss": 0.822, + "step": 4261 + }, + { + "epoch": 0.69, + "grad_norm": 1.3661101469718007, + "learning_rate": 4.720937559647038e-06, + "loss": 0.8142, + "step": 4262 + }, + { + "epoch": 0.69, + "grad_norm": 1.4251445341428535, + "learning_rate": 4.716505378152827e-06, + "loss": 0.8638, + "step": 4263 + }, + { + "epoch": 0.69, + "grad_norm": 1.402143744388967, + "learning_rate": 4.712074636028192e-06, + "loss": 0.7855, + "step": 4264 + }, + { + "epoch": 0.69, + "grad_norm": 1.3368081067566315, + "learning_rate": 4.7076453344801965e-06, + "loss": 0.7672, + "step": 4265 + }, + { + "epoch": 0.69, + "grad_norm": 1.3909078696083208, + "learning_rate": 4.703217474715489e-06, + "loss": 0.8124, + "step": 4266 + }, + { + "epoch": 0.69, + "grad_norm": 1.503184016312308, + "learning_rate": 4.698791057940349e-06, + "loss": 0.8185, + "step": 4267 + }, + { + "epoch": 0.69, + "grad_norm": 1.6566107701150459, + "learning_rate": 4.694366085360656e-06, + "loss": 0.8099, + "step": 4268 + }, + { + "epoch": 0.69, + "grad_norm": 1.4594392188776675, + "learning_rate": 4.689942558181893e-06, + "loss": 0.7954, + "step": 4269 + }, + { + "epoch": 0.69, + "grad_norm": 1.1994972154212105, + "learning_rate": 4.6855204776091445e-06, + "loss": 0.8343, + "step": 4270 + }, + { + "epoch": 0.69, + "grad_norm": 1.3837925817354688, + "learning_rate": 4.681099844847117e-06, + "loss": 0.8414, + "step": 4271 + }, + { + "epoch": 0.69, + "grad_norm": 0.9752629913080488, + "learning_rate": 4.6766806611001046e-06, + "loss": 0.3456, + "step": 4272 + }, + { + "epoch": 0.69, + "grad_norm": 1.320982936348882, + "learning_rate": 4.672262927572021e-06, + "loss": 0.8191, + "step": 4273 + }, + { + "epoch": 0.69, + "grad_norm": 1.3972569309398184, + "learning_rate": 4.667846645466377e-06, + "loss": 0.8556, + "step": 4274 + }, + { + "epoch": 0.69, + "grad_norm": 1.2531189656617716, + "learning_rate": 4.663431815986284e-06, + "loss": 0.7872, + "step": 4275 + }, + { + "epoch": 0.69, + "grad_norm": 1.4050375927399317, + "learning_rate": 4.659018440334472e-06, + "loss": 0.8213, + "step": 4276 + }, + { + "epoch": 0.69, + "grad_norm": 1.3031584779981, + "learning_rate": 4.654606519713258e-06, + "loss": 0.7036, + "step": 4277 + }, + { + "epoch": 0.69, + "grad_norm": 1.3272594759133631, + "learning_rate": 4.6501960553245785e-06, + "loss": 0.8378, + "step": 4278 + }, + { + "epoch": 0.69, + "grad_norm": 1.5452150899706918, + "learning_rate": 4.645787048369958e-06, + "loss": 0.8572, + "step": 4279 + }, + { + "epoch": 0.69, + "grad_norm": 1.4658813285962644, + "learning_rate": 4.641379500050538e-06, + "loss": 0.777, + "step": 4280 + }, + { + "epoch": 0.69, + "grad_norm": 1.5286045913044735, + "learning_rate": 4.636973411567055e-06, + "loss": 0.8519, + "step": 4281 + }, + { + "epoch": 0.69, + "grad_norm": 1.4282003945465973, + "learning_rate": 4.632568784119842e-06, + "loss": 0.8141, + "step": 4282 + }, + { + "epoch": 0.69, + "grad_norm": 1.2310525568841226, + "learning_rate": 4.628165618908851e-06, + "loss": 0.8543, + "step": 4283 + }, + { + "epoch": 0.69, + "grad_norm": 1.454495744840437, + "learning_rate": 4.623763917133621e-06, + "loss": 0.8202, + "step": 4284 + }, + { + "epoch": 0.69, + "grad_norm": 1.3034967071451053, + "learning_rate": 4.619363679993293e-06, + "loss": 0.8683, + "step": 4285 + }, + { + "epoch": 0.69, + "grad_norm": 1.4223198103228434, + "learning_rate": 4.614964908686617e-06, + "loss": 0.848, + "step": 4286 + }, + { + "epoch": 0.69, + "grad_norm": 1.4667652167346432, + "learning_rate": 4.610567604411946e-06, + "loss": 0.8771, + "step": 4287 + }, + { + "epoch": 0.69, + "grad_norm": 1.4082472188503694, + "learning_rate": 4.606171768367213e-06, + "loss": 0.8316, + "step": 4288 + }, + { + "epoch": 0.69, + "grad_norm": 1.164634589612392, + "learning_rate": 4.601777401749972e-06, + "loss": 0.7953, + "step": 4289 + }, + { + "epoch": 0.69, + "grad_norm": 1.4609849234906007, + "learning_rate": 4.597384505757373e-06, + "loss": 0.8111, + "step": 4290 + }, + { + "epoch": 0.69, + "grad_norm": 1.2823083056899898, + "learning_rate": 4.592993081586159e-06, + "loss": 0.8902, + "step": 4291 + }, + { + "epoch": 0.69, + "grad_norm": 1.4016509244023847, + "learning_rate": 4.588603130432671e-06, + "loss": 0.8317, + "step": 4292 + }, + { + "epoch": 0.69, + "grad_norm": 1.3713189249898754, + "learning_rate": 4.58421465349286e-06, + "loss": 0.8631, + "step": 4293 + }, + { + "epoch": 0.69, + "grad_norm": 1.42265759764696, + "learning_rate": 4.579827651962264e-06, + "loss": 0.8371, + "step": 4294 + }, + { + "epoch": 0.69, + "grad_norm": 1.40742206406731, + "learning_rate": 4.5754421270360195e-06, + "loss": 0.8653, + "step": 4295 + }, + { + "epoch": 0.69, + "grad_norm": 1.3160320282092444, + "learning_rate": 4.571058079908869e-06, + "loss": 0.8299, + "step": 4296 + }, + { + "epoch": 0.69, + "grad_norm": 1.3970158439288505, + "learning_rate": 4.566675511775151e-06, + "loss": 0.8876, + "step": 4297 + }, + { + "epoch": 0.69, + "grad_norm": 1.4599217412783911, + "learning_rate": 4.562294423828794e-06, + "loss": 0.8124, + "step": 4298 + }, + { + "epoch": 0.69, + "grad_norm": 1.4014970157708602, + "learning_rate": 4.557914817263324e-06, + "loss": 0.8323, + "step": 4299 + }, + { + "epoch": 0.69, + "grad_norm": 1.416185129477895, + "learning_rate": 4.553536693271872e-06, + "loss": 0.8175, + "step": 4300 + }, + { + "epoch": 0.69, + "grad_norm": 1.3003775146192469, + "learning_rate": 4.5491600530471546e-06, + "loss": 0.855, + "step": 4301 + }, + { + "epoch": 0.69, + "grad_norm": 1.3429186142555603, + "learning_rate": 4.544784897781495e-06, + "loss": 0.7553, + "step": 4302 + }, + { + "epoch": 0.69, + "grad_norm": 1.4239382393830073, + "learning_rate": 4.540411228666805e-06, + "loss": 0.8836, + "step": 4303 + }, + { + "epoch": 0.69, + "grad_norm": 1.5405966780909657, + "learning_rate": 4.536039046894584e-06, + "loss": 0.9026, + "step": 4304 + }, + { + "epoch": 0.69, + "grad_norm": 1.4510673546919153, + "learning_rate": 4.531668353655948e-06, + "loss": 0.8642, + "step": 4305 + }, + { + "epoch": 0.69, + "grad_norm": 1.2502657257701797, + "learning_rate": 4.527299150141588e-06, + "loss": 0.7621, + "step": 4306 + }, + { + "epoch": 0.69, + "grad_norm": 1.271358715973476, + "learning_rate": 4.52293143754179e-06, + "loss": 0.8568, + "step": 4307 + }, + { + "epoch": 0.69, + "grad_norm": 1.299998817735602, + "learning_rate": 4.518565217046446e-06, + "loss": 0.7947, + "step": 4308 + }, + { + "epoch": 0.69, + "grad_norm": 1.4167577482552336, + "learning_rate": 4.51420048984504e-06, + "loss": 0.8466, + "step": 4309 + }, + { + "epoch": 0.69, + "grad_norm": 1.4398217945474072, + "learning_rate": 4.50983725712663e-06, + "loss": 0.8264, + "step": 4310 + }, + { + "epoch": 0.69, + "grad_norm": 0.7687833904995048, + "learning_rate": 4.505475520079889e-06, + "loss": 0.3481, + "step": 4311 + }, + { + "epoch": 0.69, + "grad_norm": 1.3900735758358231, + "learning_rate": 4.501115279893077e-06, + "loss": 0.8601, + "step": 4312 + }, + { + "epoch": 0.69, + "grad_norm": 1.401890721371135, + "learning_rate": 4.4967565377540415e-06, + "loss": 0.8584, + "step": 4313 + }, + { + "epoch": 0.7, + "grad_norm": 1.3355339504608454, + "learning_rate": 4.492399294850218e-06, + "loss": 0.8274, + "step": 4314 + }, + { + "epoch": 0.7, + "grad_norm": 1.2292345351202962, + "learning_rate": 4.488043552368649e-06, + "loss": 0.7696, + "step": 4315 + }, + { + "epoch": 0.7, + "grad_norm": 1.2980806594846481, + "learning_rate": 4.483689311495954e-06, + "loss": 0.792, + "step": 4316 + }, + { + "epoch": 0.7, + "grad_norm": 1.4058962227941507, + "learning_rate": 4.4793365734183445e-06, + "loss": 0.9071, + "step": 4317 + }, + { + "epoch": 0.7, + "grad_norm": 1.306442520137477, + "learning_rate": 4.47498533932163e-06, + "loss": 0.7713, + "step": 4318 + }, + { + "epoch": 0.7, + "grad_norm": 1.282194024578892, + "learning_rate": 4.470635610391212e-06, + "loss": 0.7357, + "step": 4319 + }, + { + "epoch": 0.7, + "grad_norm": 1.4685406814721074, + "learning_rate": 4.466287387812071e-06, + "loss": 0.8034, + "step": 4320 + }, + { + "epoch": 0.7, + "grad_norm": 1.5319636023835896, + "learning_rate": 4.461940672768779e-06, + "loss": 0.8333, + "step": 4321 + }, + { + "epoch": 0.7, + "grad_norm": 1.379211559144559, + "learning_rate": 4.457595466445509e-06, + "loss": 0.8597, + "step": 4322 + }, + { + "epoch": 0.7, + "grad_norm": 1.4002588261946511, + "learning_rate": 4.453251770026011e-06, + "loss": 0.8322, + "step": 4323 + }, + { + "epoch": 0.7, + "grad_norm": 1.3322261544654668, + "learning_rate": 4.448909584693626e-06, + "loss": 0.8257, + "step": 4324 + }, + { + "epoch": 0.7, + "grad_norm": 1.3120961391034642, + "learning_rate": 4.444568911631289e-06, + "loss": 0.8341, + "step": 4325 + }, + { + "epoch": 0.7, + "grad_norm": 1.5513946709665167, + "learning_rate": 4.4402297520215136e-06, + "loss": 0.8406, + "step": 4326 + }, + { + "epoch": 0.7, + "grad_norm": 1.4967581516204458, + "learning_rate": 4.435892107046414e-06, + "loss": 0.7546, + "step": 4327 + }, + { + "epoch": 0.7, + "grad_norm": 1.5152208290763223, + "learning_rate": 4.431555977887679e-06, + "loss": 0.9203, + "step": 4328 + }, + { + "epoch": 0.7, + "grad_norm": 1.3350884984260938, + "learning_rate": 4.427221365726586e-06, + "loss": 0.8077, + "step": 4329 + }, + { + "epoch": 0.7, + "grad_norm": 1.331257995267486, + "learning_rate": 4.422888271744009e-06, + "loss": 0.7852, + "step": 4330 + }, + { + "epoch": 0.7, + "grad_norm": 0.8421405159419195, + "learning_rate": 4.418556697120408e-06, + "loss": 0.3409, + "step": 4331 + }, + { + "epoch": 0.7, + "grad_norm": 1.3465174403666682, + "learning_rate": 4.4142266430358085e-06, + "loss": 0.8922, + "step": 4332 + }, + { + "epoch": 0.7, + "grad_norm": 1.342762922766005, + "learning_rate": 4.409898110669844e-06, + "loss": 0.8451, + "step": 4333 + }, + { + "epoch": 0.7, + "grad_norm": 1.4216569717632133, + "learning_rate": 4.40557110120173e-06, + "loss": 0.8271, + "step": 4334 + }, + { + "epoch": 0.7, + "grad_norm": 1.2983829872018666, + "learning_rate": 4.40124561581026e-06, + "loss": 0.8793, + "step": 4335 + }, + { + "epoch": 0.7, + "grad_norm": 1.2717146459867814, + "learning_rate": 4.39692165567381e-06, + "loss": 0.8251, + "step": 4336 + }, + { + "epoch": 0.7, + "grad_norm": 1.4643920220992683, + "learning_rate": 4.392599221970351e-06, + "loss": 0.8542, + "step": 4337 + }, + { + "epoch": 0.7, + "grad_norm": 1.2623009940023568, + "learning_rate": 4.388278315877441e-06, + "loss": 0.807, + "step": 4338 + }, + { + "epoch": 0.7, + "grad_norm": 1.5646845536854703, + "learning_rate": 4.3839589385721985e-06, + "loss": 0.8487, + "step": 4339 + }, + { + "epoch": 0.7, + "grad_norm": 1.7826344208873324, + "learning_rate": 4.379641091231348e-06, + "loss": 0.8593, + "step": 4340 + }, + { + "epoch": 0.7, + "grad_norm": 1.359692297412814, + "learning_rate": 4.375324775031194e-06, + "loss": 0.77, + "step": 4341 + }, + { + "epoch": 0.7, + "grad_norm": 1.4848570950550797, + "learning_rate": 4.3710099911476155e-06, + "loss": 0.7718, + "step": 4342 + }, + { + "epoch": 0.7, + "grad_norm": 1.3314098701499462, + "learning_rate": 4.3666967407560765e-06, + "loss": 0.8079, + "step": 4343 + }, + { + "epoch": 0.7, + "grad_norm": 1.6240926562644307, + "learning_rate": 4.362385025031631e-06, + "loss": 0.7615, + "step": 4344 + }, + { + "epoch": 0.7, + "grad_norm": 1.2855299700480645, + "learning_rate": 4.3580748451489075e-06, + "loss": 0.8104, + "step": 4345 + }, + { + "epoch": 0.7, + "grad_norm": 1.341040577199372, + "learning_rate": 4.353766202282113e-06, + "loss": 0.7761, + "step": 4346 + }, + { + "epoch": 0.7, + "grad_norm": 1.5248870848845937, + "learning_rate": 4.349459097605048e-06, + "loss": 0.802, + "step": 4347 + }, + { + "epoch": 0.7, + "grad_norm": 1.8543146268381816, + "learning_rate": 4.3451535322910786e-06, + "loss": 0.8255, + "step": 4348 + }, + { + "epoch": 0.7, + "grad_norm": 0.8233799083853918, + "learning_rate": 4.340849507513168e-06, + "loss": 0.3278, + "step": 4349 + }, + { + "epoch": 0.7, + "grad_norm": 1.2563264176359663, + "learning_rate": 4.336547024443847e-06, + "loss": 0.8228, + "step": 4350 + }, + { + "epoch": 0.7, + "grad_norm": 1.4376301551028559, + "learning_rate": 4.332246084255227e-06, + "loss": 0.7803, + "step": 4351 + }, + { + "epoch": 0.7, + "grad_norm": 1.7174340065100457, + "learning_rate": 4.327946688119006e-06, + "loss": 0.7753, + "step": 4352 + }, + { + "epoch": 0.7, + "grad_norm": 1.3727686180883552, + "learning_rate": 4.3236488372064656e-06, + "loss": 0.8563, + "step": 4353 + }, + { + "epoch": 0.7, + "grad_norm": 1.3912845585130535, + "learning_rate": 4.319352532688444e-06, + "loss": 0.8571, + "step": 4354 + }, + { + "epoch": 0.7, + "grad_norm": 1.420853639663344, + "learning_rate": 4.31505777573538e-06, + "loss": 0.8428, + "step": 4355 + }, + { + "epoch": 0.7, + "grad_norm": 1.3260112354056672, + "learning_rate": 4.310764567517288e-06, + "loss": 0.7866, + "step": 4356 + }, + { + "epoch": 0.7, + "grad_norm": 0.8249413590469924, + "learning_rate": 4.306472909203754e-06, + "loss": 0.3208, + "step": 4357 + }, + { + "epoch": 0.7, + "grad_norm": 1.239801401870534, + "learning_rate": 4.302182801963937e-06, + "loss": 0.8216, + "step": 4358 + }, + { + "epoch": 0.7, + "grad_norm": 1.6120524165751369, + "learning_rate": 4.297894246966586e-06, + "loss": 0.759, + "step": 4359 + }, + { + "epoch": 0.7, + "grad_norm": 1.2358976122294756, + "learning_rate": 4.29360724538003e-06, + "loss": 0.811, + "step": 4360 + }, + { + "epoch": 0.7, + "grad_norm": 1.360433988134727, + "learning_rate": 4.28932179837215e-06, + "loss": 0.7975, + "step": 4361 + }, + { + "epoch": 0.7, + "grad_norm": 1.5452096100864234, + "learning_rate": 4.2850379071104286e-06, + "loss": 0.8033, + "step": 4362 + }, + { + "epoch": 0.7, + "grad_norm": 1.1448321317220125, + "learning_rate": 4.280755572761919e-06, + "loss": 0.8292, + "step": 4363 + }, + { + "epoch": 0.7, + "grad_norm": 1.2363796351460845, + "learning_rate": 4.276474796493243e-06, + "loss": 0.8019, + "step": 4364 + }, + { + "epoch": 0.7, + "grad_norm": 1.4440542195220376, + "learning_rate": 4.2721955794705985e-06, + "loss": 0.9016, + "step": 4365 + }, + { + "epoch": 0.7, + "grad_norm": 1.584210504907714, + "learning_rate": 4.267917922859769e-06, + "loss": 0.7839, + "step": 4366 + }, + { + "epoch": 0.7, + "grad_norm": 1.4641191470567296, + "learning_rate": 4.263641827826104e-06, + "loss": 0.8933, + "step": 4367 + }, + { + "epoch": 0.7, + "grad_norm": 1.3534956107648393, + "learning_rate": 4.259367295534524e-06, + "loss": 0.8195, + "step": 4368 + }, + { + "epoch": 0.7, + "grad_norm": 1.3247653049878716, + "learning_rate": 4.255094327149539e-06, + "loss": 0.8095, + "step": 4369 + }, + { + "epoch": 0.7, + "grad_norm": 1.4035554315924368, + "learning_rate": 4.250822923835214e-06, + "loss": 0.8025, + "step": 4370 + }, + { + "epoch": 0.7, + "grad_norm": 1.7216002650990987, + "learning_rate": 4.2465530867552065e-06, + "loss": 0.8382, + "step": 4371 + }, + { + "epoch": 0.7, + "grad_norm": 1.5798997943430961, + "learning_rate": 4.242284817072732e-06, + "loss": 0.8089, + "step": 4372 + }, + { + "epoch": 0.7, + "grad_norm": 1.5969878946250402, + "learning_rate": 4.2380181159505815e-06, + "loss": 0.8721, + "step": 4373 + }, + { + "epoch": 0.7, + "grad_norm": 1.2793591593365967, + "learning_rate": 4.23375298455113e-06, + "loss": 0.8012, + "step": 4374 + }, + { + "epoch": 0.7, + "grad_norm": 1.4689644710213003, + "learning_rate": 4.22948942403631e-06, + "loss": 0.8791, + "step": 4375 + }, + { + "epoch": 0.71, + "grad_norm": 1.3866841902926106, + "learning_rate": 4.2252274355676395e-06, + "loss": 0.8581, + "step": 4376 + }, + { + "epoch": 0.71, + "grad_norm": 1.4802035009812873, + "learning_rate": 4.220967020306194e-06, + "loss": 0.8403, + "step": 4377 + }, + { + "epoch": 0.71, + "grad_norm": 1.4783590295574813, + "learning_rate": 4.216708179412636e-06, + "loss": 0.8812, + "step": 4378 + }, + { + "epoch": 0.71, + "grad_norm": 1.2870513200329659, + "learning_rate": 4.212450914047187e-06, + "loss": 0.8337, + "step": 4379 + }, + { + "epoch": 0.71, + "grad_norm": 1.270826924922164, + "learning_rate": 4.2081952253696415e-06, + "loss": 0.7893, + "step": 4380 + }, + { + "epoch": 0.71, + "grad_norm": 1.5507624634630555, + "learning_rate": 4.203941114539367e-06, + "loss": 0.8319, + "step": 4381 + }, + { + "epoch": 0.71, + "grad_norm": 1.2441525361405672, + "learning_rate": 4.19968858271531e-06, + "loss": 0.8358, + "step": 4382 + }, + { + "epoch": 0.71, + "grad_norm": 1.1619526464898216, + "learning_rate": 4.195437631055963e-06, + "loss": 0.8047, + "step": 4383 + }, + { + "epoch": 0.71, + "grad_norm": 1.434095952164828, + "learning_rate": 4.191188260719408e-06, + "loss": 0.782, + "step": 4384 + }, + { + "epoch": 0.71, + "grad_norm": 1.5227068267562682, + "learning_rate": 4.186940472863296e-06, + "loss": 0.8373, + "step": 4385 + }, + { + "epoch": 0.71, + "grad_norm": 1.2907865150749716, + "learning_rate": 4.182694268644837e-06, + "loss": 0.8352, + "step": 4386 + }, + { + "epoch": 0.71, + "grad_norm": 1.4436838471854327, + "learning_rate": 4.178449649220809e-06, + "loss": 0.7934, + "step": 4387 + }, + { + "epoch": 0.71, + "grad_norm": 1.415141850905036, + "learning_rate": 4.174206615747575e-06, + "loss": 0.7853, + "step": 4388 + }, + { + "epoch": 0.71, + "grad_norm": 1.5171684604504954, + "learning_rate": 4.169965169381045e-06, + "loss": 0.8476, + "step": 4389 + }, + { + "epoch": 0.71, + "grad_norm": 1.393244901387025, + "learning_rate": 4.165725311276707e-06, + "loss": 0.8279, + "step": 4390 + }, + { + "epoch": 0.71, + "grad_norm": 1.5484051571835435, + "learning_rate": 4.161487042589619e-06, + "loss": 0.8265, + "step": 4391 + }, + { + "epoch": 0.71, + "grad_norm": 1.5048206819040228, + "learning_rate": 4.157250364474398e-06, + "loss": 0.7923, + "step": 4392 + }, + { + "epoch": 0.71, + "grad_norm": 1.3434417746758274, + "learning_rate": 4.153015278085237e-06, + "loss": 0.7748, + "step": 4393 + }, + { + "epoch": 0.71, + "grad_norm": 1.6166106291670272, + "learning_rate": 4.148781784575888e-06, + "loss": 0.8116, + "step": 4394 + }, + { + "epoch": 0.71, + "grad_norm": 1.3729593152741917, + "learning_rate": 4.1445498850996664e-06, + "loss": 0.8035, + "step": 4395 + }, + { + "epoch": 0.71, + "grad_norm": 1.364431645262251, + "learning_rate": 4.1403195808094665e-06, + "loss": 0.8576, + "step": 4396 + }, + { + "epoch": 0.71, + "grad_norm": 1.3686102739197386, + "learning_rate": 4.136090872857732e-06, + "loss": 0.8869, + "step": 4397 + }, + { + "epoch": 0.71, + "grad_norm": 1.231157779023482, + "learning_rate": 4.131863762396487e-06, + "loss": 0.7824, + "step": 4398 + }, + { + "epoch": 0.71, + "grad_norm": 1.277919924784873, + "learning_rate": 4.127638250577305e-06, + "loss": 0.8451, + "step": 4399 + }, + { + "epoch": 0.71, + "grad_norm": 1.458665641888625, + "learning_rate": 4.12341433855134e-06, + "loss": 0.7839, + "step": 4400 + }, + { + "epoch": 0.71, + "grad_norm": 1.277600499547798, + "learning_rate": 4.119192027469299e-06, + "loss": 0.7722, + "step": 4401 + }, + { + "epoch": 0.71, + "grad_norm": 1.608756051090134, + "learning_rate": 4.114971318481451e-06, + "loss": 0.7772, + "step": 4402 + }, + { + "epoch": 0.71, + "grad_norm": 1.5647757563052802, + "learning_rate": 4.110752212737641e-06, + "loss": 0.7976, + "step": 4403 + }, + { + "epoch": 0.71, + "grad_norm": 1.4142641557962934, + "learning_rate": 4.106534711387267e-06, + "loss": 0.8794, + "step": 4404 + }, + { + "epoch": 0.71, + "grad_norm": 1.4552195141091302, + "learning_rate": 4.102318815579288e-06, + "loss": 0.8199, + "step": 4405 + }, + { + "epoch": 0.71, + "grad_norm": 1.4651162520302996, + "learning_rate": 4.098104526462235e-06, + "loss": 0.7774, + "step": 4406 + }, + { + "epoch": 0.71, + "grad_norm": 1.4399875544218832, + "learning_rate": 4.093891845184197e-06, + "loss": 0.7903, + "step": 4407 + }, + { + "epoch": 0.71, + "grad_norm": 1.4104297780809647, + "learning_rate": 4.0896807728928245e-06, + "loss": 0.8198, + "step": 4408 + }, + { + "epoch": 0.71, + "grad_norm": 1.6437664575532518, + "learning_rate": 4.085471310735325e-06, + "loss": 0.8021, + "step": 4409 + }, + { + "epoch": 0.71, + "grad_norm": 1.3001600527756463, + "learning_rate": 4.081263459858479e-06, + "loss": 0.7827, + "step": 4410 + }, + { + "epoch": 0.71, + "grad_norm": 1.4810665491336683, + "learning_rate": 4.077057221408617e-06, + "loss": 0.8264, + "step": 4411 + }, + { + "epoch": 0.71, + "grad_norm": 1.4193027010819241, + "learning_rate": 4.072852596531631e-06, + "loss": 0.756, + "step": 4412 + }, + { + "epoch": 0.71, + "grad_norm": 1.4906851329260473, + "learning_rate": 4.0686495863729854e-06, + "loss": 0.8811, + "step": 4413 + }, + { + "epoch": 0.71, + "grad_norm": 1.5196047436609912, + "learning_rate": 4.064448192077687e-06, + "loss": 0.8586, + "step": 4414 + }, + { + "epoch": 0.71, + "grad_norm": 1.3460065959029994, + "learning_rate": 4.060248414790318e-06, + "loss": 0.8466, + "step": 4415 + }, + { + "epoch": 0.71, + "grad_norm": 1.5134711741628315, + "learning_rate": 4.0560502556550085e-06, + "loss": 0.909, + "step": 4416 + }, + { + "epoch": 0.71, + "grad_norm": 1.3049091333601845, + "learning_rate": 4.051853715815459e-06, + "loss": 0.7882, + "step": 4417 + }, + { + "epoch": 0.71, + "grad_norm": 1.6003473219205104, + "learning_rate": 4.0476587964149185e-06, + "loss": 0.7748, + "step": 4418 + }, + { + "epoch": 0.71, + "grad_norm": 1.425819195281636, + "learning_rate": 4.0434654985961955e-06, + "loss": 0.8293, + "step": 4419 + }, + { + "epoch": 0.71, + "grad_norm": 1.4124346493007718, + "learning_rate": 4.039273823501667e-06, + "loss": 0.8452, + "step": 4420 + }, + { + "epoch": 0.71, + "grad_norm": 1.3024318707018312, + "learning_rate": 4.035083772273254e-06, + "loss": 0.8461, + "step": 4421 + }, + { + "epoch": 0.71, + "grad_norm": 1.4441841102431878, + "learning_rate": 4.030895346052449e-06, + "loss": 0.8181, + "step": 4422 + }, + { + "epoch": 0.71, + "grad_norm": 1.5049591688703727, + "learning_rate": 4.02670854598029e-06, + "loss": 0.8744, + "step": 4423 + }, + { + "epoch": 0.71, + "grad_norm": 1.6443665194292727, + "learning_rate": 4.022523373197376e-06, + "loss": 0.7989, + "step": 4424 + }, + { + "epoch": 0.71, + "grad_norm": 1.4515254647557412, + "learning_rate": 4.018339828843868e-06, + "loss": 0.8733, + "step": 4425 + }, + { + "epoch": 0.71, + "grad_norm": 1.3408149078715867, + "learning_rate": 4.014157914059475e-06, + "loss": 0.8726, + "step": 4426 + }, + { + "epoch": 0.71, + "grad_norm": 1.1709100827679175, + "learning_rate": 4.009977629983464e-06, + "loss": 0.9114, + "step": 4427 + }, + { + "epoch": 0.71, + "grad_norm": 1.2973978786715272, + "learning_rate": 4.005798977754664e-06, + "loss": 0.8347, + "step": 4428 + }, + { + "epoch": 0.71, + "grad_norm": 1.5730852555536845, + "learning_rate": 4.001621958511456e-06, + "loss": 0.8051, + "step": 4429 + }, + { + "epoch": 0.71, + "grad_norm": 1.377665937176999, + "learning_rate": 3.9974465733917725e-06, + "loss": 0.8662, + "step": 4430 + }, + { + "epoch": 0.71, + "grad_norm": 1.4935435142912283, + "learning_rate": 3.993272823533101e-06, + "loss": 0.8439, + "step": 4431 + }, + { + "epoch": 0.71, + "grad_norm": 1.4429110332712358, + "learning_rate": 3.989100710072491e-06, + "loss": 0.8767, + "step": 4432 + }, + { + "epoch": 0.71, + "grad_norm": 1.3359158040816939, + "learning_rate": 3.98493023414654e-06, + "loss": 0.8794, + "step": 4433 + }, + { + "epoch": 0.71, + "grad_norm": 1.4192622894648228, + "learning_rate": 3.980761396891396e-06, + "loss": 0.8591, + "step": 4434 + }, + { + "epoch": 0.71, + "grad_norm": 1.2474980363873371, + "learning_rate": 3.976594199442768e-06, + "loss": 0.8068, + "step": 4435 + }, + { + "epoch": 0.71, + "grad_norm": 1.4981168378733734, + "learning_rate": 3.972428642935921e-06, + "loss": 0.8296, + "step": 4436 + }, + { + "epoch": 0.71, + "grad_norm": 1.521495261416584, + "learning_rate": 3.968264728505662e-06, + "loss": 0.7668, + "step": 4437 + }, + { + "epoch": 0.72, + "grad_norm": 0.8689031447958926, + "learning_rate": 3.964102457286353e-06, + "loss": 0.353, + "step": 4438 + }, + { + "epoch": 0.72, + "grad_norm": 1.4503065130839268, + "learning_rate": 3.959941830411918e-06, + "loss": 0.7483, + "step": 4439 + }, + { + "epoch": 0.72, + "grad_norm": 1.717268342539693, + "learning_rate": 3.955782849015825e-06, + "loss": 0.8872, + "step": 4440 + }, + { + "epoch": 0.72, + "grad_norm": 1.294991918901895, + "learning_rate": 3.95162551423109e-06, + "loss": 0.9289, + "step": 4441 + }, + { + "epoch": 0.72, + "grad_norm": 1.3215312489104538, + "learning_rate": 3.9474698271902925e-06, + "loss": 0.8549, + "step": 4442 + }, + { + "epoch": 0.72, + "grad_norm": 1.4882718133502066, + "learning_rate": 3.94331578902555e-06, + "loss": 0.8303, + "step": 4443 + }, + { + "epoch": 0.72, + "grad_norm": 1.3237914638052315, + "learning_rate": 3.939163400868543e-06, + "loss": 0.7986, + "step": 4444 + }, + { + "epoch": 0.72, + "grad_norm": 1.1931634920124807, + "learning_rate": 3.935012663850493e-06, + "loss": 0.8229, + "step": 4445 + }, + { + "epoch": 0.72, + "grad_norm": 1.4098476345628528, + "learning_rate": 3.930863579102173e-06, + "loss": 0.7713, + "step": 4446 + }, + { + "epoch": 0.72, + "grad_norm": 1.8840831706959522, + "learning_rate": 3.9267161477539155e-06, + "loss": 0.8077, + "step": 4447 + }, + { + "epoch": 0.72, + "grad_norm": 1.398224938343925, + "learning_rate": 3.922570370935588e-06, + "loss": 0.8577, + "step": 4448 + }, + { + "epoch": 0.72, + "grad_norm": 1.5316889125321147, + "learning_rate": 3.918426249776614e-06, + "loss": 0.7675, + "step": 4449 + }, + { + "epoch": 0.72, + "grad_norm": 1.3503752169614192, + "learning_rate": 3.91428378540597e-06, + "loss": 0.8022, + "step": 4450 + }, + { + "epoch": 0.72, + "grad_norm": 1.4521086257012774, + "learning_rate": 3.910142978952183e-06, + "loss": 0.9199, + "step": 4451 + }, + { + "epoch": 0.72, + "grad_norm": 1.4825339830926825, + "learning_rate": 3.906003831543309e-06, + "loss": 0.8346, + "step": 4452 + }, + { + "epoch": 0.72, + "grad_norm": 0.7926786554502268, + "learning_rate": 3.901866344306975e-06, + "loss": 0.3066, + "step": 4453 + }, + { + "epoch": 0.72, + "grad_norm": 1.4031918187469365, + "learning_rate": 3.8977305183703464e-06, + "loss": 0.8465, + "step": 4454 + }, + { + "epoch": 0.72, + "grad_norm": 1.303496729651231, + "learning_rate": 3.893596354860135e-06, + "loss": 0.7074, + "step": 4455 + }, + { + "epoch": 0.72, + "grad_norm": 1.3881690218788287, + "learning_rate": 3.889463854902598e-06, + "loss": 0.8552, + "step": 4456 + }, + { + "epoch": 0.72, + "grad_norm": 1.5077475931596371, + "learning_rate": 3.885333019623544e-06, + "loss": 0.8626, + "step": 4457 + }, + { + "epoch": 0.72, + "grad_norm": 1.2328556490642726, + "learning_rate": 3.88120385014833e-06, + "loss": 0.851, + "step": 4458 + }, + { + "epoch": 0.72, + "grad_norm": 1.4566852373496852, + "learning_rate": 3.8770763476018546e-06, + "loss": 0.9238, + "step": 4459 + }, + { + "epoch": 0.72, + "grad_norm": 1.1874441428599796, + "learning_rate": 3.872950513108558e-06, + "loss": 0.7968, + "step": 4460 + }, + { + "epoch": 0.72, + "grad_norm": 1.499331230846118, + "learning_rate": 3.868826347792437e-06, + "loss": 0.7806, + "step": 4461 + }, + { + "epoch": 0.72, + "grad_norm": 1.2641836542131464, + "learning_rate": 3.864703852777026e-06, + "loss": 0.8337, + "step": 4462 + }, + { + "epoch": 0.72, + "grad_norm": 1.4957867001018097, + "learning_rate": 3.860583029185403e-06, + "loss": 0.7914, + "step": 4463 + }, + { + "epoch": 0.72, + "grad_norm": 1.4038091885109347, + "learning_rate": 3.8564638781402e-06, + "loss": 0.8702, + "step": 4464 + }, + { + "epoch": 0.72, + "grad_norm": 1.2688078506557041, + "learning_rate": 3.85234640076358e-06, + "loss": 0.8401, + "step": 4465 + }, + { + "epoch": 0.72, + "grad_norm": 1.3662071405212124, + "learning_rate": 3.848230598177266e-06, + "loss": 0.8727, + "step": 4466 + }, + { + "epoch": 0.72, + "grad_norm": 1.3091243479518637, + "learning_rate": 3.844116471502511e-06, + "loss": 0.8693, + "step": 4467 + }, + { + "epoch": 0.72, + "grad_norm": 1.3986915075986504, + "learning_rate": 3.840004021860113e-06, + "loss": 0.8571, + "step": 4468 + }, + { + "epoch": 0.72, + "grad_norm": 1.6132602141858485, + "learning_rate": 3.835893250370426e-06, + "loss": 0.8687, + "step": 4469 + }, + { + "epoch": 0.72, + "grad_norm": 1.3911932630700203, + "learning_rate": 3.831784158153331e-06, + "loss": 0.8043, + "step": 4470 + }, + { + "epoch": 0.72, + "grad_norm": 1.588859496540571, + "learning_rate": 3.827676746328256e-06, + "loss": 0.7905, + "step": 4471 + }, + { + "epoch": 0.72, + "grad_norm": 1.8065331349743698, + "learning_rate": 3.823571016014176e-06, + "loss": 0.7978, + "step": 4472 + }, + { + "epoch": 0.72, + "grad_norm": 1.3069954555703998, + "learning_rate": 3.819466968329613e-06, + "loss": 0.8102, + "step": 4473 + }, + { + "epoch": 0.72, + "grad_norm": 1.4073311775442594, + "learning_rate": 3.815364604392607e-06, + "loss": 0.8363, + "step": 4474 + }, + { + "epoch": 0.72, + "grad_norm": 1.4183997492295515, + "learning_rate": 3.811263925320765e-06, + "loss": 0.7498, + "step": 4475 + }, + { + "epoch": 0.72, + "grad_norm": 1.381584869194829, + "learning_rate": 3.8071649322312256e-06, + "loss": 0.8316, + "step": 4476 + }, + { + "epoch": 0.72, + "grad_norm": 1.3462547575136914, + "learning_rate": 3.803067626240665e-06, + "loss": 0.7858, + "step": 4477 + }, + { + "epoch": 0.72, + "grad_norm": 1.4051069216657956, + "learning_rate": 3.7989720084653003e-06, + "loss": 0.8219, + "step": 4478 + }, + { + "epoch": 0.72, + "grad_norm": 1.427378322741663, + "learning_rate": 3.7948780800208916e-06, + "loss": 0.7915, + "step": 4479 + }, + { + "epoch": 0.72, + "grad_norm": 1.494173768243142, + "learning_rate": 3.790785842022746e-06, + "loss": 0.7498, + "step": 4480 + }, + { + "epoch": 0.72, + "grad_norm": 1.3285230152403515, + "learning_rate": 3.7866952955856895e-06, + "loss": 0.8455, + "step": 4481 + }, + { + "epoch": 0.72, + "grad_norm": 1.3673930752694237, + "learning_rate": 3.7826064418241037e-06, + "loss": 0.9068, + "step": 4482 + }, + { + "epoch": 0.72, + "grad_norm": 1.2136249684188323, + "learning_rate": 3.7785192818519113e-06, + "loss": 0.8506, + "step": 4483 + }, + { + "epoch": 0.72, + "grad_norm": 1.5244707577198364, + "learning_rate": 3.774433816782561e-06, + "loss": 0.8534, + "step": 4484 + }, + { + "epoch": 0.72, + "grad_norm": 1.5745068705303786, + "learning_rate": 3.7703500477290456e-06, + "loss": 0.7814, + "step": 4485 + }, + { + "epoch": 0.72, + "grad_norm": 1.592223918711669, + "learning_rate": 3.7662679758039023e-06, + "loss": 0.9493, + "step": 4486 + }, + { + "epoch": 0.72, + "grad_norm": 1.5047488657350458, + "learning_rate": 3.762187602119192e-06, + "loss": 0.9013, + "step": 4487 + }, + { + "epoch": 0.72, + "grad_norm": 1.422928141075951, + "learning_rate": 3.758108927786528e-06, + "loss": 0.8844, + "step": 4488 + }, + { + "epoch": 0.72, + "grad_norm": 1.4594127608280405, + "learning_rate": 3.7540319539170522e-06, + "loss": 0.8185, + "step": 4489 + }, + { + "epoch": 0.72, + "grad_norm": 1.283019489636463, + "learning_rate": 3.7499566816214384e-06, + "loss": 0.8206, + "step": 4490 + }, + { + "epoch": 0.72, + "grad_norm": 1.4077586644524112, + "learning_rate": 3.7458831120099126e-06, + "loss": 0.8837, + "step": 4491 + }, + { + "epoch": 0.72, + "grad_norm": 1.4688753948936477, + "learning_rate": 3.741811246192223e-06, + "loss": 0.8671, + "step": 4492 + }, + { + "epoch": 0.72, + "grad_norm": 1.3062604438755507, + "learning_rate": 3.737741085277653e-06, + "loss": 0.832, + "step": 4493 + }, + { + "epoch": 0.72, + "grad_norm": 1.3191198677494795, + "learning_rate": 3.7336726303750327e-06, + "loss": 0.8666, + "step": 4494 + }, + { + "epoch": 0.72, + "grad_norm": 1.3160726229886406, + "learning_rate": 3.729605882592724e-06, + "loss": 0.8172, + "step": 4495 + }, + { + "epoch": 0.72, + "grad_norm": 1.3893386799854455, + "learning_rate": 3.7255408430386164e-06, + "loss": 0.9093, + "step": 4496 + }, + { + "epoch": 0.72, + "grad_norm": 1.6660702026747376, + "learning_rate": 3.7214775128201363e-06, + "loss": 0.7978, + "step": 4497 + }, + { + "epoch": 0.72, + "grad_norm": 0.8523426572511208, + "learning_rate": 3.717415893044254e-06, + "loss": 0.3175, + "step": 4498 + }, + { + "epoch": 0.72, + "grad_norm": 1.478682894674482, + "learning_rate": 3.713355984817463e-06, + "loss": 0.756, + "step": 4499 + }, + { + "epoch": 0.73, + "grad_norm": 1.3604547005555878, + "learning_rate": 3.7092977892457905e-06, + "loss": 0.7553, + "step": 4500 + }, + { + "epoch": 0.73, + "grad_norm": 1.3451571055792968, + "learning_rate": 3.7052413074348038e-06, + "loss": 0.8328, + "step": 4501 + }, + { + "epoch": 0.73, + "grad_norm": 1.3022451064624914, + "learning_rate": 3.7011865404896085e-06, + "loss": 0.8438, + "step": 4502 + }, + { + "epoch": 0.73, + "grad_norm": 0.8326903282664518, + "learning_rate": 3.6971334895148202e-06, + "loss": 0.323, + "step": 4503 + }, + { + "epoch": 0.73, + "grad_norm": 1.192988267359808, + "learning_rate": 3.6930821556146092e-06, + "loss": 0.8123, + "step": 4504 + }, + { + "epoch": 0.73, + "grad_norm": 1.3951098419854824, + "learning_rate": 3.689032539892673e-06, + "loss": 0.8104, + "step": 4505 + }, + { + "epoch": 0.73, + "grad_norm": 1.4705445329487081, + "learning_rate": 3.684984643452236e-06, + "loss": 0.8273, + "step": 4506 + }, + { + "epoch": 0.73, + "grad_norm": 0.8865627669422081, + "learning_rate": 3.6809384673960545e-06, + "loss": 0.3551, + "step": 4507 + }, + { + "epoch": 0.73, + "grad_norm": 1.314786424509152, + "learning_rate": 3.676894012826422e-06, + "loss": 0.8539, + "step": 4508 + }, + { + "epoch": 0.73, + "grad_norm": 1.4987192037801202, + "learning_rate": 3.6728512808451554e-06, + "loss": 0.8732, + "step": 4509 + }, + { + "epoch": 0.73, + "grad_norm": 1.4703414023818968, + "learning_rate": 3.668810272553612e-06, + "loss": 0.8551, + "step": 4510 + }, + { + "epoch": 0.73, + "grad_norm": 1.4844069358251264, + "learning_rate": 3.6647709890526708e-06, + "loss": 0.7939, + "step": 4511 + }, + { + "epoch": 0.73, + "grad_norm": 1.470149797985317, + "learning_rate": 3.66073343144274e-06, + "loss": 0.8611, + "step": 4512 + }, + { + "epoch": 0.73, + "grad_norm": 1.6220625981119294, + "learning_rate": 3.65669760082377e-06, + "loss": 0.8424, + "step": 4513 + }, + { + "epoch": 0.73, + "grad_norm": 1.4752904695871116, + "learning_rate": 3.6526634982952225e-06, + "loss": 0.774, + "step": 4514 + }, + { + "epoch": 0.73, + "grad_norm": 1.4321083603226317, + "learning_rate": 3.648631124956108e-06, + "loss": 0.8124, + "step": 4515 + }, + { + "epoch": 0.73, + "grad_norm": 1.8698157084910847, + "learning_rate": 3.6446004819049473e-06, + "loss": 0.9217, + "step": 4516 + }, + { + "epoch": 0.73, + "grad_norm": 1.5719660360999446, + "learning_rate": 3.640571570239807e-06, + "loss": 0.7643, + "step": 4517 + }, + { + "epoch": 0.73, + "grad_norm": 1.4693490484540859, + "learning_rate": 3.63654439105827e-06, + "loss": 0.7788, + "step": 4518 + }, + { + "epoch": 0.73, + "grad_norm": 1.3287259362184631, + "learning_rate": 3.632518945457446e-06, + "loss": 0.8596, + "step": 4519 + }, + { + "epoch": 0.73, + "grad_norm": 2.0685690862662183, + "learning_rate": 3.6284952345339864e-06, + "loss": 0.8181, + "step": 4520 + }, + { + "epoch": 0.73, + "grad_norm": 1.3796398572590949, + "learning_rate": 3.624473259384056e-06, + "loss": 0.7646, + "step": 4521 + }, + { + "epoch": 0.73, + "grad_norm": 1.5895217515535458, + "learning_rate": 3.6204530211033482e-06, + "loss": 0.8225, + "step": 4522 + }, + { + "epoch": 0.73, + "grad_norm": 1.2936802992151082, + "learning_rate": 3.616434520787091e-06, + "loss": 0.7523, + "step": 4523 + }, + { + "epoch": 0.73, + "grad_norm": 1.3218001471850482, + "learning_rate": 3.6124177595300415e-06, + "loss": 0.8659, + "step": 4524 + }, + { + "epoch": 0.73, + "grad_norm": 1.205757299044214, + "learning_rate": 3.608402738426462e-06, + "loss": 0.7237, + "step": 4525 + }, + { + "epoch": 0.73, + "grad_norm": 1.708876816225869, + "learning_rate": 3.6043894585701623e-06, + "loss": 0.836, + "step": 4526 + }, + { + "epoch": 0.73, + "grad_norm": 1.5839417936337774, + "learning_rate": 3.6003779210544733e-06, + "loss": 0.8183, + "step": 4527 + }, + { + "epoch": 0.73, + "grad_norm": 0.8846497180840347, + "learning_rate": 3.596368126972245e-06, + "loss": 0.3329, + "step": 4528 + }, + { + "epoch": 0.73, + "grad_norm": 1.4422604083409127, + "learning_rate": 3.592360077415853e-06, + "loss": 0.7988, + "step": 4529 + }, + { + "epoch": 0.73, + "grad_norm": 1.472374270583407, + "learning_rate": 3.588353773477208e-06, + "loss": 0.8148, + "step": 4530 + }, + { + "epoch": 0.73, + "grad_norm": 1.4220890575714356, + "learning_rate": 3.5843492162477312e-06, + "loss": 0.8144, + "step": 4531 + }, + { + "epoch": 0.73, + "grad_norm": 1.3744379426931526, + "learning_rate": 3.5803464068183734e-06, + "loss": 0.8854, + "step": 4532 + }, + { + "epoch": 0.73, + "grad_norm": 1.161171529106772, + "learning_rate": 3.576345346279614e-06, + "loss": 0.8274, + "step": 4533 + }, + { + "epoch": 0.73, + "grad_norm": 1.1475818034696188, + "learning_rate": 3.5723460357214547e-06, + "loss": 0.8043, + "step": 4534 + }, + { + "epoch": 0.73, + "grad_norm": 1.3847066230471472, + "learning_rate": 3.568348476233414e-06, + "loss": 0.8396, + "step": 4535 + }, + { + "epoch": 0.73, + "grad_norm": 0.8590872022588721, + "learning_rate": 3.564352668904535e-06, + "loss": 0.3347, + "step": 4536 + }, + { + "epoch": 0.73, + "grad_norm": 1.335976897043588, + "learning_rate": 3.5603586148233917e-06, + "loss": 0.8158, + "step": 4537 + }, + { + "epoch": 0.73, + "grad_norm": 1.1876977925211192, + "learning_rate": 3.556366315078068e-06, + "loss": 0.8106, + "step": 4538 + }, + { + "epoch": 0.73, + "grad_norm": 1.7558165132342765, + "learning_rate": 3.5523757707561836e-06, + "loss": 0.8079, + "step": 4539 + }, + { + "epoch": 0.73, + "grad_norm": 1.294637040993705, + "learning_rate": 3.548386982944868e-06, + "loss": 0.7753, + "step": 4540 + }, + { + "epoch": 0.73, + "grad_norm": 1.3278888502965889, + "learning_rate": 3.544399952730776e-06, + "loss": 0.8824, + "step": 4541 + }, + { + "epoch": 0.73, + "grad_norm": 1.3314996033151159, + "learning_rate": 3.5404146812000893e-06, + "loss": 0.8609, + "step": 4542 + }, + { + "epoch": 0.73, + "grad_norm": 1.2715993813099422, + "learning_rate": 3.536431169438502e-06, + "loss": 0.7481, + "step": 4543 + }, + { + "epoch": 0.73, + "grad_norm": 1.3580886671360501, + "learning_rate": 3.5324494185312317e-06, + "loss": 0.8293, + "step": 4544 + }, + { + "epoch": 0.73, + "grad_norm": 1.7015666525503723, + "learning_rate": 3.5284694295630183e-06, + "loss": 0.8196, + "step": 4545 + }, + { + "epoch": 0.73, + "grad_norm": 1.3454876887831788, + "learning_rate": 3.5244912036181276e-06, + "loss": 0.7904, + "step": 4546 + }, + { + "epoch": 0.73, + "grad_norm": 1.5431661256265312, + "learning_rate": 3.5205147417803253e-06, + "loss": 0.7576, + "step": 4547 + }, + { + "epoch": 0.73, + "grad_norm": 1.4191286988496183, + "learning_rate": 3.5165400451329147e-06, + "loss": 0.7237, + "step": 4548 + }, + { + "epoch": 0.73, + "grad_norm": 1.2165767235546667, + "learning_rate": 3.512567114758717e-06, + "loss": 0.8291, + "step": 4549 + }, + { + "epoch": 0.73, + "grad_norm": 1.3471825644852433, + "learning_rate": 3.5085959517400645e-06, + "loss": 0.8174, + "step": 4550 + }, + { + "epoch": 0.73, + "grad_norm": 1.6173246205749865, + "learning_rate": 3.504626557158808e-06, + "loss": 0.9089, + "step": 4551 + }, + { + "epoch": 0.73, + "grad_norm": 1.414567089414755, + "learning_rate": 3.500658932096327e-06, + "loss": 0.8711, + "step": 4552 + }, + { + "epoch": 0.73, + "grad_norm": 1.3942766464714509, + "learning_rate": 3.4966930776335083e-06, + "loss": 0.7604, + "step": 4553 + }, + { + "epoch": 0.73, + "grad_norm": 1.5862658787876505, + "learning_rate": 3.492728994850756e-06, + "loss": 0.8437, + "step": 4554 + }, + { + "epoch": 0.73, + "grad_norm": 1.2819101272914573, + "learning_rate": 3.488766684828e-06, + "loss": 0.6886, + "step": 4555 + }, + { + "epoch": 0.73, + "grad_norm": 1.3757993091208154, + "learning_rate": 3.4848061486446848e-06, + "loss": 0.8845, + "step": 4556 + }, + { + "epoch": 0.73, + "grad_norm": 1.3169207925692148, + "learning_rate": 3.4808473873797675e-06, + "loss": 0.8157, + "step": 4557 + }, + { + "epoch": 0.73, + "grad_norm": 1.4490506770052904, + "learning_rate": 3.476890402111721e-06, + "loss": 0.8317, + "step": 4558 + }, + { + "epoch": 0.73, + "grad_norm": 1.4116023229659287, + "learning_rate": 3.472935193918542e-06, + "loss": 0.8157, + "step": 4559 + }, + { + "epoch": 0.73, + "grad_norm": 1.3167486368112151, + "learning_rate": 3.4689817638777355e-06, + "loss": 0.8077, + "step": 4560 + }, + { + "epoch": 0.73, + "grad_norm": 1.4326459463102414, + "learning_rate": 3.4650301130663224e-06, + "loss": 0.8216, + "step": 4561 + }, + { + "epoch": 0.74, + "grad_norm": 1.3305761221434707, + "learning_rate": 3.461080242560847e-06, + "loss": 0.7662, + "step": 4562 + }, + { + "epoch": 0.74, + "grad_norm": 1.3511201907912784, + "learning_rate": 3.4571321534373557e-06, + "loss": 0.8214, + "step": 4563 + }, + { + "epoch": 0.74, + "grad_norm": 1.1735433298064246, + "learning_rate": 3.453185846771425e-06, + "loss": 0.8022, + "step": 4564 + }, + { + "epoch": 0.74, + "grad_norm": 1.3616895332747592, + "learning_rate": 3.449241323638135e-06, + "loss": 0.8511, + "step": 4565 + }, + { + "epoch": 0.74, + "grad_norm": 0.9705964966280717, + "learning_rate": 3.445298585112077e-06, + "loss": 0.3074, + "step": 4566 + }, + { + "epoch": 0.74, + "grad_norm": 1.3728402129417934, + "learning_rate": 3.441357632267365e-06, + "loss": 0.7986, + "step": 4567 + }, + { + "epoch": 0.74, + "grad_norm": 1.5934863065207023, + "learning_rate": 3.437418466177631e-06, + "loss": 0.8624, + "step": 4568 + }, + { + "epoch": 0.74, + "grad_norm": 1.0172151807894863, + "learning_rate": 3.4334810879159987e-06, + "loss": 0.3119, + "step": 4569 + }, + { + "epoch": 0.74, + "grad_norm": 1.2824732818374256, + "learning_rate": 3.429545498555126e-06, + "loss": 0.8599, + "step": 4570 + }, + { + "epoch": 0.74, + "grad_norm": 0.8087818000797187, + "learning_rate": 3.4256116991671773e-06, + "loss": 0.3331, + "step": 4571 + }, + { + "epoch": 0.74, + "grad_norm": 1.4284111016003391, + "learning_rate": 3.421679690823827e-06, + "loss": 0.9121, + "step": 4572 + }, + { + "epoch": 0.74, + "grad_norm": 1.394947740458682, + "learning_rate": 3.417749474596257e-06, + "loss": 0.8109, + "step": 4573 + }, + { + "epoch": 0.74, + "grad_norm": 1.4927535044631843, + "learning_rate": 3.4138210515551717e-06, + "loss": 0.8471, + "step": 4574 + }, + { + "epoch": 0.74, + "grad_norm": 1.4700875374449653, + "learning_rate": 3.4098944227707873e-06, + "loss": 0.8439, + "step": 4575 + }, + { + "epoch": 0.74, + "grad_norm": 1.3124245539620338, + "learning_rate": 3.4059695893128133e-06, + "loss": 0.7606, + "step": 4576 + }, + { + "epoch": 0.74, + "grad_norm": 2.046113454040001, + "learning_rate": 3.4020465522504876e-06, + "loss": 0.8573, + "step": 4577 + }, + { + "epoch": 0.74, + "grad_norm": 1.4425325593587695, + "learning_rate": 3.3981253126525593e-06, + "loss": 0.8124, + "step": 4578 + }, + { + "epoch": 0.74, + "grad_norm": 1.3061538938709223, + "learning_rate": 3.394205871587277e-06, + "loss": 0.9021, + "step": 4579 + }, + { + "epoch": 0.74, + "grad_norm": 1.3968656672948951, + "learning_rate": 3.3902882301224016e-06, + "loss": 0.7762, + "step": 4580 + }, + { + "epoch": 0.74, + "grad_norm": 1.4402304327269095, + "learning_rate": 3.386372389325213e-06, + "loss": 0.7976, + "step": 4581 + }, + { + "epoch": 0.74, + "grad_norm": 1.2529998998971583, + "learning_rate": 3.382458350262493e-06, + "loss": 0.8118, + "step": 4582 + }, + { + "epoch": 0.74, + "grad_norm": 1.267057599225208, + "learning_rate": 3.378546114000527e-06, + "loss": 0.7101, + "step": 4583 + }, + { + "epoch": 0.74, + "grad_norm": 1.283348846725599, + "learning_rate": 3.374635681605125e-06, + "loss": 0.7633, + "step": 4584 + }, + { + "epoch": 0.74, + "grad_norm": 1.3763833026881758, + "learning_rate": 3.3707270541415895e-06, + "loss": 0.8306, + "step": 4585 + }, + { + "epoch": 0.74, + "grad_norm": 1.3870988351802296, + "learning_rate": 3.3668202326747433e-06, + "loss": 0.7298, + "step": 4586 + }, + { + "epoch": 0.74, + "grad_norm": 1.4636531429043995, + "learning_rate": 3.3629152182689117e-06, + "loss": 0.8473, + "step": 4587 + }, + { + "epoch": 0.74, + "grad_norm": 1.2662382020093548, + "learning_rate": 3.3590120119879233e-06, + "loss": 0.8124, + "step": 4588 + }, + { + "epoch": 0.74, + "grad_norm": 1.3072294567765188, + "learning_rate": 3.3551106148951262e-06, + "loss": 0.8475, + "step": 4589 + }, + { + "epoch": 0.74, + "grad_norm": 1.442508742375798, + "learning_rate": 3.351211028053365e-06, + "loss": 0.9055, + "step": 4590 + }, + { + "epoch": 0.74, + "grad_norm": 1.2960453014907136, + "learning_rate": 3.3473132525249918e-06, + "loss": 0.899, + "step": 4591 + }, + { + "epoch": 0.74, + "grad_norm": 1.3906475300383587, + "learning_rate": 3.3434172893718707e-06, + "loss": 0.8027, + "step": 4592 + }, + { + "epoch": 0.74, + "grad_norm": 1.5760171429057548, + "learning_rate": 3.339523139655373e-06, + "loss": 0.9177, + "step": 4593 + }, + { + "epoch": 0.74, + "grad_norm": 1.269209711731106, + "learning_rate": 3.3356308044363683e-06, + "loss": 0.7466, + "step": 4594 + }, + { + "epoch": 0.74, + "grad_norm": 1.2789226031816263, + "learning_rate": 3.3317402847752344e-06, + "loss": 0.7593, + "step": 4595 + }, + { + "epoch": 0.74, + "grad_norm": 1.3426214513594168, + "learning_rate": 3.327851581731859e-06, + "loss": 0.8698, + "step": 4596 + }, + { + "epoch": 0.74, + "grad_norm": 1.4033046215828673, + "learning_rate": 3.323964696365638e-06, + "loss": 0.835, + "step": 4597 + }, + { + "epoch": 0.74, + "grad_norm": 1.7531404692601436, + "learning_rate": 3.320079629735452e-06, + "loss": 0.8819, + "step": 4598 + }, + { + "epoch": 0.74, + "grad_norm": 1.2343122264135091, + "learning_rate": 3.316196382899709e-06, + "loss": 0.8291, + "step": 4599 + }, + { + "epoch": 0.74, + "grad_norm": 1.6588369653195205, + "learning_rate": 3.312314956916315e-06, + "loss": 0.8245, + "step": 4600 + }, + { + "epoch": 0.74, + "grad_norm": 1.3280495776470922, + "learning_rate": 3.3084353528426727e-06, + "loss": 0.8668, + "step": 4601 + }, + { + "epoch": 0.74, + "grad_norm": 1.1922568419490234, + "learning_rate": 3.3045575717356926e-06, + "loss": 0.8691, + "step": 4602 + }, + { + "epoch": 0.74, + "grad_norm": 1.3327769509244611, + "learning_rate": 3.3006816146517927e-06, + "loss": 0.8104, + "step": 4603 + }, + { + "epoch": 0.74, + "grad_norm": 1.5039261123039187, + "learning_rate": 3.29680748264689e-06, + "loss": 0.8354, + "step": 4604 + }, + { + "epoch": 0.74, + "grad_norm": 1.4390163328280725, + "learning_rate": 3.2929351767763997e-06, + "loss": 0.7603, + "step": 4605 + }, + { + "epoch": 0.74, + "grad_norm": 1.3093446171542662, + "learning_rate": 3.289064698095251e-06, + "loss": 0.8822, + "step": 4606 + }, + { + "epoch": 0.74, + "grad_norm": 1.2121243759306402, + "learning_rate": 3.2851960476578647e-06, + "loss": 0.8336, + "step": 4607 + }, + { + "epoch": 0.74, + "grad_norm": 1.3479061060948208, + "learning_rate": 3.281329226518173e-06, + "loss": 0.8563, + "step": 4608 + }, + { + "epoch": 0.74, + "grad_norm": 1.319931036535498, + "learning_rate": 3.2774642357296006e-06, + "loss": 0.9039, + "step": 4609 + }, + { + "epoch": 0.74, + "grad_norm": 1.5990038192494218, + "learning_rate": 3.2736010763450744e-06, + "loss": 0.8383, + "step": 4610 + }, + { + "epoch": 0.74, + "grad_norm": 1.2487795250856075, + "learning_rate": 3.2697397494170336e-06, + "loss": 0.8858, + "step": 4611 + }, + { + "epoch": 0.74, + "grad_norm": 1.3727514626108848, + "learning_rate": 3.2658802559974046e-06, + "loss": 0.7983, + "step": 4612 + }, + { + "epoch": 0.74, + "grad_norm": 1.6093886812332248, + "learning_rate": 3.2620225971376187e-06, + "loss": 0.7997, + "step": 4613 + }, + { + "epoch": 0.74, + "grad_norm": 1.4311652526221224, + "learning_rate": 3.25816677388861e-06, + "loss": 0.8374, + "step": 4614 + }, + { + "epoch": 0.74, + "grad_norm": 1.480377833611945, + "learning_rate": 3.2543127873008164e-06, + "loss": 0.8615, + "step": 4615 + }, + { + "epoch": 0.74, + "grad_norm": 1.7145031751574502, + "learning_rate": 3.2504606384241642e-06, + "loss": 0.8197, + "step": 4616 + }, + { + "epoch": 0.74, + "grad_norm": 1.4812887752815658, + "learning_rate": 3.246610328308083e-06, + "loss": 0.8922, + "step": 4617 + }, + { + "epoch": 0.74, + "grad_norm": 1.3992676335543859, + "learning_rate": 3.2427618580015107e-06, + "loss": 0.8049, + "step": 4618 + }, + { + "epoch": 0.74, + "grad_norm": 1.3787676502468011, + "learning_rate": 3.2389152285528726e-06, + "loss": 0.8205, + "step": 4619 + }, + { + "epoch": 0.74, + "grad_norm": 1.251255421970912, + "learning_rate": 3.235070441010092e-06, + "loss": 0.7366, + "step": 4620 + }, + { + "epoch": 0.74, + "grad_norm": 1.4918743929733893, + "learning_rate": 3.2312274964206013e-06, + "loss": 0.8301, + "step": 4621 + }, + { + "epoch": 0.74, + "grad_norm": 1.4498453366587167, + "learning_rate": 3.2273863958313257e-06, + "loss": 0.8271, + "step": 4622 + }, + { + "epoch": 0.74, + "grad_norm": 1.4591424695500785, + "learning_rate": 3.2235471402886833e-06, + "loss": 0.778, + "step": 4623 + }, + { + "epoch": 0.75, + "grad_norm": 1.5772348191311392, + "learning_rate": 3.2197097308385916e-06, + "loss": 0.7478, + "step": 4624 + }, + { + "epoch": 0.75, + "grad_norm": 1.2070137646868462, + "learning_rate": 3.2158741685264715e-06, + "loss": 0.8594, + "step": 4625 + }, + { + "epoch": 0.75, + "grad_norm": 1.3933171559042201, + "learning_rate": 3.212040454397234e-06, + "loss": 0.7904, + "step": 4626 + }, + { + "epoch": 0.75, + "grad_norm": 1.1952114779349141, + "learning_rate": 3.208208589495284e-06, + "loss": 0.8301, + "step": 4627 + }, + { + "epoch": 0.75, + "grad_norm": 0.9752023943417202, + "learning_rate": 3.204378574864535e-06, + "loss": 0.3427, + "step": 4628 + }, + { + "epoch": 0.75, + "grad_norm": 1.505884330508093, + "learning_rate": 3.200550411548381e-06, + "loss": 0.8588, + "step": 4629 + }, + { + "epoch": 0.75, + "grad_norm": 1.4492764493607064, + "learning_rate": 3.1967241005897264e-06, + "loss": 0.7862, + "step": 4630 + }, + { + "epoch": 0.75, + "grad_norm": 1.3536520320859997, + "learning_rate": 3.19289964303096e-06, + "loss": 0.87, + "step": 4631 + }, + { + "epoch": 0.75, + "grad_norm": 1.709930752978526, + "learning_rate": 3.189077039913967e-06, + "loss": 0.8146, + "step": 4632 + }, + { + "epoch": 0.75, + "grad_norm": 1.2768763524056255, + "learning_rate": 3.1852562922801346e-06, + "loss": 0.8036, + "step": 4633 + }, + { + "epoch": 0.75, + "grad_norm": 1.555682858690599, + "learning_rate": 3.181437401170335e-06, + "loss": 0.8123, + "step": 4634 + }, + { + "epoch": 0.75, + "grad_norm": 0.7700029491121926, + "learning_rate": 3.177620367624946e-06, + "loss": 0.3377, + "step": 4635 + }, + { + "epoch": 0.75, + "grad_norm": 1.6109098874231431, + "learning_rate": 3.1738051926838243e-06, + "loss": 0.7974, + "step": 4636 + }, + { + "epoch": 0.75, + "grad_norm": 1.2557265543669884, + "learning_rate": 3.169991877386338e-06, + "loss": 0.8346, + "step": 4637 + }, + { + "epoch": 0.75, + "grad_norm": 1.4542267529644586, + "learning_rate": 3.1661804227713334e-06, + "loss": 0.7901, + "step": 4638 + }, + { + "epoch": 0.75, + "grad_norm": 1.6090037064572964, + "learning_rate": 3.162370829877154e-06, + "loss": 0.91, + "step": 4639 + }, + { + "epoch": 0.75, + "grad_norm": 1.274934944311655, + "learning_rate": 3.158563099741644e-06, + "loss": 0.8863, + "step": 4640 + }, + { + "epoch": 0.75, + "grad_norm": 1.3647634460765492, + "learning_rate": 3.1547572334021315e-06, + "loss": 0.865, + "step": 4641 + }, + { + "epoch": 0.75, + "grad_norm": 1.2058996602858776, + "learning_rate": 3.150953231895435e-06, + "loss": 0.785, + "step": 4642 + }, + { + "epoch": 0.75, + "grad_norm": 1.4880652940567194, + "learning_rate": 3.1471510962578743e-06, + "loss": 0.7924, + "step": 4643 + }, + { + "epoch": 0.75, + "grad_norm": 0.8404726191235011, + "learning_rate": 3.143350827525257e-06, + "loss": 0.3487, + "step": 4644 + }, + { + "epoch": 0.75, + "grad_norm": 1.5028895663383457, + "learning_rate": 3.139552426732879e-06, + "loss": 0.8436, + "step": 4645 + }, + { + "epoch": 0.75, + "grad_norm": 1.326259575024106, + "learning_rate": 3.1357558949155266e-06, + "loss": 0.8326, + "step": 4646 + }, + { + "epoch": 0.75, + "grad_norm": 1.6337339029389253, + "learning_rate": 3.1319612331074856e-06, + "loss": 0.841, + "step": 4647 + }, + { + "epoch": 0.75, + "grad_norm": 0.8376977060434554, + "learning_rate": 3.128168442342523e-06, + "loss": 0.3272, + "step": 4648 + }, + { + "epoch": 0.75, + "grad_norm": 1.2234325713177738, + "learning_rate": 3.1243775236538963e-06, + "loss": 0.8344, + "step": 4649 + }, + { + "epoch": 0.75, + "grad_norm": 1.4849761993966566, + "learning_rate": 3.1205884780743633e-06, + "loss": 0.7679, + "step": 4650 + }, + { + "epoch": 0.75, + "grad_norm": 1.409717975148358, + "learning_rate": 3.116801306636158e-06, + "loss": 0.8178, + "step": 4651 + }, + { + "epoch": 0.75, + "grad_norm": 1.6829071780253515, + "learning_rate": 3.113016010371016e-06, + "loss": 0.8338, + "step": 4652 + }, + { + "epoch": 0.75, + "grad_norm": 1.3574278330500635, + "learning_rate": 3.1092325903101518e-06, + "loss": 0.8474, + "step": 4653 + }, + { + "epoch": 0.75, + "grad_norm": 1.7394067196124647, + "learning_rate": 3.105451047484277e-06, + "loss": 0.8034, + "step": 4654 + }, + { + "epoch": 0.75, + "grad_norm": 1.550439496662501, + "learning_rate": 3.1016713829235866e-06, + "loss": 0.8021, + "step": 4655 + }, + { + "epoch": 0.75, + "grad_norm": 0.8544490547193098, + "learning_rate": 3.0978935976577617e-06, + "loss": 0.3281, + "step": 4656 + }, + { + "epoch": 0.75, + "grad_norm": 1.4015045524023768, + "learning_rate": 3.0941176927159812e-06, + "loss": 0.8779, + "step": 4657 + }, + { + "epoch": 0.75, + "grad_norm": 1.248961740182372, + "learning_rate": 3.0903436691269006e-06, + "loss": 0.7945, + "step": 4658 + }, + { + "epoch": 0.75, + "grad_norm": 1.6171210132513463, + "learning_rate": 3.0865715279186724e-06, + "loss": 0.7699, + "step": 4659 + }, + { + "epoch": 0.75, + "grad_norm": 1.3981970844782021, + "learning_rate": 3.0828012701189316e-06, + "loss": 0.8269, + "step": 4660 + }, + { + "epoch": 0.75, + "grad_norm": 1.6148611015657328, + "learning_rate": 3.079032896754793e-06, + "loss": 0.8701, + "step": 4661 + }, + { + "epoch": 0.75, + "grad_norm": 1.585843536806766, + "learning_rate": 3.075266408852876e-06, + "loss": 0.8013, + "step": 4662 + }, + { + "epoch": 0.75, + "grad_norm": 1.3893948971771075, + "learning_rate": 3.07150180743927e-06, + "loss": 0.7804, + "step": 4663 + }, + { + "epoch": 0.75, + "grad_norm": 1.6279533993207522, + "learning_rate": 3.0677390935395533e-06, + "loss": 0.8204, + "step": 4664 + }, + { + "epoch": 0.75, + "grad_norm": 1.6195709964541622, + "learning_rate": 3.063978268178798e-06, + "loss": 0.8633, + "step": 4665 + }, + { + "epoch": 0.75, + "grad_norm": 1.3495268361733943, + "learning_rate": 3.0602193323815563e-06, + "loss": 0.7675, + "step": 4666 + }, + { + "epoch": 0.75, + "grad_norm": 1.3265433402188649, + "learning_rate": 3.056462287171865e-06, + "loss": 0.7991, + "step": 4667 + }, + { + "epoch": 0.75, + "grad_norm": 1.538274529414804, + "learning_rate": 3.052707133573244e-06, + "loss": 0.8395, + "step": 4668 + }, + { + "epoch": 0.75, + "grad_norm": 1.503590284014218, + "learning_rate": 3.0489538726087053e-06, + "loss": 0.8402, + "step": 4669 + }, + { + "epoch": 0.75, + "grad_norm": 1.337043396494602, + "learning_rate": 3.0452025053007396e-06, + "loss": 0.8318, + "step": 4670 + }, + { + "epoch": 0.75, + "grad_norm": 1.3908823420129273, + "learning_rate": 3.0414530326713176e-06, + "loss": 0.816, + "step": 4671 + }, + { + "epoch": 0.75, + "grad_norm": 1.5278305451144205, + "learning_rate": 3.0377054557419028e-06, + "loss": 0.8631, + "step": 4672 + }, + { + "epoch": 0.75, + "grad_norm": 0.8858852060718336, + "learning_rate": 3.0339597755334414e-06, + "loss": 0.3367, + "step": 4673 + }, + { + "epoch": 0.75, + "grad_norm": 1.5073235835848535, + "learning_rate": 3.0302159930663575e-06, + "loss": 0.8395, + "step": 4674 + }, + { + "epoch": 0.75, + "grad_norm": 1.2255792625215078, + "learning_rate": 3.026474109360559e-06, + "loss": 0.8122, + "step": 4675 + }, + { + "epoch": 0.75, + "grad_norm": 1.395127827997497, + "learning_rate": 3.0227341254354405e-06, + "loss": 0.8483, + "step": 4676 + }, + { + "epoch": 0.75, + "grad_norm": 1.3967492405927933, + "learning_rate": 3.0189960423098765e-06, + "loss": 0.8411, + "step": 4677 + }, + { + "epoch": 0.75, + "grad_norm": 1.8058305842031075, + "learning_rate": 3.015259861002219e-06, + "loss": 0.8427, + "step": 4678 + }, + { + "epoch": 0.75, + "grad_norm": 1.5646683107989443, + "learning_rate": 3.0115255825303148e-06, + "loss": 0.8319, + "step": 4679 + }, + { + "epoch": 0.75, + "grad_norm": 1.2708654072709185, + "learning_rate": 3.0077932079114768e-06, + "loss": 0.8519, + "step": 4680 + }, + { + "epoch": 0.75, + "grad_norm": 1.579503433481849, + "learning_rate": 3.004062738162514e-06, + "loss": 0.7811, + "step": 4681 + }, + { + "epoch": 0.75, + "grad_norm": 1.6116411714193701, + "learning_rate": 3.000334174299705e-06, + "loss": 0.8498, + "step": 4682 + }, + { + "epoch": 0.75, + "grad_norm": 1.252487935769062, + "learning_rate": 2.996607517338811e-06, + "loss": 0.7534, + "step": 4683 + }, + { + "epoch": 0.75, + "grad_norm": 1.284323911219279, + "learning_rate": 2.9928827682950825e-06, + "loss": 0.8281, + "step": 4684 + }, + { + "epoch": 0.75, + "grad_norm": 1.5534638017082685, + "learning_rate": 2.9891599281832395e-06, + "loss": 0.85, + "step": 4685 + }, + { + "epoch": 0.76, + "grad_norm": 1.6245453157152632, + "learning_rate": 2.9854389980174837e-06, + "loss": 0.8166, + "step": 4686 + }, + { + "epoch": 0.76, + "grad_norm": 1.3335684069117697, + "learning_rate": 2.9817199788115025e-06, + "loss": 0.8264, + "step": 4687 + }, + { + "epoch": 0.76, + "grad_norm": 1.3994176475920879, + "learning_rate": 2.978002871578466e-06, + "loss": 0.8092, + "step": 4688 + }, + { + "epoch": 0.76, + "grad_norm": 1.5112244909411698, + "learning_rate": 2.9742876773310037e-06, + "loss": 0.8667, + "step": 4689 + }, + { + "epoch": 0.76, + "grad_norm": 1.343841591197542, + "learning_rate": 2.9705743970812416e-06, + "loss": 0.8188, + "step": 4690 + }, + { + "epoch": 0.76, + "grad_norm": 1.7326316432330489, + "learning_rate": 2.9668630318407854e-06, + "loss": 0.8499, + "step": 4691 + }, + { + "epoch": 0.76, + "grad_norm": 1.5315243319565888, + "learning_rate": 2.963153582620708e-06, + "loss": 0.7917, + "step": 4692 + }, + { + "epoch": 0.76, + "grad_norm": 1.2618766492473392, + "learning_rate": 2.9594460504315637e-06, + "loss": 0.8141, + "step": 4693 + }, + { + "epoch": 0.76, + "grad_norm": 1.6655788340978797, + "learning_rate": 2.9557404362833898e-06, + "loss": 0.8314, + "step": 4694 + }, + { + "epoch": 0.76, + "grad_norm": 1.1972884025087276, + "learning_rate": 2.9520367411856997e-06, + "loss": 0.7529, + "step": 4695 + }, + { + "epoch": 0.76, + "grad_norm": 1.6391126667287583, + "learning_rate": 2.9483349661474792e-06, + "loss": 0.8067, + "step": 4696 + }, + { + "epoch": 0.76, + "grad_norm": 1.2406837178785362, + "learning_rate": 2.944635112177192e-06, + "loss": 0.8418, + "step": 4697 + }, + { + "epoch": 0.76, + "grad_norm": 1.192417237615125, + "learning_rate": 2.9409371802827856e-06, + "loss": 0.8405, + "step": 4698 + }, + { + "epoch": 0.76, + "grad_norm": 1.4562592851973102, + "learning_rate": 2.937241171471674e-06, + "loss": 0.8118, + "step": 4699 + }, + { + "epoch": 0.76, + "grad_norm": 1.3906665270744865, + "learning_rate": 2.933547086750752e-06, + "loss": 0.7273, + "step": 4700 + }, + { + "epoch": 0.76, + "grad_norm": 1.313457905803475, + "learning_rate": 2.9298549271263944e-06, + "loss": 0.8816, + "step": 4701 + }, + { + "epoch": 0.76, + "grad_norm": 1.9222697651164837, + "learning_rate": 2.9261646936044408e-06, + "loss": 0.8614, + "step": 4702 + }, + { + "epoch": 0.76, + "grad_norm": 1.419822423094064, + "learning_rate": 2.9224763871902186e-06, + "loss": 0.7413, + "step": 4703 + }, + { + "epoch": 0.76, + "grad_norm": 1.24121525941542, + "learning_rate": 2.9187900088885224e-06, + "loss": 0.8137, + "step": 4704 + }, + { + "epoch": 0.76, + "grad_norm": 1.326930666804502, + "learning_rate": 2.9151055597036195e-06, + "loss": 0.7895, + "step": 4705 + }, + { + "epoch": 0.76, + "grad_norm": 1.5322445177906812, + "learning_rate": 2.9114230406392608e-06, + "loss": 0.8607, + "step": 4706 + }, + { + "epoch": 0.76, + "grad_norm": 1.6097622215755245, + "learning_rate": 2.907742452698664e-06, + "loss": 0.8665, + "step": 4707 + }, + { + "epoch": 0.76, + "grad_norm": 0.8712730757703152, + "learning_rate": 2.90406379688452e-06, + "loss": 0.3446, + "step": 4708 + }, + { + "epoch": 0.76, + "grad_norm": 1.5671107629470997, + "learning_rate": 2.900387074198997e-06, + "loss": 0.8077, + "step": 4709 + }, + { + "epoch": 0.76, + "grad_norm": 1.39163987424883, + "learning_rate": 2.8967122856437435e-06, + "loss": 0.826, + "step": 4710 + }, + { + "epoch": 0.76, + "grad_norm": 1.2525924643573303, + "learning_rate": 2.893039432219861e-06, + "loss": 0.829, + "step": 4711 + }, + { + "epoch": 0.76, + "grad_norm": 1.695902427309275, + "learning_rate": 2.8893685149279417e-06, + "loss": 0.8593, + "step": 4712 + }, + { + "epoch": 0.76, + "grad_norm": 1.4347093500493817, + "learning_rate": 2.885699534768047e-06, + "loss": 0.8287, + "step": 4713 + }, + { + "epoch": 0.76, + "grad_norm": 1.1842828765108258, + "learning_rate": 2.882032492739706e-06, + "loss": 0.8064, + "step": 4714 + }, + { + "epoch": 0.76, + "grad_norm": 1.2926222705767059, + "learning_rate": 2.8783673898419194e-06, + "loss": 0.802, + "step": 4715 + }, + { + "epoch": 0.76, + "grad_norm": 1.5344490395161348, + "learning_rate": 2.874704227073164e-06, + "loss": 0.8158, + "step": 4716 + }, + { + "epoch": 0.76, + "grad_norm": 1.2923744714475556, + "learning_rate": 2.871043005431394e-06, + "loss": 0.8168, + "step": 4717 + }, + { + "epoch": 0.76, + "grad_norm": 1.3479601994120898, + "learning_rate": 2.8673837259140138e-06, + "loss": 0.8182, + "step": 4718 + }, + { + "epoch": 0.76, + "grad_norm": 1.573127027851483, + "learning_rate": 2.863726389517918e-06, + "loss": 0.8421, + "step": 4719 + }, + { + "epoch": 0.76, + "grad_norm": 1.8362987299905271, + "learning_rate": 2.860070997239469e-06, + "loss": 0.778, + "step": 4720 + }, + { + "epoch": 0.76, + "grad_norm": 0.8000931830824157, + "learning_rate": 2.856417550074495e-06, + "loss": 0.3055, + "step": 4721 + }, + { + "epoch": 0.76, + "grad_norm": 1.362140206692828, + "learning_rate": 2.852766049018291e-06, + "loss": 0.8111, + "step": 4722 + }, + { + "epoch": 0.76, + "grad_norm": 1.2076487516405343, + "learning_rate": 2.8491164950656313e-06, + "loss": 0.9246, + "step": 4723 + }, + { + "epoch": 0.76, + "grad_norm": 1.3447822124349311, + "learning_rate": 2.8454688892107518e-06, + "loss": 0.8586, + "step": 4724 + }, + { + "epoch": 0.76, + "grad_norm": 1.3955087041084533, + "learning_rate": 2.841823232447366e-06, + "loss": 0.82, + "step": 4725 + }, + { + "epoch": 0.76, + "grad_norm": 1.3295635657878024, + "learning_rate": 2.8381795257686485e-06, + "loss": 0.848, + "step": 4726 + }, + { + "epoch": 0.76, + "grad_norm": 1.220902574267021, + "learning_rate": 2.8345377701672404e-06, + "loss": 0.8345, + "step": 4727 + }, + { + "epoch": 0.76, + "grad_norm": 1.7915873389831936, + "learning_rate": 2.8308979666352644e-06, + "loss": 0.7619, + "step": 4728 + }, + { + "epoch": 0.76, + "grad_norm": 1.4689146489195513, + "learning_rate": 2.8272601161643e-06, + "loss": 0.863, + "step": 4729 + }, + { + "epoch": 0.76, + "grad_norm": 0.7847020878284674, + "learning_rate": 2.8236242197453943e-06, + "loss": 0.3314, + "step": 4730 + }, + { + "epoch": 0.76, + "grad_norm": 1.416161603309013, + "learning_rate": 2.819990278369069e-06, + "loss": 0.781, + "step": 4731 + }, + { + "epoch": 0.76, + "grad_norm": 1.5261454088558803, + "learning_rate": 2.816358293025314e-06, + "loss": 0.755, + "step": 4732 + }, + { + "epoch": 0.76, + "grad_norm": 1.2681060949319873, + "learning_rate": 2.812728264703577e-06, + "loss": 0.8548, + "step": 4733 + }, + { + "epoch": 0.76, + "grad_norm": 1.5987856907762272, + "learning_rate": 2.8091001943927764e-06, + "loss": 0.8853, + "step": 4734 + }, + { + "epoch": 0.76, + "grad_norm": 1.2115621240193155, + "learning_rate": 2.8054740830813056e-06, + "loss": 0.7916, + "step": 4735 + }, + { + "epoch": 0.76, + "grad_norm": 1.294371233962966, + "learning_rate": 2.8018499317570115e-06, + "loss": 0.7853, + "step": 4736 + }, + { + "epoch": 0.76, + "grad_norm": 1.2931185953172708, + "learning_rate": 2.79822774140721e-06, + "loss": 0.8079, + "step": 4737 + }, + { + "epoch": 0.76, + "grad_norm": 1.9602101266890735, + "learning_rate": 2.794607513018691e-06, + "loss": 0.9138, + "step": 4738 + }, + { + "epoch": 0.76, + "grad_norm": 1.2451759040301955, + "learning_rate": 2.79098924757771e-06, + "loss": 0.8024, + "step": 4739 + }, + { + "epoch": 0.76, + "grad_norm": 1.1412515935946763, + "learning_rate": 2.7873729460699684e-06, + "loss": 0.7975, + "step": 4740 + }, + { + "epoch": 0.76, + "grad_norm": 1.439696610974413, + "learning_rate": 2.7837586094806535e-06, + "loss": 0.8942, + "step": 4741 + }, + { + "epoch": 0.76, + "grad_norm": 1.3840311150998554, + "learning_rate": 2.7801462387944145e-06, + "loss": 0.8432, + "step": 4742 + }, + { + "epoch": 0.76, + "grad_norm": 1.3094201083595784, + "learning_rate": 2.7765358349953554e-06, + "loss": 0.7871, + "step": 4743 + }, + { + "epoch": 0.76, + "grad_norm": 1.4368429651957844, + "learning_rate": 2.772927399067048e-06, + "loss": 0.8224, + "step": 4744 + }, + { + "epoch": 0.76, + "grad_norm": 1.4615771160797788, + "learning_rate": 2.769320931992535e-06, + "loss": 0.7728, + "step": 4745 + }, + { + "epoch": 0.76, + "grad_norm": 1.411504566498782, + "learning_rate": 2.765716434754315e-06, + "loss": 0.8077, + "step": 4746 + }, + { + "epoch": 0.76, + "grad_norm": 1.3903705385543468, + "learning_rate": 2.762113908334351e-06, + "loss": 0.7877, + "step": 4747 + }, + { + "epoch": 0.77, + "grad_norm": 1.401003650205048, + "learning_rate": 2.758513353714073e-06, + "loss": 0.7891, + "step": 4748 + }, + { + "epoch": 0.77, + "grad_norm": 1.4304251526689573, + "learning_rate": 2.7549147718743684e-06, + "loss": 0.7866, + "step": 4749 + }, + { + "epoch": 0.77, + "grad_norm": 1.43539533658667, + "learning_rate": 2.7513181637955945e-06, + "loss": 0.7968, + "step": 4750 + }, + { + "epoch": 0.77, + "grad_norm": 0.8902246449580287, + "learning_rate": 2.74772353045756e-06, + "loss": 0.3248, + "step": 4751 + }, + { + "epoch": 0.77, + "grad_norm": 1.3298765406444581, + "learning_rate": 2.74413087283955e-06, + "loss": 0.8225, + "step": 4752 + }, + { + "epoch": 0.77, + "grad_norm": 1.3946123640528336, + "learning_rate": 2.7405401919202967e-06, + "loss": 0.7738, + "step": 4753 + }, + { + "epoch": 0.77, + "grad_norm": 1.347772011415199, + "learning_rate": 2.736951488678006e-06, + "loss": 0.8393, + "step": 4754 + }, + { + "epoch": 0.77, + "grad_norm": 1.3137958145716673, + "learning_rate": 2.733364764090336e-06, + "loss": 0.8634, + "step": 4755 + }, + { + "epoch": 0.77, + "grad_norm": 1.1784630872795685, + "learning_rate": 2.729780019134408e-06, + "loss": 0.827, + "step": 4756 + }, + { + "epoch": 0.77, + "grad_norm": 1.3337123175933627, + "learning_rate": 2.726197254786811e-06, + "loss": 0.8242, + "step": 4757 + }, + { + "epoch": 0.77, + "grad_norm": 1.5238580966770316, + "learning_rate": 2.722616472023585e-06, + "loss": 0.8548, + "step": 4758 + }, + { + "epoch": 0.77, + "grad_norm": 1.2206352970352585, + "learning_rate": 2.7190376718202304e-06, + "loss": 0.8462, + "step": 4759 + }, + { + "epoch": 0.77, + "grad_norm": 1.6015753745515238, + "learning_rate": 2.715460855151716e-06, + "loss": 0.7925, + "step": 4760 + }, + { + "epoch": 0.77, + "grad_norm": 1.3470440517563254, + "learning_rate": 2.711886022992469e-06, + "loss": 0.8337, + "step": 4761 + }, + { + "epoch": 0.77, + "grad_norm": 1.5101020753934624, + "learning_rate": 2.708313176316363e-06, + "loss": 0.8181, + "step": 4762 + }, + { + "epoch": 0.77, + "grad_norm": 1.0197095820056687, + "learning_rate": 2.7047423160967433e-06, + "loss": 0.3047, + "step": 4763 + }, + { + "epoch": 0.77, + "grad_norm": 1.3298754453630075, + "learning_rate": 2.701173443306414e-06, + "loss": 0.7523, + "step": 4764 + }, + { + "epoch": 0.77, + "grad_norm": 1.296041938494606, + "learning_rate": 2.6976065589176337e-06, + "loss": 0.8572, + "step": 4765 + }, + { + "epoch": 0.77, + "grad_norm": 1.3128053176640733, + "learning_rate": 2.694041663902114e-06, + "loss": 0.8599, + "step": 4766 + }, + { + "epoch": 0.77, + "grad_norm": 1.762393339079883, + "learning_rate": 2.690478759231039e-06, + "loss": 0.8208, + "step": 4767 + }, + { + "epoch": 0.77, + "grad_norm": 1.8830039129364213, + "learning_rate": 2.686917845875038e-06, + "loss": 0.8371, + "step": 4768 + }, + { + "epoch": 0.77, + "grad_norm": 1.345728826900248, + "learning_rate": 2.683358924804198e-06, + "loss": 0.8638, + "step": 4769 + }, + { + "epoch": 0.77, + "grad_norm": 1.6009295468874667, + "learning_rate": 2.679801996988075e-06, + "loss": 0.9421, + "step": 4770 + }, + { + "epoch": 0.77, + "grad_norm": 1.309622731891372, + "learning_rate": 2.676247063395668e-06, + "loss": 0.8002, + "step": 4771 + }, + { + "epoch": 0.77, + "grad_norm": 1.4069561645583646, + "learning_rate": 2.6726941249954443e-06, + "loss": 0.8256, + "step": 4772 + }, + { + "epoch": 0.77, + "grad_norm": 1.7235899090492846, + "learning_rate": 2.669143182755315e-06, + "loss": 0.8037, + "step": 4773 + }, + { + "epoch": 0.77, + "grad_norm": 1.3571092828848632, + "learning_rate": 2.6655942376426635e-06, + "loss": 0.7565, + "step": 4774 + }, + { + "epoch": 0.77, + "grad_norm": 1.9341718825705039, + "learning_rate": 2.6620472906243123e-06, + "loss": 0.8373, + "step": 4775 + }, + { + "epoch": 0.77, + "grad_norm": 1.5728898313872097, + "learning_rate": 2.6585023426665534e-06, + "loss": 0.8731, + "step": 4776 + }, + { + "epoch": 0.77, + "grad_norm": 1.4449965260473816, + "learning_rate": 2.6549593947351258e-06, + "loss": 0.8004, + "step": 4777 + }, + { + "epoch": 0.77, + "grad_norm": 1.3828991570502336, + "learning_rate": 2.6514184477952244e-06, + "loss": 0.7977, + "step": 4778 + }, + { + "epoch": 0.77, + "grad_norm": 1.3006525559527446, + "learning_rate": 2.6478795028115046e-06, + "loss": 0.8631, + "step": 4779 + }, + { + "epoch": 0.77, + "grad_norm": 1.3091878232855372, + "learning_rate": 2.644342560748071e-06, + "loss": 0.7841, + "step": 4780 + }, + { + "epoch": 0.77, + "grad_norm": 1.3827949202222882, + "learning_rate": 2.6408076225684808e-06, + "loss": 0.8129, + "step": 4781 + }, + { + "epoch": 0.77, + "grad_norm": 1.4711646043708109, + "learning_rate": 2.6372746892357514e-06, + "loss": 0.7798, + "step": 4782 + }, + { + "epoch": 0.77, + "grad_norm": 1.4743790598241788, + "learning_rate": 2.6337437617123586e-06, + "loss": 0.825, + "step": 4783 + }, + { + "epoch": 0.77, + "grad_norm": 1.4533201043090753, + "learning_rate": 2.630214840960209e-06, + "loss": 0.8003, + "step": 4784 + }, + { + "epoch": 0.77, + "grad_norm": 1.8438111336521879, + "learning_rate": 2.626687927940688e-06, + "loss": 0.7874, + "step": 4785 + }, + { + "epoch": 0.77, + "grad_norm": 1.4532204437700442, + "learning_rate": 2.623163023614623e-06, + "loss": 0.7926, + "step": 4786 + }, + { + "epoch": 0.77, + "grad_norm": 1.2711403768875933, + "learning_rate": 2.6196401289422955e-06, + "loss": 0.8532, + "step": 4787 + }, + { + "epoch": 0.77, + "grad_norm": 1.3025418386631398, + "learning_rate": 2.6161192448834348e-06, + "loss": 0.8037, + "step": 4788 + }, + { + "epoch": 0.77, + "grad_norm": 1.3535184750107214, + "learning_rate": 2.6126003723972325e-06, + "loss": 0.8376, + "step": 4789 + }, + { + "epoch": 0.77, + "grad_norm": 1.3210481316875209, + "learning_rate": 2.609083512442323e-06, + "loss": 0.8505, + "step": 4790 + }, + { + "epoch": 0.77, + "grad_norm": 1.419981395754241, + "learning_rate": 2.6055686659767944e-06, + "loss": 0.8112, + "step": 4791 + }, + { + "epoch": 0.77, + "grad_norm": 1.4878739089908137, + "learning_rate": 2.6020558339581893e-06, + "loss": 0.8181, + "step": 4792 + }, + { + "epoch": 0.77, + "grad_norm": 1.4601986426827283, + "learning_rate": 2.5985450173435035e-06, + "loss": 0.8827, + "step": 4793 + }, + { + "epoch": 0.77, + "grad_norm": 1.4281533105431454, + "learning_rate": 2.5950362170891774e-06, + "loss": 0.8276, + "step": 4794 + }, + { + "epoch": 0.77, + "grad_norm": 1.4420822730530534, + "learning_rate": 2.591529434151101e-06, + "loss": 0.886, + "step": 4795 + }, + { + "epoch": 0.77, + "grad_norm": 1.3207776696938347, + "learning_rate": 2.5880246694846266e-06, + "loss": 0.7761, + "step": 4796 + }, + { + "epoch": 0.77, + "grad_norm": 1.3662813726105645, + "learning_rate": 2.584521924044544e-06, + "loss": 0.8614, + "step": 4797 + }, + { + "epoch": 0.77, + "grad_norm": 1.4237891023099956, + "learning_rate": 2.5810211987850953e-06, + "loss": 0.7869, + "step": 4798 + }, + { + "epoch": 0.77, + "grad_norm": 1.351238349675533, + "learning_rate": 2.577522494659981e-06, + "loss": 0.8733, + "step": 4799 + }, + { + "epoch": 0.77, + "grad_norm": 1.499371123609028, + "learning_rate": 2.574025812622337e-06, + "loss": 0.8017, + "step": 4800 + }, + { + "epoch": 0.77, + "grad_norm": 1.4614065648888892, + "learning_rate": 2.5705311536247622e-06, + "loss": 0.8006, + "step": 4801 + }, + { + "epoch": 0.77, + "grad_norm": 1.6781911389185815, + "learning_rate": 2.567038518619297e-06, + "loss": 0.8639, + "step": 4802 + }, + { + "epoch": 0.77, + "grad_norm": 1.38417838784059, + "learning_rate": 2.5635479085574267e-06, + "loss": 0.8496, + "step": 4803 + }, + { + "epoch": 0.77, + "grad_norm": 1.379796672030245, + "learning_rate": 2.5600593243900927e-06, + "loss": 0.8425, + "step": 4804 + }, + { + "epoch": 0.77, + "grad_norm": 1.5074689083269697, + "learning_rate": 2.5565727670676887e-06, + "loss": 0.8421, + "step": 4805 + }, + { + "epoch": 0.77, + "grad_norm": 1.1869306218867617, + "learning_rate": 2.5530882375400358e-06, + "loss": 0.8157, + "step": 4806 + }, + { + "epoch": 0.77, + "grad_norm": 1.1947729113732086, + "learning_rate": 2.549605736756423e-06, + "loss": 0.7717, + "step": 4807 + }, + { + "epoch": 0.77, + "grad_norm": 1.632941741710884, + "learning_rate": 2.5461252656655813e-06, + "loss": 0.81, + "step": 4808 + }, + { + "epoch": 0.77, + "grad_norm": 1.2887486762177378, + "learning_rate": 2.5426468252156846e-06, + "loss": 0.8519, + "step": 4809 + }, + { + "epoch": 0.77, + "grad_norm": 1.5638530999744125, + "learning_rate": 2.5391704163543516e-06, + "loss": 0.8048, + "step": 4810 + }, + { + "epoch": 0.78, + "grad_norm": 1.2852429651293653, + "learning_rate": 2.535696040028658e-06, + "loss": 0.7383, + "step": 4811 + }, + { + "epoch": 0.78, + "grad_norm": 1.4417582130002655, + "learning_rate": 2.532223697185122e-06, + "loss": 0.852, + "step": 4812 + }, + { + "epoch": 0.78, + "grad_norm": 1.3789184896666142, + "learning_rate": 2.528753388769697e-06, + "loss": 0.8387, + "step": 4813 + }, + { + "epoch": 0.78, + "grad_norm": 1.6858790082569208, + "learning_rate": 2.5252851157277945e-06, + "loss": 0.8815, + "step": 4814 + }, + { + "epoch": 0.78, + "grad_norm": 1.5436280749219542, + "learning_rate": 2.521818879004271e-06, + "loss": 0.7918, + "step": 4815 + }, + { + "epoch": 0.78, + "grad_norm": 1.384937980482031, + "learning_rate": 2.518354679543422e-06, + "loss": 0.8232, + "step": 4816 + }, + { + "epoch": 0.78, + "grad_norm": 0.7151801927878798, + "learning_rate": 2.514892518288988e-06, + "loss": 0.3207, + "step": 4817 + }, + { + "epoch": 0.78, + "grad_norm": 1.469952707223259, + "learning_rate": 2.511432396184165e-06, + "loss": 0.8778, + "step": 4818 + }, + { + "epoch": 0.78, + "grad_norm": 1.3119519720184325, + "learning_rate": 2.5079743141715786e-06, + "loss": 0.8678, + "step": 4819 + }, + { + "epoch": 0.78, + "grad_norm": 1.4311901218994671, + "learning_rate": 2.504518273193306e-06, + "loss": 0.7825, + "step": 4820 + }, + { + "epoch": 0.78, + "grad_norm": 1.2659290869708009, + "learning_rate": 2.501064274190872e-06, + "loss": 0.7607, + "step": 4821 + }, + { + "epoch": 0.78, + "grad_norm": 1.5034695090230736, + "learning_rate": 2.497612318105237e-06, + "loss": 0.8115, + "step": 4822 + }, + { + "epoch": 0.78, + "grad_norm": 1.5035590080569217, + "learning_rate": 2.4941624058768143e-06, + "loss": 0.8338, + "step": 4823 + }, + { + "epoch": 0.78, + "grad_norm": 1.5047508871338022, + "learning_rate": 2.4907145384454514e-06, + "loss": 0.8547, + "step": 4824 + }, + { + "epoch": 0.78, + "grad_norm": 1.468443482517806, + "learning_rate": 2.4872687167504393e-06, + "loss": 0.7849, + "step": 4825 + }, + { + "epoch": 0.78, + "grad_norm": 1.6360440024163683, + "learning_rate": 2.4838249417305214e-06, + "loss": 0.8144, + "step": 4826 + }, + { + "epoch": 0.78, + "grad_norm": 1.3644007756030407, + "learning_rate": 2.480383214323875e-06, + "loss": 0.8602, + "step": 4827 + }, + { + "epoch": 0.78, + "grad_norm": 0.8615700366887754, + "learning_rate": 2.476943535468117e-06, + "loss": 0.3462, + "step": 4828 + }, + { + "epoch": 0.78, + "grad_norm": 1.3804834628441032, + "learning_rate": 2.4735059061003143e-06, + "loss": 0.7757, + "step": 4829 + }, + { + "epoch": 0.78, + "grad_norm": 1.446131978464802, + "learning_rate": 2.470070327156975e-06, + "loss": 0.8712, + "step": 4830 + }, + { + "epoch": 0.78, + "grad_norm": 1.3351877796965814, + "learning_rate": 2.4666367995740414e-06, + "loss": 0.8211, + "step": 4831 + }, + { + "epoch": 0.78, + "grad_norm": 1.5299335779708552, + "learning_rate": 2.4632053242869005e-06, + "loss": 0.7839, + "step": 4832 + }, + { + "epoch": 0.78, + "grad_norm": 1.411685526488035, + "learning_rate": 2.4597759022303813e-06, + "loss": 0.8079, + "step": 4833 + }, + { + "epoch": 0.78, + "grad_norm": 1.6286817259844446, + "learning_rate": 2.4563485343387596e-06, + "loss": 0.8306, + "step": 4834 + }, + { + "epoch": 0.78, + "grad_norm": 1.4052947097769446, + "learning_rate": 2.4529232215457333e-06, + "loss": 0.7712, + "step": 4835 + }, + { + "epoch": 0.78, + "grad_norm": 1.2947740452447218, + "learning_rate": 2.4494999647844574e-06, + "loss": 0.8235, + "step": 4836 + }, + { + "epoch": 0.78, + "grad_norm": 1.459633565323603, + "learning_rate": 2.4460787649875263e-06, + "loss": 0.8135, + "step": 4837 + }, + { + "epoch": 0.78, + "grad_norm": 1.7226874463290647, + "learning_rate": 2.4426596230869624e-06, + "loss": 0.8139, + "step": 4838 + }, + { + "epoch": 0.78, + "grad_norm": 1.3856042822890058, + "learning_rate": 2.439242540014236e-06, + "loss": 0.8395, + "step": 4839 + }, + { + "epoch": 0.78, + "grad_norm": 1.4054211193631352, + "learning_rate": 2.4358275167002564e-06, + "loss": 0.8215, + "step": 4840 + }, + { + "epoch": 0.78, + "grad_norm": 1.4133799114657297, + "learning_rate": 2.4324145540753697e-06, + "loss": 0.7281, + "step": 4841 + }, + { + "epoch": 0.78, + "grad_norm": 1.5179157716082834, + "learning_rate": 2.429003653069357e-06, + "loss": 0.8626, + "step": 4842 + }, + { + "epoch": 0.78, + "grad_norm": 1.5471082388716655, + "learning_rate": 2.4255948146114483e-06, + "loss": 0.8076, + "step": 4843 + }, + { + "epoch": 0.78, + "grad_norm": 1.2998987395997688, + "learning_rate": 2.4221880396302977e-06, + "loss": 0.8017, + "step": 4844 + }, + { + "epoch": 0.78, + "grad_norm": 1.400121432864329, + "learning_rate": 2.418783329054013e-06, + "loss": 0.7756, + "step": 4845 + }, + { + "epoch": 0.78, + "grad_norm": 1.267401055427947, + "learning_rate": 2.4153806838101257e-06, + "loss": 0.9318, + "step": 4846 + }, + { + "epoch": 0.78, + "grad_norm": 1.5193188115508298, + "learning_rate": 2.4119801048256096e-06, + "loss": 0.7923, + "step": 4847 + }, + { + "epoch": 0.78, + "grad_norm": 1.6039467045122593, + "learning_rate": 2.4085815930268807e-06, + "loss": 0.843, + "step": 4848 + }, + { + "epoch": 0.78, + "grad_norm": 1.4440248114301522, + "learning_rate": 2.4051851493397835e-06, + "loss": 0.8449, + "step": 4849 + }, + { + "epoch": 0.78, + "grad_norm": 1.4906822059158764, + "learning_rate": 2.401790774689602e-06, + "loss": 0.7083, + "step": 4850 + }, + { + "epoch": 0.78, + "grad_norm": 1.4548501133684855, + "learning_rate": 2.3983984700010587e-06, + "loss": 0.8982, + "step": 4851 + }, + { + "epoch": 0.78, + "grad_norm": 1.371044080600788, + "learning_rate": 2.395008236198315e-06, + "loss": 0.8117, + "step": 4852 + }, + { + "epoch": 0.78, + "grad_norm": 1.3581950825970681, + "learning_rate": 2.39162007420496e-06, + "loss": 0.8198, + "step": 4853 + }, + { + "epoch": 0.78, + "grad_norm": 1.4548097151697144, + "learning_rate": 2.3882339849440206e-06, + "loss": 0.7942, + "step": 4854 + }, + { + "epoch": 0.78, + "grad_norm": 1.3890936347842693, + "learning_rate": 2.384849969337967e-06, + "loss": 0.8049, + "step": 4855 + }, + { + "epoch": 0.78, + "grad_norm": 1.3257029986849345, + "learning_rate": 2.381468028308693e-06, + "loss": 0.7812, + "step": 4856 + }, + { + "epoch": 0.78, + "grad_norm": 1.3278843233877267, + "learning_rate": 2.378088162777532e-06, + "loss": 0.8543, + "step": 4857 + }, + { + "epoch": 0.78, + "grad_norm": 0.8270242264498211, + "learning_rate": 2.3747103736652543e-06, + "loss": 0.3562, + "step": 4858 + }, + { + "epoch": 0.78, + "grad_norm": 1.531645704249799, + "learning_rate": 2.371334661892066e-06, + "loss": 0.8645, + "step": 4859 + }, + { + "epoch": 0.78, + "grad_norm": 1.2539261492519806, + "learning_rate": 2.3679610283776023e-06, + "loss": 0.7827, + "step": 4860 + }, + { + "epoch": 0.78, + "grad_norm": 1.498486350573901, + "learning_rate": 2.3645894740409294e-06, + "loss": 0.6973, + "step": 4861 + }, + { + "epoch": 0.78, + "grad_norm": 1.3829209674439278, + "learning_rate": 2.361219999800558e-06, + "loss": 0.7672, + "step": 4862 + }, + { + "epoch": 0.78, + "grad_norm": 1.7305582746679888, + "learning_rate": 2.3578526065744223e-06, + "loss": 0.8456, + "step": 4863 + }, + { + "epoch": 0.78, + "grad_norm": 1.475110020817717, + "learning_rate": 2.3544872952798913e-06, + "loss": 0.7948, + "step": 4864 + }, + { + "epoch": 0.78, + "grad_norm": 1.2527863256211342, + "learning_rate": 2.351124066833773e-06, + "loss": 0.8552, + "step": 4865 + }, + { + "epoch": 0.78, + "grad_norm": 1.6116378382492695, + "learning_rate": 2.3477629221522992e-06, + "loss": 0.8105, + "step": 4866 + }, + { + "epoch": 0.78, + "grad_norm": 1.5100930193290318, + "learning_rate": 2.3444038621511435e-06, + "loss": 0.8675, + "step": 4867 + }, + { + "epoch": 0.78, + "grad_norm": 1.4794449361173752, + "learning_rate": 2.341046887745403e-06, + "loss": 0.799, + "step": 4868 + }, + { + "epoch": 0.78, + "grad_norm": 1.3365418004334388, + "learning_rate": 2.3376919998496083e-06, + "loss": 0.8863, + "step": 4869 + }, + { + "epoch": 0.78, + "grad_norm": 1.2790553167018173, + "learning_rate": 2.3343391993777274e-06, + "loss": 0.7871, + "step": 4870 + }, + { + "epoch": 0.78, + "grad_norm": 1.4909981686352511, + "learning_rate": 2.330988487243152e-06, + "loss": 0.7723, + "step": 4871 + }, + { + "epoch": 0.78, + "grad_norm": 1.5190453671884316, + "learning_rate": 2.3276398643587127e-06, + "loss": 0.7908, + "step": 4872 + }, + { + "epoch": 0.79, + "grad_norm": 0.891691762679748, + "learning_rate": 2.3242933316366613e-06, + "loss": 0.3192, + "step": 4873 + }, + { + "epoch": 0.79, + "grad_norm": 1.2974824933178404, + "learning_rate": 2.320948889988691e-06, + "loss": 0.8117, + "step": 4874 + }, + { + "epoch": 0.79, + "grad_norm": 1.4645519601759032, + "learning_rate": 2.3176065403259184e-06, + "loss": 0.7029, + "step": 4875 + }, + { + "epoch": 0.79, + "grad_norm": 1.6171250187280124, + "learning_rate": 2.314266283558887e-06, + "loss": 0.8657, + "step": 4876 + }, + { + "epoch": 0.79, + "grad_norm": 1.5250751355932701, + "learning_rate": 2.310928120597581e-06, + "loss": 0.8258, + "step": 4877 + }, + { + "epoch": 0.79, + "grad_norm": 1.2718792633944718, + "learning_rate": 2.307592052351406e-06, + "loss": 0.8567, + "step": 4878 + }, + { + "epoch": 0.79, + "grad_norm": 1.3808272262240875, + "learning_rate": 2.3042580797291956e-06, + "loss": 0.7549, + "step": 4879 + }, + { + "epoch": 0.79, + "grad_norm": 1.3979685119512733, + "learning_rate": 2.300926203639218e-06, + "loss": 0.8039, + "step": 4880 + }, + { + "epoch": 0.79, + "grad_norm": 1.283493816858713, + "learning_rate": 2.2975964249891723e-06, + "loss": 0.7298, + "step": 4881 + }, + { + "epoch": 0.79, + "grad_norm": 1.437431135549487, + "learning_rate": 2.294268744686178e-06, + "loss": 0.8407, + "step": 4882 + }, + { + "epoch": 0.79, + "grad_norm": 1.4352529977511503, + "learning_rate": 2.2909431636367853e-06, + "loss": 0.8531, + "step": 4883 + }, + { + "epoch": 0.79, + "grad_norm": 1.2454554161989482, + "learning_rate": 2.2876196827469776e-06, + "loss": 0.7123, + "step": 4884 + }, + { + "epoch": 0.79, + "grad_norm": 1.7406299700143835, + "learning_rate": 2.284298302922162e-06, + "loss": 0.8469, + "step": 4885 + }, + { + "epoch": 0.79, + "grad_norm": 1.4686450819323873, + "learning_rate": 2.2809790250671717e-06, + "loss": 0.7989, + "step": 4886 + }, + { + "epoch": 0.79, + "grad_norm": 1.3334465335420826, + "learning_rate": 2.2776618500862725e-06, + "loss": 0.8732, + "step": 4887 + }, + { + "epoch": 0.79, + "grad_norm": 1.4262962802414627, + "learning_rate": 2.2743467788831496e-06, + "loss": 0.7672, + "step": 4888 + }, + { + "epoch": 0.79, + "grad_norm": 1.6001630996112, + "learning_rate": 2.2710338123609264e-06, + "loss": 0.8628, + "step": 4889 + }, + { + "epoch": 0.79, + "grad_norm": 1.3171500823310665, + "learning_rate": 2.267722951422141e-06, + "loss": 0.8298, + "step": 4890 + }, + { + "epoch": 0.79, + "grad_norm": 1.3425310172683729, + "learning_rate": 2.2644141969687674e-06, + "loss": 0.7635, + "step": 4891 + }, + { + "epoch": 0.79, + "grad_norm": 1.4090316969685044, + "learning_rate": 2.2611075499021985e-06, + "loss": 0.8712, + "step": 4892 + }, + { + "epoch": 0.79, + "grad_norm": 1.2114792288844773, + "learning_rate": 2.257803011123254e-06, + "loss": 0.7515, + "step": 4893 + }, + { + "epoch": 0.79, + "grad_norm": 1.5790682207030717, + "learning_rate": 2.2545005815321875e-06, + "loss": 0.821, + "step": 4894 + }, + { + "epoch": 0.79, + "grad_norm": 1.5206421610230947, + "learning_rate": 2.2512002620286653e-06, + "loss": 0.8152, + "step": 4895 + }, + { + "epoch": 0.79, + "grad_norm": 0.9888513366390224, + "learning_rate": 2.2479020535117924e-06, + "loss": 0.321, + "step": 4896 + }, + { + "epoch": 0.79, + "grad_norm": 1.2941426662874576, + "learning_rate": 2.2446059568800872e-06, + "loss": 0.8273, + "step": 4897 + }, + { + "epoch": 0.79, + "grad_norm": 1.3714165837312662, + "learning_rate": 2.241311973031496e-06, + "loss": 0.7805, + "step": 4898 + }, + { + "epoch": 0.79, + "grad_norm": 1.2380605968849425, + "learning_rate": 2.2380201028633954e-06, + "loss": 0.8354, + "step": 4899 + }, + { + "epoch": 0.79, + "grad_norm": 1.2413457569161983, + "learning_rate": 2.2347303472725778e-06, + "loss": 0.7661, + "step": 4900 + }, + { + "epoch": 0.79, + "grad_norm": 1.5217746633983296, + "learning_rate": 2.2314427071552624e-06, + "loss": 0.7944, + "step": 4901 + }, + { + "epoch": 0.79, + "grad_norm": 1.416094124689887, + "learning_rate": 2.228157183407096e-06, + "loss": 0.7765, + "step": 4902 + }, + { + "epoch": 0.79, + "grad_norm": 1.3228778795096796, + "learning_rate": 2.2248737769231497e-06, + "loss": 0.8357, + "step": 4903 + }, + { + "epoch": 0.79, + "grad_norm": 1.427865929680922, + "learning_rate": 2.2215924885979035e-06, + "loss": 0.7986, + "step": 4904 + }, + { + "epoch": 0.79, + "grad_norm": 1.32154536809842, + "learning_rate": 2.218313319325277e-06, + "loss": 0.785, + "step": 4905 + }, + { + "epoch": 0.79, + "grad_norm": 1.3219310486524143, + "learning_rate": 2.2150362699986084e-06, + "loss": 0.8328, + "step": 4906 + }, + { + "epoch": 0.79, + "grad_norm": 1.1973314757230091, + "learning_rate": 2.2117613415106533e-06, + "loss": 0.7628, + "step": 4907 + }, + { + "epoch": 0.79, + "grad_norm": 1.3054948681445049, + "learning_rate": 2.20848853475359e-06, + "loss": 0.8044, + "step": 4908 + }, + { + "epoch": 0.79, + "grad_norm": 1.4631344104437332, + "learning_rate": 2.2052178506190267e-06, + "loss": 0.821, + "step": 4909 + }, + { + "epoch": 0.79, + "grad_norm": 1.3585536985981372, + "learning_rate": 2.201949289997983e-06, + "loss": 0.8458, + "step": 4910 + }, + { + "epoch": 0.79, + "grad_norm": 1.4709928522920759, + "learning_rate": 2.1986828537809103e-06, + "loss": 0.8193, + "step": 4911 + }, + { + "epoch": 0.79, + "grad_norm": 1.274851194222755, + "learning_rate": 2.195418542857669e-06, + "loss": 0.8186, + "step": 4912 + }, + { + "epoch": 0.79, + "grad_norm": 1.2363040252795432, + "learning_rate": 2.1921563581175553e-06, + "loss": 0.8189, + "step": 4913 + }, + { + "epoch": 0.79, + "grad_norm": 1.455114285448573, + "learning_rate": 2.1888963004492735e-06, + "loss": 0.7653, + "step": 4914 + }, + { + "epoch": 0.79, + "grad_norm": 1.3326152922264465, + "learning_rate": 2.1856383707409513e-06, + "loss": 0.8291, + "step": 4915 + }, + { + "epoch": 0.79, + "grad_norm": 1.4497609910594378, + "learning_rate": 2.1823825698801424e-06, + "loss": 0.8672, + "step": 4916 + }, + { + "epoch": 0.79, + "grad_norm": 1.2494401601952263, + "learning_rate": 2.179128898753814e-06, + "loss": 0.8037, + "step": 4917 + }, + { + "epoch": 0.79, + "grad_norm": 1.364618215557938, + "learning_rate": 2.1758773582483594e-06, + "loss": 0.8156, + "step": 4918 + }, + { + "epoch": 0.79, + "grad_norm": 1.70733421984757, + "learning_rate": 2.172627949249586e-06, + "loss": 0.7832, + "step": 4919 + }, + { + "epoch": 0.79, + "grad_norm": 1.33817857650755, + "learning_rate": 2.169380672642719e-06, + "loss": 0.8458, + "step": 4920 + }, + { + "epoch": 0.79, + "grad_norm": 1.4138624408050462, + "learning_rate": 2.166135529312412e-06, + "loss": 0.8032, + "step": 4921 + }, + { + "epoch": 0.79, + "grad_norm": 1.558824437261852, + "learning_rate": 2.1628925201427285e-06, + "loss": 0.3233, + "step": 4922 + }, + { + "epoch": 0.79, + "grad_norm": 1.3523664770635242, + "learning_rate": 2.159651646017151e-06, + "loss": 0.8588, + "step": 4923 + }, + { + "epoch": 0.79, + "grad_norm": 1.280496848768631, + "learning_rate": 2.156412907818586e-06, + "loss": 0.7931, + "step": 4924 + }, + { + "epoch": 0.79, + "grad_norm": 1.5848470060493143, + "learning_rate": 2.1531763064293587e-06, + "loss": 0.8278, + "step": 4925 + }, + { + "epoch": 0.79, + "grad_norm": 1.3847246345565354, + "learning_rate": 2.1499418427312002e-06, + "loss": 0.8234, + "step": 4926 + }, + { + "epoch": 0.79, + "grad_norm": 1.4651939431258614, + "learning_rate": 2.146709517605271e-06, + "loss": 0.857, + "step": 4927 + }, + { + "epoch": 0.79, + "grad_norm": 1.4799943764088812, + "learning_rate": 2.14347933193215e-06, + "loss": 0.8846, + "step": 4928 + }, + { + "epoch": 0.79, + "grad_norm": 1.3409975307065036, + "learning_rate": 2.1402512865918246e-06, + "loss": 0.7431, + "step": 4929 + }, + { + "epoch": 0.79, + "grad_norm": 1.6089024807819252, + "learning_rate": 2.1370253824637023e-06, + "loss": 0.8386, + "step": 4930 + }, + { + "epoch": 0.79, + "grad_norm": 1.371510306547513, + "learning_rate": 2.1338016204266087e-06, + "loss": 0.8359, + "step": 4931 + }, + { + "epoch": 0.79, + "grad_norm": 1.3809211425779309, + "learning_rate": 2.1305800013587906e-06, + "loss": 0.801, + "step": 4932 + }, + { + "epoch": 0.79, + "grad_norm": 1.477976659202525, + "learning_rate": 2.1273605261379015e-06, + "loss": 0.8911, + "step": 4933 + }, + { + "epoch": 0.79, + "grad_norm": 1.3540908511805727, + "learning_rate": 2.124143195641013e-06, + "loss": 0.7525, + "step": 4934 + }, + { + "epoch": 0.8, + "grad_norm": 1.23683413249256, + "learning_rate": 2.1209280107446195e-06, + "loss": 0.8613, + "step": 4935 + }, + { + "epoch": 0.8, + "grad_norm": 1.3717767666163534, + "learning_rate": 2.117714972324624e-06, + "loss": 0.7137, + "step": 4936 + }, + { + "epoch": 0.8, + "grad_norm": 1.6411127512101575, + "learning_rate": 2.1145040812563432e-06, + "loss": 0.8101, + "step": 4937 + }, + { + "epoch": 0.8, + "grad_norm": 1.3018105332386591, + "learning_rate": 2.1112953384145195e-06, + "loss": 0.7273, + "step": 4938 + }, + { + "epoch": 0.8, + "grad_norm": 1.5321370184354948, + "learning_rate": 2.1080887446732955e-06, + "loss": 0.8928, + "step": 4939 + }, + { + "epoch": 0.8, + "grad_norm": 1.4569233337916576, + "learning_rate": 2.1048843009062424e-06, + "loss": 0.7999, + "step": 4940 + }, + { + "epoch": 0.8, + "grad_norm": 1.4689522208340593, + "learning_rate": 2.1016820079863366e-06, + "loss": 0.8395, + "step": 4941 + }, + { + "epoch": 0.8, + "grad_norm": 1.2633881729492629, + "learning_rate": 2.0984818667859665e-06, + "loss": 0.8091, + "step": 4942 + }, + { + "epoch": 0.8, + "grad_norm": 1.429791619986001, + "learning_rate": 2.0952838781769446e-06, + "loss": 0.8175, + "step": 4943 + }, + { + "epoch": 0.8, + "grad_norm": 1.4025088986031191, + "learning_rate": 2.09208804303049e-06, + "loss": 0.7822, + "step": 4944 + }, + { + "epoch": 0.8, + "grad_norm": 1.4240692389770133, + "learning_rate": 2.088894362217233e-06, + "loss": 0.745, + "step": 4945 + }, + { + "epoch": 0.8, + "grad_norm": 1.4629725228317716, + "learning_rate": 2.0857028366072217e-06, + "loss": 0.8252, + "step": 4946 + }, + { + "epoch": 0.8, + "grad_norm": 0.828667828838698, + "learning_rate": 2.0825134670699232e-06, + "loss": 0.3224, + "step": 4947 + }, + { + "epoch": 0.8, + "grad_norm": 1.9034732524242843, + "learning_rate": 2.0793262544741965e-06, + "loss": 0.766, + "step": 4948 + }, + { + "epoch": 0.8, + "grad_norm": 1.5508099602556762, + "learning_rate": 2.076141199688333e-06, + "loss": 0.7234, + "step": 4949 + }, + { + "epoch": 0.8, + "grad_norm": 1.2106260760612215, + "learning_rate": 2.072958303580033e-06, + "loss": 0.8143, + "step": 4950 + }, + { + "epoch": 0.8, + "grad_norm": 1.305842215539109, + "learning_rate": 2.0697775670164e-06, + "loss": 0.7832, + "step": 4951 + }, + { + "epoch": 0.8, + "grad_norm": 1.3152337498290907, + "learning_rate": 2.0665989908639526e-06, + "loss": 0.9129, + "step": 4952 + }, + { + "epoch": 0.8, + "grad_norm": 1.5543578004364778, + "learning_rate": 2.063422575988626e-06, + "loss": 0.8357, + "step": 4953 + }, + { + "epoch": 0.8, + "grad_norm": 1.3797778192897858, + "learning_rate": 2.060248323255769e-06, + "loss": 0.7668, + "step": 4954 + }, + { + "epoch": 0.8, + "grad_norm": 1.6743870511952674, + "learning_rate": 2.057076233530123e-06, + "loss": 0.8455, + "step": 4955 + }, + { + "epoch": 0.8, + "grad_norm": 1.3367015399415796, + "learning_rate": 2.053906307675859e-06, + "loss": 0.8379, + "step": 4956 + }, + { + "epoch": 0.8, + "grad_norm": 1.4031000342728666, + "learning_rate": 2.0507385465565557e-06, + "loss": 0.8457, + "step": 4957 + }, + { + "epoch": 0.8, + "grad_norm": 1.4750816732006027, + "learning_rate": 2.0475729510351937e-06, + "loss": 0.875, + "step": 4958 + }, + { + "epoch": 0.8, + "grad_norm": 1.2764264248423196, + "learning_rate": 2.044409521974168e-06, + "loss": 0.7769, + "step": 4959 + }, + { + "epoch": 0.8, + "grad_norm": 0.8383751048744504, + "learning_rate": 2.0412482602352877e-06, + "loss": 0.3404, + "step": 4960 + }, + { + "epoch": 0.8, + "grad_norm": 1.326247915354842, + "learning_rate": 2.038089166679763e-06, + "loss": 0.8359, + "step": 4961 + }, + { + "epoch": 0.8, + "grad_norm": 1.4383470239990104, + "learning_rate": 2.0349322421682238e-06, + "loss": 0.8909, + "step": 4962 + }, + { + "epoch": 0.8, + "grad_norm": 1.244927983400676, + "learning_rate": 2.0317774875606997e-06, + "loss": 0.8782, + "step": 4963 + }, + { + "epoch": 0.8, + "grad_norm": 1.4768380476919143, + "learning_rate": 2.0286249037166316e-06, + "loss": 0.8641, + "step": 4964 + }, + { + "epoch": 0.8, + "grad_norm": 1.56417525546673, + "learning_rate": 2.025474491494874e-06, + "loss": 0.833, + "step": 4965 + }, + { + "epoch": 0.8, + "grad_norm": 1.4872032852321055, + "learning_rate": 2.0223262517536833e-06, + "loss": 0.882, + "step": 4966 + }, + { + "epoch": 0.8, + "grad_norm": 1.7139346492564387, + "learning_rate": 2.0191801853507264e-06, + "loss": 0.8147, + "step": 4967 + }, + { + "epoch": 0.8, + "grad_norm": 1.5696320945675561, + "learning_rate": 2.0160362931430788e-06, + "loss": 0.8434, + "step": 4968 + }, + { + "epoch": 0.8, + "grad_norm": 1.2844456676431177, + "learning_rate": 2.0128945759872264e-06, + "loss": 0.7763, + "step": 4969 + }, + { + "epoch": 0.8, + "grad_norm": 1.5926982188283063, + "learning_rate": 2.009755034739057e-06, + "loss": 0.7946, + "step": 4970 + }, + { + "epoch": 0.8, + "grad_norm": 1.3866952652352946, + "learning_rate": 2.0066176702538675e-06, + "loss": 0.853, + "step": 4971 + }, + { + "epoch": 0.8, + "grad_norm": 1.281355354510402, + "learning_rate": 2.0034824833863652e-06, + "loss": 0.7551, + "step": 4972 + }, + { + "epoch": 0.8, + "grad_norm": 1.4413213343018103, + "learning_rate": 2.0003494749906595e-06, + "loss": 0.7746, + "step": 4973 + }, + { + "epoch": 0.8, + "grad_norm": 1.3317028127368096, + "learning_rate": 1.9972186459202656e-06, + "loss": 0.7318, + "step": 4974 + }, + { + "epoch": 0.8, + "grad_norm": 1.578013515901678, + "learning_rate": 1.9940899970281115e-06, + "loss": 0.8293, + "step": 4975 + }, + { + "epoch": 0.8, + "grad_norm": 1.1725247862507482, + "learning_rate": 1.9909635291665307e-06, + "loss": 0.8366, + "step": 4976 + }, + { + "epoch": 0.8, + "grad_norm": 1.4371700128390135, + "learning_rate": 1.9878392431872506e-06, + "loss": 0.8911, + "step": 4977 + }, + { + "epoch": 0.8, + "grad_norm": 1.3495542491524504, + "learning_rate": 1.984717139941417e-06, + "loss": 0.765, + "step": 4978 + }, + { + "epoch": 0.8, + "grad_norm": 1.382734345662889, + "learning_rate": 1.9815972202795796e-06, + "loss": 0.9208, + "step": 4979 + }, + { + "epoch": 0.8, + "grad_norm": 1.542160043162817, + "learning_rate": 1.9784794850516874e-06, + "loss": 0.9225, + "step": 4980 + }, + { + "epoch": 0.8, + "grad_norm": 1.4904646948596165, + "learning_rate": 1.9753639351070954e-06, + "loss": 0.8672, + "step": 4981 + }, + { + "epoch": 0.8, + "grad_norm": 1.2483181367610416, + "learning_rate": 1.9722505712945715e-06, + "loss": 0.7887, + "step": 4982 + }, + { + "epoch": 0.8, + "grad_norm": 0.8341131122132301, + "learning_rate": 1.969139394462277e-06, + "loss": 0.3182, + "step": 4983 + }, + { + "epoch": 0.8, + "grad_norm": 1.3884108280596756, + "learning_rate": 1.9660304054577815e-06, + "loss": 0.7783, + "step": 4984 + }, + { + "epoch": 0.8, + "grad_norm": 1.4760301515876666, + "learning_rate": 1.962923605128064e-06, + "loss": 0.7501, + "step": 4985 + }, + { + "epoch": 0.8, + "grad_norm": 1.3547770414492288, + "learning_rate": 1.9598189943194965e-06, + "loss": 0.8309, + "step": 4986 + }, + { + "epoch": 0.8, + "grad_norm": 1.2618513214805087, + "learning_rate": 1.956716573877867e-06, + "loss": 0.7802, + "step": 4987 + }, + { + "epoch": 0.8, + "grad_norm": 1.3440339937459915, + "learning_rate": 1.953616344648357e-06, + "loss": 0.7934, + "step": 4988 + }, + { + "epoch": 0.8, + "grad_norm": 1.555677460578652, + "learning_rate": 1.9505183074755516e-06, + "loss": 0.8707, + "step": 4989 + }, + { + "epoch": 0.8, + "grad_norm": 1.4685647396165218, + "learning_rate": 1.9474224632034442e-06, + "loss": 0.8026, + "step": 4990 + }, + { + "epoch": 0.8, + "grad_norm": 1.3080752904300719, + "learning_rate": 1.944328812675432e-06, + "loss": 0.776, + "step": 4991 + }, + { + "epoch": 0.8, + "grad_norm": 1.1745329560056474, + "learning_rate": 1.941237356734307e-06, + "loss": 0.8533, + "step": 4992 + }, + { + "epoch": 0.8, + "grad_norm": 1.4447081765168766, + "learning_rate": 1.938148096222264e-06, + "loss": 0.778, + "step": 4993 + }, + { + "epoch": 0.8, + "grad_norm": 1.398319234056097, + "learning_rate": 1.935061031980909e-06, + "loss": 0.8075, + "step": 4994 + }, + { + "epoch": 0.8, + "grad_norm": 1.3836104009133627, + "learning_rate": 1.9319761648512404e-06, + "loss": 0.8145, + "step": 4995 + }, + { + "epoch": 0.8, + "grad_norm": 1.4189440523103918, + "learning_rate": 1.9288934956736572e-06, + "loss": 0.833, + "step": 4996 + }, + { + "epoch": 0.81, + "grad_norm": 1.381502206556557, + "learning_rate": 1.9258130252879683e-06, + "loss": 0.8189, + "step": 4997 + }, + { + "epoch": 0.81, + "grad_norm": 1.337234391122214, + "learning_rate": 1.922734754533382e-06, + "loss": 0.8829, + "step": 4998 + }, + { + "epoch": 0.81, + "grad_norm": 1.6450991646663586, + "learning_rate": 1.919658684248494e-06, + "loss": 0.8132, + "step": 4999 + }, + { + "epoch": 0.81, + "grad_norm": 1.3288605896889056, + "learning_rate": 1.9165848152713174e-06, + "loss": 0.8382, + "step": 5000 + }, + { + "epoch": 0.81, + "grad_norm": 1.5920815616961541, + "learning_rate": 1.91351314843926e-06, + "loss": 0.8538, + "step": 5001 + }, + { + "epoch": 0.81, + "grad_norm": 1.3735396530560529, + "learning_rate": 1.910443684589127e-06, + "loss": 0.8138, + "step": 5002 + }, + { + "epoch": 0.81, + "grad_norm": 1.373939426866001, + "learning_rate": 1.9073764245571204e-06, + "loss": 0.7805, + "step": 5003 + }, + { + "epoch": 0.81, + "grad_norm": 1.2317954445971735, + "learning_rate": 1.904311369178854e-06, + "loss": 0.7774, + "step": 5004 + }, + { + "epoch": 0.81, + "grad_norm": 1.5220337733999811, + "learning_rate": 1.9012485192893283e-06, + "loss": 0.8089, + "step": 5005 + }, + { + "epoch": 0.81, + "grad_norm": 1.310509976096398, + "learning_rate": 1.8981878757229478e-06, + "loss": 0.8564, + "step": 5006 + }, + { + "epoch": 0.81, + "grad_norm": 1.4455000890961793, + "learning_rate": 1.89512943931352e-06, + "loss": 0.858, + "step": 5007 + }, + { + "epoch": 0.81, + "grad_norm": 0.7857974338097528, + "learning_rate": 1.892073210894242e-06, + "loss": 0.3182, + "step": 5008 + }, + { + "epoch": 0.81, + "grad_norm": 1.3308503413194692, + "learning_rate": 1.8890191912977207e-06, + "loss": 0.8175, + "step": 5009 + }, + { + "epoch": 0.81, + "grad_norm": 1.3915294695467166, + "learning_rate": 1.88596738135595e-06, + "loss": 0.8437, + "step": 5010 + }, + { + "epoch": 0.81, + "grad_norm": 1.240944446585345, + "learning_rate": 1.8829177819003308e-06, + "loss": 0.7826, + "step": 5011 + }, + { + "epoch": 0.81, + "grad_norm": 1.2770327924497142, + "learning_rate": 1.879870393761657e-06, + "loss": 0.8177, + "step": 5012 + }, + { + "epoch": 0.81, + "grad_norm": 1.2381078308343385, + "learning_rate": 1.8768252177701184e-06, + "loss": 0.9168, + "step": 5013 + }, + { + "epoch": 0.81, + "grad_norm": 1.4272174806235682, + "learning_rate": 1.8737822547553086e-06, + "loss": 0.7523, + "step": 5014 + }, + { + "epoch": 0.81, + "grad_norm": 1.6680535645552008, + "learning_rate": 1.870741505546212e-06, + "loss": 0.8354, + "step": 5015 + }, + { + "epoch": 0.81, + "grad_norm": 1.4755194337855941, + "learning_rate": 1.8677029709712147e-06, + "loss": 0.8325, + "step": 5016 + }, + { + "epoch": 0.81, + "grad_norm": 1.2488777755732807, + "learning_rate": 1.8646666518580968e-06, + "loss": 0.7864, + "step": 5017 + }, + { + "epoch": 0.81, + "grad_norm": 1.3643468344444352, + "learning_rate": 1.8616325490340326e-06, + "loss": 0.7791, + "step": 5018 + }, + { + "epoch": 0.81, + "grad_norm": 1.4526373419239407, + "learning_rate": 1.858600663325597e-06, + "loss": 0.7599, + "step": 5019 + }, + { + "epoch": 0.81, + "grad_norm": 1.6143843941003542, + "learning_rate": 1.8555709955587654e-06, + "loss": 0.8687, + "step": 5020 + }, + { + "epoch": 0.81, + "grad_norm": 1.4101868161193605, + "learning_rate": 1.8525435465588914e-06, + "loss": 0.8205, + "step": 5021 + }, + { + "epoch": 0.81, + "grad_norm": 1.5821432756861946, + "learning_rate": 1.8495183171507415e-06, + "loss": 0.8963, + "step": 5022 + }, + { + "epoch": 0.81, + "grad_norm": 1.3947321943354474, + "learning_rate": 1.8464953081584735e-06, + "loss": 0.8352, + "step": 5023 + }, + { + "epoch": 0.81, + "grad_norm": 1.1690344660963368, + "learning_rate": 1.843474520405637e-06, + "loss": 0.7914, + "step": 5024 + }, + { + "epoch": 0.81, + "grad_norm": 1.5407844933383912, + "learning_rate": 1.8404559547151746e-06, + "loss": 0.8012, + "step": 5025 + }, + { + "epoch": 0.81, + "grad_norm": 1.306542936373972, + "learning_rate": 1.8374396119094307e-06, + "loss": 0.781, + "step": 5026 + }, + { + "epoch": 0.81, + "grad_norm": 1.5559624315763207, + "learning_rate": 1.8344254928101401e-06, + "loss": 0.8492, + "step": 5027 + }, + { + "epoch": 0.81, + "grad_norm": 1.3933385107918228, + "learning_rate": 1.8314135982384274e-06, + "loss": 0.8823, + "step": 5028 + }, + { + "epoch": 0.81, + "grad_norm": 1.614022823429547, + "learning_rate": 1.828403929014818e-06, + "loss": 0.811, + "step": 5029 + }, + { + "epoch": 0.81, + "grad_norm": 1.7507100288045265, + "learning_rate": 1.8253964859592343e-06, + "loss": 0.8707, + "step": 5030 + }, + { + "epoch": 0.81, + "grad_norm": 1.4960798867980243, + "learning_rate": 1.8223912698909807e-06, + "loss": 0.782, + "step": 5031 + }, + { + "epoch": 0.81, + "grad_norm": 1.299790092116909, + "learning_rate": 1.81938828162876e-06, + "loss": 0.81, + "step": 5032 + }, + { + "epoch": 0.81, + "grad_norm": 1.475638757867349, + "learning_rate": 1.8163875219906735e-06, + "loss": 0.8497, + "step": 5033 + }, + { + "epoch": 0.81, + "grad_norm": 1.4561037161521124, + "learning_rate": 1.8133889917942093e-06, + "loss": 0.8392, + "step": 5034 + }, + { + "epoch": 0.81, + "grad_norm": 1.2019121982238032, + "learning_rate": 1.8103926918562465e-06, + "loss": 0.7549, + "step": 5035 + }, + { + "epoch": 0.81, + "grad_norm": 1.387506539929717, + "learning_rate": 1.807398622993064e-06, + "loss": 0.8627, + "step": 5036 + }, + { + "epoch": 0.81, + "grad_norm": 1.2750852611407033, + "learning_rate": 1.804406786020325e-06, + "loss": 0.8511, + "step": 5037 + }, + { + "epoch": 0.81, + "grad_norm": 1.2466001064784635, + "learning_rate": 1.8014171817530934e-06, + "loss": 0.806, + "step": 5038 + }, + { + "epoch": 0.81, + "grad_norm": 1.6682169107628473, + "learning_rate": 1.7984298110058162e-06, + "loss": 0.8147, + "step": 5039 + }, + { + "epoch": 0.81, + "grad_norm": 1.4502595546891326, + "learning_rate": 1.7954446745923325e-06, + "loss": 0.7062, + "step": 5040 + }, + { + "epoch": 0.81, + "grad_norm": 1.3866398244882665, + "learning_rate": 1.7924617733258831e-06, + "loss": 0.8324, + "step": 5041 + }, + { + "epoch": 0.81, + "grad_norm": 1.6159636316932655, + "learning_rate": 1.7894811080190888e-06, + "loss": 0.8245, + "step": 5042 + }, + { + "epoch": 0.81, + "grad_norm": 1.4197749897517802, + "learning_rate": 1.7865026794839625e-06, + "loss": 0.7807, + "step": 5043 + }, + { + "epoch": 0.81, + "grad_norm": 1.3108587270617214, + "learning_rate": 1.7835264885319127e-06, + "loss": 0.7052, + "step": 5044 + }, + { + "epoch": 0.81, + "grad_norm": 1.3088003671565214, + "learning_rate": 1.7805525359737375e-06, + "loss": 0.7817, + "step": 5045 + }, + { + "epoch": 0.81, + "grad_norm": 0.8993869325934439, + "learning_rate": 1.7775808226196222e-06, + "loss": 0.3394, + "step": 5046 + }, + { + "epoch": 0.81, + "grad_norm": 1.457712258802836, + "learning_rate": 1.7746113492791407e-06, + "loss": 0.8551, + "step": 5047 + }, + { + "epoch": 0.81, + "grad_norm": 1.543952776597069, + "learning_rate": 1.7716441167612618e-06, + "loss": 0.9814, + "step": 5048 + }, + { + "epoch": 0.81, + "grad_norm": 1.2144080335884664, + "learning_rate": 1.7686791258743475e-06, + "loss": 0.8539, + "step": 5049 + }, + { + "epoch": 0.81, + "grad_norm": 1.2271072338277673, + "learning_rate": 1.765716377426131e-06, + "loss": 0.7837, + "step": 5050 + }, + { + "epoch": 0.81, + "grad_norm": 1.4025052988354734, + "learning_rate": 1.7627558722237525e-06, + "loss": 0.8025, + "step": 5051 + }, + { + "epoch": 0.81, + "grad_norm": 1.6193514129916942, + "learning_rate": 1.759797611073738e-06, + "loss": 0.821, + "step": 5052 + }, + { + "epoch": 0.81, + "grad_norm": 1.2738069425504421, + "learning_rate": 1.7568415947819973e-06, + "loss": 0.8177, + "step": 5053 + }, + { + "epoch": 0.81, + "grad_norm": 1.4691338430692886, + "learning_rate": 1.753887824153827e-06, + "loss": 0.812, + "step": 5054 + }, + { + "epoch": 0.81, + "grad_norm": 1.3654618332988104, + "learning_rate": 1.7509362999939217e-06, + "loss": 0.8219, + "step": 5055 + }, + { + "epoch": 0.81, + "grad_norm": 1.467219133208719, + "learning_rate": 1.7479870231063544e-06, + "loss": 0.8599, + "step": 5056 + }, + { + "epoch": 0.81, + "grad_norm": 1.3472446670074512, + "learning_rate": 1.7450399942945884e-06, + "loss": 0.7314, + "step": 5057 + }, + { + "epoch": 0.81, + "grad_norm": 1.3045122091667622, + "learning_rate": 1.74209521436148e-06, + "loss": 0.7127, + "step": 5058 + }, + { + "epoch": 0.82, + "grad_norm": 1.4135689229651887, + "learning_rate": 1.739152684109262e-06, + "loss": 0.7722, + "step": 5059 + }, + { + "epoch": 0.82, + "grad_norm": 1.406852187590174, + "learning_rate": 1.7362124043395678e-06, + "loss": 0.8515, + "step": 5060 + }, + { + "epoch": 0.82, + "grad_norm": 1.4771758425393624, + "learning_rate": 1.7332743758534076e-06, + "loss": 0.8116, + "step": 5061 + }, + { + "epoch": 0.82, + "grad_norm": 1.2923726165470755, + "learning_rate": 1.730338599451178e-06, + "loss": 0.8634, + "step": 5062 + }, + { + "epoch": 0.82, + "grad_norm": 1.3782086843537737, + "learning_rate": 1.727405075932671e-06, + "loss": 0.8849, + "step": 5063 + }, + { + "epoch": 0.82, + "grad_norm": 1.4676736092137324, + "learning_rate": 1.7244738060970567e-06, + "loss": 0.7694, + "step": 5064 + }, + { + "epoch": 0.82, + "grad_norm": 1.361870870256793, + "learning_rate": 1.7215447907428907e-06, + "loss": 0.8263, + "step": 5065 + }, + { + "epoch": 0.82, + "grad_norm": 1.707800114797023, + "learning_rate": 1.7186180306681221e-06, + "loss": 0.8249, + "step": 5066 + }, + { + "epoch": 0.82, + "grad_norm": 1.200244593714298, + "learning_rate": 1.7156935266700814e-06, + "loss": 0.7765, + "step": 5067 + }, + { + "epoch": 0.82, + "grad_norm": 1.5507454063251245, + "learning_rate": 1.712771279545482e-06, + "loss": 0.7326, + "step": 5068 + }, + { + "epoch": 0.82, + "grad_norm": 1.3112502186302735, + "learning_rate": 1.7098512900904219e-06, + "loss": 0.8443, + "step": 5069 + }, + { + "epoch": 0.82, + "grad_norm": 1.5777756149604878, + "learning_rate": 1.706933559100391e-06, + "loss": 0.8657, + "step": 5070 + }, + { + "epoch": 0.82, + "grad_norm": 1.3952305573733161, + "learning_rate": 1.7040180873702629e-06, + "loss": 0.8325, + "step": 5071 + }, + { + "epoch": 0.82, + "grad_norm": 1.3971682854548926, + "learning_rate": 1.7011048756942817e-06, + "loss": 0.8687, + "step": 5072 + }, + { + "epoch": 0.82, + "grad_norm": 1.4110709285696406, + "learning_rate": 1.6981939248660938e-06, + "loss": 0.7954, + "step": 5073 + }, + { + "epoch": 0.82, + "grad_norm": 1.3690681434932765, + "learning_rate": 1.6952852356787241e-06, + "loss": 0.7396, + "step": 5074 + }, + { + "epoch": 0.82, + "grad_norm": 1.3798590500790802, + "learning_rate": 1.6923788089245764e-06, + "loss": 0.818, + "step": 5075 + }, + { + "epoch": 0.82, + "grad_norm": 0.8469252675699344, + "learning_rate": 1.6894746453954402e-06, + "loss": 0.3169, + "step": 5076 + }, + { + "epoch": 0.82, + "grad_norm": 1.484077686098373, + "learning_rate": 1.6865727458824931e-06, + "loss": 0.8365, + "step": 5077 + }, + { + "epoch": 0.82, + "grad_norm": 1.3619481949965073, + "learning_rate": 1.6836731111762915e-06, + "loss": 0.8204, + "step": 5078 + }, + { + "epoch": 0.82, + "grad_norm": 1.3995813295076152, + "learning_rate": 1.6807757420667737e-06, + "loss": 0.8433, + "step": 5079 + }, + { + "epoch": 0.82, + "grad_norm": 1.4731730141318553, + "learning_rate": 1.6778806393432656e-06, + "loss": 0.811, + "step": 5080 + }, + { + "epoch": 0.82, + "grad_norm": 1.3800042268396344, + "learning_rate": 1.67498780379447e-06, + "loss": 0.8236, + "step": 5081 + }, + { + "epoch": 0.82, + "grad_norm": 1.279325942808528, + "learning_rate": 1.6720972362084798e-06, + "loss": 0.7728, + "step": 5082 + }, + { + "epoch": 0.82, + "grad_norm": 1.5436306783039384, + "learning_rate": 1.6692089373727616e-06, + "loss": 0.8697, + "step": 5083 + }, + { + "epoch": 0.82, + "grad_norm": 0.8720506866655976, + "learning_rate": 1.6663229080741672e-06, + "loss": 0.3233, + "step": 5084 + }, + { + "epoch": 0.82, + "grad_norm": 1.2081759415999347, + "learning_rate": 1.6634391490989333e-06, + "loss": 0.8524, + "step": 5085 + }, + { + "epoch": 0.82, + "grad_norm": 1.2787716712005615, + "learning_rate": 1.6605576612326745e-06, + "loss": 0.8367, + "step": 5086 + }, + { + "epoch": 0.82, + "grad_norm": 1.5530114736359613, + "learning_rate": 1.6576784452603833e-06, + "loss": 0.8714, + "step": 5087 + }, + { + "epoch": 0.82, + "grad_norm": 1.6558686381911316, + "learning_rate": 1.654801501966442e-06, + "loss": 0.7839, + "step": 5088 + }, + { + "epoch": 0.82, + "grad_norm": 1.3853548777546105, + "learning_rate": 1.6519268321346104e-06, + "loss": 0.8155, + "step": 5089 + }, + { + "epoch": 0.82, + "grad_norm": 1.486518980760601, + "learning_rate": 1.6490544365480266e-06, + "loss": 0.8509, + "step": 5090 + }, + { + "epoch": 0.82, + "grad_norm": 1.4579222916588561, + "learning_rate": 1.6461843159892055e-06, + "loss": 0.8492, + "step": 5091 + }, + { + "epoch": 0.82, + "grad_norm": 1.577792589426919, + "learning_rate": 1.643316471240054e-06, + "loss": 0.7621, + "step": 5092 + }, + { + "epoch": 0.82, + "grad_norm": 1.6237117734367241, + "learning_rate": 1.6404509030818493e-06, + "loss": 0.8365, + "step": 5093 + }, + { + "epoch": 0.82, + "grad_norm": 1.1183655103434567, + "learning_rate": 1.6375876122952483e-06, + "loss": 0.7826, + "step": 5094 + }, + { + "epoch": 0.82, + "grad_norm": 1.7174729913457785, + "learning_rate": 1.6347265996602935e-06, + "loss": 0.8131, + "step": 5095 + }, + { + "epoch": 0.82, + "grad_norm": 1.3645984385530405, + "learning_rate": 1.6318678659564046e-06, + "loss": 0.8068, + "step": 5096 + }, + { + "epoch": 0.82, + "grad_norm": 1.3493603353754806, + "learning_rate": 1.629011411962379e-06, + "loss": 0.8071, + "step": 5097 + }, + { + "epoch": 0.82, + "grad_norm": 1.5601298165219113, + "learning_rate": 1.6261572384563895e-06, + "loss": 0.827, + "step": 5098 + }, + { + "epoch": 0.82, + "grad_norm": 1.4175912621247566, + "learning_rate": 1.6233053462159965e-06, + "loss": 0.858, + "step": 5099 + }, + { + "epoch": 0.82, + "grad_norm": 1.4559034541473175, + "learning_rate": 1.6204557360181328e-06, + "loss": 0.7735, + "step": 5100 + }, + { + "epoch": 0.82, + "grad_norm": 0.9364697875566013, + "learning_rate": 1.6176084086391075e-06, + "loss": 0.346, + "step": 5101 + }, + { + "epoch": 0.82, + "grad_norm": 1.1845600901364681, + "learning_rate": 1.6147633648546157e-06, + "loss": 0.7923, + "step": 5102 + }, + { + "epoch": 0.82, + "grad_norm": 1.450168938188232, + "learning_rate": 1.6119206054397218e-06, + "loss": 0.7317, + "step": 5103 + }, + { + "epoch": 0.82, + "grad_norm": 1.3708730666751399, + "learning_rate": 1.6090801311688764e-06, + "loss": 0.7384, + "step": 5104 + }, + { + "epoch": 0.82, + "grad_norm": 1.4215141437146308, + "learning_rate": 1.6062419428158993e-06, + "loss": 0.7724, + "step": 5105 + }, + { + "epoch": 0.82, + "grad_norm": 1.3292631720923311, + "learning_rate": 1.603406041153991e-06, + "loss": 0.843, + "step": 5106 + }, + { + "epoch": 0.82, + "grad_norm": 1.4004163008162296, + "learning_rate": 1.6005724269557322e-06, + "loss": 0.8301, + "step": 5107 + }, + { + "epoch": 0.82, + "grad_norm": 1.2867549262030802, + "learning_rate": 1.5977411009930743e-06, + "loss": 0.8876, + "step": 5108 + }, + { + "epoch": 0.82, + "grad_norm": 1.644813344075044, + "learning_rate": 1.5949120640373517e-06, + "loss": 0.836, + "step": 5109 + }, + { + "epoch": 0.82, + "grad_norm": 1.2354139093641139, + "learning_rate": 1.5920853168592676e-06, + "loss": 0.7942, + "step": 5110 + }, + { + "epoch": 0.82, + "grad_norm": 1.6468215746420534, + "learning_rate": 1.5892608602289129e-06, + "loss": 0.7912, + "step": 5111 + }, + { + "epoch": 0.82, + "grad_norm": 1.295500220522812, + "learning_rate": 1.5864386949157419e-06, + "loss": 0.8419, + "step": 5112 + }, + { + "epoch": 0.82, + "grad_norm": 1.2374374334246054, + "learning_rate": 1.5836188216885895e-06, + "loss": 0.8372, + "step": 5113 + }, + { + "epoch": 0.82, + "grad_norm": 1.2816757266810712, + "learning_rate": 1.5808012413156715e-06, + "loss": 0.7773, + "step": 5114 + }, + { + "epoch": 0.82, + "grad_norm": 1.2494049406769672, + "learning_rate": 1.5779859545645714e-06, + "loss": 0.7565, + "step": 5115 + }, + { + "epoch": 0.82, + "grad_norm": 1.6918840884151998, + "learning_rate": 1.5751729622022494e-06, + "loss": 0.8635, + "step": 5116 + }, + { + "epoch": 0.82, + "grad_norm": 1.7806019367469736, + "learning_rate": 1.5723622649950442e-06, + "loss": 0.879, + "step": 5117 + }, + { + "epoch": 0.82, + "grad_norm": 1.288220068171052, + "learning_rate": 1.5695538637086693e-06, + "loss": 0.7994, + "step": 5118 + }, + { + "epoch": 0.82, + "grad_norm": 1.5279081227515365, + "learning_rate": 1.5667477591082092e-06, + "loss": 0.7591, + "step": 5119 + }, + { + "epoch": 0.82, + "grad_norm": 1.2645970967450206, + "learning_rate": 1.5639439519581212e-06, + "loss": 0.8308, + "step": 5120 + }, + { + "epoch": 0.83, + "grad_norm": 1.4517170394186936, + "learning_rate": 1.5611424430222432e-06, + "loss": 0.7999, + "step": 5121 + }, + { + "epoch": 0.83, + "grad_norm": 1.2419227495652463, + "learning_rate": 1.5583432330637826e-06, + "loss": 0.7769, + "step": 5122 + }, + { + "epoch": 0.83, + "grad_norm": 1.3925831321155473, + "learning_rate": 1.5555463228453193e-06, + "loss": 0.818, + "step": 5123 + }, + { + "epoch": 0.83, + "grad_norm": 1.1159012716213748, + "learning_rate": 1.5527517131288128e-06, + "loss": 0.806, + "step": 5124 + }, + { + "epoch": 0.83, + "grad_norm": 1.3369591867649087, + "learning_rate": 1.5499594046755862e-06, + "loss": 0.7984, + "step": 5125 + }, + { + "epoch": 0.83, + "grad_norm": 1.5889560273695866, + "learning_rate": 1.5471693982463476e-06, + "loss": 0.8504, + "step": 5126 + }, + { + "epoch": 0.83, + "grad_norm": 1.3704076351712626, + "learning_rate": 1.5443816946011658e-06, + "loss": 0.7968, + "step": 5127 + }, + { + "epoch": 0.83, + "grad_norm": 1.2752378607388382, + "learning_rate": 1.5415962944994933e-06, + "loss": 0.795, + "step": 5128 + }, + { + "epoch": 0.83, + "grad_norm": 1.281199917308717, + "learning_rate": 1.5388131987001464e-06, + "loss": 0.8541, + "step": 5129 + }, + { + "epoch": 0.83, + "grad_norm": 1.666358964922515, + "learning_rate": 1.5360324079613175e-06, + "loss": 0.7873, + "step": 5130 + }, + { + "epoch": 0.83, + "grad_norm": 1.3802410741887337, + "learning_rate": 1.533253923040573e-06, + "loss": 0.8144, + "step": 5131 + }, + { + "epoch": 0.83, + "grad_norm": 1.531867712825312, + "learning_rate": 1.5304777446948448e-06, + "loss": 0.8894, + "step": 5132 + }, + { + "epoch": 0.83, + "grad_norm": 1.4776925475583678, + "learning_rate": 1.527703873680445e-06, + "loss": 0.6912, + "step": 5133 + }, + { + "epoch": 0.83, + "grad_norm": 1.349230626892801, + "learning_rate": 1.52493231075305e-06, + "loss": 0.816, + "step": 5134 + }, + { + "epoch": 0.83, + "grad_norm": 1.5196554622287257, + "learning_rate": 1.522163056667708e-06, + "loss": 0.8735, + "step": 5135 + }, + { + "epoch": 0.83, + "grad_norm": 1.1917645134622263, + "learning_rate": 1.5193961121788448e-06, + "loss": 0.813, + "step": 5136 + }, + { + "epoch": 0.83, + "grad_norm": 1.3821789218130558, + "learning_rate": 1.5166314780402492e-06, + "loss": 0.7479, + "step": 5137 + }, + { + "epoch": 0.83, + "grad_norm": 1.388687201440847, + "learning_rate": 1.513869155005082e-06, + "loss": 0.6952, + "step": 5138 + }, + { + "epoch": 0.83, + "grad_norm": 1.6593997196468362, + "learning_rate": 1.5111091438258796e-06, + "loss": 0.7562, + "step": 5139 + }, + { + "epoch": 0.83, + "grad_norm": 1.3286637013102585, + "learning_rate": 1.5083514452545488e-06, + "loss": 0.7483, + "step": 5140 + }, + { + "epoch": 0.83, + "grad_norm": 1.472185802193547, + "learning_rate": 1.5055960600423524e-06, + "loss": 0.7627, + "step": 5141 + }, + { + "epoch": 0.83, + "grad_norm": 1.4975917477913243, + "learning_rate": 1.5028429889399388e-06, + "loss": 0.7848, + "step": 5142 + }, + { + "epoch": 0.83, + "grad_norm": 1.3177877331234338, + "learning_rate": 1.5000922326973233e-06, + "loss": 0.7978, + "step": 5143 + }, + { + "epoch": 0.83, + "grad_norm": 1.3262964942932212, + "learning_rate": 1.4973437920638856e-06, + "loss": 0.82, + "step": 5144 + }, + { + "epoch": 0.83, + "grad_norm": 1.3916283748787606, + "learning_rate": 1.4945976677883723e-06, + "loss": 0.8408, + "step": 5145 + }, + { + "epoch": 0.83, + "grad_norm": 0.8167666399544077, + "learning_rate": 1.4918538606189104e-06, + "loss": 0.3028, + "step": 5146 + }, + { + "epoch": 0.83, + "grad_norm": 1.466419347263555, + "learning_rate": 1.4891123713029832e-06, + "loss": 0.8332, + "step": 5147 + }, + { + "epoch": 0.83, + "grad_norm": 1.5608451639345016, + "learning_rate": 1.4863732005874509e-06, + "loss": 0.8029, + "step": 5148 + }, + { + "epoch": 0.83, + "grad_norm": 1.483068594634602, + "learning_rate": 1.483636349218538e-06, + "loss": 0.859, + "step": 5149 + }, + { + "epoch": 0.83, + "grad_norm": 1.3787146139980786, + "learning_rate": 1.4809018179418388e-06, + "loss": 0.8477, + "step": 5150 + }, + { + "epoch": 0.83, + "grad_norm": 1.5476422742995382, + "learning_rate": 1.4781696075023156e-06, + "loss": 0.8135, + "step": 5151 + }, + { + "epoch": 0.83, + "grad_norm": 1.3864963484112915, + "learning_rate": 1.4754397186442947e-06, + "loss": 0.8719, + "step": 5152 + }, + { + "epoch": 0.83, + "grad_norm": 1.231744563492654, + "learning_rate": 1.4727121521114784e-06, + "loss": 0.8273, + "step": 5153 + }, + { + "epoch": 0.83, + "grad_norm": 1.4045198380272634, + "learning_rate": 1.4699869086469242e-06, + "loss": 0.8591, + "step": 5154 + }, + { + "epoch": 0.83, + "grad_norm": 1.2808932456086721, + "learning_rate": 1.4672639889930707e-06, + "loss": 0.8424, + "step": 5155 + }, + { + "epoch": 0.83, + "grad_norm": 1.3762489188730964, + "learning_rate": 1.464543393891712e-06, + "loss": 0.8835, + "step": 5156 + }, + { + "epoch": 0.83, + "grad_norm": 1.3941458248266982, + "learning_rate": 1.461825124084012e-06, + "loss": 0.8291, + "step": 5157 + }, + { + "epoch": 0.83, + "grad_norm": 1.3477287729071803, + "learning_rate": 1.4591091803105072e-06, + "loss": 0.9062, + "step": 5158 + }, + { + "epoch": 0.83, + "grad_norm": 1.88062324507363, + "learning_rate": 1.4563955633110926e-06, + "loss": 0.8585, + "step": 5159 + }, + { + "epoch": 0.83, + "grad_norm": 1.3350133669309923, + "learning_rate": 1.453684273825029e-06, + "loss": 0.7884, + "step": 5160 + }, + { + "epoch": 0.83, + "grad_norm": 1.4369804795006753, + "learning_rate": 1.450975312590951e-06, + "loss": 0.8529, + "step": 5161 + }, + { + "epoch": 0.83, + "grad_norm": 1.2923006156700652, + "learning_rate": 1.448268680346857e-06, + "loss": 0.807, + "step": 5162 + }, + { + "epoch": 0.83, + "grad_norm": 0.9007617646845156, + "learning_rate": 1.4455643778301e-06, + "loss": 0.344, + "step": 5163 + }, + { + "epoch": 0.83, + "grad_norm": 1.1399077465316996, + "learning_rate": 1.442862405777411e-06, + "loss": 0.8053, + "step": 5164 + }, + { + "epoch": 0.83, + "grad_norm": 1.0635498720443177, + "learning_rate": 1.440162764924884e-06, + "loss": 0.343, + "step": 5165 + }, + { + "epoch": 0.83, + "grad_norm": 1.5914569245998573, + "learning_rate": 1.4374654560079725e-06, + "loss": 0.739, + "step": 5166 + }, + { + "epoch": 0.83, + "grad_norm": 1.5971842443977484, + "learning_rate": 1.434770479761497e-06, + "loss": 0.7933, + "step": 5167 + }, + { + "epoch": 0.83, + "grad_norm": 1.2015511204815645, + "learning_rate": 1.4320778369196443e-06, + "loss": 0.8044, + "step": 5168 + }, + { + "epoch": 0.83, + "grad_norm": 1.4362604930112997, + "learning_rate": 1.4293875282159698e-06, + "loss": 0.8369, + "step": 5169 + }, + { + "epoch": 0.83, + "grad_norm": 0.8827222244134763, + "learning_rate": 1.4266995543833772e-06, + "loss": 0.3448, + "step": 5170 + }, + { + "epoch": 0.83, + "grad_norm": 1.3814882632983223, + "learning_rate": 1.424013916154151e-06, + "loss": 0.8432, + "step": 5171 + }, + { + "epoch": 0.83, + "grad_norm": 1.4217982598964616, + "learning_rate": 1.421330614259936e-06, + "loss": 0.8473, + "step": 5172 + }, + { + "epoch": 0.83, + "grad_norm": 1.3341171357763792, + "learning_rate": 1.4186496494317325e-06, + "loss": 0.6961, + "step": 5173 + }, + { + "epoch": 0.83, + "grad_norm": 1.285275248696766, + "learning_rate": 1.415971022399909e-06, + "loss": 0.7634, + "step": 5174 + }, + { + "epoch": 0.83, + "grad_norm": 1.5157854453678503, + "learning_rate": 1.4132947338942016e-06, + "loss": 0.8047, + "step": 5175 + }, + { + "epoch": 0.83, + "grad_norm": 1.4247176449015786, + "learning_rate": 1.4106207846437003e-06, + "loss": 0.8821, + "step": 5176 + }, + { + "epoch": 0.83, + "grad_norm": 1.653967286890549, + "learning_rate": 1.407949175376867e-06, + "loss": 0.802, + "step": 5177 + }, + { + "epoch": 0.83, + "grad_norm": 1.4985355987498457, + "learning_rate": 1.4052799068215206e-06, + "loss": 0.8427, + "step": 5178 + }, + { + "epoch": 0.83, + "grad_norm": 1.3427708324060312, + "learning_rate": 1.4026129797048393e-06, + "loss": 0.8274, + "step": 5179 + }, + { + "epoch": 0.83, + "grad_norm": 1.280001738428238, + "learning_rate": 1.399948394753372e-06, + "loss": 0.7792, + "step": 5180 + }, + { + "epoch": 0.83, + "grad_norm": 1.374135420193684, + "learning_rate": 1.3972861526930249e-06, + "loss": 0.8397, + "step": 5181 + }, + { + "epoch": 0.83, + "grad_norm": 1.2226981416356757, + "learning_rate": 1.394626254249063e-06, + "loss": 0.7853, + "step": 5182 + }, + { + "epoch": 0.84, + "grad_norm": 1.4674074778926702, + "learning_rate": 1.391968700146118e-06, + "loss": 0.8222, + "step": 5183 + }, + { + "epoch": 0.84, + "grad_norm": 1.5871191672779557, + "learning_rate": 1.389313491108184e-06, + "loss": 0.8152, + "step": 5184 + }, + { + "epoch": 0.84, + "grad_norm": 1.3496408327745233, + "learning_rate": 1.386660627858607e-06, + "loss": 0.7802, + "step": 5185 + }, + { + "epoch": 0.84, + "grad_norm": 1.3111024366906372, + "learning_rate": 1.384010111120102e-06, + "loss": 0.8052, + "step": 5186 + }, + { + "epoch": 0.84, + "grad_norm": 1.1420869260492195, + "learning_rate": 1.3813619416147472e-06, + "loss": 0.7785, + "step": 5187 + }, + { + "epoch": 0.84, + "grad_norm": 1.283622370561614, + "learning_rate": 1.378716120063973e-06, + "loss": 0.8175, + "step": 5188 + }, + { + "epoch": 0.84, + "grad_norm": 1.5492890022227461, + "learning_rate": 1.3760726471885722e-06, + "loss": 0.8577, + "step": 5189 + }, + { + "epoch": 0.84, + "grad_norm": 1.2629827170830878, + "learning_rate": 1.3734315237087027e-06, + "loss": 0.8756, + "step": 5190 + }, + { + "epoch": 0.84, + "grad_norm": 1.3352656041201154, + "learning_rate": 1.3707927503438833e-06, + "loss": 0.8213, + "step": 5191 + }, + { + "epoch": 0.84, + "grad_norm": 1.4744377885865863, + "learning_rate": 1.3681563278129794e-06, + "loss": 0.8156, + "step": 5192 + }, + { + "epoch": 0.84, + "grad_norm": 1.261076051561022, + "learning_rate": 1.3655222568342308e-06, + "loss": 0.8368, + "step": 5193 + }, + { + "epoch": 0.84, + "grad_norm": 1.3144799466393677, + "learning_rate": 1.3628905381252322e-06, + "loss": 0.7739, + "step": 5194 + }, + { + "epoch": 0.84, + "grad_norm": 1.6395217552344568, + "learning_rate": 1.3602611724029335e-06, + "loss": 0.8914, + "step": 5195 + }, + { + "epoch": 0.84, + "grad_norm": 1.4516780722571418, + "learning_rate": 1.3576341603836462e-06, + "loss": 0.8117, + "step": 5196 + }, + { + "epoch": 0.84, + "grad_norm": 0.7639226824773393, + "learning_rate": 1.3550095027830435e-06, + "loss": 0.327, + "step": 5197 + }, + { + "epoch": 0.84, + "grad_norm": 1.405972291095598, + "learning_rate": 1.352387200316152e-06, + "loss": 0.796, + "step": 5198 + }, + { + "epoch": 0.84, + "grad_norm": 1.3665672772124704, + "learning_rate": 1.3497672536973594e-06, + "loss": 0.8, + "step": 5199 + }, + { + "epoch": 0.84, + "grad_norm": 1.5911074262771294, + "learning_rate": 1.3471496636404124e-06, + "loss": 0.8445, + "step": 5200 + }, + { + "epoch": 0.84, + "grad_norm": 1.355154899160837, + "learning_rate": 1.344534430858413e-06, + "loss": 0.7902, + "step": 5201 + }, + { + "epoch": 0.84, + "grad_norm": 0.8715510862397434, + "learning_rate": 1.3419215560638265e-06, + "loss": 0.3488, + "step": 5202 + }, + { + "epoch": 0.84, + "grad_norm": 1.361774718862833, + "learning_rate": 1.3393110399684695e-06, + "loss": 0.8451, + "step": 5203 + }, + { + "epoch": 0.84, + "grad_norm": 1.559890431159658, + "learning_rate": 1.3367028832835149e-06, + "loss": 0.8582, + "step": 5204 + }, + { + "epoch": 0.84, + "grad_norm": 1.2309376073119616, + "learning_rate": 1.3340970867195014e-06, + "loss": 0.7595, + "step": 5205 + }, + { + "epoch": 0.84, + "grad_norm": 1.886840777863905, + "learning_rate": 1.331493650986323e-06, + "loss": 0.8158, + "step": 5206 + }, + { + "epoch": 0.84, + "grad_norm": 1.292222095543013, + "learning_rate": 1.328892576793217e-06, + "loss": 0.801, + "step": 5207 + }, + { + "epoch": 0.84, + "grad_norm": 1.3361651773133782, + "learning_rate": 1.3262938648487955e-06, + "loss": 0.7986, + "step": 5208 + }, + { + "epoch": 0.84, + "grad_norm": 1.3923923153124236, + "learning_rate": 1.3236975158610178e-06, + "loss": 0.7587, + "step": 5209 + }, + { + "epoch": 0.84, + "grad_norm": 1.4426340641060156, + "learning_rate": 1.321103530537201e-06, + "loss": 0.8023, + "step": 5210 + }, + { + "epoch": 0.84, + "grad_norm": 1.5681359879012584, + "learning_rate": 1.318511909584016e-06, + "loss": 0.8019, + "step": 5211 + }, + { + "epoch": 0.84, + "grad_norm": 1.3857614171103256, + "learning_rate": 1.3159226537074933e-06, + "loss": 0.7778, + "step": 5212 + }, + { + "epoch": 0.84, + "grad_norm": 1.4395662483914837, + "learning_rate": 1.3133357636130217e-06, + "loss": 0.8223, + "step": 5213 + }, + { + "epoch": 0.84, + "grad_norm": 1.3313365261463554, + "learning_rate": 1.3107512400053335e-06, + "loss": 0.8549, + "step": 5214 + }, + { + "epoch": 0.84, + "grad_norm": 1.5227750878618302, + "learning_rate": 1.3081690835885274e-06, + "loss": 0.8435, + "step": 5215 + }, + { + "epoch": 0.84, + "grad_norm": 1.3533725628515563, + "learning_rate": 1.3055892950660576e-06, + "loss": 0.7815, + "step": 5216 + }, + { + "epoch": 0.84, + "grad_norm": 1.473533881110601, + "learning_rate": 1.303011875140726e-06, + "loss": 0.7739, + "step": 5217 + }, + { + "epoch": 0.84, + "grad_norm": 1.3354294731870167, + "learning_rate": 1.3004368245146913e-06, + "loss": 0.8477, + "step": 5218 + }, + { + "epoch": 0.84, + "grad_norm": 1.3491595773941265, + "learning_rate": 1.2978641438894735e-06, + "loss": 0.7942, + "step": 5219 + }, + { + "epoch": 0.84, + "grad_norm": 0.9494856539621644, + "learning_rate": 1.2952938339659382e-06, + "loss": 0.3366, + "step": 5220 + }, + { + "epoch": 0.84, + "grad_norm": 0.7534712854780335, + "learning_rate": 1.2927258954443066e-06, + "loss": 0.3279, + "step": 5221 + }, + { + "epoch": 0.84, + "grad_norm": 1.4389656767180963, + "learning_rate": 1.2901603290241615e-06, + "loss": 0.8152, + "step": 5222 + }, + { + "epoch": 0.84, + "grad_norm": 1.1974218423745941, + "learning_rate": 1.287597135404428e-06, + "loss": 0.8119, + "step": 5223 + }, + { + "epoch": 0.84, + "grad_norm": 1.3273289107250414, + "learning_rate": 1.2850363152833955e-06, + "loss": 0.8064, + "step": 5224 + }, + { + "epoch": 0.84, + "grad_norm": 1.5597652368428019, + "learning_rate": 1.2824778693587014e-06, + "loss": 0.8237, + "step": 5225 + }, + { + "epoch": 0.84, + "grad_norm": 1.2511474636321367, + "learning_rate": 1.2799217983273326e-06, + "loss": 0.7949, + "step": 5226 + }, + { + "epoch": 0.84, + "grad_norm": 1.5563569857503288, + "learning_rate": 1.2773681028856366e-06, + "loss": 0.834, + "step": 5227 + }, + { + "epoch": 0.84, + "grad_norm": 1.4451860262976652, + "learning_rate": 1.2748167837293113e-06, + "loss": 0.7915, + "step": 5228 + }, + { + "epoch": 0.84, + "grad_norm": 1.2503151693729173, + "learning_rate": 1.2722678415534062e-06, + "loss": 0.823, + "step": 5229 + }, + { + "epoch": 0.84, + "grad_norm": 1.4464895649053773, + "learning_rate": 1.2697212770523205e-06, + "loss": 0.773, + "step": 5230 + }, + { + "epoch": 0.84, + "grad_norm": 1.5416266960657676, + "learning_rate": 1.2671770909198122e-06, + "loss": 0.836, + "step": 5231 + }, + { + "epoch": 0.84, + "grad_norm": 1.4724177842876036, + "learning_rate": 1.264635283848985e-06, + "loss": 0.8444, + "step": 5232 + }, + { + "epoch": 0.84, + "grad_norm": 1.582534792319502, + "learning_rate": 1.262095856532297e-06, + "loss": 0.8361, + "step": 5233 + }, + { + "epoch": 0.84, + "grad_norm": 1.4605725394792384, + "learning_rate": 1.2595588096615596e-06, + "loss": 0.8503, + "step": 5234 + }, + { + "epoch": 0.84, + "grad_norm": 1.4133550787009541, + "learning_rate": 1.2570241439279386e-06, + "loss": 0.8771, + "step": 5235 + }, + { + "epoch": 0.84, + "grad_norm": 1.3808401442325793, + "learning_rate": 1.254491860021938e-06, + "loss": 0.8312, + "step": 5236 + }, + { + "epoch": 0.84, + "grad_norm": 1.4008230940828748, + "learning_rate": 1.2519619586334253e-06, + "loss": 0.8276, + "step": 5237 + }, + { + "epoch": 0.84, + "grad_norm": 1.5126666613648918, + "learning_rate": 1.249434440451619e-06, + "loss": 0.7822, + "step": 5238 + }, + { + "epoch": 0.84, + "grad_norm": 1.359159874953255, + "learning_rate": 1.2469093061650816e-06, + "loss": 0.8306, + "step": 5239 + }, + { + "epoch": 0.84, + "grad_norm": 1.188031279306702, + "learning_rate": 1.2443865564617274e-06, + "loss": 0.7191, + "step": 5240 + }, + { + "epoch": 0.84, + "grad_norm": 1.2935344036400596, + "learning_rate": 1.2418661920288278e-06, + "loss": 0.8145, + "step": 5241 + }, + { + "epoch": 0.84, + "grad_norm": 1.465430177924679, + "learning_rate": 1.2393482135529954e-06, + "loss": 0.7872, + "step": 5242 + }, + { + "epoch": 0.84, + "grad_norm": 1.6050514319236442, + "learning_rate": 1.2368326217201976e-06, + "loss": 0.7719, + "step": 5243 + }, + { + "epoch": 0.84, + "grad_norm": 1.3161406362257608, + "learning_rate": 1.2343194172157535e-06, + "loss": 0.7653, + "step": 5244 + }, + { + "epoch": 0.85, + "grad_norm": 1.4057114056497189, + "learning_rate": 1.2318086007243257e-06, + "loss": 0.7967, + "step": 5245 + }, + { + "epoch": 0.85, + "grad_norm": 1.355464617863054, + "learning_rate": 1.2293001729299336e-06, + "loss": 0.8514, + "step": 5246 + }, + { + "epoch": 0.85, + "grad_norm": 1.3290198530128567, + "learning_rate": 1.2267941345159385e-06, + "loss": 0.903, + "step": 5247 + }, + { + "epoch": 0.85, + "grad_norm": 1.4605060025587326, + "learning_rate": 1.2242904861650574e-06, + "loss": 0.8178, + "step": 5248 + }, + { + "epoch": 0.85, + "grad_norm": 1.5257403629731268, + "learning_rate": 1.2217892285593513e-06, + "loss": 0.8219, + "step": 5249 + }, + { + "epoch": 0.85, + "grad_norm": 1.4309607982488288, + "learning_rate": 1.219290362380231e-06, + "loss": 0.816, + "step": 5250 + }, + { + "epoch": 0.85, + "grad_norm": 1.379907772489096, + "learning_rate": 1.2167938883084595e-06, + "loss": 0.857, + "step": 5251 + }, + { + "epoch": 0.85, + "grad_norm": 1.3503707243987104, + "learning_rate": 1.2142998070241407e-06, + "loss": 0.7346, + "step": 5252 + }, + { + "epoch": 0.85, + "grad_norm": 1.2630054258278525, + "learning_rate": 1.2118081192067365e-06, + "loss": 0.8236, + "step": 5253 + }, + { + "epoch": 0.85, + "grad_norm": 1.412888702304788, + "learning_rate": 1.2093188255350485e-06, + "loss": 0.8277, + "step": 5254 + }, + { + "epoch": 0.85, + "grad_norm": 1.4827971391816805, + "learning_rate": 1.2068319266872264e-06, + "loss": 0.7889, + "step": 5255 + }, + { + "epoch": 0.85, + "grad_norm": 1.4803430483756137, + "learning_rate": 1.2043474233407737e-06, + "loss": 0.8284, + "step": 5256 + }, + { + "epoch": 0.85, + "grad_norm": 1.269311023464135, + "learning_rate": 1.2018653161725392e-06, + "loss": 0.7998, + "step": 5257 + }, + { + "epoch": 0.85, + "grad_norm": 1.5161565531822676, + "learning_rate": 1.1993856058587117e-06, + "loss": 0.8194, + "step": 5258 + }, + { + "epoch": 0.85, + "grad_norm": 1.3442780647806312, + "learning_rate": 1.1969082930748343e-06, + "loss": 0.8079, + "step": 5259 + }, + { + "epoch": 0.85, + "grad_norm": 1.6027828190404312, + "learning_rate": 1.1944333784957996e-06, + "loss": 0.8415, + "step": 5260 + }, + { + "epoch": 0.85, + "grad_norm": 1.7932396540712283, + "learning_rate": 1.191960862795839e-06, + "loss": 0.7442, + "step": 5261 + }, + { + "epoch": 0.85, + "grad_norm": 1.4315670470706994, + "learning_rate": 1.1894907466485317e-06, + "loss": 0.7775, + "step": 5262 + }, + { + "epoch": 0.85, + "grad_norm": 1.4226688111864627, + "learning_rate": 1.1870230307268116e-06, + "loss": 0.8064, + "step": 5263 + }, + { + "epoch": 0.85, + "grad_norm": 1.2801781861889887, + "learning_rate": 1.1845577157029474e-06, + "loss": 0.7717, + "step": 5264 + }, + { + "epoch": 0.85, + "grad_norm": 1.3965523743246373, + "learning_rate": 1.1820948022485602e-06, + "loss": 0.8608, + "step": 5265 + }, + { + "epoch": 0.85, + "grad_norm": 1.4052247880012798, + "learning_rate": 1.1796342910346147e-06, + "loss": 0.7928, + "step": 5266 + }, + { + "epoch": 0.85, + "grad_norm": 1.7039930762705902, + "learning_rate": 1.1771761827314254e-06, + "loss": 0.766, + "step": 5267 + }, + { + "epoch": 0.85, + "grad_norm": 1.2651174905273497, + "learning_rate": 1.1747204780086462e-06, + "loss": 0.7832, + "step": 5268 + }, + { + "epoch": 0.85, + "grad_norm": 1.4302796924854877, + "learning_rate": 1.1722671775352778e-06, + "loss": 0.8402, + "step": 5269 + }, + { + "epoch": 0.85, + "grad_norm": 1.518725516225925, + "learning_rate": 1.16981628197967e-06, + "loss": 0.7862, + "step": 5270 + }, + { + "epoch": 0.85, + "grad_norm": 1.296674607346402, + "learning_rate": 1.1673677920095116e-06, + "loss": 0.7981, + "step": 5271 + }, + { + "epoch": 0.85, + "grad_norm": 1.531081827815868, + "learning_rate": 1.1649217082918385e-06, + "loss": 0.8326, + "step": 5272 + }, + { + "epoch": 0.85, + "grad_norm": 1.511847252768553, + "learning_rate": 1.1624780314930339e-06, + "loss": 0.8506, + "step": 5273 + }, + { + "epoch": 0.85, + "grad_norm": 1.550381314621408, + "learning_rate": 1.16003676227882e-06, + "loss": 0.8099, + "step": 5274 + }, + { + "epoch": 0.85, + "grad_norm": 1.2903987273412068, + "learning_rate": 1.157597901314268e-06, + "loss": 0.8061, + "step": 5275 + }, + { + "epoch": 0.85, + "grad_norm": 1.4829916038290576, + "learning_rate": 1.1551614492637908e-06, + "loss": 0.8013, + "step": 5276 + }, + { + "epoch": 0.85, + "grad_norm": 1.3837284352139283, + "learning_rate": 1.152727406791142e-06, + "loss": 0.7984, + "step": 5277 + }, + { + "epoch": 0.85, + "grad_norm": 1.2642908157715427, + "learning_rate": 1.1502957745594256e-06, + "loss": 0.8012, + "step": 5278 + }, + { + "epoch": 0.85, + "grad_norm": 1.2597127367347019, + "learning_rate": 1.1478665532310839e-06, + "loss": 0.8059, + "step": 5279 + }, + { + "epoch": 0.85, + "grad_norm": 1.4216110900259018, + "learning_rate": 1.1454397434679022e-06, + "loss": 0.7939, + "step": 5280 + }, + { + "epoch": 0.85, + "grad_norm": 1.4507115309062562, + "learning_rate": 1.1430153459310112e-06, + "loss": 0.8835, + "step": 5281 + }, + { + "epoch": 0.85, + "grad_norm": 1.3149758811423164, + "learning_rate": 1.1405933612808862e-06, + "loss": 0.7915, + "step": 5282 + }, + { + "epoch": 0.85, + "grad_norm": 1.2809584063375505, + "learning_rate": 1.1381737901773405e-06, + "loss": 0.7705, + "step": 5283 + }, + { + "epoch": 0.85, + "grad_norm": 0.8853702626503501, + "learning_rate": 1.13575663327953e-06, + "loss": 0.3126, + "step": 5284 + }, + { + "epoch": 0.85, + "grad_norm": 1.5293814597969675, + "learning_rate": 1.1333418912459593e-06, + "loss": 0.8855, + "step": 5285 + }, + { + "epoch": 0.85, + "grad_norm": 1.4713699841422747, + "learning_rate": 1.1309295647344675e-06, + "loss": 0.8459, + "step": 5286 + }, + { + "epoch": 0.85, + "grad_norm": 1.4923331579048758, + "learning_rate": 1.1285196544022392e-06, + "loss": 0.9115, + "step": 5287 + }, + { + "epoch": 0.85, + "grad_norm": 1.5148807411438387, + "learning_rate": 1.126112160905799e-06, + "loss": 0.854, + "step": 5288 + }, + { + "epoch": 0.85, + "grad_norm": 1.4175981087772667, + "learning_rate": 1.12370708490102e-06, + "loss": 0.8184, + "step": 5289 + }, + { + "epoch": 0.85, + "grad_norm": 1.319453889951722, + "learning_rate": 1.1213044270431062e-06, + "loss": 0.7745, + "step": 5290 + }, + { + "epoch": 0.85, + "grad_norm": 1.3227461926955983, + "learning_rate": 1.1189041879866081e-06, + "loss": 0.8795, + "step": 5291 + }, + { + "epoch": 0.85, + "grad_norm": 1.4965099466708551, + "learning_rate": 1.1165063683854193e-06, + "loss": 0.8549, + "step": 5292 + }, + { + "epoch": 0.85, + "grad_norm": 1.3253127729833423, + "learning_rate": 1.1141109688927709e-06, + "loss": 0.7515, + "step": 5293 + }, + { + "epoch": 0.85, + "grad_norm": 1.2909171698824828, + "learning_rate": 1.1117179901612328e-06, + "loss": 0.7636, + "step": 5294 + }, + { + "epoch": 0.85, + "grad_norm": 1.35830437185751, + "learning_rate": 1.109327432842725e-06, + "loss": 0.8658, + "step": 5295 + }, + { + "epoch": 0.85, + "grad_norm": 1.4579160007306255, + "learning_rate": 1.106939297588494e-06, + "loss": 0.8115, + "step": 5296 + }, + { + "epoch": 0.85, + "grad_norm": 1.1547597410538475, + "learning_rate": 1.1045535850491396e-06, + "loss": 0.726, + "step": 5297 + }, + { + "epoch": 0.85, + "grad_norm": 1.4124817785843204, + "learning_rate": 1.1021702958745917e-06, + "loss": 0.7804, + "step": 5298 + }, + { + "epoch": 0.85, + "grad_norm": 1.5272846100716195, + "learning_rate": 1.0997894307141244e-06, + "loss": 0.7293, + "step": 5299 + }, + { + "epoch": 0.85, + "grad_norm": 1.494225150566031, + "learning_rate": 1.0974109902163544e-06, + "loss": 0.8458, + "step": 5300 + }, + { + "epoch": 0.85, + "grad_norm": 1.4533799573309003, + "learning_rate": 1.0950349750292311e-06, + "loss": 0.8197, + "step": 5301 + }, + { + "epoch": 0.85, + "grad_norm": 1.2727239809694235, + "learning_rate": 1.0926613858000456e-06, + "loss": 0.8111, + "step": 5302 + }, + { + "epoch": 0.85, + "grad_norm": 1.6093514941869236, + "learning_rate": 1.0902902231754309e-06, + "loss": 0.7813, + "step": 5303 + }, + { + "epoch": 0.85, + "grad_norm": 1.3827534908161128, + "learning_rate": 1.087921487801359e-06, + "loss": 0.7859, + "step": 5304 + }, + { + "epoch": 0.85, + "grad_norm": 1.6754433217814297, + "learning_rate": 1.0855551803231368e-06, + "loss": 0.8533, + "step": 5305 + }, + { + "epoch": 0.85, + "grad_norm": 1.6292151625882672, + "learning_rate": 1.0831913013854101e-06, + "loss": 0.7774, + "step": 5306 + }, + { + "epoch": 0.86, + "grad_norm": 1.5114736136468767, + "learning_rate": 1.080829851632167e-06, + "loss": 0.9358, + "step": 5307 + }, + { + "epoch": 0.86, + "grad_norm": 1.34756071125226, + "learning_rate": 1.0784708317067316e-06, + "loss": 0.7975, + "step": 5308 + }, + { + "epoch": 0.86, + "grad_norm": 1.4740206983406128, + "learning_rate": 1.0761142422517623e-06, + "loss": 0.8691, + "step": 5309 + }, + { + "epoch": 0.86, + "grad_norm": 0.8178306808735657, + "learning_rate": 1.073760083909262e-06, + "loss": 0.3266, + "step": 5310 + }, + { + "epoch": 0.86, + "grad_norm": 1.3789162554391414, + "learning_rate": 1.0714083573205702e-06, + "loss": 0.8508, + "step": 5311 + }, + { + "epoch": 0.86, + "grad_norm": 1.6046089656912332, + "learning_rate": 1.069059063126361e-06, + "loss": 0.8279, + "step": 5312 + }, + { + "epoch": 0.86, + "grad_norm": 1.3729483486329883, + "learning_rate": 1.066712201966642e-06, + "loss": 0.7292, + "step": 5313 + }, + { + "epoch": 0.86, + "grad_norm": 1.554574565164168, + "learning_rate": 1.06436777448077e-06, + "loss": 0.8377, + "step": 5314 + }, + { + "epoch": 0.86, + "grad_norm": 1.2737446981961735, + "learning_rate": 1.0620257813074274e-06, + "loss": 0.7761, + "step": 5315 + }, + { + "epoch": 0.86, + "grad_norm": 1.3167963072651943, + "learning_rate": 1.0596862230846371e-06, + "loss": 0.7619, + "step": 5316 + }, + { + "epoch": 0.86, + "grad_norm": 1.3108927647644004, + "learning_rate": 1.0573491004497637e-06, + "loss": 0.7994, + "step": 5317 + }, + { + "epoch": 0.86, + "grad_norm": 1.6003325438068274, + "learning_rate": 1.055014414039498e-06, + "loss": 0.7997, + "step": 5318 + }, + { + "epoch": 0.86, + "grad_norm": 1.5489227270840964, + "learning_rate": 1.0526821644898777e-06, + "loss": 0.8034, + "step": 5319 + }, + { + "epoch": 0.86, + "grad_norm": 1.5553913907194774, + "learning_rate": 1.0503523524362701e-06, + "loss": 0.811, + "step": 5320 + }, + { + "epoch": 0.86, + "grad_norm": 1.2811553664992879, + "learning_rate": 1.048024978513379e-06, + "loss": 0.8333, + "step": 5321 + }, + { + "epoch": 0.86, + "grad_norm": 1.4998375288075945, + "learning_rate": 1.0457000433552478e-06, + "loss": 0.8023, + "step": 5322 + }, + { + "epoch": 0.86, + "grad_norm": 1.36212382944794, + "learning_rate": 1.0433775475952511e-06, + "loss": 0.8567, + "step": 5323 + }, + { + "epoch": 0.86, + "grad_norm": 1.593792342109697, + "learning_rate": 1.0410574918660998e-06, + "loss": 0.8977, + "step": 5324 + }, + { + "epoch": 0.86, + "grad_norm": 1.3122936523672994, + "learning_rate": 1.0387398767998425e-06, + "loss": 0.8681, + "step": 5325 + }, + { + "epoch": 0.86, + "grad_norm": 1.5096601261818865, + "learning_rate": 1.036424703027863e-06, + "loss": 0.7307, + "step": 5326 + }, + { + "epoch": 0.86, + "grad_norm": 1.345246935979127, + "learning_rate": 1.0341119711808778e-06, + "loss": 0.8123, + "step": 5327 + }, + { + "epoch": 0.86, + "grad_norm": 1.2909270153714156, + "learning_rate": 1.0318016818889343e-06, + "loss": 0.8591, + "step": 5328 + }, + { + "epoch": 0.86, + "grad_norm": 1.4654301830089067, + "learning_rate": 1.0294938357814254e-06, + "loss": 0.844, + "step": 5329 + }, + { + "epoch": 0.86, + "grad_norm": 1.4175686455664849, + "learning_rate": 1.0271884334870685e-06, + "loss": 0.8032, + "step": 5330 + }, + { + "epoch": 0.86, + "grad_norm": 1.3158467742401752, + "learning_rate": 1.0248854756339176e-06, + "loss": 0.8303, + "step": 5331 + }, + { + "epoch": 0.86, + "grad_norm": 1.6588025958390515, + "learning_rate": 1.0225849628493634e-06, + "loss": 0.8009, + "step": 5332 + }, + { + "epoch": 0.86, + "grad_norm": 1.4908201108952597, + "learning_rate": 1.020286895760132e-06, + "loss": 0.7647, + "step": 5333 + }, + { + "epoch": 0.86, + "grad_norm": 1.4396252186359273, + "learning_rate": 1.0179912749922772e-06, + "loss": 0.8503, + "step": 5334 + }, + { + "epoch": 0.86, + "grad_norm": 1.4358349329447966, + "learning_rate": 1.0156981011711875e-06, + "loss": 0.8255, + "step": 5335 + }, + { + "epoch": 0.86, + "grad_norm": 1.4040875955391896, + "learning_rate": 1.01340737492159e-06, + "loss": 0.8663, + "step": 5336 + }, + { + "epoch": 0.86, + "grad_norm": 1.318424795434939, + "learning_rate": 1.011119096867541e-06, + "loss": 0.8305, + "step": 5337 + }, + { + "epoch": 0.86, + "grad_norm": 1.4382492133089426, + "learning_rate": 1.0088332676324285e-06, + "loss": 0.7864, + "step": 5338 + }, + { + "epoch": 0.86, + "grad_norm": 1.3338805922075627, + "learning_rate": 1.006549887838978e-06, + "loss": 0.8641, + "step": 5339 + }, + { + "epoch": 0.86, + "grad_norm": 1.3725193076412905, + "learning_rate": 1.0042689581092424e-06, + "loss": 0.8306, + "step": 5340 + }, + { + "epoch": 0.86, + "grad_norm": 1.5409036415565949, + "learning_rate": 1.001990479064613e-06, + "loss": 0.8476, + "step": 5341 + }, + { + "epoch": 0.86, + "grad_norm": 1.2461600480335424, + "learning_rate": 9.997144513258095e-07, + "loss": 0.8206, + "step": 5342 + }, + { + "epoch": 0.86, + "grad_norm": 1.2934695733205006, + "learning_rate": 9.974408755128817e-07, + "loss": 0.8108, + "step": 5343 + }, + { + "epoch": 0.86, + "grad_norm": 1.3059599262291892, + "learning_rate": 9.951697522452175e-07, + "loss": 0.8188, + "step": 5344 + }, + { + "epoch": 0.86, + "grad_norm": 1.5349864871918681, + "learning_rate": 9.92901082141531e-07, + "loss": 0.7302, + "step": 5345 + }, + { + "epoch": 0.86, + "grad_norm": 1.3265620431573484, + "learning_rate": 9.90634865819874e-07, + "loss": 0.8108, + "step": 5346 + }, + { + "epoch": 0.86, + "grad_norm": 1.3310033028438562, + "learning_rate": 9.883711038976218e-07, + "loss": 0.7748, + "step": 5347 + }, + { + "epoch": 0.86, + "grad_norm": 1.3102875927753372, + "learning_rate": 9.861097969914901e-07, + "loss": 0.8176, + "step": 5348 + }, + { + "epoch": 0.86, + "grad_norm": 1.2864623274013998, + "learning_rate": 9.838509457175183e-07, + "loss": 0.7972, + "step": 5349 + }, + { + "epoch": 0.86, + "grad_norm": 1.8661303435429006, + "learning_rate": 9.815945506910795e-07, + "loss": 0.7891, + "step": 5350 + }, + { + "epoch": 0.86, + "grad_norm": 1.2877256765941425, + "learning_rate": 9.793406125268801e-07, + "loss": 0.8037, + "step": 5351 + }, + { + "epoch": 0.86, + "grad_norm": 1.2962416343179906, + "learning_rate": 9.770891318389542e-07, + "loss": 0.8638, + "step": 5352 + }, + { + "epoch": 0.86, + "grad_norm": 1.5687350990529993, + "learning_rate": 9.748401092406657e-07, + "loss": 0.8161, + "step": 5353 + }, + { + "epoch": 0.86, + "grad_norm": 1.4920707477211714, + "learning_rate": 9.7259354534471e-07, + "loss": 0.9224, + "step": 5354 + }, + { + "epoch": 0.86, + "grad_norm": 1.2192861824876913, + "learning_rate": 9.703494407631176e-07, + "loss": 0.7941, + "step": 5355 + }, + { + "epoch": 0.86, + "grad_norm": 1.5672857248534733, + "learning_rate": 9.681077961072405e-07, + "loss": 0.8246, + "step": 5356 + }, + { + "epoch": 0.86, + "grad_norm": 1.5497155407251213, + "learning_rate": 9.658686119877636e-07, + "loss": 0.8071, + "step": 5357 + }, + { + "epoch": 0.86, + "grad_norm": 1.3386746656980388, + "learning_rate": 9.636318890147057e-07, + "loss": 0.7863, + "step": 5358 + }, + { + "epoch": 0.86, + "grad_norm": 1.5672472814984217, + "learning_rate": 9.613976277974101e-07, + "loss": 0.8036, + "step": 5359 + }, + { + "epoch": 0.86, + "grad_norm": 1.5231821647045571, + "learning_rate": 9.591658289445504e-07, + "loss": 0.7557, + "step": 5360 + }, + { + "epoch": 0.86, + "grad_norm": 1.3801120883445477, + "learning_rate": 9.569364930641323e-07, + "loss": 0.8045, + "step": 5361 + }, + { + "epoch": 0.86, + "grad_norm": 0.8228643643179944, + "learning_rate": 9.547096207634843e-07, + "loss": 0.3415, + "step": 5362 + }, + { + "epoch": 0.86, + "grad_norm": 1.3445452212561466, + "learning_rate": 9.524852126492734e-07, + "loss": 0.8334, + "step": 5363 + }, + { + "epoch": 0.86, + "grad_norm": 1.235738286638803, + "learning_rate": 9.50263269327486e-07, + "loss": 0.7663, + "step": 5364 + }, + { + "epoch": 0.86, + "grad_norm": 1.4175873432611288, + "learning_rate": 9.480437914034402e-07, + "loss": 0.8618, + "step": 5365 + }, + { + "epoch": 0.86, + "grad_norm": 1.3902692460298143, + "learning_rate": 9.458267794817866e-07, + "loss": 0.8039, + "step": 5366 + }, + { + "epoch": 0.86, + "grad_norm": 1.578026350639442, + "learning_rate": 9.436122341664955e-07, + "loss": 0.7876, + "step": 5367 + }, + { + "epoch": 0.86, + "grad_norm": 1.654528290912296, + "learning_rate": 9.414001560608743e-07, + "loss": 0.8799, + "step": 5368 + }, + { + "epoch": 0.87, + "grad_norm": 1.4148816579935815, + "learning_rate": 9.391905457675499e-07, + "loss": 0.8536, + "step": 5369 + }, + { + "epoch": 0.87, + "grad_norm": 1.496873148969768, + "learning_rate": 9.369834038884862e-07, + "loss": 0.8801, + "step": 5370 + }, + { + "epoch": 0.87, + "grad_norm": 1.4258834802217042, + "learning_rate": 9.347787310249668e-07, + "loss": 0.8301, + "step": 5371 + }, + { + "epoch": 0.87, + "grad_norm": 1.4403554793817133, + "learning_rate": 9.325765277776033e-07, + "loss": 0.7295, + "step": 5372 + }, + { + "epoch": 0.87, + "grad_norm": 1.2911688797344099, + "learning_rate": 9.303767947463416e-07, + "loss": 0.8392, + "step": 5373 + }, + { + "epoch": 0.87, + "grad_norm": 1.323787987675487, + "learning_rate": 9.281795325304454e-07, + "loss": 0.757, + "step": 5374 + }, + { + "epoch": 0.87, + "grad_norm": 1.2607834851321609, + "learning_rate": 9.259847417285084e-07, + "loss": 0.7858, + "step": 5375 + }, + { + "epoch": 0.87, + "grad_norm": 1.4108271330432955, + "learning_rate": 9.237924229384554e-07, + "loss": 0.78, + "step": 5376 + }, + { + "epoch": 0.87, + "grad_norm": 1.7111532192887853, + "learning_rate": 9.216025767575376e-07, + "loss": 0.7721, + "step": 5377 + }, + { + "epoch": 0.87, + "grad_norm": 1.2593002953273762, + "learning_rate": 9.194152037823211e-07, + "loss": 0.8091, + "step": 5378 + }, + { + "epoch": 0.87, + "grad_norm": 1.4634044404422597, + "learning_rate": 9.172303046087105e-07, + "loss": 0.768, + "step": 5379 + }, + { + "epoch": 0.87, + "grad_norm": 1.4037826170798435, + "learning_rate": 9.150478798319351e-07, + "loss": 0.8737, + "step": 5380 + }, + { + "epoch": 0.87, + "grad_norm": 1.3267436961216668, + "learning_rate": 9.128679300465459e-07, + "loss": 0.8347, + "step": 5381 + }, + { + "epoch": 0.87, + "grad_norm": 1.2077387885650863, + "learning_rate": 9.10690455846418e-07, + "loss": 0.796, + "step": 5382 + }, + { + "epoch": 0.87, + "grad_norm": 1.3934984824027679, + "learning_rate": 9.085154578247613e-07, + "loss": 0.8502, + "step": 5383 + }, + { + "epoch": 0.87, + "grad_norm": 1.62194190989826, + "learning_rate": 9.063429365740995e-07, + "loss": 0.8264, + "step": 5384 + }, + { + "epoch": 0.87, + "grad_norm": 1.585135511983675, + "learning_rate": 9.041728926862914e-07, + "loss": 0.7999, + "step": 5385 + }, + { + "epoch": 0.87, + "grad_norm": 0.8390152753872244, + "learning_rate": 9.020053267525142e-07, + "loss": 0.3605, + "step": 5386 + }, + { + "epoch": 0.87, + "grad_norm": 1.4046042792933673, + "learning_rate": 8.998402393632755e-07, + "loss": 0.8167, + "step": 5387 + }, + { + "epoch": 0.87, + "grad_norm": 1.1615341329555229, + "learning_rate": 8.976776311084024e-07, + "loss": 0.8176, + "step": 5388 + }, + { + "epoch": 0.87, + "grad_norm": 1.4072841530583289, + "learning_rate": 8.95517502577048e-07, + "loss": 0.8128, + "step": 5389 + }, + { + "epoch": 0.87, + "grad_norm": 1.3823471458201901, + "learning_rate": 8.933598543576938e-07, + "loss": 0.8239, + "step": 5390 + }, + { + "epoch": 0.87, + "grad_norm": 1.499590611719277, + "learning_rate": 8.912046870381397e-07, + "loss": 0.725, + "step": 5391 + }, + { + "epoch": 0.87, + "grad_norm": 1.5252665177385483, + "learning_rate": 8.89052001205517e-07, + "loss": 0.7988, + "step": 5392 + }, + { + "epoch": 0.87, + "grad_norm": 1.3328610950814526, + "learning_rate": 8.869017974462735e-07, + "loss": 0.8139, + "step": 5393 + }, + { + "epoch": 0.87, + "grad_norm": 1.3309228437173843, + "learning_rate": 8.847540763461815e-07, + "loss": 0.8512, + "step": 5394 + }, + { + "epoch": 0.87, + "grad_norm": 1.6036483440232907, + "learning_rate": 8.826088384903453e-07, + "loss": 0.7421, + "step": 5395 + }, + { + "epoch": 0.87, + "grad_norm": 0.9962180854763197, + "learning_rate": 8.804660844631841e-07, + "loss": 0.3408, + "step": 5396 + }, + { + "epoch": 0.87, + "grad_norm": 1.516697891600251, + "learning_rate": 8.783258148484397e-07, + "loss": 0.8111, + "step": 5397 + }, + { + "epoch": 0.87, + "grad_norm": 1.4205706700086675, + "learning_rate": 8.761880302291847e-07, + "loss": 0.8041, + "step": 5398 + }, + { + "epoch": 0.87, + "grad_norm": 1.2827767156454926, + "learning_rate": 8.740527311878133e-07, + "loss": 0.8701, + "step": 5399 + }, + { + "epoch": 0.87, + "grad_norm": 1.1579982019837285, + "learning_rate": 8.719199183060323e-07, + "loss": 0.7499, + "step": 5400 + }, + { + "epoch": 0.87, + "grad_norm": 1.351868499723918, + "learning_rate": 8.697895921648824e-07, + "loss": 0.7565, + "step": 5401 + }, + { + "epoch": 0.87, + "grad_norm": 1.6339733190319774, + "learning_rate": 8.676617533447251e-07, + "loss": 0.8408, + "step": 5402 + }, + { + "epoch": 0.87, + "grad_norm": 1.567736801982584, + "learning_rate": 8.655364024252411e-07, + "loss": 0.7769, + "step": 5403 + }, + { + "epoch": 0.87, + "grad_norm": 1.50482176372812, + "learning_rate": 8.634135399854315e-07, + "loss": 0.8301, + "step": 5404 + }, + { + "epoch": 0.87, + "grad_norm": 1.423011554789644, + "learning_rate": 8.612931666036262e-07, + "loss": 0.778, + "step": 5405 + }, + { + "epoch": 0.87, + "grad_norm": 1.3759096740562957, + "learning_rate": 8.59175282857475e-07, + "loss": 0.8251, + "step": 5406 + }, + { + "epoch": 0.87, + "grad_norm": 1.3709497749780362, + "learning_rate": 8.570598893239413e-07, + "loss": 0.8121, + "step": 5407 + }, + { + "epoch": 0.87, + "grad_norm": 1.571229915797805, + "learning_rate": 8.549469865793214e-07, + "loss": 0.7938, + "step": 5408 + }, + { + "epoch": 0.87, + "grad_norm": 1.3566478525853967, + "learning_rate": 8.528365751992284e-07, + "loss": 0.8598, + "step": 5409 + }, + { + "epoch": 0.87, + "grad_norm": 1.300280629298944, + "learning_rate": 8.507286557585948e-07, + "loss": 0.7454, + "step": 5410 + }, + { + "epoch": 0.87, + "grad_norm": 1.5006524631441105, + "learning_rate": 8.486232288316754e-07, + "loss": 0.7674, + "step": 5411 + }, + { + "epoch": 0.87, + "grad_norm": 1.3890595451619632, + "learning_rate": 8.465202949920492e-07, + "loss": 0.7835, + "step": 5412 + }, + { + "epoch": 0.87, + "grad_norm": 1.42601130145009, + "learning_rate": 8.444198548126103e-07, + "loss": 0.84, + "step": 5413 + }, + { + "epoch": 0.87, + "grad_norm": 1.4124990014108572, + "learning_rate": 8.423219088655788e-07, + "loss": 0.8597, + "step": 5414 + }, + { + "epoch": 0.87, + "grad_norm": 1.261855303494956, + "learning_rate": 8.402264577224928e-07, + "loss": 0.8045, + "step": 5415 + }, + { + "epoch": 0.87, + "grad_norm": 1.5508183587772695, + "learning_rate": 8.381335019542091e-07, + "loss": 0.8461, + "step": 5416 + }, + { + "epoch": 0.87, + "grad_norm": 1.3596466658976296, + "learning_rate": 8.360430421309096e-07, + "loss": 0.8641, + "step": 5417 + }, + { + "epoch": 0.87, + "grad_norm": 1.6317987363582516, + "learning_rate": 8.339550788220907e-07, + "loss": 0.8218, + "step": 5418 + }, + { + "epoch": 0.87, + "grad_norm": 1.4675176959863563, + "learning_rate": 8.318696125965698e-07, + "loss": 0.7307, + "step": 5419 + }, + { + "epoch": 0.87, + "grad_norm": 1.8528473305496553, + "learning_rate": 8.297866440224889e-07, + "loss": 0.7819, + "step": 5420 + }, + { + "epoch": 0.87, + "grad_norm": 1.505922752808056, + "learning_rate": 8.277061736673064e-07, + "loss": 0.8035, + "step": 5421 + }, + { + "epoch": 0.87, + "grad_norm": 1.3763313161030533, + "learning_rate": 8.256282020977957e-07, + "loss": 0.7318, + "step": 5422 + }, + { + "epoch": 0.87, + "grad_norm": 1.3138751938883437, + "learning_rate": 8.235527298800549e-07, + "loss": 0.7871, + "step": 5423 + }, + { + "epoch": 0.87, + "grad_norm": 1.5854933749888387, + "learning_rate": 8.21479757579503e-07, + "loss": 0.7521, + "step": 5424 + }, + { + "epoch": 0.87, + "grad_norm": 1.4420092451811641, + "learning_rate": 8.194092857608726e-07, + "loss": 0.7972, + "step": 5425 + }, + { + "epoch": 0.87, + "grad_norm": 1.2656269198567935, + "learning_rate": 8.173413149882147e-07, + "loss": 0.8772, + "step": 5426 + }, + { + "epoch": 0.87, + "grad_norm": 1.236147276742048, + "learning_rate": 8.152758458249055e-07, + "loss": 0.7712, + "step": 5427 + }, + { + "epoch": 0.87, + "grad_norm": 1.4593082021368307, + "learning_rate": 8.132128788336368e-07, + "loss": 0.7778, + "step": 5428 + }, + { + "epoch": 0.87, + "grad_norm": 2.0272526794073764, + "learning_rate": 8.111524145764116e-07, + "loss": 0.7643, + "step": 5429 + }, + { + "epoch": 0.87, + "grad_norm": 1.2789420403807168, + "learning_rate": 8.090944536145606e-07, + "loss": 0.8856, + "step": 5430 + }, + { + "epoch": 0.88, + "grad_norm": 1.3897070356306853, + "learning_rate": 8.070389965087311e-07, + "loss": 0.7865, + "step": 5431 + }, + { + "epoch": 0.88, + "grad_norm": 1.4157138614732654, + "learning_rate": 8.04986043818885e-07, + "loss": 0.8944, + "step": 5432 + }, + { + "epoch": 0.88, + "grad_norm": 1.8019803837538457, + "learning_rate": 8.029355961043006e-07, + "loss": 0.8657, + "step": 5433 + }, + { + "epoch": 0.88, + "grad_norm": 1.5378706681514454, + "learning_rate": 8.008876539235799e-07, + "loss": 0.7841, + "step": 5434 + }, + { + "epoch": 0.88, + "grad_norm": 1.8344634631822374, + "learning_rate": 7.988422178346378e-07, + "loss": 0.7618, + "step": 5435 + }, + { + "epoch": 0.88, + "grad_norm": 1.2374102484211469, + "learning_rate": 7.967992883947051e-07, + "loss": 0.7747, + "step": 5436 + }, + { + "epoch": 0.88, + "grad_norm": 1.2646487026140678, + "learning_rate": 7.947588661603345e-07, + "loss": 0.846, + "step": 5437 + }, + { + "epoch": 0.88, + "grad_norm": 1.4172264736204214, + "learning_rate": 7.927209516873924e-07, + "loss": 0.7937, + "step": 5438 + }, + { + "epoch": 0.88, + "grad_norm": 1.401876491281377, + "learning_rate": 7.906855455310647e-07, + "loss": 0.762, + "step": 5439 + }, + { + "epoch": 0.88, + "grad_norm": 1.3406368138386309, + "learning_rate": 7.886526482458501e-07, + "loss": 0.8578, + "step": 5440 + }, + { + "epoch": 0.88, + "grad_norm": 1.488750226179341, + "learning_rate": 7.866222603855656e-07, + "loss": 0.7329, + "step": 5441 + }, + { + "epoch": 0.88, + "grad_norm": 1.328318248040999, + "learning_rate": 7.845943825033442e-07, + "loss": 0.8179, + "step": 5442 + }, + { + "epoch": 0.88, + "grad_norm": 1.640855250777269, + "learning_rate": 7.825690151516418e-07, + "loss": 0.8538, + "step": 5443 + }, + { + "epoch": 0.88, + "grad_norm": 1.3985598460015132, + "learning_rate": 7.805461588822161e-07, + "loss": 0.8194, + "step": 5444 + }, + { + "epoch": 0.88, + "grad_norm": 1.2270716252892366, + "learning_rate": 7.785258142461516e-07, + "loss": 0.7748, + "step": 5445 + }, + { + "epoch": 0.88, + "grad_norm": 1.4379233450872904, + "learning_rate": 7.765079817938493e-07, + "loss": 0.8291, + "step": 5446 + }, + { + "epoch": 0.88, + "grad_norm": 1.7274313936811894, + "learning_rate": 7.744926620750193e-07, + "loss": 0.8701, + "step": 5447 + }, + { + "epoch": 0.88, + "grad_norm": 1.396616671371132, + "learning_rate": 7.724798556386892e-07, + "loss": 0.8344, + "step": 5448 + }, + { + "epoch": 0.88, + "grad_norm": 1.336563401528519, + "learning_rate": 7.704695630332048e-07, + "loss": 0.8124, + "step": 5449 + }, + { + "epoch": 0.88, + "grad_norm": 1.4954696840226143, + "learning_rate": 7.684617848062281e-07, + "loss": 0.762, + "step": 5450 + }, + { + "epoch": 0.88, + "grad_norm": 1.5133165286948758, + "learning_rate": 7.664565215047259e-07, + "loss": 0.7706, + "step": 5451 + }, + { + "epoch": 0.88, + "grad_norm": 0.8045847876975954, + "learning_rate": 7.644537736749924e-07, + "loss": 0.3582, + "step": 5452 + }, + { + "epoch": 0.88, + "grad_norm": 1.6282576349070783, + "learning_rate": 7.624535418626323e-07, + "loss": 0.8762, + "step": 5453 + }, + { + "epoch": 0.88, + "grad_norm": 1.3720551086072128, + "learning_rate": 7.604558266125606e-07, + "loss": 0.87, + "step": 5454 + }, + { + "epoch": 0.88, + "grad_norm": 1.3727016225175706, + "learning_rate": 7.584606284690099e-07, + "loss": 0.8063, + "step": 5455 + }, + { + "epoch": 0.88, + "grad_norm": 0.710926184524886, + "learning_rate": 7.56467947975531e-07, + "loss": 0.33, + "step": 5456 + }, + { + "epoch": 0.88, + "grad_norm": 1.5290127486619944, + "learning_rate": 7.544777856749818e-07, + "loss": 0.764, + "step": 5457 + }, + { + "epoch": 0.88, + "grad_norm": 1.476218180903004, + "learning_rate": 7.524901421095365e-07, + "loss": 0.7946, + "step": 5458 + }, + { + "epoch": 0.88, + "grad_norm": 1.4853940508732788, + "learning_rate": 7.505050178206874e-07, + "loss": 0.7391, + "step": 5459 + }, + { + "epoch": 0.88, + "grad_norm": 1.302246677325604, + "learning_rate": 7.485224133492341e-07, + "loss": 0.7194, + "step": 5460 + }, + { + "epoch": 0.88, + "grad_norm": 1.3149103903981434, + "learning_rate": 7.465423292352947e-07, + "loss": 0.8352, + "step": 5461 + }, + { + "epoch": 0.88, + "grad_norm": 0.8798223283112991, + "learning_rate": 7.445647660182987e-07, + "loss": 0.3494, + "step": 5462 + }, + { + "epoch": 0.88, + "grad_norm": 1.522163359278408, + "learning_rate": 7.425897242369861e-07, + "loss": 0.7849, + "step": 5463 + }, + { + "epoch": 0.88, + "grad_norm": 1.3410667052737066, + "learning_rate": 7.406172044294157e-07, + "loss": 0.844, + "step": 5464 + }, + { + "epoch": 0.88, + "grad_norm": 1.6111707390880774, + "learning_rate": 7.386472071329543e-07, + "loss": 0.7971, + "step": 5465 + }, + { + "epoch": 0.88, + "grad_norm": 1.7121131539656658, + "learning_rate": 7.366797328842856e-07, + "loss": 0.7864, + "step": 5466 + }, + { + "epoch": 0.88, + "grad_norm": 1.4595184395565537, + "learning_rate": 7.347147822194012e-07, + "loss": 0.8347, + "step": 5467 + }, + { + "epoch": 0.88, + "grad_norm": 1.3449681189057456, + "learning_rate": 7.327523556736104e-07, + "loss": 0.771, + "step": 5468 + }, + { + "epoch": 0.88, + "grad_norm": 0.849130815798013, + "learning_rate": 7.307924537815314e-07, + "loss": 0.3358, + "step": 5469 + }, + { + "epoch": 0.88, + "grad_norm": 1.7104378742622606, + "learning_rate": 7.288350770770935e-07, + "loss": 0.7534, + "step": 5470 + }, + { + "epoch": 0.88, + "grad_norm": 1.4261393345610522, + "learning_rate": 7.268802260935415e-07, + "loss": 0.8283, + "step": 5471 + }, + { + "epoch": 0.88, + "grad_norm": 1.5201861801478926, + "learning_rate": 7.249279013634348e-07, + "loss": 0.7815, + "step": 5472 + }, + { + "epoch": 0.88, + "grad_norm": 1.343667400324821, + "learning_rate": 7.229781034186323e-07, + "loss": 0.815, + "step": 5473 + }, + { + "epoch": 0.88, + "grad_norm": 1.5539203289309018, + "learning_rate": 7.210308327903182e-07, + "loss": 0.866, + "step": 5474 + }, + { + "epoch": 0.88, + "grad_norm": 1.286695711263135, + "learning_rate": 7.190860900089824e-07, + "loss": 0.8009, + "step": 5475 + }, + { + "epoch": 0.88, + "grad_norm": 1.6320062465131093, + "learning_rate": 7.171438756044258e-07, + "loss": 0.8289, + "step": 5476 + }, + { + "epoch": 0.88, + "grad_norm": 1.3890640218820804, + "learning_rate": 7.152041901057594e-07, + "loss": 0.845, + "step": 5477 + }, + { + "epoch": 0.88, + "grad_norm": 1.413327070645842, + "learning_rate": 7.132670340414106e-07, + "loss": 0.7139, + "step": 5478 + }, + { + "epoch": 0.88, + "grad_norm": 1.468835601189063, + "learning_rate": 7.113324079391115e-07, + "loss": 0.8212, + "step": 5479 + }, + { + "epoch": 0.88, + "grad_norm": 1.3552067686242206, + "learning_rate": 7.094003123259063e-07, + "loss": 0.7286, + "step": 5480 + }, + { + "epoch": 0.88, + "grad_norm": 0.7745663179327958, + "learning_rate": 7.07470747728155e-07, + "loss": 0.3087, + "step": 5481 + }, + { + "epoch": 0.88, + "grad_norm": 0.8653127395843776, + "learning_rate": 7.055437146715194e-07, + "loss": 0.3353, + "step": 5482 + }, + { + "epoch": 0.88, + "grad_norm": 1.4735506488603232, + "learning_rate": 7.036192136809816e-07, + "loss": 0.8447, + "step": 5483 + }, + { + "epoch": 0.88, + "grad_norm": 1.3002765280600637, + "learning_rate": 7.016972452808246e-07, + "loss": 0.8245, + "step": 5484 + }, + { + "epoch": 0.88, + "grad_norm": 1.5306605803009683, + "learning_rate": 6.997778099946495e-07, + "loss": 0.8325, + "step": 5485 + }, + { + "epoch": 0.88, + "grad_norm": 1.2574716003850852, + "learning_rate": 6.978609083453602e-07, + "loss": 0.7942, + "step": 5486 + }, + { + "epoch": 0.88, + "grad_norm": 1.4004290720451564, + "learning_rate": 6.959465408551724e-07, + "loss": 0.8169, + "step": 5487 + }, + { + "epoch": 0.88, + "grad_norm": 1.7767445728051512, + "learning_rate": 6.940347080456178e-07, + "loss": 0.7818, + "step": 5488 + }, + { + "epoch": 0.88, + "grad_norm": 1.8671196689315561, + "learning_rate": 6.921254104375264e-07, + "loss": 0.7944, + "step": 5489 + }, + { + "epoch": 0.88, + "grad_norm": 0.8901832388452009, + "learning_rate": 6.902186485510476e-07, + "loss": 0.3507, + "step": 5490 + }, + { + "epoch": 0.88, + "grad_norm": 1.555749173658839, + "learning_rate": 6.883144229056349e-07, + "loss": 0.7814, + "step": 5491 + }, + { + "epoch": 0.88, + "grad_norm": 1.5763145320767764, + "learning_rate": 6.864127340200499e-07, + "loss": 0.8514, + "step": 5492 + }, + { + "epoch": 0.89, + "grad_norm": 1.5476265403769007, + "learning_rate": 6.845135824123672e-07, + "loss": 0.7905, + "step": 5493 + }, + { + "epoch": 0.89, + "grad_norm": 1.523068056181864, + "learning_rate": 6.826169685999707e-07, + "loss": 0.8697, + "step": 5494 + }, + { + "epoch": 0.89, + "grad_norm": 1.3408861217060435, + "learning_rate": 6.807228930995436e-07, + "loss": 0.7475, + "step": 5495 + }, + { + "epoch": 0.89, + "grad_norm": 1.6021598010303677, + "learning_rate": 6.788313564270877e-07, + "loss": 0.7784, + "step": 5496 + }, + { + "epoch": 0.89, + "grad_norm": 1.255160562599145, + "learning_rate": 6.769423590979107e-07, + "loss": 0.7566, + "step": 5497 + }, + { + "epoch": 0.89, + "grad_norm": 1.459850173754575, + "learning_rate": 6.750559016266278e-07, + "loss": 0.8862, + "step": 5498 + }, + { + "epoch": 0.89, + "grad_norm": 1.2259314483168189, + "learning_rate": 6.731719845271589e-07, + "loss": 0.8708, + "step": 5499 + }, + { + "epoch": 0.89, + "grad_norm": 0.8215457099087253, + "learning_rate": 6.71290608312739e-07, + "loss": 0.324, + "step": 5500 + }, + { + "epoch": 0.89, + "grad_norm": 1.3397638645351855, + "learning_rate": 6.694117734959038e-07, + "loss": 0.8443, + "step": 5501 + }, + { + "epoch": 0.89, + "grad_norm": 1.7981340288996954, + "learning_rate": 6.675354805885004e-07, + "loss": 0.746, + "step": 5502 + }, + { + "epoch": 0.89, + "grad_norm": 1.2923434415092097, + "learning_rate": 6.656617301016833e-07, + "loss": 0.8123, + "step": 5503 + }, + { + "epoch": 0.89, + "grad_norm": 1.3998882431368602, + "learning_rate": 6.637905225459129e-07, + "loss": 0.8918, + "step": 5504 + }, + { + "epoch": 0.89, + "grad_norm": 1.3408067185781782, + "learning_rate": 6.619218584309595e-07, + "loss": 0.8396, + "step": 5505 + }, + { + "epoch": 0.89, + "grad_norm": 1.5629337984151004, + "learning_rate": 6.600557382658956e-07, + "loss": 0.8606, + "step": 5506 + }, + { + "epoch": 0.89, + "grad_norm": 1.2827171718458665, + "learning_rate": 6.58192162559107e-07, + "loss": 0.7833, + "step": 5507 + }, + { + "epoch": 0.89, + "grad_norm": 1.5926487819510016, + "learning_rate": 6.563311318182819e-07, + "loss": 0.8573, + "step": 5508 + }, + { + "epoch": 0.89, + "grad_norm": 1.3365687306740295, + "learning_rate": 6.544726465504159e-07, + "loss": 0.785, + "step": 5509 + }, + { + "epoch": 0.89, + "grad_norm": 1.4941721694708086, + "learning_rate": 6.526167072618117e-07, + "loss": 0.8926, + "step": 5510 + }, + { + "epoch": 0.89, + "grad_norm": 1.6306568945177617, + "learning_rate": 6.507633144580783e-07, + "loss": 0.8306, + "step": 5511 + }, + { + "epoch": 0.89, + "grad_norm": 1.3309414676196005, + "learning_rate": 6.489124686441328e-07, + "loss": 0.8119, + "step": 5512 + }, + { + "epoch": 0.89, + "grad_norm": 1.350611640950385, + "learning_rate": 6.470641703241942e-07, + "loss": 0.8308, + "step": 5513 + }, + { + "epoch": 0.89, + "grad_norm": 1.5266014982169664, + "learning_rate": 6.452184200017897e-07, + "loss": 0.802, + "step": 5514 + }, + { + "epoch": 0.89, + "grad_norm": 1.3800896660305717, + "learning_rate": 6.433752181797548e-07, + "loss": 0.7722, + "step": 5515 + }, + { + "epoch": 0.89, + "grad_norm": 1.4764110144357536, + "learning_rate": 6.415345653602278e-07, + "loss": 0.9001, + "step": 5516 + }, + { + "epoch": 0.89, + "grad_norm": 1.47773841803913, + "learning_rate": 6.396964620446522e-07, + "loss": 0.8194, + "step": 5517 + }, + { + "epoch": 0.89, + "grad_norm": 1.531522237044744, + "learning_rate": 6.378609087337773e-07, + "loss": 0.8356, + "step": 5518 + }, + { + "epoch": 0.89, + "grad_norm": 1.6942495642861382, + "learning_rate": 6.360279059276619e-07, + "loss": 0.8148, + "step": 5519 + }, + { + "epoch": 0.89, + "grad_norm": 1.5477608446558635, + "learning_rate": 6.341974541256635e-07, + "loss": 0.8666, + "step": 5520 + }, + { + "epoch": 0.89, + "grad_norm": 1.7404343605727923, + "learning_rate": 6.323695538264474e-07, + "loss": 0.7703, + "step": 5521 + }, + { + "epoch": 0.89, + "grad_norm": 1.6828815094293763, + "learning_rate": 6.305442055279864e-07, + "loss": 0.8369, + "step": 5522 + }, + { + "epoch": 0.89, + "grad_norm": 0.8073488456206463, + "learning_rate": 6.287214097275551e-07, + "loss": 0.3474, + "step": 5523 + }, + { + "epoch": 0.89, + "grad_norm": 1.4320248794622075, + "learning_rate": 6.269011669217306e-07, + "loss": 0.7903, + "step": 5524 + }, + { + "epoch": 0.89, + "grad_norm": 1.3042153254278122, + "learning_rate": 6.250834776063996e-07, + "loss": 0.8653, + "step": 5525 + }, + { + "epoch": 0.89, + "grad_norm": 1.51746061230947, + "learning_rate": 6.232683422767516e-07, + "loss": 0.8365, + "step": 5526 + }, + { + "epoch": 0.89, + "grad_norm": 1.524819469245884, + "learning_rate": 6.214557614272787e-07, + "loss": 0.8246, + "step": 5527 + }, + { + "epoch": 0.89, + "grad_norm": 1.524765059771098, + "learning_rate": 6.196457355517749e-07, + "loss": 0.8655, + "step": 5528 + }, + { + "epoch": 0.89, + "grad_norm": 1.4067262253779294, + "learning_rate": 6.178382651433456e-07, + "loss": 0.8146, + "step": 5529 + }, + { + "epoch": 0.89, + "grad_norm": 1.4242190307139813, + "learning_rate": 6.160333506943939e-07, + "loss": 0.8066, + "step": 5530 + }, + { + "epoch": 0.89, + "grad_norm": 1.3184105411066747, + "learning_rate": 6.142309926966273e-07, + "loss": 0.7667, + "step": 5531 + }, + { + "epoch": 0.89, + "grad_norm": 1.2522231047447183, + "learning_rate": 6.124311916410586e-07, + "loss": 0.7682, + "step": 5532 + }, + { + "epoch": 0.89, + "grad_norm": 0.762020493920924, + "learning_rate": 6.106339480180023e-07, + "loss": 0.3376, + "step": 5533 + }, + { + "epoch": 0.89, + "grad_norm": 0.692216606394664, + "learning_rate": 6.088392623170802e-07, + "loss": 0.323, + "step": 5534 + }, + { + "epoch": 0.89, + "grad_norm": 1.4347446094264773, + "learning_rate": 6.070471350272111e-07, + "loss": 0.8295, + "step": 5535 + }, + { + "epoch": 0.89, + "grad_norm": 1.519851797856534, + "learning_rate": 6.052575666366189e-07, + "loss": 0.799, + "step": 5536 + }, + { + "epoch": 0.89, + "grad_norm": 1.606896747478483, + "learning_rate": 6.034705576328348e-07, + "loss": 0.7856, + "step": 5537 + }, + { + "epoch": 0.89, + "grad_norm": 1.3655201144935198, + "learning_rate": 6.016861085026881e-07, + "loss": 0.8737, + "step": 5538 + }, + { + "epoch": 0.89, + "grad_norm": 1.2841255727759426, + "learning_rate": 5.999042197323102e-07, + "loss": 0.7866, + "step": 5539 + }, + { + "epoch": 0.89, + "grad_norm": 1.4415901882124487, + "learning_rate": 5.98124891807137e-07, + "loss": 0.7892, + "step": 5540 + }, + { + "epoch": 0.89, + "grad_norm": 1.5249203336731263, + "learning_rate": 5.963481252119096e-07, + "loss": 0.8783, + "step": 5541 + }, + { + "epoch": 0.89, + "grad_norm": 1.4693523158618806, + "learning_rate": 5.945739204306666e-07, + "loss": 0.8708, + "step": 5542 + }, + { + "epoch": 0.89, + "grad_norm": 1.3794433369062153, + "learning_rate": 5.928022779467468e-07, + "loss": 0.7912, + "step": 5543 + }, + { + "epoch": 0.89, + "grad_norm": 1.4985344943644534, + "learning_rate": 5.910331982428008e-07, + "loss": 0.7766, + "step": 5544 + }, + { + "epoch": 0.89, + "grad_norm": 1.2925112994083339, + "learning_rate": 5.892666818007698e-07, + "loss": 0.814, + "step": 5545 + }, + { + "epoch": 0.89, + "grad_norm": 1.5027866760237576, + "learning_rate": 5.87502729101902e-07, + "loss": 0.8749, + "step": 5546 + }, + { + "epoch": 0.89, + "grad_norm": 1.3905884397715607, + "learning_rate": 5.857413406267476e-07, + "loss": 0.7729, + "step": 5547 + }, + { + "epoch": 0.89, + "grad_norm": 1.7936364542930816, + "learning_rate": 5.839825168551594e-07, + "loss": 0.8335, + "step": 5548 + }, + { + "epoch": 0.89, + "grad_norm": 0.8466180819678006, + "learning_rate": 5.822262582662874e-07, + "loss": 0.3208, + "step": 5549 + }, + { + "epoch": 0.89, + "grad_norm": 1.3251947558970396, + "learning_rate": 5.804725653385846e-07, + "loss": 0.8499, + "step": 5550 + }, + { + "epoch": 0.89, + "grad_norm": 1.5271264846107917, + "learning_rate": 5.787214385498063e-07, + "loss": 0.763, + "step": 5551 + }, + { + "epoch": 0.89, + "grad_norm": 1.152165909153071, + "learning_rate": 5.769728783770068e-07, + "loss": 0.8192, + "step": 5552 + }, + { + "epoch": 0.89, + "grad_norm": 1.6271298290796294, + "learning_rate": 5.752268852965426e-07, + "loss": 0.8475, + "step": 5553 + }, + { + "epoch": 0.89, + "grad_norm": 1.3203849134253383, + "learning_rate": 5.734834597840699e-07, + "loss": 0.797, + "step": 5554 + }, + { + "epoch": 0.9, + "grad_norm": 1.3207920677052762, + "learning_rate": 5.717426023145456e-07, + "loss": 0.8899, + "step": 5555 + }, + { + "epoch": 0.9, + "grad_norm": 1.2730090226383586, + "learning_rate": 5.70004313362229e-07, + "loss": 0.7354, + "step": 5556 + }, + { + "epoch": 0.9, + "grad_norm": 0.8767600359349941, + "learning_rate": 5.682685934006771e-07, + "loss": 0.3161, + "step": 5557 + }, + { + "epoch": 0.9, + "grad_norm": 1.3433818867387002, + "learning_rate": 5.66535442902747e-07, + "loss": 0.8635, + "step": 5558 + }, + { + "epoch": 0.9, + "grad_norm": 1.3512846779696344, + "learning_rate": 5.648048623405977e-07, + "loss": 0.7641, + "step": 5559 + }, + { + "epoch": 0.9, + "grad_norm": 1.6683006821796067, + "learning_rate": 5.630768521856866e-07, + "loss": 0.775, + "step": 5560 + }, + { + "epoch": 0.9, + "grad_norm": 1.4823825426374606, + "learning_rate": 5.613514129087693e-07, + "loss": 0.8246, + "step": 5561 + }, + { + "epoch": 0.9, + "grad_norm": 1.3919054884048734, + "learning_rate": 5.596285449799055e-07, + "loss": 0.8031, + "step": 5562 + }, + { + "epoch": 0.9, + "grad_norm": 1.4806001033738834, + "learning_rate": 5.579082488684529e-07, + "loss": 0.8067, + "step": 5563 + }, + { + "epoch": 0.9, + "grad_norm": 1.3357663379577107, + "learning_rate": 5.561905250430665e-07, + "loss": 0.7998, + "step": 5564 + }, + { + "epoch": 0.9, + "grad_norm": 1.531265730679734, + "learning_rate": 5.54475373971699e-07, + "loss": 0.8269, + "step": 5565 + }, + { + "epoch": 0.9, + "grad_norm": 1.4685833036148803, + "learning_rate": 5.527627961216087e-07, + "loss": 0.7988, + "step": 5566 + }, + { + "epoch": 0.9, + "grad_norm": 1.2159730003415292, + "learning_rate": 5.510527919593478e-07, + "loss": 0.8186, + "step": 5567 + }, + { + "epoch": 0.9, + "grad_norm": 1.2849258380078918, + "learning_rate": 5.493453619507672e-07, + "loss": 0.8817, + "step": 5568 + }, + { + "epoch": 0.9, + "grad_norm": 1.5648903920254402, + "learning_rate": 5.47640506561018e-07, + "loss": 0.7554, + "step": 5569 + }, + { + "epoch": 0.9, + "grad_norm": 1.6900244694896094, + "learning_rate": 5.45938226254552e-07, + "loss": 0.6941, + "step": 5570 + }, + { + "epoch": 0.9, + "grad_norm": 1.5218016559289356, + "learning_rate": 5.442385214951151e-07, + "loss": 0.9481, + "step": 5571 + }, + { + "epoch": 0.9, + "grad_norm": 1.4683341530787384, + "learning_rate": 5.425413927457546e-07, + "loss": 0.8084, + "step": 5572 + }, + { + "epoch": 0.9, + "grad_norm": 1.3661652503510426, + "learning_rate": 5.408468404688161e-07, + "loss": 0.7409, + "step": 5573 + }, + { + "epoch": 0.9, + "grad_norm": 1.5785955846368618, + "learning_rate": 5.391548651259415e-07, + "loss": 0.8118, + "step": 5574 + }, + { + "epoch": 0.9, + "grad_norm": 1.3431355331484052, + "learning_rate": 5.374654671780688e-07, + "loss": 0.7645, + "step": 5575 + }, + { + "epoch": 0.9, + "grad_norm": 0.7792403348809664, + "learning_rate": 5.357786470854421e-07, + "loss": 0.3245, + "step": 5576 + }, + { + "epoch": 0.9, + "grad_norm": 1.4823851097081293, + "learning_rate": 5.340944053075925e-07, + "loss": 0.7747, + "step": 5577 + }, + { + "epoch": 0.9, + "grad_norm": 1.277916916741586, + "learning_rate": 5.324127423033576e-07, + "loss": 0.8175, + "step": 5578 + }, + { + "epoch": 0.9, + "grad_norm": 1.3557016683753234, + "learning_rate": 5.307336585308676e-07, + "loss": 0.76, + "step": 5579 + }, + { + "epoch": 0.9, + "grad_norm": 1.922831357478769, + "learning_rate": 5.290571544475487e-07, + "loss": 0.8002, + "step": 5580 + }, + { + "epoch": 0.9, + "grad_norm": 1.3428219498627414, + "learning_rate": 5.273832305101312e-07, + "loss": 0.7891, + "step": 5581 + }, + { + "epoch": 0.9, + "grad_norm": 1.6923478910405894, + "learning_rate": 5.257118871746347e-07, + "loss": 0.8549, + "step": 5582 + }, + { + "epoch": 0.9, + "grad_norm": 1.9669896886463294, + "learning_rate": 5.240431248963807e-07, + "loss": 0.7725, + "step": 5583 + }, + { + "epoch": 0.9, + "grad_norm": 1.4131559718399938, + "learning_rate": 5.223769441299842e-07, + "loss": 0.8149, + "step": 5584 + }, + { + "epoch": 0.9, + "grad_norm": 1.3580317598835006, + "learning_rate": 5.207133453293633e-07, + "loss": 0.869, + "step": 5585 + }, + { + "epoch": 0.9, + "grad_norm": 1.5810420275553005, + "learning_rate": 5.190523289477234e-07, + "loss": 0.8476, + "step": 5586 + }, + { + "epoch": 0.9, + "grad_norm": 1.5946614158111023, + "learning_rate": 5.173938954375734e-07, + "loss": 0.7343, + "step": 5587 + }, + { + "epoch": 0.9, + "grad_norm": 1.4162044017842432, + "learning_rate": 5.157380452507166e-07, + "loss": 0.7515, + "step": 5588 + }, + { + "epoch": 0.9, + "grad_norm": 1.4924453436435994, + "learning_rate": 5.140847788382508e-07, + "loss": 0.7205, + "step": 5589 + }, + { + "epoch": 0.9, + "grad_norm": 1.1336140246980457, + "learning_rate": 5.124340966505715e-07, + "loss": 0.8262, + "step": 5590 + }, + { + "epoch": 0.9, + "grad_norm": 1.2883493032750521, + "learning_rate": 5.107859991373698e-07, + "loss": 0.7726, + "step": 5591 + }, + { + "epoch": 0.9, + "grad_norm": 1.4137513986868562, + "learning_rate": 5.091404867476368e-07, + "loss": 0.8049, + "step": 5592 + }, + { + "epoch": 0.9, + "grad_norm": 1.4944600792253941, + "learning_rate": 5.074975599296494e-07, + "loss": 0.8724, + "step": 5593 + }, + { + "epoch": 0.9, + "grad_norm": 1.5726363959198584, + "learning_rate": 5.058572191309896e-07, + "loss": 0.7507, + "step": 5594 + }, + { + "epoch": 0.9, + "grad_norm": 1.3018099444596989, + "learning_rate": 5.042194647985311e-07, + "loss": 0.8027, + "step": 5595 + }, + { + "epoch": 0.9, + "grad_norm": 0.8361215656168353, + "learning_rate": 5.025842973784445e-07, + "loss": 0.3687, + "step": 5596 + }, + { + "epoch": 0.9, + "grad_norm": 1.4082944360247134, + "learning_rate": 5.009517173161904e-07, + "loss": 0.7881, + "step": 5597 + }, + { + "epoch": 0.9, + "grad_norm": 1.4449359525720271, + "learning_rate": 4.993217250565341e-07, + "loss": 0.8128, + "step": 5598 + }, + { + "epoch": 0.9, + "grad_norm": 1.320181308243942, + "learning_rate": 4.976943210435247e-07, + "loss": 0.8845, + "step": 5599 + }, + { + "epoch": 0.9, + "grad_norm": 1.2916138701326245, + "learning_rate": 4.960695057205178e-07, + "loss": 0.8052, + "step": 5600 + }, + { + "epoch": 0.9, + "grad_norm": 1.6095151545087636, + "learning_rate": 4.944472795301535e-07, + "loss": 0.8178, + "step": 5601 + }, + { + "epoch": 0.9, + "grad_norm": 1.2586029315015101, + "learning_rate": 4.928276429143719e-07, + "loss": 0.8256, + "step": 5602 + }, + { + "epoch": 0.9, + "grad_norm": 1.1629534324801705, + "learning_rate": 4.912105963144076e-07, + "loss": 0.7967, + "step": 5603 + }, + { + "epoch": 0.9, + "grad_norm": 1.5487660399184402, + "learning_rate": 4.895961401707882e-07, + "loss": 0.9005, + "step": 5604 + }, + { + "epoch": 0.9, + "grad_norm": 1.3227978470588002, + "learning_rate": 4.879842749233366e-07, + "loss": 0.8163, + "step": 5605 + }, + { + "epoch": 0.9, + "grad_norm": 1.5109991700832777, + "learning_rate": 4.863750010111667e-07, + "loss": 0.8554, + "step": 5606 + }, + { + "epoch": 0.9, + "grad_norm": 1.5408918321344962, + "learning_rate": 4.847683188726938e-07, + "loss": 0.834, + "step": 5607 + }, + { + "epoch": 0.9, + "grad_norm": 1.4740615357595384, + "learning_rate": 4.831642289456184e-07, + "loss": 0.776, + "step": 5608 + }, + { + "epoch": 0.9, + "grad_norm": 1.6157480395234871, + "learning_rate": 4.815627316669403e-07, + "loss": 0.8456, + "step": 5609 + }, + { + "epoch": 0.9, + "grad_norm": 1.5792749058518485, + "learning_rate": 4.799638274729513e-07, + "loss": 0.7485, + "step": 5610 + }, + { + "epoch": 0.9, + "grad_norm": 1.3733168227419978, + "learning_rate": 4.783675167992385e-07, + "loss": 0.8078, + "step": 5611 + }, + { + "epoch": 0.9, + "grad_norm": 1.4780729785826854, + "learning_rate": 4.767738000806765e-07, + "loss": 0.7943, + "step": 5612 + }, + { + "epoch": 0.9, + "grad_norm": 1.3982431080838194, + "learning_rate": 4.7518267775144233e-07, + "loss": 0.8297, + "step": 5613 + }, + { + "epoch": 0.9, + "grad_norm": 1.502008260997234, + "learning_rate": 4.7359415024500143e-07, + "loss": 0.7837, + "step": 5614 + }, + { + "epoch": 0.9, + "grad_norm": 1.4574214972621047, + "learning_rate": 4.7200821799410767e-07, + "loss": 0.7926, + "step": 5615 + }, + { + "epoch": 0.9, + "grad_norm": 1.379308878171987, + "learning_rate": 4.7042488143081766e-07, + "loss": 0.8188, + "step": 5616 + }, + { + "epoch": 0.91, + "grad_norm": 1.4145438747481005, + "learning_rate": 4.6884414098647415e-07, + "loss": 0.8721, + "step": 5617 + }, + { + "epoch": 0.91, + "grad_norm": 1.613216605519247, + "learning_rate": 4.6726599709171483e-07, + "loss": 0.8165, + "step": 5618 + }, + { + "epoch": 0.91, + "grad_norm": 1.4773727639827547, + "learning_rate": 4.6569045017646807e-07, + "loss": 0.7462, + "step": 5619 + }, + { + "epoch": 0.91, + "grad_norm": 1.4956098918245366, + "learning_rate": 4.641175006699594e-07, + "loss": 0.8706, + "step": 5620 + }, + { + "epoch": 0.91, + "grad_norm": 1.5918035167719349, + "learning_rate": 4.625471490007005e-07, + "loss": 0.7302, + "step": 5621 + }, + { + "epoch": 0.91, + "grad_norm": 1.4508785616254272, + "learning_rate": 4.609793955964992e-07, + "loss": 0.7664, + "step": 5622 + }, + { + "epoch": 0.91, + "grad_norm": 0.8383526417479003, + "learning_rate": 4.5941424088445485e-07, + "loss": 0.3414, + "step": 5623 + }, + { + "epoch": 0.91, + "grad_norm": 1.479634468237048, + "learning_rate": 4.578516852909609e-07, + "loss": 0.8519, + "step": 5624 + }, + { + "epoch": 0.91, + "grad_norm": 1.2563251978160062, + "learning_rate": 4.5629172924169793e-07, + "loss": 0.7963, + "step": 5625 + }, + { + "epoch": 0.91, + "grad_norm": 0.8709697894454793, + "learning_rate": 4.547343731616405e-07, + "loss": 0.3568, + "step": 5626 + }, + { + "epoch": 0.91, + "grad_norm": 1.6784947130135892, + "learning_rate": 4.5317961747505803e-07, + "loss": 0.7842, + "step": 5627 + }, + { + "epoch": 0.91, + "grad_norm": 1.3702008832897716, + "learning_rate": 4.5162746260550614e-07, + "loss": 0.8407, + "step": 5628 + }, + { + "epoch": 0.91, + "grad_norm": 1.4197127045172724, + "learning_rate": 4.500779089758378e-07, + "loss": 0.7327, + "step": 5629 + }, + { + "epoch": 0.91, + "grad_norm": 1.6621593334476215, + "learning_rate": 4.4853095700819196e-07, + "loss": 0.7664, + "step": 5630 + }, + { + "epoch": 0.91, + "grad_norm": 1.4717464798905782, + "learning_rate": 4.469866071239992e-07, + "loss": 0.8429, + "step": 5631 + }, + { + "epoch": 0.91, + "grad_norm": 1.228909530926282, + "learning_rate": 4.4544485974398757e-07, + "loss": 0.7894, + "step": 5632 + }, + { + "epoch": 0.91, + "grad_norm": 1.3941502144040971, + "learning_rate": 4.439057152881676e-07, + "loss": 0.8283, + "step": 5633 + }, + { + "epoch": 0.91, + "grad_norm": 1.4131273759198115, + "learning_rate": 4.4236917417584513e-07, + "loss": 0.8454, + "step": 5634 + }, + { + "epoch": 0.91, + "grad_norm": 1.2921710045579249, + "learning_rate": 4.4083523682561747e-07, + "loss": 0.7612, + "step": 5635 + }, + { + "epoch": 0.91, + "grad_norm": 0.9042498986254954, + "learning_rate": 4.393039036553748e-07, + "loss": 0.3424, + "step": 5636 + }, + { + "epoch": 0.91, + "grad_norm": 1.425707116224639, + "learning_rate": 4.377751750822867e-07, + "loss": 0.7579, + "step": 5637 + }, + { + "epoch": 0.91, + "grad_norm": 1.404853764762308, + "learning_rate": 4.362490515228257e-07, + "loss": 0.7832, + "step": 5638 + }, + { + "epoch": 0.91, + "grad_norm": 1.3824882861362426, + "learning_rate": 4.3472553339275136e-07, + "loss": 0.8301, + "step": 5639 + }, + { + "epoch": 0.91, + "grad_norm": 1.403492763650546, + "learning_rate": 4.3320462110710946e-07, + "loss": 0.7324, + "step": 5640 + }, + { + "epoch": 0.91, + "grad_norm": 1.6395437605924126, + "learning_rate": 4.316863150802375e-07, + "loss": 0.8387, + "step": 5641 + }, + { + "epoch": 0.91, + "grad_norm": 2.0039441902629087, + "learning_rate": 4.301706157257657e-07, + "loss": 0.7473, + "step": 5642 + }, + { + "epoch": 0.91, + "grad_norm": 1.425808234808583, + "learning_rate": 4.286575234566148e-07, + "loss": 0.861, + "step": 5643 + }, + { + "epoch": 0.91, + "grad_norm": 1.5896814372415942, + "learning_rate": 4.271470386849874e-07, + "loss": 0.7355, + "step": 5644 + }, + { + "epoch": 0.91, + "grad_norm": 1.1431877612810804, + "learning_rate": 4.256391618223843e-07, + "loss": 0.7865, + "step": 5645 + }, + { + "epoch": 0.91, + "grad_norm": 1.4845729222228934, + "learning_rate": 4.241338932795935e-07, + "loss": 0.8244, + "step": 5646 + }, + { + "epoch": 0.91, + "grad_norm": 1.2599954262434736, + "learning_rate": 4.226312334666904e-07, + "loss": 0.7877, + "step": 5647 + }, + { + "epoch": 0.91, + "grad_norm": 1.9118153649973397, + "learning_rate": 4.211311827930398e-07, + "loss": 0.8042, + "step": 5648 + }, + { + "epoch": 0.91, + "grad_norm": 1.441999835914149, + "learning_rate": 4.196337416672991e-07, + "loss": 0.7767, + "step": 5649 + }, + { + "epoch": 0.91, + "grad_norm": 1.3878284849497384, + "learning_rate": 4.1813891049740986e-07, + "loss": 0.8145, + "step": 5650 + }, + { + "epoch": 0.91, + "grad_norm": 1.3252025954033044, + "learning_rate": 4.166466896906085e-07, + "loss": 0.8314, + "step": 5651 + }, + { + "epoch": 0.91, + "grad_norm": 1.5864556137454788, + "learning_rate": 4.1515707965341547e-07, + "loss": 0.87, + "step": 5652 + }, + { + "epoch": 0.91, + "grad_norm": 1.2391506371172691, + "learning_rate": 4.136700807916405e-07, + "loss": 0.833, + "step": 5653 + }, + { + "epoch": 0.91, + "grad_norm": 0.8091769775687769, + "learning_rate": 4.121856935103863e-07, + "loss": 0.3439, + "step": 5654 + }, + { + "epoch": 0.91, + "grad_norm": 1.2063803846918737, + "learning_rate": 4.107039182140382e-07, + "loss": 0.7326, + "step": 5655 + }, + { + "epoch": 0.91, + "grad_norm": 1.3166823348787937, + "learning_rate": 4.0922475530627224e-07, + "loss": 0.766, + "step": 5656 + }, + { + "epoch": 0.91, + "grad_norm": 1.2419849461528696, + "learning_rate": 4.0774820519005385e-07, + "loss": 0.8341, + "step": 5657 + }, + { + "epoch": 0.91, + "grad_norm": 0.7446180424132262, + "learning_rate": 4.0627426826763903e-07, + "loss": 0.3214, + "step": 5658 + }, + { + "epoch": 0.91, + "grad_norm": 1.2046703003269057, + "learning_rate": 4.048029449405633e-07, + "loss": 0.7415, + "step": 5659 + }, + { + "epoch": 0.91, + "grad_norm": 1.366660248043041, + "learning_rate": 4.033342356096592e-07, + "loss": 0.8327, + "step": 5660 + }, + { + "epoch": 0.91, + "grad_norm": 1.5712890435332019, + "learning_rate": 4.0186814067504356e-07, + "loss": 0.7994, + "step": 5661 + }, + { + "epoch": 0.91, + "grad_norm": 1.5218286706238888, + "learning_rate": 4.0040466053612006e-07, + "loss": 0.749, + "step": 5662 + }, + { + "epoch": 0.91, + "grad_norm": 1.4804011633953513, + "learning_rate": 3.9894379559158094e-07, + "loss": 0.8082, + "step": 5663 + }, + { + "epoch": 0.91, + "grad_norm": 1.3076894909652519, + "learning_rate": 3.974855462394067e-07, + "loss": 0.7848, + "step": 5664 + }, + { + "epoch": 0.91, + "grad_norm": 1.3696682382059755, + "learning_rate": 3.960299128768663e-07, + "loss": 0.8904, + "step": 5665 + }, + { + "epoch": 0.91, + "grad_norm": 1.4192875923835468, + "learning_rate": 3.945768959005114e-07, + "loss": 0.7434, + "step": 5666 + }, + { + "epoch": 0.91, + "grad_norm": 1.375736066649245, + "learning_rate": 3.931264957061842e-07, + "loss": 0.7329, + "step": 5667 + }, + { + "epoch": 0.91, + "grad_norm": 1.1753555300218785, + "learning_rate": 3.9167871268901536e-07, + "loss": 0.8189, + "step": 5668 + }, + { + "epoch": 0.91, + "grad_norm": 1.4362959198587142, + "learning_rate": 3.9023354724342157e-07, + "loss": 0.8188, + "step": 5669 + }, + { + "epoch": 0.91, + "grad_norm": 1.2652781064094962, + "learning_rate": 3.887909997631012e-07, + "loss": 0.8346, + "step": 5670 + }, + { + "epoch": 0.91, + "grad_norm": 1.4904698020936544, + "learning_rate": 3.8735107064104994e-07, + "loss": 0.8342, + "step": 5671 + }, + { + "epoch": 0.91, + "grad_norm": 1.2775343350310018, + "learning_rate": 3.8591376026954063e-07, + "loss": 0.7385, + "step": 5672 + }, + { + "epoch": 0.91, + "grad_norm": 1.37766653477396, + "learning_rate": 3.844790690401357e-07, + "loss": 0.8394, + "step": 5673 + }, + { + "epoch": 0.91, + "grad_norm": 1.337868472424067, + "learning_rate": 3.830469973436868e-07, + "loss": 0.8119, + "step": 5674 + }, + { + "epoch": 0.91, + "grad_norm": 1.4295056858043067, + "learning_rate": 3.816175455703264e-07, + "loss": 0.8056, + "step": 5675 + }, + { + "epoch": 0.91, + "grad_norm": 1.5694926657806871, + "learning_rate": 3.8019071410948183e-07, + "loss": 0.8523, + "step": 5676 + }, + { + "epoch": 0.91, + "grad_norm": 1.2646407491678548, + "learning_rate": 3.7876650334985776e-07, + "loss": 0.8516, + "step": 5677 + }, + { + "epoch": 0.91, + "grad_norm": 1.3175515871969035, + "learning_rate": 3.7734491367944716e-07, + "loss": 0.774, + "step": 5678 + }, + { + "epoch": 0.92, + "grad_norm": 1.4966343084915807, + "learning_rate": 3.7592594548553354e-07, + "loss": 0.8444, + "step": 5679 + }, + { + "epoch": 0.92, + "grad_norm": 1.5380559779080085, + "learning_rate": 3.745095991546821e-07, + "loss": 0.7379, + "step": 5680 + }, + { + "epoch": 0.92, + "grad_norm": 1.5638468957229843, + "learning_rate": 3.7309587507274313e-07, + "loss": 0.874, + "step": 5681 + }, + { + "epoch": 0.92, + "grad_norm": 1.5280627926449353, + "learning_rate": 3.716847736248541e-07, + "loss": 0.833, + "step": 5682 + }, + { + "epoch": 0.92, + "grad_norm": 1.597118067858388, + "learning_rate": 3.7027629519544085e-07, + "loss": 0.8244, + "step": 5683 + }, + { + "epoch": 0.92, + "grad_norm": 1.2254150894660858, + "learning_rate": 3.688704401682086e-07, + "loss": 0.7363, + "step": 5684 + }, + { + "epoch": 0.92, + "grad_norm": 1.7515129771875544, + "learning_rate": 3.6746720892615216e-07, + "loss": 0.7591, + "step": 5685 + }, + { + "epoch": 0.92, + "grad_norm": 1.290912804426233, + "learning_rate": 3.6606660185154906e-07, + "loss": 0.7509, + "step": 5686 + }, + { + "epoch": 0.92, + "grad_norm": 1.4624522912311337, + "learning_rate": 3.646686193259674e-07, + "loss": 0.8895, + "step": 5687 + }, + { + "epoch": 0.92, + "grad_norm": 1.7426132218709403, + "learning_rate": 3.632732617302515e-07, + "loss": 0.7744, + "step": 5688 + }, + { + "epoch": 0.92, + "grad_norm": 1.2918204617286333, + "learning_rate": 3.61880529444536e-07, + "loss": 0.8737, + "step": 5689 + }, + { + "epoch": 0.92, + "grad_norm": 1.3938178560445185, + "learning_rate": 3.604904228482431e-07, + "loss": 0.8338, + "step": 5690 + }, + { + "epoch": 0.92, + "grad_norm": 0.7747638442811526, + "learning_rate": 3.5910294232007206e-07, + "loss": 0.318, + "step": 5691 + }, + { + "epoch": 0.92, + "grad_norm": 1.4327858108584866, + "learning_rate": 3.5771808823801266e-07, + "loss": 0.7994, + "step": 5692 + }, + { + "epoch": 0.92, + "grad_norm": 1.317227857114016, + "learning_rate": 3.563358609793377e-07, + "loss": 0.863, + "step": 5693 + }, + { + "epoch": 0.92, + "grad_norm": 1.20140643349516, + "learning_rate": 3.5495626092060367e-07, + "loss": 0.8001, + "step": 5694 + }, + { + "epoch": 0.92, + "grad_norm": 1.2908751920652093, + "learning_rate": 3.535792884376499e-07, + "loss": 0.7814, + "step": 5695 + }, + { + "epoch": 0.92, + "grad_norm": 1.3672627462388711, + "learning_rate": 3.5220494390560414e-07, + "loss": 0.824, + "step": 5696 + }, + { + "epoch": 0.92, + "grad_norm": 1.3559451317325133, + "learning_rate": 3.508332276988735e-07, + "loss": 0.7527, + "step": 5697 + }, + { + "epoch": 0.92, + "grad_norm": 1.1095629750254083, + "learning_rate": 3.494641401911536e-07, + "loss": 0.8565, + "step": 5698 + }, + { + "epoch": 0.92, + "grad_norm": 1.4507735682537302, + "learning_rate": 3.4809768175542046e-07, + "loss": 0.7472, + "step": 5699 + }, + { + "epoch": 0.92, + "grad_norm": 1.3294757204953986, + "learning_rate": 3.467338527639341e-07, + "loss": 0.8088, + "step": 5700 + }, + { + "epoch": 0.92, + "grad_norm": 1.2775825720526357, + "learning_rate": 3.453726535882418e-07, + "loss": 0.7575, + "step": 5701 + }, + { + "epoch": 0.92, + "grad_norm": 1.3716570975219533, + "learning_rate": 3.44014084599168e-07, + "loss": 0.7697, + "step": 5702 + }, + { + "epoch": 0.92, + "grad_norm": 1.1948988808928853, + "learning_rate": 3.4265814616682766e-07, + "loss": 0.8386, + "step": 5703 + }, + { + "epoch": 0.92, + "grad_norm": 1.3841450582602188, + "learning_rate": 3.4130483866061327e-07, + "loss": 0.8273, + "step": 5704 + }, + { + "epoch": 0.92, + "grad_norm": 1.4319063684211157, + "learning_rate": 3.3995416244920643e-07, + "loss": 0.8186, + "step": 5705 + }, + { + "epoch": 0.92, + "grad_norm": 1.2233245701878226, + "learning_rate": 3.386061179005651e-07, + "loss": 0.8159, + "step": 5706 + }, + { + "epoch": 0.92, + "grad_norm": 1.3536741943419865, + "learning_rate": 3.372607053819355e-07, + "loss": 0.7854, + "step": 5707 + }, + { + "epoch": 0.92, + "grad_norm": 1.4069484512647155, + "learning_rate": 3.3591792525984324e-07, + "loss": 0.7626, + "step": 5708 + }, + { + "epoch": 0.92, + "grad_norm": 1.4740732576304418, + "learning_rate": 3.345777779001036e-07, + "loss": 0.8711, + "step": 5709 + }, + { + "epoch": 0.92, + "grad_norm": 1.3409633886595875, + "learning_rate": 3.3324026366780224e-07, + "loss": 0.8287, + "step": 5710 + }, + { + "epoch": 0.92, + "grad_norm": 1.609613561772463, + "learning_rate": 3.3190538292732e-07, + "loss": 0.8199, + "step": 5711 + }, + { + "epoch": 0.92, + "grad_norm": 1.4331344627553741, + "learning_rate": 3.305731360423159e-07, + "loss": 0.8844, + "step": 5712 + }, + { + "epoch": 0.92, + "grad_norm": 1.5242148885673126, + "learning_rate": 3.2924352337572743e-07, + "loss": 0.8308, + "step": 5713 + }, + { + "epoch": 0.92, + "grad_norm": 1.2901727381889982, + "learning_rate": 3.2791654528977924e-07, + "loss": 0.7207, + "step": 5714 + }, + { + "epoch": 0.92, + "grad_norm": 1.563864787816798, + "learning_rate": 3.2659220214597666e-07, + "loss": 0.7675, + "step": 5715 + }, + { + "epoch": 0.92, + "grad_norm": 1.4557242411892386, + "learning_rate": 3.2527049430510883e-07, + "loss": 0.7986, + "step": 5716 + }, + { + "epoch": 0.92, + "grad_norm": 1.4548453199159757, + "learning_rate": 3.239514221272411e-07, + "loss": 0.8285, + "step": 5717 + }, + { + "epoch": 0.92, + "grad_norm": 1.3284064120991832, + "learning_rate": 3.226349859717293e-07, + "loss": 0.7902, + "step": 5718 + }, + { + "epoch": 0.92, + "grad_norm": 1.3425087366161716, + "learning_rate": 3.2132118619720545e-07, + "loss": 0.8894, + "step": 5719 + }, + { + "epoch": 0.92, + "grad_norm": 1.3559409941788485, + "learning_rate": 3.2001002316158434e-07, + "loss": 0.8341, + "step": 5720 + }, + { + "epoch": 0.92, + "grad_norm": 1.2115435356620574, + "learning_rate": 3.1870149722206366e-07, + "loss": 0.768, + "step": 5721 + }, + { + "epoch": 0.92, + "grad_norm": 1.649036182371031, + "learning_rate": 3.1739560873512155e-07, + "loss": 0.7992, + "step": 5722 + }, + { + "epoch": 0.92, + "grad_norm": 1.559163989533164, + "learning_rate": 3.1609235805651896e-07, + "loss": 0.7702, + "step": 5723 + }, + { + "epoch": 0.92, + "grad_norm": 0.7531969286376207, + "learning_rate": 3.147917455412952e-07, + "loss": 0.3293, + "step": 5724 + }, + { + "epoch": 0.92, + "grad_norm": 1.4354961593907734, + "learning_rate": 3.134937715437758e-07, + "loss": 0.7518, + "step": 5725 + }, + { + "epoch": 0.92, + "grad_norm": 1.432405222740573, + "learning_rate": 3.121984364175612e-07, + "loss": 0.8483, + "step": 5726 + }, + { + "epoch": 0.92, + "grad_norm": 1.4661140452564443, + "learning_rate": 3.109057405155402e-07, + "loss": 0.851, + "step": 5727 + }, + { + "epoch": 0.92, + "grad_norm": 1.4970222286258723, + "learning_rate": 3.0961568418987673e-07, + "loss": 0.7869, + "step": 5728 + }, + { + "epoch": 0.92, + "grad_norm": 1.3134968327089118, + "learning_rate": 3.0832826779201633e-07, + "loss": 0.7834, + "step": 5729 + }, + { + "epoch": 0.92, + "grad_norm": 1.565020378607771, + "learning_rate": 3.070434916726905e-07, + "loss": 0.8209, + "step": 5730 + }, + { + "epoch": 0.92, + "grad_norm": 1.25055365073191, + "learning_rate": 3.0576135618190393e-07, + "loss": 0.809, + "step": 5731 + }, + { + "epoch": 0.92, + "grad_norm": 1.4163288000333403, + "learning_rate": 3.044818616689471e-07, + "loss": 0.8356, + "step": 5732 + }, + { + "epoch": 0.92, + "grad_norm": 1.6504098987018232, + "learning_rate": 3.032050084823901e-07, + "loss": 0.7658, + "step": 5733 + }, + { + "epoch": 0.92, + "grad_norm": 1.5054144548375827, + "learning_rate": 3.019307969700824e-07, + "loss": 0.7849, + "step": 5734 + }, + { + "epoch": 0.92, + "grad_norm": 1.257743883926915, + "learning_rate": 3.006592274791553e-07, + "loss": 0.7839, + "step": 5735 + }, + { + "epoch": 0.92, + "grad_norm": 0.7418143965875021, + "learning_rate": 2.993903003560172e-07, + "loss": 0.347, + "step": 5736 + }, + { + "epoch": 0.92, + "grad_norm": 1.4109724112032695, + "learning_rate": 2.981240159463616e-07, + "loss": 0.7923, + "step": 5737 + }, + { + "epoch": 0.92, + "grad_norm": 1.7100957145276017, + "learning_rate": 2.9686037459515707e-07, + "loss": 0.8123, + "step": 5738 + }, + { + "epoch": 0.92, + "grad_norm": 1.265140717476901, + "learning_rate": 2.9559937664665474e-07, + "loss": 0.8439, + "step": 5739 + }, + { + "epoch": 0.92, + "grad_norm": 1.4061792739990566, + "learning_rate": 2.9434102244438544e-07, + "loss": 0.7573, + "step": 5740 + }, + { + "epoch": 0.92, + "grad_norm": 1.4601666232251374, + "learning_rate": 2.9308531233115947e-07, + "loss": 0.842, + "step": 5741 + }, + { + "epoch": 0.93, + "grad_norm": 1.4607137620926964, + "learning_rate": 2.918322466490686e-07, + "loss": 0.7802, + "step": 5742 + }, + { + "epoch": 0.93, + "grad_norm": 1.240896137033796, + "learning_rate": 2.905818257394799e-07, + "loss": 0.8078, + "step": 5743 + }, + { + "epoch": 0.93, + "grad_norm": 1.5087117270985708, + "learning_rate": 2.8933404994304417e-07, + "loss": 0.8278, + "step": 5744 + }, + { + "epoch": 0.93, + "grad_norm": 1.1564547870488182, + "learning_rate": 2.8808891959968946e-07, + "loss": 0.7826, + "step": 5745 + }, + { + "epoch": 0.93, + "grad_norm": 1.5820855943741345, + "learning_rate": 2.868464350486222e-07, + "loss": 0.7924, + "step": 5746 + }, + { + "epoch": 0.93, + "grad_norm": 1.2096314649385618, + "learning_rate": 2.856065966283317e-07, + "loss": 0.8096, + "step": 5747 + }, + { + "epoch": 0.93, + "grad_norm": 1.4196107410686543, + "learning_rate": 2.8436940467658213e-07, + "loss": 0.8378, + "step": 5748 + }, + { + "epoch": 0.93, + "grad_norm": 1.5321116200715066, + "learning_rate": 2.831348595304206e-07, + "loss": 0.8525, + "step": 5749 + }, + { + "epoch": 0.93, + "grad_norm": 1.2809909829144148, + "learning_rate": 2.8190296152617035e-07, + "loss": 0.8575, + "step": 5750 + }, + { + "epoch": 0.93, + "grad_norm": 1.3459590328040532, + "learning_rate": 2.8067371099943286e-07, + "loss": 0.7978, + "step": 5751 + }, + { + "epoch": 0.93, + "grad_norm": 1.4054637538827472, + "learning_rate": 2.794471082850936e-07, + "loss": 0.8092, + "step": 5752 + }, + { + "epoch": 0.93, + "grad_norm": 1.5590754149523856, + "learning_rate": 2.7822315371730965e-07, + "loss": 0.8959, + "step": 5753 + }, + { + "epoch": 0.93, + "grad_norm": 1.6985122247706366, + "learning_rate": 2.7700184762952e-07, + "loss": 0.8294, + "step": 5754 + }, + { + "epoch": 0.93, + "grad_norm": 1.3573197367546328, + "learning_rate": 2.7578319035444277e-07, + "loss": 0.8212, + "step": 5755 + }, + { + "epoch": 0.93, + "grad_norm": 1.392973744666387, + "learning_rate": 2.7456718222407584e-07, + "loss": 0.7665, + "step": 5756 + }, + { + "epoch": 0.93, + "grad_norm": 1.6180717502938349, + "learning_rate": 2.7335382356969196e-07, + "loss": 0.7704, + "step": 5757 + }, + { + "epoch": 0.93, + "grad_norm": 1.3204315868258285, + "learning_rate": 2.721431147218412e-07, + "loss": 0.7897, + "step": 5758 + }, + { + "epoch": 0.93, + "grad_norm": 1.371477493153698, + "learning_rate": 2.709350560103574e-07, + "loss": 0.7673, + "step": 5759 + }, + { + "epoch": 0.93, + "grad_norm": 1.3514601663204908, + "learning_rate": 2.697296477643474e-07, + "loss": 0.8255, + "step": 5760 + }, + { + "epoch": 0.93, + "grad_norm": 0.8155569462638977, + "learning_rate": 2.6852689031219626e-07, + "loss": 0.3485, + "step": 5761 + }, + { + "epoch": 0.93, + "grad_norm": 0.9354440457369974, + "learning_rate": 2.6732678398157077e-07, + "loss": 0.3088, + "step": 5762 + }, + { + "epoch": 0.93, + "grad_norm": 1.4171039194161557, + "learning_rate": 2.6612932909941267e-07, + "loss": 0.8598, + "step": 5763 + }, + { + "epoch": 0.93, + "grad_norm": 1.4026105515365053, + "learning_rate": 2.6493452599194115e-07, + "loss": 0.8516, + "step": 5764 + }, + { + "epoch": 0.93, + "grad_norm": 1.3673707866425937, + "learning_rate": 2.637423749846524e-07, + "loss": 0.8645, + "step": 5765 + }, + { + "epoch": 0.93, + "grad_norm": 1.3910387982626506, + "learning_rate": 2.625528764023222e-07, + "loss": 0.8392, + "step": 5766 + }, + { + "epoch": 0.93, + "grad_norm": 1.6040481956092347, + "learning_rate": 2.6136603056900356e-07, + "loss": 0.8448, + "step": 5767 + }, + { + "epoch": 0.93, + "grad_norm": 1.3336763333481905, + "learning_rate": 2.601818378080245e-07, + "loss": 0.7069, + "step": 5768 + }, + { + "epoch": 0.93, + "grad_norm": 1.4230752636876016, + "learning_rate": 2.590002984419937e-07, + "loss": 0.8964, + "step": 5769 + }, + { + "epoch": 0.93, + "grad_norm": 1.5616355304672134, + "learning_rate": 2.578214127927925e-07, + "loss": 0.7656, + "step": 5770 + }, + { + "epoch": 0.93, + "grad_norm": 1.2842804742287206, + "learning_rate": 2.56645181181584e-07, + "loss": 0.7792, + "step": 5771 + }, + { + "epoch": 0.93, + "grad_norm": 1.2785013185814305, + "learning_rate": 2.5547160392880523e-07, + "loss": 0.8141, + "step": 5772 + }, + { + "epoch": 0.93, + "grad_norm": 1.3467578498724593, + "learning_rate": 2.543006813541704e-07, + "loss": 0.804, + "step": 5773 + }, + { + "epoch": 0.93, + "grad_norm": 1.5006719055435538, + "learning_rate": 2.531324137766722e-07, + "loss": 0.782, + "step": 5774 + }, + { + "epoch": 0.93, + "grad_norm": 0.7416609759890418, + "learning_rate": 2.5196680151457933e-07, + "loss": 0.3341, + "step": 5775 + }, + { + "epoch": 0.93, + "grad_norm": 1.4277388106876046, + "learning_rate": 2.508038448854344e-07, + "loss": 0.7196, + "step": 5776 + }, + { + "epoch": 0.93, + "grad_norm": 1.3245600243737297, + "learning_rate": 2.496435442060607e-07, + "loss": 0.8227, + "step": 5777 + }, + { + "epoch": 0.93, + "grad_norm": 1.6937775650983733, + "learning_rate": 2.484858997925566e-07, + "loss": 0.884, + "step": 5778 + }, + { + "epoch": 0.93, + "grad_norm": 1.5324028976144402, + "learning_rate": 2.47330911960294e-07, + "loss": 0.8383, + "step": 5779 + }, + { + "epoch": 0.93, + "grad_norm": 1.499900944985622, + "learning_rate": 2.461785810239259e-07, + "loss": 0.7424, + "step": 5780 + }, + { + "epoch": 0.93, + "grad_norm": 1.4783272415343476, + "learning_rate": 2.4502890729737773e-07, + "loss": 0.8746, + "step": 5781 + }, + { + "epoch": 0.93, + "grad_norm": 0.8030600954335674, + "learning_rate": 2.4388189109385227e-07, + "loss": 0.3103, + "step": 5782 + }, + { + "epoch": 0.93, + "grad_norm": 1.4651735214333335, + "learning_rate": 2.427375327258286e-07, + "loss": 0.7848, + "step": 5783 + }, + { + "epoch": 0.93, + "grad_norm": 1.424424955555753, + "learning_rate": 2.4159583250506157e-07, + "loss": 0.813, + "step": 5784 + }, + { + "epoch": 0.93, + "grad_norm": 1.2942199156536125, + "learning_rate": 2.4045679074258253e-07, + "loss": 0.7786, + "step": 5785 + }, + { + "epoch": 0.93, + "grad_norm": 1.2761286488554746, + "learning_rate": 2.393204077486966e-07, + "loss": 0.8546, + "step": 5786 + }, + { + "epoch": 0.93, + "grad_norm": 1.3751141577982744, + "learning_rate": 2.3818668383298605e-07, + "loss": 0.7699, + "step": 5787 + }, + { + "epoch": 0.93, + "grad_norm": 1.2053829577308015, + "learning_rate": 2.3705561930430942e-07, + "loss": 0.7559, + "step": 5788 + }, + { + "epoch": 0.93, + "grad_norm": 1.4305610437375362, + "learning_rate": 2.3592721447079912e-07, + "loss": 0.7992, + "step": 5789 + }, + { + "epoch": 0.93, + "grad_norm": 1.4770834277619957, + "learning_rate": 2.348014696398626e-07, + "loss": 0.7178, + "step": 5790 + }, + { + "epoch": 0.93, + "grad_norm": 1.3655621390311026, + "learning_rate": 2.3367838511818675e-07, + "loss": 0.7927, + "step": 5791 + }, + { + "epoch": 0.93, + "grad_norm": 1.3468260025142138, + "learning_rate": 2.325579612117279e-07, + "loss": 0.8511, + "step": 5792 + }, + { + "epoch": 0.93, + "grad_norm": 1.2992623628646924, + "learning_rate": 2.3144019822572194e-07, + "loss": 0.8055, + "step": 5793 + }, + { + "epoch": 0.93, + "grad_norm": 1.4099751115734618, + "learning_rate": 2.303250964646786e-07, + "loss": 0.9016, + "step": 5794 + }, + { + "epoch": 0.93, + "grad_norm": 1.2671931671175782, + "learning_rate": 2.2921265623238042e-07, + "loss": 0.8611, + "step": 5795 + }, + { + "epoch": 0.93, + "grad_norm": 1.3088528006625328, + "learning_rate": 2.2810287783188833e-07, + "loss": 0.8901, + "step": 5796 + }, + { + "epoch": 0.93, + "grad_norm": 1.229162341618068, + "learning_rate": 2.2699576156553715e-07, + "loss": 0.7881, + "step": 5797 + }, + { + "epoch": 0.93, + "grad_norm": 1.5970655307555903, + "learning_rate": 2.258913077349334e-07, + "loss": 0.7907, + "step": 5798 + }, + { + "epoch": 0.93, + "grad_norm": 1.3501786523290782, + "learning_rate": 2.2478951664096305e-07, + "loss": 0.8763, + "step": 5799 + }, + { + "epoch": 0.93, + "grad_norm": 1.5031802536797807, + "learning_rate": 2.236903885837849e-07, + "loss": 0.8488, + "step": 5800 + }, + { + "epoch": 0.93, + "grad_norm": 1.40396595678273, + "learning_rate": 2.2259392386282829e-07, + "loss": 0.8053, + "step": 5801 + }, + { + "epoch": 0.93, + "grad_norm": 1.6262999095678248, + "learning_rate": 2.215001227768032e-07, + "loss": 0.866, + "step": 5802 + }, + { + "epoch": 0.93, + "grad_norm": 1.566119752877557, + "learning_rate": 2.204089856236913e-07, + "loss": 0.8574, + "step": 5803 + }, + { + "epoch": 0.94, + "grad_norm": 2.0439017214184325, + "learning_rate": 2.1932051270074807e-07, + "loss": 0.8062, + "step": 5804 + }, + { + "epoch": 0.94, + "grad_norm": 1.2900990306883946, + "learning_rate": 2.182347043045019e-07, + "loss": 0.7578, + "step": 5805 + }, + { + "epoch": 0.94, + "grad_norm": 1.255523284046389, + "learning_rate": 2.1715156073075838e-07, + "loss": 0.8227, + "step": 5806 + }, + { + "epoch": 0.94, + "grad_norm": 1.2764864711141906, + "learning_rate": 2.1607108227459594e-07, + "loss": 0.8184, + "step": 5807 + }, + { + "epoch": 0.94, + "grad_norm": 1.5518717216853828, + "learning_rate": 2.1499326923036688e-07, + "loss": 0.7082, + "step": 5808 + }, + { + "epoch": 0.94, + "grad_norm": 1.441591985490072, + "learning_rate": 2.1391812189169526e-07, + "loss": 0.7736, + "step": 5809 + }, + { + "epoch": 0.94, + "grad_norm": 1.3558148044040281, + "learning_rate": 2.1284564055148337e-07, + "loss": 0.8754, + "step": 5810 + }, + { + "epoch": 0.94, + "grad_norm": 1.3069128863328823, + "learning_rate": 2.1177582550190313e-07, + "loss": 0.8436, + "step": 5811 + }, + { + "epoch": 0.94, + "grad_norm": 1.2052003900463464, + "learning_rate": 2.1070867703440135e-07, + "loss": 0.8353, + "step": 5812 + }, + { + "epoch": 0.94, + "grad_norm": 1.2648404826156798, + "learning_rate": 2.0964419543970104e-07, + "loss": 0.8447, + "step": 5813 + }, + { + "epoch": 0.94, + "grad_norm": 1.374705865676168, + "learning_rate": 2.0858238100779248e-07, + "loss": 0.7815, + "step": 5814 + }, + { + "epoch": 0.94, + "grad_norm": 1.4942750430884373, + "learning_rate": 2.075232340279465e-07, + "loss": 0.8381, + "step": 5815 + }, + { + "epoch": 0.94, + "grad_norm": 1.4149789047090398, + "learning_rate": 2.0646675478870337e-07, + "loss": 0.8018, + "step": 5816 + }, + { + "epoch": 0.94, + "grad_norm": 1.3489783443129002, + "learning_rate": 2.0541294357787512e-07, + "loss": 0.7473, + "step": 5817 + }, + { + "epoch": 0.94, + "grad_norm": 1.559304940942349, + "learning_rate": 2.0436180068255207e-07, + "loss": 0.7405, + "step": 5818 + }, + { + "epoch": 0.94, + "grad_norm": 0.8691919450931498, + "learning_rate": 2.0331332638909184e-07, + "loss": 0.3197, + "step": 5819 + }, + { + "epoch": 0.94, + "grad_norm": 1.34528276095489, + "learning_rate": 2.022675209831282e-07, + "loss": 0.7089, + "step": 5820 + }, + { + "epoch": 0.94, + "grad_norm": 1.5229308679764808, + "learning_rate": 2.0122438474956764e-07, + "loss": 0.7801, + "step": 5821 + }, + { + "epoch": 0.94, + "grad_norm": 1.4417133934472999, + "learning_rate": 2.0018391797259063e-07, + "loss": 0.894, + "step": 5822 + }, + { + "epoch": 0.94, + "grad_norm": 1.4295438149692528, + "learning_rate": 1.9914612093564822e-07, + "loss": 0.7677, + "step": 5823 + }, + { + "epoch": 0.94, + "grad_norm": 1.424046308705004, + "learning_rate": 1.9811099392146427e-07, + "loss": 0.8054, + "step": 5824 + }, + { + "epoch": 0.94, + "grad_norm": 1.586981646932607, + "learning_rate": 1.970785372120354e-07, + "loss": 0.8708, + "step": 5825 + }, + { + "epoch": 0.94, + "grad_norm": 1.4394830897303752, + "learning_rate": 1.960487510886333e-07, + "loss": 0.8099, + "step": 5826 + }, + { + "epoch": 0.94, + "grad_norm": 1.367622377169449, + "learning_rate": 1.95021635831798e-07, + "loss": 0.8392, + "step": 5827 + }, + { + "epoch": 0.94, + "grad_norm": 1.4465438810187796, + "learning_rate": 1.9399719172134458e-07, + "loss": 0.7998, + "step": 5828 + }, + { + "epoch": 0.94, + "grad_norm": 1.291521783283156, + "learning_rate": 1.9297541903636196e-07, + "loss": 0.8343, + "step": 5829 + }, + { + "epoch": 0.94, + "grad_norm": 1.3826197827338123, + "learning_rate": 1.9195631805520642e-07, + "loss": 0.7827, + "step": 5830 + }, + { + "epoch": 0.94, + "grad_norm": 1.444376988759316, + "learning_rate": 1.909398890555092e-07, + "loss": 0.7899, + "step": 5831 + }, + { + "epoch": 0.94, + "grad_norm": 1.423264810014063, + "learning_rate": 1.8992613231417546e-07, + "loss": 0.8071, + "step": 5832 + }, + { + "epoch": 0.94, + "grad_norm": 1.3749603650492794, + "learning_rate": 1.889150481073798e-07, + "loss": 0.7778, + "step": 5833 + }, + { + "epoch": 0.94, + "grad_norm": 1.1917279946400148, + "learning_rate": 1.8790663671056863e-07, + "loss": 0.8071, + "step": 5834 + }, + { + "epoch": 0.94, + "grad_norm": 1.6873534881824053, + "learning_rate": 1.8690089839846215e-07, + "loss": 0.8151, + "step": 5835 + }, + { + "epoch": 0.94, + "grad_norm": 1.3043343232306424, + "learning_rate": 1.8589783344504897e-07, + "loss": 0.7807, + "step": 5836 + }, + { + "epoch": 0.94, + "grad_norm": 1.403143849092268, + "learning_rate": 1.8489744212359495e-07, + "loss": 0.7829, + "step": 5837 + }, + { + "epoch": 0.94, + "grad_norm": 1.4177916412246403, + "learning_rate": 1.8389972470663208e-07, + "loss": 0.8537, + "step": 5838 + }, + { + "epoch": 0.94, + "grad_norm": 1.4761373501349313, + "learning_rate": 1.829046814659663e-07, + "loss": 0.8183, + "step": 5839 + }, + { + "epoch": 0.94, + "grad_norm": 1.6106246111815314, + "learning_rate": 1.8191231267267629e-07, + "loss": 0.7468, + "step": 5840 + }, + { + "epoch": 0.94, + "grad_norm": 1.5383885597551368, + "learning_rate": 1.8092261859710802e-07, + "loss": 0.8877, + "step": 5841 + }, + { + "epoch": 0.94, + "grad_norm": 1.4866331046979355, + "learning_rate": 1.799355995088836e-07, + "loss": 0.873, + "step": 5842 + }, + { + "epoch": 0.94, + "grad_norm": 1.2227693663851267, + "learning_rate": 1.7895125567689354e-07, + "loss": 0.7574, + "step": 5843 + }, + { + "epoch": 0.94, + "grad_norm": 1.5931120663971288, + "learning_rate": 1.7796958736929992e-07, + "loss": 0.8464, + "step": 5844 + }, + { + "epoch": 0.94, + "grad_norm": 1.4743775431811985, + "learning_rate": 1.7699059485353775e-07, + "loss": 0.8229, + "step": 5845 + }, + { + "epoch": 0.94, + "grad_norm": 1.3121949136465074, + "learning_rate": 1.7601427839630814e-07, + "loss": 0.8245, + "step": 5846 + }, + { + "epoch": 0.94, + "grad_norm": 1.3186576484653143, + "learning_rate": 1.7504063826359053e-07, + "loss": 0.7679, + "step": 5847 + }, + { + "epoch": 0.94, + "grad_norm": 1.451073230532813, + "learning_rate": 1.740696747206294e-07, + "loss": 0.8182, + "step": 5848 + }, + { + "epoch": 0.94, + "grad_norm": 1.4891040810206257, + "learning_rate": 1.731013880319421e-07, + "loss": 0.8541, + "step": 5849 + }, + { + "epoch": 0.94, + "grad_norm": 1.6127505777593139, + "learning_rate": 1.7213577846131647e-07, + "loss": 0.768, + "step": 5850 + }, + { + "epoch": 0.94, + "grad_norm": 1.4168771254069639, + "learning_rate": 1.7117284627181207e-07, + "loss": 0.8061, + "step": 5851 + }, + { + "epoch": 0.94, + "grad_norm": 1.7554694624331924, + "learning_rate": 1.7021259172575688e-07, + "loss": 0.7914, + "step": 5852 + }, + { + "epoch": 0.94, + "grad_norm": 1.4722342547396627, + "learning_rate": 1.6925501508475162e-07, + "loss": 0.8351, + "step": 5853 + }, + { + "epoch": 0.94, + "grad_norm": 1.47263088508031, + "learning_rate": 1.6830011660966648e-07, + "loss": 0.743, + "step": 5854 + }, + { + "epoch": 0.94, + "grad_norm": 1.458456344113911, + "learning_rate": 1.673478965606423e-07, + "loss": 0.8334, + "step": 5855 + }, + { + "epoch": 0.94, + "grad_norm": 1.567902450936999, + "learning_rate": 1.6639835519708826e-07, + "loss": 0.7675, + "step": 5856 + }, + { + "epoch": 0.94, + "grad_norm": 1.5001548405290912, + "learning_rate": 1.6545149277768845e-07, + "loss": 0.8957, + "step": 5857 + }, + { + "epoch": 0.94, + "grad_norm": 1.2718217207377867, + "learning_rate": 1.6450730956039328e-07, + "loss": 0.8219, + "step": 5858 + }, + { + "epoch": 0.94, + "grad_norm": 1.399894317827479, + "learning_rate": 1.6356580580242253e-07, + "loss": 0.7816, + "step": 5859 + }, + { + "epoch": 0.94, + "grad_norm": 1.2900403114445638, + "learning_rate": 1.626269817602699e-07, + "loss": 0.8787, + "step": 5860 + }, + { + "epoch": 0.94, + "grad_norm": 1.7878762616528001, + "learning_rate": 1.616908376896964e-07, + "loss": 0.8854, + "step": 5861 + }, + { + "epoch": 0.94, + "grad_norm": 1.3663184728286413, + "learning_rate": 1.6075737384573354e-07, + "loss": 0.7603, + "step": 5862 + }, + { + "epoch": 0.94, + "grad_norm": 1.349524226173142, + "learning_rate": 1.5982659048268124e-07, + "loss": 0.8257, + "step": 5863 + }, + { + "epoch": 0.94, + "grad_norm": 1.4152968930642944, + "learning_rate": 1.588984878541133e-07, + "loss": 0.8594, + "step": 5864 + }, + { + "epoch": 0.94, + "grad_norm": 3.247527228596684, + "learning_rate": 1.5797306621286757e-07, + "loss": 0.8295, + "step": 5865 + }, + { + "epoch": 0.95, + "grad_norm": 1.6125036587045254, + "learning_rate": 1.5705032581105563e-07, + "loss": 0.8281, + "step": 5866 + }, + { + "epoch": 0.95, + "grad_norm": 1.388090769253597, + "learning_rate": 1.561302669000586e-07, + "loss": 0.7455, + "step": 5867 + }, + { + "epoch": 0.95, + "grad_norm": 1.6150775709923049, + "learning_rate": 1.5521288973052274e-07, + "loss": 0.7759, + "step": 5868 + }, + { + "epoch": 0.95, + "grad_norm": 1.4672832343014335, + "learning_rate": 1.5429819455237137e-07, + "loss": 0.7622, + "step": 5869 + }, + { + "epoch": 0.95, + "grad_norm": 1.3014316808506072, + "learning_rate": 1.5338618161478857e-07, + "loss": 0.7879, + "step": 5870 + }, + { + "epoch": 0.95, + "grad_norm": 1.3312620941052646, + "learning_rate": 1.5247685116623335e-07, + "loss": 0.7632, + "step": 5871 + }, + { + "epoch": 0.95, + "grad_norm": 1.3173266735373872, + "learning_rate": 1.5157020345443195e-07, + "loss": 0.8153, + "step": 5872 + }, + { + "epoch": 0.95, + "grad_norm": 1.4105801473563526, + "learning_rate": 1.5066623872638242e-07, + "loss": 0.7818, + "step": 5873 + }, + { + "epoch": 0.95, + "grad_norm": 1.4172251212164064, + "learning_rate": 1.497649572283466e-07, + "loss": 0.8219, + "step": 5874 + }, + { + "epoch": 0.95, + "grad_norm": 1.7075194494363592, + "learning_rate": 1.4886635920586036e-07, + "loss": 0.7655, + "step": 5875 + }, + { + "epoch": 0.95, + "grad_norm": 1.3608861038672677, + "learning_rate": 1.479704449037256e-07, + "loss": 0.8056, + "step": 5876 + }, + { + "epoch": 0.95, + "grad_norm": 1.3032055929073807, + "learning_rate": 1.4707721456601486e-07, + "loss": 0.7612, + "step": 5877 + }, + { + "epoch": 0.95, + "grad_norm": 1.2737091672766117, + "learning_rate": 1.46186668436068e-07, + "loss": 0.828, + "step": 5878 + }, + { + "epoch": 0.95, + "grad_norm": 1.4108201191953755, + "learning_rate": 1.4529880675649534e-07, + "loss": 0.8359, + "step": 5879 + }, + { + "epoch": 0.95, + "grad_norm": 1.3237744474967679, + "learning_rate": 1.444136297691734e-07, + "loss": 0.7642, + "step": 5880 + }, + { + "epoch": 0.95, + "grad_norm": 1.5203590957346371, + "learning_rate": 1.435311377152493e-07, + "loss": 0.8492, + "step": 5881 + }, + { + "epoch": 0.95, + "grad_norm": 1.378149566571992, + "learning_rate": 1.426513308351385e-07, + "loss": 0.8823, + "step": 5882 + }, + { + "epoch": 0.95, + "grad_norm": 1.3302852095499067, + "learning_rate": 1.4177420936852482e-07, + "loss": 0.8698, + "step": 5883 + }, + { + "epoch": 0.95, + "grad_norm": 1.354800238092628, + "learning_rate": 1.4089977355436045e-07, + "loss": 0.7549, + "step": 5884 + }, + { + "epoch": 0.95, + "grad_norm": 1.428505610941251, + "learning_rate": 1.4002802363086486e-07, + "loss": 0.8756, + "step": 5885 + }, + { + "epoch": 0.95, + "grad_norm": 1.329027119162686, + "learning_rate": 1.3915895983552806e-07, + "loss": 0.8298, + "step": 5886 + }, + { + "epoch": 0.95, + "grad_norm": 1.721343783945078, + "learning_rate": 1.3829258240510624e-07, + "loss": 0.8259, + "step": 5887 + }, + { + "epoch": 0.95, + "grad_norm": 1.5230271593741773, + "learning_rate": 1.374288915756228e-07, + "loss": 0.8706, + "step": 5888 + }, + { + "epoch": 0.95, + "grad_norm": 1.3166890735485766, + "learning_rate": 1.3656788758237504e-07, + "loss": 0.8202, + "step": 5889 + }, + { + "epoch": 0.95, + "grad_norm": 1.4521925091549335, + "learning_rate": 1.3570957065991987e-07, + "loss": 0.7959, + "step": 5890 + }, + { + "epoch": 0.95, + "grad_norm": 1.488225433036324, + "learning_rate": 1.3485394104209015e-07, + "loss": 0.8386, + "step": 5891 + }, + { + "epoch": 0.95, + "grad_norm": 1.5876528050394962, + "learning_rate": 1.340009989619806e-07, + "loss": 0.8517, + "step": 5892 + }, + { + "epoch": 0.95, + "grad_norm": 1.6780228672079986, + "learning_rate": 1.3315074465195533e-07, + "loss": 0.8263, + "step": 5893 + }, + { + "epoch": 0.95, + "grad_norm": 1.3947894678084862, + "learning_rate": 1.3230317834365013e-07, + "loss": 0.8081, + "step": 5894 + }, + { + "epoch": 0.95, + "grad_norm": 1.3386729772210906, + "learning_rate": 1.3145830026796368e-07, + "loss": 0.8344, + "step": 5895 + }, + { + "epoch": 0.95, + "grad_norm": 1.3676530992070035, + "learning_rate": 1.3061611065506409e-07, + "loss": 0.7983, + "step": 5896 + }, + { + "epoch": 0.95, + "grad_norm": 1.376522024793113, + "learning_rate": 1.2977660973438667e-07, + "loss": 0.8538, + "step": 5897 + }, + { + "epoch": 0.95, + "grad_norm": 1.4251120228011547, + "learning_rate": 1.2893979773463516e-07, + "loss": 0.8602, + "step": 5898 + }, + { + "epoch": 0.95, + "grad_norm": 1.3524789476634684, + "learning_rate": 1.2810567488378055e-07, + "loss": 0.8322, + "step": 5899 + }, + { + "epoch": 0.95, + "grad_norm": 1.419861557425791, + "learning_rate": 1.2727424140905998e-07, + "loss": 0.8024, + "step": 5900 + }, + { + "epoch": 0.95, + "grad_norm": 1.6305277412976746, + "learning_rate": 1.264454975369789e-07, + "loss": 0.792, + "step": 5901 + }, + { + "epoch": 0.95, + "grad_norm": 1.49304891607622, + "learning_rate": 1.2561944349331223e-07, + "loss": 0.8088, + "step": 5902 + }, + { + "epoch": 0.95, + "grad_norm": 1.435138526240153, + "learning_rate": 1.247960795030967e-07, + "loss": 0.7957, + "step": 5903 + }, + { + "epoch": 0.95, + "grad_norm": 1.433288698326768, + "learning_rate": 1.239754057906406e-07, + "loss": 0.7976, + "step": 5904 + }, + { + "epoch": 0.95, + "grad_norm": 1.4804757326912585, + "learning_rate": 1.2315742257951847e-07, + "loss": 0.7664, + "step": 5905 + }, + { + "epoch": 0.95, + "grad_norm": 1.2560605396744795, + "learning_rate": 1.22342130092572e-07, + "loss": 0.8395, + "step": 5906 + }, + { + "epoch": 0.95, + "grad_norm": 1.4650800145666492, + "learning_rate": 1.215295285519069e-07, + "loss": 0.8587, + "step": 5907 + }, + { + "epoch": 0.95, + "grad_norm": 1.2593450191575843, + "learning_rate": 1.2071961817890053e-07, + "loss": 0.833, + "step": 5908 + }, + { + "epoch": 0.95, + "grad_norm": 1.4699047810677959, + "learning_rate": 1.1991239919419529e-07, + "loss": 0.8585, + "step": 5909 + }, + { + "epoch": 0.95, + "grad_norm": 1.3490132541749875, + "learning_rate": 1.1910787181769745e-07, + "loss": 0.7749, + "step": 5910 + }, + { + "epoch": 0.95, + "grad_norm": 1.567113981633362, + "learning_rate": 1.1830603626858394e-07, + "loss": 0.8151, + "step": 5911 + }, + { + "epoch": 0.95, + "grad_norm": 1.3243047196127378, + "learning_rate": 1.175068927652967e-07, + "loss": 0.8241, + "step": 5912 + }, + { + "epoch": 0.95, + "grad_norm": 1.4202187344578385, + "learning_rate": 1.1671044152554378e-07, + "loss": 0.8026, + "step": 5913 + }, + { + "epoch": 0.95, + "grad_norm": 1.2572403915015684, + "learning_rate": 1.1591668276630274e-07, + "loss": 0.8172, + "step": 5914 + }, + { + "epoch": 0.95, + "grad_norm": 1.5306349037121818, + "learning_rate": 1.1512561670381172e-07, + "loss": 0.8662, + "step": 5915 + }, + { + "epoch": 0.95, + "grad_norm": 1.491862956424143, + "learning_rate": 1.1433724355358167e-07, + "loss": 0.852, + "step": 5916 + }, + { + "epoch": 0.95, + "grad_norm": 0.8334847579658969, + "learning_rate": 1.1355156353038743e-07, + "loss": 0.3109, + "step": 5917 + }, + { + "epoch": 0.95, + "grad_norm": 1.3824879779382424, + "learning_rate": 1.127685768482667e-07, + "loss": 0.7762, + "step": 5918 + }, + { + "epoch": 0.95, + "grad_norm": 1.1403659251171714, + "learning_rate": 1.1198828372052994e-07, + "loss": 0.8054, + "step": 5919 + }, + { + "epoch": 0.95, + "grad_norm": 1.5357591343676864, + "learning_rate": 1.1121068435974935e-07, + "loss": 0.8803, + "step": 5920 + }, + { + "epoch": 0.95, + "grad_norm": 1.3189133462148945, + "learning_rate": 1.1043577897776547e-07, + "loss": 0.8414, + "step": 5921 + }, + { + "epoch": 0.95, + "grad_norm": 1.36018412176385, + "learning_rate": 1.0966356778568055e-07, + "loss": 0.7611, + "step": 5922 + }, + { + "epoch": 0.95, + "grad_norm": 1.4635754988760137, + "learning_rate": 1.0889405099386962e-07, + "loss": 0.8387, + "step": 5923 + }, + { + "epoch": 0.95, + "grad_norm": 1.4805583134134297, + "learning_rate": 1.0812722881197058e-07, + "loss": 0.8366, + "step": 5924 + }, + { + "epoch": 0.95, + "grad_norm": 1.494126185802427, + "learning_rate": 1.0736310144888296e-07, + "loss": 0.7916, + "step": 5925 + }, + { + "epoch": 0.95, + "grad_norm": 1.4274387053467852, + "learning_rate": 1.0660166911277914e-07, + "loss": 0.8019, + "step": 5926 + }, + { + "epoch": 0.95, + "grad_norm": 1.3518156182581955, + "learning_rate": 1.0584293201109541e-07, + "loss": 0.8356, + "step": 5927 + }, + { + "epoch": 0.96, + "grad_norm": 1.5696182032737453, + "learning_rate": 1.0508689035052977e-07, + "loss": 0.8265, + "step": 5928 + }, + { + "epoch": 0.96, + "grad_norm": 1.2474297443895914, + "learning_rate": 1.0433354433705078e-07, + "loss": 0.8143, + "step": 5929 + }, + { + "epoch": 0.96, + "grad_norm": 1.581888202393035, + "learning_rate": 1.0358289417588874e-07, + "loss": 0.7142, + "step": 5930 + }, + { + "epoch": 0.96, + "grad_norm": 1.4667049054038683, + "learning_rate": 1.0283494007154448e-07, + "loss": 0.7494, + "step": 5931 + }, + { + "epoch": 0.96, + "grad_norm": 1.3020620999194406, + "learning_rate": 1.0208968222777838e-07, + "loss": 0.8307, + "step": 5932 + }, + { + "epoch": 0.96, + "grad_norm": 1.4478430464836454, + "learning_rate": 1.0134712084762022e-07, + "loss": 0.7827, + "step": 5933 + }, + { + "epoch": 0.96, + "grad_norm": 1.3773379106121946, + "learning_rate": 1.0060725613336375e-07, + "loss": 0.7321, + "step": 5934 + }, + { + "epoch": 0.96, + "grad_norm": 1.776565208323955, + "learning_rate": 9.987008828656997e-08, + "loss": 0.8597, + "step": 5935 + }, + { + "epoch": 0.96, + "grad_norm": 1.3488961397075148, + "learning_rate": 9.913561750806378e-08, + "loss": 0.7394, + "step": 5936 + }, + { + "epoch": 0.96, + "grad_norm": 1.4416859951031693, + "learning_rate": 9.84038439979329e-08, + "loss": 0.817, + "step": 5937 + }, + { + "epoch": 0.96, + "grad_norm": 1.4380756850306522, + "learning_rate": 9.767476795553454e-08, + "loss": 0.8757, + "step": 5938 + }, + { + "epoch": 0.96, + "grad_norm": 1.695429294711731, + "learning_rate": 9.69483895794876e-08, + "loss": 0.7779, + "step": 5939 + }, + { + "epoch": 0.96, + "grad_norm": 1.2509237966196682, + "learning_rate": 9.622470906767933e-08, + "loss": 0.7999, + "step": 5940 + }, + { + "epoch": 0.96, + "grad_norm": 1.350896842295312, + "learning_rate": 9.550372661725982e-08, + "loss": 0.8087, + "step": 5941 + }, + { + "epoch": 0.96, + "grad_norm": 1.3849225076441671, + "learning_rate": 9.478544242464415e-08, + "loss": 0.8576, + "step": 5942 + }, + { + "epoch": 0.96, + "grad_norm": 1.2556512594942453, + "learning_rate": 9.40698566855125e-08, + "loss": 0.7988, + "step": 5943 + }, + { + "epoch": 0.96, + "grad_norm": 1.2394643352962174, + "learning_rate": 9.335696959481e-08, + "loss": 0.7981, + "step": 5944 + }, + { + "epoch": 0.96, + "grad_norm": 1.404950753675489, + "learning_rate": 9.264678134674687e-08, + "loss": 0.8559, + "step": 5945 + }, + { + "epoch": 0.96, + "grad_norm": 1.3717047085463214, + "learning_rate": 9.193929213480057e-08, + "loss": 0.8441, + "step": 5946 + }, + { + "epoch": 0.96, + "grad_norm": 1.6095642087413555, + "learning_rate": 9.123450215170693e-08, + "loss": 0.8259, + "step": 5947 + }, + { + "epoch": 0.96, + "grad_norm": 1.2763733290894483, + "learning_rate": 9.053241158947123e-08, + "loss": 0.8115, + "step": 5948 + }, + { + "epoch": 0.96, + "grad_norm": 1.247716192007064, + "learning_rate": 8.983302063936272e-08, + "loss": 0.8302, + "step": 5949 + }, + { + "epoch": 0.96, + "grad_norm": 0.8542310500982013, + "learning_rate": 8.913632949191564e-08, + "loss": 0.304, + "step": 5950 + }, + { + "epoch": 0.96, + "grad_norm": 1.4659786888529376, + "learning_rate": 8.844233833692595e-08, + "loss": 0.7214, + "step": 5951 + }, + { + "epoch": 0.96, + "grad_norm": 1.623691449780856, + "learning_rate": 8.775104736345796e-08, + "loss": 0.8332, + "step": 5952 + }, + { + "epoch": 0.96, + "grad_norm": 1.3750660972530822, + "learning_rate": 8.70624567598366e-08, + "loss": 0.8126, + "step": 5953 + }, + { + "epoch": 0.96, + "grad_norm": 1.295990380084933, + "learning_rate": 8.637656671365402e-08, + "loss": 0.7977, + "step": 5954 + }, + { + "epoch": 0.96, + "grad_norm": 1.3852379885039, + "learning_rate": 8.56933774117652e-08, + "loss": 0.8191, + "step": 5955 + }, + { + "epoch": 0.96, + "grad_norm": 1.2844819938716439, + "learning_rate": 8.501288904029014e-08, + "loss": 0.8717, + "step": 5956 + }, + { + "epoch": 0.96, + "grad_norm": 1.4356604583076804, + "learning_rate": 8.433510178461168e-08, + "loss": 0.7886, + "step": 5957 + }, + { + "epoch": 0.96, + "grad_norm": 1.2211143045287962, + "learning_rate": 8.366001582937988e-08, + "loss": 0.8174, + "step": 5958 + }, + { + "epoch": 0.96, + "grad_norm": 1.4134980591334174, + "learning_rate": 8.29876313585043e-08, + "loss": 0.8097, + "step": 5959 + }, + { + "epoch": 0.96, + "grad_norm": 1.4097338384573557, + "learning_rate": 8.231794855516173e-08, + "loss": 0.8465, + "step": 5960 + }, + { + "epoch": 0.96, + "grad_norm": 1.4082632898501515, + "learning_rate": 8.165096760179181e-08, + "loss": 0.7845, + "step": 5961 + }, + { + "epoch": 0.96, + "grad_norm": 1.4579992211019106, + "learning_rate": 8.09866886801014e-08, + "loss": 0.7439, + "step": 5962 + }, + { + "epoch": 0.96, + "grad_norm": 1.40576536384155, + "learning_rate": 8.032511197105353e-08, + "loss": 0.7708, + "step": 5963 + }, + { + "epoch": 0.96, + "grad_norm": 1.5105139369446332, + "learning_rate": 7.966623765488513e-08, + "loss": 0.8049, + "step": 5964 + }, + { + "epoch": 0.96, + "grad_norm": 1.6613008205619584, + "learning_rate": 7.901006591108817e-08, + "loss": 0.8259, + "step": 5965 + }, + { + "epoch": 0.96, + "grad_norm": 0.8131023040634394, + "learning_rate": 7.83565969184219e-08, + "loss": 0.3392, + "step": 5966 + }, + { + "epoch": 0.96, + "grad_norm": 1.465097045615563, + "learning_rate": 7.770583085491168e-08, + "loss": 0.764, + "step": 5967 + }, + { + "epoch": 0.96, + "grad_norm": 1.2741823051451713, + "learning_rate": 7.705776789784237e-08, + "loss": 0.8095, + "step": 5968 + }, + { + "epoch": 0.96, + "grad_norm": 1.2719255229981568, + "learning_rate": 7.641240822376495e-08, + "loss": 0.8299, + "step": 5969 + }, + { + "epoch": 0.96, + "grad_norm": 1.0409545486884109, + "learning_rate": 7.576975200849212e-08, + "loss": 0.3578, + "step": 5970 + }, + { + "epoch": 0.96, + "grad_norm": 1.474968810832697, + "learning_rate": 7.512979942710163e-08, + "loss": 0.8277, + "step": 5971 + }, + { + "epoch": 0.96, + "grad_norm": 1.4946718337515887, + "learning_rate": 7.449255065393624e-08, + "loss": 0.7364, + "step": 5972 + }, + { + "epoch": 0.96, + "grad_norm": 1.4117878512225581, + "learning_rate": 7.385800586259595e-08, + "loss": 0.8121, + "step": 5973 + }, + { + "epoch": 0.96, + "grad_norm": 1.5797484587992223, + "learning_rate": 7.32261652259525e-08, + "loss": 0.7546, + "step": 5974 + }, + { + "epoch": 0.96, + "grad_norm": 1.387185293616104, + "learning_rate": 7.259702891613374e-08, + "loss": 0.8616, + "step": 5975 + }, + { + "epoch": 0.96, + "grad_norm": 1.9568487670474821, + "learning_rate": 7.19705971045348e-08, + "loss": 0.8477, + "step": 5976 + }, + { + "epoch": 0.96, + "grad_norm": 1.4321704585995494, + "learning_rate": 7.134686996181361e-08, + "loss": 0.8604, + "step": 5977 + }, + { + "epoch": 0.96, + "grad_norm": 1.5060501515851128, + "learning_rate": 7.07258476578887e-08, + "loss": 0.8725, + "step": 5978 + }, + { + "epoch": 0.96, + "grad_norm": 0.9044262904670698, + "learning_rate": 7.010753036194584e-08, + "loss": 0.3279, + "step": 5979 + }, + { + "epoch": 0.96, + "grad_norm": 1.6601701113178298, + "learning_rate": 6.949191824243028e-08, + "loss": 0.7786, + "step": 5980 + }, + { + "epoch": 0.96, + "grad_norm": 0.8763852330342072, + "learning_rate": 6.887901146705344e-08, + "loss": 0.3228, + "step": 5981 + }, + { + "epoch": 0.96, + "grad_norm": 1.1411771368753003, + "learning_rate": 6.82688102027862e-08, + "loss": 0.7869, + "step": 5982 + }, + { + "epoch": 0.96, + "grad_norm": 1.650915739499833, + "learning_rate": 6.766131461586445e-08, + "loss": 0.881, + "step": 5983 + }, + { + "epoch": 0.96, + "grad_norm": 1.4097177148543596, + "learning_rate": 6.705652487178693e-08, + "loss": 0.8293, + "step": 5984 + }, + { + "epoch": 0.96, + "grad_norm": 1.4450980633097819, + "learning_rate": 6.645444113531519e-08, + "loss": 0.8028, + "step": 5985 + }, + { + "epoch": 0.96, + "grad_norm": 1.3876390338360636, + "learning_rate": 6.585506357047466e-08, + "loss": 0.6795, + "step": 5986 + }, + { + "epoch": 0.96, + "grad_norm": 1.509803181165102, + "learning_rate": 6.525839234055032e-08, + "loss": 0.7796, + "step": 5987 + }, + { + "epoch": 0.96, + "grad_norm": 1.2895552415925915, + "learning_rate": 6.46644276080921e-08, + "loss": 0.8759, + "step": 5988 + }, + { + "epoch": 0.96, + "grad_norm": 1.315683841578733, + "learning_rate": 6.407316953491393e-08, + "loss": 0.8095, + "step": 5989 + }, + { + "epoch": 0.97, + "grad_norm": 1.6831936006681572, + "learning_rate": 6.348461828208919e-08, + "loss": 0.9101, + "step": 5990 + }, + { + "epoch": 0.97, + "grad_norm": 1.3062773471803266, + "learning_rate": 6.289877400995625e-08, + "loss": 0.8375, + "step": 5991 + }, + { + "epoch": 0.97, + "grad_norm": 1.3658429434827144, + "learning_rate": 6.231563687811526e-08, + "loss": 0.8275, + "step": 5992 + }, + { + "epoch": 0.97, + "grad_norm": 1.4373605345627107, + "learning_rate": 6.173520704542802e-08, + "loss": 0.7993, + "step": 5993 + }, + { + "epoch": 0.97, + "grad_norm": 1.3827320105085605, + "learning_rate": 6.115748467002136e-08, + "loss": 0.749, + "step": 5994 + }, + { + "epoch": 0.97, + "grad_norm": 1.456378912182639, + "learning_rate": 6.058246990928051e-08, + "loss": 0.8514, + "step": 5995 + }, + { + "epoch": 0.97, + "grad_norm": 1.4169548786911967, + "learning_rate": 6.001016291985795e-08, + "loss": 0.7819, + "step": 5996 + }, + { + "epoch": 0.97, + "grad_norm": 1.3853744680268891, + "learning_rate": 5.944056385766339e-08, + "loss": 0.7561, + "step": 5997 + }, + { + "epoch": 0.97, + "grad_norm": 1.5661707615483282, + "learning_rate": 5.887367287787271e-08, + "loss": 0.8222, + "step": 5998 + }, + { + "epoch": 0.97, + "grad_norm": 1.5322182626541996, + "learning_rate": 5.8309490134921265e-08, + "loss": 0.8458, + "step": 5999 + }, + { + "epoch": 0.97, + "grad_norm": 1.275804848899459, + "learning_rate": 5.774801578251055e-08, + "loss": 0.8712, + "step": 6000 + }, + { + "epoch": 0.97, + "grad_norm": 1.2960530952725058, + "learning_rate": 5.718924997359932e-08, + "loss": 0.867, + "step": 6001 + }, + { + "epoch": 0.97, + "grad_norm": 1.2131069530999965, + "learning_rate": 5.663319286041136e-08, + "loss": 0.7631, + "step": 6002 + }, + { + "epoch": 0.97, + "grad_norm": 1.4120027427986936, + "learning_rate": 5.6079844594433276e-08, + "loss": 0.807, + "step": 6003 + }, + { + "epoch": 0.97, + "grad_norm": 1.3749311007230216, + "learning_rate": 5.552920532641004e-08, + "loss": 0.8161, + "step": 6004 + }, + { + "epoch": 0.97, + "grad_norm": 1.39230097143429, + "learning_rate": 5.498127520635277e-08, + "loss": 0.7324, + "step": 6005 + }, + { + "epoch": 0.97, + "grad_norm": 1.5222429155496597, + "learning_rate": 5.4436054383532054e-08, + "loss": 0.7997, + "step": 6006 + }, + { + "epoch": 0.97, + "grad_norm": 1.4188826127714713, + "learning_rate": 5.389354300648131e-08, + "loss": 0.8076, + "step": 6007 + }, + { + "epoch": 0.97, + "grad_norm": 1.437679005574304, + "learning_rate": 5.3353741222995634e-08, + "loss": 0.7932, + "step": 6008 + }, + { + "epoch": 0.97, + "grad_norm": 1.351626526619759, + "learning_rate": 5.281664918013185e-08, + "loss": 0.8316, + "step": 6009 + }, + { + "epoch": 0.97, + "grad_norm": 1.3894137568362077, + "learning_rate": 5.2282267024207355e-08, + "loss": 0.7973, + "step": 6010 + }, + { + "epoch": 0.97, + "grad_norm": 1.4942096875005764, + "learning_rate": 5.1750594900805697e-08, + "loss": 0.8346, + "step": 6011 + }, + { + "epoch": 0.97, + "grad_norm": 1.1696680674096718, + "learning_rate": 5.1221632954765455e-08, + "loss": 0.7659, + "step": 6012 + }, + { + "epoch": 0.97, + "grad_norm": 1.4419059430015377, + "learning_rate": 5.069538133019247e-08, + "loss": 0.7898, + "step": 6013 + }, + { + "epoch": 0.97, + "grad_norm": 1.3167764846289038, + "learning_rate": 5.017184017045207e-08, + "loss": 0.7442, + "step": 6014 + }, + { + "epoch": 0.97, + "grad_norm": 1.633980105415254, + "learning_rate": 4.965100961817126e-08, + "loss": 0.8692, + "step": 6015 + }, + { + "epoch": 0.97, + "grad_norm": 1.5289417612126126, + "learning_rate": 4.913288981523878e-08, + "loss": 0.8913, + "step": 6016 + }, + { + "epoch": 0.97, + "grad_norm": 1.3586197082568885, + "learning_rate": 4.8617480902802826e-08, + "loss": 0.7637, + "step": 6017 + }, + { + "epoch": 0.97, + "grad_norm": 1.5081706498786276, + "learning_rate": 4.8104783021277746e-08, + "loss": 0.888, + "step": 6018 + }, + { + "epoch": 0.97, + "grad_norm": 1.4960566682926195, + "learning_rate": 4.759479631033514e-08, + "loss": 0.8311, + "step": 6019 + }, + { + "epoch": 0.97, + "grad_norm": 1.4447719645743997, + "learning_rate": 4.708752090890944e-08, + "loss": 0.7546, + "step": 6020 + }, + { + "epoch": 0.97, + "grad_norm": 1.5115737130823927, + "learning_rate": 4.6582956955196765e-08, + "loss": 0.8294, + "step": 6021 + }, + { + "epoch": 0.97, + "grad_norm": 1.5736588424209623, + "learning_rate": 4.608110458665382e-08, + "loss": 0.8316, + "step": 6022 + }, + { + "epoch": 0.97, + "grad_norm": 1.5378659203007923, + "learning_rate": 4.558196394000014e-08, + "loss": 0.8298, + "step": 6023 + }, + { + "epoch": 0.97, + "grad_norm": 1.616295368893459, + "learning_rate": 4.508553515121472e-08, + "loss": 0.8117, + "step": 6024 + }, + { + "epoch": 0.97, + "grad_norm": 1.5298792855446002, + "learning_rate": 4.4591818355538276e-08, + "loss": 0.8507, + "step": 6025 + }, + { + "epoch": 0.97, + "grad_norm": 1.4244093526252097, + "learning_rate": 4.41008136874721e-08, + "loss": 0.765, + "step": 6026 + }, + { + "epoch": 0.97, + "grad_norm": 1.3393998759296428, + "learning_rate": 4.361252128078031e-08, + "loss": 0.8079, + "step": 6027 + }, + { + "epoch": 0.97, + "grad_norm": 1.6515032465743607, + "learning_rate": 4.31269412684876e-08, + "loss": 0.7501, + "step": 6028 + }, + { + "epoch": 0.97, + "grad_norm": 1.5129715172948077, + "learning_rate": 4.264407378287927e-08, + "loss": 0.8271, + "step": 6029 + }, + { + "epoch": 0.97, + "grad_norm": 1.3527769513029613, + "learning_rate": 4.216391895550121e-08, + "loss": 0.8913, + "step": 6030 + }, + { + "epoch": 0.97, + "grad_norm": 1.3918578410493412, + "learning_rate": 4.168647691716099e-08, + "loss": 0.8635, + "step": 6031 + }, + { + "epoch": 0.97, + "grad_norm": 1.5348407843467056, + "learning_rate": 4.12117477979257e-08, + "loss": 0.8776, + "step": 6032 + }, + { + "epoch": 0.97, + "grad_norm": 1.8687932235725033, + "learning_rate": 4.0739731727127416e-08, + "loss": 0.8466, + "step": 6033 + }, + { + "epoch": 0.97, + "grad_norm": 1.532864897772446, + "learning_rate": 4.027042883335441e-08, + "loss": 0.7994, + "step": 6034 + }, + { + "epoch": 0.97, + "grad_norm": 1.2850539100040388, + "learning_rate": 3.980383924445774e-08, + "loss": 0.8858, + "step": 6035 + }, + { + "epoch": 0.97, + "grad_norm": 1.4409219786362937, + "learning_rate": 3.933996308755017e-08, + "loss": 0.8464, + "step": 6036 + }, + { + "epoch": 0.97, + "grad_norm": 1.0407069090233243, + "learning_rate": 3.887880048900394e-08, + "loss": 0.8182, + "step": 6037 + }, + { + "epoch": 0.97, + "grad_norm": 1.5844661118852745, + "learning_rate": 3.8420351574453005e-08, + "loss": 0.8188, + "step": 6038 + }, + { + "epoch": 0.97, + "grad_norm": 1.3785471624245913, + "learning_rate": 3.796461646878968e-08, + "loss": 0.7843, + "step": 6039 + }, + { + "epoch": 0.97, + "grad_norm": 1.4713201419024522, + "learning_rate": 3.751159529617021e-08, + "loss": 0.8047, + "step": 6040 + }, + { + "epoch": 0.97, + "grad_norm": 1.3056823940316953, + "learning_rate": 3.706128818001031e-08, + "loss": 0.8285, + "step": 6041 + }, + { + "epoch": 0.97, + "grad_norm": 1.5702077067932025, + "learning_rate": 3.6613695242984085e-08, + "loss": 0.8139, + "step": 6042 + }, + { + "epoch": 0.97, + "grad_norm": 1.3816812792656656, + "learning_rate": 3.616881660703064e-08, + "loss": 0.8125, + "step": 6043 + }, + { + "epoch": 0.97, + "grad_norm": 1.4030506954133668, + "learning_rate": 3.5726652393346385e-08, + "loss": 0.7792, + "step": 6044 + }, + { + "epoch": 0.97, + "grad_norm": 1.4277002614925574, + "learning_rate": 3.528720272238828e-08, + "loss": 0.8238, + "step": 6045 + }, + { + "epoch": 0.97, + "grad_norm": 1.4315246317765293, + "learning_rate": 3.485046771387612e-08, + "loss": 0.8171, + "step": 6046 + }, + { + "epoch": 0.97, + "grad_norm": 1.5521084057513086, + "learning_rate": 3.441644748678585e-08, + "loss": 0.7468, + "step": 6047 + }, + { + "epoch": 0.97, + "grad_norm": 1.4959855862254767, + "learning_rate": 3.398514215935955e-08, + "loss": 0.7329, + "step": 6048 + }, + { + "epoch": 0.97, + "grad_norm": 1.5231993256745697, + "learning_rate": 3.355655184909545e-08, + "loss": 0.8365, + "step": 6049 + }, + { + "epoch": 0.97, + "grad_norm": 1.394047115473417, + "learning_rate": 3.313067667275238e-08, + "loss": 0.8588, + "step": 6050 + }, + { + "epoch": 0.97, + "grad_norm": 1.430275989406736, + "learning_rate": 3.270751674635197e-08, + "loss": 0.8711, + "step": 6051 + }, + { + "epoch": 0.98, + "grad_norm": 1.3594894210300184, + "learning_rate": 3.228707218517313e-08, + "loss": 0.8275, + "step": 6052 + }, + { + "epoch": 0.98, + "grad_norm": 1.4291910435840587, + "learning_rate": 3.186934310375866e-08, + "loss": 0.8332, + "step": 6053 + }, + { + "epoch": 0.98, + "grad_norm": 1.2868411359157137, + "learning_rate": 3.1454329615907554e-08, + "loss": 0.6927, + "step": 6054 + }, + { + "epoch": 0.98, + "grad_norm": 1.3345822074021731, + "learning_rate": 3.104203183468157e-08, + "loss": 0.7665, + "step": 6055 + }, + { + "epoch": 0.98, + "grad_norm": 1.4961335118390904, + "learning_rate": 3.0632449872401994e-08, + "loss": 0.8143, + "step": 6056 + }, + { + "epoch": 0.98, + "grad_norm": 1.4461928798625012, + "learning_rate": 3.0225583840650665e-08, + "loss": 0.8178, + "step": 6057 + }, + { + "epoch": 0.98, + "grad_norm": 1.5197553873852232, + "learning_rate": 2.982143385026892e-08, + "loss": 0.8036, + "step": 6058 + }, + { + "epoch": 0.98, + "grad_norm": 1.4189539191712903, + "learning_rate": 2.9420000011357585e-08, + "loss": 0.7886, + "step": 6059 + }, + { + "epoch": 0.98, + "grad_norm": 1.596172604547849, + "learning_rate": 2.9021282433279173e-08, + "loss": 0.7904, + "step": 6060 + }, + { + "epoch": 0.98, + "grad_norm": 1.3080037964953877, + "learning_rate": 2.8625281224654578e-08, + "loss": 0.8707, + "step": 6061 + }, + { + "epoch": 0.98, + "grad_norm": 1.769712903001942, + "learning_rate": 2.8231996493366387e-08, + "loss": 0.8191, + "step": 6062 + }, + { + "epoch": 0.98, + "grad_norm": 1.5817813805477723, + "learning_rate": 2.7841428346556676e-08, + "loss": 0.8729, + "step": 6063 + }, + { + "epoch": 0.98, + "grad_norm": 1.5928000840250816, + "learning_rate": 2.745357689062478e-08, + "loss": 0.722, + "step": 6064 + }, + { + "epoch": 0.98, + "grad_norm": 1.4393314468011846, + "learning_rate": 2.706844223123395e-08, + "loss": 0.7586, + "step": 6065 + }, + { + "epoch": 0.98, + "grad_norm": 1.2804768754946843, + "learning_rate": 2.6686024473304706e-08, + "loss": 0.7986, + "step": 6066 + }, + { + "epoch": 0.98, + "grad_norm": 1.6364664734585934, + "learning_rate": 2.6306323721018156e-08, + "loss": 0.8213, + "step": 6067 + }, + { + "epoch": 0.98, + "grad_norm": 1.4654030533192521, + "learning_rate": 2.5929340077816e-08, + "loss": 0.7619, + "step": 6068 + }, + { + "epoch": 0.98, + "grad_norm": 1.5345204495134486, + "learning_rate": 2.555507364639831e-08, + "loss": 0.7735, + "step": 6069 + }, + { + "epoch": 0.98, + "grad_norm": 1.3764229274509767, + "learning_rate": 2.5183524528725744e-08, + "loss": 0.7974, + "step": 6070 + }, + { + "epoch": 0.98, + "grad_norm": 1.3556201957395027, + "learning_rate": 2.481469282601845e-08, + "loss": 0.7894, + "step": 6071 + }, + { + "epoch": 0.98, + "grad_norm": 1.8722287818885825, + "learning_rate": 2.444857863875605e-08, + "loss": 0.7785, + "step": 6072 + }, + { + "epoch": 0.98, + "grad_norm": 1.3450157101216205, + "learning_rate": 2.408518206667876e-08, + "loss": 0.831, + "step": 6073 + }, + { + "epoch": 0.98, + "grad_norm": 1.3797760704152304, + "learning_rate": 2.3724503208786276e-08, + "loss": 0.7917, + "step": 6074 + }, + { + "epoch": 0.98, + "grad_norm": 1.4672671693026282, + "learning_rate": 2.3366542163336668e-08, + "loss": 0.7926, + "step": 6075 + }, + { + "epoch": 0.98, + "grad_norm": 1.4267259107256909, + "learning_rate": 2.3011299027847488e-08, + "loss": 0.8005, + "step": 6076 + }, + { + "epoch": 0.98, + "grad_norm": 1.4567062760819132, + "learning_rate": 2.2658773899097986e-08, + "loss": 0.7699, + "step": 6077 + }, + { + "epoch": 0.98, + "grad_norm": 1.5142317154996165, + "learning_rate": 2.230896687312578e-08, + "loss": 0.8172, + "step": 6078 + }, + { + "epoch": 0.98, + "grad_norm": 1.5882842378405284, + "learning_rate": 2.196187804522798e-08, + "loss": 0.8247, + "step": 6079 + }, + { + "epoch": 0.98, + "grad_norm": 1.4677942392546506, + "learning_rate": 2.1617507509960058e-08, + "loss": 0.7849, + "step": 6080 + }, + { + "epoch": 0.98, + "grad_norm": 1.4774514970971258, + "learning_rate": 2.1275855361140297e-08, + "loss": 0.7637, + "step": 6081 + }, + { + "epoch": 0.98, + "grad_norm": 1.3527214781113057, + "learning_rate": 2.0936921691842028e-08, + "loss": 0.7576, + "step": 6082 + }, + { + "epoch": 0.98, + "grad_norm": 1.4375647181734748, + "learning_rate": 2.0600706594400278e-08, + "loss": 0.7837, + "step": 6083 + }, + { + "epoch": 0.98, + "grad_norm": 1.5067429893984299, + "learning_rate": 2.0267210160409557e-08, + "loss": 0.8706, + "step": 6084 + }, + { + "epoch": 0.98, + "grad_norm": 1.3868167916415504, + "learning_rate": 1.9936432480723854e-08, + "loss": 0.8293, + "step": 6085 + }, + { + "epoch": 0.98, + "grad_norm": 1.5452650800589995, + "learning_rate": 1.9608373645456648e-08, + "loss": 0.8111, + "step": 6086 + }, + { + "epoch": 0.98, + "grad_norm": 1.3868176277262179, + "learning_rate": 1.9283033743978663e-08, + "loss": 0.7749, + "step": 6087 + }, + { + "epoch": 0.98, + "grad_norm": 1.2735195250510247, + "learning_rate": 1.896041286492345e-08, + "loss": 0.7893, + "step": 6088 + }, + { + "epoch": 0.98, + "grad_norm": 1.4559457163774372, + "learning_rate": 1.86405110961807e-08, + "loss": 0.808, + "step": 6089 + }, + { + "epoch": 0.98, + "grad_norm": 1.3190427658426584, + "learning_rate": 1.8323328524899597e-08, + "loss": 0.8573, + "step": 6090 + }, + { + "epoch": 0.98, + "grad_norm": 1.4582286151402442, + "learning_rate": 1.8008865237491014e-08, + "loss": 0.7686, + "step": 6091 + }, + { + "epoch": 0.98, + "grad_norm": 1.3004729310777106, + "learning_rate": 1.7697121319621978e-08, + "loss": 0.8467, + "step": 6092 + }, + { + "epoch": 0.98, + "grad_norm": 1.405625089237877, + "learning_rate": 1.7388096856221227e-08, + "loss": 0.8259, + "step": 6093 + }, + { + "epoch": 0.98, + "grad_norm": 1.2260298377192678, + "learning_rate": 1.7081791931475855e-08, + "loss": 0.7753, + "step": 6094 + }, + { + "epoch": 0.98, + "grad_norm": 1.4916007551046446, + "learning_rate": 1.677820662883134e-08, + "loss": 0.8167, + "step": 6095 + }, + { + "epoch": 0.98, + "grad_norm": 1.6494424805076153, + "learning_rate": 1.647734103099152e-08, + "loss": 0.8343, + "step": 6096 + }, + { + "epoch": 0.98, + "grad_norm": 1.2326458864916343, + "learning_rate": 1.6179195219921952e-08, + "loss": 0.8227, + "step": 6097 + }, + { + "epoch": 0.98, + "grad_norm": 1.350790440680907, + "learning_rate": 1.588376927684432e-08, + "loss": 0.7803, + "step": 6098 + }, + { + "epoch": 0.98, + "grad_norm": 1.2189293203724412, + "learning_rate": 1.5591063282242026e-08, + "loss": 0.8056, + "step": 6099 + }, + { + "epoch": 0.98, + "grad_norm": 1.4244769701920426, + "learning_rate": 1.530107731585684e-08, + "loss": 0.811, + "step": 6100 + }, + { + "epoch": 0.98, + "grad_norm": 1.4914891727239066, + "learning_rate": 1.5013811456687787e-08, + "loss": 0.7379, + "step": 6101 + }, + { + "epoch": 0.98, + "grad_norm": 1.4561880896162003, + "learning_rate": 1.4729265782993384e-08, + "loss": 0.7709, + "step": 6102 + }, + { + "epoch": 0.98, + "grad_norm": 1.243431717256865, + "learning_rate": 1.4447440372292732e-08, + "loss": 0.7952, + "step": 6103 + }, + { + "epoch": 0.98, + "grad_norm": 1.446095102016903, + "learning_rate": 1.41683353013633e-08, + "loss": 0.7761, + "step": 6104 + }, + { + "epoch": 0.98, + "grad_norm": 1.397530094170945, + "learning_rate": 1.3891950646239827e-08, + "loss": 0.8049, + "step": 6105 + }, + { + "epoch": 0.98, + "grad_norm": 1.3857259951042646, + "learning_rate": 1.3618286482218745e-08, + "loss": 0.797, + "step": 6106 + }, + { + "epoch": 0.98, + "grad_norm": 1.1867196246615432, + "learning_rate": 1.3347342883851532e-08, + "loss": 0.8376, + "step": 6107 + }, + { + "epoch": 0.98, + "grad_norm": 1.4819705336364355, + "learning_rate": 1.3079119924952477e-08, + "loss": 0.8393, + "step": 6108 + }, + { + "epoch": 0.98, + "grad_norm": 1.2937477896735987, + "learning_rate": 1.2813617678592017e-08, + "loss": 0.6799, + "step": 6109 + }, + { + "epoch": 0.98, + "grad_norm": 1.2776305182263576, + "learning_rate": 1.2550836217101182e-08, + "loss": 0.7636, + "step": 6110 + }, + { + "epoch": 0.98, + "grad_norm": 1.6383007320360798, + "learning_rate": 1.2290775612067151e-08, + "loss": 0.7967, + "step": 6111 + }, + { + "epoch": 0.98, + "grad_norm": 1.4671573405522675, + "learning_rate": 1.2033435934338811e-08, + "loss": 0.8732, + "step": 6112 + }, + { + "epoch": 0.98, + "grad_norm": 0.7301246309342323, + "learning_rate": 1.1778817254022301e-08, + "loss": 0.3319, + "step": 6113 + }, + { + "epoch": 0.99, + "grad_norm": 1.4879643611008029, + "learning_rate": 1.1526919640483247e-08, + "loss": 0.8405, + "step": 6114 + }, + { + "epoch": 0.99, + "grad_norm": 0.9903820302078314, + "learning_rate": 1.1277743162345644e-08, + "loss": 0.3122, + "step": 6115 + }, + { + "epoch": 0.99, + "grad_norm": 1.3711695296927295, + "learning_rate": 1.103128788749075e-08, + "loss": 0.8504, + "step": 6116 + }, + { + "epoch": 0.99, + "grad_norm": 1.5351068195121127, + "learning_rate": 1.0787553883061519e-08, + "loss": 0.8389, + "step": 6117 + }, + { + "epoch": 0.99, + "grad_norm": 1.4588913881526042, + "learning_rate": 1.0546541215455952e-08, + "loss": 0.8379, + "step": 6118 + }, + { + "epoch": 0.99, + "grad_norm": 1.8925496498815968, + "learning_rate": 1.0308249950333749e-08, + "loss": 0.8005, + "step": 6119 + }, + { + "epoch": 0.99, + "grad_norm": 1.5061494545400904, + "learning_rate": 1.0072680152611869e-08, + "loss": 0.7942, + "step": 6120 + }, + { + "epoch": 0.99, + "grad_norm": 0.8543378478037552, + "learning_rate": 9.839831886465644e-09, + "loss": 0.3638, + "step": 6121 + }, + { + "epoch": 0.99, + "grad_norm": 1.331011318751303, + "learning_rate": 9.60970521532878e-09, + "loss": 0.8169, + "step": 6122 + }, + { + "epoch": 0.99, + "grad_norm": 1.2828089124487685, + "learning_rate": 9.382300201896676e-09, + "loss": 0.7759, + "step": 6123 + }, + { + "epoch": 0.99, + "grad_norm": 0.8686874520860466, + "learning_rate": 9.157616908117562e-09, + "loss": 0.3333, + "step": 6124 + }, + { + "epoch": 0.99, + "grad_norm": 1.3888901218435323, + "learning_rate": 8.935655395203579e-09, + "loss": 0.7946, + "step": 6125 + }, + { + "epoch": 0.99, + "grad_norm": 1.4573400741590914, + "learning_rate": 8.716415723621918e-09, + "loss": 0.7958, + "step": 6126 + }, + { + "epoch": 0.99, + "grad_norm": 1.483994570719936, + "learning_rate": 8.499897953100355e-09, + "loss": 0.7756, + "step": 6127 + }, + { + "epoch": 0.99, + "grad_norm": 1.8224020392741154, + "learning_rate": 8.28610214262393e-09, + "loss": 0.829, + "step": 6128 + }, + { + "epoch": 0.99, + "grad_norm": 1.262798375189001, + "learning_rate": 8.075028350436054e-09, + "loss": 0.783, + "step": 6129 + }, + { + "epoch": 0.99, + "grad_norm": 1.1642630087575414, + "learning_rate": 7.866676634039617e-09, + "loss": 0.824, + "step": 6130 + }, + { + "epoch": 0.99, + "grad_norm": 1.5237177737718797, + "learning_rate": 7.66104705019588e-09, + "loss": 0.858, + "step": 6131 + }, + { + "epoch": 0.99, + "grad_norm": 1.4149524011921286, + "learning_rate": 7.45813965492337e-09, + "loss": 0.7364, + "step": 6132 + }, + { + "epoch": 0.99, + "grad_norm": 1.3735161572175374, + "learning_rate": 7.257954503498976e-09, + "loss": 0.8095, + "step": 6133 + }, + { + "epoch": 0.99, + "grad_norm": 0.8473921839141729, + "learning_rate": 7.060491650459078e-09, + "loss": 0.3238, + "step": 6134 + }, + { + "epoch": 0.99, + "grad_norm": 1.4612953103579018, + "learning_rate": 6.8657511495984205e-09, + "loss": 0.8739, + "step": 6135 + }, + { + "epoch": 0.99, + "grad_norm": 1.3094344299731606, + "learning_rate": 6.673733053970122e-09, + "loss": 0.7723, + "step": 6136 + }, + { + "epoch": 0.99, + "grad_norm": 1.3434094348142434, + "learning_rate": 6.4844374158834485e-09, + "loss": 0.8265, + "step": 6137 + }, + { + "epoch": 0.99, + "grad_norm": 1.3199177337354602, + "learning_rate": 6.297864286910482e-09, + "loss": 0.7932, + "step": 6138 + }, + { + "epoch": 0.99, + "grad_norm": 1.4078135364853601, + "learning_rate": 6.114013717876121e-09, + "loss": 0.8214, + "step": 6139 + }, + { + "epoch": 0.99, + "grad_norm": 1.4473654201659836, + "learning_rate": 5.9328857588680785e-09, + "loss": 0.8276, + "step": 6140 + }, + { + "epoch": 0.99, + "grad_norm": 1.319312787929375, + "learning_rate": 5.754480459229106e-09, + "loss": 0.7507, + "step": 6141 + }, + { + "epoch": 0.99, + "grad_norm": 1.2457349748029742, + "learning_rate": 5.5787978675636566e-09, + "loss": 0.7601, + "step": 6142 + }, + { + "epoch": 0.99, + "grad_norm": 1.3780636201892504, + "learning_rate": 5.405838031731225e-09, + "loss": 0.8638, + "step": 6143 + }, + { + "epoch": 0.99, + "grad_norm": 1.8567096917267634, + "learning_rate": 5.235600998850787e-09, + "loss": 0.8195, + "step": 6144 + }, + { + "epoch": 0.99, + "grad_norm": 1.2323067013536686, + "learning_rate": 5.068086815300799e-09, + "loss": 0.8351, + "step": 6145 + }, + { + "epoch": 0.99, + "grad_norm": 0.827715935755992, + "learning_rate": 4.9032955267158675e-09, + "loss": 0.3583, + "step": 6146 + }, + { + "epoch": 0.99, + "grad_norm": 1.5427901550944183, + "learning_rate": 4.741227177988972e-09, + "loss": 0.7252, + "step": 6147 + }, + { + "epoch": 0.99, + "grad_norm": 1.3712506707102023, + "learning_rate": 4.581881813272571e-09, + "loss": 0.8916, + "step": 6148 + }, + { + "epoch": 0.99, + "grad_norm": 1.4310982562815822, + "learning_rate": 4.4252594759774945e-09, + "loss": 0.8746, + "step": 6149 + }, + { + "epoch": 0.99, + "grad_norm": 1.5608320727557936, + "learning_rate": 4.271360208770725e-09, + "loss": 0.794, + "step": 6150 + }, + { + "epoch": 0.99, + "grad_norm": 1.4210714660898054, + "learning_rate": 4.120184053579834e-09, + "loss": 0.8428, + "step": 6151 + }, + { + "epoch": 0.99, + "grad_norm": 1.4890500233258597, + "learning_rate": 3.971731051588545e-09, + "loss": 0.7812, + "step": 6152 + }, + { + "epoch": 0.99, + "grad_norm": 1.2871833329232851, + "learning_rate": 3.826001243240063e-09, + "loss": 0.8156, + "step": 6153 + }, + { + "epoch": 0.99, + "grad_norm": 1.328261716201147, + "learning_rate": 3.682994668234852e-09, + "loss": 0.7434, + "step": 6154 + }, + { + "epoch": 0.99, + "grad_norm": 1.4325254198988706, + "learning_rate": 3.542711365531748e-09, + "loss": 0.8456, + "step": 6155 + }, + { + "epoch": 0.99, + "grad_norm": 1.3260906999433153, + "learning_rate": 3.405151373347959e-09, + "loss": 0.7743, + "step": 6156 + }, + { + "epoch": 0.99, + "grad_norm": 1.3132202508138637, + "learning_rate": 3.270314729159063e-09, + "loss": 0.887, + "step": 6157 + }, + { + "epoch": 0.99, + "grad_norm": 1.3073780337540433, + "learning_rate": 3.138201469697899e-09, + "loss": 0.7788, + "step": 6158 + }, + { + "epoch": 0.99, + "grad_norm": 1.2896284566045197, + "learning_rate": 3.008811630955677e-09, + "loss": 0.7936, + "step": 6159 + }, + { + "epoch": 0.99, + "grad_norm": 1.5002828842883926, + "learning_rate": 2.882145248181978e-09, + "loss": 0.787, + "step": 6160 + }, + { + "epoch": 0.99, + "grad_norm": 1.2962109469921965, + "learning_rate": 2.7582023558847537e-09, + "loss": 0.8232, + "step": 6161 + }, + { + "epoch": 0.99, + "grad_norm": 1.5180957910363062, + "learning_rate": 2.6369829878281074e-09, + "loss": 0.8467, + "step": 6162 + }, + { + "epoch": 0.99, + "grad_norm": 1.7372282031733024, + "learning_rate": 2.5184871770356224e-09, + "loss": 0.7418, + "step": 6163 + }, + { + "epoch": 0.99, + "grad_norm": 1.4654194714554771, + "learning_rate": 2.4027149557903638e-09, + "loss": 0.8389, + "step": 6164 + }, + { + "epoch": 0.99, + "grad_norm": 1.3683123593280433, + "learning_rate": 2.2896663556304378e-09, + "loss": 0.7355, + "step": 6165 + }, + { + "epoch": 0.99, + "grad_norm": 1.3456378032986156, + "learning_rate": 2.1793414073545407e-09, + "loss": 0.8504, + "step": 6166 + }, + { + "epoch": 0.99, + "grad_norm": 1.5512644509822757, + "learning_rate": 2.0717401410164097e-09, + "loss": 0.8361, + "step": 6167 + }, + { + "epoch": 0.99, + "grad_norm": 1.389581947451672, + "learning_rate": 1.9668625859314838e-09, + "loss": 0.8439, + "step": 6168 + }, + { + "epoch": 0.99, + "grad_norm": 1.3836868345287536, + "learning_rate": 1.8647087706702425e-09, + "loss": 0.9227, + "step": 6169 + }, + { + "epoch": 0.99, + "grad_norm": 1.3243755069156071, + "learning_rate": 1.7652787230637569e-09, + "loss": 0.8093, + "step": 6170 + }, + { + "epoch": 0.99, + "grad_norm": 1.8334673913570718, + "learning_rate": 1.6685724701970274e-09, + "loss": 0.742, + "step": 6171 + }, + { + "epoch": 0.99, + "grad_norm": 1.3236095326788275, + "learning_rate": 1.5745900384167567e-09, + "loss": 0.8124, + "step": 6172 + }, + { + "epoch": 0.99, + "grad_norm": 1.6421420406379479, + "learning_rate": 1.4833314533269084e-09, + "loss": 0.7681, + "step": 6173 + }, + { + "epoch": 0.99, + "grad_norm": 1.4591228363232178, + "learning_rate": 1.3947967397887064e-09, + "loss": 0.795, + "step": 6174 + }, + { + "epoch": 0.99, + "grad_norm": 1.3895153416197268, + "learning_rate": 1.3089859219195256e-09, + "loss": 0.8513, + "step": 6175 + }, + { + "epoch": 1.0, + "grad_norm": 1.3891368133246338, + "learning_rate": 1.2258990230995527e-09, + "loss": 0.8113, + "step": 6176 + }, + { + "epoch": 1.0, + "grad_norm": 1.291173652580304, + "learning_rate": 1.1455360659617942e-09, + "loss": 0.8816, + "step": 6177 + }, + { + "epoch": 1.0, + "grad_norm": 1.4979692610587292, + "learning_rate": 1.0678970724009585e-09, + "loss": 0.7963, + "step": 6178 + }, + { + "epoch": 1.0, + "grad_norm": 1.3732226050397678, + "learning_rate": 9.92982063565684e-10, + "loss": 0.8813, + "step": 6179 + }, + { + "epoch": 1.0, + "grad_norm": 1.674897591152258, + "learning_rate": 9.207910598674208e-10, + "loss": 0.8686, + "step": 6180 + }, + { + "epoch": 1.0, + "grad_norm": 0.8966911187644984, + "learning_rate": 8.513240809715495e-10, + "loss": 0.3084, + "step": 6181 + }, + { + "epoch": 1.0, + "grad_norm": 1.4861555870307426, + "learning_rate": 7.845811458029317e-10, + "loss": 0.8592, + "step": 6182 + }, + { + "epoch": 1.0, + "grad_norm": 0.852847697792143, + "learning_rate": 7.205622725448003e-10, + "loss": 0.3285, + "step": 6183 + }, + { + "epoch": 1.0, + "grad_norm": 1.3861945867513388, + "learning_rate": 6.592674786376485e-10, + "loss": 0.8363, + "step": 6184 + }, + { + "epoch": 1.0, + "grad_norm": 1.5794724339711894, + "learning_rate": 6.006967807781206e-10, + "loss": 0.8016, + "step": 6185 + }, + { + "epoch": 1.0, + "grad_norm": 1.5225244295364593, + "learning_rate": 5.448501949256724e-10, + "loss": 0.7794, + "step": 6186 + }, + { + "epoch": 1.0, + "grad_norm": 1.2166138736397747, + "learning_rate": 4.917277362914696e-10, + "loss": 0.7963, + "step": 6187 + }, + { + "epoch": 1.0, + "grad_norm": 1.2835831478607436, + "learning_rate": 4.413294193483797e-10, + "loss": 0.7081, + "step": 6188 + }, + { + "epoch": 1.0, + "grad_norm": 1.6787691466850847, + "learning_rate": 3.936552578276409e-10, + "loss": 0.8822, + "step": 6189 + }, + { + "epoch": 1.0, + "grad_norm": 1.394811230156793, + "learning_rate": 3.48705264715532e-10, + "loss": 0.8101, + "step": 6190 + }, + { + "epoch": 1.0, + "grad_norm": 1.5505917310816706, + "learning_rate": 3.0647945225781294e-10, + "loss": 0.8211, + "step": 6191 + }, + { + "epoch": 1.0, + "grad_norm": 1.6743771874917042, + "learning_rate": 2.669778319586147e-10, + "loss": 0.7956, + "step": 6192 + }, + { + "epoch": 1.0, + "grad_norm": 1.4424851500042852, + "learning_rate": 2.3020041457821885e-10, + "loss": 0.8436, + "step": 6193 + }, + { + "epoch": 1.0, + "grad_norm": 1.7040445148789318, + "learning_rate": 1.9614721013749838e-10, + "loss": 0.825, + "step": 6194 + }, + { + "epoch": 1.0, + "grad_norm": 1.6273866673422035, + "learning_rate": 1.6481822791125646e-10, + "loss": 0.8281, + "step": 6195 + }, + { + "epoch": 1.0, + "grad_norm": 1.3892537432422891, + "learning_rate": 1.3621347643710814e-10, + "loss": 0.8789, + "step": 6196 + }, + { + "epoch": 1.0, + "grad_norm": 1.5215961491144232, + "learning_rate": 1.1033296350548839e-10, + "loss": 0.7913, + "step": 6197 + }, + { + "epoch": 1.0, + "grad_norm": 1.6205482070520882, + "learning_rate": 8.717669616742364e-11, + "loss": 0.8448, + "step": 6198 + }, + { + "epoch": 1.0, + "grad_norm": 1.3315640375465965, + "learning_rate": 6.674468073231133e-11, + "loss": 0.7669, + "step": 6199 + }, + { + "epoch": 1.0, + "grad_norm": 1.5160521220035885, + "learning_rate": 4.903692276569949e-11, + "loss": 0.842, + "step": 6200 + }, + { + "epoch": 1.0, + "grad_norm": 1.376235957430811, + "learning_rate": 3.405342709150716e-11, + "loss": 0.8409, + "step": 6201 + }, + { + "epoch": 1.0, + "grad_norm": 1.4748114213775856, + "learning_rate": 2.1794197790914183e-11, + "loss": 0.8638, + "step": 6202 + }, + { + "epoch": 1.0, + "grad_norm": 1.2362651379526777, + "learning_rate": 1.2259238205691858e-11, + "loss": 0.8505, + "step": 6203 + }, + { + "epoch": 1.0, + "grad_norm": 1.4117071250211006, + "learning_rate": 5.4485509326518415e-12, + "loss": 0.8824, + "step": 6204 + }, + { + "epoch": 1.0, + "grad_norm": 1.2812310641186235, + "learning_rate": 1.362137825866583e-12, + "loss": 0.7853, + "step": 6205 + }, + { + "epoch": 1.0, + "grad_norm": 1.8623132640479105, + "learning_rate": 0.0, + "loss": 0.4962, + "step": 6206 + }, + { + "epoch": 1.0, + "step": 6206, + "total_flos": 2.654469138789748e+19, + "train_loss": 0.8537314998750106, + "train_runtime": 50545.8721, + "train_samples_per_second": 15.717, + "train_steps_per_second": 0.123 + } + ], + "logging_steps": 1.0, + "max_steps": 6206, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 20000, + "total_flos": 2.654469138789748e+19, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}