|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 3.9423076923076925, |
|
"eval_steps": 500, |
|
"global_step": 468, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 26.249136076838056, |
|
"learning_rate": 5.000000000000001e-07, |
|
"loss": 1.1102, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 26.110264576346324, |
|
"learning_rate": 1.0000000000000002e-06, |
|
"loss": 1.0997, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 25.94374396489923, |
|
"learning_rate": 1.5e-06, |
|
"loss": 1.1239, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 22.37475129701464, |
|
"learning_rate": 2.0000000000000003e-06, |
|
"loss": 1.1059, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 16.25191401266457, |
|
"learning_rate": 2.5e-06, |
|
"loss": 1.0405, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 22.679130338310745, |
|
"learning_rate": 3e-06, |
|
"loss": 1.0213, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 15.916959043580418, |
|
"learning_rate": 3.5e-06, |
|
"loss": 1.0104, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 9.482985220910942, |
|
"learning_rate": 4.000000000000001e-06, |
|
"loss": 0.9937, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 6.965472899289267, |
|
"learning_rate": 4.5e-06, |
|
"loss": 0.9693, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 12.298531856803667, |
|
"learning_rate": 5e-06, |
|
"loss": 0.962, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 6.569727331935135, |
|
"learning_rate": 4.999941186489917e-06, |
|
"loss": 0.9374, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 5.377638735854951, |
|
"learning_rate": 4.999764748726891e-06, |
|
"loss": 0.9315, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 4.601213320961148, |
|
"learning_rate": 4.999470695012462e-06, |
|
"loss": 0.9317, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 6.018739227425751, |
|
"learning_rate": 4.999059039182093e-06, |
|
"loss": 0.9246, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 5.719715824372754, |
|
"learning_rate": 4.998529800604525e-06, |
|
"loss": 0.9136, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 3.9341753543248505, |
|
"learning_rate": 4.99788300418086e-06, |
|
"loss": 0.9221, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 4.060460637107745, |
|
"learning_rate": 4.997118680343392e-06, |
|
"loss": 0.9021, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 3.576152525840241, |
|
"learning_rate": 4.996236865054177e-06, |
|
"loss": 0.8804, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 3.2384463037464157, |
|
"learning_rate": 4.995237599803336e-06, |
|
"loss": 0.8864, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 2.961735655562835, |
|
"learning_rate": 4.994120931607106e-06, |
|
"loss": 0.884, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 3.715236432103334, |
|
"learning_rate": 4.992886913005628e-06, |
|
"loss": 0.8803, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 2.526746110251263, |
|
"learning_rate": 4.991535602060475e-06, |
|
"loss": 0.8856, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 2.663980149991846, |
|
"learning_rate": 4.9900670623519185e-06, |
|
"loss": 0.8613, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 2.4521755603079787, |
|
"learning_rate": 4.988481362975939e-06, |
|
"loss": 0.8692, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 2.414349491574281, |
|
"learning_rate": 4.986778578540973e-06, |
|
"loss": 0.8602, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 2.8508832900360477, |
|
"learning_rate": 4.984958789164404e-06, |
|
"loss": 0.8526, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 3.196516977267481, |
|
"learning_rate": 4.983022080468794e-06, |
|
"loss": 0.8416, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 3.6095238921057553, |
|
"learning_rate": 4.980968543577849e-06, |
|
"loss": 0.8477, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 2.2205063108737506, |
|
"learning_rate": 4.978798275112142e-06, |
|
"loss": 0.8502, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 4.453254154090838, |
|
"learning_rate": 4.976511377184557e-06, |
|
"loss": 0.8468, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 2.9249988861975766, |
|
"learning_rate": 4.97410795739549e-06, |
|
"loss": 0.8391, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 4.462696000038617, |
|
"learning_rate": 4.971588128827783e-06, |
|
"loss": 0.8436, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 3.587837288732032, |
|
"learning_rate": 4.968952010041408e-06, |
|
"loss": 0.8564, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 4.2433648029783635, |
|
"learning_rate": 4.966199725067883e-06, |
|
"loss": 0.8501, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 3.55480062205993, |
|
"learning_rate": 4.96333140340444e-06, |
|
"loss": 0.8474, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 3.430958497804354, |
|
"learning_rate": 4.960347180007932e-06, |
|
"loss": 0.8343, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 3.0559248054083428, |
|
"learning_rate": 4.957247195288479e-06, |
|
"loss": 0.8358, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 3.2032433802521147, |
|
"learning_rate": 4.9540315951028695e-06, |
|
"loss": 0.8538, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 2.757473867230571, |
|
"learning_rate": 4.9507005307476894e-06, |
|
"loss": 0.8488, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 3.045305839090202, |
|
"learning_rate": 4.947254158952209e-06, |
|
"loss": 0.8463, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 2.9629285351308554, |
|
"learning_rate": 4.943692641871005e-06, |
|
"loss": 0.828, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 2.7499225905634037, |
|
"learning_rate": 4.940016147076337e-06, |
|
"loss": 0.835, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 2.606092501631258, |
|
"learning_rate": 4.9362248475502515e-06, |
|
"loss": 0.8269, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 2.674180949875197, |
|
"learning_rate": 4.932318921676458e-06, |
|
"loss": 0.8417, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 2.7285602808097336, |
|
"learning_rate": 4.928298553231924e-06, |
|
"loss": 0.8142, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 2.7015909165553387, |
|
"learning_rate": 4.924163931378233e-06, |
|
"loss": 0.8323, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 2.5415943230444498, |
|
"learning_rate": 4.919915250652686e-06, |
|
"loss": 0.8244, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 2.776100967798618, |
|
"learning_rate": 4.9155527109591435e-06, |
|
"loss": 0.8516, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 2.711552634387251, |
|
"learning_rate": 4.911076517558623e-06, |
|
"loss": 0.8313, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 2.905341213972799, |
|
"learning_rate": 4.906486881059641e-06, |
|
"loss": 0.827, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 2.6533492618702206, |
|
"learning_rate": 4.901784017408303e-06, |
|
"loss": 0.8298, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 2.3477688431170414, |
|
"learning_rate": 4.896968147878146e-06, |
|
"loss": 0.8014, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 2.8384104093830587, |
|
"learning_rate": 4.892039499059721e-06, |
|
"loss": 0.8116, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 2.432850021289229, |
|
"learning_rate": 4.886998302849938e-06, |
|
"loss": 0.8156, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 2.804790940572451, |
|
"learning_rate": 4.881844796441153e-06, |
|
"loss": 0.8159, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 2.3348703819353926, |
|
"learning_rate": 4.876579222310007e-06, |
|
"loss": 0.8096, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 3.1901608706880134, |
|
"learning_rate": 4.8712018282060165e-06, |
|
"loss": 0.811, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 2.660868627279477, |
|
"learning_rate": 4.86571286713992e-06, |
|
"loss": 0.818, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 2.7869506219362514, |
|
"learning_rate": 4.860112597371772e-06, |
|
"loss": 0.8267, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 2.3781426665180727, |
|
"learning_rate": 4.85440128239879e-06, |
|
"loss": 0.8112, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 3.015345568347538, |
|
"learning_rate": 4.8485791909429575e-06, |
|
"loss": 0.8151, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 2.4423049063633546, |
|
"learning_rate": 4.842646596938383e-06, |
|
"loss": 0.8282, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 2.6311610579830345, |
|
"learning_rate": 4.8366037795184086e-06, |
|
"loss": 0.8224, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 2.2899705695712282, |
|
"learning_rate": 4.830451023002477e-06, |
|
"loss": 0.8249, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 2.734019031576332, |
|
"learning_rate": 4.824188616882754e-06, |
|
"loss": 0.8136, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 2.0962707496285153, |
|
"learning_rate": 4.817816855810507e-06, |
|
"loss": 0.8184, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 2.9665475220491966, |
|
"learning_rate": 4.811336039582244e-06, |
|
"loss": 0.817, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 2.3700279096809824, |
|
"learning_rate": 4.804746473125605e-06, |
|
"loss": 0.81, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 3.027450051648286, |
|
"learning_rate": 4.798048466485018e-06, |
|
"loss": 0.8219, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 2.752180087601986, |
|
"learning_rate": 4.791242334807106e-06, |
|
"loss": 0.8101, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 2.573031098084336, |
|
"learning_rate": 4.784328398325866e-06, |
|
"loss": 0.8099, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 2.7123062400387576, |
|
"learning_rate": 4.7773069823475945e-06, |
|
"loss": 0.8088, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 2.474407154924412, |
|
"learning_rate": 4.770178417235589e-06, |
|
"loss": 0.8232, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 2.4442790557938965, |
|
"learning_rate": 4.762943038394597e-06, |
|
"loss": 0.8051, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 2.1553005222889583, |
|
"learning_rate": 4.755601186255041e-06, |
|
"loss": 0.825, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 2.312089317706644, |
|
"learning_rate": 4.7481532062569945e-06, |
|
"loss": 0.8168, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 2.2731409969922693, |
|
"learning_rate": 4.7405994488339375e-06, |
|
"loss": 0.8095, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 2.444830999943097, |
|
"learning_rate": 4.732940269396259e-06, |
|
"loss": 0.8108, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 2.315315922207351, |
|
"learning_rate": 4.725176028314541e-06, |
|
"loss": 0.8072, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 2.231139324893734, |
|
"learning_rate": 4.7173070909026015e-06, |
|
"loss": 0.8093, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 2.3706185711850956, |
|
"learning_rate": 4.7093338274003035e-06, |
|
"loss": 0.8011, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 2.2002695782644905, |
|
"learning_rate": 4.701256612956137e-06, |
|
"loss": 0.8136, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 2.704162588410486, |
|
"learning_rate": 4.693075827609569e-06, |
|
"loss": 0.8161, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 1.9269420606373808, |
|
"learning_rate": 4.684791856273161e-06, |
|
"loss": 0.8023, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 2.4468707403860037, |
|
"learning_rate": 4.676405088714458e-06, |
|
"loss": 0.8019, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 2.07255184003701, |
|
"learning_rate": 4.667915919537651e-06, |
|
"loss": 0.8155, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 2.2036583211359746, |
|
"learning_rate": 4.6593247481650105e-06, |
|
"loss": 0.8175, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 1.8140377607105893, |
|
"learning_rate": 4.65063197881809e-06, |
|
"loss": 0.8047, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 2.1515853560068243, |
|
"learning_rate": 4.641838020498713e-06, |
|
"loss": 0.8185, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 2.422760543528869, |
|
"learning_rate": 4.632943286969724e-06, |
|
"loss": 0.8053, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 2.0829660459092776, |
|
"learning_rate": 4.6239481967355226e-06, |
|
"loss": 0.8016, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 2.1797861749114857, |
|
"learning_rate": 4.614853173022374e-06, |
|
"loss": 0.8068, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 2.6076944835409135, |
|
"learning_rate": 4.605658643758492e-06, |
|
"loss": 0.8121, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 1.8970468662695665, |
|
"learning_rate": 4.59636504155391e-06, |
|
"loss": 0.8146, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 2.195985971319255, |
|
"learning_rate": 4.586972803680119e-06, |
|
"loss": 0.7956, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 2.7905344729119324, |
|
"learning_rate": 4.577482372049503e-06, |
|
"loss": 0.7953, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 1.940825819357636, |
|
"learning_rate": 4.567894193194538e-06, |
|
"loss": 0.8047, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 1.7851389603760988, |
|
"learning_rate": 4.558208718246787e-06, |
|
"loss": 0.8105, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 2.4544053049998884, |
|
"learning_rate": 4.548426402915674e-06, |
|
"loss": 0.8007, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 2.1082597379545165, |
|
"learning_rate": 4.538547707467038e-06, |
|
"loss": 0.805, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 1.9122304736142002, |
|
"learning_rate": 4.528573096701484e-06, |
|
"loss": 0.8067, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 1.990095091929317, |
|
"learning_rate": 4.5185030399325085e-06, |
|
"loss": 0.8025, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 2.029120797101969, |
|
"learning_rate": 4.508338010964419e-06, |
|
"loss": 0.8051, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 1.9232157433946324, |
|
"learning_rate": 4.498078488070044e-06, |
|
"loss": 0.8078, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 2.026639315850326, |
|
"learning_rate": 4.4877249539682235e-06, |
|
"loss": 0.7957, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 2.321397841878117, |
|
"learning_rate": 4.477277895801105e-06, |
|
"loss": 0.8039, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 2.1058946605532656, |
|
"learning_rate": 4.466737805111218e-06, |
|
"loss": 0.7921, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 2.9367272096486814, |
|
"learning_rate": 4.456105177818345e-06, |
|
"loss": 0.7922, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 2.5699051489574605, |
|
"learning_rate": 4.445380514196192e-06, |
|
"loss": 0.7984, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 2.0519065594178003, |
|
"learning_rate": 4.434564318848851e-06, |
|
"loss": 0.784, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 2.809445653695483, |
|
"learning_rate": 4.423657100687051e-06, |
|
"loss": 0.7835, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 2.170793808104346, |
|
"learning_rate": 4.41265937290422e-06, |
|
"loss": 0.8039, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 2.612499453872831, |
|
"learning_rate": 4.401571652952338e-06, |
|
"loss": 0.8099, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 2.1820432279807718, |
|
"learning_rate": 4.390394462517589e-06, |
|
"loss": 0.7929, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 2.335653384387635, |
|
"learning_rate": 4.379128327495813e-06, |
|
"loss": 0.7999, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 1.9923005143051236, |
|
"learning_rate": 4.367773777967769e-06, |
|
"loss": 0.8123, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 2.1550829328561787, |
|
"learning_rate": 4.3563313481741855e-06, |
|
"loss": 0.7905, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 1.01, |
|
"grad_norm": 1.8608335678809116, |
|
"learning_rate": 4.344801576490631e-06, |
|
"loss": 0.8051, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 1.02, |
|
"grad_norm": 2.1245561127414923, |
|
"learning_rate": 4.3331850054021806e-06, |
|
"loss": 0.7979, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 1.01, |
|
"grad_norm": 3.0733225632670127, |
|
"learning_rate": 4.321482181477891e-06, |
|
"loss": 0.7712, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 1.01, |
|
"grad_norm": 3.2998040554493366, |
|
"learning_rate": 4.309693655345084e-06, |
|
"loss": 0.7523, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 1.02, |
|
"grad_norm": 3.0325218749275624, |
|
"learning_rate": 4.29781998166344e-06, |
|
"loss": 0.7591, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 1.03, |
|
"grad_norm": 3.696913587445061, |
|
"learning_rate": 4.2858617190989e-06, |
|
"loss": 0.7447, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 1.04, |
|
"grad_norm": 2.852432566856283, |
|
"learning_rate": 4.273819430297382e-06, |
|
"loss": 0.7557, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 1.05, |
|
"grad_norm": 4.684715882065128, |
|
"learning_rate": 4.261693681858306e-06, |
|
"loss": 0.7596, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 1.06, |
|
"grad_norm": 3.5092399399988343, |
|
"learning_rate": 4.2494850443079305e-06, |
|
"loss": 0.7467, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 1.07, |
|
"grad_norm": 3.851985488385657, |
|
"learning_rate": 4.237194092072521e-06, |
|
"loss": 0.7475, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 1.07, |
|
"grad_norm": 3.328297635562109, |
|
"learning_rate": 4.2248214034513114e-06, |
|
"loss": 0.7442, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 1.08, |
|
"grad_norm": 3.9102386032965373, |
|
"learning_rate": 4.212367560589299e-06, |
|
"loss": 0.7651, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 1.09, |
|
"grad_norm": 3.06457511968988, |
|
"learning_rate": 4.199833149449853e-06, |
|
"loss": 0.7418, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 1.1, |
|
"grad_norm": 3.2168853165359175, |
|
"learning_rate": 4.187218759787148e-06, |
|
"loss": 0.743, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 1.11, |
|
"grad_norm": 2.787806998778304, |
|
"learning_rate": 4.174524985118411e-06, |
|
"loss": 0.7583, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 1.12, |
|
"grad_norm": 3.9052093067187443, |
|
"learning_rate": 4.161752422695995e-06, |
|
"loss": 0.7657, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 1.13, |
|
"grad_norm": 3.0182785590623173, |
|
"learning_rate": 4.148901673479285e-06, |
|
"loss": 0.7362, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 1.13, |
|
"grad_norm": 3.344873463157497, |
|
"learning_rate": 4.135973342106416e-06, |
|
"loss": 0.7558, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 1.14, |
|
"grad_norm": 2.859789074330451, |
|
"learning_rate": 4.122968036865827e-06, |
|
"loss": 0.7486, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 1.15, |
|
"grad_norm": 3.228835212527119, |
|
"learning_rate": 4.109886369667636e-06, |
|
"loss": 0.7655, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 1.16, |
|
"grad_norm": 2.731179147211052, |
|
"learning_rate": 4.096728956014857e-06, |
|
"loss": 0.7528, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 1.17, |
|
"grad_norm": 2.652548225590943, |
|
"learning_rate": 4.083496414974434e-06, |
|
"loss": 0.7448, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 1.18, |
|
"grad_norm": 2.318475213880663, |
|
"learning_rate": 4.070189369148117e-06, |
|
"loss": 0.7577, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 1.19, |
|
"grad_norm": 2.63738412652813, |
|
"learning_rate": 4.056808444643163e-06, |
|
"loss": 0.7475, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 1.19, |
|
"grad_norm": 2.557947896210419, |
|
"learning_rate": 4.043354271042884e-06, |
|
"loss": 0.7368, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 1.2, |
|
"grad_norm": 2.555902551567185, |
|
"learning_rate": 4.02982748137702e-06, |
|
"loss": 0.7434, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 1.21, |
|
"grad_norm": 2.331216526916747, |
|
"learning_rate": 4.0162287120919545e-06, |
|
"loss": 0.7592, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 1.22, |
|
"grad_norm": 2.298359286001758, |
|
"learning_rate": 4.002558603020772e-06, |
|
"loss": 0.7407, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 1.23, |
|
"grad_norm": 2.2941501459555806, |
|
"learning_rate": 3.988817797353149e-06, |
|
"loss": 0.7534, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 1.24, |
|
"grad_norm": 2.195437955425272, |
|
"learning_rate": 3.975006941605099e-06, |
|
"loss": 0.7501, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 1.25, |
|
"grad_norm": 2.2354482392933583, |
|
"learning_rate": 3.961126685588541e-06, |
|
"loss": 0.7628, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 1.25, |
|
"grad_norm": 2.271851901257072, |
|
"learning_rate": 3.947177682380738e-06, |
|
"loss": 0.7559, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 1.26, |
|
"grad_norm": 2.06190688358819, |
|
"learning_rate": 3.933160588293564e-06, |
|
"loss": 0.7381, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 1.27, |
|
"grad_norm": 2.1082422406253145, |
|
"learning_rate": 3.9190760628426225e-06, |
|
"loss": 0.761, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 1.28, |
|
"grad_norm": 2.257445076263947, |
|
"learning_rate": 3.904924768716216e-06, |
|
"loss": 0.7339, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 1.29, |
|
"grad_norm": 2.0575089260504065, |
|
"learning_rate": 3.890707371744169e-06, |
|
"loss": 0.7481, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 1.3, |
|
"grad_norm": 2.2929887801738205, |
|
"learning_rate": 3.8764245408664964e-06, |
|
"loss": 0.7378, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 1.31, |
|
"grad_norm": 2.0516588845869244, |
|
"learning_rate": 3.862076948101934e-06, |
|
"loss": 0.7565, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 1.31, |
|
"grad_norm": 2.265288203536537, |
|
"learning_rate": 3.847665268516314e-06, |
|
"loss": 0.7489, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 1.32, |
|
"grad_norm": 2.3008837342067174, |
|
"learning_rate": 3.833190180190808e-06, |
|
"loss": 0.7387, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 1.33, |
|
"grad_norm": 2.303244798174658, |
|
"learning_rate": 3.818652364190018e-06, |
|
"loss": 0.7456, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 1.34, |
|
"grad_norm": 2.266318666547602, |
|
"learning_rate": 3.8040525045299337e-06, |
|
"loss": 0.7574, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 1.35, |
|
"grad_norm": 2.12834726687523, |
|
"learning_rate": 3.7893912881457505e-06, |
|
"loss": 0.7444, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 1.36, |
|
"grad_norm": 2.3349048004783968, |
|
"learning_rate": 3.7746694048595458e-06, |
|
"loss": 0.7576, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 1.37, |
|
"grad_norm": 2.3189890726535025, |
|
"learning_rate": 3.759887547347825e-06, |
|
"loss": 0.7661, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 1.37, |
|
"grad_norm": 2.4796894123137703, |
|
"learning_rate": 3.745046411108928e-06, |
|
"loss": 0.75, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 1.38, |
|
"grad_norm": 2.3870417794754446, |
|
"learning_rate": 3.730146694430308e-06, |
|
"loss": 0.7553, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 1.39, |
|
"grad_norm": 2.0568742731615752, |
|
"learning_rate": 3.7151890983556747e-06, |
|
"loss": 0.7493, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 1.4, |
|
"grad_norm": 2.4078734682098344, |
|
"learning_rate": 3.700174326652011e-06, |
|
"loss": 0.7413, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 1.41, |
|
"grad_norm": 2.1293229041387853, |
|
"learning_rate": 3.685103085776457e-06, |
|
"loss": 0.7467, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 1.42, |
|
"grad_norm": 2.117068360526989, |
|
"learning_rate": 3.6699760848430753e-06, |
|
"loss": 0.7396, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 1.43, |
|
"grad_norm": 1.9883016666638709, |
|
"learning_rate": 3.654794035589484e-06, |
|
"loss": 0.7491, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 1.43, |
|
"grad_norm": 2.2563683164117534, |
|
"learning_rate": 3.6395576523433672e-06, |
|
"loss": 0.7518, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 1.44, |
|
"grad_norm": 2.3064779080171744, |
|
"learning_rate": 3.6242676519888693e-06, |
|
"loss": 0.7565, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 1.45, |
|
"grad_norm": 1.9222922356745977, |
|
"learning_rate": 3.608924753932862e-06, |
|
"loss": 0.7353, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 1.46, |
|
"grad_norm": 2.2598801622305547, |
|
"learning_rate": 3.593529680071097e-06, |
|
"loss": 0.7466, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 1.47, |
|
"grad_norm": 2.0105134715596082, |
|
"learning_rate": 3.578083154754241e-06, |
|
"loss": 0.7427, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 1.48, |
|
"grad_norm": 2.482181765049527, |
|
"learning_rate": 3.5625859047537904e-06, |
|
"loss": 0.7531, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 1.49, |
|
"grad_norm": 1.989275137459208, |
|
"learning_rate": 3.547038659227881e-06, |
|
"loss": 0.7458, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 1.49, |
|
"grad_norm": 2.375903859138452, |
|
"learning_rate": 3.5314421496869777e-06, |
|
"loss": 0.7497, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 1.5, |
|
"grad_norm": 1.818443971455023, |
|
"learning_rate": 3.515797109959458e-06, |
|
"loss": 0.7416, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 1.51, |
|
"grad_norm": 2.1798526485495127, |
|
"learning_rate": 3.500104276157083e-06, |
|
"loss": 0.7477, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 1.52, |
|
"grad_norm": 1.8954837732640064, |
|
"learning_rate": 3.484364386640365e-06, |
|
"loss": 0.7511, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 1.53, |
|
"grad_norm": 2.1614293408819023, |
|
"learning_rate": 3.4685781819838233e-06, |
|
"loss": 0.7517, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 1.54, |
|
"grad_norm": 2.1506646824589497, |
|
"learning_rate": 3.452746404941143e-06, |
|
"loss": 0.7355, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 1.54, |
|
"grad_norm": 2.169292733116705, |
|
"learning_rate": 3.4368698004102284e-06, |
|
"loss": 0.74, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 1.55, |
|
"grad_norm": 2.418141938844683, |
|
"learning_rate": 3.420949115398151e-06, |
|
"loss": 0.7503, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 1.56, |
|
"grad_norm": 2.0599479007715886, |
|
"learning_rate": 3.404985098986007e-06, |
|
"loss": 0.7569, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 1.57, |
|
"grad_norm": 1.9525184362879087, |
|
"learning_rate": 3.388978502293666e-06, |
|
"loss": 0.7354, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 1.58, |
|
"grad_norm": 2.639288273379809, |
|
"learning_rate": 3.372930078444439e-06, |
|
"loss": 0.7567, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 1.59, |
|
"grad_norm": 2.0510480212606232, |
|
"learning_rate": 3.3568405825296355e-06, |
|
"loss": 0.7433, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 1.6, |
|
"grad_norm": 2.1578817124425735, |
|
"learning_rate": 3.34071077157304e-06, |
|
"loss": 0.7417, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 1.6, |
|
"grad_norm": 2.5268212959295893, |
|
"learning_rate": 3.3245414044952927e-06, |
|
"loss": 0.7485, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 1.61, |
|
"grad_norm": 2.159749201380735, |
|
"learning_rate": 3.308333242078179e-06, |
|
"loss": 0.7507, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 1.62, |
|
"grad_norm": 2.308409227797682, |
|
"learning_rate": 3.292087046928838e-06, |
|
"loss": 0.752, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 1.63, |
|
"grad_norm": 2.6229465595278616, |
|
"learning_rate": 3.2758035834438804e-06, |
|
"loss": 0.7348, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 1.64, |
|
"grad_norm": 2.055730524359484, |
|
"learning_rate": 3.2594836177734208e-06, |
|
"loss": 0.7421, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 1.65, |
|
"grad_norm": 2.0176570132314837, |
|
"learning_rate": 3.2431279177850317e-06, |
|
"loss": 0.7536, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 1.66, |
|
"grad_norm": 2.403883281288115, |
|
"learning_rate": 3.226737253027614e-06, |
|
"loss": 0.7279, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 1.66, |
|
"grad_norm": 2.000280638267734, |
|
"learning_rate": 3.210312394695189e-06, |
|
"loss": 0.7437, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 1.67, |
|
"grad_norm": 2.220127089726233, |
|
"learning_rate": 3.1938541155906146e-06, |
|
"loss": 0.7477, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 1.68, |
|
"grad_norm": 2.1887426336139875, |
|
"learning_rate": 3.177363190089221e-06, |
|
"loss": 0.7474, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 1.69, |
|
"grad_norm": 1.981302078482481, |
|
"learning_rate": 3.1608403941023793e-06, |
|
"loss": 0.7396, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 1.7, |
|
"grad_norm": 2.264086468679179, |
|
"learning_rate": 3.144286505040992e-06, |
|
"loss": 0.7391, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 1.71, |
|
"grad_norm": 2.125610702261824, |
|
"learning_rate": 3.1277023017789166e-06, |
|
"loss": 0.7449, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 1.72, |
|
"grad_norm": 2.167346271778115, |
|
"learning_rate": 3.111088564616317e-06, |
|
"loss": 0.741, |
|
"step": 203 |
|
}, |
|
{ |
|
"epoch": 1.72, |
|
"grad_norm": 2.375363723765046, |
|
"learning_rate": 3.094446075242952e-06, |
|
"loss": 0.7469, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 1.73, |
|
"grad_norm": 1.9942074170938597, |
|
"learning_rate": 3.0777756167013946e-06, |
|
"loss": 0.7431, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 1.74, |
|
"grad_norm": 2.146743351563113, |
|
"learning_rate": 3.0610779733501904e-06, |
|
"loss": 0.752, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 1.75, |
|
"grad_norm": 2.1525241003649174, |
|
"learning_rate": 3.044353930826952e-06, |
|
"loss": 0.7345, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 1.76, |
|
"grad_norm": 1.8638029080202694, |
|
"learning_rate": 3.0276042760113937e-06, |
|
"loss": 0.7343, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 1.77, |
|
"grad_norm": 1.8356317644401954, |
|
"learning_rate": 3.0108297969883105e-06, |
|
"loss": 0.7336, |
|
"step": 209 |
|
}, |
|
{ |
|
"epoch": 1.78, |
|
"grad_norm": 2.0787011804537583, |
|
"learning_rate": 2.9940312830104936e-06, |
|
"loss": 0.7505, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 1.78, |
|
"grad_norm": 1.8672332467108457, |
|
"learning_rate": 2.977209524461601e-06, |
|
"loss": 0.7479, |
|
"step": 211 |
|
}, |
|
{ |
|
"epoch": 1.79, |
|
"grad_norm": 2.0236026898201063, |
|
"learning_rate": 2.960365312818967e-06, |
|
"loss": 0.7563, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 1.8, |
|
"grad_norm": 1.9813232623431425, |
|
"learning_rate": 2.9434994406163574e-06, |
|
"loss": 0.7405, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 1.81, |
|
"grad_norm": 2.009874379662123, |
|
"learning_rate": 2.9266127014066905e-06, |
|
"loss": 0.7576, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 1.82, |
|
"grad_norm": 1.9182181733436605, |
|
"learning_rate": 2.9097058897246904e-06, |
|
"loss": 0.732, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 1.83, |
|
"grad_norm": 2.2713302836846836, |
|
"learning_rate": 2.8927798010495095e-06, |
|
"loss": 0.7557, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 1.84, |
|
"grad_norm": 1.8499354567551494, |
|
"learning_rate": 2.875835231767297e-06, |
|
"loss": 0.746, |
|
"step": 217 |
|
}, |
|
{ |
|
"epoch": 1.84, |
|
"grad_norm": 2.056857824508339, |
|
"learning_rate": 2.8588729791337298e-06, |
|
"loss": 0.7327, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 1.85, |
|
"grad_norm": 2.0366392068105768, |
|
"learning_rate": 2.8418938412365016e-06, |
|
"loss": 0.7649, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 1.86, |
|
"grad_norm": 2.075283194301643, |
|
"learning_rate": 2.8248986169577697e-06, |
|
"loss": 0.7498, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 1.87, |
|
"grad_norm": 1.8137465567608382, |
|
"learning_rate": 2.807888105936571e-06, |
|
"loss": 0.7426, |
|
"step": 221 |
|
}, |
|
{ |
|
"epoch": 1.88, |
|
"grad_norm": 1.9713775183042563, |
|
"learning_rate": 2.7908631085311933e-06, |
|
"loss": 0.7573, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 1.89, |
|
"grad_norm": 1.9347350716569554, |
|
"learning_rate": 2.7738244257815234e-06, |
|
"loss": 0.739, |
|
"step": 223 |
|
}, |
|
{ |
|
"epoch": 1.9, |
|
"grad_norm": 2.04529529738096, |
|
"learning_rate": 2.756772859371351e-06, |
|
"loss": 0.743, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 1.9, |
|
"grad_norm": 2.1971388123428754, |
|
"learning_rate": 2.7397092115906554e-06, |
|
"loss": 0.7471, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 1.91, |
|
"grad_norm": 1.7934500775315345, |
|
"learning_rate": 2.7226342852978542e-06, |
|
"loss": 0.7443, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 1.92, |
|
"grad_norm": 2.268705081511345, |
|
"learning_rate": 2.7055488838820266e-06, |
|
"loss": 0.7414, |
|
"step": 227 |
|
}, |
|
{ |
|
"epoch": 1.93, |
|
"grad_norm": 1.9139342117735205, |
|
"learning_rate": 2.6884538112251147e-06, |
|
"loss": 0.7406, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 1.94, |
|
"grad_norm": 2.3588112901077714, |
|
"learning_rate": 2.6713498716641017e-06, |
|
"loss": 0.7575, |
|
"step": 229 |
|
}, |
|
{ |
|
"epoch": 1.95, |
|
"grad_norm": 1.7941800288489695, |
|
"learning_rate": 2.6542378699531645e-06, |
|
"loss": 0.7459, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 1.96, |
|
"grad_norm": 2.5891645782274955, |
|
"learning_rate": 2.6371186112258118e-06, |
|
"loss": 0.7472, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 1.96, |
|
"grad_norm": 1.8268045992021926, |
|
"learning_rate": 2.6199929009570003e-06, |
|
"loss": 0.7489, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 1.97, |
|
"grad_norm": 2.703784770062622, |
|
"learning_rate": 2.602861544925236e-06, |
|
"loss": 0.7272, |
|
"step": 233 |
|
}, |
|
{ |
|
"epoch": 1.98, |
|
"grad_norm": 2.00621810320303, |
|
"learning_rate": 2.5857253491746646e-06, |
|
"loss": 0.7434, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 1.99, |
|
"grad_norm": 3.0770514249566507, |
|
"learning_rate": 2.568585119977142e-06, |
|
"loss": 0.7547, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 2.199087929149206, |
|
"learning_rate": 2.551441663794304e-06, |
|
"loss": 0.7362, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 2.01, |
|
"grad_norm": 2.609728798751417, |
|
"learning_rate": 2.5342957872396156e-06, |
|
"loss": 0.7387, |
|
"step": 237 |
|
}, |
|
{ |
|
"epoch": 2.01, |
|
"grad_norm": 2.4167509521439583, |
|
"learning_rate": 2.5171482970404244e-06, |
|
"loss": 0.7291, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 2.272061225763508, |
|
"learning_rate": 2.5e-06, |
|
"loss": 0.7054, |
|
"step": 239 |
|
}, |
|
{ |
|
"epoch": 2.01, |
|
"grad_norm": 2.824272633709913, |
|
"learning_rate": 2.482851702959577e-06, |
|
"loss": 0.6944, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 2.02, |
|
"grad_norm": 3.5666822575384316, |
|
"learning_rate": 2.4657042127603853e-06, |
|
"loss": 0.6993, |
|
"step": 241 |
|
}, |
|
{ |
|
"epoch": 2.03, |
|
"grad_norm": 2.3931138116624724, |
|
"learning_rate": 2.4485583362056975e-06, |
|
"loss": 0.7047, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 2.04, |
|
"grad_norm": 2.9320600754571795, |
|
"learning_rate": 2.4314148800228584e-06, |
|
"loss": 0.6925, |
|
"step": 243 |
|
}, |
|
{ |
|
"epoch": 2.05, |
|
"grad_norm": 2.8973919990299133, |
|
"learning_rate": 2.4142746508253367e-06, |
|
"loss": 0.6965, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 2.06, |
|
"grad_norm": 2.625815226251557, |
|
"learning_rate": 2.3971384550747644e-06, |
|
"loss": 0.683, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 2.06, |
|
"grad_norm": 2.4155664260482714, |
|
"learning_rate": 2.3800070990430006e-06, |
|
"loss": 0.6994, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 2.07, |
|
"grad_norm": 2.411022689524766, |
|
"learning_rate": 2.3628813887741882e-06, |
|
"loss": 0.6894, |
|
"step": 247 |
|
}, |
|
{ |
|
"epoch": 2.08, |
|
"grad_norm": 2.28077143295534, |
|
"learning_rate": 2.345762130046836e-06, |
|
"loss": 0.7023, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 2.09, |
|
"grad_norm": 2.401968274277045, |
|
"learning_rate": 2.3286501283358987e-06, |
|
"loss": 0.6759, |
|
"step": 249 |
|
}, |
|
{ |
|
"epoch": 2.1, |
|
"grad_norm": 2.378798265143013, |
|
"learning_rate": 2.311546188774886e-06, |
|
"loss": 0.6958, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 2.11, |
|
"grad_norm": 2.3673961258869904, |
|
"learning_rate": 2.2944511161179743e-06, |
|
"loss": 0.6838, |
|
"step": 251 |
|
}, |
|
{ |
|
"epoch": 2.12, |
|
"grad_norm": 2.516008730553284, |
|
"learning_rate": 2.2773657147021466e-06, |
|
"loss": 0.6909, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 2.12, |
|
"grad_norm": 2.2281402280211466, |
|
"learning_rate": 2.2602907884093454e-06, |
|
"loss": 0.6811, |
|
"step": 253 |
|
}, |
|
{ |
|
"epoch": 2.13, |
|
"grad_norm": 2.1439630603489315, |
|
"learning_rate": 2.24322714062865e-06, |
|
"loss": 0.6952, |
|
"step": 254 |
|
}, |
|
{ |
|
"epoch": 2.14, |
|
"grad_norm": 2.1634818483654077, |
|
"learning_rate": 2.2261755742184783e-06, |
|
"loss": 0.698, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 2.15, |
|
"grad_norm": 2.196698584094087, |
|
"learning_rate": 2.2091368914688067e-06, |
|
"loss": 0.6791, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 2.16, |
|
"grad_norm": 2.146209315784236, |
|
"learning_rate": 2.19211189406343e-06, |
|
"loss": 0.7023, |
|
"step": 257 |
|
}, |
|
{ |
|
"epoch": 2.17, |
|
"grad_norm": 2.1405757699513766, |
|
"learning_rate": 2.1751013830422303e-06, |
|
"loss": 0.6977, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 2.18, |
|
"grad_norm": 1.9616924770190525, |
|
"learning_rate": 2.1581061587634992e-06, |
|
"loss": 0.6812, |
|
"step": 259 |
|
}, |
|
{ |
|
"epoch": 2.18, |
|
"grad_norm": 2.23295032851048, |
|
"learning_rate": 2.14112702086627e-06, |
|
"loss": 0.685, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 2.19, |
|
"grad_norm": 2.279103072547858, |
|
"learning_rate": 2.1241647682327037e-06, |
|
"loss": 0.6954, |
|
"step": 261 |
|
}, |
|
{ |
|
"epoch": 2.2, |
|
"grad_norm": 2.1829812242717317, |
|
"learning_rate": 2.1072201989504914e-06, |
|
"loss": 0.6934, |
|
"step": 262 |
|
}, |
|
{ |
|
"epoch": 2.21, |
|
"grad_norm": 2.2223700306607537, |
|
"learning_rate": 2.09029411027531e-06, |
|
"loss": 0.6886, |
|
"step": 263 |
|
}, |
|
{ |
|
"epoch": 2.22, |
|
"grad_norm": 2.0785442261456337, |
|
"learning_rate": 2.073387298593311e-06, |
|
"loss": 0.6994, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 2.23, |
|
"grad_norm": 2.266610094629974, |
|
"learning_rate": 2.0565005593836434e-06, |
|
"loss": 0.6969, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 2.24, |
|
"grad_norm": 2.132447543935174, |
|
"learning_rate": 2.0396346871810347e-06, |
|
"loss": 0.6773, |
|
"step": 266 |
|
}, |
|
{ |
|
"epoch": 2.24, |
|
"grad_norm": 2.0884173463715228, |
|
"learning_rate": 2.0227904755383985e-06, |
|
"loss": 0.6945, |
|
"step": 267 |
|
}, |
|
{ |
|
"epoch": 2.25, |
|
"grad_norm": 2.2085913230241396, |
|
"learning_rate": 2.005968716989507e-06, |
|
"loss": 0.6949, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 2.26, |
|
"grad_norm": 2.0268180328155507, |
|
"learning_rate": 1.98917020301169e-06, |
|
"loss": 0.7027, |
|
"step": 269 |
|
}, |
|
{ |
|
"epoch": 2.27, |
|
"grad_norm": 2.4747960228822037, |
|
"learning_rate": 1.9723957239886067e-06, |
|
"loss": 0.6794, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 2.28, |
|
"grad_norm": 2.095701576576396, |
|
"learning_rate": 1.955646069173048e-06, |
|
"loss": 0.6961, |
|
"step": 271 |
|
}, |
|
{ |
|
"epoch": 2.29, |
|
"grad_norm": 2.0823665858025313, |
|
"learning_rate": 1.93892202664981e-06, |
|
"loss": 0.6786, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 2.29, |
|
"grad_norm": 2.245218571211113, |
|
"learning_rate": 1.922224383298606e-06, |
|
"loss": 0.6974, |
|
"step": 273 |
|
}, |
|
{ |
|
"epoch": 2.3, |
|
"grad_norm": 2.158681117909884, |
|
"learning_rate": 1.905553924757049e-06, |
|
"loss": 0.7002, |
|
"step": 274 |
|
}, |
|
{ |
|
"epoch": 2.31, |
|
"grad_norm": 2.146126951984283, |
|
"learning_rate": 1.888911435383684e-06, |
|
"loss": 0.6843, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 2.32, |
|
"grad_norm": 2.1238895111610048, |
|
"learning_rate": 1.8722976982210845e-06, |
|
"loss": 0.684, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 2.33, |
|
"grad_norm": 2.1081813807297984, |
|
"learning_rate": 1.8557134949590087e-06, |
|
"loss": 0.6868, |
|
"step": 277 |
|
}, |
|
{ |
|
"epoch": 2.34, |
|
"grad_norm": 2.0759325520644096, |
|
"learning_rate": 1.8391596058976214e-06, |
|
"loss": 0.69, |
|
"step": 278 |
|
}, |
|
{ |
|
"epoch": 2.35, |
|
"grad_norm": 2.087216621474724, |
|
"learning_rate": 1.8226368099107793e-06, |
|
"loss": 0.6923, |
|
"step": 279 |
|
}, |
|
{ |
|
"epoch": 2.35, |
|
"grad_norm": 2.4018313648831113, |
|
"learning_rate": 1.806145884409386e-06, |
|
"loss": 0.6931, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 2.36, |
|
"grad_norm": 2.0013309281300216, |
|
"learning_rate": 1.7896876053048112e-06, |
|
"loss": 0.6893, |
|
"step": 281 |
|
}, |
|
{ |
|
"epoch": 2.37, |
|
"grad_norm": 2.059546352986111, |
|
"learning_rate": 1.7732627469723868e-06, |
|
"loss": 0.6867, |
|
"step": 282 |
|
}, |
|
{ |
|
"epoch": 2.38, |
|
"grad_norm": 2.000325940508461, |
|
"learning_rate": 1.756872082214969e-06, |
|
"loss": 0.6914, |
|
"step": 283 |
|
}, |
|
{ |
|
"epoch": 2.39, |
|
"grad_norm": 2.379361502001129, |
|
"learning_rate": 1.7405163822265803e-06, |
|
"loss": 0.6906, |
|
"step": 284 |
|
}, |
|
{ |
|
"epoch": 2.4, |
|
"grad_norm": 1.8960887672873148, |
|
"learning_rate": 1.7241964165561204e-06, |
|
"loss": 0.6673, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 2.41, |
|
"grad_norm": 2.172181466809878, |
|
"learning_rate": 1.707912953071163e-06, |
|
"loss": 0.6781, |
|
"step": 286 |
|
}, |
|
{ |
|
"epoch": 2.41, |
|
"grad_norm": 2.015019267745016, |
|
"learning_rate": 1.6916667579218216e-06, |
|
"loss": 0.6963, |
|
"step": 287 |
|
}, |
|
{ |
|
"epoch": 2.42, |
|
"grad_norm": 2.0355415165674846, |
|
"learning_rate": 1.6754585955047081e-06, |
|
"loss": 0.6779, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 2.43, |
|
"grad_norm": 2.1086130667226977, |
|
"learning_rate": 1.6592892284269597e-06, |
|
"loss": 0.6998, |
|
"step": 289 |
|
}, |
|
{ |
|
"epoch": 2.44, |
|
"grad_norm": 2.059886916939569, |
|
"learning_rate": 1.6431594174703647e-06, |
|
"loss": 0.6802, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 2.45, |
|
"grad_norm": 2.0641741782118332, |
|
"learning_rate": 1.6270699215555608e-06, |
|
"loss": 0.6854, |
|
"step": 291 |
|
}, |
|
{ |
|
"epoch": 2.46, |
|
"grad_norm": 2.1806834552623444, |
|
"learning_rate": 1.6110214977063345e-06, |
|
"loss": 0.6987, |
|
"step": 292 |
|
}, |
|
{ |
|
"epoch": 2.47, |
|
"grad_norm": 2.158353312239409, |
|
"learning_rate": 1.5950149010139938e-06, |
|
"loss": 0.6823, |
|
"step": 293 |
|
}, |
|
{ |
|
"epoch": 2.47, |
|
"grad_norm": 1.921092711707764, |
|
"learning_rate": 1.5790508846018493e-06, |
|
"loss": 0.6941, |
|
"step": 294 |
|
}, |
|
{ |
|
"epoch": 2.48, |
|
"grad_norm": 2.3977437347525594, |
|
"learning_rate": 1.563130199589773e-06, |
|
"loss": 0.6915, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 2.49, |
|
"grad_norm": 2.0303949213498167, |
|
"learning_rate": 1.5472535950588575e-06, |
|
"loss": 0.6971, |
|
"step": 296 |
|
}, |
|
{ |
|
"epoch": 2.5, |
|
"grad_norm": 2.176759402835286, |
|
"learning_rate": 1.5314218180161783e-06, |
|
"loss": 0.6809, |
|
"step": 297 |
|
}, |
|
{ |
|
"epoch": 2.51, |
|
"grad_norm": 1.9729876407553733, |
|
"learning_rate": 1.5156356133596356e-06, |
|
"loss": 0.6933, |
|
"step": 298 |
|
}, |
|
{ |
|
"epoch": 2.52, |
|
"grad_norm": 2.168577615246608, |
|
"learning_rate": 1.4998957238429173e-06, |
|
"loss": 0.6873, |
|
"step": 299 |
|
}, |
|
{ |
|
"epoch": 2.53, |
|
"grad_norm": 2.0839917041722704, |
|
"learning_rate": 1.4842028900405422e-06, |
|
"loss": 0.6984, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 2.53, |
|
"grad_norm": 1.893739609530612, |
|
"learning_rate": 1.4685578503130227e-06, |
|
"loss": 0.6922, |
|
"step": 301 |
|
}, |
|
{ |
|
"epoch": 2.54, |
|
"grad_norm": 2.0857317964888193, |
|
"learning_rate": 1.4529613407721193e-06, |
|
"loss": 0.6908, |
|
"step": 302 |
|
}, |
|
{ |
|
"epoch": 2.55, |
|
"grad_norm": 2.0026826285563564, |
|
"learning_rate": 1.4374140952462109e-06, |
|
"loss": 0.6752, |
|
"step": 303 |
|
}, |
|
{ |
|
"epoch": 2.56, |
|
"grad_norm": 1.914005315845231, |
|
"learning_rate": 1.4219168452457593e-06, |
|
"loss": 0.6988, |
|
"step": 304 |
|
}, |
|
{ |
|
"epoch": 2.57, |
|
"grad_norm": 2.0223609197099552, |
|
"learning_rate": 1.4064703199289038e-06, |
|
"loss": 0.6842, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 2.58, |
|
"grad_norm": 2.1096738426378407, |
|
"learning_rate": 1.391075246067139e-06, |
|
"loss": 0.6823, |
|
"step": 306 |
|
}, |
|
{ |
|
"epoch": 2.59, |
|
"grad_norm": 2.0768301164703438, |
|
"learning_rate": 1.375732348011132e-06, |
|
"loss": 0.6898, |
|
"step": 307 |
|
}, |
|
{ |
|
"epoch": 2.59, |
|
"grad_norm": 2.123519591919823, |
|
"learning_rate": 1.3604423476566342e-06, |
|
"loss": 0.6732, |
|
"step": 308 |
|
}, |
|
{ |
|
"epoch": 2.6, |
|
"grad_norm": 2.1761224742449934, |
|
"learning_rate": 1.3452059644105174e-06, |
|
"loss": 0.6915, |
|
"step": 309 |
|
}, |
|
{ |
|
"epoch": 2.61, |
|
"grad_norm": 2.1481136545780246, |
|
"learning_rate": 1.3300239151569251e-06, |
|
"loss": 0.6942, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 2.62, |
|
"grad_norm": 2.194600040469437, |
|
"learning_rate": 1.3148969142235436e-06, |
|
"loss": 0.6788, |
|
"step": 311 |
|
}, |
|
{ |
|
"epoch": 2.63, |
|
"grad_norm": 2.1089291541456223, |
|
"learning_rate": 1.2998256733479896e-06, |
|
"loss": 0.7013, |
|
"step": 312 |
|
}, |
|
{ |
|
"epoch": 2.64, |
|
"grad_norm": 2.19121425222603, |
|
"learning_rate": 1.2848109016443255e-06, |
|
"loss": 0.6897, |
|
"step": 313 |
|
}, |
|
{ |
|
"epoch": 2.65, |
|
"grad_norm": 1.9319788219662473, |
|
"learning_rate": 1.2698533055696926e-06, |
|
"loss": 0.6976, |
|
"step": 314 |
|
}, |
|
{ |
|
"epoch": 2.65, |
|
"grad_norm": 2.275808654157748, |
|
"learning_rate": 1.254953588891073e-06, |
|
"loss": 0.6839, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 2.66, |
|
"grad_norm": 2.198386215375118, |
|
"learning_rate": 1.2401124526521763e-06, |
|
"loss": 0.6946, |
|
"step": 316 |
|
}, |
|
{ |
|
"epoch": 2.67, |
|
"grad_norm": 1.8790492694109773, |
|
"learning_rate": 1.225330595140455e-06, |
|
"loss": 0.676, |
|
"step": 317 |
|
}, |
|
{ |
|
"epoch": 2.68, |
|
"grad_norm": 1.9926892667726497, |
|
"learning_rate": 1.2106087118542504e-06, |
|
"loss": 0.6824, |
|
"step": 318 |
|
}, |
|
{ |
|
"epoch": 2.69, |
|
"grad_norm": 2.0144848585141206, |
|
"learning_rate": 1.1959474954700667e-06, |
|
"loss": 0.6906, |
|
"step": 319 |
|
}, |
|
{ |
|
"epoch": 2.7, |
|
"grad_norm": 2.0949703627379446, |
|
"learning_rate": 1.1813476358099824e-06, |
|
"loss": 0.6952, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 2.71, |
|
"grad_norm": 1.878706080567921, |
|
"learning_rate": 1.166809819809192e-06, |
|
"loss": 0.6846, |
|
"step": 321 |
|
}, |
|
{ |
|
"epoch": 2.71, |
|
"grad_norm": 2.106975261880749, |
|
"learning_rate": 1.1523347314836857e-06, |
|
"loss": 0.6916, |
|
"step": 322 |
|
}, |
|
{ |
|
"epoch": 2.72, |
|
"grad_norm": 2.0193667358417486, |
|
"learning_rate": 1.1379230518980663e-06, |
|
"loss": 0.695, |
|
"step": 323 |
|
}, |
|
{ |
|
"epoch": 2.73, |
|
"grad_norm": 1.8829454822076184, |
|
"learning_rate": 1.123575459133504e-06, |
|
"loss": 0.6856, |
|
"step": 324 |
|
}, |
|
{ |
|
"epoch": 2.74, |
|
"grad_norm": 1.9885667241669744, |
|
"learning_rate": 1.109292628255832e-06, |
|
"loss": 0.6849, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 2.75, |
|
"grad_norm": 1.9038960696049037, |
|
"learning_rate": 1.0950752312837846e-06, |
|
"loss": 0.6901, |
|
"step": 326 |
|
}, |
|
{ |
|
"epoch": 2.76, |
|
"grad_norm": 2.0104842271889467, |
|
"learning_rate": 1.0809239371573779e-06, |
|
"loss": 0.7014, |
|
"step": 327 |
|
}, |
|
{ |
|
"epoch": 2.76, |
|
"grad_norm": 1.9066116631636623, |
|
"learning_rate": 1.0668394117064365e-06, |
|
"loss": 0.6798, |
|
"step": 328 |
|
}, |
|
{ |
|
"epoch": 2.77, |
|
"grad_norm": 1.9713999361623535, |
|
"learning_rate": 1.0528223176192618e-06, |
|
"loss": 0.6979, |
|
"step": 329 |
|
}, |
|
{ |
|
"epoch": 2.78, |
|
"grad_norm": 1.8465153504391632, |
|
"learning_rate": 1.0388733144114605e-06, |
|
"loss": 0.6892, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 2.79, |
|
"grad_norm": 1.9408714006937027, |
|
"learning_rate": 1.024993058394902e-06, |
|
"loss": 0.6985, |
|
"step": 331 |
|
}, |
|
{ |
|
"epoch": 2.8, |
|
"grad_norm": 2.030993015395332, |
|
"learning_rate": 1.0111822026468515e-06, |
|
"loss": 0.6925, |
|
"step": 332 |
|
}, |
|
{ |
|
"epoch": 2.81, |
|
"grad_norm": 1.811976430858568, |
|
"learning_rate": 9.974413969792285e-07, |
|
"loss": 0.6805, |
|
"step": 333 |
|
}, |
|
{ |
|
"epoch": 2.82, |
|
"grad_norm": 2.025426310321446, |
|
"learning_rate": 9.837712879080464e-07, |
|
"loss": 0.6884, |
|
"step": 334 |
|
}, |
|
{ |
|
"epoch": 2.82, |
|
"grad_norm": 1.8699504401283087, |
|
"learning_rate": 9.701725186229801e-07, |
|
"loss": 0.6766, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 2.83, |
|
"grad_norm": 1.9813729971640541, |
|
"learning_rate": 9.56645728957117e-07, |
|
"loss": 0.6816, |
|
"step": 336 |
|
}, |
|
{ |
|
"epoch": 2.84, |
|
"grad_norm": 1.857568380571694, |
|
"learning_rate": 9.431915553568374e-07, |
|
"loss": 0.6941, |
|
"step": 337 |
|
}, |
|
{ |
|
"epoch": 2.85, |
|
"grad_norm": 1.8075501016131494, |
|
"learning_rate": 9.298106308518847e-07, |
|
"loss": 0.6915, |
|
"step": 338 |
|
}, |
|
{ |
|
"epoch": 2.86, |
|
"grad_norm": 1.896748082277053, |
|
"learning_rate": 9.165035850255672e-07, |
|
"loss": 0.6965, |
|
"step": 339 |
|
}, |
|
{ |
|
"epoch": 2.87, |
|
"grad_norm": 1.903236161607879, |
|
"learning_rate": 9.032710439851444e-07, |
|
"loss": 0.6942, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 2.88, |
|
"grad_norm": 2.0473184895907344, |
|
"learning_rate": 8.901136303323654e-07, |
|
"loss": 0.6868, |
|
"step": 341 |
|
}, |
|
{ |
|
"epoch": 2.88, |
|
"grad_norm": 1.9225358657320613, |
|
"learning_rate": 8.770319631341745e-07, |
|
"loss": 0.6833, |
|
"step": 342 |
|
}, |
|
{ |
|
"epoch": 2.89, |
|
"grad_norm": 1.8842364675717973, |
|
"learning_rate": 8.640266578935841e-07, |
|
"loss": 0.7059, |
|
"step": 343 |
|
}, |
|
{ |
|
"epoch": 2.9, |
|
"grad_norm": 1.9601955634309354, |
|
"learning_rate": 8.510983265207152e-07, |
|
"loss": 0.6996, |
|
"step": 344 |
|
}, |
|
{ |
|
"epoch": 2.91, |
|
"grad_norm": 1.8136997160021915, |
|
"learning_rate": 8.382475773040055e-07, |
|
"loss": 0.6836, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 2.92, |
|
"grad_norm": 1.9111490776903417, |
|
"learning_rate": 8.254750148815893e-07, |
|
"loss": 0.6996, |
|
"step": 346 |
|
}, |
|
{ |
|
"epoch": 2.93, |
|
"grad_norm": 1.8878734449529964, |
|
"learning_rate": 8.127812402128521e-07, |
|
"loss": 0.6932, |
|
"step": 347 |
|
}, |
|
{ |
|
"epoch": 2.94, |
|
"grad_norm": 1.7623368894317115, |
|
"learning_rate": 8.001668505501464e-07, |
|
"loss": 0.696, |
|
"step": 348 |
|
}, |
|
{ |
|
"epoch": 2.94, |
|
"grad_norm": 1.9509975903694705, |
|
"learning_rate": 7.876324394107018e-07, |
|
"loss": 0.6886, |
|
"step": 349 |
|
}, |
|
{ |
|
"epoch": 2.95, |
|
"grad_norm": 1.8836926534531768, |
|
"learning_rate": 7.751785965486894e-07, |
|
"loss": 0.6898, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 2.96, |
|
"grad_norm": 1.9384474477733897, |
|
"learning_rate": 7.628059079274793e-07, |
|
"loss": 0.6829, |
|
"step": 351 |
|
}, |
|
{ |
|
"epoch": 2.97, |
|
"grad_norm": 1.8215374593231801, |
|
"learning_rate": 7.505149556920698e-07, |
|
"loss": 0.6908, |
|
"step": 352 |
|
}, |
|
{ |
|
"epoch": 2.98, |
|
"grad_norm": 1.8093873518769943, |
|
"learning_rate": 7.383063181416955e-07, |
|
"loss": 0.6983, |
|
"step": 353 |
|
}, |
|
{ |
|
"epoch": 2.99, |
|
"grad_norm": 1.9915437999230632, |
|
"learning_rate": 7.261805697026178e-07, |
|
"loss": 0.7005, |
|
"step": 354 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"grad_norm": 1.89611825729105, |
|
"learning_rate": 7.141382809010999e-07, |
|
"loss": 0.6931, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"grad_norm": 1.8365953198306064, |
|
"learning_rate": 7.021800183365607e-07, |
|
"loss": 0.6817, |
|
"step": 356 |
|
}, |
|
{ |
|
"epoch": 3.01, |
|
"grad_norm": 1.8887825422099398, |
|
"learning_rate": 6.903063446549166e-07, |
|
"loss": 0.6796, |
|
"step": 357 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"grad_norm": 2.0505162217401396, |
|
"learning_rate": 6.785178185221095e-07, |
|
"loss": 0.6823, |
|
"step": 358 |
|
}, |
|
{ |
|
"epoch": 3.01, |
|
"grad_norm": 2.8780046222752, |
|
"learning_rate": 6.668149945978203e-07, |
|
"loss": 0.6598, |
|
"step": 359 |
|
}, |
|
{ |
|
"epoch": 3.02, |
|
"grad_norm": 2.320474085762604, |
|
"learning_rate": 6.551984235093692e-07, |
|
"loss": 0.6646, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 3.03, |
|
"grad_norm": 3.148494101628221, |
|
"learning_rate": 6.436686518258156e-07, |
|
"loss": 0.6521, |
|
"step": 361 |
|
}, |
|
{ |
|
"epoch": 3.04, |
|
"grad_norm": 2.9894322407930707, |
|
"learning_rate": 6.322262220322314e-07, |
|
"loss": 0.6497, |
|
"step": 362 |
|
}, |
|
{ |
|
"epoch": 3.04, |
|
"grad_norm": 2.0905514911758116, |
|
"learning_rate": 6.208716725041869e-07, |
|
"loss": 0.6729, |
|
"step": 363 |
|
}, |
|
{ |
|
"epoch": 3.05, |
|
"grad_norm": 2.5235937968654882, |
|
"learning_rate": 6.096055374824117e-07, |
|
"loss": 0.6536, |
|
"step": 364 |
|
}, |
|
{ |
|
"epoch": 3.06, |
|
"grad_norm": 2.7164252624114953, |
|
"learning_rate": 5.984283470476621e-07, |
|
"loss": 0.6557, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 3.07, |
|
"grad_norm": 2.1414297977553134, |
|
"learning_rate": 5.873406270957804e-07, |
|
"loss": 0.6517, |
|
"step": 366 |
|
}, |
|
{ |
|
"epoch": 3.08, |
|
"grad_norm": 2.087954653292254, |
|
"learning_rate": 5.763428993129499e-07, |
|
"loss": 0.6535, |
|
"step": 367 |
|
}, |
|
{ |
|
"epoch": 3.09, |
|
"grad_norm": 2.5430489969699166, |
|
"learning_rate": 5.654356811511494e-07, |
|
"loss": 0.6594, |
|
"step": 368 |
|
}, |
|
{ |
|
"epoch": 3.1, |
|
"grad_norm": 2.4893194798160425, |
|
"learning_rate": 5.546194858038073e-07, |
|
"loss": 0.6702, |
|
"step": 369 |
|
}, |
|
{ |
|
"epoch": 3.1, |
|
"grad_norm": 1.9260382585512938, |
|
"learning_rate": 5.438948221816559e-07, |
|
"loss": 0.6629, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 3.11, |
|
"grad_norm": 2.057039216215999, |
|
"learning_rate": 5.332621948887823e-07, |
|
"loss": 0.6583, |
|
"step": 371 |
|
}, |
|
{ |
|
"epoch": 3.12, |
|
"grad_norm": 2.182074257751017, |
|
"learning_rate": 5.227221041988955e-07, |
|
"loss": 0.6602, |
|
"step": 372 |
|
}, |
|
{ |
|
"epoch": 3.13, |
|
"grad_norm": 1.9356067875549532, |
|
"learning_rate": 5.122750460317768e-07, |
|
"loss": 0.6621, |
|
"step": 373 |
|
}, |
|
{ |
|
"epoch": 3.14, |
|
"grad_norm": 1.9075744893117703, |
|
"learning_rate": 5.019215119299578e-07, |
|
"loss": 0.6673, |
|
"step": 374 |
|
}, |
|
{ |
|
"epoch": 3.15, |
|
"grad_norm": 2.0600626341053028, |
|
"learning_rate": 4.916619890355812e-07, |
|
"loss": 0.6577, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 3.16, |
|
"grad_norm": 1.847664227547946, |
|
"learning_rate": 4.814969600674926e-07, |
|
"loss": 0.6566, |
|
"step": 376 |
|
}, |
|
{ |
|
"epoch": 3.16, |
|
"grad_norm": 1.9200825550285445, |
|
"learning_rate": 4.714269032985161e-07, |
|
"loss": 0.6531, |
|
"step": 377 |
|
}, |
|
{ |
|
"epoch": 3.17, |
|
"grad_norm": 1.945604786921752, |
|
"learning_rate": 4.614522925329626e-07, |
|
"loss": 0.6577, |
|
"step": 378 |
|
}, |
|
{ |
|
"epoch": 3.18, |
|
"grad_norm": 1.9471196049311694, |
|
"learning_rate": 4.515735970843263e-07, |
|
"loss": 0.6659, |
|
"step": 379 |
|
}, |
|
{ |
|
"epoch": 3.19, |
|
"grad_norm": 1.8278961360694248, |
|
"learning_rate": 4.417912817532133e-07, |
|
"loss": 0.6554, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 3.2, |
|
"grad_norm": 1.88830260098924, |
|
"learning_rate": 4.321058068054626e-07, |
|
"loss": 0.6563, |
|
"step": 381 |
|
}, |
|
{ |
|
"epoch": 3.21, |
|
"grad_norm": 1.9149749844100774, |
|
"learning_rate": 4.225176279504975e-07, |
|
"loss": 0.6571, |
|
"step": 382 |
|
}, |
|
{ |
|
"epoch": 3.22, |
|
"grad_norm": 1.8814221934773716, |
|
"learning_rate": 4.130271963198815e-07, |
|
"loss": 0.6572, |
|
"step": 383 |
|
}, |
|
{ |
|
"epoch": 3.22, |
|
"grad_norm": 1.8849419819366298, |
|
"learning_rate": 4.0363495844609134e-07, |
|
"loss": 0.6604, |
|
"step": 384 |
|
}, |
|
{ |
|
"epoch": 3.23, |
|
"grad_norm": 1.9046094115295815, |
|
"learning_rate": 3.9434135624150854e-07, |
|
"loss": 0.6652, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 3.24, |
|
"grad_norm": 1.944275827853693, |
|
"learning_rate": 3.8514682697762706e-07, |
|
"loss": 0.6572, |
|
"step": 386 |
|
}, |
|
{ |
|
"epoch": 3.25, |
|
"grad_norm": 1.8699721288071858, |
|
"learning_rate": 3.7605180326447806e-07, |
|
"loss": 0.6401, |
|
"step": 387 |
|
}, |
|
{ |
|
"epoch": 3.26, |
|
"grad_norm": 1.775035768873695, |
|
"learning_rate": 3.6705671303027687e-07, |
|
"loss": 0.6523, |
|
"step": 388 |
|
}, |
|
{ |
|
"epoch": 3.27, |
|
"grad_norm": 1.7843895394177849, |
|
"learning_rate": 3.581619795012875e-07, |
|
"loss": 0.6516, |
|
"step": 389 |
|
}, |
|
{ |
|
"epoch": 3.28, |
|
"grad_norm": 1.919359950542867, |
|
"learning_rate": 3.493680211819103e-07, |
|
"loss": 0.6607, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 3.28, |
|
"grad_norm": 1.8576252034229292, |
|
"learning_rate": 3.4067525183499013e-07, |
|
"loss": 0.6663, |
|
"step": 391 |
|
}, |
|
{ |
|
"epoch": 3.29, |
|
"grad_norm": 1.7764574523914607, |
|
"learning_rate": 3.3208408046234904e-07, |
|
"loss": 0.6576, |
|
"step": 392 |
|
}, |
|
{ |
|
"epoch": 3.3, |
|
"grad_norm": 1.8446907169053142, |
|
"learning_rate": 3.2359491128554214e-07, |
|
"loss": 0.6582, |
|
"step": 393 |
|
}, |
|
{ |
|
"epoch": 3.31, |
|
"grad_norm": 1.7493689583147616, |
|
"learning_rate": 3.152081437268398e-07, |
|
"loss": 0.6548, |
|
"step": 394 |
|
}, |
|
{ |
|
"epoch": 3.32, |
|
"grad_norm": 1.816698197495291, |
|
"learning_rate": 3.069241723904318e-07, |
|
"loss": 0.6636, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 3.33, |
|
"grad_norm": 1.790271464078186, |
|
"learning_rate": 2.987433870438641e-07, |
|
"loss": 0.657, |
|
"step": 396 |
|
}, |
|
{ |
|
"epoch": 3.34, |
|
"grad_norm": 1.743131857961643, |
|
"learning_rate": 2.906661725996976e-07, |
|
"loss": 0.6652, |
|
"step": 397 |
|
}, |
|
{ |
|
"epoch": 3.34, |
|
"grad_norm": 1.7977795864445705, |
|
"learning_rate": 2.82692909097399e-07, |
|
"loss": 0.6455, |
|
"step": 398 |
|
}, |
|
{ |
|
"epoch": 3.35, |
|
"grad_norm": 1.777376679638967, |
|
"learning_rate": 2.7482397168545895e-07, |
|
"loss": 0.6592, |
|
"step": 399 |
|
}, |
|
{ |
|
"epoch": 3.36, |
|
"grad_norm": 1.806389217351911, |
|
"learning_rate": 2.670597306037412e-07, |
|
"loss": 0.6606, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 3.37, |
|
"grad_norm": 1.773333434653589, |
|
"learning_rate": 2.59400551166063e-07, |
|
"loss": 0.6576, |
|
"step": 401 |
|
}, |
|
{ |
|
"epoch": 3.38, |
|
"grad_norm": 1.7728777287155046, |
|
"learning_rate": 2.5184679374300553e-07, |
|
"loss": 0.6606, |
|
"step": 402 |
|
}, |
|
{ |
|
"epoch": 3.39, |
|
"grad_norm": 1.83343142007096, |
|
"learning_rate": 2.4439881374496016e-07, |
|
"loss": 0.6713, |
|
"step": 403 |
|
}, |
|
{ |
|
"epoch": 3.4, |
|
"grad_norm": 1.8119712073997163, |
|
"learning_rate": 2.3705696160540303e-07, |
|
"loss": 0.6596, |
|
"step": 404 |
|
}, |
|
{ |
|
"epoch": 3.4, |
|
"grad_norm": 1.7575305127120062, |
|
"learning_rate": 2.298215827644118e-07, |
|
"loss": 0.6582, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 3.41, |
|
"grad_norm": 1.80965570055429, |
|
"learning_rate": 2.2269301765240558e-07, |
|
"loss": 0.6508, |
|
"step": 406 |
|
}, |
|
{ |
|
"epoch": 3.42, |
|
"grad_norm": 1.8329895956407685, |
|
"learning_rate": 2.1567160167413503e-07, |
|
"loss": 0.6657, |
|
"step": 407 |
|
}, |
|
{ |
|
"epoch": 3.43, |
|
"grad_norm": 1.8295154972235375, |
|
"learning_rate": 2.0875766519289436e-07, |
|
"loss": 0.6602, |
|
"step": 408 |
|
}, |
|
{ |
|
"epoch": 3.44, |
|
"grad_norm": 1.778360539334375, |
|
"learning_rate": 2.0195153351498325e-07, |
|
"loss": 0.6672, |
|
"step": 409 |
|
}, |
|
{ |
|
"epoch": 3.45, |
|
"grad_norm": 1.8281360399477038, |
|
"learning_rate": 1.9525352687439548e-07, |
|
"loss": 0.6713, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 3.46, |
|
"grad_norm": 1.798281276385492, |
|
"learning_rate": 1.886639604177573e-07, |
|
"loss": 0.6589, |
|
"step": 411 |
|
}, |
|
{ |
|
"epoch": 3.46, |
|
"grad_norm": 1.8101646090365584, |
|
"learning_rate": 1.821831441894939e-07, |
|
"loss": 0.6576, |
|
"step": 412 |
|
}, |
|
{ |
|
"epoch": 3.47, |
|
"grad_norm": 1.8163930084993238, |
|
"learning_rate": 1.7581138311724754e-07, |
|
"loss": 0.6509, |
|
"step": 413 |
|
}, |
|
{ |
|
"epoch": 3.48, |
|
"grad_norm": 1.7889849857989786, |
|
"learning_rate": 1.6954897699752394e-07, |
|
"loss": 0.6654, |
|
"step": 414 |
|
}, |
|
{ |
|
"epoch": 3.49, |
|
"grad_norm": 1.7753176697331132, |
|
"learning_rate": 1.6339622048159198e-07, |
|
"loss": 0.6555, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 3.5, |
|
"grad_norm": 1.758833276503715, |
|
"learning_rate": 1.5735340306161752e-07, |
|
"loss": 0.665, |
|
"step": 416 |
|
}, |
|
{ |
|
"epoch": 3.51, |
|
"grad_norm": 1.7863343585516815, |
|
"learning_rate": 1.514208090570432e-07, |
|
"loss": 0.6484, |
|
"step": 417 |
|
}, |
|
{ |
|
"epoch": 3.51, |
|
"grad_norm": 1.7763079205782726, |
|
"learning_rate": 1.4559871760121108e-07, |
|
"loss": 0.6562, |
|
"step": 418 |
|
}, |
|
{ |
|
"epoch": 3.52, |
|
"grad_norm": 1.8490593873759873, |
|
"learning_rate": 1.3988740262822847e-07, |
|
"loss": 0.6497, |
|
"step": 419 |
|
}, |
|
{ |
|
"epoch": 3.53, |
|
"grad_norm": 1.7753823119901868, |
|
"learning_rate": 1.3428713286008005e-07, |
|
"loss": 0.6534, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 3.54, |
|
"grad_norm": 1.7671712087628604, |
|
"learning_rate": 1.2879817179398375e-07, |
|
"loss": 0.6519, |
|
"step": 421 |
|
}, |
|
{ |
|
"epoch": 3.55, |
|
"grad_norm": 1.7594428378082356, |
|
"learning_rate": 1.2342077768999372e-07, |
|
"loss": 0.6519, |
|
"step": 422 |
|
}, |
|
{ |
|
"epoch": 3.56, |
|
"grad_norm": 1.767897963166057, |
|
"learning_rate": 1.1815520355884679e-07, |
|
"loss": 0.6528, |
|
"step": 423 |
|
}, |
|
{ |
|
"epoch": 3.57, |
|
"grad_norm": 1.7463739318936164, |
|
"learning_rate": 1.130016971500622e-07, |
|
"loss": 0.6582, |
|
"step": 424 |
|
}, |
|
{ |
|
"epoch": 3.57, |
|
"grad_norm": 1.747840277010472, |
|
"learning_rate": 1.0796050094027954e-07, |
|
"loss": 0.6661, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 3.58, |
|
"grad_norm": 1.8160480622182698, |
|
"learning_rate": 1.0303185212185485e-07, |
|
"loss": 0.646, |
|
"step": 426 |
|
}, |
|
{ |
|
"epoch": 3.59, |
|
"grad_norm": 1.7568873705777095, |
|
"learning_rate": 9.821598259169729e-08, |
|
"loss": 0.6554, |
|
"step": 427 |
|
}, |
|
{ |
|
"epoch": 3.6, |
|
"grad_norm": 1.733832059747267, |
|
"learning_rate": 9.351311894036014e-08, |
|
"loss": 0.6632, |
|
"step": 428 |
|
}, |
|
{ |
|
"epoch": 3.61, |
|
"grad_norm": 1.804637277135235, |
|
"learning_rate": 8.892348244137788e-08, |
|
"loss": 0.66, |
|
"step": 429 |
|
}, |
|
{ |
|
"epoch": 3.62, |
|
"grad_norm": 1.767868039735343, |
|
"learning_rate": 8.444728904085737e-08, |
|
"loss": 0.659, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 3.63, |
|
"grad_norm": 1.770931658466082, |
|
"learning_rate": 8.008474934731447e-08, |
|
"loss": 0.668, |
|
"step": 431 |
|
}, |
|
{ |
|
"epoch": 3.63, |
|
"grad_norm": 1.7732670135950312, |
|
"learning_rate": 7.583606862176713e-08, |
|
"loss": 0.6548, |
|
"step": 432 |
|
}, |
|
{ |
|
"epoch": 3.64, |
|
"grad_norm": 1.7259860505689657, |
|
"learning_rate": 7.170144676807683e-08, |
|
"loss": 0.6318, |
|
"step": 433 |
|
}, |
|
{ |
|
"epoch": 3.65, |
|
"grad_norm": 1.7392331188224266, |
|
"learning_rate": 6.768107832354292e-08, |
|
"loss": 0.6636, |
|
"step": 434 |
|
}, |
|
{ |
|
"epoch": 3.66, |
|
"grad_norm": 1.7732212376542704, |
|
"learning_rate": 6.377515244974903e-08, |
|
"loss": 0.6626, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 3.67, |
|
"grad_norm": 1.7335582830409095, |
|
"learning_rate": 5.99838529236646e-08, |
|
"loss": 0.668, |
|
"step": 436 |
|
}, |
|
{ |
|
"epoch": 3.68, |
|
"grad_norm": 1.7716856700895114, |
|
"learning_rate": 5.6307358128994685e-08, |
|
"loss": 0.667, |
|
"step": 437 |
|
}, |
|
{ |
|
"epoch": 3.69, |
|
"grad_norm": 1.7617896255786891, |
|
"learning_rate": 5.274584104779157e-08, |
|
"loss": 0.6538, |
|
"step": 438 |
|
}, |
|
{ |
|
"epoch": 3.69, |
|
"grad_norm": 1.7528019015815823, |
|
"learning_rate": 4.929946925231077e-08, |
|
"loss": 0.6534, |
|
"step": 439 |
|
}, |
|
{ |
|
"epoch": 3.7, |
|
"grad_norm": 1.8117296265464948, |
|
"learning_rate": 4.5968404897130944e-08, |
|
"loss": 0.6674, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 3.71, |
|
"grad_norm": 1.749044793771054, |
|
"learning_rate": 4.27528047115211e-08, |
|
"loss": 0.6682, |
|
"step": 441 |
|
}, |
|
{ |
|
"epoch": 3.72, |
|
"grad_norm": 1.7454523412078409, |
|
"learning_rate": 3.965281999206899e-08, |
|
"loss": 0.6601, |
|
"step": 442 |
|
}, |
|
{ |
|
"epoch": 3.73, |
|
"grad_norm": 1.7598878691389603, |
|
"learning_rate": 3.666859659556016e-08, |
|
"loss": 0.6603, |
|
"step": 443 |
|
}, |
|
{ |
|
"epoch": 3.74, |
|
"grad_norm": 1.7046387508749583, |
|
"learning_rate": 3.3800274932117294e-08, |
|
"loss": 0.6518, |
|
"step": 444 |
|
}, |
|
{ |
|
"epoch": 3.75, |
|
"grad_norm": 1.7163795248428233, |
|
"learning_rate": 3.1047989958592203e-08, |
|
"loss": 0.6651, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 3.75, |
|
"grad_norm": 1.777257207147479, |
|
"learning_rate": 2.841187117221672e-08, |
|
"loss": 0.6558, |
|
"step": 446 |
|
}, |
|
{ |
|
"epoch": 3.76, |
|
"grad_norm": 1.8219202465976836, |
|
"learning_rate": 2.5892042604510614e-08, |
|
"loss": 0.6508, |
|
"step": 447 |
|
}, |
|
{ |
|
"epoch": 3.77, |
|
"grad_norm": 1.7767451714812037, |
|
"learning_rate": 2.348862281544323e-08, |
|
"loss": 0.6509, |
|
"step": 448 |
|
}, |
|
{ |
|
"epoch": 3.78, |
|
"grad_norm": 1.7465806936718902, |
|
"learning_rate": 2.1201724887858488e-08, |
|
"loss": 0.6523, |
|
"step": 449 |
|
}, |
|
{ |
|
"epoch": 3.79, |
|
"grad_norm": 1.7329527459099043, |
|
"learning_rate": 1.9031456422151374e-08, |
|
"loss": 0.6404, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 3.8, |
|
"grad_norm": 1.7965434015907633, |
|
"learning_rate": 1.6977919531207533e-08, |
|
"loss": 0.6603, |
|
"step": 451 |
|
}, |
|
{ |
|
"epoch": 3.81, |
|
"grad_norm": 1.7618638033364344, |
|
"learning_rate": 1.5041210835596288e-08, |
|
"loss": 0.6421, |
|
"step": 452 |
|
}, |
|
{ |
|
"epoch": 3.81, |
|
"grad_norm": 1.7717100092665263, |
|
"learning_rate": 1.3221421459027329e-08, |
|
"loss": 0.6656, |
|
"step": 453 |
|
}, |
|
{ |
|
"epoch": 3.82, |
|
"grad_norm": 1.7605568107436471, |
|
"learning_rate": 1.1518637024061086e-08, |
|
"loss": 0.6668, |
|
"step": 454 |
|
}, |
|
{ |
|
"epoch": 3.83, |
|
"grad_norm": 1.751907548134551, |
|
"learning_rate": 9.932937648081397e-09, |
|
"loss": 0.6579, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 3.84, |
|
"grad_norm": 1.7386986707922565, |
|
"learning_rate": 8.464397939524915e-09, |
|
"loss": 0.6703, |
|
"step": 456 |
|
}, |
|
{ |
|
"epoch": 3.85, |
|
"grad_norm": 1.7643987709822369, |
|
"learning_rate": 7.113086994372242e-09, |
|
"loss": 0.666, |
|
"step": 457 |
|
}, |
|
{ |
|
"epoch": 3.86, |
|
"grad_norm": 1.7296344516569304, |
|
"learning_rate": 5.879068392894427e-09, |
|
"loss": 0.6522, |
|
"step": 458 |
|
}, |
|
{ |
|
"epoch": 3.87, |
|
"grad_norm": 1.7593863922129787, |
|
"learning_rate": 4.762400196664518e-09, |
|
"loss": 0.6586, |
|
"step": 459 |
|
}, |
|
{ |
|
"epoch": 3.87, |
|
"grad_norm": 1.7608500271319567, |
|
"learning_rate": 3.763134945823088e-09, |
|
"loss": 0.6689, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 3.88, |
|
"grad_norm": 1.7248718265179743, |
|
"learning_rate": 2.8813196566079836e-09, |
|
"loss": 0.6476, |
|
"step": 461 |
|
}, |
|
{ |
|
"epoch": 3.89, |
|
"grad_norm": 1.7430786741620756, |
|
"learning_rate": 2.116995819140821e-09, |
|
"loss": 0.6636, |
|
"step": 462 |
|
}, |
|
{ |
|
"epoch": 3.9, |
|
"grad_norm": 1.7541283862977322, |
|
"learning_rate": 1.4701993954760462e-09, |
|
"loss": 0.6639, |
|
"step": 463 |
|
}, |
|
{ |
|
"epoch": 3.91, |
|
"grad_norm": 1.7203096912350941, |
|
"learning_rate": 9.409608179078433e-10, |
|
"loss": 0.6475, |
|
"step": 464 |
|
}, |
|
{ |
|
"epoch": 3.92, |
|
"grad_norm": 1.7295283175572225, |
|
"learning_rate": 5.293049875393363e-10, |
|
"loss": 0.6589, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 3.93, |
|
"grad_norm": 1.7037779795629253, |
|
"learning_rate": 2.3525127310936035e-10, |
|
"loss": 0.6521, |
|
"step": 466 |
|
}, |
|
{ |
|
"epoch": 3.93, |
|
"grad_norm": 1.7513315633630457, |
|
"learning_rate": 5.88135100831888e-11, |
|
"loss": 0.6556, |
|
"step": 467 |
|
}, |
|
{ |
|
"epoch": 3.94, |
|
"grad_norm": 1.7159722803516417, |
|
"learning_rate": 0.0, |
|
"loss": 0.6477, |
|
"step": 468 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 468, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 4, |
|
"save_steps": 117, |
|
"total_flos": 783498671554560.0, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|