{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 9.98439937597504,
  "eval_steps": 500,
  "global_step": 100,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0998439937597504,
      "grad_norm": 0.33735999850516357,
      "learning_rate": 0.0,
      "loss": 2.0239,
      "step": 1
    },
    {
      "epoch": 0.1996879875195008,
      "grad_norm": 3.996268319859662,
      "learning_rate": 0.0001,
      "loss": 2.1481,
      "step": 2
    },
    {
      "epoch": 0.2995319812792512,
      "grad_norm": 3.972078144391957,
      "learning_rate": 0.0001,
      "loss": 2.1474,
      "step": 3
    },
    {
      "epoch": 0.3993759750390016,
      "grad_norm": 2.7286088582127235,
      "learning_rate": 9.897959183673469e-05,
      "loss": 2.1429,
      "step": 4
    },
    {
      "epoch": 0.49921996879875197,
      "grad_norm": 4.297284692826526,
      "learning_rate": 9.79591836734694e-05,
      "loss": 2.2106,
      "step": 5
    },
    {
      "epoch": 0.5990639625585024,
      "grad_norm": 4.430982504708323,
      "learning_rate": 9.693877551020408e-05,
      "loss": 2.1834,
      "step": 6
    },
    {
      "epoch": 0.6989079563182528,
      "grad_norm": 2.6687235846541064,
      "learning_rate": 9.591836734693878e-05,
      "loss": 2.1127,
      "step": 7
    },
    {
      "epoch": 0.7987519500780031,
      "grad_norm": 1.2769742192031128,
      "learning_rate": 9.489795918367348e-05,
      "loss": 2.0969,
      "step": 8
    },
    {
      "epoch": 0.8985959438377535,
      "grad_norm": 1.4823469261150688,
      "learning_rate": 9.387755102040817e-05,
      "loss": 2.0815,
      "step": 9
    },
    {
      "epoch": 0.9984399375975039,
      "grad_norm": 2.036951250941504,
      "learning_rate": 9.285714285714286e-05,
      "loss": 2.0954,
      "step": 10
    },
    {
      "epoch": 1.0982839313572543,
      "grad_norm": 1.9201925092786452,
      "learning_rate": 9.183673469387756e-05,
      "loss": 2.0938,
      "step": 11
    },
    {
      "epoch": 1.1981279251170047,
      "grad_norm": 1.0750267735185661,
      "learning_rate": 9.081632653061225e-05,
      "loss": 2.0609,
      "step": 12
    },
    {
      "epoch": 1.2979719188767551,
      "grad_norm": 0.7680234903130688,
      "learning_rate": 8.979591836734695e-05,
      "loss": 2.0664,
      "step": 13
    },
    {
      "epoch": 1.3978159126365055,
      "grad_norm": 1.385815006248848,
      "learning_rate": 8.877551020408164e-05,
      "loss": 2.0705,
      "step": 14
    },
    {
      "epoch": 1.497659906396256,
      "grad_norm": 1.7546578816659688,
      "learning_rate": 8.775510204081632e-05,
      "loss": 2.0676,
      "step": 15
    },
    {
      "epoch": 1.5975039001560063,
      "grad_norm": 1.5801645828188804,
      "learning_rate": 8.673469387755102e-05,
      "loss": 2.0717,
      "step": 16
    },
    {
      "epoch": 1.6973478939157567,
      "grad_norm": 0.6350567898174008,
      "learning_rate": 8.571428571428571e-05,
      "loss": 2.0445,
      "step": 17
    },
    {
      "epoch": 1.797191887675507,
      "grad_norm": 0.769793586745769,
      "learning_rate": 8.469387755102041e-05,
      "loss": 2.0641,
      "step": 18
    },
    {
      "epoch": 1.8970358814352575,
      "grad_norm": 0.9717953713720756,
      "learning_rate": 8.367346938775511e-05,
      "loss": 2.045,
      "step": 19
    },
    {
      "epoch": 1.9968798751950079,
      "grad_norm": 0.8984304676609969,
      "learning_rate": 8.26530612244898e-05,
      "loss": 2.0571,
      "step": 20
    },
    {
      "epoch": 2.0967238689547583,
      "grad_norm": 0.5175444394072088,
      "learning_rate": 8.163265306122449e-05,
      "loss": 2.041,
      "step": 21
    },
    {
      "epoch": 2.1965678627145087,
      "grad_norm": 0.4541835423784387,
      "learning_rate": 8.061224489795919e-05,
      "loss": 2.0336,
      "step": 22
    },
    {
      "epoch": 2.296411856474259,
      "grad_norm": 0.667841023508509,
      "learning_rate": 7.959183673469388e-05,
      "loss": 2.0495,
      "step": 23
    },
    {
      "epoch": 2.3962558502340094,
      "grad_norm": 0.5788661227302379,
      "learning_rate": 7.857142857142858e-05,
      "loss": 2.0535,
      "step": 24
    },
    {
      "epoch": 2.49609984399376,
      "grad_norm": 0.44773175799420883,
      "learning_rate": 7.755102040816327e-05,
      "loss": 2.0565,
      "step": 25
    },
    {
      "epoch": 2.5959438377535102,
      "grad_norm": 0.36927161339954073,
      "learning_rate": 7.653061224489796e-05,
      "loss": 2.043,
      "step": 26
    },
    {
      "epoch": 2.6957878315132606,
      "grad_norm": 0.41656375647295985,
      "learning_rate": 7.551020408163266e-05,
      "loss": 2.0508,
      "step": 27
    },
    {
      "epoch": 2.795631825273011,
      "grad_norm": 0.5181702303959671,
      "learning_rate": 7.448979591836736e-05,
      "loss": 2.0421,
      "step": 28
    },
    {
      "epoch": 2.8954758190327614,
      "grad_norm": 0.3514850743124221,
      "learning_rate": 7.346938775510205e-05,
      "loss": 2.0613,
      "step": 29
    },
    {
      "epoch": 2.995319812792512,
      "grad_norm": 0.4107791530284269,
      "learning_rate": 7.244897959183675e-05,
      "loss": 2.0373,
      "step": 30
    },
    {
      "epoch": 3.095163806552262,
      "grad_norm": 0.37199633406042804,
      "learning_rate": 7.142857142857143e-05,
      "loss": 2.0383,
      "step": 31
    },
    {
      "epoch": 3.1950078003120126,
      "grad_norm": 0.4001809292131257,
      "learning_rate": 7.040816326530612e-05,
      "loss": 2.046,
      "step": 32
    },
    {
      "epoch": 3.294851794071763,
      "grad_norm": 0.288426524415327,
      "learning_rate": 6.938775510204082e-05,
      "loss": 2.0397,
      "step": 33
    },
    {
      "epoch": 3.3946957878315134,
      "grad_norm": 0.2430022517102903,
      "learning_rate": 6.836734693877551e-05,
      "loss": 2.0209,
      "step": 34
    },
    {
      "epoch": 3.4945397815912638,
      "grad_norm": 0.21451368987973404,
      "learning_rate": 6.73469387755102e-05,
      "loss": 2.0321,
      "step": 35
    },
    {
      "epoch": 3.594383775351014,
      "grad_norm": 0.216393318547338,
      "learning_rate": 6.63265306122449e-05,
      "loss": 2.034,
      "step": 36
    },
    {
      "epoch": 3.6942277691107646,
      "grad_norm": 0.27185371030563144,
      "learning_rate": 6.530612244897959e-05,
      "loss": 2.0387,
      "step": 37
    },
    {
      "epoch": 3.794071762870515,
      "grad_norm": 0.2697186164320835,
      "learning_rate": 6.428571428571429e-05,
      "loss": 2.0291,
      "step": 38
    },
    {
      "epoch": 3.8939157566302653,
      "grad_norm": 0.24154490158801892,
      "learning_rate": 6.326530612244899e-05,
      "loss": 2.032,
      "step": 39
    },
    {
      "epoch": 3.9937597503900157,
      "grad_norm": 0.19835514928005893,
      "learning_rate": 6.224489795918368e-05,
      "loss": 2.0378,
      "step": 40
    },
    {
      "epoch": 4.093603744149766,
      "grad_norm": 0.202257495523738,
      "learning_rate": 6.122448979591838e-05,
      "loss": 2.0282,
      "step": 41
    },
    {
      "epoch": 4.1934477379095165,
      "grad_norm": 0.29489887223631195,
      "learning_rate": 6.0204081632653065e-05,
      "loss": 2.0312,
      "step": 42
    },
    {
      "epoch": 4.2932917316692665,
      "grad_norm": 0.24397101805150526,
      "learning_rate": 5.918367346938776e-05,
      "loss": 2.0279,
      "step": 43
    },
    {
      "epoch": 4.393135725429017,
      "grad_norm": 0.18678468114794597,
      "learning_rate": 5.816326530612245e-05,
      "loss": 2.0322,
      "step": 44
    },
    {
      "epoch": 4.492979719188767,
      "grad_norm": 0.20762719446936778,
      "learning_rate": 5.714285714285714e-05,
      "loss": 2.0319,
      "step": 45
    },
    {
      "epoch": 4.592823712948518,
      "grad_norm": 0.22021149423424455,
      "learning_rate": 5.6122448979591836e-05,
      "loss": 2.0226,
      "step": 46
    },
    {
      "epoch": 4.692667706708268,
      "grad_norm": 0.19683580500376263,
      "learning_rate": 5.510204081632653e-05,
      "loss": 2.0132,
      "step": 47
    },
    {
      "epoch": 4.792511700468019,
      "grad_norm": 0.18067087727658598,
      "learning_rate": 5.408163265306123e-05,
      "loss": 2.0212,
      "step": 48
    },
    {
      "epoch": 4.892355694227769,
      "grad_norm": 0.20548068001057151,
      "learning_rate": 5.3061224489795926e-05,
      "loss": 2.0297,
      "step": 49
    },
    {
      "epoch": 4.99219968798752,
      "grad_norm": 0.21068796657056815,
      "learning_rate": 5.2040816326530614e-05,
      "loss": 2.024,
      "step": 50
    },
    {
      "epoch": 5.09204368174727,
      "grad_norm": 0.18294872441473956,
      "learning_rate": 5.102040816326531e-05,
      "loss": 2.0165,
      "step": 51
    },
    {
      "epoch": 5.1918876755070205,
      "grad_norm": 0.19829054181667702,
      "learning_rate": 5e-05,
      "loss": 2.0192,
      "step": 52
    },
    {
      "epoch": 5.29173166926677,
      "grad_norm": 0.23253630377762147,
      "learning_rate": 4.89795918367347e-05,
      "loss": 2.0168,
      "step": 53
    },
    {
      "epoch": 5.391575663026521,
      "grad_norm": 0.19977491013330723,
      "learning_rate": 4.795918367346939e-05,
      "loss": 2.0371,
      "step": 54
    },
    {
      "epoch": 5.491419656786271,
      "grad_norm": 0.19839589028132484,
      "learning_rate": 4.6938775510204086e-05,
      "loss": 2.0241,
      "step": 55
    },
    {
      "epoch": 5.591263650546022,
      "grad_norm": 0.19221135549659152,
      "learning_rate": 4.591836734693878e-05,
      "loss": 2.0155,
      "step": 56
    },
    {
      "epoch": 5.691107644305772,
      "grad_norm": 0.18530533779964112,
      "learning_rate": 4.4897959183673474e-05,
      "loss": 2.0188,
      "step": 57
    },
    {
      "epoch": 5.790951638065523,
      "grad_norm": 0.19542701225272013,
      "learning_rate": 4.387755102040816e-05,
      "loss": 2.0197,
      "step": 58
    },
    {
      "epoch": 5.890795631825273,
      "grad_norm": 0.19627507477830436,
      "learning_rate": 4.2857142857142856e-05,
      "loss": 2.0273,
      "step": 59
    },
    {
      "epoch": 5.990639625585024,
      "grad_norm": 0.19035619638095738,
      "learning_rate": 4.183673469387756e-05,
      "loss": 2.0219,
      "step": 60
    },
    {
      "epoch": 6.090483619344774,
      "grad_norm": 0.2202319627016047,
      "learning_rate": 4.0816326530612245e-05,
      "loss": 2.0243,
      "step": 61
    },
    {
      "epoch": 6.190327613104524,
      "grad_norm": 0.21161656218616026,
      "learning_rate": 3.979591836734694e-05,
      "loss": 2.0107,
      "step": 62
    },
    {
      "epoch": 6.290171606864274,
      "grad_norm": 0.18165401696499975,
      "learning_rate": 3.8775510204081634e-05,
      "loss": 2.0139,
      "step": 63
    },
    {
      "epoch": 6.390015600624025,
      "grad_norm": 0.2000226648720509,
      "learning_rate": 3.775510204081633e-05,
      "loss": 2.0213,
      "step": 64
    },
    {
      "epoch": 6.489859594383775,
      "grad_norm": 0.21044890897157179,
      "learning_rate": 3.673469387755102e-05,
      "loss": 2.0191,
      "step": 65
    },
    {
      "epoch": 6.589703588143526,
      "grad_norm": 0.19088424814099902,
      "learning_rate": 3.571428571428572e-05,
      "loss": 2.0242,
      "step": 66
    },
    {
      "epoch": 6.689547581903276,
      "grad_norm": 0.21090169002189085,
      "learning_rate": 3.469387755102041e-05,
      "loss": 2.0129,
      "step": 67
    },
    {
      "epoch": 6.789391575663027,
      "grad_norm": 0.17374408490589077,
      "learning_rate": 3.36734693877551e-05,
      "loss": 2.0137,
      "step": 68
    },
    {
      "epoch": 6.889235569422777,
      "grad_norm": 0.19923758885188783,
      "learning_rate": 3.265306122448979e-05,
      "loss": 2.0246,
      "step": 69
    },
    {
      "epoch": 6.9890795631825275,
      "grad_norm": 0.23509786711199504,
      "learning_rate": 3.1632653061224494e-05,
      "loss": 2.0217,
      "step": 70
    },
    {
      "epoch": 7.0889235569422775,
      "grad_norm": 0.22682630866339715,
      "learning_rate": 3.061224489795919e-05,
      "loss": 2.0227,
      "step": 71
    },
    {
      "epoch": 7.188767550702028,
      "grad_norm": 0.19506001365101278,
      "learning_rate": 2.959183673469388e-05,
      "loss": 2.0055,
      "step": 72
    },
    {
      "epoch": 7.288611544461778,
      "grad_norm": 0.18843675103995558,
      "learning_rate": 2.857142857142857e-05,
      "loss": 2.009,
      "step": 73
    },
    {
      "epoch": 7.388455538221529,
      "grad_norm": 0.3013340836030522,
      "learning_rate": 2.7551020408163265e-05,
      "loss": 2.0059,
      "step": 74
    },
    {
      "epoch": 7.488299531981279,
      "grad_norm": 0.2196066974488878,
      "learning_rate": 2.6530612244897963e-05,
      "loss": 2.0146,
      "step": 75
    },
    {
      "epoch": 7.58814352574103,
      "grad_norm": 0.29942455484876807,
      "learning_rate": 2.5510204081632654e-05,
      "loss": 2.028,
      "step": 76
    },
    {
      "epoch": 7.68798751950078,
      "grad_norm": 0.2635978736170486,
      "learning_rate": 2.448979591836735e-05,
      "loss": 2.007,
      "step": 77
    },
    {
      "epoch": 7.787831513260531,
      "grad_norm": 0.17048063706160105,
      "learning_rate": 2.3469387755102043e-05,
      "loss": 2.0178,
      "step": 78
    },
    {
      "epoch": 7.887675507020281,
      "grad_norm": 0.2462988180698444,
      "learning_rate": 2.2448979591836737e-05,
      "loss": 2.0184,
      "step": 79
    },
    {
      "epoch": 7.9875195007800315,
      "grad_norm": 0.18757769842389277,
      "learning_rate": 2.1428571428571428e-05,
      "loss": 2.0066,
      "step": 80
    },
    {
      "epoch": 8.087363494539781,
      "grad_norm": 0.1838199763796138,
      "learning_rate": 2.0408163265306123e-05,
      "loss": 2.0086,
      "step": 81
    },
    {
      "epoch": 8.187207488299531,
      "grad_norm": 0.18445305909507442,
      "learning_rate": 1.9387755102040817e-05,
      "loss": 2.0059,
      "step": 82
    },
    {
      "epoch": 8.287051482059283,
      "grad_norm": 0.15914247815116395,
      "learning_rate": 1.836734693877551e-05,
      "loss": 2.0014,
      "step": 83
    },
    {
      "epoch": 8.386895475819033,
      "grad_norm": 0.16618424435461218,
      "learning_rate": 1.7346938775510206e-05,
      "loss": 1.9989,
      "step": 84
    },
    {
      "epoch": 8.486739469578783,
      "grad_norm": 0.18987830126280159,
      "learning_rate": 1.6326530612244897e-05,
      "loss": 2.014,
      "step": 85
    },
    {
      "epoch": 8.586583463338533,
      "grad_norm": 0.17181919557458755,
      "learning_rate": 1.5306122448979594e-05,
      "loss": 2.0126,
      "step": 86
    },
    {
      "epoch": 8.686427457098285,
      "grad_norm": 0.1803239869304924,
      "learning_rate": 1.4285714285714285e-05,
      "loss": 2.0118,
      "step": 87
    },
    {
      "epoch": 8.786271450858035,
      "grad_norm": 0.16809838859014387,
      "learning_rate": 1.3265306122448982e-05,
      "loss": 2.0241,
      "step": 88
    },
    {
      "epoch": 8.886115444617785,
      "grad_norm": 0.20717433267642188,
      "learning_rate": 1.2244897959183674e-05,
      "loss": 2.0134,
      "step": 89
    },
    {
      "epoch": 8.985959438377535,
      "grad_norm": 0.16293892544531494,
      "learning_rate": 1.1224489795918369e-05,
      "loss": 2.0117,
      "step": 90
    },
    {
      "epoch": 9.085803432137286,
      "grad_norm": 0.16729720184195912,
      "learning_rate": 1.0204081632653061e-05,
      "loss": 2.0126,
      "step": 91
    },
    {
      "epoch": 9.185647425897036,
      "grad_norm": 0.22526874784083245,
      "learning_rate": 9.183673469387756e-06,
      "loss": 2.0024,
      "step": 92
    },
    {
      "epoch": 9.285491419656786,
      "grad_norm": 0.2184919299582014,
      "learning_rate": 8.163265306122448e-06,
      "loss": 1.9992,
      "step": 93
    },
    {
      "epoch": 9.385335413416536,
      "grad_norm": 0.17270901831612614,
      "learning_rate": 7.142857142857143e-06,
      "loss": 1.9999,
      "step": 94
    },
    {
      "epoch": 9.485179407176288,
      "grad_norm": 0.16098303914434564,
      "learning_rate": 6.122448979591837e-06,
      "loss": 2.0125,
      "step": 95
    },
    {
      "epoch": 9.585023400936038,
      "grad_norm": 0.17807241926954062,
      "learning_rate": 5.102040816326531e-06,
      "loss": 2.012,
      "step": 96
    },
    {
      "epoch": 9.684867394695788,
      "grad_norm": 0.21950474032603587,
      "learning_rate": 4.081632653061224e-06,
      "loss": 2.0172,
      "step": 97
    },
    {
      "epoch": 9.784711388455538,
      "grad_norm": 0.18080305990015563,
      "learning_rate": 3.0612244897959185e-06,
      "loss": 1.9969,
      "step": 98
    },
    {
      "epoch": 9.88455538221529,
      "grad_norm": 0.1715649658731182,
      "learning_rate": 2.040816326530612e-06,
      "loss": 2.0153,
      "step": 99
    },
    {
      "epoch": 9.98439937597504,
      "grad_norm": 0.16882262984886745,
      "learning_rate": 1.020408163265306e-06,
      "loss": 2.0115,
      "step": 100
    }
  ],
  "logging_steps": 1.0,
  "max_steps": 100,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 10,
  "save_steps": 1000,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 0.0,
  "train_batch_size": 64,
  "trial_name": null,
  "trial_params": null
}