|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.6, |
|
"eval_steps": 500, |
|
"global_step": 10000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.016, |
|
"grad_norm": 1.2292424440383911, |
|
"learning_rate": 9.949748743718594e-05, |
|
"loss": 2.6316, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.032, |
|
"grad_norm": 1.361342191696167, |
|
"learning_rate": 9.84924623115578e-05, |
|
"loss": 1.9567, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.048, |
|
"grad_norm": 1.3794206380844116, |
|
"learning_rate": 9.748743718592965e-05, |
|
"loss": 1.8415, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.064, |
|
"grad_norm": 1.2263352870941162, |
|
"learning_rate": 9.64824120603015e-05, |
|
"loss": 1.8072, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 1.1767858266830444, |
|
"learning_rate": 9.547738693467337e-05, |
|
"loss": 1.7898, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.096, |
|
"grad_norm": 1.173898696899414, |
|
"learning_rate": 9.447236180904523e-05, |
|
"loss": 1.7314, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.112, |
|
"grad_norm": 1.2088936567306519, |
|
"learning_rate": 9.34673366834171e-05, |
|
"loss": 1.7011, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.128, |
|
"grad_norm": 0.9866878986358643, |
|
"learning_rate": 9.246231155778895e-05, |
|
"loss": 1.6663, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.144, |
|
"grad_norm": 1.2384347915649414, |
|
"learning_rate": 9.14572864321608e-05, |
|
"loss": 1.6507, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 1.065297245979309, |
|
"learning_rate": 9.045226130653267e-05, |
|
"loss": 1.6705, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.176, |
|
"grad_norm": 1.1226449012756348, |
|
"learning_rate": 8.944723618090453e-05, |
|
"loss": 1.6577, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.192, |
|
"grad_norm": 0.9142518639564514, |
|
"learning_rate": 8.84422110552764e-05, |
|
"loss": 1.612, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.208, |
|
"grad_norm": 1.1804460287094116, |
|
"learning_rate": 8.743718592964825e-05, |
|
"loss": 1.6083, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.224, |
|
"grad_norm": 1.1100006103515625, |
|
"learning_rate": 8.64321608040201e-05, |
|
"loss": 1.6242, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 1.1566694974899292, |
|
"learning_rate": 8.542713567839196e-05, |
|
"loss": 1.6111, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.256, |
|
"grad_norm": 1.094859004020691, |
|
"learning_rate": 8.442211055276383e-05, |
|
"loss": 1.5816, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.272, |
|
"grad_norm": 1.2286021709442139, |
|
"learning_rate": 8.341708542713568e-05, |
|
"loss": 1.578, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.288, |
|
"grad_norm": 1.0682488679885864, |
|
"learning_rate": 8.241206030150754e-05, |
|
"loss": 1.5795, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.304, |
|
"grad_norm": 1.1403608322143555, |
|
"learning_rate": 8.14070351758794e-05, |
|
"loss": 1.5676, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 1.0942330360412598, |
|
"learning_rate": 8.040201005025126e-05, |
|
"loss": 1.5749, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.336, |
|
"grad_norm": 1.060088872909546, |
|
"learning_rate": 7.939698492462313e-05, |
|
"loss": 1.5184, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.352, |
|
"grad_norm": 1.085312008857727, |
|
"learning_rate": 7.839195979899498e-05, |
|
"loss": 1.5339, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.368, |
|
"grad_norm": 1.303536295890808, |
|
"learning_rate": 7.738693467336684e-05, |
|
"loss": 1.515, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.384, |
|
"grad_norm": 0.9337490797042847, |
|
"learning_rate": 7.638190954773869e-05, |
|
"loss": 1.5243, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 1.2959569692611694, |
|
"learning_rate": 7.537688442211056e-05, |
|
"loss": 1.4846, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.416, |
|
"grad_norm": 1.1419408321380615, |
|
"learning_rate": 7.437185929648241e-05, |
|
"loss": 1.5158, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.432, |
|
"grad_norm": 0.9983295202255249, |
|
"learning_rate": 7.336683417085427e-05, |
|
"loss": 1.4873, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 0.448, |
|
"grad_norm": 1.1773889064788818, |
|
"learning_rate": 7.236180904522614e-05, |
|
"loss": 1.4894, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 0.464, |
|
"grad_norm": 1.2258810997009277, |
|
"learning_rate": 7.135678391959799e-05, |
|
"loss": 1.5015, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 1.1287764310836792, |
|
"learning_rate": 7.035175879396985e-05, |
|
"loss": 1.5166, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.496, |
|
"grad_norm": 1.1293085813522339, |
|
"learning_rate": 6.93467336683417e-05, |
|
"loss": 1.5117, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 0.512, |
|
"grad_norm": 1.0602566003799438, |
|
"learning_rate": 6.834170854271357e-05, |
|
"loss": 1.455, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 0.528, |
|
"grad_norm": 1.2482367753982544, |
|
"learning_rate": 6.733668341708544e-05, |
|
"loss": 1.4356, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 0.544, |
|
"grad_norm": 1.35064697265625, |
|
"learning_rate": 6.633165829145729e-05, |
|
"loss": 1.4528, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 1.065523386001587, |
|
"learning_rate": 6.532663316582915e-05, |
|
"loss": 1.4706, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.576, |
|
"grad_norm": 1.4030505418777466, |
|
"learning_rate": 6.4321608040201e-05, |
|
"loss": 1.4325, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 0.592, |
|
"grad_norm": 1.1023573875427246, |
|
"learning_rate": 6.331658291457287e-05, |
|
"loss": 1.455, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 0.608, |
|
"grad_norm": 1.179084062576294, |
|
"learning_rate": 6.231155778894473e-05, |
|
"loss": 1.4552, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 0.624, |
|
"grad_norm": 1.0885223150253296, |
|
"learning_rate": 6.130653266331658e-05, |
|
"loss": 1.4178, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 1.3725833892822266, |
|
"learning_rate": 6.030150753768844e-05, |
|
"loss": 1.456, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.656, |
|
"grad_norm": 1.1671427488327026, |
|
"learning_rate": 5.929648241206031e-05, |
|
"loss": 1.4552, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 0.672, |
|
"grad_norm": 1.0521718263626099, |
|
"learning_rate": 5.829145728643216e-05, |
|
"loss": 1.4236, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 0.688, |
|
"grad_norm": 1.1262151002883911, |
|
"learning_rate": 5.728643216080403e-05, |
|
"loss": 1.456, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 0.704, |
|
"grad_norm": 1.090331792831421, |
|
"learning_rate": 5.628140703517588e-05, |
|
"loss": 1.4021, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 1.1581507921218872, |
|
"learning_rate": 5.527638190954774e-05, |
|
"loss": 1.4708, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.736, |
|
"grad_norm": 1.1916351318359375, |
|
"learning_rate": 5.4271356783919604e-05, |
|
"loss": 1.4283, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 0.752, |
|
"grad_norm": 1.2623261213302612, |
|
"learning_rate": 5.3266331658291455e-05, |
|
"loss": 1.4593, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 0.768, |
|
"grad_norm": 1.2002214193344116, |
|
"learning_rate": 5.226130653266332e-05, |
|
"loss": 1.4387, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 0.784, |
|
"grad_norm": 1.0627392530441284, |
|
"learning_rate": 5.125628140703518e-05, |
|
"loss": 1.4313, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 1.2739390134811401, |
|
"learning_rate": 5.0251256281407036e-05, |
|
"loss": 1.4024, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.816, |
|
"grad_norm": 1.3108317852020264, |
|
"learning_rate": 4.92462311557789e-05, |
|
"loss": 1.4385, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 0.832, |
|
"grad_norm": 1.4682525396347046, |
|
"learning_rate": 4.824120603015075e-05, |
|
"loss": 1.4015, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 0.848, |
|
"grad_norm": 1.301832675933838, |
|
"learning_rate": 4.723618090452262e-05, |
|
"loss": 1.3995, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 0.864, |
|
"grad_norm": 1.3100578784942627, |
|
"learning_rate": 4.6231155778894475e-05, |
|
"loss": 1.4203, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 1.2472883462905884, |
|
"learning_rate": 4.522613065326633e-05, |
|
"loss": 1.3984, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 0.896, |
|
"grad_norm": 1.1501699686050415, |
|
"learning_rate": 4.42211055276382e-05, |
|
"loss": 1.4177, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 0.912, |
|
"grad_norm": 1.306634783744812, |
|
"learning_rate": 4.321608040201005e-05, |
|
"loss": 1.4013, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 0.928, |
|
"grad_norm": 1.199546217918396, |
|
"learning_rate": 4.2211055276381914e-05, |
|
"loss": 1.3998, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 0.944, |
|
"grad_norm": 1.4669443368911743, |
|
"learning_rate": 4.120603015075377e-05, |
|
"loss": 1.3858, |
|
"step": 5900 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 1.1618568897247314, |
|
"learning_rate": 4.020100502512563e-05, |
|
"loss": 1.3952, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.976, |
|
"grad_norm": 1.3658894300460815, |
|
"learning_rate": 3.919597989949749e-05, |
|
"loss": 1.34, |
|
"step": 6100 |
|
}, |
|
{ |
|
"epoch": 0.992, |
|
"grad_norm": 1.1548917293548584, |
|
"learning_rate": 3.8190954773869346e-05, |
|
"loss": 1.3753, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 1.008, |
|
"grad_norm": 1.250981092453003, |
|
"learning_rate": 3.7185929648241204e-05, |
|
"loss": 1.363, |
|
"step": 6300 |
|
}, |
|
{ |
|
"epoch": 1.024, |
|
"grad_norm": 1.1988142728805542, |
|
"learning_rate": 3.618090452261307e-05, |
|
"loss": 1.2739, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 1.04, |
|
"grad_norm": 1.3094350099563599, |
|
"learning_rate": 3.517587939698493e-05, |
|
"loss": 1.3268, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 1.056, |
|
"grad_norm": 1.4513778686523438, |
|
"learning_rate": 3.4170854271356785e-05, |
|
"loss": 1.3114, |
|
"step": 6600 |
|
}, |
|
{ |
|
"epoch": 1.072, |
|
"grad_norm": 1.2981783151626587, |
|
"learning_rate": 3.3165829145728643e-05, |
|
"loss": 1.2866, |
|
"step": 6700 |
|
}, |
|
{ |
|
"epoch": 1.088, |
|
"grad_norm": 1.350372314453125, |
|
"learning_rate": 3.21608040201005e-05, |
|
"loss": 1.2909, |
|
"step": 6800 |
|
}, |
|
{ |
|
"epoch": 1.104, |
|
"grad_norm": 1.1077184677124023, |
|
"learning_rate": 3.1155778894472366e-05, |
|
"loss": 1.2278, |
|
"step": 6900 |
|
}, |
|
{ |
|
"epoch": 1.12, |
|
"grad_norm": 1.3056607246398926, |
|
"learning_rate": 3.015075376884422e-05, |
|
"loss": 1.2573, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 1.1360000000000001, |
|
"grad_norm": 1.38368558883667, |
|
"learning_rate": 2.914572864321608e-05, |
|
"loss": 1.3041, |
|
"step": 7100 |
|
}, |
|
{ |
|
"epoch": 1.152, |
|
"grad_norm": 1.7526077032089233, |
|
"learning_rate": 2.814070351758794e-05, |
|
"loss": 1.3056, |
|
"step": 7200 |
|
}, |
|
{ |
|
"epoch": 1.168, |
|
"grad_norm": 1.1916877031326294, |
|
"learning_rate": 2.7135678391959802e-05, |
|
"loss": 1.2359, |
|
"step": 7300 |
|
}, |
|
{ |
|
"epoch": 1.184, |
|
"grad_norm": 1.326968789100647, |
|
"learning_rate": 2.613065326633166e-05, |
|
"loss": 1.2529, |
|
"step": 7400 |
|
}, |
|
{ |
|
"epoch": 1.2, |
|
"grad_norm": 1.502866506576538, |
|
"learning_rate": 2.5125628140703518e-05, |
|
"loss": 1.3043, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 1.216, |
|
"grad_norm": 1.7037489414215088, |
|
"learning_rate": 2.4120603015075376e-05, |
|
"loss": 1.3254, |
|
"step": 7600 |
|
}, |
|
{ |
|
"epoch": 1.232, |
|
"grad_norm": 1.3369475603103638, |
|
"learning_rate": 2.3115577889447238e-05, |
|
"loss": 1.3274, |
|
"step": 7700 |
|
}, |
|
{ |
|
"epoch": 1.248, |
|
"grad_norm": 1.3407210111618042, |
|
"learning_rate": 2.21105527638191e-05, |
|
"loss": 1.2879, |
|
"step": 7800 |
|
}, |
|
{ |
|
"epoch": 1.264, |
|
"grad_norm": 1.5996978282928467, |
|
"learning_rate": 2.1105527638190957e-05, |
|
"loss": 1.2853, |
|
"step": 7900 |
|
}, |
|
{ |
|
"epoch": 1.28, |
|
"grad_norm": 1.3061344623565674, |
|
"learning_rate": 2.0100502512562815e-05, |
|
"loss": 1.274, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 1.296, |
|
"grad_norm": 1.335577130317688, |
|
"learning_rate": 1.9105527638190956e-05, |
|
"loss": 1.2482, |
|
"step": 8100 |
|
}, |
|
{ |
|
"epoch": 1.312, |
|
"grad_norm": 1.632110834121704, |
|
"learning_rate": 1.8100502512562814e-05, |
|
"loss": 1.2849, |
|
"step": 8200 |
|
}, |
|
{ |
|
"epoch": 1.328, |
|
"grad_norm": 1.457372784614563, |
|
"learning_rate": 1.7095477386934675e-05, |
|
"loss": 1.27, |
|
"step": 8300 |
|
}, |
|
{ |
|
"epoch": 1.3439999999999999, |
|
"grad_norm": 1.3104965686798096, |
|
"learning_rate": 1.6090452261306533e-05, |
|
"loss": 1.2698, |
|
"step": 8400 |
|
}, |
|
{ |
|
"epoch": 1.3599999999999999, |
|
"grad_norm": 1.350401520729065, |
|
"learning_rate": 1.5085427135678393e-05, |
|
"loss": 1.2337, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 1.376, |
|
"grad_norm": 1.3079415559768677, |
|
"learning_rate": 1.4080402010050253e-05, |
|
"loss": 1.2904, |
|
"step": 8600 |
|
}, |
|
{ |
|
"epoch": 1.392, |
|
"grad_norm": 1.3506203889846802, |
|
"learning_rate": 1.3075376884422111e-05, |
|
"loss": 1.2847, |
|
"step": 8700 |
|
}, |
|
{ |
|
"epoch": 1.408, |
|
"grad_norm": 1.4178451299667358, |
|
"learning_rate": 1.2070351758793969e-05, |
|
"loss": 1.2713, |
|
"step": 8800 |
|
}, |
|
{ |
|
"epoch": 1.424, |
|
"grad_norm": 1.2672168016433716, |
|
"learning_rate": 1.106532663316583e-05, |
|
"loss": 1.2634, |
|
"step": 8900 |
|
}, |
|
{ |
|
"epoch": 1.44, |
|
"grad_norm": 1.4467307329177856, |
|
"learning_rate": 1.0070351758793971e-05, |
|
"loss": 1.2868, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 1.456, |
|
"grad_norm": 1.5032036304473877, |
|
"learning_rate": 9.06532663316583e-06, |
|
"loss": 1.2323, |
|
"step": 9100 |
|
}, |
|
{ |
|
"epoch": 1.472, |
|
"grad_norm": 1.1872940063476562, |
|
"learning_rate": 8.060301507537689e-06, |
|
"loss": 1.2868, |
|
"step": 9200 |
|
}, |
|
{ |
|
"epoch": 1.488, |
|
"grad_norm": 1.6626771688461304, |
|
"learning_rate": 7.055276381909548e-06, |
|
"loss": 1.27, |
|
"step": 9300 |
|
}, |
|
{ |
|
"epoch": 1.504, |
|
"grad_norm": 1.3130452632904053, |
|
"learning_rate": 6.050251256281407e-06, |
|
"loss": 1.2542, |
|
"step": 9400 |
|
}, |
|
{ |
|
"epoch": 1.52, |
|
"grad_norm": 1.4746296405792236, |
|
"learning_rate": 5.045226130653267e-06, |
|
"loss": 1.257, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 1.536, |
|
"grad_norm": 1.3648103475570679, |
|
"learning_rate": 4.0402010050251256e-06, |
|
"loss": 1.2487, |
|
"step": 9600 |
|
}, |
|
{ |
|
"epoch": 1.552, |
|
"grad_norm": 1.3191380500793457, |
|
"learning_rate": 3.035175879396985e-06, |
|
"loss": 1.2557, |
|
"step": 9700 |
|
}, |
|
{ |
|
"epoch": 1.568, |
|
"grad_norm": 1.806413173675537, |
|
"learning_rate": 2.0301507537688442e-06, |
|
"loss": 1.2323, |
|
"step": 9800 |
|
}, |
|
{ |
|
"epoch": 1.584, |
|
"grad_norm": 1.6092606782913208, |
|
"learning_rate": 1.0251256281407035e-06, |
|
"loss": 1.2321, |
|
"step": 9900 |
|
}, |
|
{ |
|
"epoch": 1.6, |
|
"grad_norm": 1.3367202281951904, |
|
"learning_rate": 2.0100502512562817e-08, |
|
"loss": 1.2361, |
|
"step": 10000 |
|
} |
|
], |
|
"logging_steps": 100, |
|
"max_steps": 10000, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 2, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 7.995709021001564e+17, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|