tyzhu's picture
End of training
4cbb649 verified
raw
history blame contribute delete
No virus
22.2 kB
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 20.0,
"eval_steps": 500,
"global_step": 10580,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.1890359168241966,
"grad_norm": 0.37213513255119324,
"learning_rate": 0.0003,
"loss": 1.9201,
"step": 100
},
{
"epoch": 0.3780718336483932,
"grad_norm": 0.5696528553962708,
"learning_rate": 0.0003,
"loss": 1.8659,
"step": 200
},
{
"epoch": 0.5671077504725898,
"grad_norm": 0.44589683413505554,
"learning_rate": 0.0003,
"loss": 1.8382,
"step": 300
},
{
"epoch": 0.7561436672967864,
"grad_norm": 0.4036942720413208,
"learning_rate": 0.0003,
"loss": 1.8431,
"step": 400
},
{
"epoch": 0.945179584120983,
"grad_norm": 0.45892009139060974,
"learning_rate": 0.0003,
"loss": 1.8369,
"step": 500
},
{
"epoch": 1.0,
"eval_accuracy": 0.575076923076923,
"eval_loss": 1.6031557321548462,
"eval_runtime": 5.71,
"eval_samples_per_second": 87.566,
"eval_steps_per_second": 11.033,
"step": 529
},
{
"epoch": 1.1342155009451795,
"grad_norm": 0.4805261194705963,
"learning_rate": 0.0003,
"loss": 1.7006,
"step": 600
},
{
"epoch": 1.3232514177693762,
"grad_norm": 0.4189806878566742,
"learning_rate": 0.0003,
"loss": 1.6276,
"step": 700
},
{
"epoch": 1.5122873345935728,
"grad_norm": 0.4777927100658417,
"learning_rate": 0.0003,
"loss": 1.6294,
"step": 800
},
{
"epoch": 1.7013232514177694,
"grad_norm": 0.4425320327281952,
"learning_rate": 0.0003,
"loss": 1.6196,
"step": 900
},
{
"epoch": 1.8903591682419658,
"grad_norm": 0.5979380011558533,
"learning_rate": 0.0003,
"loss": 1.6451,
"step": 1000
},
{
"epoch": 2.0,
"eval_accuracy": 0.5745641025641026,
"eval_loss": 1.635698676109314,
"eval_runtime": 5.9951,
"eval_samples_per_second": 83.401,
"eval_steps_per_second": 10.509,
"step": 1058
},
{
"epoch": 2.0793950850661624,
"grad_norm": 0.5178828239440918,
"learning_rate": 0.0003,
"loss": 1.514,
"step": 1100
},
{
"epoch": 2.268431001890359,
"grad_norm": 0.7963550090789795,
"learning_rate": 0.0003,
"loss": 1.3597,
"step": 1200
},
{
"epoch": 2.4574669187145557,
"grad_norm": 0.7210434675216675,
"learning_rate": 0.0003,
"loss": 1.3774,
"step": 1300
},
{
"epoch": 2.6465028355387523,
"grad_norm": 0.6484540700912476,
"learning_rate": 0.0003,
"loss": 1.374,
"step": 1400
},
{
"epoch": 2.835538752362949,
"grad_norm": 0.6798349618911743,
"learning_rate": 0.0003,
"loss": 1.3703,
"step": 1500
},
{
"epoch": 3.0,
"eval_accuracy": 0.5716410256410256,
"eval_loss": 1.7677161693572998,
"eval_runtime": 5.9596,
"eval_samples_per_second": 83.899,
"eval_steps_per_second": 10.571,
"step": 1587
},
{
"epoch": 3.0245746691871456,
"grad_norm": 0.6527106761932373,
"learning_rate": 0.0003,
"loss": 1.3419,
"step": 1600
},
{
"epoch": 3.213610586011342,
"grad_norm": 0.6613947749137878,
"learning_rate": 0.0003,
"loss": 1.0932,
"step": 1700
},
{
"epoch": 3.402646502835539,
"grad_norm": 0.7362242341041565,
"learning_rate": 0.0003,
"loss": 1.1188,
"step": 1800
},
{
"epoch": 3.5916824196597354,
"grad_norm": 0.6629425287246704,
"learning_rate": 0.0003,
"loss": 1.142,
"step": 1900
},
{
"epoch": 3.780718336483932,
"grad_norm": 0.789070725440979,
"learning_rate": 0.0003,
"loss": 1.1661,
"step": 2000
},
{
"epoch": 3.9697542533081287,
"grad_norm": 0.7567113637924194,
"learning_rate": 0.0003,
"loss": 1.1817,
"step": 2100
},
{
"epoch": 4.0,
"eval_accuracy": 0.5718461538461539,
"eval_loss": 1.8587489128112793,
"eval_runtime": 5.7528,
"eval_samples_per_second": 86.914,
"eval_steps_per_second": 10.951,
"step": 2116
},
{
"epoch": 4.158790170132325,
"grad_norm": 0.7041072249412537,
"learning_rate": 0.0003,
"loss": 0.9335,
"step": 2200
},
{
"epoch": 4.3478260869565215,
"grad_norm": 0.7739288210868835,
"learning_rate": 0.0003,
"loss": 0.9257,
"step": 2300
},
{
"epoch": 4.536862003780718,
"grad_norm": 0.9699936509132385,
"learning_rate": 0.0003,
"loss": 0.9234,
"step": 2400
},
{
"epoch": 4.725897920604915,
"grad_norm": 0.8464515209197998,
"learning_rate": 0.0003,
"loss": 0.9547,
"step": 2500
},
{
"epoch": 4.914933837429111,
"grad_norm": 0.7238239049911499,
"learning_rate": 0.0003,
"loss": 0.9674,
"step": 2600
},
{
"epoch": 5.0,
"eval_accuracy": 0.5712820512820512,
"eval_loss": 1.931915283203125,
"eval_runtime": 5.7956,
"eval_samples_per_second": 86.272,
"eval_steps_per_second": 10.87,
"step": 2645
},
{
"epoch": 5.103969754253308,
"grad_norm": 0.8031799793243408,
"learning_rate": 0.0003,
"loss": 0.8353,
"step": 2700
},
{
"epoch": 5.293005671077505,
"grad_norm": 0.862354576587677,
"learning_rate": 0.0003,
"loss": 0.731,
"step": 2800
},
{
"epoch": 5.482041587901701,
"grad_norm": 0.9067243337631226,
"learning_rate": 0.0003,
"loss": 0.7515,
"step": 2900
},
{
"epoch": 5.671077504725898,
"grad_norm": 0.9791676998138428,
"learning_rate": 0.0003,
"loss": 0.7769,
"step": 3000
},
{
"epoch": 5.8601134215500945,
"grad_norm": 0.9806828498840332,
"learning_rate": 0.0003,
"loss": 0.7936,
"step": 3100
},
{
"epoch": 6.0,
"eval_accuracy": 0.5704102564102564,
"eval_loss": 1.993375301361084,
"eval_runtime": 5.8166,
"eval_samples_per_second": 85.96,
"eval_steps_per_second": 10.831,
"step": 3174
},
{
"epoch": 6.049149338374291,
"grad_norm": 0.8496273159980774,
"learning_rate": 0.0003,
"loss": 0.749,
"step": 3200
},
{
"epoch": 6.238185255198488,
"grad_norm": 0.8800780177116394,
"learning_rate": 0.0003,
"loss": 0.5822,
"step": 3300
},
{
"epoch": 6.427221172022684,
"grad_norm": 0.8136658668518066,
"learning_rate": 0.0003,
"loss": 0.6114,
"step": 3400
},
{
"epoch": 6.616257088846881,
"grad_norm": 1.0112674236297607,
"learning_rate": 0.0003,
"loss": 0.6329,
"step": 3500
},
{
"epoch": 6.805293005671078,
"grad_norm": 0.8546850681304932,
"learning_rate": 0.0003,
"loss": 0.6499,
"step": 3600
},
{
"epoch": 6.994328922495274,
"grad_norm": 0.947127640247345,
"learning_rate": 0.0003,
"loss": 0.67,
"step": 3700
},
{
"epoch": 7.0,
"eval_accuracy": 0.5683589743589743,
"eval_loss": 2.046682357788086,
"eval_runtime": 5.9678,
"eval_samples_per_second": 83.784,
"eval_steps_per_second": 10.557,
"step": 3703
},
{
"epoch": 7.183364839319471,
"grad_norm": 1.0099776983261108,
"learning_rate": 0.0003,
"loss": 0.477,
"step": 3800
},
{
"epoch": 7.3724007561436675,
"grad_norm": 0.8788864016532898,
"learning_rate": 0.0003,
"loss": 0.4951,
"step": 3900
},
{
"epoch": 7.561436672967864,
"grad_norm": 0.9243162274360657,
"learning_rate": 0.0003,
"loss": 0.5141,
"step": 4000
},
{
"epoch": 7.750472589792061,
"grad_norm": 1.0089187622070312,
"learning_rate": 0.0003,
"loss": 0.5317,
"step": 4100
},
{
"epoch": 7.939508506616257,
"grad_norm": 0.9674586057662964,
"learning_rate": 0.0003,
"loss": 0.5604,
"step": 4200
},
{
"epoch": 8.0,
"eval_accuracy": 0.5692820512820512,
"eval_loss": 2.121832847595215,
"eval_runtime": 5.6926,
"eval_samples_per_second": 87.833,
"eval_steps_per_second": 11.067,
"step": 4232
},
{
"epoch": 8.128544423440454,
"grad_norm": 0.8500985503196716,
"learning_rate": 0.0003,
"loss": 0.4532,
"step": 4300
},
{
"epoch": 8.31758034026465,
"grad_norm": 0.8404316902160645,
"learning_rate": 0.0003,
"loss": 0.4173,
"step": 4400
},
{
"epoch": 8.506616257088847,
"grad_norm": 0.9724924564361572,
"learning_rate": 0.0003,
"loss": 0.4386,
"step": 4500
},
{
"epoch": 8.695652173913043,
"grad_norm": 0.9974843859672546,
"learning_rate": 0.0003,
"loss": 0.453,
"step": 4600
},
{
"epoch": 8.88468809073724,
"grad_norm": 0.9835983514785767,
"learning_rate": 0.0003,
"loss": 0.4747,
"step": 4700
},
{
"epoch": 9.0,
"eval_accuracy": 0.5682051282051283,
"eval_loss": 2.1342432498931885,
"eval_runtime": 5.6644,
"eval_samples_per_second": 88.271,
"eval_steps_per_second": 11.122,
"step": 4761
},
{
"epoch": 9.073724007561436,
"grad_norm": 0.7832907438278198,
"learning_rate": 0.0003,
"loss": 0.4297,
"step": 4800
},
{
"epoch": 9.262759924385634,
"grad_norm": 0.9495198130607605,
"learning_rate": 0.0003,
"loss": 0.3607,
"step": 4900
},
{
"epoch": 9.45179584120983,
"grad_norm": 0.8801999688148499,
"learning_rate": 0.0003,
"loss": 0.3836,
"step": 5000
},
{
"epoch": 9.640831758034027,
"grad_norm": 0.9312089681625366,
"learning_rate": 0.0003,
"loss": 0.4013,
"step": 5100
},
{
"epoch": 9.829867674858223,
"grad_norm": 1.0668907165527344,
"learning_rate": 0.0003,
"loss": 0.4191,
"step": 5200
},
{
"epoch": 10.0,
"eval_accuracy": 0.5673846153846154,
"eval_loss": 2.1679139137268066,
"eval_runtime": 5.6062,
"eval_samples_per_second": 89.186,
"eval_steps_per_second": 11.237,
"step": 5290
},
{
"epoch": 10.01890359168242,
"grad_norm": 0.8078038096427917,
"learning_rate": 0.0003,
"loss": 0.42,
"step": 5300
},
{
"epoch": 10.207939508506616,
"grad_norm": 0.9549182653427124,
"learning_rate": 0.0003,
"loss": 0.329,
"step": 5400
},
{
"epoch": 10.396975425330814,
"grad_norm": 1.0073105096817017,
"learning_rate": 0.0003,
"loss": 0.3521,
"step": 5500
},
{
"epoch": 10.58601134215501,
"grad_norm": 1.0606422424316406,
"learning_rate": 0.0003,
"loss": 0.3671,
"step": 5600
},
{
"epoch": 10.775047258979207,
"grad_norm": 0.8719451427459717,
"learning_rate": 0.0003,
"loss": 0.3818,
"step": 5700
},
{
"epoch": 10.964083175803403,
"grad_norm": 1.0333566665649414,
"learning_rate": 0.0003,
"loss": 0.3971,
"step": 5800
},
{
"epoch": 11.0,
"eval_accuracy": 0.5657948717948718,
"eval_loss": 2.2080512046813965,
"eval_runtime": 5.6645,
"eval_samples_per_second": 88.27,
"eval_steps_per_second": 11.122,
"step": 5819
},
{
"epoch": 11.1531190926276,
"grad_norm": 0.84638911485672,
"learning_rate": 0.0003,
"loss": 0.3323,
"step": 5900
},
{
"epoch": 11.342155009451796,
"grad_norm": 0.8573931455612183,
"learning_rate": 0.0003,
"loss": 0.3304,
"step": 6000
},
{
"epoch": 11.531190926275993,
"grad_norm": 1.0611106157302856,
"learning_rate": 0.0003,
"loss": 0.3432,
"step": 6100
},
{
"epoch": 11.720226843100189,
"grad_norm": 0.8618783950805664,
"learning_rate": 0.0003,
"loss": 0.3601,
"step": 6200
},
{
"epoch": 11.909262759924385,
"grad_norm": 1.0319652557373047,
"learning_rate": 0.0003,
"loss": 0.3753,
"step": 6300
},
{
"epoch": 12.0,
"eval_accuracy": 0.5663589743589743,
"eval_loss": 2.184002637863159,
"eval_runtime": 5.7032,
"eval_samples_per_second": 87.67,
"eval_steps_per_second": 11.046,
"step": 6348
},
{
"epoch": 12.098298676748582,
"grad_norm": 0.9860585331916809,
"learning_rate": 0.0003,
"loss": 0.3454,
"step": 6400
},
{
"epoch": 12.287334593572778,
"grad_norm": 0.8168879747390747,
"learning_rate": 0.0003,
"loss": 0.317,
"step": 6500
},
{
"epoch": 12.476370510396976,
"grad_norm": 0.8994132876396179,
"learning_rate": 0.0003,
"loss": 0.3286,
"step": 6600
},
{
"epoch": 12.665406427221171,
"grad_norm": 0.9378811120986938,
"learning_rate": 0.0003,
"loss": 0.34,
"step": 6700
},
{
"epoch": 12.854442344045369,
"grad_norm": 0.8742515444755554,
"learning_rate": 0.0003,
"loss": 0.3571,
"step": 6800
},
{
"epoch": 13.0,
"eval_accuracy": 0.5633846153846154,
"eval_loss": 2.232431411743164,
"eval_runtime": 5.6334,
"eval_samples_per_second": 88.756,
"eval_steps_per_second": 11.183,
"step": 6877
},
{
"epoch": 13.043478260869565,
"grad_norm": 0.8000278472900391,
"learning_rate": 0.0003,
"loss": 0.3518,
"step": 6900
},
{
"epoch": 13.232514177693762,
"grad_norm": 0.7633320689201355,
"learning_rate": 0.0003,
"loss": 0.3046,
"step": 7000
},
{
"epoch": 13.421550094517958,
"grad_norm": 0.9149838089942932,
"learning_rate": 0.0003,
"loss": 0.3217,
"step": 7100
},
{
"epoch": 13.610586011342155,
"grad_norm": 0.9108119010925293,
"learning_rate": 0.0003,
"loss": 0.3315,
"step": 7200
},
{
"epoch": 13.799621928166351,
"grad_norm": 0.8181828260421753,
"learning_rate": 0.0003,
"loss": 0.3454,
"step": 7300
},
{
"epoch": 13.988657844990549,
"grad_norm": 0.9064919352531433,
"learning_rate": 0.0003,
"loss": 0.3526,
"step": 7400
},
{
"epoch": 14.0,
"eval_accuracy": 0.5631794871794872,
"eval_loss": 2.2189676761627197,
"eval_runtime": 5.8178,
"eval_samples_per_second": 85.943,
"eval_steps_per_second": 10.829,
"step": 7406
},
{
"epoch": 14.177693761814744,
"grad_norm": 0.8295121788978577,
"learning_rate": 0.0003,
"loss": 0.3011,
"step": 7500
},
{
"epoch": 14.366729678638942,
"grad_norm": 0.78739994764328,
"learning_rate": 0.0003,
"loss": 0.3091,
"step": 7600
},
{
"epoch": 14.555765595463138,
"grad_norm": 0.9230947494506836,
"learning_rate": 0.0003,
"loss": 0.3178,
"step": 7700
},
{
"epoch": 14.744801512287335,
"grad_norm": 0.8687440752983093,
"learning_rate": 0.0003,
"loss": 0.3298,
"step": 7800
},
{
"epoch": 14.93383742911153,
"grad_norm": 0.7883646488189697,
"learning_rate": 0.0003,
"loss": 0.35,
"step": 7900
},
{
"epoch": 15.0,
"eval_accuracy": 0.5638974358974359,
"eval_loss": 2.208575487136841,
"eval_runtime": 5.8338,
"eval_samples_per_second": 85.707,
"eval_steps_per_second": 10.799,
"step": 7935
},
{
"epoch": 15.122873345935728,
"grad_norm": 0.8580202460289001,
"learning_rate": 0.0003,
"loss": 0.3089,
"step": 8000
},
{
"epoch": 15.311909262759924,
"grad_norm": 1.0185339450836182,
"learning_rate": 0.0003,
"loss": 0.2966,
"step": 8100
},
{
"epoch": 15.500945179584122,
"grad_norm": 1.0962390899658203,
"learning_rate": 0.0003,
"loss": 0.3135,
"step": 8200
},
{
"epoch": 15.689981096408317,
"grad_norm": 0.9543150663375854,
"learning_rate": 0.0003,
"loss": 0.3258,
"step": 8300
},
{
"epoch": 15.879017013232515,
"grad_norm": 0.9073179364204407,
"learning_rate": 0.0003,
"loss": 0.3323,
"step": 8400
},
{
"epoch": 16.0,
"eval_accuracy": 0.5653846153846154,
"eval_loss": 2.2654531002044678,
"eval_runtime": 5.7427,
"eval_samples_per_second": 87.067,
"eval_steps_per_second": 10.97,
"step": 8464
},
{
"epoch": 16.068052930056712,
"grad_norm": 0.9761775135993958,
"learning_rate": 0.0003,
"loss": 0.3298,
"step": 8500
},
{
"epoch": 16.257088846880908,
"grad_norm": 0.8946503400802612,
"learning_rate": 0.0003,
"loss": 0.2945,
"step": 8600
},
{
"epoch": 16.446124763705104,
"grad_norm": 0.8116426467895508,
"learning_rate": 0.0003,
"loss": 0.3051,
"step": 8700
},
{
"epoch": 16.6351606805293,
"grad_norm": 0.8611814379692078,
"learning_rate": 0.0003,
"loss": 0.3145,
"step": 8800
},
{
"epoch": 16.8241965973535,
"grad_norm": 0.9202536940574646,
"learning_rate": 0.0003,
"loss": 0.3281,
"step": 8900
},
{
"epoch": 17.0,
"eval_accuracy": 0.5667179487179487,
"eval_loss": 2.244356870651245,
"eval_runtime": 5.9955,
"eval_samples_per_second": 83.396,
"eval_steps_per_second": 10.508,
"step": 8993
},
{
"epoch": 17.013232514177695,
"grad_norm": 0.7778891324996948,
"learning_rate": 0.0003,
"loss": 0.3344,
"step": 9000
},
{
"epoch": 17.20226843100189,
"grad_norm": 0.7352868914604187,
"learning_rate": 0.0003,
"loss": 0.2843,
"step": 9100
},
{
"epoch": 17.391304347826086,
"grad_norm": 0.7922298908233643,
"learning_rate": 0.0003,
"loss": 0.2941,
"step": 9200
},
{
"epoch": 17.58034026465028,
"grad_norm": 1.0191751718521118,
"learning_rate": 0.0003,
"loss": 0.3164,
"step": 9300
},
{
"epoch": 17.76937618147448,
"grad_norm": 1.0436335802078247,
"learning_rate": 0.0003,
"loss": 0.321,
"step": 9400
},
{
"epoch": 17.958412098298677,
"grad_norm": 0.8792089223861694,
"learning_rate": 0.0003,
"loss": 0.3328,
"step": 9500
},
{
"epoch": 18.0,
"eval_accuracy": 0.5626153846153846,
"eval_loss": 2.2596943378448486,
"eval_runtime": 6.0856,
"eval_samples_per_second": 82.161,
"eval_steps_per_second": 10.352,
"step": 9522
},
{
"epoch": 18.147448015122873,
"grad_norm": 0.6146367788314819,
"learning_rate": 0.0003,
"loss": 0.2967,
"step": 9600
},
{
"epoch": 18.33648393194707,
"grad_norm": 0.8163793683052063,
"learning_rate": 0.0003,
"loss": 0.2958,
"step": 9700
},
{
"epoch": 18.525519848771268,
"grad_norm": 0.8955701589584351,
"learning_rate": 0.0003,
"loss": 0.3065,
"step": 9800
},
{
"epoch": 18.714555765595463,
"grad_norm": 0.7934162020683289,
"learning_rate": 0.0003,
"loss": 0.3121,
"step": 9900
},
{
"epoch": 18.90359168241966,
"grad_norm": 1.07945716381073,
"learning_rate": 0.0003,
"loss": 0.3305,
"step": 10000
},
{
"epoch": 19.0,
"eval_accuracy": 0.5633333333333334,
"eval_loss": 2.2682220935821533,
"eval_runtime": 5.8794,
"eval_samples_per_second": 85.042,
"eval_steps_per_second": 10.715,
"step": 10051
},
{
"epoch": 19.092627599243855,
"grad_norm": 0.694956362247467,
"learning_rate": 0.0003,
"loss": 0.3069,
"step": 10100
},
{
"epoch": 19.281663516068054,
"grad_norm": 0.8144003748893738,
"learning_rate": 0.0003,
"loss": 0.2879,
"step": 10200
},
{
"epoch": 19.47069943289225,
"grad_norm": 0.7943779826164246,
"learning_rate": 0.0003,
"loss": 0.3002,
"step": 10300
},
{
"epoch": 19.659735349716446,
"grad_norm": 0.7781754732131958,
"learning_rate": 0.0003,
"loss": 0.3107,
"step": 10400
},
{
"epoch": 19.84877126654064,
"grad_norm": 0.7696000933647156,
"learning_rate": 0.0003,
"loss": 0.3228,
"step": 10500
},
{
"epoch": 20.0,
"eval_accuracy": 0.5640512820512821,
"eval_loss": 2.253227472305298,
"eval_runtime": 5.7574,
"eval_samples_per_second": 86.845,
"eval_steps_per_second": 10.943,
"step": 10580
},
{
"epoch": 20.0,
"step": 10580,
"total_flos": 6.517631969856061e+17,
"train_loss": 0.6476908660340625,
"train_runtime": 24281.4349,
"train_samples_per_second": 13.941,
"train_steps_per_second": 0.436
}
],
"logging_steps": 100,
"max_steps": 10580,
"num_input_tokens_seen": 0,
"num_train_epochs": 20,
"save_steps": 500,
"total_flos": 6.517631969856061e+17,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}