GPT2-From-Scratch / trainer_state.json
SimsConsulting's picture
Upload 7 files
336a0d7
{
"best_metric": 0.28431499004364014,
"best_model_checkpoint": "./new_models/gpt2/checkpoint-25000",
"epoch": 168.83116883116884,
"global_step": 39000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 3.46,
"learning_rate": 4.000000000000001e-06,
"loss": 9.4041,
"step": 100
},
{
"epoch": 6.93,
"learning_rate": 8.000000000000001e-06,
"loss": 7.6702,
"step": 200
},
{
"epoch": 10.39,
"learning_rate": 1.2e-05,
"loss": 6.7042,
"step": 300
},
{
"epoch": 13.85,
"learning_rate": 1.6000000000000003e-05,
"loss": 5.8391,
"step": 400
},
{
"epoch": 17.32,
"learning_rate": 2e-05,
"loss": 5.1775,
"step": 500
},
{
"epoch": 20.78,
"learning_rate": 1.9963963963963965e-05,
"loss": 4.7103,
"step": 600
},
{
"epoch": 24.24,
"learning_rate": 1.992792792792793e-05,
"loss": 4.353,
"step": 700
},
{
"epoch": 27.71,
"learning_rate": 1.9891891891891894e-05,
"loss": 4.04,
"step": 800
},
{
"epoch": 31.17,
"learning_rate": 1.9855855855855857e-05,
"loss": 3.7865,
"step": 900
},
{
"epoch": 34.63,
"learning_rate": 1.981981981981982e-05,
"loss": 3.5376,
"step": 1000
},
{
"epoch": 34.63,
"eval_loss": 3.2091352939605713,
"eval_runtime": 3.6439,
"eval_samples_per_second": 14.27,
"eval_steps_per_second": 1.921,
"step": 1000
},
{
"epoch": 38.1,
"learning_rate": 1.9783783783783786e-05,
"loss": 3.3258,
"step": 1100
},
{
"epoch": 41.56,
"learning_rate": 1.974774774774775e-05,
"loss": 3.1155,
"step": 1200
},
{
"epoch": 45.02,
"learning_rate": 1.9711711711711716e-05,
"loss": 2.9341,
"step": 1300
},
{
"epoch": 48.48,
"learning_rate": 1.967567567567568e-05,
"loss": 2.7419,
"step": 1400
},
{
"epoch": 51.95,
"learning_rate": 1.963963963963964e-05,
"loss": 2.5793,
"step": 1500
},
{
"epoch": 55.41,
"learning_rate": 1.9603603603603604e-05,
"loss": 2.4091,
"step": 1600
},
{
"epoch": 58.87,
"learning_rate": 1.956756756756757e-05,
"loss": 2.2517,
"step": 1700
},
{
"epoch": 62.34,
"learning_rate": 1.9531531531531534e-05,
"loss": 2.0899,
"step": 1800
},
{
"epoch": 65.8,
"learning_rate": 1.9495495495495497e-05,
"loss": 1.9464,
"step": 1900
},
{
"epoch": 69.26,
"learning_rate": 1.9459459459459463e-05,
"loss": 1.803,
"step": 2000
},
{
"epoch": 69.26,
"eval_loss": 1.7681734561920166,
"eval_runtime": 3.5208,
"eval_samples_per_second": 14.769,
"eval_steps_per_second": 1.988,
"step": 2000
},
{
"epoch": 72.73,
"learning_rate": 1.9423423423423423e-05,
"loss": 1.6706,
"step": 2100
},
{
"epoch": 76.19,
"learning_rate": 1.938738738738739e-05,
"loss": 1.5401,
"step": 2200
},
{
"epoch": 79.65,
"learning_rate": 1.9351351351351352e-05,
"loss": 1.4045,
"step": 2300
},
{
"epoch": 83.12,
"learning_rate": 1.931531531531532e-05,
"loss": 1.2934,
"step": 2400
},
{
"epoch": 86.58,
"learning_rate": 1.927927927927928e-05,
"loss": 1.1735,
"step": 2500
},
{
"epoch": 90.04,
"learning_rate": 1.9243243243243244e-05,
"loss": 1.0624,
"step": 2600
},
{
"epoch": 93.51,
"learning_rate": 1.9207207207207207e-05,
"loss": 0.9525,
"step": 2700
},
{
"epoch": 96.97,
"learning_rate": 1.9171171171171174e-05,
"loss": 0.8541,
"step": 2800
},
{
"epoch": 100.43,
"learning_rate": 1.9135135135135137e-05,
"loss": 0.7571,
"step": 2900
},
{
"epoch": 103.9,
"learning_rate": 1.90990990990991e-05,
"loss": 0.6733,
"step": 3000
},
{
"epoch": 103.9,
"eval_loss": 0.9859427213668823,
"eval_runtime": 3.5218,
"eval_samples_per_second": 14.765,
"eval_steps_per_second": 1.988,
"step": 3000
},
{
"epoch": 107.36,
"learning_rate": 1.9063063063063066e-05,
"loss": 0.5883,
"step": 3100
},
{
"epoch": 110.82,
"learning_rate": 1.902702702702703e-05,
"loss": 0.5167,
"step": 3200
},
{
"epoch": 114.29,
"learning_rate": 1.8990990990990992e-05,
"loss": 0.4459,
"step": 3300
},
{
"epoch": 117.75,
"learning_rate": 1.8954954954954955e-05,
"loss": 0.385,
"step": 3400
},
{
"epoch": 121.21,
"learning_rate": 1.891891891891892e-05,
"loss": 0.3311,
"step": 3500
},
{
"epoch": 124.68,
"learning_rate": 1.8882882882882884e-05,
"loss": 0.2853,
"step": 3600
},
{
"epoch": 128.14,
"learning_rate": 1.884684684684685e-05,
"loss": 0.2442,
"step": 3700
},
{
"epoch": 131.6,
"learning_rate": 1.8810810810810813e-05,
"loss": 0.2097,
"step": 3800
},
{
"epoch": 135.06,
"learning_rate": 1.8774774774774776e-05,
"loss": 0.1802,
"step": 3900
},
{
"epoch": 138.53,
"learning_rate": 1.873873873873874e-05,
"loss": 0.1561,
"step": 4000
},
{
"epoch": 138.53,
"eval_loss": 0.8047342300415039,
"eval_runtime": 3.5244,
"eval_samples_per_second": 14.754,
"eval_steps_per_second": 1.986,
"step": 4000
},
{
"epoch": 141.99,
"learning_rate": 1.8702702702702706e-05,
"loss": 0.1359,
"step": 4100
},
{
"epoch": 145.45,
"learning_rate": 1.866666666666667e-05,
"loss": 0.12,
"step": 4200
},
{
"epoch": 148.92,
"learning_rate": 1.863063063063063e-05,
"loss": 0.1066,
"step": 4300
},
{
"epoch": 152.38,
"learning_rate": 1.8594594594594598e-05,
"loss": 0.0952,
"step": 4400
},
{
"epoch": 155.84,
"learning_rate": 1.855855855855856e-05,
"loss": 0.0866,
"step": 4500
},
{
"epoch": 159.31,
"learning_rate": 1.8522522522522524e-05,
"loss": 0.0791,
"step": 4600
},
{
"epoch": 162.77,
"learning_rate": 1.8486486486486487e-05,
"loss": 0.072,
"step": 4700
},
{
"epoch": 166.23,
"learning_rate": 1.8450450450450453e-05,
"loss": 0.0658,
"step": 4800
},
{
"epoch": 169.7,
"learning_rate": 1.8414414414414416e-05,
"loss": 0.0622,
"step": 4900
},
{
"epoch": 173.16,
"learning_rate": 1.8378378378378383e-05,
"loss": 0.058,
"step": 5000
},
{
"epoch": 173.16,
"eval_loss": 0.8171238303184509,
"eval_runtime": 3.5228,
"eval_samples_per_second": 14.761,
"eval_steps_per_second": 1.987,
"step": 5000
},
{
"epoch": 176.62,
"learning_rate": 1.8342342342342342e-05,
"loss": 0.0531,
"step": 5100
},
{
"epoch": 180.09,
"learning_rate": 1.830630630630631e-05,
"loss": 0.0504,
"step": 5200
},
{
"epoch": 183.55,
"learning_rate": 1.827027027027027e-05,
"loss": 0.046,
"step": 5300
},
{
"epoch": 187.01,
"learning_rate": 1.8234234234234234e-05,
"loss": 0.0447,
"step": 5400
},
{
"epoch": 190.48,
"learning_rate": 1.81981981981982e-05,
"loss": 0.0543,
"step": 5500
},
{
"epoch": 193.94,
"learning_rate": 1.8162162162162164e-05,
"loss": 0.0492,
"step": 5600
},
{
"epoch": 197.4,
"learning_rate": 1.8126126126126127e-05,
"loss": 0.0438,
"step": 5700
},
{
"epoch": 200.87,
"learning_rate": 1.809009009009009e-05,
"loss": 0.0547,
"step": 5800
},
{
"epoch": 204.33,
"learning_rate": 1.8054054054054056e-05,
"loss": 0.0615,
"step": 5900
},
{
"epoch": 207.79,
"learning_rate": 1.801801801801802e-05,
"loss": 0.072,
"step": 6000
},
{
"epoch": 207.79,
"eval_loss": 0.8289902210235596,
"eval_runtime": 3.5216,
"eval_samples_per_second": 14.766,
"eval_steps_per_second": 1.988,
"step": 6000
},
{
"epoch": 211.26,
"learning_rate": 1.7981981981981985e-05,
"loss": 0.1157,
"step": 6100
},
{
"epoch": 214.72,
"learning_rate": 1.7945945945945948e-05,
"loss": 0.0869,
"step": 6200
},
{
"epoch": 218.18,
"learning_rate": 1.790990990990991e-05,
"loss": 1.0166,
"step": 6300
},
{
"epoch": 221.65,
"learning_rate": 1.7873873873873874e-05,
"loss": 0.0771,
"step": 6400
},
{
"epoch": 225.11,
"learning_rate": 1.783783783783784e-05,
"loss": 0.0953,
"step": 6500
},
{
"epoch": 228.57,
"learning_rate": 1.7801801801801804e-05,
"loss": 0.6189,
"step": 6600
},
{
"epoch": 232.03,
"learning_rate": 1.7765765765765767e-05,
"loss": 0.5593,
"step": 6700
},
{
"epoch": 235.5,
"learning_rate": 1.7729729729729733e-05,
"loss": 0.376,
"step": 6800
},
{
"epoch": 238.96,
"learning_rate": 1.7693693693693696e-05,
"loss": 0.4129,
"step": 6900
},
{
"epoch": 242.42,
"learning_rate": 1.765765765765766e-05,
"loss": 2.2984,
"step": 7000
},
{
"epoch": 242.42,
"eval_loss": 4.4349541664123535,
"eval_runtime": 3.5205,
"eval_samples_per_second": 14.77,
"eval_steps_per_second": 1.988,
"step": 7000
},
{
"epoch": 245.89,
"learning_rate": 1.7621621621621622e-05,
"loss": 3.4028,
"step": 7100
},
{
"epoch": 249.35,
"learning_rate": 1.7585585585585588e-05,
"loss": 0.7196,
"step": 7200
},
{
"epoch": 252.81,
"learning_rate": 1.754954954954955e-05,
"loss": 1.162,
"step": 7300
},
{
"epoch": 256.28,
"learning_rate": 1.7513513513513517e-05,
"loss": 0.7413,
"step": 7400
},
{
"epoch": 259.74,
"learning_rate": 1.7477477477477477e-05,
"loss": 1.1918,
"step": 7500
},
{
"epoch": 263.2,
"learning_rate": 1.7441441441441443e-05,
"loss": 0.8564,
"step": 7600
},
{
"epoch": 266.67,
"learning_rate": 1.7405405405405406e-05,
"loss": 0.2815,
"step": 7700
},
{
"epoch": 270.13,
"learning_rate": 1.7369369369369373e-05,
"loss": 0.5848,
"step": 7800
},
{
"epoch": 273.59,
"learning_rate": 1.7333333333333336e-05,
"loss": 0.6489,
"step": 7900
},
{
"epoch": 277.06,
"learning_rate": 1.72972972972973e-05,
"loss": 1.0025,
"step": 8000
},
{
"epoch": 277.06,
"eval_loss": 1.2763237953186035,
"eval_runtime": 3.5102,
"eval_samples_per_second": 14.814,
"eval_steps_per_second": 1.994,
"step": 8000
},
{
"epoch": 280.52,
"learning_rate": 1.726126126126126e-05,
"loss": 0.7947,
"step": 8100
},
{
"epoch": 283.98,
"learning_rate": 1.7225225225225225e-05,
"loss": 0.558,
"step": 8200
},
{
"epoch": 287.45,
"learning_rate": 1.718918918918919e-05,
"loss": 0.6356,
"step": 8300
},
{
"epoch": 290.91,
"learning_rate": 1.7153153153153154e-05,
"loss": 0.5268,
"step": 8400
},
{
"epoch": 294.37,
"learning_rate": 1.711711711711712e-05,
"loss": 0.2633,
"step": 8500
},
{
"epoch": 297.84,
"learning_rate": 1.7081081081081083e-05,
"loss": 0.2457,
"step": 8600
},
{
"epoch": 301.3,
"learning_rate": 1.7045045045045046e-05,
"loss": 0.5308,
"step": 8700
},
{
"epoch": 304.76,
"learning_rate": 1.700900900900901e-05,
"loss": 0.369,
"step": 8800
},
{
"epoch": 308.23,
"learning_rate": 1.6972972972972975e-05,
"loss": 0.3203,
"step": 8900
},
{
"epoch": 311.69,
"learning_rate": 1.693693693693694e-05,
"loss": 2.5307,
"step": 9000
},
{
"epoch": 311.69,
"eval_loss": 1.3849806785583496,
"eval_runtime": 3.5124,
"eval_samples_per_second": 14.805,
"eval_steps_per_second": 1.993,
"step": 9000
},
{
"epoch": 39.39,
"learning_rate": 1.96273022751896e-05,
"loss": 3.0696,
"step": 9100
},
{
"epoch": 39.83,
"learning_rate": 1.962296858071506e-05,
"loss": 3.0068,
"step": 9200
},
{
"epoch": 40.26,
"learning_rate": 1.9618634886240522e-05,
"loss": 2.7896,
"step": 9300
},
{
"epoch": 40.69,
"learning_rate": 1.9614301191765985e-05,
"loss": 2.5042,
"step": 9400
},
{
"epoch": 41.13,
"learning_rate": 1.960996749729144e-05,
"loss": 2.8704,
"step": 9500
},
{
"epoch": 41.56,
"learning_rate": 1.9605633802816904e-05,
"loss": 3.4878,
"step": 9600
},
{
"epoch": 41.99,
"learning_rate": 1.9601300108342363e-05,
"loss": 3.0682,
"step": 9700
},
{
"epoch": 42.42,
"learning_rate": 1.9596966413867822e-05,
"loss": 2.9751,
"step": 9800
},
{
"epoch": 42.86,
"learning_rate": 1.9592632719393285e-05,
"loss": 3.3576,
"step": 9900
},
{
"epoch": 43.29,
"learning_rate": 1.9588299024918744e-05,
"loss": 2.9478,
"step": 10000
},
{
"epoch": 43.29,
"eval_loss": 1.7224024534225464,
"eval_runtime": 3.6186,
"eval_samples_per_second": 14.37,
"eval_steps_per_second": 1.934,
"step": 10000
},
{
"epoch": 47.62,
"learning_rate": 1.954496208017335e-05,
"loss": 2.4401,
"step": 11000
},
{
"epoch": 47.62,
"eval_loss": 1.6094621419906616,
"eval_runtime": 3.6227,
"eval_samples_per_second": 14.354,
"eval_steps_per_second": 1.932,
"step": 11000
},
{
"epoch": 51.95,
"learning_rate": 1.9501625135427952e-05,
"loss": 2.3021,
"step": 12000
},
{
"epoch": 51.95,
"eval_loss": 1.9848077297210693,
"eval_runtime": 3.511,
"eval_samples_per_second": 14.81,
"eval_steps_per_second": 1.994,
"step": 12000
},
{
"epoch": 56.28,
"learning_rate": 1.945828819068256e-05,
"loss": 1.8831,
"step": 13000
},
{
"epoch": 56.28,
"eval_loss": 0.5190821290016174,
"eval_runtime": 3.5109,
"eval_samples_per_second": 14.811,
"eval_steps_per_second": 1.994,
"step": 13000
},
{
"epoch": 60.61,
"learning_rate": 1.9414951245937164e-05,
"loss": 1.1329,
"step": 14000
},
{
"epoch": 60.61,
"eval_loss": 0.9506992101669312,
"eval_runtime": 3.511,
"eval_samples_per_second": 14.81,
"eval_steps_per_second": 1.994,
"step": 14000
},
{
"epoch": 64.94,
"learning_rate": 1.9371614301191768e-05,
"loss": 1.8788,
"step": 15000
},
{
"epoch": 64.94,
"eval_loss": 1.937408685684204,
"eval_runtime": 3.5081,
"eval_samples_per_second": 14.823,
"eval_steps_per_second": 1.995,
"step": 15000
},
{
"epoch": 69.26,
"learning_rate": 1.932827735644637e-05,
"loss": 1.6736,
"step": 16000
},
{
"epoch": 69.26,
"eval_loss": 0.5699201226234436,
"eval_runtime": 3.5113,
"eval_samples_per_second": 14.809,
"eval_steps_per_second": 1.994,
"step": 16000
},
{
"epoch": 73.59,
"learning_rate": 1.9284940411700976e-05,
"loss": 0.5165,
"step": 17000
},
{
"epoch": 73.59,
"eval_loss": 0.4182128310203552,
"eval_runtime": 3.5129,
"eval_samples_per_second": 14.803,
"eval_steps_per_second": 1.993,
"step": 17000
},
{
"epoch": 77.92,
"learning_rate": 1.924160346695558e-05,
"loss": 0.4656,
"step": 18000
},
{
"epoch": 77.92,
"eval_loss": 0.4120073914527893,
"eval_runtime": 3.5127,
"eval_samples_per_second": 14.803,
"eval_steps_per_second": 1.993,
"step": 18000
},
{
"epoch": 82.25,
"learning_rate": 1.9198266522210184e-05,
"loss": 0.6133,
"step": 19000
},
{
"epoch": 82.25,
"eval_loss": 0.4980267286300659,
"eval_runtime": 3.5108,
"eval_samples_per_second": 14.811,
"eval_steps_per_second": 1.994,
"step": 19000
},
{
"epoch": 86.58,
"learning_rate": 1.9154929577464788e-05,
"loss": 0.8087,
"step": 20000
},
{
"epoch": 86.58,
"eval_loss": 0.5801683068275452,
"eval_runtime": 3.5099,
"eval_samples_per_second": 14.815,
"eval_steps_per_second": 1.994,
"step": 20000
},
{
"epoch": 90.91,
"learning_rate": 1.9111592632719395e-05,
"loss": 2.2068,
"step": 21000
},
{
"epoch": 90.91,
"eval_loss": 0.7701263427734375,
"eval_runtime": 3.5112,
"eval_samples_per_second": 14.81,
"eval_steps_per_second": 1.994,
"step": 21000
},
{
"epoch": 95.24,
"learning_rate": 1.9068255687974e-05,
"loss": 1.0182,
"step": 22000
},
{
"epoch": 95.24,
"eval_loss": 0.42168232798576355,
"eval_runtime": 3.5098,
"eval_samples_per_second": 14.816,
"eval_steps_per_second": 1.994,
"step": 22000
},
{
"epoch": 99.57,
"learning_rate": 1.9024918743228603e-05,
"loss": 0.3515,
"step": 23000
},
{
"epoch": 99.57,
"eval_loss": 0.2897047996520996,
"eval_runtime": 3.5082,
"eval_samples_per_second": 14.822,
"eval_steps_per_second": 1.995,
"step": 23000
},
{
"epoch": 103.9,
"learning_rate": 1.8981581798483207e-05,
"loss": 1.007,
"step": 24000
},
{
"epoch": 103.9,
"eval_loss": 0.28924015164375305,
"eval_runtime": 3.5076,
"eval_samples_per_second": 14.825,
"eval_steps_per_second": 1.996,
"step": 24000
},
{
"epoch": 108.23,
"learning_rate": 1.8938244853737814e-05,
"loss": 0.1892,
"step": 25000
},
{
"epoch": 108.23,
"eval_loss": 0.28431499004364014,
"eval_runtime": 3.5124,
"eval_samples_per_second": 14.805,
"eval_steps_per_second": 1.993,
"step": 25000
},
{
"epoch": 112.55,
"learning_rate": 1.8894907908992418e-05,
"loss": 0.2349,
"step": 26000
},
{
"epoch": 112.55,
"eval_loss": 0.2943420112133026,
"eval_runtime": 3.5082,
"eval_samples_per_second": 14.822,
"eval_steps_per_second": 1.995,
"step": 26000
},
{
"epoch": 116.88,
"learning_rate": 1.8851570964247022e-05,
"loss": 0.1959,
"step": 27000
},
{
"epoch": 116.88,
"eval_loss": 0.2937524616718292,
"eval_runtime": 3.5084,
"eval_samples_per_second": 14.822,
"eval_steps_per_second": 1.995,
"step": 27000
},
{
"epoch": 121.21,
"learning_rate": 1.8808234019501626e-05,
"loss": 0.5489,
"step": 28000
},
{
"epoch": 121.21,
"eval_loss": 0.3693106770515442,
"eval_runtime": 3.5038,
"eval_samples_per_second": 14.841,
"eval_steps_per_second": 1.998,
"step": 28000
},
{
"epoch": 125.54,
"learning_rate": 1.8764897074756233e-05,
"loss": 0.1798,
"step": 29000
},
{
"epoch": 125.54,
"eval_loss": 0.2986227571964264,
"eval_runtime": 3.5089,
"eval_samples_per_second": 14.819,
"eval_steps_per_second": 1.995,
"step": 29000
},
{
"epoch": 129.87,
"learning_rate": 1.8721560130010837e-05,
"loss": 0.1638,
"step": 30000
},
{
"epoch": 129.87,
"eval_loss": 0.3518519103527069,
"eval_runtime": 3.5068,
"eval_samples_per_second": 14.828,
"eval_steps_per_second": 1.996,
"step": 30000
},
{
"epoch": 134.2,
"learning_rate": 1.867822318526544e-05,
"loss": 0.3161,
"step": 31000
},
{
"epoch": 134.2,
"eval_loss": 0.37139639258384705,
"eval_runtime": 3.5102,
"eval_samples_per_second": 14.814,
"eval_steps_per_second": 1.994,
"step": 31000
},
{
"epoch": 138.53,
"learning_rate": 1.8634886240520045e-05,
"loss": 0.4443,
"step": 32000
},
{
"epoch": 138.53,
"eval_loss": 0.4150441288948059,
"eval_runtime": 3.5081,
"eval_samples_per_second": 14.823,
"eval_steps_per_second": 1.995,
"step": 32000
},
{
"epoch": 142.86,
"learning_rate": 1.859154929577465e-05,
"loss": 0.6043,
"step": 33000
},
{
"epoch": 142.86,
"eval_loss": 0.6062866449356079,
"eval_runtime": 3.5067,
"eval_samples_per_second": 14.829,
"eval_steps_per_second": 1.996,
"step": 33000
},
{
"epoch": 147.19,
"learning_rate": 1.8548212351029253e-05,
"loss": 1.0402,
"step": 34000
},
{
"epoch": 147.19,
"eval_loss": 0.5321042537689209,
"eval_runtime": 3.6131,
"eval_samples_per_second": 14.392,
"eval_steps_per_second": 1.937,
"step": 34000
},
{
"epoch": 151.52,
"learning_rate": 1.8504875406283857e-05,
"loss": 0.8064,
"step": 35000
},
{
"epoch": 151.52,
"eval_loss": 0.5623323917388916,
"eval_runtime": 3.5113,
"eval_samples_per_second": 14.809,
"eval_steps_per_second": 1.994,
"step": 35000
},
{
"epoch": 155.84,
"learning_rate": 1.8461538461538465e-05,
"loss": 1.0081,
"step": 36000
},
{
"epoch": 155.84,
"eval_loss": 0.8560149669647217,
"eval_runtime": 3.5137,
"eval_samples_per_second": 14.799,
"eval_steps_per_second": 1.992,
"step": 36000
},
{
"epoch": 160.17,
"learning_rate": 1.841820151679307e-05,
"loss": 1.4319,
"step": 37000
},
{
"epoch": 160.17,
"eval_loss": 0.7755089998245239,
"eval_runtime": 3.5088,
"eval_samples_per_second": 14.82,
"eval_steps_per_second": 1.995,
"step": 37000
},
{
"epoch": 164.5,
"learning_rate": 1.8374864572047673e-05,
"loss": 1.5845,
"step": 38000
},
{
"epoch": 164.5,
"eval_loss": 0.8413295745849609,
"eval_runtime": 3.5072,
"eval_samples_per_second": 14.827,
"eval_steps_per_second": 1.996,
"step": 38000
},
{
"epoch": 168.83,
"learning_rate": 1.8331527627302277e-05,
"loss": 1.1751,
"step": 39000
},
{
"epoch": 168.83,
"eval_loss": 1.2155665159225464,
"eval_runtime": 3.5106,
"eval_samples_per_second": 14.812,
"eval_steps_per_second": 1.994,
"step": 39000
}
],
"max_steps": 462000,
"num_train_epochs": 2000,
"total_flos": 1.06376689483776e+17,
"trial_name": null,
"trial_params": null
}