japanese-mistral-300m-instruction / trainer_state.json
ce-lery's picture
feat: fine-tuning japanese-mistral-300M-base with databricks-dolly-15k-ja
025d0db
{
"best_metric": 2.513946056365967,
"best_model_checkpoint": "checkpoints-finetuning/checkpoint-1080",
"epoch": 193.14128943758573,
"eval_steps": 40,
"global_step": 2200,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 3.51,
"learning_rate": 2.0000000000000002e-07,
"loss": 3.595,
"step": 40
},
{
"epoch": 3.51,
"eval_loss": 3.5299072265625,
"eval_runtime": 5.0148,
"eval_samples_per_second": 62.416,
"eval_steps_per_second": 15.753,
"step": 40
},
{
"epoch": 7.02,
"learning_rate": 4.0000000000000003e-07,
"loss": 3.4769,
"step": 80
},
{
"epoch": 7.02,
"eval_loss": 3.3721721172332764,
"eval_runtime": 4.4435,
"eval_samples_per_second": 70.441,
"eval_steps_per_second": 17.779,
"step": 80
},
{
"epoch": 10.53,
"learning_rate": 6.000000000000001e-07,
"loss": 3.3037,
"step": 120
},
{
"epoch": 10.53,
"eval_loss": 3.1870808601379395,
"eval_runtime": 4.6407,
"eval_samples_per_second": 67.446,
"eval_steps_per_second": 17.023,
"step": 120
},
{
"epoch": 14.05,
"learning_rate": 8.000000000000001e-07,
"loss": 3.1255,
"step": 160
},
{
"epoch": 14.05,
"eval_loss": 3.0087945461273193,
"eval_runtime": 4.7026,
"eval_samples_per_second": 66.559,
"eval_steps_per_second": 16.799,
"step": 160
},
{
"epoch": 17.56,
"learning_rate": 1.0000000000000002e-06,
"loss": 2.9615,
"step": 200
},
{
"epoch": 17.56,
"eval_loss": 2.8684051036834717,
"eval_runtime": 4.6401,
"eval_samples_per_second": 67.455,
"eval_steps_per_second": 17.026,
"step": 200
},
{
"epoch": 21.07,
"learning_rate": 1.2000000000000002e-06,
"loss": 2.8468,
"step": 240
},
{
"epoch": 21.07,
"eval_loss": 2.780834436416626,
"eval_runtime": 4.4221,
"eval_samples_per_second": 70.78,
"eval_steps_per_second": 17.865,
"step": 240
},
{
"epoch": 24.58,
"learning_rate": 1.4000000000000001e-06,
"loss": 2.7699,
"step": 280
},
{
"epoch": 24.58,
"eval_loss": 2.720453977584839,
"eval_runtime": 4.5663,
"eval_samples_per_second": 68.546,
"eval_steps_per_second": 17.301,
"step": 280
},
{
"epoch": 28.09,
"learning_rate": 1.6000000000000001e-06,
"loss": 2.7139,
"step": 320
},
{
"epoch": 28.09,
"eval_loss": 2.679349422454834,
"eval_runtime": 4.7784,
"eval_samples_per_second": 65.504,
"eval_steps_per_second": 16.533,
"step": 320
},
{
"epoch": 31.6,
"learning_rate": 1.8000000000000001e-06,
"loss": 2.6712,
"step": 360
},
{
"epoch": 31.6,
"eval_loss": 2.650853395462036,
"eval_runtime": 4.6713,
"eval_samples_per_second": 67.005,
"eval_steps_per_second": 16.912,
"step": 360
},
{
"epoch": 35.12,
"learning_rate": 2.0000000000000003e-06,
"loss": 2.6356,
"step": 400
},
{
"epoch": 35.12,
"eval_loss": 2.6293749809265137,
"eval_runtime": 4.6364,
"eval_samples_per_second": 67.51,
"eval_steps_per_second": 17.039,
"step": 400
},
{
"epoch": 38.63,
"learning_rate": 2.2e-06,
"loss": 2.6048,
"step": 440
},
{
"epoch": 38.63,
"eval_loss": 2.611950635910034,
"eval_runtime": 4.5116,
"eval_samples_per_second": 69.377,
"eval_steps_per_second": 17.511,
"step": 440
},
{
"epoch": 42.14,
"learning_rate": 2.4000000000000003e-06,
"loss": 2.5823,
"step": 480
},
{
"epoch": 42.14,
"eval_loss": 2.597449541091919,
"eval_runtime": 4.5213,
"eval_samples_per_second": 69.228,
"eval_steps_per_second": 17.473,
"step": 480
},
{
"epoch": 45.65,
"learning_rate": 2.6e-06,
"loss": 2.5536,
"step": 520
},
{
"epoch": 45.65,
"eval_loss": 2.5848779678344727,
"eval_runtime": 4.4398,
"eval_samples_per_second": 70.498,
"eval_steps_per_second": 17.793,
"step": 520
},
{
"epoch": 49.16,
"learning_rate": 2.8000000000000003e-06,
"loss": 2.5293,
"step": 560
},
{
"epoch": 49.16,
"eval_loss": 2.574049472808838,
"eval_runtime": 4.6572,
"eval_samples_per_second": 67.208,
"eval_steps_per_second": 16.963,
"step": 560
},
{
"epoch": 52.67,
"learning_rate": 3e-06,
"loss": 2.5058,
"step": 600
},
{
"epoch": 52.67,
"eval_loss": 2.5643808841705322,
"eval_runtime": 4.6849,
"eval_samples_per_second": 66.81,
"eval_steps_per_second": 16.863,
"step": 600
},
{
"epoch": 56.19,
"learning_rate": 3.2000000000000003e-06,
"loss": 2.482,
"step": 640
},
{
"epoch": 56.19,
"eval_loss": 2.555607557296753,
"eval_runtime": 4.491,
"eval_samples_per_second": 69.695,
"eval_steps_per_second": 17.591,
"step": 640
},
{
"epoch": 59.7,
"learning_rate": 3.4000000000000005e-06,
"loss": 2.4575,
"step": 680
},
{
"epoch": 59.7,
"eval_loss": 2.547734260559082,
"eval_runtime": 4.6182,
"eval_samples_per_second": 67.776,
"eval_steps_per_second": 17.106,
"step": 680
},
{
"epoch": 63.21,
"learning_rate": 3.6000000000000003e-06,
"loss": 2.4339,
"step": 720
},
{
"epoch": 63.21,
"eval_loss": 2.5405359268188477,
"eval_runtime": 4.5137,
"eval_samples_per_second": 69.345,
"eval_steps_per_second": 17.502,
"step": 720
},
{
"epoch": 66.72,
"learning_rate": 3.8000000000000005e-06,
"loss": 2.4073,
"step": 760
},
{
"epoch": 66.72,
"eval_loss": 2.5350451469421387,
"eval_runtime": 4.6034,
"eval_samples_per_second": 67.993,
"eval_steps_per_second": 17.161,
"step": 760
},
{
"epoch": 70.23,
"learning_rate": 4.000000000000001e-06,
"loss": 2.3845,
"step": 800
},
{
"epoch": 70.23,
"eval_loss": 2.530299186706543,
"eval_runtime": 4.6325,
"eval_samples_per_second": 67.566,
"eval_steps_per_second": 17.053,
"step": 800
},
{
"epoch": 73.74,
"learning_rate": 4.2000000000000004e-06,
"loss": 2.3606,
"step": 840
},
{
"epoch": 73.74,
"eval_loss": 2.525312662124634,
"eval_runtime": 4.4668,
"eval_samples_per_second": 70.072,
"eval_steps_per_second": 17.686,
"step": 840
},
{
"epoch": 77.26,
"learning_rate": 4.4e-06,
"loss": 2.329,
"step": 880
},
{
"epoch": 77.26,
"eval_loss": 2.5215225219726562,
"eval_runtime": 4.4699,
"eval_samples_per_second": 70.023,
"eval_steps_per_second": 17.674,
"step": 880
},
{
"epoch": 80.77,
"learning_rate": 4.600000000000001e-06,
"loss": 2.3071,
"step": 920
},
{
"epoch": 80.77,
"eval_loss": 2.5184576511383057,
"eval_runtime": 4.3807,
"eval_samples_per_second": 71.45,
"eval_steps_per_second": 18.034,
"step": 920
},
{
"epoch": 84.28,
"learning_rate": 4.800000000000001e-06,
"loss": 2.2768,
"step": 960
},
{
"epoch": 84.28,
"eval_loss": 2.515460729598999,
"eval_runtime": 4.6634,
"eval_samples_per_second": 67.119,
"eval_steps_per_second": 16.941,
"step": 960
},
{
"epoch": 87.79,
"learning_rate": 5e-06,
"loss": 2.2479,
"step": 1000
},
{
"epoch": 87.79,
"eval_loss": 2.514392852783203,
"eval_runtime": 4.5583,
"eval_samples_per_second": 68.665,
"eval_steps_per_second": 17.331,
"step": 1000
},
{
"epoch": 91.3,
"learning_rate": 4.986304738420684e-06,
"loss": 2.2181,
"step": 1040
},
{
"epoch": 91.3,
"eval_loss": 2.515076160430908,
"eval_runtime": 4.6324,
"eval_samples_per_second": 67.568,
"eval_steps_per_second": 17.054,
"step": 1040
},
{
"epoch": 94.81,
"learning_rate": 4.9453690018345144e-06,
"loss": 2.1901,
"step": 1080
},
{
"epoch": 94.81,
"eval_loss": 2.513946056365967,
"eval_runtime": 4.635,
"eval_samples_per_second": 67.53,
"eval_steps_per_second": 17.044,
"step": 1080
},
{
"epoch": 98.33,
"learning_rate": 4.8776412907378845e-06,
"loss": 2.1571,
"step": 1120
},
{
"epoch": 98.33,
"eval_loss": 2.514775037765503,
"eval_runtime": 4.7132,
"eval_samples_per_second": 66.41,
"eval_steps_per_second": 16.762,
"step": 1120
},
{
"epoch": 101.84,
"learning_rate": 4.783863644106502e-06,
"loss": 2.1308,
"step": 1160
},
{
"epoch": 101.84,
"eval_loss": 2.5165762901306152,
"eval_runtime": 4.6347,
"eval_samples_per_second": 67.535,
"eval_steps_per_second": 17.046,
"step": 1160
},
{
"epoch": 105.35,
"learning_rate": 4.665063509461098e-06,
"loss": 2.1032,
"step": 1200
},
{
"epoch": 105.35,
"eval_loss": 2.5192971229553223,
"eval_runtime": 4.6292,
"eval_samples_per_second": 67.614,
"eval_steps_per_second": 17.066,
"step": 1200
},
{
"epoch": 108.86,
"learning_rate": 4.522542485937369e-06,
"loss": 2.0761,
"step": 1240
},
{
"epoch": 108.86,
"eval_loss": 2.5203866958618164,
"eval_runtime": 4.6638,
"eval_samples_per_second": 67.113,
"eval_steps_per_second": 16.939,
"step": 1240
},
{
"epoch": 112.37,
"learning_rate": 4.357862063693486e-06,
"loss": 2.0495,
"step": 1280
},
{
"epoch": 112.37,
"eval_loss": 2.5268709659576416,
"eval_runtime": 4.6504,
"eval_samples_per_second": 67.306,
"eval_steps_per_second": 16.988,
"step": 1280
},
{
"epoch": 115.88,
"learning_rate": 4.172826515897146e-06,
"loss": 2.0231,
"step": 1320
},
{
"epoch": 115.88,
"eval_loss": 2.5284526348114014,
"eval_runtime": 4.6029,
"eval_samples_per_second": 68.0,
"eval_steps_per_second": 17.163,
"step": 1320
},
{
"epoch": 119.4,
"learning_rate": 3.969463130731183e-06,
"loss": 2.0021,
"step": 1360
},
{
"epoch": 119.4,
"eval_loss": 2.5327632427215576,
"eval_runtime": 4.7118,
"eval_samples_per_second": 66.429,
"eval_steps_per_second": 16.767,
"step": 1360
},
{
"epoch": 122.91,
"learning_rate": 3.7500000000000005e-06,
"loss": 1.9793,
"step": 1400
},
{
"epoch": 122.91,
"eval_loss": 2.5382816791534424,
"eval_runtime": 4.6299,
"eval_samples_per_second": 67.603,
"eval_steps_per_second": 17.063,
"step": 1400
},
{
"epoch": 126.42,
"learning_rate": 3.516841607689501e-06,
"loss": 1.9575,
"step": 1440
},
{
"epoch": 126.42,
"eval_loss": 2.5441536903381348,
"eval_runtime": 4.6442,
"eval_samples_per_second": 67.396,
"eval_steps_per_second": 17.01,
"step": 1440
},
{
"epoch": 129.93,
"learning_rate": 3.272542485937369e-06,
"loss": 1.9368,
"step": 1480
},
{
"epoch": 129.93,
"eval_loss": 2.5487852096557617,
"eval_runtime": 4.6396,
"eval_samples_per_second": 67.462,
"eval_steps_per_second": 17.027,
"step": 1480
},
{
"epoch": 133.44,
"learning_rate": 3.019779227044398e-06,
"loss": 1.9216,
"step": 1520
},
{
"epoch": 133.44,
"eval_loss": 2.5533745288848877,
"eval_runtime": 4.6038,
"eval_samples_per_second": 67.987,
"eval_steps_per_second": 17.16,
"step": 1520
},
{
"epoch": 136.95,
"learning_rate": 2.761321158169134e-06,
"loss": 1.902,
"step": 1560
},
{
"epoch": 136.95,
"eval_loss": 2.558429479598999,
"eval_runtime": 4.605,
"eval_samples_per_second": 67.969,
"eval_steps_per_second": 17.155,
"step": 1560
},
{
"epoch": 140.47,
"learning_rate": 2.5e-06,
"loss": 1.8885,
"step": 1600
},
{
"epoch": 140.47,
"eval_loss": 2.560931444168091,
"eval_runtime": 4.6137,
"eval_samples_per_second": 67.842,
"eval_steps_per_second": 17.123,
"step": 1600
},
{
"epoch": 143.98,
"learning_rate": 2.238678841830867e-06,
"loss": 1.8728,
"step": 1640
},
{
"epoch": 143.98,
"eval_loss": 2.565746307373047,
"eval_runtime": 4.6085,
"eval_samples_per_second": 67.918,
"eval_steps_per_second": 17.142,
"step": 1640
},
{
"epoch": 147.49,
"learning_rate": 1.9802207729556023e-06,
"loss": 1.8605,
"step": 1680
},
{
"epoch": 147.49,
"eval_loss": 2.569748640060425,
"eval_runtime": 4.6652,
"eval_samples_per_second": 67.092,
"eval_steps_per_second": 16.934,
"step": 1680
},
{
"epoch": 151.0,
"learning_rate": 1.7274575140626318e-06,
"loss": 1.8476,
"step": 1720
},
{
"epoch": 151.0,
"eval_loss": 2.5741446018218994,
"eval_runtime": 4.7429,
"eval_samples_per_second": 65.994,
"eval_steps_per_second": 16.657,
"step": 1720
},
{
"epoch": 154.51,
"learning_rate": 1.4831583923105e-06,
"loss": 1.8402,
"step": 1760
},
{
"epoch": 154.51,
"eval_loss": 2.5770394802093506,
"eval_runtime": 4.6184,
"eval_samples_per_second": 67.772,
"eval_steps_per_second": 17.105,
"step": 1760
},
{
"epoch": 158.02,
"learning_rate": 1.2500000000000007e-06,
"loss": 1.8274,
"step": 1800
},
{
"epoch": 158.02,
"eval_loss": 2.580260992050171,
"eval_runtime": 4.5687,
"eval_samples_per_second": 68.509,
"eval_steps_per_second": 17.291,
"step": 1800
},
{
"epoch": 161.54,
"learning_rate": 1.0305368692688175e-06,
"loss": 1.8218,
"step": 1840
},
{
"epoch": 161.54,
"eval_loss": 2.582859992980957,
"eval_runtime": 4.6266,
"eval_samples_per_second": 67.653,
"eval_steps_per_second": 17.075,
"step": 1840
},
{
"epoch": 165.05,
"learning_rate": 8.271734841028553e-07,
"loss": 1.8144,
"step": 1880
},
{
"epoch": 165.05,
"eval_loss": 2.5846669673919678,
"eval_runtime": 4.601,
"eval_samples_per_second": 68.029,
"eval_steps_per_second": 17.17,
"step": 1880
},
{
"epoch": 168.56,
"learning_rate": 6.421379363065142e-07,
"loss": 1.8097,
"step": 1920
},
{
"epoch": 168.56,
"eval_loss": 2.5867464542388916,
"eval_runtime": 4.593,
"eval_samples_per_second": 68.148,
"eval_steps_per_second": 17.2,
"step": 1920
},
{
"epoch": 172.07,
"learning_rate": 4.774575140626317e-07,
"loss": 1.8076,
"step": 1960
},
{
"epoch": 172.07,
"eval_loss": 2.5882575511932373,
"eval_runtime": 4.601,
"eval_samples_per_second": 68.028,
"eval_steps_per_second": 17.17,
"step": 1960
},
{
"epoch": 175.58,
"learning_rate": 3.3493649053890325e-07,
"loss": 1.8014,
"step": 2000
},
{
"epoch": 175.58,
"eval_loss": 2.589245080947876,
"eval_runtime": 4.5976,
"eval_samples_per_second": 68.079,
"eval_steps_per_second": 17.183,
"step": 2000
},
{
"epoch": 179.09,
"learning_rate": 2.1613635589349756e-07,
"loss": 1.8001,
"step": 2040
},
{
"epoch": 179.09,
"eval_loss": 2.589866876602173,
"eval_runtime": 4.5824,
"eval_samples_per_second": 68.305,
"eval_steps_per_second": 17.24,
"step": 2040
},
{
"epoch": 182.61,
"learning_rate": 1.223587092621162e-07,
"loss": 1.7987,
"step": 2080
},
{
"epoch": 182.61,
"eval_loss": 2.5903093814849854,
"eval_runtime": 4.6146,
"eval_samples_per_second": 67.829,
"eval_steps_per_second": 17.12,
"step": 2080
},
{
"epoch": 186.12,
"learning_rate": 5.463099816548578e-08,
"loss": 1.7971,
"step": 2120
},
{
"epoch": 186.12,
"eval_loss": 2.590583562850952,
"eval_runtime": 4.609,
"eval_samples_per_second": 67.911,
"eval_steps_per_second": 17.141,
"step": 2120
},
{
"epoch": 189.63,
"learning_rate": 1.3695261579316776e-08,
"loss": 1.7979,
"step": 2160
},
{
"epoch": 189.63,
"eval_loss": 2.5907208919525146,
"eval_runtime": 4.6125,
"eval_samples_per_second": 67.859,
"eval_steps_per_second": 17.127,
"step": 2160
},
{
"epoch": 193.14,
"learning_rate": 0.0,
"loss": 1.7975,
"step": 2200
},
{
"epoch": 193.14,
"eval_loss": 2.590698719024658,
"eval_runtime": 4.6213,
"eval_samples_per_second": 67.729,
"eval_steps_per_second": 17.095,
"step": 2200
},
{
"epoch": 193.14,
"step": 2200,
"total_flos": 1.0517861659312128e+18,
"train_loss": 2.2616969472711737,
"train_runtime": 20093.6832,
"train_samples_per_second": 29.024,
"train_steps_per_second": 0.109
}
],
"logging_steps": 40,
"max_steps": 2200,
"num_train_epochs": 200,
"save_steps": 40,
"total_flos": 1.0517861659312128e+18,
"trial_name": null,
"trial_params": null
}