whisper-large-v2-lora-cantonese / trainer_state.json
Oblivion208's picture
Upload folder using huggingface_hub
6683fe2
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 5.1165,
"eval_steps": 200,
"global_step": 10000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0,
"learning_rate": 5e-05,
"loss": 2.492,
"step": 25
},
{
"epoch": 0.01,
"learning_rate": 0.0001,
"loss": 1.5737,
"step": 50
},
{
"epoch": 0.01,
"learning_rate": 0.00015,
"loss": 0.8096,
"step": 75
},
{
"epoch": 0.01,
"learning_rate": 0.0002,
"loss": 0.1818,
"step": 100
},
{
"epoch": 0.01,
"learning_rate": 0.00025,
"loss": 0.1383,
"step": 125
},
{
"epoch": 0.01,
"learning_rate": 0.0003,
"loss": 0.1267,
"step": 150
},
{
"epoch": 0.02,
"learning_rate": 0.00035,
"loss": 0.1119,
"step": 175
},
{
"epoch": 0.02,
"learning_rate": 0.0004,
"loss": 0.1193,
"step": 200
},
{
"epoch": 0.02,
"eval_loss": 0.5211273431777954,
"eval_runtime": 222.3408,
"eval_samples_per_second": 4.498,
"eval_steps_per_second": 0.283,
"step": 200
},
{
"epoch": 0.02,
"learning_rate": 0.00045000000000000004,
"loss": 0.1103,
"step": 225
},
{
"epoch": 0.03,
"learning_rate": 0.0005,
"loss": 0.1132,
"step": 250
},
{
"epoch": 0.03,
"learning_rate": 0.00055,
"loss": 0.1192,
"step": 275
},
{
"epoch": 0.03,
"learning_rate": 0.0006,
"loss": 0.0996,
"step": 300
},
{
"epoch": 0.03,
"learning_rate": 0.0006500000000000001,
"loss": 0.1094,
"step": 325
},
{
"epoch": 0.04,
"learning_rate": 0.0007,
"loss": 0.1099,
"step": 350
},
{
"epoch": 0.04,
"learning_rate": 0.00075,
"loss": 0.1044,
"step": 375
},
{
"epoch": 0.04,
"learning_rate": 0.0008,
"loss": 0.1153,
"step": 400
},
{
"epoch": 0.04,
"eval_loss": 0.5528811812400818,
"eval_runtime": 215.4309,
"eval_samples_per_second": 4.642,
"eval_steps_per_second": 0.292,
"step": 400
},
{
"epoch": 0.04,
"learning_rate": 0.00085,
"loss": 0.1142,
"step": 425
},
{
"epoch": 0.04,
"learning_rate": 0.0009000000000000001,
"loss": 0.1265,
"step": 450
},
{
"epoch": 0.05,
"learning_rate": 0.00095,
"loss": 0.1238,
"step": 475
},
{
"epoch": 0.05,
"learning_rate": 0.001,
"loss": 0.1515,
"step": 500
},
{
"epoch": 0.05,
"learning_rate": 0.0009973684210526316,
"loss": 0.1554,
"step": 525
},
{
"epoch": 0.06,
"learning_rate": 0.000994842105263158,
"loss": 0.1472,
"step": 550
},
{
"epoch": 0.06,
"learning_rate": 0.0009922105263157894,
"loss": 0.1346,
"step": 575
},
{
"epoch": 0.06,
"learning_rate": 0.000989578947368421,
"loss": 0.1307,
"step": 600
},
{
"epoch": 0.06,
"eval_loss": 0.6507117748260498,
"eval_runtime": 230.1411,
"eval_samples_per_second": 4.345,
"eval_steps_per_second": 0.274,
"step": 600
},
{
"epoch": 0.06,
"learning_rate": 0.0009869473684210527,
"loss": 0.1318,
"step": 625
},
{
"epoch": 0.07,
"learning_rate": 0.0009843157894736843,
"loss": 0.1296,
"step": 650
},
{
"epoch": 0.07,
"learning_rate": 0.0009816842105263158,
"loss": 0.1215,
"step": 675
},
{
"epoch": 0.07,
"learning_rate": 0.0009790526315789473,
"loss": 0.1209,
"step": 700
},
{
"epoch": 0.07,
"learning_rate": 0.000976421052631579,
"loss": 0.1117,
"step": 725
},
{
"epoch": 0.07,
"learning_rate": 0.0009737894736842105,
"loss": 0.1156,
"step": 750
},
{
"epoch": 0.08,
"learning_rate": 0.0009711578947368422,
"loss": 0.1223,
"step": 775
},
{
"epoch": 0.08,
"learning_rate": 0.0009685263157894737,
"loss": 0.1195,
"step": 800
},
{
"epoch": 0.08,
"eval_loss": 0.6224600672721863,
"eval_runtime": 224.4686,
"eval_samples_per_second": 4.455,
"eval_steps_per_second": 0.281,
"step": 800
},
{
"epoch": 0.08,
"learning_rate": 0.0009658947368421053,
"loss": 0.1256,
"step": 825
},
{
"epoch": 0.09,
"learning_rate": 0.0009632631578947368,
"loss": 0.127,
"step": 850
},
{
"epoch": 0.09,
"learning_rate": 0.0009606315789473684,
"loss": 0.1131,
"step": 875
},
{
"epoch": 0.09,
"learning_rate": 0.000958,
"loss": 0.1266,
"step": 900
},
{
"epoch": 0.09,
"learning_rate": 0.0009553684210526315,
"loss": 0.1223,
"step": 925
},
{
"epoch": 0.1,
"learning_rate": 0.0009527368421052632,
"loss": 0.1247,
"step": 950
},
{
"epoch": 0.1,
"learning_rate": 0.0009501052631578948,
"loss": 0.126,
"step": 975
},
{
"epoch": 0.1,
"learning_rate": 0.0009474736842105263,
"loss": 0.1271,
"step": 1000
},
{
"epoch": 0.1,
"eval_loss": 0.6543993353843689,
"eval_runtime": 224.6223,
"eval_samples_per_second": 4.452,
"eval_steps_per_second": 0.28,
"step": 1000
},
{
"epoch": 0.1,
"learning_rate": 0.000944842105263158,
"loss": 0.1111,
"step": 1025
},
{
"epoch": 0.1,
"learning_rate": 0.0009422105263157895,
"loss": 0.1174,
"step": 1050
},
{
"epoch": 0.11,
"learning_rate": 0.0009395789473684211,
"loss": 0.1234,
"step": 1075
},
{
"epoch": 0.11,
"learning_rate": 0.0009369473684210527,
"loss": 0.1176,
"step": 1100
},
{
"epoch": 0.11,
"learning_rate": 0.0009343157894736842,
"loss": 0.1154,
"step": 1125
},
{
"epoch": 0.12,
"learning_rate": 0.0009316842105263158,
"loss": 0.1096,
"step": 1150
},
{
"epoch": 0.12,
"learning_rate": 0.0009290526315789473,
"loss": 0.1202,
"step": 1175
},
{
"epoch": 0.12,
"learning_rate": 0.000926421052631579,
"loss": 0.1153,
"step": 1200
},
{
"epoch": 0.12,
"eval_loss": 0.6334092020988464,
"eval_runtime": 231.0686,
"eval_samples_per_second": 4.328,
"eval_steps_per_second": 0.273,
"step": 1200
},
{
"epoch": 0.12,
"learning_rate": 0.0009237894736842105,
"loss": 0.1045,
"step": 1225
},
{
"epoch": 0.12,
"learning_rate": 0.000921157894736842,
"loss": 0.1181,
"step": 1250
},
{
"epoch": 0.13,
"learning_rate": 0.0009185263157894737,
"loss": 0.1149,
"step": 1275
},
{
"epoch": 0.13,
"learning_rate": 0.0009158947368421053,
"loss": 0.1108,
"step": 1300
},
{
"epoch": 0.13,
"learning_rate": 0.0009132631578947369,
"loss": 0.1154,
"step": 1325
},
{
"epoch": 0.14,
"learning_rate": 0.0009106315789473685,
"loss": 0.1137,
"step": 1350
},
{
"epoch": 0.14,
"learning_rate": 0.0009080000000000001,
"loss": 0.1165,
"step": 1375
},
{
"epoch": 0.14,
"learning_rate": 0.0009053684210526316,
"loss": 0.1157,
"step": 1400
},
{
"epoch": 0.14,
"eval_loss": 0.6377915740013123,
"eval_runtime": 234.7043,
"eval_samples_per_second": 4.261,
"eval_steps_per_second": 0.268,
"step": 1400
},
{
"epoch": 0.14,
"learning_rate": 0.0009027368421052631,
"loss": 0.1047,
"step": 1425
},
{
"epoch": 0.14,
"learning_rate": 0.0009001052631578948,
"loss": 0.1164,
"step": 1450
},
{
"epoch": 0.15,
"learning_rate": 0.0008974736842105263,
"loss": 0.1643,
"step": 1475
},
{
"epoch": 0.15,
"learning_rate": 0.0008948421052631579,
"loss": 0.3201,
"step": 1500
},
{
"epoch": 0.15,
"learning_rate": 0.0008922105263157895,
"loss": 0.3339,
"step": 1525
},
{
"epoch": 0.15,
"learning_rate": 0.000889578947368421,
"loss": 0.3632,
"step": 1550
},
{
"epoch": 0.16,
"learning_rate": 0.0008869473684210526,
"loss": 0.3388,
"step": 1575
},
{
"epoch": 0.16,
"learning_rate": 0.0008843157894736842,
"loss": 0.3302,
"step": 1600
},
{
"epoch": 0.16,
"eval_loss": 0.3262811303138733,
"eval_runtime": 238.2184,
"eval_samples_per_second": 4.198,
"eval_steps_per_second": 0.264,
"step": 1600
},
{
"epoch": 0.16,
"learning_rate": 0.0008816842105263158,
"loss": 0.3697,
"step": 1625
},
{
"epoch": 0.17,
"learning_rate": 0.0008790526315789474,
"loss": 0.3009,
"step": 1650
},
{
"epoch": 0.17,
"learning_rate": 0.0008764210526315791,
"loss": 0.2743,
"step": 1675
},
{
"epoch": 0.17,
"learning_rate": 0.000874,
"loss": 0.2846,
"step": 1700
},
{
"epoch": 0.17,
"learning_rate": 0.0008714736842105263,
"loss": 0.2532,
"step": 1725
},
{
"epoch": 0.17,
"learning_rate": 0.0008688421052631579,
"loss": 0.2461,
"step": 1750
},
{
"epoch": 1.0,
"learning_rate": 0.0008662105263157894,
"loss": 0.2367,
"step": 1775
},
{
"epoch": 1.0,
"learning_rate": 0.0008635789473684211,
"loss": 0.1269,
"step": 1800
},
{
"epoch": 1.0,
"eval_loss": 0.3383277654647827,
"eval_runtime": 259.2781,
"eval_samples_per_second": 3.857,
"eval_steps_per_second": 0.243,
"step": 1800
},
{
"epoch": 1.01,
"learning_rate": 0.0008609473684210527,
"loss": 0.1188,
"step": 1825
},
{
"epoch": 1.01,
"learning_rate": 0.0008583157894736842,
"loss": 0.1173,
"step": 1850
},
{
"epoch": 1.01,
"learning_rate": 0.0008556842105263159,
"loss": 0.1257,
"step": 1875
},
{
"epoch": 1.01,
"learning_rate": 0.0008530526315789474,
"loss": 0.1197,
"step": 1900
},
{
"epoch": 1.02,
"learning_rate": 0.000850421052631579,
"loss": 0.1109,
"step": 1925
},
{
"epoch": 1.02,
"learning_rate": 0.0008477894736842105,
"loss": 0.1041,
"step": 1950
},
{
"epoch": 1.02,
"learning_rate": 0.0008451578947368422,
"loss": 0.101,
"step": 1975
},
{
"epoch": 1.02,
"learning_rate": 0.0008425263157894737,
"loss": 0.1006,
"step": 2000
},
{
"epoch": 1.02,
"eval_loss": 0.3684940040111542,
"eval_runtime": 255.8865,
"eval_samples_per_second": 3.908,
"eval_steps_per_second": 0.246,
"step": 2000
},
{
"epoch": 1.03,
"learning_rate": 0.0008398947368421052,
"loss": 0.1051,
"step": 2025
},
{
"epoch": 1.03,
"learning_rate": 0.0008372631578947369,
"loss": 0.1029,
"step": 2050
},
{
"epoch": 1.03,
"learning_rate": 0.0008346315789473684,
"loss": 0.0923,
"step": 2075
},
{
"epoch": 1.03,
"learning_rate": 0.000832,
"loss": 0.0983,
"step": 2100
},
{
"epoch": 1.04,
"learning_rate": 0.0008293684210526316,
"loss": 0.09,
"step": 2125
},
{
"epoch": 1.04,
"learning_rate": 0.0008267368421052631,
"loss": 0.0906,
"step": 2150
},
{
"epoch": 1.04,
"learning_rate": 0.0008241052631578948,
"loss": 0.0899,
"step": 2175
},
{
"epoch": 1.04,
"learning_rate": 0.0008214736842105264,
"loss": 0.0876,
"step": 2200
},
{
"epoch": 1.04,
"eval_loss": 0.3948498070240021,
"eval_runtime": 239.7929,
"eval_samples_per_second": 4.17,
"eval_steps_per_second": 0.263,
"step": 2200
},
{
"epoch": 1.05,
"learning_rate": 0.000818842105263158,
"loss": 0.0845,
"step": 2225
},
{
"epoch": 1.05,
"learning_rate": 0.0008162105263157895,
"loss": 0.0799,
"step": 2250
},
{
"epoch": 1.05,
"learning_rate": 0.000813578947368421,
"loss": 0.0898,
"step": 2275
},
{
"epoch": 1.05,
"learning_rate": 0.0008109473684210527,
"loss": 0.0818,
"step": 2300
},
{
"epoch": 1.06,
"learning_rate": 0.0008083157894736842,
"loss": 0.0914,
"step": 2325
},
{
"epoch": 1.06,
"learning_rate": 0.0008056842105263158,
"loss": 0.0844,
"step": 2350
},
{
"epoch": 1.06,
"learning_rate": 0.0008030526315789474,
"loss": 0.0845,
"step": 2375
},
{
"epoch": 1.06,
"learning_rate": 0.0008004210526315789,
"loss": 0.0815,
"step": 2400
},
{
"epoch": 1.06,
"eval_loss": 0.40161755681037903,
"eval_runtime": 239.5377,
"eval_samples_per_second": 4.175,
"eval_steps_per_second": 0.263,
"step": 2400
},
{
"epoch": 1.07,
"learning_rate": 0.0007977894736842105,
"loss": 0.0806,
"step": 2425
},
{
"epoch": 1.07,
"learning_rate": 0.0007951578947368421,
"loss": 0.0783,
"step": 2450
},
{
"epoch": 1.07,
"learning_rate": 0.0007925263157894737,
"loss": 0.0746,
"step": 2475
},
{
"epoch": 1.07,
"learning_rate": 0.0007898947368421053,
"loss": 0.0748,
"step": 2500
},
{
"epoch": 1.08,
"learning_rate": 0.0007872631578947369,
"loss": 0.079,
"step": 2525
},
{
"epoch": 1.08,
"learning_rate": 0.0007846315789473685,
"loss": 0.0729,
"step": 2550
},
{
"epoch": 1.08,
"learning_rate": 0.000782,
"loss": 0.0745,
"step": 2575
},
{
"epoch": 1.08,
"learning_rate": 0.0007793684210526316,
"loss": 0.0821,
"step": 2600
},
{
"epoch": 1.08,
"eval_loss": 0.4262455105781555,
"eval_runtime": 240.4602,
"eval_samples_per_second": 4.159,
"eval_steps_per_second": 0.262,
"step": 2600
},
{
"epoch": 1.09,
"learning_rate": 0.0007767368421052632,
"loss": 0.074,
"step": 2625
},
{
"epoch": 1.09,
"learning_rate": 0.0007741052631578948,
"loss": 0.0692,
"step": 2650
},
{
"epoch": 1.09,
"learning_rate": 0.0007714736842105263,
"loss": 0.0779,
"step": 2675
},
{
"epoch": 1.09,
"learning_rate": 0.0007688421052631579,
"loss": 0.0678,
"step": 2700
},
{
"epoch": 1.1,
"learning_rate": 0.0007662105263157895,
"loss": 0.079,
"step": 2725
},
{
"epoch": 1.1,
"learning_rate": 0.000763578947368421,
"loss": 0.0689,
"step": 2750
},
{
"epoch": 1.1,
"learning_rate": 0.0007609473684210527,
"loss": 0.0761,
"step": 2775
},
{
"epoch": 1.1,
"learning_rate": 0.0007583157894736842,
"loss": 0.0758,
"step": 2800
},
{
"epoch": 1.1,
"eval_loss": 0.4292161166667938,
"eval_runtime": 229.938,
"eval_samples_per_second": 4.349,
"eval_steps_per_second": 0.274,
"step": 2800
},
{
"epoch": 1.11,
"learning_rate": 0.0007556842105263157,
"loss": 0.0652,
"step": 2825
},
{
"epoch": 1.11,
"learning_rate": 0.0007530526315789474,
"loss": 0.0722,
"step": 2850
},
{
"epoch": 1.11,
"learning_rate": 0.000750421052631579,
"loss": 0.0703,
"step": 2875
},
{
"epoch": 1.11,
"learning_rate": 0.0007477894736842106,
"loss": 0.0698,
"step": 2900
},
{
"epoch": 1.12,
"learning_rate": 0.0007451578947368421,
"loss": 0.0675,
"step": 2925
},
{
"epoch": 1.12,
"learning_rate": 0.0007425263157894738,
"loss": 0.0717,
"step": 2950
},
{
"epoch": 1.12,
"learning_rate": 0.0007398947368421053,
"loss": 0.0711,
"step": 2975
},
{
"epoch": 1.12,
"learning_rate": 0.0007372631578947368,
"loss": 0.0689,
"step": 3000
},
{
"epoch": 1.12,
"eval_loss": 0.4249730110168457,
"eval_runtime": 233.7088,
"eval_samples_per_second": 4.279,
"eval_steps_per_second": 0.27,
"step": 3000
},
{
"epoch": 1.13,
"learning_rate": 0.0007346315789473685,
"loss": 0.0697,
"step": 3025
},
{
"epoch": 1.13,
"learning_rate": 0.000732,
"loss": 0.0719,
"step": 3050
},
{
"epoch": 1.13,
"learning_rate": 0.0007293684210526315,
"loss": 0.0717,
"step": 3075
},
{
"epoch": 1.13,
"learning_rate": 0.0007267368421052631,
"loss": 0.0739,
"step": 3100
},
{
"epoch": 1.14,
"learning_rate": 0.0007241052631578947,
"loss": 0.0653,
"step": 3125
},
{
"epoch": 1.14,
"learning_rate": 0.0007214736842105263,
"loss": 0.0679,
"step": 3150
},
{
"epoch": 1.14,
"learning_rate": 0.0007188421052631579,
"loss": 0.0697,
"step": 3175
},
{
"epoch": 1.14,
"learning_rate": 0.0007162105263157896,
"loss": 0.0612,
"step": 3200
},
{
"epoch": 1.14,
"eval_loss": 0.4456619620323181,
"eval_runtime": 283.5688,
"eval_samples_per_second": 3.526,
"eval_steps_per_second": 0.222,
"step": 3200
},
{
"epoch": 1.15,
"learning_rate": 0.0007135789473684211,
"loss": 0.071,
"step": 3225
},
{
"epoch": 1.15,
"learning_rate": 0.0007109473684210526,
"loss": 0.1414,
"step": 3250
},
{
"epoch": 1.15,
"learning_rate": 0.0007083157894736843,
"loss": 0.2082,
"step": 3275
},
{
"epoch": 1.15,
"learning_rate": 0.0007056842105263158,
"loss": 0.2392,
"step": 3300
},
{
"epoch": 1.16,
"learning_rate": 0.0007030526315789474,
"loss": 0.2313,
"step": 3325
},
{
"epoch": 1.16,
"learning_rate": 0.000700421052631579,
"loss": 0.2382,
"step": 3350
},
{
"epoch": 1.16,
"learning_rate": 0.0006977894736842105,
"loss": 0.2509,
"step": 3375
},
{
"epoch": 1.16,
"learning_rate": 0.0006951578947368421,
"loss": 0.2347,
"step": 3400
},
{
"epoch": 1.16,
"eval_loss": 0.2735106348991394,
"eval_runtime": 256.6563,
"eval_samples_per_second": 3.896,
"eval_steps_per_second": 0.245,
"step": 3400
},
{
"epoch": 1.17,
"learning_rate": 0.0006925263157894736,
"loss": 0.215,
"step": 3425
},
{
"epoch": 1.17,
"learning_rate": 0.0006898947368421053,
"loss": 0.1968,
"step": 3450
},
{
"epoch": 1.17,
"learning_rate": 0.0006872631578947368,
"loss": 0.1947,
"step": 3475
},
{
"epoch": 1.17,
"learning_rate": 0.0006846315789473683,
"loss": 0.1786,
"step": 3500
},
{
"epoch": 1.18,
"learning_rate": 0.0006820000000000001,
"loss": 0.1816,
"step": 3525
},
{
"epoch": 2.0,
"learning_rate": 0.0006793684210526316,
"loss": 0.1258,
"step": 3550
},
{
"epoch": 2.0,
"learning_rate": 0.0006767368421052632,
"loss": 0.076,
"step": 3575
},
{
"epoch": 2.01,
"learning_rate": 0.0006741052631578948,
"loss": 0.0723,
"step": 3600
},
{
"epoch": 2.01,
"eval_loss": 0.3416486084461212,
"eval_runtime": 264.4251,
"eval_samples_per_second": 3.782,
"eval_steps_per_second": 0.238,
"step": 3600
},
{
"epoch": 2.01,
"learning_rate": 0.0006714736842105264,
"loss": 0.0754,
"step": 3625
},
{
"epoch": 2.01,
"learning_rate": 0.0006688421052631579,
"loss": 0.0775,
"step": 3650
},
{
"epoch": 2.01,
"learning_rate": 0.0006662105263157895,
"loss": 0.0781,
"step": 3675
},
{
"epoch": 2.02,
"learning_rate": 0.0006635789473684211,
"loss": 0.0684,
"step": 3700
},
{
"epoch": 2.02,
"learning_rate": 0.0006609473684210526,
"loss": 0.0682,
"step": 3725
},
{
"epoch": 2.02,
"learning_rate": 0.0006583157894736842,
"loss": 0.0624,
"step": 3750
},
{
"epoch": 2.02,
"learning_rate": 0.0006556842105263158,
"loss": 0.07,
"step": 3775
},
{
"epoch": 2.03,
"learning_rate": 0.0006530526315789473,
"loss": 0.0728,
"step": 3800
},
{
"epoch": 2.03,
"eval_loss": 0.3479284644126892,
"eval_runtime": 292.143,
"eval_samples_per_second": 3.423,
"eval_steps_per_second": 0.216,
"step": 3800
},
{
"epoch": 2.03,
"learning_rate": 0.0006504210526315789,
"loss": 0.0762,
"step": 3825
},
{
"epoch": 2.03,
"learning_rate": 0.0006477894736842106,
"loss": 0.0714,
"step": 3850
},
{
"epoch": 2.03,
"learning_rate": 0.0006451578947368422,
"loss": 0.0753,
"step": 3875
},
{
"epoch": 2.04,
"learning_rate": 0.0006425263157894737,
"loss": 0.0653,
"step": 3900
},
{
"epoch": 2.04,
"learning_rate": 0.0006398947368421054,
"loss": 0.0711,
"step": 3925
},
{
"epoch": 2.04,
"learning_rate": 0.0006372631578947369,
"loss": 0.0664,
"step": 3950
},
{
"epoch": 2.04,
"learning_rate": 0.0006346315789473684,
"loss": 0.0608,
"step": 3975
},
{
"epoch": 2.05,
"learning_rate": 0.000632,
"loss": 0.0592,
"step": 4000
},
{
"epoch": 2.05,
"eval_loss": 0.37529394030570984,
"eval_runtime": 235.8642,
"eval_samples_per_second": 4.24,
"eval_steps_per_second": 0.267,
"step": 4000
},
{
"epoch": 2.05,
"learning_rate": 0.0006293684210526316,
"loss": 0.0588,
"step": 4025
},
{
"epoch": 2.05,
"learning_rate": 0.0006267368421052632,
"loss": 0.0612,
"step": 4050
},
{
"epoch": 2.05,
"learning_rate": 0.0006241052631578947,
"loss": 0.0603,
"step": 4075
},
{
"epoch": 2.06,
"learning_rate": 0.0006214736842105263,
"loss": 0.0727,
"step": 4100
},
{
"epoch": 2.06,
"learning_rate": 0.0006188421052631579,
"loss": 0.0625,
"step": 4125
},
{
"epoch": 2.06,
"learning_rate": 0.0006162105263157894,
"loss": 0.0617,
"step": 4150
},
{
"epoch": 2.06,
"learning_rate": 0.0006135789473684211,
"loss": 0.0642,
"step": 4175
},
{
"epoch": 2.07,
"learning_rate": 0.0006109473684210527,
"loss": 0.0601,
"step": 4200
},
{
"epoch": 2.07,
"eval_loss": 0.39785417914390564,
"eval_runtime": 249.2887,
"eval_samples_per_second": 4.011,
"eval_steps_per_second": 0.253,
"step": 4200
},
{
"epoch": 2.07,
"learning_rate": 0.0006083157894736842,
"loss": 0.0603,
"step": 4225
},
{
"epoch": 2.07,
"learning_rate": 0.0006056842105263159,
"loss": 0.0555,
"step": 4250
},
{
"epoch": 2.07,
"learning_rate": 0.0006030526315789474,
"loss": 0.0486,
"step": 4275
},
{
"epoch": 2.08,
"learning_rate": 0.000600421052631579,
"loss": 0.0605,
"step": 4300
},
{
"epoch": 2.08,
"learning_rate": 0.0005977894736842105,
"loss": 0.0507,
"step": 4325
},
{
"epoch": 2.08,
"learning_rate": 0.0005951578947368421,
"loss": 0.0524,
"step": 4350
},
{
"epoch": 2.08,
"learning_rate": 0.0005925263157894737,
"loss": 0.0572,
"step": 4375
},
{
"epoch": 2.09,
"learning_rate": 0.0005898947368421052,
"loss": 0.0484,
"step": 4400
},
{
"epoch": 2.09,
"eval_loss": 0.3840053975582123,
"eval_runtime": 233.7153,
"eval_samples_per_second": 4.279,
"eval_steps_per_second": 0.27,
"step": 4400
},
{
"epoch": 2.09,
"learning_rate": 0.0005872631578947369,
"loss": 0.0532,
"step": 4425
},
{
"epoch": 2.09,
"learning_rate": 0.0005846315789473684,
"loss": 0.0532,
"step": 4450
},
{
"epoch": 2.09,
"learning_rate": 0.0005819999999999999,
"loss": 0.0455,
"step": 4475
},
{
"epoch": 2.1,
"learning_rate": 0.0005793684210526316,
"loss": 0.0506,
"step": 4500
},
{
"epoch": 2.1,
"learning_rate": 0.0005767368421052631,
"loss": 0.046,
"step": 4525
},
{
"epoch": 2.1,
"learning_rate": 0.0005741052631578948,
"loss": 0.0523,
"step": 4550
},
{
"epoch": 2.1,
"learning_rate": 0.0005714736842105264,
"loss": 0.0456,
"step": 4575
},
{
"epoch": 2.11,
"learning_rate": 0.000568842105263158,
"loss": 0.0457,
"step": 4600
},
{
"epoch": 2.11,
"eval_loss": 0.4161207675933838,
"eval_runtime": 226.1145,
"eval_samples_per_second": 4.423,
"eval_steps_per_second": 0.279,
"step": 4600
},
{
"epoch": 2.11,
"learning_rate": 0.0005662105263157895,
"loss": 0.0454,
"step": 4625
},
{
"epoch": 2.11,
"learning_rate": 0.000563578947368421,
"loss": 0.0461,
"step": 4650
},
{
"epoch": 2.11,
"learning_rate": 0.0005609473684210527,
"loss": 0.0438,
"step": 4675
},
{
"epoch": 2.12,
"learning_rate": 0.0005583157894736842,
"loss": 0.0444,
"step": 4700
},
{
"epoch": 2.12,
"learning_rate": 0.0005556842105263158,
"loss": 0.0457,
"step": 4725
},
{
"epoch": 2.12,
"learning_rate": 0.0005530526315789474,
"loss": 0.0432,
"step": 4750
},
{
"epoch": 2.12,
"learning_rate": 0.0005504210526315789,
"loss": 0.0424,
"step": 4775
},
{
"epoch": 2.13,
"learning_rate": 0.0005477894736842105,
"loss": 0.0472,
"step": 4800
},
{
"epoch": 2.13,
"eval_loss": 0.3975374102592468,
"eval_runtime": 233.7801,
"eval_samples_per_second": 4.278,
"eval_steps_per_second": 0.269,
"step": 4800
},
{
"epoch": 2.13,
"learning_rate": 0.0005451578947368421,
"loss": 0.0418,
"step": 4825
},
{
"epoch": 2.13,
"learning_rate": 0.0005425263157894737,
"loss": 0.0435,
"step": 4850
},
{
"epoch": 2.13,
"learning_rate": 0.0005398947368421053,
"loss": 0.0459,
"step": 4875
},
{
"epoch": 2.14,
"learning_rate": 0.0005372631578947368,
"loss": 0.0457,
"step": 4900
},
{
"epoch": 2.14,
"learning_rate": 0.0005346315789473685,
"loss": 0.0429,
"step": 4925
},
{
"epoch": 2.14,
"learning_rate": 0.000532,
"loss": 0.0426,
"step": 4950
},
{
"epoch": 2.14,
"learning_rate": 0.0005293684210526316,
"loss": 0.0387,
"step": 4975
},
{
"epoch": 2.15,
"learning_rate": 0.0005267368421052632,
"loss": 0.0371,
"step": 5000
},
{
"epoch": 2.15,
"eval_loss": 0.38193443417549133,
"eval_runtime": 235.0403,
"eval_samples_per_second": 4.255,
"eval_steps_per_second": 0.268,
"step": 5000
},
{
"epoch": 2.15,
"learning_rate": 0.0005241052631578948,
"loss": 0.1203,
"step": 5025
},
{
"epoch": 2.15,
"learning_rate": 0.0005214736842105263,
"loss": 0.1324,
"step": 5050
},
{
"epoch": 2.15,
"learning_rate": 0.0005188421052631579,
"loss": 0.1544,
"step": 5075
},
{
"epoch": 2.16,
"learning_rate": 0.0005162105263157895,
"loss": 0.1441,
"step": 5100
},
{
"epoch": 2.16,
"learning_rate": 0.000513578947368421,
"loss": 0.1528,
"step": 5125
},
{
"epoch": 2.16,
"learning_rate": 0.0005109473684210527,
"loss": 0.176,
"step": 5150
},
{
"epoch": 2.16,
"learning_rate": 0.0005083157894736842,
"loss": 0.1425,
"step": 5175
},
{
"epoch": 2.17,
"learning_rate": 0.0005056842105263157,
"loss": 0.1266,
"step": 5200
},
{
"epoch": 2.17,
"eval_loss": 0.2390679568052292,
"eval_runtime": 228.9049,
"eval_samples_per_second": 4.369,
"eval_steps_per_second": 0.275,
"step": 5200
},
{
"epoch": 2.17,
"learning_rate": 0.0005030526315789474,
"loss": 0.1253,
"step": 5225
},
{
"epoch": 2.17,
"learning_rate": 0.000500421052631579,
"loss": 0.1176,
"step": 5250
},
{
"epoch": 2.17,
"learning_rate": 0.0004977894736842106,
"loss": 0.1122,
"step": 5275
},
{
"epoch": 2.18,
"learning_rate": 0.0004951578947368421,
"loss": 0.1195,
"step": 5300
},
{
"epoch": 3.0,
"learning_rate": 0.0004925263157894737,
"loss": 0.0519,
"step": 5325
},
{
"epoch": 3.0,
"learning_rate": 0.0004898947368421053,
"loss": 0.0397,
"step": 5350
},
{
"epoch": 3.01,
"learning_rate": 0.0004872631578947369,
"loss": 0.0439,
"step": 5375
},
{
"epoch": 3.01,
"learning_rate": 0.0004846315789473684,
"loss": 0.0467,
"step": 5400
},
{
"epoch": 3.01,
"eval_loss": 0.3231227993965149,
"eval_runtime": 236.2617,
"eval_samples_per_second": 4.233,
"eval_steps_per_second": 0.267,
"step": 5400
},
{
"epoch": 3.01,
"learning_rate": 0.000482,
"loss": 0.047,
"step": 5425
},
{
"epoch": 3.01,
"learning_rate": 0.00047936842105263154,
"loss": 0.0416,
"step": 5450
},
{
"epoch": 3.02,
"learning_rate": 0.0004767368421052632,
"loss": 0.0344,
"step": 5475
},
{
"epoch": 3.02,
"learning_rate": 0.0004741052631578948,
"loss": 0.0369,
"step": 5500
},
{
"epoch": 3.02,
"learning_rate": 0.0004714736842105263,
"loss": 0.0344,
"step": 5525
},
{
"epoch": 3.02,
"learning_rate": 0.0004688421052631579,
"loss": 0.0358,
"step": 5550
},
{
"epoch": 3.03,
"learning_rate": 0.00046621052631578945,
"loss": 0.0385,
"step": 5575
},
{
"epoch": 3.03,
"learning_rate": 0.00046357894736842104,
"loss": 0.0342,
"step": 5600
},
{
"epoch": 3.03,
"eval_loss": 0.31728434562683105,
"eval_runtime": 231.7415,
"eval_samples_per_second": 4.315,
"eval_steps_per_second": 0.272,
"step": 5600
},
{
"epoch": 3.03,
"learning_rate": 0.00046094736842105263,
"loss": 0.0345,
"step": 5625
},
{
"epoch": 3.03,
"learning_rate": 0.0004583157894736842,
"loss": 0.0377,
"step": 5650
},
{
"epoch": 3.04,
"learning_rate": 0.0004556842105263158,
"loss": 0.0313,
"step": 5675
},
{
"epoch": 3.04,
"learning_rate": 0.00045305263157894736,
"loss": 0.0337,
"step": 5700
},
{
"epoch": 3.04,
"learning_rate": 0.00045042105263157895,
"loss": 0.0312,
"step": 5725
},
{
"epoch": 3.04,
"learning_rate": 0.00044778947368421054,
"loss": 0.0349,
"step": 5750
},
{
"epoch": 3.05,
"learning_rate": 0.0004451578947368421,
"loss": 0.03,
"step": 5775
},
{
"epoch": 3.05,
"learning_rate": 0.00044252631578947367,
"loss": 0.0302,
"step": 5800
},
{
"epoch": 3.05,
"eval_loss": 0.3289109170436859,
"eval_runtime": 231.1941,
"eval_samples_per_second": 4.325,
"eval_steps_per_second": 0.272,
"step": 5800
},
{
"epoch": 3.05,
"learning_rate": 0.0004398947368421053,
"loss": 0.0314,
"step": 5825
},
{
"epoch": 3.05,
"learning_rate": 0.00043726315789473685,
"loss": 0.0298,
"step": 5850
},
{
"epoch": 3.06,
"learning_rate": 0.00043463157894736845,
"loss": 0.0343,
"step": 5875
},
{
"epoch": 3.06,
"learning_rate": 0.000432,
"loss": 0.0279,
"step": 5900
},
{
"epoch": 3.06,
"learning_rate": 0.0004293684210526316,
"loss": 0.029,
"step": 5925
},
{
"epoch": 3.06,
"learning_rate": 0.00042673684210526317,
"loss": 0.0325,
"step": 5950
},
{
"epoch": 3.07,
"learning_rate": 0.00042410526315789476,
"loss": 0.0286,
"step": 5975
},
{
"epoch": 3.07,
"learning_rate": 0.00042147368421052635,
"loss": 0.0288,
"step": 6000
},
{
"epoch": 3.07,
"eval_loss": 0.33179327845573425,
"eval_runtime": 229.5634,
"eval_samples_per_second": 4.356,
"eval_steps_per_second": 0.274,
"step": 6000
},
{
"epoch": 3.07,
"learning_rate": 0.0004188421052631579,
"loss": 0.0239,
"step": 6025
},
{
"epoch": 3.07,
"learning_rate": 0.0004162105263157895,
"loss": 0.03,
"step": 6050
},
{
"epoch": 3.08,
"learning_rate": 0.0004135789473684211,
"loss": 0.0267,
"step": 6075
},
{
"epoch": 3.08,
"learning_rate": 0.0004109473684210526,
"loss": 0.0243,
"step": 6100
},
{
"epoch": 3.08,
"learning_rate": 0.0004083157894736842,
"loss": 0.0265,
"step": 6125
},
{
"epoch": 3.08,
"learning_rate": 0.0004056842105263158,
"loss": 0.0286,
"step": 6150
},
{
"epoch": 3.09,
"learning_rate": 0.0004030526315789474,
"loss": 0.0229,
"step": 6175
},
{
"epoch": 3.09,
"learning_rate": 0.000400421052631579,
"loss": 0.0236,
"step": 6200
},
{
"epoch": 3.09,
"eval_loss": 0.3390714228153229,
"eval_runtime": 232.9846,
"eval_samples_per_second": 4.292,
"eval_steps_per_second": 0.27,
"step": 6200
},
{
"epoch": 3.09,
"learning_rate": 0.0003977894736842105,
"loss": 0.0225,
"step": 6225
},
{
"epoch": 3.09,
"learning_rate": 0.0003951578947368421,
"loss": 0.0241,
"step": 6250
},
{
"epoch": 3.1,
"learning_rate": 0.0003925263157894737,
"loss": 0.0253,
"step": 6275
},
{
"epoch": 3.1,
"learning_rate": 0.00038989473684210524,
"loss": 0.0241,
"step": 6300
},
{
"epoch": 3.1,
"learning_rate": 0.0003872631578947369,
"loss": 0.0231,
"step": 6325
},
{
"epoch": 3.1,
"learning_rate": 0.0003846315789473684,
"loss": 0.0212,
"step": 6350
},
{
"epoch": 3.11,
"learning_rate": 0.000382,
"loss": 0.0209,
"step": 6375
},
{
"epoch": 3.11,
"learning_rate": 0.0003793684210526316,
"loss": 0.0223,
"step": 6400
},
{
"epoch": 3.11,
"eval_loss": 0.35378792881965637,
"eval_runtime": 234.1666,
"eval_samples_per_second": 4.27,
"eval_steps_per_second": 0.269,
"step": 6400
},
{
"epoch": 3.11,
"learning_rate": 0.00037673684210526315,
"loss": 0.0241,
"step": 6425
},
{
"epoch": 3.11,
"learning_rate": 0.00037410526315789474,
"loss": 0.0206,
"step": 6450
},
{
"epoch": 3.12,
"learning_rate": 0.0003714736842105263,
"loss": 0.0206,
"step": 6475
},
{
"epoch": 3.12,
"learning_rate": 0.0003688421052631579,
"loss": 0.0234,
"step": 6500
},
{
"epoch": 3.12,
"learning_rate": 0.0003662105263157895,
"loss": 0.0185,
"step": 6525
},
{
"epoch": 3.12,
"learning_rate": 0.00036357894736842105,
"loss": 0.0205,
"step": 6550
},
{
"epoch": 3.13,
"learning_rate": 0.00036094736842105264,
"loss": 0.0217,
"step": 6575
},
{
"epoch": 3.13,
"learning_rate": 0.0003583157894736842,
"loss": 0.023,
"step": 6600
},
{
"epoch": 3.13,
"eval_loss": 0.3395112156867981,
"eval_runtime": 232.7288,
"eval_samples_per_second": 4.297,
"eval_steps_per_second": 0.271,
"step": 6600
},
{
"epoch": 3.13,
"learning_rate": 0.00035568421052631577,
"loss": 0.0209,
"step": 6625
},
{
"epoch": 3.13,
"learning_rate": 0.0003530526315789474,
"loss": 0.0195,
"step": 6650
},
{
"epoch": 3.14,
"learning_rate": 0.00035042105263157896,
"loss": 0.0223,
"step": 6675
},
{
"epoch": 3.14,
"learning_rate": 0.00034778947368421055,
"loss": 0.0231,
"step": 6700
},
{
"epoch": 3.14,
"learning_rate": 0.00034515789473684214,
"loss": 0.021,
"step": 6725
},
{
"epoch": 3.14,
"learning_rate": 0.0003425263157894737,
"loss": 0.0234,
"step": 6750
},
{
"epoch": 3.15,
"learning_rate": 0.00033989473684210527,
"loss": 0.0281,
"step": 6775
},
{
"epoch": 3.15,
"learning_rate": 0.0003372631578947368,
"loss": 0.0652,
"step": 6800
},
{
"epoch": 3.15,
"eval_loss": 0.2487945556640625,
"eval_runtime": 234.5765,
"eval_samples_per_second": 4.263,
"eval_steps_per_second": 0.269,
"step": 6800
},
{
"epoch": 3.15,
"learning_rate": 0.00033463157894736845,
"loss": 0.069,
"step": 6825
},
{
"epoch": 3.15,
"learning_rate": 0.00033200000000000005,
"loss": 0.0776,
"step": 6850
},
{
"epoch": 3.16,
"learning_rate": 0.0003293684210526316,
"loss": 0.0736,
"step": 6875
},
{
"epoch": 3.16,
"learning_rate": 0.0003267368421052632,
"loss": 0.0746,
"step": 6900
},
{
"epoch": 3.16,
"learning_rate": 0.0003241052631578947,
"loss": 0.0903,
"step": 6925
},
{
"epoch": 3.16,
"learning_rate": 0.0003214736842105263,
"loss": 0.073,
"step": 6950
},
{
"epoch": 3.17,
"learning_rate": 0.0003188421052631579,
"loss": 0.0624,
"step": 6975
},
{
"epoch": 3.17,
"learning_rate": 0.0003162105263157895,
"loss": 0.0585,
"step": 7000
},
{
"epoch": 3.17,
"eval_loss": 0.23527191579341888,
"eval_runtime": 232.506,
"eval_samples_per_second": 4.301,
"eval_steps_per_second": 0.271,
"step": 7000
},
{
"epoch": 3.17,
"learning_rate": 0.0003135789473684211,
"loss": 0.055,
"step": 7025
},
{
"epoch": 3.17,
"learning_rate": 0.0003109473684210526,
"loss": 0.0531,
"step": 7050
},
{
"epoch": 4.0,
"learning_rate": 0.0003083157894736842,
"loss": 0.05,
"step": 7075
},
{
"epoch": 4.0,
"learning_rate": 0.0003056842105263158,
"loss": 0.0237,
"step": 7100
},
{
"epoch": 4.01,
"learning_rate": 0.00030305263157894734,
"loss": 0.0206,
"step": 7125
},
{
"epoch": 4.01,
"learning_rate": 0.00030042105263157893,
"loss": 0.0197,
"step": 7150
},
{
"epoch": 4.01,
"learning_rate": 0.0002977894736842106,
"loss": 0.0209,
"step": 7175
},
{
"epoch": 4.01,
"learning_rate": 0.0002951578947368421,
"loss": 0.0219,
"step": 7200
},
{
"epoch": 4.01,
"eval_loss": 0.29737773537635803,
"eval_runtime": 231.2741,
"eval_samples_per_second": 4.324,
"eval_steps_per_second": 0.272,
"step": 7200
},
{
"epoch": 4.02,
"learning_rate": 0.0002925263157894737,
"loss": 0.02,
"step": 7225
},
{
"epoch": 4.02,
"learning_rate": 0.00028989473684210525,
"loss": 0.0158,
"step": 7250
},
{
"epoch": 4.02,
"learning_rate": 0.00028726315789473684,
"loss": 0.0175,
"step": 7275
},
{
"epoch": 4.02,
"learning_rate": 0.00028463157894736843,
"loss": 0.0152,
"step": 7300
},
{
"epoch": 4.03,
"learning_rate": 0.00028199999999999997,
"loss": 0.0176,
"step": 7325
},
{
"epoch": 4.03,
"learning_rate": 0.0002793684210526316,
"loss": 0.0146,
"step": 7350
},
{
"epoch": 4.03,
"learning_rate": 0.00027673684210526315,
"loss": 0.0141,
"step": 7375
},
{
"epoch": 4.03,
"learning_rate": 0.00027410526315789475,
"loss": 0.0171,
"step": 7400
},
{
"epoch": 4.03,
"eval_loss": 0.3058754801750183,
"eval_runtime": 236.0616,
"eval_samples_per_second": 4.236,
"eval_steps_per_second": 0.267,
"step": 7400
},
{
"epoch": 4.04,
"learning_rate": 0.00027147368421052634,
"loss": 0.0159,
"step": 7425
},
{
"epoch": 4.04,
"learning_rate": 0.0002688421052631579,
"loss": 0.0155,
"step": 7450
},
{
"epoch": 4.04,
"learning_rate": 0.00026621052631578947,
"loss": 0.014,
"step": 7475
},
{
"epoch": 4.04,
"learning_rate": 0.00026357894736842106,
"loss": 0.0141,
"step": 7500
},
{
"epoch": 4.05,
"learning_rate": 0.00026094736842105265,
"loss": 0.0157,
"step": 7525
},
{
"epoch": 4.05,
"learning_rate": 0.00025831578947368424,
"loss": 0.0128,
"step": 7550
},
{
"epoch": 4.05,
"learning_rate": 0.0002556842105263158,
"loss": 0.0152,
"step": 7575
},
{
"epoch": 4.05,
"learning_rate": 0.0002530526315789474,
"loss": 0.0146,
"step": 7600
},
{
"epoch": 4.05,
"eval_loss": 0.30549487471580505,
"eval_runtime": 230.3772,
"eval_samples_per_second": 4.341,
"eval_steps_per_second": 0.273,
"step": 7600
},
{
"epoch": 4.06,
"learning_rate": 0.0002504210526315789,
"loss": 0.0168,
"step": 7625
},
{
"epoch": 4.06,
"learning_rate": 0.00024778947368421056,
"loss": 0.0149,
"step": 7650
},
{
"epoch": 4.06,
"learning_rate": 0.0002451578947368421,
"loss": 0.0136,
"step": 7675
},
{
"epoch": 4.06,
"learning_rate": 0.0002425263157894737,
"loss": 0.0131,
"step": 7700
},
{
"epoch": 4.07,
"learning_rate": 0.00023989473684210528,
"loss": 0.0138,
"step": 7725
},
{
"epoch": 4.07,
"learning_rate": 0.00023726315789473684,
"loss": 0.0128,
"step": 7750
},
{
"epoch": 4.07,
"learning_rate": 0.0002346315789473684,
"loss": 0.0129,
"step": 7775
},
{
"epoch": 4.07,
"learning_rate": 0.00023200000000000003,
"loss": 0.0123,
"step": 7800
},
{
"epoch": 4.07,
"eval_loss": 0.3095347583293915,
"eval_runtime": 230.2124,
"eval_samples_per_second": 4.344,
"eval_steps_per_second": 0.274,
"step": 7800
},
{
"epoch": 4.08,
"learning_rate": 0.0002293684210526316,
"loss": 0.0123,
"step": 7825
},
{
"epoch": 4.08,
"learning_rate": 0.00022673684210526316,
"loss": 0.0109,
"step": 7850
},
{
"epoch": 4.08,
"learning_rate": 0.00022410526315789472,
"loss": 0.0124,
"step": 7875
},
{
"epoch": 4.08,
"learning_rate": 0.00022147368421052632,
"loss": 0.0123,
"step": 7900
},
{
"epoch": 4.09,
"learning_rate": 0.0002188421052631579,
"loss": 0.011,
"step": 7925
},
{
"epoch": 4.09,
"learning_rate": 0.00021621052631578947,
"loss": 0.0109,
"step": 7950
},
{
"epoch": 4.09,
"learning_rate": 0.00021357894736842106,
"loss": 0.0113,
"step": 7975
},
{
"epoch": 4.09,
"learning_rate": 0.00021094736842105263,
"loss": 0.0102,
"step": 8000
},
{
"epoch": 4.09,
"eval_loss": 0.3172641098499298,
"eval_runtime": 232.1001,
"eval_samples_per_second": 4.308,
"eval_steps_per_second": 0.271,
"step": 8000
},
{
"epoch": 4.1,
"learning_rate": 0.00020831578947368422,
"loss": 0.0126,
"step": 8025
},
{
"epoch": 4.1,
"learning_rate": 0.0002056842105263158,
"loss": 0.0105,
"step": 8050
},
{
"epoch": 4.1,
"learning_rate": 0.00020305263157894738,
"loss": 0.0113,
"step": 8075
},
{
"epoch": 4.1,
"learning_rate": 0.00020042105263157894,
"loss": 0.0108,
"step": 8100
},
{
"epoch": 4.11,
"learning_rate": 0.0001977894736842105,
"loss": 0.0097,
"step": 8125
},
{
"epoch": 4.11,
"learning_rate": 0.00019515789473684213,
"loss": 0.0112,
"step": 8150
},
{
"epoch": 4.11,
"learning_rate": 0.0001925263157894737,
"loss": 0.0103,
"step": 8175
},
{
"epoch": 4.11,
"learning_rate": 0.00018989473684210526,
"loss": 0.0106,
"step": 8200
},
{
"epoch": 4.11,
"eval_loss": 0.31945887207984924,
"eval_runtime": 268.0042,
"eval_samples_per_second": 3.731,
"eval_steps_per_second": 0.235,
"step": 8200
},
{
"epoch": 4.12,
"learning_rate": 0.00018726315789473685,
"loss": 0.0094,
"step": 8225
},
{
"epoch": 4.12,
"learning_rate": 0.00018463157894736844,
"loss": 0.0099,
"step": 8250
},
{
"epoch": 4.12,
"learning_rate": 0.000182,
"loss": 0.0103,
"step": 8275
},
{
"epoch": 4.12,
"learning_rate": 0.00017936842105263157,
"loss": 0.0091,
"step": 8300
},
{
"epoch": 4.13,
"learning_rate": 0.00017673684210526316,
"loss": 0.0083,
"step": 8325
},
{
"epoch": 4.13,
"learning_rate": 0.00017410526315789473,
"loss": 0.0091,
"step": 8350
},
{
"epoch": 4.13,
"learning_rate": 0.00017147368421052632,
"loss": 0.0093,
"step": 8375
},
{
"epoch": 4.13,
"learning_rate": 0.0001688421052631579,
"loss": 0.0093,
"step": 8400
},
{
"epoch": 4.13,
"eval_loss": 0.3215804398059845,
"eval_runtime": 237.5836,
"eval_samples_per_second": 4.209,
"eval_steps_per_second": 0.265,
"step": 8400
},
{
"epoch": 4.14,
"learning_rate": 0.00016621052631578948,
"loss": 0.0095,
"step": 8425
},
{
"epoch": 4.14,
"learning_rate": 0.00016357894736842104,
"loss": 0.0104,
"step": 8450
},
{
"epoch": 4.14,
"learning_rate": 0.00016094736842105266,
"loss": 0.0091,
"step": 8475
},
{
"epoch": 4.14,
"learning_rate": 0.00015831578947368423,
"loss": 0.0083,
"step": 8500
},
{
"epoch": 4.15,
"learning_rate": 0.0001556842105263158,
"loss": 0.0087,
"step": 8525
},
{
"epoch": 4.15,
"learning_rate": 0.00015305263157894736,
"loss": 0.0158,
"step": 8550
},
{
"epoch": 4.15,
"learning_rate": 0.00015042105263157895,
"loss": 0.0306,
"step": 8575
},
{
"epoch": 4.15,
"learning_rate": 0.00014778947368421054,
"loss": 0.0323,
"step": 8600
},
{
"epoch": 4.15,
"eval_loss": 0.244441956281662,
"eval_runtime": 237.4283,
"eval_samples_per_second": 4.212,
"eval_steps_per_second": 0.265,
"step": 8600
},
{
"epoch": 4.16,
"learning_rate": 0.0001451578947368421,
"loss": 0.0321,
"step": 8625
},
{
"epoch": 4.16,
"learning_rate": 0.0001425263157894737,
"loss": 0.0313,
"step": 8650
},
{
"epoch": 4.16,
"learning_rate": 0.00013989473684210526,
"loss": 0.0363,
"step": 8675
},
{
"epoch": 4.16,
"learning_rate": 0.00013726315789473685,
"loss": 0.0329,
"step": 8700
},
{
"epoch": 4.17,
"learning_rate": 0.00013463157894736842,
"loss": 0.0294,
"step": 8725
},
{
"epoch": 4.17,
"learning_rate": 0.000132,
"loss": 0.0255,
"step": 8750
},
{
"epoch": 4.17,
"learning_rate": 0.00012936842105263158,
"loss": 0.0213,
"step": 8775
},
{
"epoch": 4.17,
"learning_rate": 0.00012673684210526314,
"loss": 0.023,
"step": 8800
},
{
"epoch": 4.17,
"eval_loss": 0.2333984673023224,
"eval_runtime": 235.211,
"eval_samples_per_second": 4.252,
"eval_steps_per_second": 0.268,
"step": 8800
},
{
"epoch": 4.18,
"learning_rate": 0.00012410526315789473,
"loss": 0.0227,
"step": 8825
},
{
"epoch": 5.0,
"learning_rate": 0.00012147368421052632,
"loss": 0.0157,
"step": 8850
},
{
"epoch": 5.0,
"learning_rate": 0.00011884210526315789,
"loss": 0.0108,
"step": 8875
},
{
"epoch": 5.01,
"learning_rate": 0.00011621052631578948,
"loss": 0.01,
"step": 8900
},
{
"epoch": 5.01,
"learning_rate": 0.00011357894736842106,
"loss": 0.0092,
"step": 8925
},
{
"epoch": 5.01,
"learning_rate": 0.00011094736842105262,
"loss": 0.0093,
"step": 8950
},
{
"epoch": 5.01,
"learning_rate": 0.00010831578947368422,
"loss": 0.0112,
"step": 8975
},
{
"epoch": 5.02,
"learning_rate": 0.00010568421052631578,
"loss": 0.0082,
"step": 9000
},
{
"epoch": 5.02,
"eval_loss": 0.2606710195541382,
"eval_runtime": 238.1871,
"eval_samples_per_second": 4.198,
"eval_steps_per_second": 0.264,
"step": 9000
},
{
"epoch": 5.02,
"learning_rate": 0.00010305263157894737,
"loss": 0.0078,
"step": 9025
},
{
"epoch": 5.02,
"learning_rate": 0.00010042105263157895,
"loss": 0.0084,
"step": 9050
},
{
"epoch": 5.02,
"learning_rate": 9.778947368421053e-05,
"loss": 0.0069,
"step": 9075
},
{
"epoch": 5.03,
"learning_rate": 9.515789473684211e-05,
"loss": 0.0071,
"step": 9100
},
{
"epoch": 5.03,
"learning_rate": 9.252631578947369e-05,
"loss": 0.0068,
"step": 9125
},
{
"epoch": 5.03,
"learning_rate": 8.989473684210527e-05,
"loss": 0.0069,
"step": 9150
},
{
"epoch": 5.03,
"learning_rate": 8.726315789473684e-05,
"loss": 0.0081,
"step": 9175
},
{
"epoch": 5.04,
"learning_rate": 8.463157894736842e-05,
"loss": 0.0069,
"step": 9200
},
{
"epoch": 5.04,
"eval_loss": 0.26959776878356934,
"eval_runtime": 236.0274,
"eval_samples_per_second": 4.237,
"eval_steps_per_second": 0.267,
"step": 9200
},
{
"epoch": 5.04,
"learning_rate": 8.2e-05,
"loss": 0.0073,
"step": 9225
},
{
"epoch": 5.04,
"learning_rate": 7.936842105263158e-05,
"loss": 0.0065,
"step": 9250
},
{
"epoch": 5.04,
"learning_rate": 7.673684210526316e-05,
"loss": 0.0067,
"step": 9275
},
{
"epoch": 5.05,
"learning_rate": 7.410526315789475e-05,
"loss": 0.0067,
"step": 9300
},
{
"epoch": 5.05,
"learning_rate": 7.147368421052631e-05,
"loss": 0.0058,
"step": 9325
},
{
"epoch": 5.05,
"learning_rate": 6.884210526315791e-05,
"loss": 0.0078,
"step": 9350
},
{
"epoch": 5.05,
"learning_rate": 6.621052631578947e-05,
"loss": 0.0068,
"step": 9375
},
{
"epoch": 5.06,
"learning_rate": 6.357894736842105e-05,
"loss": 0.008,
"step": 9400
},
{
"epoch": 5.06,
"eval_loss": 0.2715131342411041,
"eval_runtime": 235.3933,
"eval_samples_per_second": 4.248,
"eval_steps_per_second": 0.268,
"step": 9400
},
{
"epoch": 5.06,
"learning_rate": 6.0947368421052635e-05,
"loss": 0.0065,
"step": 9425
},
{
"epoch": 5.06,
"learning_rate": 5.8315789473684214e-05,
"loss": 0.0065,
"step": 9450
},
{
"epoch": 5.06,
"learning_rate": 5.5684210526315786e-05,
"loss": 0.0072,
"step": 9475
},
{
"epoch": 5.07,
"learning_rate": 5.3052631578947364e-05,
"loss": 0.0055,
"step": 9500
},
{
"epoch": 5.07,
"learning_rate": 5.042105263157895e-05,
"loss": 0.0069,
"step": 9525
},
{
"epoch": 5.07,
"learning_rate": 4.778947368421053e-05,
"loss": 0.0074,
"step": 9550
},
{
"epoch": 5.07,
"learning_rate": 4.5157894736842106e-05,
"loss": 0.006,
"step": 9575
},
{
"epoch": 5.08,
"learning_rate": 4.2526315789473685e-05,
"loss": 0.0068,
"step": 9600
},
{
"epoch": 5.08,
"eval_loss": 0.27589207887649536,
"eval_runtime": 232.1651,
"eval_samples_per_second": 4.307,
"eval_steps_per_second": 0.271,
"step": 9600
},
{
"epoch": 5.08,
"learning_rate": 3.989473684210526e-05,
"loss": 0.0057,
"step": 9625
},
{
"epoch": 5.08,
"learning_rate": 3.726315789473685e-05,
"loss": 0.0061,
"step": 9650
},
{
"epoch": 5.08,
"learning_rate": 3.463157894736843e-05,
"loss": 0.0056,
"step": 9675
},
{
"epoch": 5.09,
"learning_rate": 3.2e-05,
"loss": 0.0056,
"step": 9700
},
{
"epoch": 5.09,
"learning_rate": 2.936842105263158e-05,
"loss": 0.0056,
"step": 9725
},
{
"epoch": 5.09,
"learning_rate": 2.673684210526316e-05,
"loss": 0.0064,
"step": 9750
},
{
"epoch": 5.09,
"learning_rate": 2.4105263157894737e-05,
"loss": 0.0057,
"step": 9775
},
{
"epoch": 5.1,
"learning_rate": 2.1473684210526316e-05,
"loss": 0.0061,
"step": 9800
},
{
"epoch": 5.1,
"eval_loss": 0.27701282501220703,
"eval_runtime": 227.6759,
"eval_samples_per_second": 4.392,
"eval_steps_per_second": 0.277,
"step": 9800
},
{
"epoch": 5.1,
"learning_rate": 1.8842105263157898e-05,
"loss": 0.0055,
"step": 9825
},
{
"epoch": 5.1,
"learning_rate": 1.6210526315789473e-05,
"loss": 0.0059,
"step": 9850
},
{
"epoch": 5.1,
"learning_rate": 1.3578947368421053e-05,
"loss": 0.0054,
"step": 9875
},
{
"epoch": 5.11,
"learning_rate": 1.0947368421052631e-05,
"loss": 0.005,
"step": 9900
},
{
"epoch": 5.11,
"learning_rate": 8.31578947368421e-06,
"loss": 0.0054,
"step": 9925
},
{
"epoch": 5.11,
"learning_rate": 5.68421052631579e-06,
"loss": 0.0055,
"step": 9950
},
{
"epoch": 5.11,
"learning_rate": 3.0526315789473684e-06,
"loss": 0.0048,
"step": 9975
},
{
"epoch": 5.12,
"learning_rate": 4.210526315789474e-07,
"loss": 0.0046,
"step": 10000
},
{
"epoch": 5.12,
"eval_loss": 0.2767942547798157,
"eval_runtime": 226.6824,
"eval_samples_per_second": 4.411,
"eval_steps_per_second": 0.278,
"step": 10000
}
],
"logging_steps": 25,
"max_steps": 10000,
"num_train_epochs": 9223372036854775807,
"save_steps": 400,
"total_flos": 1.029789873027072e+21,
"trial_name": null,
"trial_params": null
}