akkky02's picture
Upload folder using huggingface_hub
94eb8cd verified
raw
history blame
No virus
27.2 kB
{
"best_metric": 0.6120218634605408,
"best_model_checkpoint": "../../experiments_checkpoints/MAdAiLab/microsoft/phi_2_ledgar/checkpoint-2800",
"epoch": 3.0,
"eval_steps": 100,
"global_step": 2814,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.03,
"grad_norm": 278.5504455566406,
"learning_rate": 4.9555792466240235e-06,
"loss": 4.8556,
"step": 25
},
{
"epoch": 0.05,
"grad_norm": 332.1685485839844,
"learning_rate": 4.911158493248046e-06,
"loss": 4.2381,
"step": 50
},
{
"epoch": 0.08,
"grad_norm": 238.07369995117188,
"learning_rate": 4.866737739872069e-06,
"loss": 4.0038,
"step": 75
},
{
"epoch": 0.11,
"grad_norm": 152.84237670898438,
"learning_rate": 4.822316986496091e-06,
"loss": 3.6034,
"step": 100
},
{
"epoch": 0.11,
"eval_accuracy": 0.337,
"eval_f1_macro": 0.12358690595802334,
"eval_f1_micro": 0.337,
"eval_loss": 3.211400032043457,
"eval_runtime": 47.3513,
"eval_samples_per_second": 211.188,
"eval_steps_per_second": 3.316,
"step": 100
},
{
"epoch": 0.13,
"grad_norm": 143.5316619873047,
"learning_rate": 4.777896233120114e-06,
"loss": 3.1831,
"step": 125
},
{
"epoch": 0.16,
"grad_norm": 152.6884765625,
"learning_rate": 4.733475479744136e-06,
"loss": 2.9609,
"step": 150
},
{
"epoch": 0.19,
"grad_norm": 116.26598358154297,
"learning_rate": 4.6890547263681595e-06,
"loss": 2.4773,
"step": 175
},
{
"epoch": 0.21,
"grad_norm": 102.64595031738281,
"learning_rate": 4.644633972992183e-06,
"loss": 2.2678,
"step": 200
},
{
"epoch": 0.21,
"eval_accuracy": 0.5623,
"eval_f1_macro": 0.3331251034728065,
"eval_f1_micro": 0.5623,
"eval_loss": 1.9837249517440796,
"eval_runtime": 47.4491,
"eval_samples_per_second": 210.752,
"eval_steps_per_second": 3.309,
"step": 200
},
{
"epoch": 0.24,
"grad_norm": 107.59564208984375,
"learning_rate": 4.600213219616206e-06,
"loss": 2.0387,
"step": 225
},
{
"epoch": 0.27,
"grad_norm": 129.82203674316406,
"learning_rate": 4.555792466240228e-06,
"loss": 1.7772,
"step": 250
},
{
"epoch": 0.29,
"grad_norm": 106.58737182617188,
"learning_rate": 4.51137171286425e-06,
"loss": 1.5959,
"step": 275
},
{
"epoch": 0.32,
"grad_norm": 219.3987274169922,
"learning_rate": 4.466950959488273e-06,
"loss": 1.4927,
"step": 300
},
{
"epoch": 0.32,
"eval_accuracy": 0.6712,
"eval_f1_macro": 0.48839865479998884,
"eval_f1_micro": 0.6712,
"eval_loss": 1.3369125127792358,
"eval_runtime": 47.4603,
"eval_samples_per_second": 210.702,
"eval_steps_per_second": 3.308,
"step": 300
},
{
"epoch": 0.35,
"grad_norm": 62.788822174072266,
"learning_rate": 4.422530206112296e-06,
"loss": 1.4203,
"step": 325
},
{
"epoch": 0.37,
"grad_norm": 63.6900634765625,
"learning_rate": 4.378109452736319e-06,
"loss": 1.2841,
"step": 350
},
{
"epoch": 0.4,
"grad_norm": 75.33383178710938,
"learning_rate": 4.333688699360342e-06,
"loss": 1.1642,
"step": 375
},
{
"epoch": 0.43,
"grad_norm": 89.42117309570312,
"learning_rate": 4.289267945984365e-06,
"loss": 1.1518,
"step": 400
},
{
"epoch": 0.43,
"eval_accuracy": 0.7243,
"eval_f1_macro": 0.5613056174383468,
"eval_f1_micro": 0.7243,
"eval_loss": 1.052606225013733,
"eval_runtime": 47.3838,
"eval_samples_per_second": 211.043,
"eval_steps_per_second": 3.313,
"step": 400
},
{
"epoch": 0.45,
"grad_norm": 74.13993072509766,
"learning_rate": 4.244847192608387e-06,
"loss": 1.135,
"step": 425
},
{
"epoch": 0.48,
"grad_norm": 80.19258117675781,
"learning_rate": 4.200426439232409e-06,
"loss": 1.176,
"step": 450
},
{
"epoch": 0.51,
"grad_norm": 70.83063507080078,
"learning_rate": 4.156005685856432e-06,
"loss": 1.1386,
"step": 475
},
{
"epoch": 0.53,
"grad_norm": 109.83588409423828,
"learning_rate": 4.1115849324804554e-06,
"loss": 1.1041,
"step": 500
},
{
"epoch": 0.53,
"eval_accuracy": 0.7521,
"eval_f1_macro": 0.6206030644902993,
"eval_f1_micro": 0.7521,
"eval_loss": 0.9304812550544739,
"eval_runtime": 47.4284,
"eval_samples_per_second": 210.844,
"eval_steps_per_second": 3.31,
"step": 500
},
{
"epoch": 0.56,
"grad_norm": 51.40995788574219,
"learning_rate": 4.067164179104478e-06,
"loss": 0.9872,
"step": 525
},
{
"epoch": 0.59,
"grad_norm": 75.6723861694336,
"learning_rate": 4.022743425728501e-06,
"loss": 0.9832,
"step": 550
},
{
"epoch": 0.61,
"grad_norm": 86.0229721069336,
"learning_rate": 3.978322672352524e-06,
"loss": 0.9485,
"step": 575
},
{
"epoch": 0.64,
"grad_norm": 113.96233367919922,
"learning_rate": 3.933901918976546e-06,
"loss": 1.0144,
"step": 600
},
{
"epoch": 0.64,
"eval_accuracy": 0.7574,
"eval_f1_macro": 0.6294250374822845,
"eval_f1_micro": 0.7574,
"eval_loss": 0.9067593812942505,
"eval_runtime": 47.4873,
"eval_samples_per_second": 210.583,
"eval_steps_per_second": 3.306,
"step": 600
},
{
"epoch": 0.67,
"grad_norm": 77.80159759521484,
"learning_rate": 3.889481165600569e-06,
"loss": 1.0435,
"step": 625
},
{
"epoch": 0.69,
"grad_norm": 77.97108459472656,
"learning_rate": 3.8450604122245914e-06,
"loss": 0.9593,
"step": 650
},
{
"epoch": 0.72,
"grad_norm": 68.20707702636719,
"learning_rate": 3.8006396588486145e-06,
"loss": 0.9818,
"step": 675
},
{
"epoch": 0.75,
"grad_norm": 103.11054229736328,
"learning_rate": 3.756218905472637e-06,
"loss": 0.9892,
"step": 700
},
{
"epoch": 0.75,
"eval_accuracy": 0.7669,
"eval_f1_macro": 0.642987522646621,
"eval_f1_micro": 0.7669,
"eval_loss": 0.8712124824523926,
"eval_runtime": 47.4988,
"eval_samples_per_second": 210.531,
"eval_steps_per_second": 3.305,
"step": 700
},
{
"epoch": 0.77,
"grad_norm": 88.3270263671875,
"learning_rate": 3.71179815209666e-06,
"loss": 0.8937,
"step": 725
},
{
"epoch": 0.8,
"grad_norm": 67.907958984375,
"learning_rate": 3.667377398720683e-06,
"loss": 0.9788,
"step": 750
},
{
"epoch": 0.83,
"grad_norm": 68.60499572753906,
"learning_rate": 3.622956645344705e-06,
"loss": 0.9459,
"step": 775
},
{
"epoch": 0.85,
"grad_norm": 71.7964859008789,
"learning_rate": 3.578535891968728e-06,
"loss": 0.9972,
"step": 800
},
{
"epoch": 0.85,
"eval_accuracy": 0.7675,
"eval_f1_macro": 0.6369038817239384,
"eval_f1_micro": 0.7675,
"eval_loss": 0.8591374754905701,
"eval_runtime": 47.4288,
"eval_samples_per_second": 210.842,
"eval_steps_per_second": 3.31,
"step": 800
},
{
"epoch": 0.88,
"grad_norm": 99.17341613769531,
"learning_rate": 3.534115138592751e-06,
"loss": 0.9663,
"step": 825
},
{
"epoch": 0.91,
"grad_norm": 198.6553955078125,
"learning_rate": 3.4896943852167736e-06,
"loss": 0.8617,
"step": 850
},
{
"epoch": 0.93,
"grad_norm": 82.58099365234375,
"learning_rate": 3.4452736318407963e-06,
"loss": 0.8735,
"step": 875
},
{
"epoch": 0.96,
"grad_norm": 63.692623138427734,
"learning_rate": 3.4008528784648194e-06,
"loss": 0.8439,
"step": 900
},
{
"epoch": 0.96,
"eval_accuracy": 0.7848,
"eval_f1_macro": 0.6834829058415216,
"eval_f1_micro": 0.7848,
"eval_loss": 0.7894874811172485,
"eval_runtime": 47.4489,
"eval_samples_per_second": 210.753,
"eval_steps_per_second": 3.309,
"step": 900
},
{
"epoch": 0.99,
"grad_norm": 78.88030242919922,
"learning_rate": 3.3564321250888416e-06,
"loss": 0.8582,
"step": 925
},
{
"epoch": 1.01,
"grad_norm": 45.49142837524414,
"learning_rate": 3.3120113717128643e-06,
"loss": 0.8621,
"step": 950
},
{
"epoch": 1.04,
"grad_norm": 37.98048400878906,
"learning_rate": 3.2675906183368874e-06,
"loss": 0.8396,
"step": 975
},
{
"epoch": 1.07,
"grad_norm": 37.72966384887695,
"learning_rate": 3.22316986496091e-06,
"loss": 0.7409,
"step": 1000
},
{
"epoch": 1.07,
"eval_accuracy": 0.7944,
"eval_f1_macro": 0.6808979493082343,
"eval_f1_micro": 0.7944,
"eval_loss": 0.7614499926567078,
"eval_runtime": 47.4823,
"eval_samples_per_second": 210.605,
"eval_steps_per_second": 3.306,
"step": 1000
},
{
"epoch": 1.09,
"grad_norm": 57.926578521728516,
"learning_rate": 3.1787491115849327e-06,
"loss": 0.7475,
"step": 1025
},
{
"epoch": 1.12,
"grad_norm": 86.9485855102539,
"learning_rate": 3.1343283582089558e-06,
"loss": 0.7197,
"step": 1050
},
{
"epoch": 1.15,
"grad_norm": 54.83045959472656,
"learning_rate": 3.0899076048329785e-06,
"loss": 0.8314,
"step": 1075
},
{
"epoch": 1.17,
"grad_norm": 68.1777114868164,
"learning_rate": 3.0454868514570007e-06,
"loss": 0.7627,
"step": 1100
},
{
"epoch": 1.17,
"eval_accuracy": 0.7946,
"eval_f1_macro": 0.6809735037021439,
"eval_f1_micro": 0.7946,
"eval_loss": 0.7538655996322632,
"eval_runtime": 47.4181,
"eval_samples_per_second": 210.89,
"eval_steps_per_second": 3.311,
"step": 1100
},
{
"epoch": 1.2,
"grad_norm": 31.217445373535156,
"learning_rate": 3.0010660980810234e-06,
"loss": 0.7501,
"step": 1125
},
{
"epoch": 1.23,
"grad_norm": 38.76744842529297,
"learning_rate": 2.9566453447050464e-06,
"loss": 0.6884,
"step": 1150
},
{
"epoch": 1.25,
"grad_norm": 50.481388092041016,
"learning_rate": 2.912224591329069e-06,
"loss": 0.6911,
"step": 1175
},
{
"epoch": 1.28,
"grad_norm": 54.91836929321289,
"learning_rate": 2.867803837953092e-06,
"loss": 0.8065,
"step": 1200
},
{
"epoch": 1.28,
"eval_accuracy": 0.8008,
"eval_f1_macro": 0.6945223141031294,
"eval_f1_micro": 0.8008,
"eval_loss": 0.7289281487464905,
"eval_runtime": 47.4458,
"eval_samples_per_second": 210.767,
"eval_steps_per_second": 3.309,
"step": 1200
},
{
"epoch": 1.31,
"grad_norm": 72.01395416259766,
"learning_rate": 2.823383084577115e-06,
"loss": 0.8075,
"step": 1225
},
{
"epoch": 1.33,
"grad_norm": 83.60379791259766,
"learning_rate": 2.7789623312011375e-06,
"loss": 0.8068,
"step": 1250
},
{
"epoch": 1.36,
"grad_norm": 51.42044448852539,
"learning_rate": 2.7345415778251598e-06,
"loss": 0.7549,
"step": 1275
},
{
"epoch": 1.39,
"grad_norm": 44.68650436401367,
"learning_rate": 2.690120824449183e-06,
"loss": 0.7359,
"step": 1300
},
{
"epoch": 1.39,
"eval_accuracy": 0.8034,
"eval_f1_macro": 0.6975565105528009,
"eval_f1_micro": 0.8034,
"eval_loss": 0.7253593802452087,
"eval_runtime": 47.5217,
"eval_samples_per_second": 210.43,
"eval_steps_per_second": 3.304,
"step": 1300
},
{
"epoch": 1.41,
"grad_norm": 56.39244842529297,
"learning_rate": 2.6457000710732055e-06,
"loss": 0.7201,
"step": 1325
},
{
"epoch": 1.44,
"grad_norm": 70.84113311767578,
"learning_rate": 2.601279317697228e-06,
"loss": 0.7346,
"step": 1350
},
{
"epoch": 1.47,
"grad_norm": 69.2415542602539,
"learning_rate": 2.5568585643212513e-06,
"loss": 0.6693,
"step": 1375
},
{
"epoch": 1.49,
"grad_norm": 50.06897735595703,
"learning_rate": 2.512437810945274e-06,
"loss": 0.6525,
"step": 1400
},
{
"epoch": 1.49,
"eval_accuracy": 0.8065,
"eval_f1_macro": 0.7050317065636842,
"eval_f1_micro": 0.8065,
"eval_loss": 0.7072968482971191,
"eval_runtime": 47.4893,
"eval_samples_per_second": 210.574,
"eval_steps_per_second": 3.306,
"step": 1400
},
{
"epoch": 1.52,
"grad_norm": 45.797054290771484,
"learning_rate": 2.4680170575692966e-06,
"loss": 0.7271,
"step": 1425
},
{
"epoch": 1.55,
"grad_norm": 46.304473876953125,
"learning_rate": 2.4235963041933193e-06,
"loss": 0.8033,
"step": 1450
},
{
"epoch": 1.57,
"grad_norm": 80.502685546875,
"learning_rate": 2.379175550817342e-06,
"loss": 0.7464,
"step": 1475
},
{
"epoch": 1.6,
"grad_norm": 89.19364166259766,
"learning_rate": 2.3347547974413646e-06,
"loss": 0.7359,
"step": 1500
},
{
"epoch": 1.6,
"eval_accuracy": 0.8033,
"eval_f1_macro": 0.6948589139841357,
"eval_f1_micro": 0.8033,
"eval_loss": 0.720634400844574,
"eval_runtime": 47.454,
"eval_samples_per_second": 210.73,
"eval_steps_per_second": 3.308,
"step": 1500
},
{
"epoch": 1.63,
"grad_norm": 36.966548919677734,
"learning_rate": 2.2903340440653877e-06,
"loss": 0.697,
"step": 1525
},
{
"epoch": 1.65,
"grad_norm": 37.08382797241211,
"learning_rate": 2.24591329068941e-06,
"loss": 0.6736,
"step": 1550
},
{
"epoch": 1.68,
"grad_norm": 37.009403228759766,
"learning_rate": 2.201492537313433e-06,
"loss": 0.6441,
"step": 1575
},
{
"epoch": 1.71,
"grad_norm": 29.34271240234375,
"learning_rate": 2.1570717839374557e-06,
"loss": 0.7291,
"step": 1600
},
{
"epoch": 1.71,
"eval_accuracy": 0.8089,
"eval_f1_macro": 0.7065968196696667,
"eval_f1_micro": 0.8089,
"eval_loss": 0.6923750042915344,
"eval_runtime": 47.4715,
"eval_samples_per_second": 210.653,
"eval_steps_per_second": 3.307,
"step": 1600
},
{
"epoch": 1.73,
"grad_norm": 43.28564453125,
"learning_rate": 2.112651030561479e-06,
"loss": 0.7153,
"step": 1625
},
{
"epoch": 1.76,
"grad_norm": 27.64020347595215,
"learning_rate": 2.068230277185501e-06,
"loss": 0.6721,
"step": 1650
},
{
"epoch": 1.79,
"grad_norm": 46.75600051879883,
"learning_rate": 2.023809523809524e-06,
"loss": 0.6566,
"step": 1675
},
{
"epoch": 1.81,
"grad_norm": 34.07355499267578,
"learning_rate": 1.979388770433547e-06,
"loss": 0.7072,
"step": 1700
},
{
"epoch": 1.81,
"eval_accuracy": 0.8102,
"eval_f1_macro": 0.7070176051219198,
"eval_f1_micro": 0.8102,
"eval_loss": 0.6763781309127808,
"eval_runtime": 47.526,
"eval_samples_per_second": 210.411,
"eval_steps_per_second": 3.303,
"step": 1700
},
{
"epoch": 1.84,
"grad_norm": 43.36367416381836,
"learning_rate": 1.9349680170575695e-06,
"loss": 0.7289,
"step": 1725
},
{
"epoch": 1.87,
"grad_norm": 34.61235046386719,
"learning_rate": 1.8905472636815921e-06,
"loss": 0.6787,
"step": 1750
},
{
"epoch": 1.89,
"grad_norm": 44.31460952758789,
"learning_rate": 1.846126510305615e-06,
"loss": 0.7188,
"step": 1775
},
{
"epoch": 1.92,
"grad_norm": 53.3677864074707,
"learning_rate": 1.8017057569296375e-06,
"loss": 0.6688,
"step": 1800
},
{
"epoch": 1.92,
"eval_accuracy": 0.814,
"eval_f1_macro": 0.7128286594201773,
"eval_f1_micro": 0.814,
"eval_loss": 0.6546062231063843,
"eval_runtime": 47.5163,
"eval_samples_per_second": 210.454,
"eval_steps_per_second": 3.304,
"step": 1800
},
{
"epoch": 1.95,
"grad_norm": 37.4492073059082,
"learning_rate": 1.7572850035536603e-06,
"loss": 0.7817,
"step": 1825
},
{
"epoch": 1.97,
"grad_norm": 41.068538665771484,
"learning_rate": 1.7128642501776832e-06,
"loss": 0.7044,
"step": 1850
},
{
"epoch": 2.0,
"grad_norm": 67.96875762939453,
"learning_rate": 1.668443496801706e-06,
"loss": 0.5948,
"step": 1875
},
{
"epoch": 2.03,
"grad_norm": 51.58526611328125,
"learning_rate": 1.6240227434257286e-06,
"loss": 0.6253,
"step": 1900
},
{
"epoch": 2.03,
"eval_accuracy": 0.8158,
"eval_f1_macro": 0.7058775462036815,
"eval_f1_micro": 0.8158,
"eval_loss": 0.6505562663078308,
"eval_runtime": 47.3282,
"eval_samples_per_second": 211.29,
"eval_steps_per_second": 3.317,
"step": 1900
},
{
"epoch": 2.05,
"grad_norm": 45.662109375,
"learning_rate": 1.5796019900497514e-06,
"loss": 0.645,
"step": 1925
},
{
"epoch": 2.08,
"grad_norm": 66.76183319091797,
"learning_rate": 1.5351812366737743e-06,
"loss": 0.6412,
"step": 1950
},
{
"epoch": 2.11,
"grad_norm": 56.84629440307617,
"learning_rate": 1.4907604832977968e-06,
"loss": 0.6511,
"step": 1975
},
{
"epoch": 2.13,
"grad_norm": 53.2021484375,
"learning_rate": 1.4463397299218196e-06,
"loss": 0.6044,
"step": 2000
},
{
"epoch": 2.13,
"eval_accuracy": 0.8155,
"eval_f1_macro": 0.716530877903978,
"eval_f1_micro": 0.8155,
"eval_loss": 0.6602968573570251,
"eval_runtime": 47.473,
"eval_samples_per_second": 210.646,
"eval_steps_per_second": 3.307,
"step": 2000
},
{
"epoch": 2.16,
"grad_norm": 71.7922134399414,
"learning_rate": 1.4019189765458423e-06,
"loss": 0.6381,
"step": 2025
},
{
"epoch": 2.19,
"grad_norm": 32.50026321411133,
"learning_rate": 1.357498223169865e-06,
"loss": 0.6622,
"step": 2050
},
{
"epoch": 2.21,
"grad_norm": 50.483219146728516,
"learning_rate": 1.3130774697938879e-06,
"loss": 0.6165,
"step": 2075
},
{
"epoch": 2.24,
"grad_norm": 40.9278450012207,
"learning_rate": 1.2686567164179105e-06,
"loss": 0.6414,
"step": 2100
},
{
"epoch": 2.24,
"eval_accuracy": 0.8138,
"eval_f1_macro": 0.7185019905510756,
"eval_f1_micro": 0.8138,
"eval_loss": 0.6434906125068665,
"eval_runtime": 47.5169,
"eval_samples_per_second": 210.452,
"eval_steps_per_second": 3.304,
"step": 2100
},
{
"epoch": 2.27,
"grad_norm": 40.769439697265625,
"learning_rate": 1.2242359630419332e-06,
"loss": 0.619,
"step": 2125
},
{
"epoch": 2.29,
"grad_norm": 23.210596084594727,
"learning_rate": 1.179815209665956e-06,
"loss": 0.6168,
"step": 2150
},
{
"epoch": 2.32,
"grad_norm": 43.848052978515625,
"learning_rate": 1.1353944562899787e-06,
"loss": 0.5244,
"step": 2175
},
{
"epoch": 2.35,
"grad_norm": 40.81973648071289,
"learning_rate": 1.0909737029140014e-06,
"loss": 0.6115,
"step": 2200
},
{
"epoch": 2.35,
"eval_accuracy": 0.8216,
"eval_f1_macro": 0.727953382739938,
"eval_f1_micro": 0.8216,
"eval_loss": 0.63681560754776,
"eval_runtime": 47.4723,
"eval_samples_per_second": 210.649,
"eval_steps_per_second": 3.307,
"step": 2200
},
{
"epoch": 2.37,
"grad_norm": 28.034944534301758,
"learning_rate": 1.0465529495380243e-06,
"loss": 0.5613,
"step": 2225
},
{
"epoch": 2.4,
"grad_norm": 68.5925521850586,
"learning_rate": 1.002132196162047e-06,
"loss": 0.5987,
"step": 2250
},
{
"epoch": 2.43,
"grad_norm": 30.3986759185791,
"learning_rate": 9.577114427860696e-07,
"loss": 0.6119,
"step": 2275
},
{
"epoch": 2.45,
"grad_norm": 38.481117248535156,
"learning_rate": 9.132906894100925e-07,
"loss": 0.6331,
"step": 2300
},
{
"epoch": 2.45,
"eval_accuracy": 0.8208,
"eval_f1_macro": 0.7251441527998863,
"eval_f1_micro": 0.8208,
"eval_loss": 0.6273249983787537,
"eval_runtime": 47.475,
"eval_samples_per_second": 210.637,
"eval_steps_per_second": 3.307,
"step": 2300
},
{
"epoch": 2.48,
"grad_norm": 52.91291427612305,
"learning_rate": 8.688699360341152e-07,
"loss": 0.6261,
"step": 2325
},
{
"epoch": 2.51,
"grad_norm": 43.07246398925781,
"learning_rate": 8.24449182658138e-07,
"loss": 0.6191,
"step": 2350
},
{
"epoch": 2.53,
"grad_norm": 40.49489974975586,
"learning_rate": 7.800284292821607e-07,
"loss": 0.5856,
"step": 2375
},
{
"epoch": 2.56,
"grad_norm": 30.555904388427734,
"learning_rate": 7.356076759061834e-07,
"loss": 0.608,
"step": 2400
},
{
"epoch": 2.56,
"eval_accuracy": 0.8232,
"eval_f1_macro": 0.7285501243647127,
"eval_f1_micro": 0.8232,
"eval_loss": 0.6251906156539917,
"eval_runtime": 47.5564,
"eval_samples_per_second": 210.277,
"eval_steps_per_second": 3.301,
"step": 2400
},
{
"epoch": 2.59,
"grad_norm": 175.31057739257812,
"learning_rate": 6.911869225302062e-07,
"loss": 0.6802,
"step": 2425
},
{
"epoch": 2.61,
"grad_norm": 30.15215301513672,
"learning_rate": 6.467661691542289e-07,
"loss": 0.6307,
"step": 2450
},
{
"epoch": 2.64,
"grad_norm": 34.21811294555664,
"learning_rate": 6.023454157782517e-07,
"loss": 0.6029,
"step": 2475
},
{
"epoch": 2.67,
"grad_norm": 56.559364318847656,
"learning_rate": 5.579246624022743e-07,
"loss": 0.5879,
"step": 2500
},
{
"epoch": 2.67,
"eval_accuracy": 0.8241,
"eval_f1_macro": 0.7307624039691639,
"eval_f1_micro": 0.8241,
"eval_loss": 0.6171656250953674,
"eval_runtime": 47.4737,
"eval_samples_per_second": 210.643,
"eval_steps_per_second": 3.307,
"step": 2500
},
{
"epoch": 2.69,
"grad_norm": 43.20689392089844,
"learning_rate": 5.135039090262971e-07,
"loss": 0.5858,
"step": 2525
},
{
"epoch": 2.72,
"grad_norm": 46.331817626953125,
"learning_rate": 4.690831556503199e-07,
"loss": 0.5959,
"step": 2550
},
{
"epoch": 2.75,
"grad_norm": 36.399810791015625,
"learning_rate": 4.2466240227434256e-07,
"loss": 0.6069,
"step": 2575
},
{
"epoch": 2.77,
"grad_norm": 79.88939666748047,
"learning_rate": 3.8024164889836533e-07,
"loss": 0.6056,
"step": 2600
},
{
"epoch": 2.77,
"eval_accuracy": 0.8257,
"eval_f1_macro": 0.7346364945364393,
"eval_f1_micro": 0.8257,
"eval_loss": 0.6157156229019165,
"eval_runtime": 47.4846,
"eval_samples_per_second": 210.595,
"eval_steps_per_second": 3.306,
"step": 2600
},
{
"epoch": 2.8,
"grad_norm": 49.164161682128906,
"learning_rate": 3.358208955223881e-07,
"loss": 0.5675,
"step": 2625
},
{
"epoch": 2.83,
"grad_norm": 46.98076248168945,
"learning_rate": 2.914001421464108e-07,
"loss": 0.6607,
"step": 2650
},
{
"epoch": 2.85,
"grad_norm": 23.488380432128906,
"learning_rate": 2.4697938877043354e-07,
"loss": 0.5866,
"step": 2675
},
{
"epoch": 2.88,
"grad_norm": 31.710861206054688,
"learning_rate": 2.0255863539445632e-07,
"loss": 0.5711,
"step": 2700
},
{
"epoch": 2.88,
"eval_accuracy": 0.8253,
"eval_f1_macro": 0.7341228101010588,
"eval_f1_micro": 0.8253,
"eval_loss": 0.6129437685012817,
"eval_runtime": 47.5391,
"eval_samples_per_second": 210.353,
"eval_steps_per_second": 3.303,
"step": 2700
},
{
"epoch": 2.91,
"grad_norm": 33.91932678222656,
"learning_rate": 1.5813788201847903e-07,
"loss": 0.5565,
"step": 2725
},
{
"epoch": 2.93,
"grad_norm": 35.29643630981445,
"learning_rate": 1.1371712864250178e-07,
"loss": 0.5793,
"step": 2750
},
{
"epoch": 2.96,
"grad_norm": 33.85700988769531,
"learning_rate": 6.929637526652453e-08,
"loss": 0.5425,
"step": 2775
},
{
"epoch": 2.99,
"grad_norm": 40.65436935424805,
"learning_rate": 2.4875621890547265e-08,
"loss": 0.5802,
"step": 2800
},
{
"epoch": 2.99,
"eval_accuracy": 0.826,
"eval_f1_macro": 0.7355084015446216,
"eval_f1_micro": 0.826,
"eval_loss": 0.6120218634605408,
"eval_runtime": 47.528,
"eval_samples_per_second": 210.402,
"eval_steps_per_second": 3.303,
"step": 2800
},
{
"epoch": 3.0,
"step": 2814,
"total_flos": 3.482386379487314e+17,
"train_loss": 0.9835220694796108,
"train_runtime": 6489.4436,
"train_samples_per_second": 27.737,
"train_steps_per_second": 0.434
}
],
"logging_steps": 25,
"max_steps": 2814,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 100,
"total_flos": 3.482386379487314e+17,
"train_batch_size": 32,
"trial_name": null,
"trial_params": null
}