VanAnh's picture
End of training
d6ed931 verified
raw
history blame
33.6 kB
{
"best_metric": 0.5454545454545454,
"best_model_checkpoint": "swin-tiny-patch4-window7-224-finetuned-isic217/checkpoint-245",
"epoch": 48.97959183673469,
"eval_steps": 500,
"global_step": 1200,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.40816326530612246,
"grad_norm": 20.10276222229004,
"learning_rate": 4.166666666666667e-06,
"loss": 2.1681,
"step": 10
},
{
"epoch": 0.8163265306122449,
"grad_norm": 18.530778884887695,
"learning_rate": 8.333333333333334e-06,
"loss": 2.2404,
"step": 20
},
{
"epoch": 0.9795918367346939,
"eval_accuracy": 0.13636363636363635,
"eval_loss": 2.1526713371276855,
"eval_runtime": 3.7709,
"eval_samples_per_second": 5.834,
"eval_steps_per_second": 2.917,
"step": 24
},
{
"epoch": 1.2244897959183674,
"grad_norm": 19.15616798400879,
"learning_rate": 1.25e-05,
"loss": 2.1402,
"step": 30
},
{
"epoch": 1.6326530612244898,
"grad_norm": 18.088478088378906,
"learning_rate": 1.6666666666666667e-05,
"loss": 2.0749,
"step": 40
},
{
"epoch": 2.0,
"eval_accuracy": 0.13636363636363635,
"eval_loss": 2.115939140319824,
"eval_runtime": 4.2644,
"eval_samples_per_second": 5.159,
"eval_steps_per_second": 2.58,
"step": 49
},
{
"epoch": 2.0408163265306123,
"grad_norm": 28.553932189941406,
"learning_rate": 2.0833333333333336e-05,
"loss": 2.0201,
"step": 50
},
{
"epoch": 2.4489795918367347,
"grad_norm": 21.90468406677246,
"learning_rate": 2.5e-05,
"loss": 1.8697,
"step": 60
},
{
"epoch": 2.857142857142857,
"grad_norm": 29.221223831176758,
"learning_rate": 2.916666666666667e-05,
"loss": 1.947,
"step": 70
},
{
"epoch": 2.979591836734694,
"eval_accuracy": 0.13636363636363635,
"eval_loss": 2.172260284423828,
"eval_runtime": 3.7285,
"eval_samples_per_second": 5.9,
"eval_steps_per_second": 2.95,
"step": 73
},
{
"epoch": 3.2653061224489797,
"grad_norm": 26.88852882385254,
"learning_rate": 3.3333333333333335e-05,
"loss": 1.8989,
"step": 80
},
{
"epoch": 3.673469387755102,
"grad_norm": 29.26166343688965,
"learning_rate": 3.7500000000000003e-05,
"loss": 1.732,
"step": 90
},
{
"epoch": 4.0,
"eval_accuracy": 0.09090909090909091,
"eval_loss": 2.154468536376953,
"eval_runtime": 4.2209,
"eval_samples_per_second": 5.212,
"eval_steps_per_second": 2.606,
"step": 98
},
{
"epoch": 4.081632653061225,
"grad_norm": 39.108612060546875,
"learning_rate": 4.166666666666667e-05,
"loss": 1.6138,
"step": 100
},
{
"epoch": 4.489795918367347,
"grad_norm": 23.49874496459961,
"learning_rate": 4.5833333333333334e-05,
"loss": 1.4534,
"step": 110
},
{
"epoch": 4.8979591836734695,
"grad_norm": 26.28005599975586,
"learning_rate": 5e-05,
"loss": 1.446,
"step": 120
},
{
"epoch": 4.979591836734694,
"eval_accuracy": 0.18181818181818182,
"eval_loss": 2.2917678356170654,
"eval_runtime": 3.8303,
"eval_samples_per_second": 5.744,
"eval_steps_per_second": 2.872,
"step": 122
},
{
"epoch": 5.3061224489795915,
"grad_norm": 38.8511962890625,
"learning_rate": 4.9537037037037035e-05,
"loss": 1.2513,
"step": 130
},
{
"epoch": 5.714285714285714,
"grad_norm": 29.820043563842773,
"learning_rate": 4.9074074074074075e-05,
"loss": 1.1175,
"step": 140
},
{
"epoch": 6.0,
"eval_accuracy": 0.3181818181818182,
"eval_loss": 1.9290556907653809,
"eval_runtime": 4.301,
"eval_samples_per_second": 5.115,
"eval_steps_per_second": 2.558,
"step": 147
},
{
"epoch": 6.122448979591836,
"grad_norm": 36.40614700317383,
"learning_rate": 4.8611111111111115e-05,
"loss": 1.3955,
"step": 150
},
{
"epoch": 6.530612244897959,
"grad_norm": 31.91864776611328,
"learning_rate": 4.814814814814815e-05,
"loss": 1.0427,
"step": 160
},
{
"epoch": 6.938775510204081,
"grad_norm": 35.82953643798828,
"learning_rate": 4.768518518518519e-05,
"loss": 1.1069,
"step": 170
},
{
"epoch": 6.979591836734694,
"eval_accuracy": 0.36363636363636365,
"eval_loss": 1.9551143646240234,
"eval_runtime": 4.268,
"eval_samples_per_second": 5.155,
"eval_steps_per_second": 2.577,
"step": 171
},
{
"epoch": 7.346938775510204,
"grad_norm": 30.75468635559082,
"learning_rate": 4.722222222222222e-05,
"loss": 0.9718,
"step": 180
},
{
"epoch": 7.755102040816326,
"grad_norm": 33.49174880981445,
"learning_rate": 4.675925925925926e-05,
"loss": 0.7932,
"step": 190
},
{
"epoch": 8.0,
"eval_accuracy": 0.4090909090909091,
"eval_loss": 2.0534465312957764,
"eval_runtime": 4.3072,
"eval_samples_per_second": 5.108,
"eval_steps_per_second": 2.554,
"step": 196
},
{
"epoch": 8.16326530612245,
"grad_norm": 45.86684799194336,
"learning_rate": 4.62962962962963e-05,
"loss": 1.0077,
"step": 200
},
{
"epoch": 8.571428571428571,
"grad_norm": 34.891082763671875,
"learning_rate": 4.5833333333333334e-05,
"loss": 0.7286,
"step": 210
},
{
"epoch": 8.979591836734693,
"grad_norm": 30.408119201660156,
"learning_rate": 4.5370370370370374e-05,
"loss": 0.5994,
"step": 220
},
{
"epoch": 8.979591836734693,
"eval_accuracy": 0.36363636363636365,
"eval_loss": 1.8135310411453247,
"eval_runtime": 3.8949,
"eval_samples_per_second": 5.648,
"eval_steps_per_second": 2.824,
"step": 220
},
{
"epoch": 9.387755102040817,
"grad_norm": 20.37044906616211,
"learning_rate": 4.490740740740741e-05,
"loss": 0.7078,
"step": 230
},
{
"epoch": 9.795918367346939,
"grad_norm": 14.344355583190918,
"learning_rate": 4.4444444444444447e-05,
"loss": 0.4671,
"step": 240
},
{
"epoch": 10.0,
"eval_accuracy": 0.5454545454545454,
"eval_loss": 1.7822338342666626,
"eval_runtime": 4.1682,
"eval_samples_per_second": 5.278,
"eval_steps_per_second": 2.639,
"step": 245
},
{
"epoch": 10.204081632653061,
"grad_norm": 39.557579040527344,
"learning_rate": 4.3981481481481486e-05,
"loss": 0.5962,
"step": 250
},
{
"epoch": 10.612244897959183,
"grad_norm": 7.858915328979492,
"learning_rate": 4.351851851851852e-05,
"loss": 0.4612,
"step": 260
},
{
"epoch": 10.979591836734693,
"eval_accuracy": 0.5,
"eval_loss": 2.2529866695404053,
"eval_runtime": 3.7854,
"eval_samples_per_second": 5.812,
"eval_steps_per_second": 2.906,
"step": 269
},
{
"epoch": 11.020408163265307,
"grad_norm": 28.87532615661621,
"learning_rate": 4.305555555555556e-05,
"loss": 0.4548,
"step": 270
},
{
"epoch": 11.428571428571429,
"grad_norm": 40.782554626464844,
"learning_rate": 4.259259259259259e-05,
"loss": 0.5538,
"step": 280
},
{
"epoch": 11.83673469387755,
"grad_norm": 17.471019744873047,
"learning_rate": 4.212962962962963e-05,
"loss": 0.4016,
"step": 290
},
{
"epoch": 12.0,
"eval_accuracy": 0.4090909090909091,
"eval_loss": 1.8437938690185547,
"eval_runtime": 4.198,
"eval_samples_per_second": 5.241,
"eval_steps_per_second": 2.62,
"step": 294
},
{
"epoch": 12.244897959183673,
"grad_norm": 38.50642395019531,
"learning_rate": 4.166666666666667e-05,
"loss": 0.4786,
"step": 300
},
{
"epoch": 12.653061224489797,
"grad_norm": 30.62510871887207,
"learning_rate": 4.1203703703703705e-05,
"loss": 0.3947,
"step": 310
},
{
"epoch": 12.979591836734693,
"eval_accuracy": 0.5,
"eval_loss": 1.8609530925750732,
"eval_runtime": 3.8332,
"eval_samples_per_second": 5.739,
"eval_steps_per_second": 2.87,
"step": 318
},
{
"epoch": 13.061224489795919,
"grad_norm": 1.0034515857696533,
"learning_rate": 4.074074074074074e-05,
"loss": 0.3527,
"step": 320
},
{
"epoch": 13.46938775510204,
"grad_norm": 11.846328735351562,
"learning_rate": 4.027777777777778e-05,
"loss": 0.3415,
"step": 330
},
{
"epoch": 13.877551020408163,
"grad_norm": 9.749055862426758,
"learning_rate": 3.981481481481482e-05,
"loss": 0.5033,
"step": 340
},
{
"epoch": 14.0,
"eval_accuracy": 0.45454545454545453,
"eval_loss": 1.8614706993103027,
"eval_runtime": 4.1423,
"eval_samples_per_second": 5.311,
"eval_steps_per_second": 2.656,
"step": 343
},
{
"epoch": 14.285714285714286,
"grad_norm": 74.4493637084961,
"learning_rate": 3.935185185185186e-05,
"loss": 0.2709,
"step": 350
},
{
"epoch": 14.693877551020408,
"grad_norm": 34.6863899230957,
"learning_rate": 3.888888888888889e-05,
"loss": 0.2846,
"step": 360
},
{
"epoch": 14.979591836734693,
"eval_accuracy": 0.5,
"eval_loss": 1.5478615760803223,
"eval_runtime": 3.7136,
"eval_samples_per_second": 5.924,
"eval_steps_per_second": 2.962,
"step": 367
},
{
"epoch": 15.10204081632653,
"grad_norm": 21.048105239868164,
"learning_rate": 3.8425925925925924e-05,
"loss": 0.361,
"step": 370
},
{
"epoch": 15.510204081632653,
"grad_norm": 27.45090103149414,
"learning_rate": 3.7962962962962964e-05,
"loss": 0.1265,
"step": 380
},
{
"epoch": 15.918367346938776,
"grad_norm": 2.5599782466888428,
"learning_rate": 3.7500000000000003e-05,
"loss": 0.2828,
"step": 390
},
{
"epoch": 16.0,
"eval_accuracy": 0.5454545454545454,
"eval_loss": 1.6410338878631592,
"eval_runtime": 4.2352,
"eval_samples_per_second": 5.195,
"eval_steps_per_second": 2.597,
"step": 392
},
{
"epoch": 16.3265306122449,
"grad_norm": 44.01292037963867,
"learning_rate": 3.7037037037037037e-05,
"loss": 0.4138,
"step": 400
},
{
"epoch": 16.73469387755102,
"grad_norm": 49.22306823730469,
"learning_rate": 3.6574074074074076e-05,
"loss": 0.3426,
"step": 410
},
{
"epoch": 16.979591836734695,
"eval_accuracy": 0.36363636363636365,
"eval_loss": 1.9146943092346191,
"eval_runtime": 3.73,
"eval_samples_per_second": 5.898,
"eval_steps_per_second": 2.949,
"step": 416
},
{
"epoch": 17.142857142857142,
"grad_norm": 20.12368392944336,
"learning_rate": 3.611111111111111e-05,
"loss": 0.3658,
"step": 420
},
{
"epoch": 17.551020408163264,
"grad_norm": 5.72607946395874,
"learning_rate": 3.564814814814815e-05,
"loss": 0.0983,
"step": 430
},
{
"epoch": 17.959183673469386,
"grad_norm": 127.7773208618164,
"learning_rate": 3.518518518518519e-05,
"loss": 0.3108,
"step": 440
},
{
"epoch": 18.0,
"eval_accuracy": 0.5454545454545454,
"eval_loss": 1.4793992042541504,
"eval_runtime": 4.4219,
"eval_samples_per_second": 4.975,
"eval_steps_per_second": 2.488,
"step": 441
},
{
"epoch": 18.367346938775512,
"grad_norm": 2.2349815368652344,
"learning_rate": 3.472222222222222e-05,
"loss": 0.08,
"step": 450
},
{
"epoch": 18.775510204081634,
"grad_norm": 84.97185516357422,
"learning_rate": 3.425925925925926e-05,
"loss": 0.2129,
"step": 460
},
{
"epoch": 18.979591836734695,
"eval_accuracy": 0.4090909090909091,
"eval_loss": 1.776501178741455,
"eval_runtime": 3.7003,
"eval_samples_per_second": 5.946,
"eval_steps_per_second": 2.973,
"step": 465
},
{
"epoch": 19.183673469387756,
"grad_norm": 6.593123912811279,
"learning_rate": 3.3796296296296295e-05,
"loss": 0.1714,
"step": 470
},
{
"epoch": 19.591836734693878,
"grad_norm": 2.1411988735198975,
"learning_rate": 3.3333333333333335e-05,
"loss": 0.1612,
"step": 480
},
{
"epoch": 20.0,
"grad_norm": 7.131197929382324,
"learning_rate": 3.2870370370370375e-05,
"loss": 0.1946,
"step": 490
},
{
"epoch": 20.0,
"eval_accuracy": 0.45454545454545453,
"eval_loss": 2.16646671295166,
"eval_runtime": 4.4203,
"eval_samples_per_second": 4.977,
"eval_steps_per_second": 2.489,
"step": 490
},
{
"epoch": 20.408163265306122,
"grad_norm": 4.898344039916992,
"learning_rate": 3.240740740740741e-05,
"loss": 0.2809,
"step": 500
},
{
"epoch": 20.816326530612244,
"grad_norm": 0.8991552591323853,
"learning_rate": 3.194444444444444e-05,
"loss": 0.1255,
"step": 510
},
{
"epoch": 20.979591836734695,
"eval_accuracy": 0.5454545454545454,
"eval_loss": 1.8521960973739624,
"eval_runtime": 4.0094,
"eval_samples_per_second": 5.487,
"eval_steps_per_second": 2.744,
"step": 514
},
{
"epoch": 21.224489795918366,
"grad_norm": 0.9338099956512451,
"learning_rate": 3.148148148148148e-05,
"loss": 0.0503,
"step": 520
},
{
"epoch": 21.632653061224488,
"grad_norm": 1.708648920059204,
"learning_rate": 3.101851851851852e-05,
"loss": 0.2301,
"step": 530
},
{
"epoch": 22.0,
"eval_accuracy": 0.45454545454545453,
"eval_loss": 2.028917074203491,
"eval_runtime": 4.3302,
"eval_samples_per_second": 5.081,
"eval_steps_per_second": 2.54,
"step": 539
},
{
"epoch": 22.040816326530614,
"grad_norm": 68.97888946533203,
"learning_rate": 3.055555555555556e-05,
"loss": 0.1664,
"step": 540
},
{
"epoch": 22.448979591836736,
"grad_norm": 8.795487403869629,
"learning_rate": 3.0092592592592593e-05,
"loss": 0.0495,
"step": 550
},
{
"epoch": 22.857142857142858,
"grad_norm": 1.032072901725769,
"learning_rate": 2.962962962962963e-05,
"loss": 0.0909,
"step": 560
},
{
"epoch": 22.979591836734695,
"eval_accuracy": 0.4090909090909091,
"eval_loss": 2.051244020462036,
"eval_runtime": 4.0425,
"eval_samples_per_second": 5.442,
"eval_steps_per_second": 2.721,
"step": 563
},
{
"epoch": 23.26530612244898,
"grad_norm": 28.04718589782715,
"learning_rate": 2.916666666666667e-05,
"loss": 0.1032,
"step": 570
},
{
"epoch": 23.6734693877551,
"grad_norm": 24.228317260742188,
"learning_rate": 2.8703703703703706e-05,
"loss": 0.1724,
"step": 580
},
{
"epoch": 24.0,
"eval_accuracy": 0.4090909090909091,
"eval_loss": 2.3410568237304688,
"eval_runtime": 4.2445,
"eval_samples_per_second": 5.183,
"eval_steps_per_second": 2.592,
"step": 588
},
{
"epoch": 24.081632653061224,
"grad_norm": 51.32829284667969,
"learning_rate": 2.824074074074074e-05,
"loss": 0.1376,
"step": 590
},
{
"epoch": 24.489795918367346,
"grad_norm": 8.50900936126709,
"learning_rate": 2.777777777777778e-05,
"loss": 0.0941,
"step": 600
},
{
"epoch": 24.897959183673468,
"grad_norm": 53.41023254394531,
"learning_rate": 2.7314814814814816e-05,
"loss": 0.2256,
"step": 610
},
{
"epoch": 24.979591836734695,
"eval_accuracy": 0.5,
"eval_loss": 2.162550687789917,
"eval_runtime": 3.7513,
"eval_samples_per_second": 5.865,
"eval_steps_per_second": 2.932,
"step": 612
},
{
"epoch": 25.306122448979593,
"grad_norm": 20.35555076599121,
"learning_rate": 2.6851851851851855e-05,
"loss": 0.2228,
"step": 620
},
{
"epoch": 25.714285714285715,
"grad_norm": 5.79302978515625,
"learning_rate": 2.6388888888888892e-05,
"loss": 0.2471,
"step": 630
},
{
"epoch": 26.0,
"eval_accuracy": 0.4090909090909091,
"eval_loss": 2.0551793575286865,
"eval_runtime": 4.2279,
"eval_samples_per_second": 5.204,
"eval_steps_per_second": 2.602,
"step": 637
},
{
"epoch": 26.122448979591837,
"grad_norm": 57.79982376098633,
"learning_rate": 2.5925925925925925e-05,
"loss": 0.1514,
"step": 640
},
{
"epoch": 26.53061224489796,
"grad_norm": 9.047138214111328,
"learning_rate": 2.5462962962962965e-05,
"loss": 0.056,
"step": 650
},
{
"epoch": 26.93877551020408,
"grad_norm": 8.557161331176758,
"learning_rate": 2.5e-05,
"loss": 0.0671,
"step": 660
},
{
"epoch": 26.979591836734695,
"eval_accuracy": 0.5454545454545454,
"eval_loss": 1.9339114427566528,
"eval_runtime": 3.8162,
"eval_samples_per_second": 5.765,
"eval_steps_per_second": 2.882,
"step": 661
},
{
"epoch": 27.346938775510203,
"grad_norm": 0.14792662858963013,
"learning_rate": 2.4537037037037038e-05,
"loss": 0.1409,
"step": 670
},
{
"epoch": 27.755102040816325,
"grad_norm": 112.82161712646484,
"learning_rate": 2.4074074074074074e-05,
"loss": 0.2563,
"step": 680
},
{
"epoch": 28.0,
"eval_accuracy": 0.45454545454545453,
"eval_loss": 2.2506721019744873,
"eval_runtime": 4.1968,
"eval_samples_per_second": 5.242,
"eval_steps_per_second": 2.621,
"step": 686
},
{
"epoch": 28.163265306122447,
"grad_norm": 0.717983067035675,
"learning_rate": 2.361111111111111e-05,
"loss": 0.0499,
"step": 690
},
{
"epoch": 28.571428571428573,
"grad_norm": 47.17780303955078,
"learning_rate": 2.314814814814815e-05,
"loss": 0.1607,
"step": 700
},
{
"epoch": 28.979591836734695,
"grad_norm": 102.77259063720703,
"learning_rate": 2.2685185185185187e-05,
"loss": 0.1865,
"step": 710
},
{
"epoch": 28.979591836734695,
"eval_accuracy": 0.5,
"eval_loss": 2.0703585147857666,
"eval_runtime": 3.7043,
"eval_samples_per_second": 5.939,
"eval_steps_per_second": 2.969,
"step": 710
},
{
"epoch": 29.387755102040817,
"grad_norm": 0.42554885149002075,
"learning_rate": 2.2222222222222223e-05,
"loss": 0.1905,
"step": 720
},
{
"epoch": 29.79591836734694,
"grad_norm": 92.93157196044922,
"learning_rate": 2.175925925925926e-05,
"loss": 0.0477,
"step": 730
},
{
"epoch": 30.0,
"eval_accuracy": 0.3181818181818182,
"eval_loss": 2.839505195617676,
"eval_runtime": 4.1643,
"eval_samples_per_second": 5.283,
"eval_steps_per_second": 2.642,
"step": 735
},
{
"epoch": 30.20408163265306,
"grad_norm": 10.255499839782715,
"learning_rate": 2.1296296296296296e-05,
"loss": 0.2298,
"step": 740
},
{
"epoch": 30.612244897959183,
"grad_norm": 32.9160041809082,
"learning_rate": 2.0833333333333336e-05,
"loss": 0.0931,
"step": 750
},
{
"epoch": 30.979591836734695,
"eval_accuracy": 0.36363636363636365,
"eval_loss": 2.9483964443206787,
"eval_runtime": 3.6962,
"eval_samples_per_second": 5.952,
"eval_steps_per_second": 2.976,
"step": 759
},
{
"epoch": 31.020408163265305,
"grad_norm": 3.2979772090911865,
"learning_rate": 2.037037037037037e-05,
"loss": 0.0643,
"step": 760
},
{
"epoch": 31.428571428571427,
"grad_norm": 1.9711153507232666,
"learning_rate": 1.990740740740741e-05,
"loss": 0.1815,
"step": 770
},
{
"epoch": 31.836734693877553,
"grad_norm": 39.575653076171875,
"learning_rate": 1.9444444444444445e-05,
"loss": 0.047,
"step": 780
},
{
"epoch": 32.0,
"eval_accuracy": 0.45454545454545453,
"eval_loss": 2.548579216003418,
"eval_runtime": 4.2179,
"eval_samples_per_second": 5.216,
"eval_steps_per_second": 2.608,
"step": 784
},
{
"epoch": 32.244897959183675,
"grad_norm": 0.1791549175977707,
"learning_rate": 1.8981481481481482e-05,
"loss": 0.1034,
"step": 790
},
{
"epoch": 32.6530612244898,
"grad_norm": 18.35671043395996,
"learning_rate": 1.8518518518518518e-05,
"loss": 0.165,
"step": 800
},
{
"epoch": 32.97959183673469,
"eval_accuracy": 0.45454545454545453,
"eval_loss": 2.601107120513916,
"eval_runtime": 3.894,
"eval_samples_per_second": 5.65,
"eval_steps_per_second": 2.825,
"step": 808
},
{
"epoch": 33.06122448979592,
"grad_norm": 88.95061492919922,
"learning_rate": 1.8055555555555555e-05,
"loss": 0.0639,
"step": 810
},
{
"epoch": 33.46938775510204,
"grad_norm": 0.11740878969430923,
"learning_rate": 1.7592592592592595e-05,
"loss": 0.032,
"step": 820
},
{
"epoch": 33.87755102040816,
"grad_norm": 8.803248405456543,
"learning_rate": 1.712962962962963e-05,
"loss": 0.0203,
"step": 830
},
{
"epoch": 34.0,
"eval_accuracy": 0.5,
"eval_loss": 2.3598248958587646,
"eval_runtime": 4.1023,
"eval_samples_per_second": 5.363,
"eval_steps_per_second": 2.681,
"step": 833
},
{
"epoch": 34.285714285714285,
"grad_norm": 3.050877571105957,
"learning_rate": 1.6666666666666667e-05,
"loss": 0.0173,
"step": 840
},
{
"epoch": 34.69387755102041,
"grad_norm": 10.551043510437012,
"learning_rate": 1.6203703703703704e-05,
"loss": 0.0143,
"step": 850
},
{
"epoch": 34.97959183673469,
"eval_accuracy": 0.4090909090909091,
"eval_loss": 2.589207649230957,
"eval_runtime": 4.2169,
"eval_samples_per_second": 5.217,
"eval_steps_per_second": 2.609,
"step": 857
},
{
"epoch": 35.10204081632653,
"grad_norm": 2.492408514022827,
"learning_rate": 1.574074074074074e-05,
"loss": 0.1669,
"step": 860
},
{
"epoch": 35.51020408163265,
"grad_norm": 28.920665740966797,
"learning_rate": 1.527777777777778e-05,
"loss": 0.0546,
"step": 870
},
{
"epoch": 35.91836734693877,
"grad_norm": 0.1343473196029663,
"learning_rate": 1.4814814814814815e-05,
"loss": 0.0248,
"step": 880
},
{
"epoch": 36.0,
"eval_accuracy": 0.4090909090909091,
"eval_loss": 2.8362252712249756,
"eval_runtime": 4.252,
"eval_samples_per_second": 5.174,
"eval_steps_per_second": 2.587,
"step": 882
},
{
"epoch": 36.326530612244895,
"grad_norm": 0.00863693282008171,
"learning_rate": 1.4351851851851853e-05,
"loss": 0.0432,
"step": 890
},
{
"epoch": 36.734693877551024,
"grad_norm": 1.786009430885315,
"learning_rate": 1.388888888888889e-05,
"loss": 0.0812,
"step": 900
},
{
"epoch": 36.97959183673469,
"eval_accuracy": 0.4090909090909091,
"eval_loss": 2.4658091068267822,
"eval_runtime": 4.0409,
"eval_samples_per_second": 5.444,
"eval_steps_per_second": 2.722,
"step": 906
},
{
"epoch": 37.142857142857146,
"grad_norm": 5.971357822418213,
"learning_rate": 1.3425925925925928e-05,
"loss": 0.1312,
"step": 910
},
{
"epoch": 37.55102040816327,
"grad_norm": 9.33790111541748,
"learning_rate": 1.2962962962962962e-05,
"loss": 0.0453,
"step": 920
},
{
"epoch": 37.95918367346939,
"grad_norm": 13.613574028015137,
"learning_rate": 1.25e-05,
"loss": 0.0662,
"step": 930
},
{
"epoch": 38.0,
"eval_accuracy": 0.4090909090909091,
"eval_loss": 2.640266180038452,
"eval_runtime": 4.0647,
"eval_samples_per_second": 5.412,
"eval_steps_per_second": 2.706,
"step": 931
},
{
"epoch": 38.36734693877551,
"grad_norm": 17.910255432128906,
"learning_rate": 1.2037037037037037e-05,
"loss": 0.1061,
"step": 940
},
{
"epoch": 38.775510204081634,
"grad_norm": 5.941530227661133,
"learning_rate": 1.1574074074074075e-05,
"loss": 0.1855,
"step": 950
},
{
"epoch": 38.97959183673469,
"eval_accuracy": 0.45454545454545453,
"eval_loss": 2.604184150695801,
"eval_runtime": 3.8038,
"eval_samples_per_second": 5.784,
"eval_steps_per_second": 2.892,
"step": 955
},
{
"epoch": 39.183673469387756,
"grad_norm": 0.057647667825222015,
"learning_rate": 1.1111111111111112e-05,
"loss": 0.0598,
"step": 960
},
{
"epoch": 39.59183673469388,
"grad_norm": 1.981195092201233,
"learning_rate": 1.0648148148148148e-05,
"loss": 0.0107,
"step": 970
},
{
"epoch": 40.0,
"grad_norm": 74.54513549804688,
"learning_rate": 1.0185185185185185e-05,
"loss": 0.03,
"step": 980
},
{
"epoch": 40.0,
"eval_accuracy": 0.5,
"eval_loss": 2.5595407485961914,
"eval_runtime": 4.2131,
"eval_samples_per_second": 5.222,
"eval_steps_per_second": 2.611,
"step": 980
},
{
"epoch": 40.40816326530612,
"grad_norm": 3.7874374389648438,
"learning_rate": 9.722222222222223e-06,
"loss": 0.0882,
"step": 990
},
{
"epoch": 40.816326530612244,
"grad_norm": 0.011240988969802856,
"learning_rate": 9.259259259259259e-06,
"loss": 0.1117,
"step": 1000
},
{
"epoch": 40.97959183673469,
"eval_accuracy": 0.5,
"eval_loss": 2.4715521335601807,
"eval_runtime": 3.7805,
"eval_samples_per_second": 5.819,
"eval_steps_per_second": 2.91,
"step": 1004
},
{
"epoch": 41.224489795918366,
"grad_norm": 30.458051681518555,
"learning_rate": 8.796296296296297e-06,
"loss": 0.0819,
"step": 1010
},
{
"epoch": 41.63265306122449,
"grad_norm": 0.2596540153026581,
"learning_rate": 8.333333333333334e-06,
"loss": 0.0466,
"step": 1020
},
{
"epoch": 42.0,
"eval_accuracy": 0.5,
"eval_loss": 2.325749397277832,
"eval_runtime": 4.0793,
"eval_samples_per_second": 5.393,
"eval_steps_per_second": 2.697,
"step": 1029
},
{
"epoch": 42.04081632653061,
"grad_norm": 0.8128257393836975,
"learning_rate": 7.87037037037037e-06,
"loss": 0.2931,
"step": 1030
},
{
"epoch": 42.44897959183673,
"grad_norm": 1.8079004287719727,
"learning_rate": 7.4074074074074075e-06,
"loss": 0.0667,
"step": 1040
},
{
"epoch": 42.857142857142854,
"grad_norm": 13.098880767822266,
"learning_rate": 6.944444444444445e-06,
"loss": 0.1349,
"step": 1050
},
{
"epoch": 42.97959183673469,
"eval_accuracy": 0.45454545454545453,
"eval_loss": 2.5544846057891846,
"eval_runtime": 3.7821,
"eval_samples_per_second": 5.817,
"eval_steps_per_second": 2.908,
"step": 1053
},
{
"epoch": 43.265306122448976,
"grad_norm": 0.03145942464470863,
"learning_rate": 6.481481481481481e-06,
"loss": 0.0184,
"step": 1060
},
{
"epoch": 43.673469387755105,
"grad_norm": 0.008914113976061344,
"learning_rate": 6.0185185185185185e-06,
"loss": 0.0069,
"step": 1070
},
{
"epoch": 44.0,
"eval_accuracy": 0.5,
"eval_loss": 2.5814802646636963,
"eval_runtime": 4.2458,
"eval_samples_per_second": 5.182,
"eval_steps_per_second": 2.591,
"step": 1078
},
{
"epoch": 44.08163265306123,
"grad_norm": 0.0782785564661026,
"learning_rate": 5.555555555555556e-06,
"loss": 0.1017,
"step": 1080
},
{
"epoch": 44.48979591836735,
"grad_norm": 0.24598410725593567,
"learning_rate": 5.092592592592592e-06,
"loss": 0.0344,
"step": 1090
},
{
"epoch": 44.89795918367347,
"grad_norm": 0.017518645152449608,
"learning_rate": 4.6296296296296296e-06,
"loss": 0.0468,
"step": 1100
},
{
"epoch": 44.97959183673469,
"eval_accuracy": 0.5,
"eval_loss": 2.3666675090789795,
"eval_runtime": 3.9709,
"eval_samples_per_second": 5.54,
"eval_steps_per_second": 2.77,
"step": 1102
},
{
"epoch": 45.30612244897959,
"grad_norm": 3.6144909858703613,
"learning_rate": 4.166666666666667e-06,
"loss": 0.0372,
"step": 1110
},
{
"epoch": 45.714285714285715,
"grad_norm": 0.018579425290226936,
"learning_rate": 3.7037037037037037e-06,
"loss": 0.1807,
"step": 1120
},
{
"epoch": 46.0,
"eval_accuracy": 0.5,
"eval_loss": 2.472862482070923,
"eval_runtime": 4.5348,
"eval_samples_per_second": 4.851,
"eval_steps_per_second": 2.426,
"step": 1127
},
{
"epoch": 46.12244897959184,
"grad_norm": 6.9941864013671875,
"learning_rate": 3.2407407407407406e-06,
"loss": 0.0091,
"step": 1130
},
{
"epoch": 46.53061224489796,
"grad_norm": 1.61014986038208,
"learning_rate": 2.777777777777778e-06,
"loss": 0.0388,
"step": 1140
},
{
"epoch": 46.93877551020408,
"grad_norm": 0.021990543231368065,
"learning_rate": 2.3148148148148148e-06,
"loss": 0.0667,
"step": 1150
},
{
"epoch": 46.97959183673469,
"eval_accuracy": 0.5,
"eval_loss": 2.496938467025757,
"eval_runtime": 3.9012,
"eval_samples_per_second": 5.639,
"eval_steps_per_second": 2.82,
"step": 1151
},
{
"epoch": 47.3469387755102,
"grad_norm": 0.5512164235115051,
"learning_rate": 1.8518518518518519e-06,
"loss": 0.0191,
"step": 1160
},
{
"epoch": 47.755102040816325,
"grad_norm": 0.2976542115211487,
"learning_rate": 1.388888888888889e-06,
"loss": 0.0199,
"step": 1170
},
{
"epoch": 48.0,
"eval_accuracy": 0.5,
"eval_loss": 2.5520730018615723,
"eval_runtime": 4.4614,
"eval_samples_per_second": 4.931,
"eval_steps_per_second": 2.466,
"step": 1176
},
{
"epoch": 48.16326530612245,
"grad_norm": 0.37567588686943054,
"learning_rate": 9.259259259259259e-07,
"loss": 0.0806,
"step": 1180
},
{
"epoch": 48.57142857142857,
"grad_norm": 2.0820977687835693,
"learning_rate": 4.6296296296296297e-07,
"loss": 0.1011,
"step": 1190
},
{
"epoch": 48.97959183673469,
"grad_norm": 0.018370352685451508,
"learning_rate": 0.0,
"loss": 0.2716,
"step": 1200
},
{
"epoch": 48.97959183673469,
"eval_accuracy": 0.5,
"eval_loss": 2.5371408462524414,
"eval_runtime": 3.8373,
"eval_samples_per_second": 5.733,
"eval_steps_per_second": 2.867,
"step": 1200
},
{
"epoch": 48.97959183673469,
"step": 1200,
"total_flos": 2.374708462608384e+17,
"train_loss": 0.41171714147552846,
"train_runtime": 1608.8411,
"train_samples_per_second": 6.06,
"train_steps_per_second": 0.746
}
],
"logging_steps": 10,
"max_steps": 1200,
"num_input_tokens_seen": 0,
"num_train_epochs": 50,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.374708462608384e+17,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}