{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 11.97948717948718,
"eval_steps": 1000,
"global_step": 1752,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.06837606837606838,
"grad_norm": 10.4375,
"learning_rate": 5.681818181818182e-07,
"loss": 2.6042,
"step": 10
},
{
"epoch": 0.13675213675213677,
"grad_norm": 12.75,
"learning_rate": 1.1363636363636364e-06,
"loss": 2.577,
"step": 20
},
{
"epoch": 0.20512820512820512,
"grad_norm": 10.6875,
"learning_rate": 1.7045454545454546e-06,
"loss": 2.4842,
"step": 30
},
{
"epoch": 0.27350427350427353,
"grad_norm": 15.4375,
"learning_rate": 2.2727272727272728e-06,
"loss": 2.4127,
"step": 40
},
{
"epoch": 0.3418803418803419,
"grad_norm": 21.75,
"learning_rate": 2.8409090909090916e-06,
"loss": 2.2065,
"step": 50
},
{
"epoch": 0.41025641025641024,
"grad_norm": 28.125,
"learning_rate": 3.409090909090909e-06,
"loss": 2.2638,
"step": 60
},
{
"epoch": 0.47863247863247865,
"grad_norm": 29.5,
"learning_rate": 3.9772727272727275e-06,
"loss": 2.0591,
"step": 70
},
{
"epoch": 0.5470085470085471,
"grad_norm": 20.5,
"learning_rate": 4.5454545454545455e-06,
"loss": 1.9693,
"step": 80
},
{
"epoch": 0.6153846153846154,
"grad_norm": 19.875,
"learning_rate": 5.113636363636364e-06,
"loss": 1.9559,
"step": 90
},
{
"epoch": 0.6837606837606838,
"grad_norm": 5.125,
"learning_rate": 5.681818181818183e-06,
"loss": 1.8579,
"step": 100
},
{
"epoch": 0.7521367521367521,
"grad_norm": 5.5625,
"learning_rate": 6.25e-06,
"loss": 1.8601,
"step": 110
},
{
"epoch": 0.8205128205128205,
"grad_norm": 4.34375,
"learning_rate": 6.818181818181818e-06,
"loss": 1.7222,
"step": 120
},
{
"epoch": 0.8888888888888888,
"grad_norm": 6.03125,
"learning_rate": 7.386363636363637e-06,
"loss": 1.7724,
"step": 130
},
{
"epoch": 0.9572649572649573,
"grad_norm": 5.6875,
"learning_rate": 7.954545454545455e-06,
"loss": 1.8647,
"step": 140
},
{
"epoch": 1.0256410256410255,
"grad_norm": 5.0625,
"learning_rate": 8.522727272727273e-06,
"loss": 1.7017,
"step": 150
},
{
"epoch": 1.0940170940170941,
"grad_norm": 4.5625,
"learning_rate": 9.090909090909091e-06,
"loss": 1.6883,
"step": 160
},
{
"epoch": 1.1623931623931625,
"grad_norm": 5.3125,
"learning_rate": 9.65909090909091e-06,
"loss": 1.5485,
"step": 170
},
{
"epoch": 1.2307692307692308,
"grad_norm": 4.96875,
"learning_rate": 9.999841055681184e-06,
"loss": 1.5548,
"step": 180
},
{
"epoch": 1.2991452991452992,
"grad_norm": 5.4375,
"learning_rate": 9.998053048145735e-06,
"loss": 1.602,
"step": 190
},
{
"epoch": 1.3675213675213675,
"grad_norm": 3.671875,
"learning_rate": 9.994279065509094e-06,
"loss": 1.5151,
"step": 200
},
{
"epoch": 1.435897435897436,
"grad_norm": 5.1875,
"learning_rate": 9.988520607362297e-06,
"loss": 1.4208,
"step": 210
},
{
"epoch": 1.5042735042735043,
"grad_norm": 5.90625,
"learning_rate": 9.98077996182722e-06,
"loss": 1.4954,
"step": 220
},
{
"epoch": 1.5726495726495726,
"grad_norm": 5.84375,
"learning_rate": 9.971060204647384e-06,
"loss": 1.3399,
"step": 230
},
{
"epoch": 1.641025641025641,
"grad_norm": 5.5,
"learning_rate": 9.959365197965824e-06,
"loss": 1.2554,
"step": 240
},
{
"epoch": 1.7094017094017095,
"grad_norm": 6.65625,
"learning_rate": 9.945699588790455e-06,
"loss": 1.3161,
"step": 250
},
{
"epoch": 1.7777777777777777,
"grad_norm": 7.5625,
"learning_rate": 9.930068807147585e-06,
"loss": 1.3603,
"step": 260
},
{
"epoch": 1.8461538461538463,
"grad_norm": 6.375,
"learning_rate": 9.912479063924309e-06,
"loss": 1.2576,
"step": 270
},
{
"epoch": 1.9145299145299144,
"grad_norm": 6.84375,
"learning_rate": 9.8929373484006e-06,
"loss": 1.207,
"step": 280
},
{
"epoch": 1.982905982905983,
"grad_norm": 4.90625,
"learning_rate": 9.871451425472128e-06,
"loss": 1.232,
"step": 290
},
{
"epoch": 2.051282051282051,
"grad_norm": 7.125,
"learning_rate": 9.848029832564875e-06,
"loss": 1.0106,
"step": 300
},
{
"epoch": 2.1196581196581197,
"grad_norm": 6.65625,
"learning_rate": 9.822681876242797e-06,
"loss": 0.9484,
"step": 310
},
{
"epoch": 2.1880341880341883,
"grad_norm": 5.1875,
"learning_rate": 9.795417628509857e-06,
"loss": 1.0094,
"step": 320
},
{
"epoch": 2.2564102564102564,
"grad_norm": 5.28125,
"learning_rate": 9.766247922807927e-06,
"loss": 0.9317,
"step": 330
},
{
"epoch": 2.324786324786325,
"grad_norm": 5.09375,
"learning_rate": 9.73518434971211e-06,
"loss": 0.8936,
"step": 340
},
{
"epoch": 2.393162393162393,
"grad_norm": 7.375,
"learning_rate": 9.702239252325237e-06,
"loss": 0.9004,
"step": 350
},
{
"epoch": 2.4615384615384617,
"grad_norm": 6.3125,
"learning_rate": 9.667425721373333e-06,
"loss": 0.9335,
"step": 360
},
{
"epoch": 2.52991452991453,
"grad_norm": 7.0625,
"learning_rate": 9.630757590004023e-06,
"loss": 0.8982,
"step": 370
},
{
"epoch": 2.5982905982905984,
"grad_norm": 4.40625,
"learning_rate": 9.592249428289935e-06,
"loss": 0.8541,
"step": 380
},
{
"epoch": 2.6666666666666665,
"grad_norm": 6.5625,
"learning_rate": 9.551916537439282e-06,
"loss": 0.8105,
"step": 390
},
{
"epoch": 2.735042735042735,
"grad_norm": 5.125,
"learning_rate": 9.50977494371594e-06,
"loss": 0.8308,
"step": 400
},
{
"epoch": 2.8034188034188032,
"grad_norm": 5.0625,
"learning_rate": 9.465841392071396e-06,
"loss": 0.8515,
"step": 410
},
{
"epoch": 2.871794871794872,
"grad_norm": 6.53125,
"learning_rate": 9.420133339491171e-06,
"loss": 0.6671,
"step": 420
},
{
"epoch": 2.9401709401709404,
"grad_norm": 6.21875,
"learning_rate": 9.372668948058276e-06,
"loss": 0.7728,
"step": 430
},
{
"epoch": 3.0085470085470085,
"grad_norm": 5.1875,
"learning_rate": 9.323467077736513e-06,
"loss": 0.6978,
"step": 440
},
{
"epoch": 3.076923076923077,
"grad_norm": 5.9375,
"learning_rate": 9.272547278876475e-06,
"loss": 0.561,
"step": 450
},
{
"epoch": 3.1452991452991452,
"grad_norm": 5.21875,
"learning_rate": 9.219929784447232e-06,
"loss": 0.5354,
"step": 460
},
{
"epoch": 3.213675213675214,
"grad_norm": 4.90625,
"learning_rate": 9.16563550199674e-06,
"loss": 0.5613,
"step": 470
},
{
"epoch": 3.282051282051282,
"grad_norm": 6.5,
"learning_rate": 9.109686005344258e-06,
"loss": 0.576,
"step": 480
},
{
"epoch": 3.3504273504273505,
"grad_norm": 5.71875,
"learning_rate": 9.052103526007976e-06,
"loss": 0.492,
"step": 490
},
{
"epoch": 3.4188034188034186,
"grad_norm": 7.5,
"learning_rate": 8.992910944371343e-06,
"loss": 0.5087,
"step": 500
},
{
"epoch": 3.4871794871794872,
"grad_norm": 6.59375,
"learning_rate": 8.932131780591542e-06,
"loss": 0.476,
"step": 510
},
{
"epoch": 3.5555555555555554,
"grad_norm": 4.4375,
"learning_rate": 8.869790185253766e-06,
"loss": 0.4111,
"step": 520
},
{
"epoch": 3.623931623931624,
"grad_norm": 5.09375,
"learning_rate": 8.805910929774989e-06,
"loss": 0.4426,
"step": 530
},
{
"epoch": 3.6923076923076925,
"grad_norm": 4.46875,
"learning_rate": 8.740519396561045e-06,
"loss": 0.4171,
"step": 540
},
{
"epoch": 3.7606837606837606,
"grad_norm": 5.28125,
"learning_rate": 8.673641568920944e-06,
"loss": 0.445,
"step": 550
},
{
"epoch": 3.8290598290598292,
"grad_norm": 5.75,
"learning_rate": 8.60530402074241e-06,
"loss": 0.4653,
"step": 560
},
{
"epoch": 3.8974358974358974,
"grad_norm": 2.984375,
"learning_rate": 8.535533905932739e-06,
"loss": 0.4276,
"step": 570
},
{
"epoch": 3.965811965811966,
"grad_norm": 3.9375,
"learning_rate": 8.46435894762922e-06,
"loss": 0.423,
"step": 580
},
{
"epoch": 4.034188034188034,
"grad_norm": 2.96875,
"learning_rate": 8.39180742718334e-06,
"loss": 0.3372,
"step": 590
},
{
"epoch": 4.102564102564102,
"grad_norm": 3.25,
"learning_rate": 8.317908172923207e-06,
"loss": 0.29,
"step": 600
},
{
"epoch": 4.170940170940171,
"grad_norm": 3.84375,
"learning_rate": 8.242690548698611e-06,
"loss": 0.2464,
"step": 610
},
{
"epoch": 4.239316239316239,
"grad_norm": 4.28125,
"learning_rate": 8.166184442213314e-06,
"loss": 0.2754,
"step": 620
},
{
"epoch": 4.3076923076923075,
"grad_norm": 2.796875,
"learning_rate": 8.088420253149173e-06,
"loss": 0.2699,
"step": 630
},
{
"epoch": 4.3760683760683765,
"grad_norm": 2.796875,
"learning_rate": 8.009428881086836e-06,
"loss": 0.2621,
"step": 640
},
{
"epoch": 4.444444444444445,
"grad_norm": 4.125,
"learning_rate": 7.9292417132278e-06,
"loss": 0.2688,
"step": 650
},
{
"epoch": 4.512820512820513,
"grad_norm": 2.9375,
"learning_rate": 7.847890611922721e-06,
"loss": 0.2871,
"step": 660
},
{
"epoch": 4.581196581196581,
"grad_norm": 2.78125,
"learning_rate": 7.76540790201091e-06,
"loss": 0.2439,
"step": 670
},
{
"epoch": 4.64957264957265,
"grad_norm": 5.5625,
"learning_rate": 7.68182635797606e-06,
"loss": 0.2478,
"step": 680
},
{
"epoch": 4.717948717948718,
"grad_norm": 5.09375,
"learning_rate": 7.597179190923343e-06,
"loss": 0.2385,
"step": 690
},
{
"epoch": 4.786324786324786,
"grad_norm": 2.53125,
"learning_rate": 7.511500035382943e-06,
"loss": 0.2525,
"step": 700
},
{
"epoch": 4.854700854700854,
"grad_norm": 3.734375,
"learning_rate": 7.424822935945416e-06,
"loss": 0.2448,
"step": 710
},
{
"epoch": 4.923076923076923,
"grad_norm": 2.453125,
"learning_rate": 7.33718233373407e-06,
"loss": 0.2173,
"step": 720
},
{
"epoch": 4.9914529914529915,
"grad_norm": 2.0625,
"learning_rate": 7.248613052719793e-06,
"loss": 0.1926,
"step": 730
},
{
"epoch": 5.05982905982906,
"grad_norm": 2.046875,
"learning_rate": 7.159150285883757e-06,
"loss": 0.1754,
"step": 740
},
{
"epoch": 5.128205128205128,
"grad_norm": 2.4375,
"learning_rate": 7.0688295812334995e-06,
"loss": 0.1334,
"step": 750
},
{
"epoch": 5.196581196581197,
"grad_norm": 1.84375,
"learning_rate": 6.977686827677926e-06,
"loss": 0.147,
"step": 760
},
{
"epoch": 5.264957264957265,
"grad_norm": 1.65625,
"learning_rate": 6.885758240766867e-06,
"loss": 0.1549,
"step": 770
},
{
"epoch": 5.333333333333333,
"grad_norm": 2.3125,
"learning_rate": 6.793080348300834e-06,
"loss": 0.1312,
"step": 780
},
{
"epoch": 5.401709401709402,
"grad_norm": 1.59375,
"learning_rate": 6.69968997581671e-06,
"loss": 0.1403,
"step": 790
},
{
"epoch": 5.47008547008547,
"grad_norm": 2.21875,
"learning_rate": 6.6056242319551315e-06,
"loss": 0.1237,
"step": 800
},
{
"epoch": 5.538461538461538,
"grad_norm": 3.046875,
"learning_rate": 6.510920493715381e-06,
"loss": 0.1233,
"step": 810
},
{
"epoch": 5.6068376068376065,
"grad_norm": 0.87109375,
"learning_rate": 6.415616391603639e-06,
"loss": 0.1667,
"step": 820
},
{
"epoch": 5.6752136752136755,
"grad_norm": 1.25,
"learning_rate": 6.3197497946805205e-06,
"loss": 0.1224,
"step": 830
},
{
"epoch": 5.743589743589744,
"grad_norm": 2.421875,
"learning_rate": 6.223358795513812e-06,
"loss": 0.118,
"step": 840
},
{
"epoch": 5.811965811965812,
"grad_norm": 1.3515625,
"learning_rate": 6.126481695042392e-06,
"loss": 0.0897,
"step": 850
},
{
"epoch": 5.880341880341881,
"grad_norm": 1.5390625,
"learning_rate": 6.029156987357373e-06,
"loss": 0.1142,
"step": 860
},
{
"epoch": 5.948717948717949,
"grad_norm": 1.78125,
"learning_rate": 5.931423344406478e-06,
"loss": 0.1542,
"step": 870
},
{
"epoch": 6.017094017094017,
"grad_norm": 1.2890625,
"learning_rate": 5.8333196006277536e-06,
"loss": 0.0937,
"step": 880
},
{
"epoch": 6.085470085470085,
"grad_norm": 1.75,
"learning_rate": 5.734884737518714e-06,
"loss": 0.0699,
"step": 890
},
{
"epoch": 6.153846153846154,
"grad_norm": 0.96484375,
"learning_rate": 5.636157868147054e-06,
"loss": 0.0757,
"step": 900
},
{
"epoch": 6.222222222222222,
"grad_norm": 0.84765625,
"learning_rate": 5.537178221609088e-06,
"loss": 0.0657,
"step": 910
},
{
"epoch": 6.2905982905982905,
"grad_norm": 2.328125,
"learning_rate": 5.437985127442065e-06,
"loss": 0.0838,
"step": 920
},
{
"epoch": 6.358974358974359,
"grad_norm": 1.015625,
"learning_rate": 5.338617999996603e-06,
"loss": 0.0876,
"step": 930
},
{
"epoch": 6.427350427350428,
"grad_norm": 1.0,
"learning_rate": 5.239116322775392e-06,
"loss": 0.0652,
"step": 940
},
{
"epoch": 6.495726495726496,
"grad_norm": 0.96484375,
"learning_rate": 5.139519632744443e-06,
"loss": 0.0843,
"step": 950
},
{
"epoch": 6.564102564102564,
"grad_norm": 1.3828125,
"learning_rate": 5.039867504623084e-06,
"loss": 0.0677,
"step": 960
},
{
"epoch": 6.632478632478632,
"grad_norm": 1.0390625,
"learning_rate": 4.940199535158954e-06,
"loss": 0.0764,
"step": 970
},
{
"epoch": 6.700854700854701,
"grad_norm": 0.828125,
"learning_rate": 4.8405553273942415e-06,
"loss": 0.0642,
"step": 980
},
{
"epoch": 6.769230769230769,
"grad_norm": 1.1796875,
"learning_rate": 4.740974474929438e-06,
"loss": 0.0444,
"step": 990
},
{
"epoch": 6.837606837606837,
"grad_norm": 0.83203125,
"learning_rate": 4.641496546190813e-06,
"loss": 0.0713,
"step": 1000
},
{
"epoch": 6.837606837606837,
"eval_loss": 0.19593119621276855,
"eval_runtime": 5.1008,
"eval_samples_per_second": 25.486,
"eval_steps_per_second": 25.486,
"step": 1000
},
{
"epoch": 6.905982905982906,
"grad_norm": 0.4765625,
"learning_rate": 4.542161068707927e-06,
"loss": 0.0494,
"step": 1010
},
{
"epoch": 6.9743589743589745,
"grad_norm": 0.78125,
"learning_rate": 4.443007513407368e-06,
"loss": 0.0492,
"step": 1020
},
{
"epoch": 7.042735042735043,
"grad_norm": 0.73046875,
"learning_rate": 4.344075278928989e-06,
"loss": 0.1084,
"step": 1030
},
{
"epoch": 7.111111111111111,
"grad_norm": 0.78515625,
"learning_rate": 4.245403675970877e-06,
"loss": 0.0605,
"step": 1040
},
{
"epoch": 7.17948717948718,
"grad_norm": 0.79296875,
"learning_rate": 4.147031911669243e-06,
"loss": 0.0422,
"step": 1050
},
{
"epoch": 7.247863247863248,
"grad_norm": 1.796875,
"learning_rate": 4.048999074019493e-06,
"loss": 0.0388,
"step": 1060
},
{
"epoch": 7.316239316239316,
"grad_norm": 0.88671875,
"learning_rate": 3.951344116344606e-06,
"loss": 0.0295,
"step": 1070
},
{
"epoch": 7.384615384615385,
"grad_norm": 0.44140625,
"learning_rate": 3.854105841817056e-06,
"loss": 0.0545,
"step": 1080
},
{
"epoch": 7.452991452991453,
"grad_norm": 0.64453125,
"learning_rate": 3.7573228880403734e-06,
"loss": 0.0337,
"step": 1090
},
{
"epoch": 7.521367521367521,
"grad_norm": 0.5703125,
"learning_rate": 3.661033711696501e-06,
"loss": 0.0507,
"step": 1100
},
{
"epoch": 7.589743589743589,
"grad_norm": 0.6328125,
"learning_rate": 3.5652765732650523e-06,
"loss": 0.0419,
"step": 1110
},
{
"epoch": 7.6581196581196584,
"grad_norm": 1.21875,
"learning_rate": 3.4700895218205026e-06,
"loss": 0.0423,
"step": 1120
},
{
"epoch": 7.726495726495727,
"grad_norm": 1.234375,
"learning_rate": 3.375510379913418e-06,
"loss": 0.0488,
"step": 1130
},
{
"epoch": 7.794871794871795,
"grad_norm": 0.4140625,
"learning_rate": 3.2815767285416576e-06,
"loss": 0.0388,
"step": 1140
},
{
"epoch": 7.863247863247864,
"grad_norm": 0.462890625,
"learning_rate": 3.188325892217587e-06,
"loss": 0.0197,
"step": 1150
},
{
"epoch": 7.931623931623932,
"grad_norm": 0.390625,
"learning_rate": 3.0957949241371845e-06,
"loss": 0.0326,
"step": 1160
},
{
"epoch": 8.0,
"grad_norm": 0.57421875,
"learning_rate": 3.0040205914569664e-06,
"loss": 0.0383,
"step": 1170
},
{
"epoch": 8.068376068376068,
"grad_norm": 0.40234375,
"learning_rate": 2.913039360684565e-06,
"loss": 0.0296,
"step": 1180
},
{
"epoch": 8.136752136752136,
"grad_norm": 0.482421875,
"learning_rate": 2.822887383188775e-06,
"loss": 0.0564,
"step": 1190
},
{
"epoch": 8.205128205128204,
"grad_norm": 0.435546875,
"learning_rate": 2.7336004808348094e-06,
"loss": 0.0295,
"step": 1200
},
{
"epoch": 8.273504273504274,
"grad_norm": 0.435546875,
"learning_rate": 2.645214131750498e-06,
"loss": 0.0194,
"step": 1210
},
{
"epoch": 8.341880341880342,
"grad_norm": 0.2275390625,
"learning_rate": 2.5577634562290567e-06,
"loss": 0.0261,
"step": 1220
},
{
"epoch": 8.41025641025641,
"grad_norm": 0.6171875,
"learning_rate": 2.4712832027740545e-06,
"loss": 0.0237,
"step": 1230
},
{
"epoch": 8.478632478632479,
"grad_norm": 0.15625,
"learning_rate": 2.385807734292097e-06,
"loss": 0.037,
"step": 1240
},
{
"epoch": 8.547008547008547,
"grad_norm": 0.63671875,
"learning_rate": 2.3013710144387374e-06,
"loss": 0.0241,
"step": 1250
},
{
"epoch": 8.615384615384615,
"grad_norm": 0.53515625,
"learning_rate": 2.218006594123028e-06,
"loss": 0.0258,
"step": 1260
},
{
"epoch": 8.683760683760683,
"grad_norm": 0.73828125,
"learning_rate": 2.1357475981760704e-06,
"loss": 0.0361,
"step": 1270
},
{
"epoch": 8.752136752136753,
"grad_norm": 0.333984375,
"learning_rate": 2.0546267121888863e-06,
"loss": 0.0243,
"step": 1280
},
{
"epoch": 8.820512820512821,
"grad_norm": 0.404296875,
"learning_rate": 1.9746761695247803e-06,
"loss": 0.0404,
"step": 1290
},
{
"epoch": 8.88888888888889,
"grad_norm": 0.50390625,
"learning_rate": 1.8959277385114516e-06,
"loss": 0.0284,
"step": 1300
},
{
"epoch": 8.957264957264957,
"grad_norm": 0.265625,
"learning_rate": 1.8184127098178288e-06,
"loss": 0.028,
"step": 1310
},
{
"epoch": 9.025641025641026,
"grad_norm": 0.298828125,
"learning_rate": 1.7421618840207576e-06,
"loss": 0.0174,
"step": 1320
},
{
"epoch": 9.094017094017094,
"grad_norm": 0.47265625,
"learning_rate": 1.667205559366372e-06,
"loss": 0.0293,
"step": 1330
},
{
"epoch": 9.162393162393162,
"grad_norm": 0.2275390625,
"learning_rate": 1.5935735197311204e-06,
"loss": 0.0459,
"step": 1340
},
{
"epoch": 9.23076923076923,
"grad_norm": 0.12890625,
"learning_rate": 1.5212950227871292e-06,
"loss": 0.0197,
"step": 1350
},
{
"epoch": 9.2991452991453,
"grad_norm": 0.2470703125,
"learning_rate": 1.4503987883766857e-06,
"loss": 0.0208,
"step": 1360
},
{
"epoch": 9.367521367521368,
"grad_norm": 0.400390625,
"learning_rate": 1.3809129871004113e-06,
"loss": 0.0332,
"step": 1370
},
{
"epoch": 9.435897435897436,
"grad_norm": 0.625,
"learning_rate": 1.312865229123681e-06,
"loss": 0.0186,
"step": 1380
},
{
"epoch": 9.504273504273504,
"grad_norm": 0.32421875,
"learning_rate": 1.2462825532057394e-06,
"loss": 0.0365,
"step": 1390
},
{
"epoch": 9.572649572649572,
"grad_norm": 0.330078125,
"learning_rate": 1.1811914159558374e-06,
"loss": 0.0172,
"step": 1400
},
{
"epoch": 9.64102564102564,
"grad_norm": 0.1943359375,
"learning_rate": 1.117617681320729e-06,
"loss": 0.0307,
"step": 1410
},
{
"epoch": 9.709401709401709,
"grad_norm": 0.177734375,
"learning_rate": 1.0555866103076212e-06,
"loss": 0.0138,
"step": 1420
},
{
"epoch": 9.777777777777779,
"grad_norm": 0.369140625,
"learning_rate": 9.951228509467248e-07,
"loss": 0.0204,
"step": 1430
},
{
"epoch": 9.846153846153847,
"grad_norm": 0.2421875,
"learning_rate": 9.362504284973683e-07,
"loss": 0.017,
"step": 1440
},
{
"epoch": 9.914529914529915,
"grad_norm": 0.2119140625,
"learning_rate": 8.789927359015643e-07,
"loss": 0.0236,
"step": 1450
},
{
"epoch": 9.982905982905983,
"grad_norm": 0.296875,
"learning_rate": 8.233725244888291e-07,
"loss": 0.0234,
"step": 1460
},
{
"epoch": 10.051282051282051,
"grad_norm": 0.3828125,
"learning_rate": 7.694118949359553e-07,
"loss": 0.015,
"step": 1470
},
{
"epoch": 10.11965811965812,
"grad_norm": 0.1845703125,
"learning_rate": 7.171322884852988e-07,
"loss": 0.0209,
"step": 1480
},
{
"epoch": 10.188034188034187,
"grad_norm": 0.20703125,
"learning_rate": 6.665544784251232e-07,
"loss": 0.0156,
"step": 1490
},
{
"epoch": 10.256410256410255,
"grad_norm": 0.380859375,
"learning_rate": 6.176985618353282e-07,
"loss": 0.0245,
"step": 1500
},
{
"epoch": 10.324786324786325,
"grad_norm": 0.5234375,
"learning_rate": 5.705839516018818e-07,
"loss": 0.0143,
"step": 1510
},
{
"epoch": 10.393162393162394,
"grad_norm": 0.9375,
"learning_rate": 5.252293687031196e-07,
"loss": 0.0279,
"step": 1520
},
{
"epoch": 10.461538461538462,
"grad_norm": 0.1884765625,
"learning_rate": 4.816528347709614e-07,
"loss": 0.0215,
"step": 1530
},
{
"epoch": 10.52991452991453,
"grad_norm": 0.84765625,
"learning_rate": 4.398716649300311e-07,
"loss": 0.0145,
"step": 1540
},
{
"epoch": 10.598290598290598,
"grad_norm": 0.6015625,
"learning_rate": 3.999024609174812e-07,
"loss": 0.0196,
"step": 1550
},
{
"epoch": 10.666666666666666,
"grad_norm": 1.3671875,
"learning_rate": 3.61761104486314e-07,
"loss": 0.0271,
"step": 1560
},
{
"epoch": 10.735042735042736,
"grad_norm": 0.70703125,
"learning_rate": 3.2546275109475554e-07,
"loss": 0.0179,
"step": 1570
},
{
"epoch": 10.803418803418804,
"grad_norm": 0.796875,
"learning_rate": 2.9102182388425106e-07,
"loss": 0.0228,
"step": 1580
},
{
"epoch": 10.871794871794872,
"grad_norm": 0.451171875,
"learning_rate": 2.5845200794842154e-07,
"loss": 0.0217,
"step": 1590
},
{
"epoch": 10.94017094017094,
"grad_norm": 0.95703125,
"learning_rate": 2.2776624489530664e-07,
"loss": 0.0212,
"step": 1600
},
{
"epoch": 11.008547008547009,
"grad_norm": 1.6796875,
"learning_rate": 1.9897672770501198e-07,
"loss": 0.0492,
"step": 1610
},
{
"epoch": 11.076923076923077,
"grad_norm": 2.171875,
"learning_rate": 1.7209489588483396e-07,
"loss": 0.0265,
"step": 1620
},
{
"epoch": 11.145299145299145,
"grad_norm": 2.171875,
"learning_rate": 1.4713143092377534e-07,
"loss": 0.0188,
"step": 1630
},
{
"epoch": 11.213675213675213,
"grad_norm": 2.625,
"learning_rate": 1.2409625204825802e-07,
"loss": 0.025,
"step": 1640
},
{
"epoch": 11.282051282051283,
"grad_norm": 3.328125,
"learning_rate": 1.0299851228072089e-07,
"loss": 0.0186,
"step": 1650
},
{
"epoch": 11.350427350427351,
"grad_norm": 5.40625,
"learning_rate": 8.384659480266733e-08,
"loss": 0.0257,
"step": 1660
},
{
"epoch": 11.418803418803419,
"grad_norm": 4.09375,
"learning_rate": 6.664810962361268e-08,
"loss": 0.0213,
"step": 1670
},
{
"epoch": 11.487179487179487,
"grad_norm": 3.125,
"learning_rate": 5.1409890557246876e-08,
"loss": 0.0137,
"step": 1680
},
{
"epoch": 11.555555555555555,
"grad_norm": 5.21875,
"learning_rate": 3.813799250602046e-08,
"loss": 0.0164,
"step": 1690
},
{
"epoch": 11.623931623931623,
"grad_norm": 0.328125,
"learning_rate": 2.683768905523243e-08,
"loss": 0.0197,
"step": 1700
},
{
"epoch": 11.692307692307692,
"grad_norm": 0.796875,
"learning_rate": 1.7513470377570896e-08,
"loss": 0.0246,
"step": 1710
},
{
"epoch": 11.760683760683762,
"grad_norm": 0.92578125,
"learning_rate": 1.016904144894304e-08,
"loss": 0.0295,
"step": 1720
},
{
"epoch": 11.82905982905983,
"grad_norm": 1.3671875,
"learning_rate": 4.807320576307728e-09,
"loss": 0.029,
"step": 1730
},
{
"epoch": 11.897435897435898,
"grad_norm": 2.203125,
"learning_rate": 1.4304382380819771e-09,
"loss": 0.0177,
"step": 1740
},
{
"epoch": 11.965811965811966,
"grad_norm": 1.0234375,
"learning_rate": 3.9736237600895846e-11,
"loss": 0.0261,
"step": 1750
},
{
"epoch": 11.97948717948718,
"step": 1752,
"total_flos": 9.576163802677248e+16,
"train_loss": 0.4554890414942311,
"train_runtime": 1838.2954,
"train_samples_per_second": 7.638,
"train_steps_per_second": 0.953
}
],
"logging_steps": 10,
"max_steps": 1752,
"num_input_tokens_seen": 0,
"num_train_epochs": 12,
"save_steps": 0,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 9.576163802677248e+16,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}