{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.9986300118732303,
"eval_steps": 200,
"global_step": 4104,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0073066033427710295,
"grad_norm": 1.7583225965499878,
"learning_rate": 2.1897810218978103e-06,
"loss": 3.655,
"step": 10
},
{
"epoch": 0.014613206685542059,
"grad_norm": null,
"learning_rate": 4.379562043795621e-06,
"loss": 4.0596,
"step": 20
},
{
"epoch": 0.02191981002831309,
"grad_norm": 2.3730218410491943,
"learning_rate": 6.812652068126521e-06,
"loss": 4.2837,
"step": 30
},
{
"epoch": 0.029226413371084118,
"grad_norm": 3.1703426837921143,
"learning_rate": 9.24574209245742e-06,
"loss": 3.8455,
"step": 40
},
{
"epoch": 0.03653301671385514,
"grad_norm": 2.987401247024536,
"learning_rate": 1.1678832116788322e-05,
"loss": 3.5357,
"step": 50
},
{
"epoch": 0.04383962005662618,
"grad_norm": 4.219663619995117,
"learning_rate": 1.411192214111922e-05,
"loss": 3.6343,
"step": 60
},
{
"epoch": 0.05114622339939721,
"grad_norm": 4.096388816833496,
"learning_rate": 1.654501216545012e-05,
"loss": 3.4301,
"step": 70
},
{
"epoch": 0.058452826742168236,
"grad_norm": 10.184355735778809,
"learning_rate": 1.8734793187347933e-05,
"loss": 3.2771,
"step": 80
},
{
"epoch": 0.06575943008493926,
"grad_norm": 8.512189865112305,
"learning_rate": 2.1167883211678834e-05,
"loss": 2.9386,
"step": 90
},
{
"epoch": 0.07306603342771029,
"grad_norm": 5.223749160766602,
"learning_rate": 2.360097323600973e-05,
"loss": 2.6416,
"step": 100
},
{
"epoch": 0.08037263677048133,
"grad_norm": 5.041982173919678,
"learning_rate": 2.6034063260340636e-05,
"loss": 2.78,
"step": 110
},
{
"epoch": 0.08767924011325236,
"grad_norm": 8.065332412719727,
"learning_rate": 2.8467153284671533e-05,
"loss": 2.7657,
"step": 120
},
{
"epoch": 0.09498584345602339,
"grad_norm": 2.3226640224456787,
"learning_rate": 3.0900243309002434e-05,
"loss": 2.7316,
"step": 130
},
{
"epoch": 0.10229244679879441,
"grad_norm": 3.4772486686706543,
"learning_rate": 3.3333333333333335e-05,
"loss": 2.6205,
"step": 140
},
{
"epoch": 0.10959905014156544,
"grad_norm": 2.8939208984375,
"learning_rate": 3.5766423357664236e-05,
"loss": 2.648,
"step": 150
},
{
"epoch": 0.11690565348433647,
"grad_norm": 2.951481342315674,
"learning_rate": 3.819951338199514e-05,
"loss": 2.6972,
"step": 160
},
{
"epoch": 0.1242122568271075,
"grad_norm": 2.277855157852173,
"learning_rate": 4.063260340632604e-05,
"loss": 2.4787,
"step": 170
},
{
"epoch": 0.13151886016987852,
"grad_norm": 9.363456726074219,
"learning_rate": 4.306569343065693e-05,
"loss": 2.6347,
"step": 180
},
{
"epoch": 0.13882546351264954,
"grad_norm": 3.6133177280426025,
"learning_rate": 4.549878345498784e-05,
"loss": 2.656,
"step": 190
},
{
"epoch": 0.14613206685542057,
"grad_norm": 4.037402153015137,
"learning_rate": 4.793187347931874e-05,
"loss": 2.5512,
"step": 200
},
{
"epoch": 0.14613206685542057,
"eval_loss": 2.5883829593658447,
"eval_runtime": 109.0986,
"eval_samples_per_second": 11.155,
"eval_steps_per_second": 11.155,
"step": 200
},
{
"epoch": 0.15343867019819163,
"grad_norm": 3.737877368927002,
"learning_rate": 5.036496350364964e-05,
"loss": 2.5724,
"step": 210
},
{
"epoch": 0.16074527354096266,
"grad_norm": 3.6852433681488037,
"learning_rate": 5.279805352798054e-05,
"loss": 2.6093,
"step": 220
},
{
"epoch": 0.1680518768837337,
"grad_norm": 4.3811936378479,
"learning_rate": 5.5231143552311436e-05,
"loss": 2.5968,
"step": 230
},
{
"epoch": 0.17535848022650471,
"grad_norm": 4.019372940063477,
"learning_rate": 5.766423357664234e-05,
"loss": 2.7364,
"step": 240
},
{
"epoch": 0.18266508356927574,
"grad_norm": 4.277368068695068,
"learning_rate": 6.0097323600973245e-05,
"loss": 2.7445,
"step": 250
},
{
"epoch": 0.18997168691204677,
"grad_norm": 11.767447471618652,
"learning_rate": 6.253041362530415e-05,
"loss": 2.6712,
"step": 260
},
{
"epoch": 0.1972782902548178,
"grad_norm": 3.2719593048095703,
"learning_rate": 6.496350364963504e-05,
"loss": 2.5184,
"step": 270
},
{
"epoch": 0.20458489359758883,
"grad_norm": 3.923417091369629,
"learning_rate": 6.739659367396593e-05,
"loss": 2.6089,
"step": 280
},
{
"epoch": 0.21189149694035986,
"grad_norm": 5.958123207092285,
"learning_rate": 6.982968369829684e-05,
"loss": 2.5422,
"step": 290
},
{
"epoch": 0.2191981002831309,
"grad_norm": 4.775500774383545,
"learning_rate": 7.226277372262774e-05,
"loss": 2.3889,
"step": 300
},
{
"epoch": 0.22650470362590192,
"grad_norm": 7.674183368682861,
"learning_rate": 7.469586374695864e-05,
"loss": 2.4629,
"step": 310
},
{
"epoch": 0.23381130696867294,
"grad_norm": 5.1322760581970215,
"learning_rate": 7.712895377128954e-05,
"loss": 2.5144,
"step": 320
},
{
"epoch": 0.24111791031144397,
"grad_norm": 5.469252586364746,
"learning_rate": 7.956204379562045e-05,
"loss": 2.5871,
"step": 330
},
{
"epoch": 0.248424513654215,
"grad_norm": 4.472785472869873,
"learning_rate": 8.199513381995134e-05,
"loss": 2.4764,
"step": 340
},
{
"epoch": 0.25573111699698603,
"grad_norm": 7.12663459777832,
"learning_rate": 8.442822384428223e-05,
"loss": 2.5562,
"step": 350
},
{
"epoch": 0.26303772033975703,
"grad_norm": 3.2579259872436523,
"learning_rate": 8.686131386861314e-05,
"loss": 2.4707,
"step": 360
},
{
"epoch": 0.2703443236825281,
"grad_norm": 4.707943916320801,
"learning_rate": 8.929440389294405e-05,
"loss": 2.5207,
"step": 370
},
{
"epoch": 0.2776509270252991,
"grad_norm": 8.757339477539062,
"learning_rate": 9.172749391727494e-05,
"loss": 2.5424,
"step": 380
},
{
"epoch": 0.28495753036807014,
"grad_norm": 6.260244369506836,
"learning_rate": 9.416058394160584e-05,
"loss": 2.5552,
"step": 390
},
{
"epoch": 0.29226413371084115,
"grad_norm": 3.4638991355895996,
"learning_rate": 9.659367396593674e-05,
"loss": 2.3861,
"step": 400
},
{
"epoch": 0.29226413371084115,
"eval_loss": 2.477276563644409,
"eval_runtime": 107.9591,
"eval_samples_per_second": 11.273,
"eval_steps_per_second": 11.273,
"step": 400
},
{
"epoch": 0.2995707370536122,
"grad_norm": 4.783712863922119,
"learning_rate": 9.902676399026765e-05,
"loss": 2.5961,
"step": 410
},
{
"epoch": 0.30687734039638326,
"grad_norm": 6.2626729011535645,
"learning_rate": 9.999934869757279e-05,
"loss": 2.5863,
"step": 420
},
{
"epoch": 0.31418394373915426,
"grad_norm": 4.87408971786499,
"learning_rate": 9.999536857752013e-05,
"loss": 2.3565,
"step": 430
},
{
"epoch": 0.3214905470819253,
"grad_norm": 3.7447562217712402,
"learning_rate": 9.998777045977545e-05,
"loss": 2.4354,
"step": 440
},
{
"epoch": 0.3287971504246963,
"grad_norm": 5.334987640380859,
"learning_rate": 9.997655489418913e-05,
"loss": 2.412,
"step": 450
},
{
"epoch": 0.3361037537674674,
"grad_norm": 5.347410678863525,
"learning_rate": 9.996172269239417e-05,
"loss": 2.4486,
"step": 460
},
{
"epoch": 0.3434103571102384,
"grad_norm": 5.353787899017334,
"learning_rate": 9.99432749277474e-05,
"loss": 2.3383,
"step": 470
},
{
"epoch": 0.35071696045300943,
"grad_norm": 5.850295543670654,
"learning_rate": 9.992121293525189e-05,
"loss": 2.3838,
"step": 480
},
{
"epoch": 0.35802356379578043,
"grad_norm": 4.7518792152404785,
"learning_rate": 9.98955383114603e-05,
"loss": 2.427,
"step": 490
},
{
"epoch": 0.3653301671385515,
"grad_norm": 7.027746677398682,
"learning_rate": 9.986625291435933e-05,
"loss": 2.608,
"step": 500
},
{
"epoch": 0.3726367704813225,
"grad_norm": 5.194112300872803,
"learning_rate": 9.983335886323523e-05,
"loss": 2.5837,
"step": 510
},
{
"epoch": 0.37994337382409354,
"grad_norm": 5.106868743896484,
"learning_rate": 9.979685853852057e-05,
"loss": 2.4636,
"step": 520
},
{
"epoch": 0.38724997716686455,
"grad_norm": 4.584632396697998,
"learning_rate": 9.975675458162177e-05,
"loss": 2.3892,
"step": 530
},
{
"epoch": 0.3945565805096356,
"grad_norm": 3.168287992477417,
"learning_rate": 9.971304989472819e-05,
"loss": 2.4344,
"step": 540
},
{
"epoch": 0.4018631838524066,
"grad_norm": 5.590039253234863,
"learning_rate": 9.966574764060186e-05,
"loss": 2.377,
"step": 550
},
{
"epoch": 0.40916978719517766,
"grad_norm": 3.9408605098724365,
"learning_rate": 9.961485124234881e-05,
"loss": 2.4833,
"step": 560
},
{
"epoch": 0.41647639053794866,
"grad_norm": 3.6905062198638916,
"learning_rate": 9.956036438317124e-05,
"loss": 2.5575,
"step": 570
},
{
"epoch": 0.4237829938807197,
"grad_norm": 5.282037258148193,
"learning_rate": 9.9502291006101e-05,
"loss": 2.438,
"step": 580
},
{
"epoch": 0.4310895972234907,
"grad_norm": 3.3918325901031494,
"learning_rate": 9.944063531371423e-05,
"loss": 2.5227,
"step": 590
},
{
"epoch": 0.4383962005662618,
"grad_norm": 3.809112787246704,
"learning_rate": 9.937540176782732e-05,
"loss": 2.5938,
"step": 600
},
{
"epoch": 0.4383962005662618,
"eval_loss": 2.399010181427002,
"eval_runtime": 107.2947,
"eval_samples_per_second": 11.343,
"eval_steps_per_second": 11.343,
"step": 600
},
{
"epoch": 0.4457028039090328,
"grad_norm": 6.199471473693848,
"learning_rate": 9.930659508917388e-05,
"loss": 2.4369,
"step": 610
},
{
"epoch": 0.45300940725180383,
"grad_norm": 4.105823993682861,
"learning_rate": 9.923422025706323e-05,
"loss": 2.5947,
"step": 620
},
{
"epoch": 0.46031601059457483,
"grad_norm": 4.27271842956543,
"learning_rate": 9.915828250902004e-05,
"loss": 2.3976,
"step": 630
},
{
"epoch": 0.4676226139373459,
"grad_norm": 3.6033222675323486,
"learning_rate": 9.907878734040525e-05,
"loss": 2.3318,
"step": 640
},
{
"epoch": 0.4749292172801169,
"grad_norm": 3.8373398780822754,
"learning_rate": 9.89957405040185e-05,
"loss": 2.4319,
"step": 650
},
{
"epoch": 0.48223582062288795,
"grad_norm": 3.295011520385742,
"learning_rate": 9.890914800968171e-05,
"loss": 2.2978,
"step": 660
},
{
"epoch": 0.48954242396565895,
"grad_norm": 3.935311794281006,
"learning_rate": 9.88190161238042e-05,
"loss": 2.4859,
"step": 670
},
{
"epoch": 0.49684902730843,
"grad_norm": 4.868900299072266,
"learning_rate": 9.872535136892926e-05,
"loss": 2.3433,
"step": 680
},
{
"epoch": 0.5041556306512011,
"grad_norm": 6.791601181030273,
"learning_rate": 9.862816052326209e-05,
"loss": 2.3252,
"step": 690
},
{
"epoch": 0.5114622339939721,
"grad_norm": 2.8761675357818604,
"learning_rate": 9.852745062017927e-05,
"loss": 2.4546,
"step": 700
},
{
"epoch": 0.5187688373367431,
"grad_norm": 7.200977802276611,
"learning_rate": 9.84338089316251e-05,
"loss": 2.462,
"step": 710
},
{
"epoch": 0.5260754406795141,
"grad_norm": 5.1391825675964355,
"learning_rate": 9.832643310805385e-05,
"loss": 2.601,
"step": 720
},
{
"epoch": 0.5333820440222852,
"grad_norm": 3.5819122791290283,
"learning_rate": 9.821556006207133e-05,
"loss": 2.3209,
"step": 730
},
{
"epoch": 0.5406886473650562,
"grad_norm": 3.3951661586761475,
"learning_rate": 9.810119781718924e-05,
"loss": 2.1652,
"step": 740
},
{
"epoch": 0.5479952507078272,
"grad_norm": 5.344549655914307,
"learning_rate": 9.798335464942094e-05,
"loss": 2.3954,
"step": 750
},
{
"epoch": 0.5553018540505982,
"grad_norm": 5.5213446617126465,
"learning_rate": 9.786203908668255e-05,
"loss": 2.1831,
"step": 760
},
{
"epoch": 0.5626084573933693,
"grad_norm": 4.242595195770264,
"learning_rate": 9.773725990817575e-05,
"loss": 2.4479,
"step": 770
},
{
"epoch": 0.5699150607361403,
"grad_norm": 5.442046165466309,
"learning_rate": 9.76090261437526e-05,
"loss": 2.2424,
"step": 780
},
{
"epoch": 0.5772216640789113,
"grad_norm": 5.669281482696533,
"learning_rate": 9.747734707326195e-05,
"loss": 2.1442,
"step": 790
},
{
"epoch": 0.5845282674216823,
"grad_norm": 6.425075531005859,
"learning_rate": 9.734223222587792e-05,
"loss": 2.3384,
"step": 800
},
{
"epoch": 0.5845282674216823,
"eval_loss": 2.348578691482544,
"eval_runtime": 108.5263,
"eval_samples_per_second": 11.214,
"eval_steps_per_second": 11.214,
"step": 800
},
{
"epoch": 0.5918348707644534,
"grad_norm": 5.118699073791504,
"learning_rate": 9.720369137941034e-05,
"loss": 2.4253,
"step": 810
},
{
"epoch": 0.5991414741072244,
"grad_norm": 6.51192569732666,
"learning_rate": 9.706173455959715e-05,
"loss": 2.5238,
"step": 820
},
{
"epoch": 0.6064480774499954,
"grad_norm": 4.507262706756592,
"learning_rate": 9.69163720393788e-05,
"loss": 2.4986,
"step": 830
},
{
"epoch": 0.6137546807927665,
"grad_norm": 5.540140151977539,
"learning_rate": 9.676761433815498e-05,
"loss": 2.2241,
"step": 840
},
{
"epoch": 0.6210612841355375,
"grad_norm": 3.475051164627075,
"learning_rate": 9.661547222102323e-05,
"loss": 2.3395,
"step": 850
},
{
"epoch": 0.6283678874783085,
"grad_norm": 5.291077136993408,
"learning_rate": 9.645995669799995e-05,
"loss": 2.2988,
"step": 860
},
{
"epoch": 0.6356744908210795,
"grad_norm": 3.973806858062744,
"learning_rate": 9.630107902322367e-05,
"loss": 2.3554,
"step": 870
},
{
"epoch": 0.6429810941638506,
"grad_norm": 4.875363349914551,
"learning_rate": 9.613885069414061e-05,
"loss": 2.3115,
"step": 880
},
{
"epoch": 0.6502876975066216,
"grad_norm": 4.581294059753418,
"learning_rate": 9.597328345067259e-05,
"loss": 2.1619,
"step": 890
},
{
"epoch": 0.6575943008493926,
"grad_norm": 5.785318374633789,
"learning_rate": 9.580438927436756e-05,
"loss": 2.3814,
"step": 900
},
{
"epoch": 0.6649009041921636,
"grad_norm": 8.992704391479492,
"learning_rate": 9.563218038753246e-05,
"loss": 2.5302,
"step": 910
},
{
"epoch": 0.6722075075349347,
"grad_norm": 8.763484954833984,
"learning_rate": 9.545666925234873e-05,
"loss": 2.5443,
"step": 920
},
{
"epoch": 0.6795141108777057,
"grad_norm": 4.8247456550598145,
"learning_rate": 9.52778685699705e-05,
"loss": 2.269,
"step": 930
},
{
"epoch": 0.6868207142204767,
"grad_norm": 4.4191131591796875,
"learning_rate": 9.509579127960543e-05,
"loss": 2.0768,
"step": 940
},
{
"epoch": 0.6941273175632477,
"grad_norm": 3.7724087238311768,
"learning_rate": 9.491045055757836e-05,
"loss": 2.2629,
"step": 950
},
{
"epoch": 0.7014339209060189,
"grad_norm": 4.011133193969727,
"learning_rate": 9.472185981637775e-05,
"loss": 2.3676,
"step": 960
},
{
"epoch": 0.7087405242487899,
"grad_norm": 3.6976683139801025,
"learning_rate": 9.45300327036851e-05,
"loss": 2.3744,
"step": 970
},
{
"epoch": 0.7160471275915609,
"grad_norm": 4.968256950378418,
"learning_rate": 9.433498310138728e-05,
"loss": 2.1978,
"step": 980
},
{
"epoch": 0.7233537309343319,
"grad_norm": 32.12665939331055,
"learning_rate": 9.413672512457197e-05,
"loss": 2.1205,
"step": 990
},
{
"epoch": 0.730660334277103,
"grad_norm": 3.214895248413086,
"learning_rate": 9.393527312050618e-05,
"loss": 2.3232,
"step": 1000
},
{
"epoch": 0.730660334277103,
"eval_loss": 2.3093771934509277,
"eval_runtime": 109.2454,
"eval_samples_per_second": 11.14,
"eval_steps_per_second": 11.14,
"step": 1000
},
{
"epoch": 0.737966937619874,
"grad_norm": 5.2641072273254395,
"learning_rate": 9.373064166759803e-05,
"loss": 2.2691,
"step": 1010
},
{
"epoch": 0.745273540962645,
"grad_norm": 4.43721866607666,
"learning_rate": 9.352284557434166e-05,
"loss": 2.3682,
"step": 1020
},
{
"epoch": 0.752580144305416,
"grad_norm": 3.769683599472046,
"learning_rate": 9.331189987824569e-05,
"loss": 2.2732,
"step": 1030
},
{
"epoch": 0.7598867476481871,
"grad_norm": 5.2646002769470215,
"learning_rate": 9.309781984474497e-05,
"loss": 2.1909,
"step": 1040
},
{
"epoch": 0.7671933509909581,
"grad_norm": 4.442707538604736,
"learning_rate": 9.288062096609588e-05,
"loss": 2.2229,
"step": 1050
},
{
"epoch": 0.7744999543337291,
"grad_norm": 7.33398962020874,
"learning_rate": 9.266031896025516e-05,
"loss": 2.3366,
"step": 1060
},
{
"epoch": 0.7818065576765001,
"grad_norm": 4.366443634033203,
"learning_rate": 9.243692976974254e-05,
"loss": 2.0555,
"step": 1070
},
{
"epoch": 0.7891131610192712,
"grad_norm": 6.721280097961426,
"learning_rate": 9.221046956048696e-05,
"loss": 2.2303,
"step": 1080
},
{
"epoch": 0.7964197643620422,
"grad_norm": 4.3517746925354,
"learning_rate": 9.198095472065668e-05,
"loss": 2.3249,
"step": 1090
},
{
"epoch": 0.8037263677048132,
"grad_norm": 5.425954818725586,
"learning_rate": 9.174840185947345e-05,
"loss": 2.2346,
"step": 1100
},
{
"epoch": 0.8110329710475842,
"grad_norm": 4.331856727600098,
"learning_rate": 9.151282780601039e-05,
"loss": 2.4686,
"step": 1110
},
{
"epoch": 0.8183395743903553,
"grad_norm": 7.6655707359313965,
"learning_rate": 9.127424960797424e-05,
"loss": 2.3503,
"step": 1120
},
{
"epoch": 0.8256461777331263,
"grad_norm": 3.6036481857299805,
"learning_rate": 9.103268453047165e-05,
"loss": 2.3279,
"step": 1130
},
{
"epoch": 0.8329527810758973,
"grad_norm": 7.087502479553223,
"learning_rate": 9.078815005475974e-05,
"loss": 2.4316,
"step": 1140
},
{
"epoch": 0.8402593844186683,
"grad_norm": 3.372032880783081,
"learning_rate": 9.054066387698104e-05,
"loss": 2.3761,
"step": 1150
},
{
"epoch": 0.8475659877614394,
"grad_norm": 5.309089660644531,
"learning_rate": 9.02902439068829e-05,
"loss": 2.4221,
"step": 1160
},
{
"epoch": 0.8548725911042104,
"grad_norm": 4.797391414642334,
"learning_rate": 9.003690826652143e-05,
"loss": 2.2968,
"step": 1170
},
{
"epoch": 0.8621791944469814,
"grad_norm": 3.347250461578369,
"learning_rate": 8.978067528895003e-05,
"loss": 2.2259,
"step": 1180
},
{
"epoch": 0.8694857977897524,
"grad_norm": 2.894286870956421,
"learning_rate": 8.95215635168927e-05,
"loss": 2.3369,
"step": 1190
},
{
"epoch": 0.8767924011325235,
"grad_norm": 5.936110019683838,
"learning_rate": 8.925959170140218e-05,
"loss": 2.2603,
"step": 1200
},
{
"epoch": 0.8767924011325235,
"eval_loss": 2.2486681938171387,
"eval_runtime": 109.2268,
"eval_samples_per_second": 11.142,
"eval_steps_per_second": 11.142,
"step": 1200
},
{
"epoch": 0.8840990044752945,
"grad_norm": 4.270968437194824,
"learning_rate": 8.899477880050306e-05,
"loss": 2.3473,
"step": 1210
},
{
"epoch": 0.8914056078180655,
"grad_norm": 5.154277324676514,
"learning_rate": 8.872714397781965e-05,
"loss": 2.3085,
"step": 1220
},
{
"epoch": 0.8987122111608367,
"grad_norm": 4.050198078155518,
"learning_rate": 8.84567066011894e-05,
"loss": 2.2946,
"step": 1230
},
{
"epoch": 0.9060188145036077,
"grad_norm": 11.016803741455078,
"learning_rate": 8.818348624126122e-05,
"loss": 2.1233,
"step": 1240
},
{
"epoch": 0.9133254178463787,
"grad_norm": 4.9638447761535645,
"learning_rate": 8.790750267007918e-05,
"loss": 2.1703,
"step": 1250
},
{
"epoch": 0.9206320211891497,
"grad_norm": 5.055572032928467,
"learning_rate": 8.762877585965172e-05,
"loss": 2.3752,
"step": 1260
},
{
"epoch": 0.9279386245319208,
"grad_norm": 5.373875617980957,
"learning_rate": 8.734732598050637e-05,
"loss": 2.2983,
"step": 1270
},
{
"epoch": 0.9352452278746918,
"grad_norm": 4.6786723136901855,
"learning_rate": 8.706317340022997e-05,
"loss": 2.2051,
"step": 1280
},
{
"epoch": 0.9425518312174628,
"grad_norm": 3.9016830921173096,
"learning_rate": 8.677633868199487e-05,
"loss": 2.1745,
"step": 1290
},
{
"epoch": 0.9498584345602338,
"grad_norm": 3.027470350265503,
"learning_rate": 8.648684258307076e-05,
"loss": 2.2724,
"step": 1300
},
{
"epoch": 0.9571650379030049,
"grad_norm": 4.489301681518555,
"learning_rate": 8.619470605332253e-05,
"loss": 2.2487,
"step": 1310
},
{
"epoch": 0.9644716412457759,
"grad_norm": 4.5948686599731445,
"learning_rate": 8.589995023369429e-05,
"loss": 2.2639,
"step": 1320
},
{
"epoch": 0.9717782445885469,
"grad_norm": 5.749096393585205,
"learning_rate": 8.560259645467928e-05,
"loss": 2.2842,
"step": 1330
},
{
"epoch": 0.9790848479313179,
"grad_norm": 3.869464635848999,
"learning_rate": 8.53026662347765e-05,
"loss": 2.3613,
"step": 1340
},
{
"epoch": 0.986391451274089,
"grad_norm": 6.761131763458252,
"learning_rate": 8.500018127893329e-05,
"loss": 2.1473,
"step": 1350
},
{
"epoch": 0.99369805461686,
"grad_norm": 3.230255365371704,
"learning_rate": 8.469516347697473e-05,
"loss": 2.2302,
"step": 1360
},
{
"epoch": 1.001004657959631,
"grad_norm": 4.553093433380127,
"learning_rate": 8.438763490201946e-05,
"loss": 2.1864,
"step": 1370
},
{
"epoch": 1.0083112613024021,
"grad_norm": 4.498405456542969,
"learning_rate": 8.407761780888244e-05,
"loss": 2.209,
"step": 1380
},
{
"epoch": 1.015617864645173,
"grad_norm": 5.025854110717773,
"learning_rate": 8.37651346324643e-05,
"loss": 2.2477,
"step": 1390
},
{
"epoch": 1.0229244679879441,
"grad_norm": 4.791236400604248,
"learning_rate": 8.345020798612791e-05,
"loss": 2.1007,
"step": 1400
},
{
"epoch": 1.0229244679879441,
"eval_loss": 2.2063653469085693,
"eval_runtime": 108.5908,
"eval_samples_per_second": 11.207,
"eval_steps_per_second": 11.207,
"step": 1400
},
{
"epoch": 1.0302310713307152,
"grad_norm": 4.35936164855957,
"learning_rate": 8.313286066006187e-05,
"loss": 2.1783,
"step": 1410
},
{
"epoch": 1.0375376746734861,
"grad_norm": 4.893729209899902,
"learning_rate": 8.28131156196313e-05,
"loss": 2.0583,
"step": 1420
},
{
"epoch": 1.0448442780162572,
"grad_norm": 9.001852989196777,
"learning_rate": 8.249099600371591e-05,
"loss": 2.0463,
"step": 1430
},
{
"epoch": 1.0521508813590281,
"grad_norm": 4.704784393310547,
"learning_rate": 8.216652512303543e-05,
"loss": 2.255,
"step": 1440
},
{
"epoch": 1.0594574847017992,
"grad_norm": 4.346590995788574,
"learning_rate": 8.183972645846283e-05,
"loss": 2.2898,
"step": 1450
},
{
"epoch": 1.0667640880445703,
"grad_norm": 6.388704299926758,
"learning_rate": 8.1510623659325e-05,
"loss": 2.0351,
"step": 1460
},
{
"epoch": 1.0740706913873412,
"grad_norm": 5.978973388671875,
"learning_rate": 8.117924054169133e-05,
"loss": 2.091,
"step": 1470
},
{
"epoch": 1.0813772947301123,
"grad_norm": 3.6497621536254883,
"learning_rate": 8.084560108665024e-05,
"loss": 2.0603,
"step": 1480
},
{
"epoch": 1.0886838980728835,
"grad_norm": 5.628828048706055,
"learning_rate": 8.050972943857375e-05,
"loss": 2.1513,
"step": 1490
},
{
"epoch": 1.0959905014156544,
"grad_norm": 15.277828216552734,
"learning_rate": 8.017164990337026e-05,
"loss": 2.3273,
"step": 1500
},
{
"epoch": 1.1032971047584255,
"grad_norm": 7.737220764160156,
"learning_rate": 7.983138694672552e-05,
"loss": 2.2664,
"step": 1510
},
{
"epoch": 1.1106037081011966,
"grad_norm": 6.418485164642334,
"learning_rate": 7.948896519233225e-05,
"loss": 2.1581,
"step": 1520
},
{
"epoch": 1.1179103114439675,
"grad_norm": 3.803318738937378,
"learning_rate": 7.914440942010807e-05,
"loss": 2.2859,
"step": 1530
},
{
"epoch": 1.1252169147867386,
"grad_norm": 3.083667278289795,
"learning_rate": 7.879774456440243e-05,
"loss": 2.1394,
"step": 1540
},
{
"epoch": 1.1325235181295095,
"grad_norm": 13.434645652770996,
"learning_rate": 7.844899571219202e-05,
"loss": 2.0644,
"step": 1550
},
{
"epoch": 1.1398301214722806,
"grad_norm": 5.204081058502197,
"learning_rate": 7.809818810126545e-05,
"loss": 2.055,
"step": 1560
},
{
"epoch": 1.1471367248150517,
"grad_norm": 4.114535808563232,
"learning_rate": 7.774534711839677e-05,
"loss": 2.0104,
"step": 1570
},
{
"epoch": 1.1544433281578226,
"grad_norm": 5.424691677093506,
"learning_rate": 7.73904982975084e-05,
"loss": 2.0453,
"step": 1580
},
{
"epoch": 1.1617499315005937,
"grad_norm": 4.465676784515381,
"learning_rate": 7.703366731782327e-05,
"loss": 2.2777,
"step": 1590
},
{
"epoch": 1.1690565348433646,
"grad_norm": 3.521860361099243,
"learning_rate": 7.667488000200649e-05,
"loss": 2.0635,
"step": 1600
},
{
"epoch": 1.1690565348433646,
"eval_loss": 2.1665337085723877,
"eval_runtime": 109.3334,
"eval_samples_per_second": 11.131,
"eval_steps_per_second": 11.131,
"step": 1600
},
{
"epoch": 1.1763631381861357,
"grad_norm": 5.549830913543701,
"learning_rate": 7.631416231429672e-05,
"loss": 2.0994,
"step": 1610
},
{
"epoch": 1.1836697415289068,
"grad_norm": 6.577401638031006,
"learning_rate": 7.595154035862715e-05,
"loss": 2.0379,
"step": 1620
},
{
"epoch": 1.1909763448716777,
"grad_norm": 6.858834743499756,
"learning_rate": 7.558704037673648e-05,
"loss": 2.1925,
"step": 1630
},
{
"epoch": 1.1982829482144488,
"grad_norm": 4.788407325744629,
"learning_rate": 7.522068874626988e-05,
"loss": 2.1162,
"step": 1640
},
{
"epoch": 1.20558955155722,
"grad_norm": 3.5293450355529785,
"learning_rate": 7.48525119788702e-05,
"loss": 2.1279,
"step": 1650
},
{
"epoch": 1.2128961548999908,
"grad_norm": 4.138194561004639,
"learning_rate": 7.448253671825927e-05,
"loss": 2.1242,
"step": 1660
},
{
"epoch": 1.220202758242762,
"grad_norm": 3.9674646854400635,
"learning_rate": 7.411078973830987e-05,
"loss": 2.1451,
"step": 1670
},
{
"epoch": 1.227509361585533,
"grad_norm": 10.673015594482422,
"learning_rate": 7.373729794110826e-05,
"loss": 2.0227,
"step": 1680
},
{
"epoch": 1.234815964928304,
"grad_norm": 5.9692511558532715,
"learning_rate": 7.33620883550072e-05,
"loss": 2.191,
"step": 1690
},
{
"epoch": 1.242122568271075,
"grad_norm": 8.75203800201416,
"learning_rate": 7.298518813267015e-05,
"loss": 1.9689,
"step": 1700
},
{
"epoch": 1.249429171613846,
"grad_norm": 6.571184158325195,
"learning_rate": 7.260662454910621e-05,
"loss": 2.0869,
"step": 1710
},
{
"epoch": 1.256735774956617,
"grad_norm": 6.252017498016357,
"learning_rate": 7.222642499969646e-05,
"loss": 2.0596,
"step": 1720
},
{
"epoch": 1.2640423782993881,
"grad_norm": 7.060279369354248,
"learning_rate": 7.184461699821126e-05,
"loss": 2.211,
"step": 1730
},
{
"epoch": 1.271348981642159,
"grad_norm": 5.095629692077637,
"learning_rate": 7.14612281748193e-05,
"loss": 2.0639,
"step": 1740
},
{
"epoch": 1.2786555849849301,
"grad_norm": 3.8237876892089844,
"learning_rate": 7.107628627408813e-05,
"loss": 2.0824,
"step": 1750
},
{
"epoch": 1.285962188327701,
"grad_norm": 4.18397331237793,
"learning_rate": 7.068981915297626e-05,
"loss": 2.0253,
"step": 1760
},
{
"epoch": 1.2932687916704722,
"grad_norm": 4.53864049911499,
"learning_rate": 7.030185477881726e-05,
"loss": 2.1168,
"step": 1770
},
{
"epoch": 1.3005753950132433,
"grad_norm": 6.684192657470703,
"learning_rate": 6.991242122729597e-05,
"loss": 1.9231,
"step": 1780
},
{
"epoch": 1.3078819983560144,
"grad_norm": 6.962904453277588,
"learning_rate": 6.952154668041666e-05,
"loss": 1.9549,
"step": 1790
},
{
"epoch": 1.3151886016987853,
"grad_norm": 8.408102035522461,
"learning_rate": 6.91292594244636e-05,
"loss": 2.1212,
"step": 1800
},
{
"epoch": 1.3151886016987853,
"eval_loss": 2.135313034057617,
"eval_runtime": 108.3855,
"eval_samples_per_second": 11.228,
"eval_steps_per_second": 11.228,
"step": 1800
},
{
"epoch": 1.3224952050415564,
"grad_norm": 4.266064167022705,
"learning_rate": 6.873558784795412e-05,
"loss": 1.9563,
"step": 1810
},
{
"epoch": 1.3298018083843273,
"grad_norm": 5.321630001068115,
"learning_rate": 6.834056043958419e-05,
"loss": 2.3073,
"step": 1820
},
{
"epoch": 1.3371084117270984,
"grad_norm": 5.667231559753418,
"learning_rate": 6.794420578616679e-05,
"loss": 2.0931,
"step": 1830
},
{
"epoch": 1.3444150150698695,
"grad_norm": 5.461520671844482,
"learning_rate": 6.754655257056322e-05,
"loss": 2.0288,
"step": 1840
},
{
"epoch": 1.3517216184126404,
"grad_norm": 3.88820743560791,
"learning_rate": 6.71476295696073e-05,
"loss": 2.0259,
"step": 1850
},
{
"epoch": 1.3590282217554115,
"grad_norm": 5.630788803100586,
"learning_rate": 6.674746565202309e-05,
"loss": 1.9281,
"step": 1860
},
{
"epoch": 1.3663348250981824,
"grad_norm": 3.890331745147705,
"learning_rate": 6.634608977633555e-05,
"loss": 2.1578,
"step": 1870
},
{
"epoch": 1.3736414284409535,
"grad_norm": 3.679614782333374,
"learning_rate": 6.594353098877503e-05,
"loss": 2.1456,
"step": 1880
},
{
"epoch": 1.3809480317837246,
"grad_norm": 5.798458099365234,
"learning_rate": 6.553981842117526e-05,
"loss": 2.0121,
"step": 1890
},
{
"epoch": 1.3882546351264955,
"grad_norm": 4.401270389556885,
"learning_rate": 6.513498128886515e-05,
"loss": 2.0316,
"step": 1900
},
{
"epoch": 1.3955612384692666,
"grad_norm": 6.3201823234558105,
"learning_rate": 6.472904888855463e-05,
"loss": 2.1175,
"step": 1910
},
{
"epoch": 1.4028678418120375,
"grad_norm": 5.530188083648682,
"learning_rate": 6.432205059621449e-05,
"loss": 2.0955,
"step": 1920
},
{
"epoch": 1.4101744451548086,
"grad_norm": 5.705748558044434,
"learning_rate": 6.391401586495059e-05,
"loss": 2.0269,
"step": 1930
},
{
"epoch": 1.4174810484975797,
"grad_norm": 5.124378204345703,
"learning_rate": 6.350497422287236e-05,
"loss": 2.2386,
"step": 1940
},
{
"epoch": 1.4247876518403508,
"grad_norm": 7.351918697357178,
"learning_rate": 6.309495527095606e-05,
"loss": 2.1977,
"step": 1950
},
{
"epoch": 1.4320942551831217,
"grad_norm": 5.076743125915527,
"learning_rate": 6.268398868090255e-05,
"loss": 2.117,
"step": 1960
},
{
"epoch": 1.4394008585258928,
"grad_norm": 3.8619604110717773,
"learning_rate": 6.227210419299014e-05,
"loss": 1.9848,
"step": 1970
},
{
"epoch": 1.4467074618686637,
"grad_norm": 3.990159034729004,
"learning_rate": 6.185933161392228e-05,
"loss": 2.0853,
"step": 1980
},
{
"epoch": 1.4540140652114348,
"grad_norm": 6.388422966003418,
"learning_rate": 6.144570081467066e-05,
"loss": 2.0883,
"step": 1990
},
{
"epoch": 1.461320668554206,
"grad_norm": 5.564253807067871,
"learning_rate": 6.103124172831346e-05,
"loss": 2.038,
"step": 2000
},
{
"epoch": 1.461320668554206,
"eval_loss": 2.0880391597747803,
"eval_runtime": 109.0271,
"eval_samples_per_second": 11.162,
"eval_steps_per_second": 11.162,
"step": 2000
},
{
"epoch": 1.4686272718969768,
"grad_norm": 6.205707550048828,
"learning_rate": 6.061598434786926e-05,
"loss": 1.9301,
"step": 2010
},
{
"epoch": 1.475933875239748,
"grad_norm": 7.6283111572265625,
"learning_rate": 6.019995872412649e-05,
"loss": 2.0155,
"step": 2020
},
{
"epoch": 1.4832404785825188,
"grad_norm": 6.749846458435059,
"learning_rate": 5.9783194963468784e-05,
"loss": 1.9461,
"step": 2030
},
{
"epoch": 1.49054708192529,
"grad_norm": 5.651824951171875,
"learning_rate": 5.936572322569629e-05,
"loss": 2.0335,
"step": 2040
},
{
"epoch": 1.497853685268061,
"grad_norm": 11.750208854675293,
"learning_rate": 5.894757372184309e-05,
"loss": 2.0556,
"step": 2050
},
{
"epoch": 1.5051602886108322,
"grad_norm": 5.419318675994873,
"learning_rate": 5.852877671199091e-05,
"loss": 1.9466,
"step": 2060
},
{
"epoch": 1.512466891953603,
"grad_norm": 4.80405330657959,
"learning_rate": 5.810936250307935e-05,
"loss": 2.1046,
"step": 2070
},
{
"epoch": 1.519773495296374,
"grad_norm": 11.513900756835938,
"learning_rate": 5.768936144671261e-05,
"loss": 1.8582,
"step": 2080
},
{
"epoch": 1.527080098639145,
"grad_norm": 5.375672817230225,
"learning_rate": 5.7268803936963124e-05,
"loss": 2.0872,
"step": 2090
},
{
"epoch": 1.5343867019819162,
"grad_norm": 4.7279863357543945,
"learning_rate": 5.6847720408171946e-05,
"loss": 2.0174,
"step": 2100
},
{
"epoch": 1.5416933053246873,
"grad_norm": 4.411227226257324,
"learning_rate": 5.642614133274641e-05,
"loss": 2.0424,
"step": 2110
},
{
"epoch": 1.5489999086674582,
"grad_norm": 7.084948539733887,
"learning_rate": 5.600409721895488e-05,
"loss": 2.2553,
"step": 2120
},
{
"epoch": 1.5563065120102293,
"grad_norm": 5.153254508972168,
"learning_rate": 5.558161860871899e-05,
"loss": 1.9618,
"step": 2130
},
{
"epoch": 1.5636131153530002,
"grad_norm": 8.986722946166992,
"learning_rate": 5.515873607540346e-05,
"loss": 2.0533,
"step": 2140
},
{
"epoch": 1.5709197186957713,
"grad_norm": 6.175150394439697,
"learning_rate": 5.473548022160354e-05,
"loss": 2.0438,
"step": 2150
},
{
"epoch": 1.5782263220385424,
"grad_norm": 8.26689624786377,
"learning_rate": 5.431188167693044e-05,
"loss": 2.0011,
"step": 2160
},
{
"epoch": 1.5855329253813135,
"grad_norm": 4.056985855102539,
"learning_rate": 5.388797109579479e-05,
"loss": 1.9823,
"step": 2170
},
{
"epoch": 1.5928395287240844,
"grad_norm": 5.051248073577881,
"learning_rate": 5.346377915518821e-05,
"loss": 1.9149,
"step": 2180
},
{
"epoch": 1.6001461320668553,
"grad_norm": 5.093779563903809,
"learning_rate": 5.3039336552463414e-05,
"loss": 2.0657,
"step": 2190
},
{
"epoch": 1.6074527354096264,
"grad_norm": 9.080710411071777,
"learning_rate": 5.261467400311266e-05,
"loss": 1.9656,
"step": 2200
},
{
"epoch": 1.6074527354096264,
"eval_loss": 2.061772108078003,
"eval_runtime": 108.1317,
"eval_samples_per_second": 11.255,
"eval_steps_per_second": 11.255,
"step": 2200
},
{
"epoch": 1.6147593387523975,
"grad_norm": 5.32938814163208,
"learning_rate": 5.2189822238545017e-05,
"loss": 1.9647,
"step": 2210
},
{
"epoch": 1.6220659420951686,
"grad_norm": 7.389804840087891,
"learning_rate": 5.176481200386245e-05,
"loss": 1.904,
"step": 2220
},
{
"epoch": 1.6293725454379395,
"grad_norm": 4.834039688110352,
"learning_rate": 5.1339674055634826e-05,
"loss": 1.9067,
"step": 2230
},
{
"epoch": 1.6366791487807104,
"grad_norm": 5.643000602722168,
"learning_rate": 5.0914439159674244e-05,
"loss": 2.2128,
"step": 2240
},
{
"epoch": 1.6439857521234815,
"grad_norm": 4.886800765991211,
"learning_rate": 5.048913808880861e-05,
"loss": 1.9497,
"step": 2250
},
{
"epoch": 1.6512923554662526,
"grad_norm": 12.440683364868164,
"learning_rate": 5.006380162065465e-05,
"loss": 1.9488,
"step": 2260
},
{
"epoch": 1.6585989588090237,
"grad_norm": 7.266822814941406,
"learning_rate": 4.963846053539071e-05,
"loss": 2.1738,
"step": 2270
},
{
"epoch": 1.6659055621517946,
"grad_norm": 9.061182975769043,
"learning_rate": 4.9213145613529194e-05,
"loss": 2.0644,
"step": 2280
},
{
"epoch": 1.6732121654945658,
"grad_norm": 4.660227298736572,
"learning_rate": 4.878788763368921e-05,
"loss": 1.8194,
"step": 2290
},
{
"epoch": 1.6805187688373366,
"grad_norm": 6.212920665740967,
"learning_rate": 4.836271737036916e-05,
"loss": 2.1629,
"step": 2300
},
{
"epoch": 1.6878253721801078,
"grad_norm": 5.760697841644287,
"learning_rate": 4.7937665591719664e-05,
"loss": 1.9379,
"step": 2310
},
{
"epoch": 1.6951319755228789,
"grad_norm": 6.021162033081055,
"learning_rate": 4.7512763057317014e-05,
"loss": 1.9756,
"step": 2320
},
{
"epoch": 1.70243857886565,
"grad_norm": 4.001003265380859,
"learning_rate": 4.70880405159372e-05,
"loss": 1.9218,
"step": 2330
},
{
"epoch": 1.7097451822084209,
"grad_norm": 6.9795002937316895,
"learning_rate": 4.666352870333072e-05,
"loss": 2.1045,
"step": 2340
},
{
"epoch": 1.7170517855511918,
"grad_norm": 5.518786907196045,
"learning_rate": 4.623925833999832e-05,
"loss": 2.0515,
"step": 2350
},
{
"epoch": 1.7243583888939629,
"grad_norm": 3.8339755535125732,
"learning_rate": 4.5815260128967894e-05,
"loss": 1.9462,
"step": 2360
},
{
"epoch": 1.731664992236734,
"grad_norm": 4.569764137268066,
"learning_rate": 4.539156475357257e-05,
"loss": 2.0918,
"step": 2370
},
{
"epoch": 1.738971595579505,
"grad_norm": 4.515227317810059,
"learning_rate": 4.496820287523027e-05,
"loss": 2.1785,
"step": 2380
},
{
"epoch": 1.746278198922276,
"grad_norm": 4.677797794342041,
"learning_rate": 4.454520513122484e-05,
"loss": 2.1057,
"step": 2390
},
{
"epoch": 1.7535848022650469,
"grad_norm": 8.1099214553833,
"learning_rate": 4.412260213248898e-05,
"loss": 2.0285,
"step": 2400
},
{
"epoch": 1.7535848022650469,
"eval_loss": 2.0392701625823975,
"eval_runtime": 108.4121,
"eval_samples_per_second": 11.226,
"eval_steps_per_second": 11.226,
"step": 2400
},
{
"epoch": 1.760891405607818,
"grad_norm": 5.997411727905273,
"learning_rate": 4.370042446138897e-05,
"loss": 2.0577,
"step": 2410
},
{
"epoch": 1.768198008950589,
"grad_norm": 4.918084144592285,
"learning_rate": 4.3278702669511506e-05,
"loss": 1.9363,
"step": 2420
},
{
"epoch": 1.7755046122933602,
"grad_norm": 5.5613932609558105,
"learning_rate": 4.285746727545291e-05,
"loss": 2.041,
"step": 2430
},
{
"epoch": 1.7828112156361313,
"grad_norm": 11.325989723205566,
"learning_rate": 4.2436748762610465e-05,
"loss": 1.9016,
"step": 2440
},
{
"epoch": 1.7901178189789022,
"grad_norm": 5.7209296226501465,
"learning_rate": 4.201657757697651e-05,
"loss": 2.0692,
"step": 2450
},
{
"epoch": 1.797424422321673,
"grad_norm": 7.241997718811035,
"learning_rate": 4.159698412493515e-05,
"loss": 1.9591,
"step": 2460
},
{
"epoch": 1.8047310256644442,
"grad_norm": 4.004190921783447,
"learning_rate": 4.117799877106181e-05,
"loss": 2.0189,
"step": 2470
},
{
"epoch": 1.8120376290072153,
"grad_norm": 7.879773139953613,
"learning_rate": 4.075965183592592e-05,
"loss": 1.9538,
"step": 2480
},
{
"epoch": 1.8193442323499864,
"grad_norm": 5.945032596588135,
"learning_rate": 4.034197359389666e-05,
"loss": 1.853,
"step": 2490
},
{
"epoch": 1.8266508356927573,
"grad_norm": 6.213232517242432,
"learning_rate": 3.992499427095213e-05,
"loss": 2.102,
"step": 2500
},
{
"epoch": 1.8339574390355282,
"grad_norm": 4.097675323486328,
"learning_rate": 3.950874404249199e-05,
"loss": 2.1149,
"step": 2510
},
{
"epoch": 1.8412640423782993,
"grad_norm": 3.8982291221618652,
"learning_rate": 3.9093253031153755e-05,
"loss": 1.961,
"step": 2520
},
{
"epoch": 1.8485706457210704,
"grad_norm": 4.43918514251709,
"learning_rate": 3.8678551304632965e-05,
"loss": 1.8826,
"step": 2530
},
{
"epoch": 1.8558772490638415,
"grad_norm": 6.397246360778809,
"learning_rate": 3.8264668873507245e-05,
"loss": 1.9244,
"step": 2540
},
{
"epoch": 1.8631838524066124,
"grad_norm": 4.929690361022949,
"learning_rate": 3.7851635689064546e-05,
"loss": 1.8184,
"step": 2550
},
{
"epoch": 1.8704904557493836,
"grad_norm": 4.7890801429748535,
"learning_rate": 3.743948164113567e-05,
"loss": 1.9089,
"step": 2560
},
{
"epoch": 1.8777970590921544,
"grad_norm": 5.918452262878418,
"learning_rate": 3.702823655593128e-05,
"loss": 2.0868,
"step": 2570
},
{
"epoch": 1.8851036624349256,
"grad_norm": 5.867265701293945,
"learning_rate": 3.6617930193883384e-05,
"loss": 2.0125,
"step": 2580
},
{
"epoch": 1.8924102657776967,
"grad_norm": 4.536275863647461,
"learning_rate": 3.62085922474918e-05,
"loss": 1.9122,
"step": 2590
},
{
"epoch": 1.8997168691204678,
"grad_norm": 5.288751602172852,
"learning_rate": 3.580025233917529e-05,
"loss": 1.9932,
"step": 2600
},
{
"epoch": 1.8997168691204678,
"eval_loss": 2.0270345211029053,
"eval_runtime": 111.0925,
"eval_samples_per_second": 10.955,
"eval_steps_per_second": 10.955,
"step": 2600
},
{
"epoch": 1.9070234724632387,
"grad_norm": 5.767890453338623,
"learning_rate": 3.5392940019127977e-05,
"loss": 1.9772,
"step": 2610
},
{
"epoch": 1.9143300758060096,
"grad_norm": 5.185894012451172,
"learning_rate": 3.498668476318083e-05,
"loss": 1.6921,
"step": 2620
},
{
"epoch": 1.9216366791487807,
"grad_norm": 6.62992525100708,
"learning_rate": 3.458151597066863e-05,
"loss": 1.9544,
"step": 2630
},
{
"epoch": 1.9289432824915518,
"grad_norm": 11.95433235168457,
"learning_rate": 3.417746296230244e-05,
"loss": 2.0189,
"step": 2640
},
{
"epoch": 1.936249885834323,
"grad_norm": 4.77971076965332,
"learning_rate": 3.3774554978047756e-05,
"loss": 1.9203,
"step": 2650
},
{
"epoch": 1.9435564891770938,
"grad_norm": 7.271909236907959,
"learning_rate": 3.337282117500847e-05,
"loss": 2.0945,
"step": 2660
},
{
"epoch": 1.9508630925198647,
"grad_norm": 8.73035717010498,
"learning_rate": 3.297229062531696e-05,
"loss": 2.0409,
"step": 2670
},
{
"epoch": 1.9581696958626358,
"grad_norm": 9.147926330566406,
"learning_rate": 3.257299231403014e-05,
"loss": 1.9463,
"step": 2680
},
{
"epoch": 1.965476299205407,
"grad_norm": 29.067060470581055,
"learning_rate": 3.217495513703198e-05,
"loss": 2.0646,
"step": 2690
},
{
"epoch": 1.972782902548178,
"grad_norm": 17.448034286499023,
"learning_rate": 3.177820789894234e-05,
"loss": 1.8981,
"step": 2700
},
{
"epoch": 1.980089505890949,
"grad_norm": 5.893951892852783,
"learning_rate": 3.138277931103254e-05,
"loss": 1.7993,
"step": 2710
},
{
"epoch": 1.98739610923372,
"grad_norm": 8.20900821685791,
"learning_rate": 3.09886979891476e-05,
"loss": 2.1449,
"step": 2720
},
{
"epoch": 1.994702712576491,
"grad_norm": 5.447132587432861,
"learning_rate": 3.059599245163538e-05,
"loss": 1.8473,
"step": 2730
},
{
"epoch": 2.002009315919262,
"grad_norm": 6.67828893661499,
"learning_rate": 3.0204691117282856e-05,
"loss": 1.9341,
"step": 2740
},
{
"epoch": 2.009315919262033,
"grad_norm": 7.406982898712158,
"learning_rate": 2.981482230325946e-05,
"loss": 1.9778,
"step": 2750
},
{
"epoch": 2.0166225226048042,
"grad_norm": 4.272141456604004,
"learning_rate": 2.9426414223067978e-05,
"loss": 1.8293,
"step": 2760
},
{
"epoch": 2.023929125947575,
"grad_norm": 5.6061320304870605,
"learning_rate": 2.9039494984502734e-05,
"loss": 1.7844,
"step": 2770
},
{
"epoch": 2.031235729290346,
"grad_norm": 6.46597957611084,
"learning_rate": 2.865409258761557e-05,
"loss": 1.8147,
"step": 2780
},
{
"epoch": 2.038542332633117,
"grad_norm": 6.094344139099121,
"learning_rate": 2.8270234922689597e-05,
"loss": 1.7082,
"step": 2790
},
{
"epoch": 2.0458489359758882,
"grad_norm": 5.04010534286499,
"learning_rate": 2.788794976822077e-05,
"loss": 1.6477,
"step": 2800
},
{
"epoch": 2.0458489359758882,
"eval_loss": 2.0164308547973633,
"eval_runtime": 111.1374,
"eval_samples_per_second": 10.95,
"eval_steps_per_second": 10.95,
"step": 2800
},
{
"epoch": 2.0531555393186594,
"grad_norm": 13.458788871765137,
"learning_rate": 2.7507264788907783e-05,
"loss": 1.7778,
"step": 2810
},
{
"epoch": 2.0604621426614305,
"grad_norm": 6.946293830871582,
"learning_rate": 2.712820753364998e-05,
"loss": 1.6629,
"step": 2820
},
{
"epoch": 2.067768746004201,
"grad_norm": 8.358539581298828,
"learning_rate": 2.6750805433553728e-05,
"loss": 2.0157,
"step": 2830
},
{
"epoch": 2.0750753493469722,
"grad_norm": 6.011590480804443,
"learning_rate": 2.637508579994741e-05,
"loss": 1.9037,
"step": 2840
},
{
"epoch": 2.0823819526897434,
"grad_norm": 5.1315226554870605,
"learning_rate": 2.6001075822404864e-05,
"loss": 1.8682,
"step": 2850
},
{
"epoch": 2.0896885560325145,
"grad_norm": 5.693300247192383,
"learning_rate": 2.5628802566777904e-05,
"loss": 1.8367,
"step": 2860
},
{
"epoch": 2.0969951593752856,
"grad_norm": 5.308597087860107,
"learning_rate": 2.5258292973237536e-05,
"loss": 1.9468,
"step": 2870
},
{
"epoch": 2.1043017627180562,
"grad_norm": 6.057744979858398,
"learning_rate": 2.4889573854324443e-05,
"loss": 1.7683,
"step": 2880
},
{
"epoch": 2.1116083660608274,
"grad_norm": 5.609562397003174,
"learning_rate": 2.452267189300864e-05,
"loss": 1.8463,
"step": 2890
},
{
"epoch": 2.1189149694035985,
"grad_norm": 4.729313850402832,
"learning_rate": 2.415761364075857e-05,
"loss": 1.9935,
"step": 2900
},
{
"epoch": 2.1262215727463696,
"grad_norm": 4.285727500915527,
"learning_rate": 2.3794425515619535e-05,
"loss": 1.8125,
"step": 2910
},
{
"epoch": 2.1335281760891407,
"grad_norm": 5.974366664886475,
"learning_rate": 2.343313380030207e-05,
"loss": 1.8855,
"step": 2920
},
{
"epoch": 2.140834779431912,
"grad_norm": 5.998205661773682,
"learning_rate": 2.30737646402798e-05,
"loss": 2.0122,
"step": 2930
},
{
"epoch": 2.1481413827746825,
"grad_norm": 7.216914176940918,
"learning_rate": 2.271634404189752e-05,
"loss": 1.8831,
"step": 2940
},
{
"epoch": 2.1554479861174536,
"grad_norm": 6.571489334106445,
"learning_rate": 2.2360897870489055e-05,
"loss": 1.6656,
"step": 2950
},
{
"epoch": 2.1627545894602247,
"grad_norm": 5.754743576049805,
"learning_rate": 2.2007451848505627e-05,
"loss": 1.7651,
"step": 2960
},
{
"epoch": 2.170061192802996,
"grad_norm": 4.261959552764893,
"learning_rate": 2.1656031553654272e-05,
"loss": 2.0409,
"step": 2970
},
{
"epoch": 2.177367796145767,
"grad_norm": 4.679163932800293,
"learning_rate": 2.1306662417046968e-05,
"loss": 1.7555,
"step": 2980
},
{
"epoch": 2.1846743994885376,
"grad_norm": 5.427217483520508,
"learning_rate": 2.0959369721360183e-05,
"loss": 2.0074,
"step": 2990
},
{
"epoch": 2.1919810028313087,
"grad_norm": 8.53897476196289,
"learning_rate": 2.0614178599005356e-05,
"loss": 1.5985,
"step": 3000
},
{
"epoch": 2.1919810028313087,
"eval_loss": 2.0024373531341553,
"eval_runtime": 110.7503,
"eval_samples_per_second": 10.989,
"eval_steps_per_second": 10.989,
"step": 3000
},
{
"epoch": 2.19928760617408,
"grad_norm": 8.221793174743652,
"learning_rate": 2.0271114030310035e-05,
"loss": 1.8105,
"step": 3010
},
{
"epoch": 2.206594209516851,
"grad_norm": 5.500391006469727,
"learning_rate": 1.9930200841710193e-05,
"loss": 1.844,
"step": 3020
},
{
"epoch": 2.213900812859622,
"grad_norm": 8.051352500915527,
"learning_rate": 1.9591463703953672e-05,
"loss": 1.6881,
"step": 3030
},
{
"epoch": 2.221207416202393,
"grad_norm": 4.4952898025512695,
"learning_rate": 1.9254927130314726e-05,
"loss": 1.8373,
"step": 3040
},
{
"epoch": 2.228514019545164,
"grad_norm": 7.687475204467773,
"learning_rate": 1.8920615474820152e-05,
"loss": 1.9104,
"step": 3050
},
{
"epoch": 2.235820622887935,
"grad_norm": 5.942209243774414,
"learning_rate": 1.8588552930486915e-05,
"loss": 1.9455,
"step": 3060
},
{
"epoch": 2.243127226230706,
"grad_norm": 4.695751667022705,
"learning_rate": 1.8258763527571243e-05,
"loss": 1.7007,
"step": 3070
},
{
"epoch": 2.250433829573477,
"grad_norm": 5.661278247833252,
"learning_rate": 1.7931271131829758e-05,
"loss": 1.7467,
"step": 3080
},
{
"epoch": 2.257740432916248,
"grad_norm": 7.149540901184082,
"learning_rate": 1.7606099442792373e-05,
"loss": 1.6398,
"step": 3090
},
{
"epoch": 2.265047036259019,
"grad_norm": 7.075028419494629,
"learning_rate": 1.728327199204716e-05,
"loss": 1.7911,
"step": 3100
},
{
"epoch": 2.27235363960179,
"grad_norm": 5.169941425323486,
"learning_rate": 1.696281214153757e-05,
"loss": 1.9487,
"step": 3110
},
{
"epoch": 2.279660242944561,
"grad_norm": 8.544054985046387,
"learning_rate": 1.664474308187167e-05,
"loss": 1.9077,
"step": 3120
},
{
"epoch": 2.2869668462873323,
"grad_norm": 5.160293102264404,
"learning_rate": 1.6329087830644053e-05,
"loss": 1.7878,
"step": 3130
},
{
"epoch": 2.2942734496301034,
"grad_norm": 7.8444342613220215,
"learning_rate": 1.6015869230769992e-05,
"loss": 1.6692,
"step": 3140
},
{
"epoch": 2.301580052972874,
"grad_norm": 5.798057556152344,
"learning_rate": 1.5705109948832526e-05,
"loss": 1.8317,
"step": 3150
},
{
"epoch": 2.308886656315645,
"grad_norm": 7.343387126922607,
"learning_rate": 1.5396832473442e-05,
"loss": 1.9332,
"step": 3160
},
{
"epoch": 2.3161932596584163,
"grad_norm": 5.731598377227783,
"learning_rate": 1.5091059113608785e-05,
"loss": 1.6953,
"step": 3170
},
{
"epoch": 2.3234998630011874,
"grad_norm": 7.075622081756592,
"learning_rate": 1.4787811997128737e-05,
"loss": 1.8049,
"step": 3180
},
{
"epoch": 2.3308064663439585,
"grad_norm": 9.426651000976562,
"learning_rate": 1.4487113068981934e-05,
"loss": 2.002,
"step": 3190
},
{
"epoch": 2.338113069686729,
"grad_norm": 6.5062360763549805,
"learning_rate": 1.418898408974456e-05,
"loss": 1.8708,
"step": 3200
},
{
"epoch": 2.338113069686729,
"eval_loss": 1.9918181896209717,
"eval_runtime": 110.5621,
"eval_samples_per_second": 11.007,
"eval_steps_per_second": 11.007,
"step": 3200
},
{
"epoch": 2.3454196730295003,
"grad_norm": 5.592414379119873,
"learning_rate": 1.3893446634014257e-05,
"loss": 1.8714,
"step": 3210
},
{
"epoch": 2.3527262763722714,
"grad_norm": 5.582223892211914,
"learning_rate": 1.3600522088848689e-05,
"loss": 1.7765,
"step": 3220
},
{
"epoch": 2.3600328797150425,
"grad_norm": 7.002323150634766,
"learning_rate": 1.3310231652217997e-05,
"loss": 1.9461,
"step": 3230
},
{
"epoch": 2.3673394830578136,
"grad_norm": 12.038192749023438,
"learning_rate": 1.3022596331470632e-05,
"loss": 1.8854,
"step": 3240
},
{
"epoch": 2.3746460864005847,
"grad_norm": 7.399435520172119,
"learning_rate": 1.2737636941813196e-05,
"loss": 1.9169,
"step": 3250
},
{
"epoch": 2.3819526897433554,
"grad_norm": 8.563164710998535,
"learning_rate": 1.245537410480414e-05,
"loss": 1.9562,
"step": 3260
},
{
"epoch": 2.3892592930861265,
"grad_norm": 5.194084167480469,
"learning_rate": 1.2175828246861359e-05,
"loss": 1.7997,
"step": 3270
},
{
"epoch": 2.3965658964288976,
"grad_norm": 6.553465366363525,
"learning_rate": 1.1899019597784117e-05,
"loss": 1.8751,
"step": 3280
},
{
"epoch": 2.4038724997716687,
"grad_norm": 5.508111476898193,
"learning_rate": 1.1624968189288965e-05,
"loss": 1.7808,
"step": 3290
},
{
"epoch": 2.41117910311444,
"grad_norm": 6.692938327789307,
"learning_rate": 1.1353693853560216e-05,
"loss": 1.6511,
"step": 3300
},
{
"epoch": 2.4184857064572105,
"grad_norm": 6.976585388183594,
"learning_rate": 1.1085216221814665e-05,
"loss": 1.8939,
"step": 3310
},
{
"epoch": 2.4257923097999816,
"grad_norm": 7.673839569091797,
"learning_rate": 1.0819554722881048e-05,
"loss": 2.0006,
"step": 3320
},
{
"epoch": 2.4330989131427527,
"grad_norm": 6.984090805053711,
"learning_rate": 1.055672858179393e-05,
"loss": 1.7107,
"step": 3330
},
{
"epoch": 2.440405516485524,
"grad_norm": 6.236724853515625,
"learning_rate": 1.0296756818402531e-05,
"loss": 1.6792,
"step": 3340
},
{
"epoch": 2.447712119828295,
"grad_norm": 7.661273002624512,
"learning_rate": 1.0039658245994277e-05,
"loss": 1.818,
"step": 3350
},
{
"epoch": 2.455018723171066,
"grad_norm": 8.403165817260742,
"learning_rate": 9.78545146993342e-06,
"loss": 1.9927,
"step": 3360
},
{
"epoch": 2.4623253265138367,
"grad_norm": 5.08246374130249,
"learning_rate": 9.534154886314517e-06,
"loss": 1.7398,
"step": 3370
},
{
"epoch": 2.469631929856608,
"grad_norm": 7.482173919677734,
"learning_rate": 9.28578668063127e-06,
"loss": 1.9105,
"step": 3380
},
{
"epoch": 2.476938533199379,
"grad_norm": 4.506008148193359,
"learning_rate": 9.040364826460423e-06,
"loss": 1.9258,
"step": 3390
},
{
"epoch": 2.48424513654215,
"grad_norm": 6.290830612182617,
"learning_rate": 8.797907084161155e-06,
"loss": 1.8533,
"step": 3400
},
{
"epoch": 2.48424513654215,
"eval_loss": 1.9872702360153198,
"eval_runtime": 110.4675,
"eval_samples_per_second": 11.017,
"eval_steps_per_second": 11.017,
"step": 3400
},
{
"epoch": 2.491551739884921,
"grad_norm": 5.8204522132873535,
"learning_rate": 8.558430999589723e-06,
"loss": 1.9186,
"step": 3410
},
{
"epoch": 2.498858343227692,
"grad_norm": 7.549562931060791,
"learning_rate": 8.321953902829842e-06,
"loss": 1.6548,
"step": 3420
},
{
"epoch": 2.506164946570463,
"grad_norm": 5.260471820831299,
"learning_rate": 8.08849290693846e-06,
"loss": 1.877,
"step": 3430
},
{
"epoch": 2.513471549913234,
"grad_norm": 7.633718967437744,
"learning_rate": 7.85806490670739e-06,
"loss": 1.7109,
"step": 3440
},
{
"epoch": 2.520778153256005,
"grad_norm": 6.902381420135498,
"learning_rate": 7.630686577440722e-06,
"loss": 1.8875,
"step": 3450
},
{
"epoch": 2.5280847565987763,
"grad_norm": 8.275242805480957,
"learning_rate": 7.406374373748004e-06,
"loss": 1.8563,
"step": 3460
},
{
"epoch": 2.5353913599415474,
"grad_norm": 5.76878023147583,
"learning_rate": 7.185144528353583e-06,
"loss": 1.7962,
"step": 3470
},
{
"epoch": 2.542697963284318,
"grad_norm": 4.979648590087891,
"learning_rate": 6.967013050921795e-06,
"loss": 1.9158,
"step": 3480
},
{
"epoch": 2.550004566627089,
"grad_norm": 9.704955101013184,
"learning_rate": 6.751995726898464e-06,
"loss": 2.0237,
"step": 3490
},
{
"epoch": 2.5573111699698603,
"grad_norm": 9.094025611877441,
"learning_rate": 6.540108116368515e-06,
"loss": 1.8791,
"step": 3500
},
{
"epoch": 2.5646177733126314,
"grad_norm": 6.252310752868652,
"learning_rate": 6.33136555293003e-06,
"loss": 1.8576,
"step": 3510
},
{
"epoch": 2.571924376655402,
"grad_norm": 7.935026168823242,
"learning_rate": 6.125783142584479e-06,
"loss": 1.9725,
"step": 3520
},
{
"epoch": 2.579230979998173,
"grad_norm": 6.289064884185791,
"learning_rate": 5.923375762643668e-06,
"loss": 1.7741,
"step": 3530
},
{
"epoch": 2.5865375833409443,
"grad_norm": 6.07642126083374,
"learning_rate": 5.724158060653029e-06,
"loss": 1.8716,
"step": 3540
},
{
"epoch": 2.5938441866837154,
"grad_norm": 7.241240501403809,
"learning_rate": 5.528144453331696e-06,
"loss": 1.7577,
"step": 3550
},
{
"epoch": 2.6011507900264865,
"grad_norm": 5.838751316070557,
"learning_rate": 5.335349125529154e-06,
"loss": 1.8637,
"step": 3560
},
{
"epoch": 2.6084573933692576,
"grad_norm": 7.343444347381592,
"learning_rate": 5.14578602919879e-06,
"loss": 1.8605,
"step": 3570
},
{
"epoch": 2.6157639967120287,
"grad_norm": 5.384237289428711,
"learning_rate": 4.959468882388163e-06,
"loss": 1.739,
"step": 3580
},
{
"epoch": 2.6230706000547994,
"grad_norm": 8.187921524047852,
"learning_rate": 4.776411168246353e-06,
"loss": 1.8639,
"step": 3590
},
{
"epoch": 2.6303772033975705,
"grad_norm": 5.8128838539123535,
"learning_rate": 4.596626134048176e-06,
"loss": 1.7357,
"step": 3600
},
{
"epoch": 2.6303772033975705,
"eval_loss": 1.9813354015350342,
"eval_runtime": 104.4364,
"eval_samples_per_second": 11.653,
"eval_steps_per_second": 11.653,
"step": 3600
},
{
"epoch": 2.6376838067403416,
"grad_norm": 5.993869304656982,
"learning_rate": 4.420126790235552e-06,
"loss": 1.9172,
"step": 3610
},
{
"epoch": 2.6449904100831128,
"grad_norm": 6.651885509490967,
"learning_rate": 4.246925909475957e-06,
"loss": 1.7701,
"step": 3620
},
{
"epoch": 2.6522970134258834,
"grad_norm": 5.599200248718262,
"learning_rate": 4.077036025738118e-06,
"loss": 1.9576,
"step": 3630
},
{
"epoch": 2.6596036167686545,
"grad_norm": 5.870293140411377,
"learning_rate": 3.910469433385017e-06,
"loss": 1.7158,
"step": 3640
},
{
"epoch": 2.6669102201114256,
"grad_norm": 7.64784574508667,
"learning_rate": 3.7472381862840967e-06,
"loss": 1.8767,
"step": 3650
},
{
"epoch": 2.6742168234541968,
"grad_norm": 5.308115005493164,
"learning_rate": 3.5873540969350415e-06,
"loss": 1.9414,
"step": 3660
},
{
"epoch": 2.681523426796968,
"grad_norm": 6.556251525878906,
"learning_rate": 3.430828735614916e-06,
"loss": 1.8932,
"step": 3670
},
{
"epoch": 2.688830030139739,
"grad_norm": 5.682919025421143,
"learning_rate": 3.277673429540862e-06,
"loss": 1.8576,
"step": 3680
},
{
"epoch": 2.69613663348251,
"grad_norm": 5.832954406738281,
"learning_rate": 3.1278992620503877e-06,
"loss": 1.6419,
"step": 3690
},
{
"epoch": 2.7034432368252808,
"grad_norm": 5.423207759857178,
"learning_rate": 2.9815170717993115e-06,
"loss": 1.7736,
"step": 3700
},
{
"epoch": 2.710749840168052,
"grad_norm": 5.549076080322266,
"learning_rate": 2.83853745197738e-06,
"loss": 1.9699,
"step": 3710
},
{
"epoch": 2.718056443510823,
"grad_norm": 5.261001110076904,
"learning_rate": 2.6989707495417292e-06,
"loss": 1.5671,
"step": 3720
},
{
"epoch": 2.725363046853594,
"grad_norm": 9.049291610717773,
"learning_rate": 2.5628270644680265e-06,
"loss": 1.9909,
"step": 3730
},
{
"epoch": 2.7326696501963648,
"grad_norm": 5.768558979034424,
"learning_rate": 2.430116249019665e-06,
"loss": 1.8207,
"step": 3740
},
{
"epoch": 2.739976253539136,
"grad_norm": 7.210824012756348,
"learning_rate": 2.3008479070346867e-06,
"loss": 1.8219,
"step": 3750
},
{
"epoch": 2.747282856881907,
"grad_norm": 5.107162952423096,
"learning_rate": 2.1750313932308806e-06,
"loss": 1.7551,
"step": 3760
},
{
"epoch": 2.754589460224678,
"grad_norm": 6.917966365814209,
"learning_rate": 2.0526758125287427e-06,
"loss": 1.6674,
"step": 3770
},
{
"epoch": 2.761896063567449,
"grad_norm": 5.870244026184082,
"learning_rate": 1.933790019392634e-06,
"loss": 1.8,
"step": 3780
},
{
"epoch": 2.7692026669102203,
"grad_norm": 5.093700408935547,
"learning_rate": 1.8183826171899677e-06,
"loss": 1.7592,
"step": 3790
},
{
"epoch": 2.776509270252991,
"grad_norm": 9.029096603393555,
"learning_rate": 1.7064619575686336e-06,
"loss": 1.8041,
"step": 3800
},
{
"epoch": 2.776509270252991,
"eval_loss": 1.9782235622406006,
"eval_runtime": 105.0311,
"eval_samples_per_second": 11.587,
"eval_steps_per_second": 11.587,
"step": 3800
},
{
"epoch": 2.783815873595762,
"grad_norm": 5.48701810836792,
"learning_rate": 1.5980361398526267e-06,
"loss": 2.1588,
"step": 3810
},
{
"epoch": 2.791122476938533,
"grad_norm": 8.91283893585205,
"learning_rate": 1.4931130104559154e-06,
"loss": 1.8274,
"step": 3820
},
{
"epoch": 2.7984290802813043,
"grad_norm": 7.846653938293457,
"learning_rate": 1.3917001623146186e-06,
"loss": 1.9029,
"step": 3830
},
{
"epoch": 2.805735683624075,
"grad_norm": 5.458700656890869,
"learning_rate": 1.2938049343375502e-06,
"loss": 1.8208,
"step": 3840
},
{
"epoch": 2.813042286966846,
"grad_norm": 6.338927745819092,
"learning_rate": 1.1994344108750833e-06,
"loss": 1.808,
"step": 3850
},
{
"epoch": 2.820348890309617,
"grad_norm": 6.470076560974121,
"learning_rate": 1.108595421206532e-06,
"loss": 1.8503,
"step": 3860
},
{
"epoch": 2.8276554936523883,
"grad_norm": 4.856273174285889,
"learning_rate": 1.021294539045914e-06,
"loss": 1.9037,
"step": 3870
},
{
"epoch": 2.8349620969951594,
"grad_norm": 6.1774139404296875,
"learning_rate": 9.375380820662194e-07,
"loss": 1.8673,
"step": 3880
},
{
"epoch": 2.8422687003379306,
"grad_norm": 5.6335296630859375,
"learning_rate": 8.57332111442255e-07,
"loss": 1.6094,
"step": 3890
},
{
"epoch": 2.8495753036807017,
"grad_norm": 6.399806022644043,
"learning_rate": 7.806824314119832e-07,
"loss": 1.8876,
"step": 3900
},
{
"epoch": 2.8568819070234723,
"grad_norm": 8.844847679138184,
"learning_rate": 7.075945888565194e-07,
"loss": 1.9354,
"step": 3910
},
{
"epoch": 2.8641885103662434,
"grad_norm": 7.803627014160156,
"learning_rate": 6.380738728986924e-07,
"loss": 1.7139,
"step": 3920
},
{
"epoch": 2.8714951137090146,
"grad_norm": 8.815073013305664,
"learning_rate": 5.721253145203165e-07,
"loss": 1.8449,
"step": 3930
},
{
"epoch": 2.8788017170517857,
"grad_norm": 5.23136568069458,
"learning_rate": 5.097536861981e-07,
"loss": 1.774,
"step": 3940
},
{
"epoch": 2.8861083203945563,
"grad_norm": 5.498495101928711,
"learning_rate": 4.5096350155827693e-07,
"loss": 1.7747,
"step": 3950
},
{
"epoch": 2.8934149237373274,
"grad_norm": 6.538133144378662,
"learning_rate": 3.957590150499735e-07,
"loss": 1.8453,
"step": 3960
},
{
"epoch": 2.9007215270800986,
"grad_norm": 6.259313106536865,
"learning_rate": 3.441442216373436e-07,
"loss": 1.9312,
"step": 3970
},
{
"epoch": 2.9080281304228697,
"grad_norm": 5.1868486404418945,
"learning_rate": 2.9612285651042795e-07,
"loss": 1.8508,
"step": 3980
},
{
"epoch": 2.915334733765641,
"grad_norm": 6.202200412750244,
"learning_rate": 2.5169839481489764e-07,
"loss": 1.8832,
"step": 3990
},
{
"epoch": 2.922641337108412,
"grad_norm": 5.868015289306641,
"learning_rate": 2.1087405140053362e-07,
"loss": 1.7998,
"step": 4000
},
{
"epoch": 2.922641337108412,
"eval_loss": 1.9779986143112183,
"eval_runtime": 103.4275,
"eval_samples_per_second": 11.767,
"eval_steps_per_second": 11.767,
"step": 4000
},
{
"epoch": 2.929947940451183,
"grad_norm": 9.322286605834961,
"learning_rate": 1.736527805885957e-07,
"loss": 1.8784,
"step": 4010
},
{
"epoch": 2.9372545437939537,
"grad_norm": 5.733661651611328,
"learning_rate": 1.4003727595802152e-07,
"loss": 1.9243,
"step": 4020
},
{
"epoch": 2.944561147136725,
"grad_norm": 6.326545238494873,
"learning_rate": 1.1002997015050476e-07,
"loss": 1.8568,
"step": 4030
},
{
"epoch": 2.951867750479496,
"grad_norm": 5.118636608123779,
"learning_rate": 8.363303469445805e-08,
"loss": 1.7683,
"step": 4040
},
{
"epoch": 2.959174353822267,
"grad_norm": 5.2632317543029785,
"learning_rate": 6.084837984786096e-08,
"loss": 1.9612,
"step": 4050
},
{
"epoch": 2.9664809571650377,
"grad_norm": 5.4591898918151855,
"learning_rate": 4.167765446000393e-08,
"loss": 1.8417,
"step": 4060
},
{
"epoch": 2.973787560507809,
"grad_norm": 6.642773628234863,
"learning_rate": 2.6122245852205906e-08,
"loss": 1.9892,
"step": 4070
},
{
"epoch": 2.98109416385058,
"grad_norm": 5.449717998504639,
"learning_rate": 1.4183279717389087e-08,
"loss": 1.8122,
"step": 4080
},
{
"epoch": 2.988400767193351,
"grad_norm": 5.309086322784424,
"learning_rate": 5.861620038610794e-09,
"loss": 1.7895,
"step": 4090
},
{
"epoch": 2.995707370536122,
"grad_norm": 6.38864803314209,
"learning_rate": 1.157869026574554e-09,
"loss": 1.7208,
"step": 4100
},
{
"epoch": 2.9986300118732303,
"step": 4104,
"total_flos": 1.3798092749438976e+17,
"train_loss": 2.123500373163651,
"train_runtime": 11771.2816,
"train_samples_per_second": 2.79,
"train_steps_per_second": 0.349
}
],
"logging_steps": 10,
"max_steps": 4104,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"total_flos": 1.3798092749438976e+17,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}