bitllama-goodwiki / trainer_state.json
amazingvince's picture
End of training
b43c5de verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 100,
"global_step": 2833,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0,
"learning_rate": 8.000000000000001e-06,
"loss": 10.5725,
"step": 1
},
{
"epoch": 0.0,
"learning_rate": 8e-05,
"loss": 9.6876,
"step": 10
},
{
"epoch": 0.01,
"learning_rate": 0.00016,
"loss": 8.4164,
"step": 20
},
{
"epoch": 0.01,
"learning_rate": 0.00024,
"loss": 7.5148,
"step": 30
},
{
"epoch": 0.01,
"learning_rate": 0.00032,
"loss": 7.2623,
"step": 40
},
{
"epoch": 0.02,
"learning_rate": 0.0004,
"loss": 7.0235,
"step": 50
},
{
"epoch": 0.02,
"learning_rate": 0.00048,
"loss": 6.7411,
"step": 60
},
{
"epoch": 0.02,
"learning_rate": 0.00056,
"loss": 6.535,
"step": 70
},
{
"epoch": 0.03,
"learning_rate": 0.00064,
"loss": 6.3619,
"step": 80
},
{
"epoch": 0.03,
"learning_rate": 0.00072,
"loss": 6.285,
"step": 90
},
{
"epoch": 0.04,
"learning_rate": 0.0008,
"loss": 6.1199,
"step": 100
},
{
"epoch": 0.04,
"eval_accuracy": 0.15415359968673006,
"eval_loss": 6.0748748779296875,
"eval_runtime": 149.9041,
"eval_samples_per_second": 33.788,
"eval_steps_per_second": 4.229,
"step": 100
},
{
"epoch": 0.04,
"learning_rate": 0.0007999735731319962,
"loss": 6.0192,
"step": 110
},
{
"epoch": 0.04,
"learning_rate": 0.0007998942960198819,
"loss": 5.9528,
"step": 120
},
{
"epoch": 0.05,
"learning_rate": 0.0007997621791388858,
"loss": 5.8808,
"step": 130
},
{
"epoch": 0.05,
"learning_rate": 0.0007995772399461845,
"loss": 5.7862,
"step": 140
},
{
"epoch": 0.05,
"learning_rate": 0.0007993395028785968,
"loss": 5.7041,
"step": 150
},
{
"epoch": 0.06,
"learning_rate": 0.0007990489993493526,
"loss": 5.6365,
"step": 160
},
{
"epoch": 0.06,
"learning_rate": 0.0007987057677439444,
"loss": 5.5384,
"step": 170
},
{
"epoch": 0.06,
"learning_rate": 0.0007983098534150538,
"loss": 5.5325,
"step": 180
},
{
"epoch": 0.07,
"learning_rate": 0.0007978613086765592,
"loss": 5.4794,
"step": 190
},
{
"epoch": 0.07,
"learning_rate": 0.0007973601927966237,
"loss": 5.3869,
"step": 200
},
{
"epoch": 0.07,
"eval_accuracy": 0.20318275703591465,
"eval_loss": 5.326748371124268,
"eval_runtime": 149.8598,
"eval_samples_per_second": 33.798,
"eval_steps_per_second": 4.231,
"step": 200
},
{
"epoch": 0.07,
"learning_rate": 0.0007968065719898634,
"loss": 5.3202,
"step": 210
},
{
"epoch": 0.08,
"learning_rate": 0.0007962005194085981,
"loss": 5.2673,
"step": 220
},
{
"epoch": 0.08,
"learning_rate": 0.0007955421151331857,
"loss": 5.2441,
"step": 230
},
{
"epoch": 0.08,
"learning_rate": 0.0007948314461614408,
"loss": 5.1405,
"step": 240
},
{
"epoch": 0.09,
"learning_rate": 0.0007940686063971387,
"loss": 5.1529,
"step": 250
},
{
"epoch": 0.09,
"learning_rate": 0.0007932536966376081,
"loss": 5.1036,
"step": 260
},
{
"epoch": 0.1,
"learning_rate": 0.0007923868245604124,
"loss": 4.9852,
"step": 270
},
{
"epoch": 0.1,
"learning_rate": 0.0007914681047091216,
"loss": 4.9747,
"step": 280
},
{
"epoch": 0.1,
"learning_rate": 0.0007904976584781766,
"loss": 4.9381,
"step": 290
},
{
"epoch": 0.11,
"learning_rate": 0.0007894756140968497,
"loss": 4.9187,
"step": 300
},
{
"epoch": 0.11,
"eval_accuracy": 0.23856663569010775,
"eval_loss": 4.856618404388428,
"eval_runtime": 149.2653,
"eval_samples_per_second": 33.933,
"eval_steps_per_second": 4.247,
"step": 300
},
{
"epoch": 0.11,
"learning_rate": 0.0007884021066123009,
"loss": 4.8608,
"step": 310
},
{
"epoch": 0.11,
"learning_rate": 0.0007872772778717331,
"loss": 4.8439,
"step": 320
},
{
"epoch": 0.12,
"learning_rate": 0.0007861012765036494,
"loss": 4.7976,
"step": 330
},
{
"epoch": 0.12,
"learning_rate": 0.0007848742578982146,
"loss": 4.752,
"step": 340
},
{
"epoch": 0.12,
"learning_rate": 0.0007835963841867223,
"loss": 4.7569,
"step": 350
},
{
"epoch": 0.13,
"learning_rate": 0.0007822678242201718,
"loss": 4.698,
"step": 360
},
{
"epoch": 0.13,
"learning_rate": 0.0007808887535469578,
"loss": 4.6793,
"step": 370
},
{
"epoch": 0.13,
"learning_rate": 0.0007794593543896733,
"loss": 4.6234,
"step": 380
},
{
"epoch": 0.14,
"learning_rate": 0.0007779798156210327,
"loss": 4.6046,
"step": 390
},
{
"epoch": 0.14,
"learning_rate": 0.0007764503327389145,
"loss": 4.6185,
"step": 400
},
{
"epoch": 0.14,
"eval_accuracy": 0.2624355291325133,
"eval_loss": 4.553475379943848,
"eval_runtime": 149.5941,
"eval_samples_per_second": 33.858,
"eval_steps_per_second": 4.238,
"step": 400
},
{
"epoch": 0.14,
"learning_rate": 0.00077487110784053,
"loss": 4.551,
"step": 410
},
{
"epoch": 0.15,
"learning_rate": 0.0007732423495957192,
"loss": 4.5083,
"step": 420
},
{
"epoch": 0.15,
"learning_rate": 0.0007715642732193774,
"loss": 4.522,
"step": 430
},
{
"epoch": 0.16,
"learning_rate": 0.0007698371004430193,
"loss": 4.4942,
"step": 440
},
{
"epoch": 0.16,
"learning_rate": 0.0007680610594854798,
"loss": 4.5164,
"step": 450
},
{
"epoch": 0.16,
"learning_rate": 0.0007662363850227587,
"loss": 4.4405,
"step": 460
},
{
"epoch": 0.17,
"learning_rate": 0.0007643633181570117,
"loss": 4.3987,
"step": 470
},
{
"epoch": 0.17,
"learning_rate": 0.000762442106384693,
"loss": 4.4089,
"step": 480
},
{
"epoch": 0.17,
"learning_rate": 0.0007604730035638523,
"loss": 4.3598,
"step": 490
},
{
"epoch": 0.18,
"learning_rate": 0.0007584562698805911,
"loss": 4.3509,
"step": 500
},
{
"epoch": 0.18,
"eval_accuracy": 0.2801221637037998,
"eval_loss": 4.338791847229004,
"eval_runtime": 148.9964,
"eval_samples_per_second": 33.994,
"eval_steps_per_second": 4.255,
"step": 500
},
{
"epoch": 0.18,
"learning_rate": 0.0007563921718146838,
"loss": 4.3866,
"step": 510
},
{
"epoch": 0.18,
"learning_rate": 0.0007542809821043658,
"loss": 4.3586,
"step": 520
},
{
"epoch": 0.19,
"learning_rate": 0.0007521229797102965,
"loss": 4.319,
"step": 530
},
{
"epoch": 0.19,
"learning_rate": 0.0007499184497786977,
"loss": 4.3143,
"step": 540
},
{
"epoch": 0.19,
"learning_rate": 0.0007476676836036771,
"loss": 4.2326,
"step": 550
},
{
"epoch": 0.2,
"learning_rate": 0.0007453709785887376,
"loss": 4.2689,
"step": 560
},
{
"epoch": 0.2,
"learning_rate": 0.0007430286382074807,
"loss": 4.2383,
"step": 570
},
{
"epoch": 0.2,
"learning_rate": 0.0007406409719635068,
"loss": 4.232,
"step": 580
},
{
"epoch": 0.21,
"learning_rate": 0.0007382082953495193,
"loss": 4.1941,
"step": 590
},
{
"epoch": 0.21,
"learning_rate": 0.0007357309298056369,
"loss": 4.1666,
"step": 600
},
{
"epoch": 0.21,
"eval_accuracy": 0.2955689374718788,
"eval_loss": 4.16923713684082,
"eval_runtime": 148.9107,
"eval_samples_per_second": 34.014,
"eval_steps_per_second": 4.258,
"step": 600
},
{
"epoch": 0.22,
"learning_rate": 0.0007332092026769209,
"loss": 4.1266,
"step": 610
},
{
"epoch": 0.22,
"learning_rate": 0.0007306434471701209,
"loss": 4.1373,
"step": 620
},
{
"epoch": 0.22,
"learning_rate": 0.0007280340023096477,
"loss": 4.1767,
"step": 630
},
{
"epoch": 0.23,
"learning_rate": 0.0007253812128927756,
"loss": 4.139,
"step": 640
},
{
"epoch": 0.23,
"learning_rate": 0.0007226854294440834,
"loss": 4.0591,
"step": 650
},
{
"epoch": 0.23,
"learning_rate": 0.0007199470081691381,
"loss": 4.1488,
"step": 660
},
{
"epoch": 0.24,
"learning_rate": 0.0007171663109074274,
"loss": 4.1125,
"step": 670
},
{
"epoch": 0.24,
"learning_rate": 0.0007143437050845489,
"loss": 4.1009,
"step": 680
},
{
"epoch": 0.24,
"learning_rate": 0.0007114795636636599,
"loss": 4.085,
"step": 690
},
{
"epoch": 0.25,
"learning_rate": 0.000708574265096197,
"loss": 4.0456,
"step": 700
},
{
"epoch": 0.25,
"eval_accuracy": 0.3088623661815066,
"eval_loss": 4.03993558883667,
"eval_runtime": 148.8369,
"eval_samples_per_second": 34.031,
"eval_steps_per_second": 4.26,
"step": 700
},
{
"epoch": 0.25,
"learning_rate": 0.0007056281932718689,
"loss": 4.0732,
"step": 710
},
{
"epoch": 0.25,
"learning_rate": 0.0007026417374679316,
"loss": 4.0439,
"step": 720
},
{
"epoch": 0.26,
"learning_rate": 0.000699615292297752,
"loss": 4.0528,
"step": 730
},
{
"epoch": 0.26,
"learning_rate": 0.0006965492576586652,
"loss": 4.05,
"step": 740
},
{
"epoch": 0.26,
"learning_rate": 0.0006934440386791345,
"loss": 3.9947,
"step": 750
},
{
"epoch": 0.27,
"learning_rate": 0.0006903000456652207,
"loss": 4.002,
"step": 760
},
{
"epoch": 0.27,
"learning_rate": 0.0006871176940463655,
"loss": 3.937,
"step": 770
},
{
"epoch": 0.28,
"learning_rate": 0.0006838974043204999,
"loss": 3.949,
"step": 780
},
{
"epoch": 0.28,
"learning_rate": 0.0006806396019984811,
"loss": 3.9419,
"step": 790
},
{
"epoch": 0.28,
"learning_rate": 0.0006773447175478696,
"loss": 3.9273,
"step": 800
},
{
"epoch": 0.28,
"eval_accuracy": 0.31934504591266155,
"eval_loss": 3.9317612648010254,
"eval_runtime": 150.5837,
"eval_samples_per_second": 33.636,
"eval_steps_per_second": 4.21,
"step": 800
},
{
"epoch": 0.29,
"learning_rate": 0.000674013186336047,
"loss": 3.9558,
"step": 810
},
{
"epoch": 0.29,
"learning_rate": 0.0006706454485726915,
"loss": 3.9083,
"step": 820
},
{
"epoch": 0.29,
"learning_rate": 0.0006672419492516099,
"loss": 3.9169,
"step": 830
},
{
"epoch": 0.3,
"learning_rate": 0.0006638031380919385,
"loss": 3.9215,
"step": 840
},
{
"epoch": 0.3,
"learning_rate": 0.0006603294694787206,
"loss": 3.9422,
"step": 850
},
{
"epoch": 0.3,
"learning_rate": 0.0006568214024028656,
"loss": 3.9031,
"step": 860
},
{
"epoch": 0.31,
"learning_rate": 0.0006532794004005016,
"loss": 3.8931,
"step": 870
},
{
"epoch": 0.31,
"learning_rate": 0.0006497039314917254,
"loss": 3.871,
"step": 880
},
{
"epoch": 0.31,
"learning_rate": 0.0006460954681187614,
"loss": 3.878,
"step": 890
},
{
"epoch": 0.32,
"learning_rate": 0.0006424544870835359,
"loss": 3.8447,
"step": 900
},
{
"epoch": 0.32,
"eval_accuracy": 0.3326595971954238,
"eval_loss": 3.817286968231201,
"eval_runtime": 151.9107,
"eval_samples_per_second": 33.342,
"eval_steps_per_second": 4.174,
"step": 900
},
{
"epoch": 0.32,
"learning_rate": 0.0006387814694846751,
"loss": 3.7965,
"step": 910
},
{
"epoch": 0.32,
"learning_rate": 0.0006350769006539354,
"loss": 3.7753,
"step": 920
},
{
"epoch": 0.33,
"learning_rate": 0.000631341270092074,
"loss": 3.7734,
"step": 930
},
{
"epoch": 0.33,
"learning_rate": 0.00062757507140417,
"loss": 3.8001,
"step": 940
},
{
"epoch": 0.34,
"learning_rate": 0.0006237788022344014,
"loss": 3.7775,
"step": 950
},
{
"epoch": 0.34,
"learning_rate": 0.0006199529642002892,
"loss": 3.7659,
"step": 960
},
{
"epoch": 0.34,
"learning_rate": 0.0006160980628264175,
"loss": 3.7701,
"step": 970
},
{
"epoch": 0.35,
"learning_rate": 0.0006122146074776347,
"loss": 3.7496,
"step": 980
},
{
"epoch": 0.35,
"learning_rate": 0.0006083031112917506,
"loss": 3.7569,
"step": 990
},
{
"epoch": 0.35,
"learning_rate": 0.0006043640911117322,
"loss": 3.7143,
"step": 1000
},
{
"epoch": 0.35,
"eval_accuracy": 0.3461449616152692,
"eval_loss": 3.7108187675476074,
"eval_runtime": 150.717,
"eval_samples_per_second": 33.606,
"eval_steps_per_second": 4.207,
"step": 1000
},
{
"epoch": 0.36,
"learning_rate": 0.0006003980674174113,
"loss": 3.727,
"step": 1010
},
{
"epoch": 0.36,
"learning_rate": 0.0005964055642567111,
"loss": 3.7216,
"step": 1020
},
{
"epoch": 0.36,
"learning_rate": 0.0005923871091764019,
"loss": 3.6425,
"step": 1030
},
{
"epoch": 0.37,
"learning_rate": 0.0005883432331523935,
"loss": 3.656,
"step": 1040
},
{
"epoch": 0.37,
"learning_rate": 0.0005842744705195756,
"loss": 3.6711,
"step": 1050
},
{
"epoch": 0.37,
"learning_rate": 0.0005801813589012133,
"loss": 3.6739,
"step": 1060
},
{
"epoch": 0.38,
"learning_rate": 0.0005760644391379089,
"loss": 3.6481,
"step": 1070
},
{
"epoch": 0.38,
"learning_rate": 0.0005719242552161383,
"loss": 3.6327,
"step": 1080
},
{
"epoch": 0.38,
"learning_rate": 0.0005677613541963716,
"loss": 3.6286,
"step": 1090
},
{
"epoch": 0.39,
"learning_rate": 0.0005635762861407874,
"loss": 3.6485,
"step": 1100
},
{
"epoch": 0.39,
"eval_accuracy": 0.3589528604931205,
"eval_loss": 3.6115522384643555,
"eval_runtime": 151.483,
"eval_samples_per_second": 33.436,
"eval_steps_per_second": 4.185,
"step": 1100
},
{
"epoch": 0.39,
"learning_rate": 0.0005593696040405915,
"loss": 3.6201,
"step": 1110
},
{
"epoch": 0.4,
"learning_rate": 0.0005551418637429465,
"loss": 3.5593,
"step": 1120
},
{
"epoch": 0.4,
"learning_rate": 0.0005508936238775265,
"loss": 3.6036,
"step": 1130
},
{
"epoch": 0.4,
"learning_rate": 0.0005466254457827025,
"loss": 3.6029,
"step": 1140
},
{
"epoch": 0.41,
"learning_rate": 0.0005423378934313702,
"loss": 3.585,
"step": 1150
},
{
"epoch": 0.41,
"learning_rate": 0.0005380315333564296,
"loss": 3.5505,
"step": 1160
},
{
"epoch": 0.41,
"learning_rate": 0.0005337069345759272,
"loss": 3.5358,
"step": 1170
},
{
"epoch": 0.42,
"learning_rate": 0.0005293646685178686,
"loss": 3.5578,
"step": 1180
},
{
"epoch": 0.42,
"learning_rate": 0.0005250053089447138,
"loss": 3.5917,
"step": 1190
},
{
"epoch": 0.42,
"learning_rate": 0.0005206294318775628,
"loss": 3.5171,
"step": 1200
},
{
"epoch": 0.42,
"eval_accuracy": 0.36927514369860115,
"eval_loss": 3.530304431915283,
"eval_runtime": 151.4072,
"eval_samples_per_second": 33.453,
"eval_steps_per_second": 4.187,
"step": 1200
},
{
"epoch": 0.43,
"learning_rate": 0.0005162376155200437,
"loss": 3.5322,
"step": 1210
},
{
"epoch": 0.43,
"learning_rate": 0.0005118304401819125,
"loss": 3.5639,
"step": 1220
},
{
"epoch": 0.43,
"learning_rate": 0.0005074084882023739,
"loss": 3.5472,
"step": 1230
},
{
"epoch": 0.44,
"learning_rate": 0.0005029723438731346,
"loss": 3.4967,
"step": 1240
},
{
"epoch": 0.44,
"learning_rate": 0.0004985225933611971,
"loss": 3.466,
"step": 1250
},
{
"epoch": 0.44,
"learning_rate": 0.000494059824631409,
"loss": 3.4608,
"step": 1260
},
{
"epoch": 0.45,
"learning_rate": 0.0004895846273687709,
"loss": 3.5004,
"step": 1270
},
{
"epoch": 0.45,
"learning_rate": 0.0004850975929005197,
"loss": 3.4747,
"step": 1280
},
{
"epoch": 0.46,
"learning_rate": 0.00048059931411799335,
"loss": 3.5048,
"step": 1290
},
{
"epoch": 0.46,
"learning_rate": 0.00047609038539829,
"loss": 3.4464,
"step": 1300
},
{
"epoch": 0.46,
"eval_accuracy": 0.3779672272186056,
"eval_loss": 3.455420970916748,
"eval_runtime": 151.2649,
"eval_samples_per_second": 33.484,
"eval_steps_per_second": 4.191,
"step": 1300
},
{
"epoch": 0.46,
"learning_rate": 0.0004715714025257304,
"loss": 3.4953,
"step": 1310
},
{
"epoch": 0.47,
"learning_rate": 0.00046704296261313393,
"loss": 3.471,
"step": 1320
},
{
"epoch": 0.47,
"learning_rate": 0.0004625056640229197,
"loss": 3.4471,
"step": 1330
},
{
"epoch": 0.47,
"learning_rate": 0.0004579601062880422,
"loss": 3.4493,
"step": 1340
},
{
"epoch": 0.48,
"learning_rate": 0.00045340689003277285,
"loss": 3.4145,
"step": 1350
},
{
"epoch": 0.48,
"learning_rate": 0.0004488466168933368,
"loss": 3.4739,
"step": 1360
},
{
"epoch": 0.48,
"learning_rate": 0.00044427988943841534,
"loss": 3.3819,
"step": 1370
},
{
"epoch": 0.49,
"learning_rate": 0.0004397073110895268,
"loss": 3.3975,
"step": 1380
},
{
"epoch": 0.49,
"learning_rate": 0.0004351294860412936,
"loss": 3.4112,
"step": 1390
},
{
"epoch": 0.49,
"learning_rate": 0.000430547019181607,
"loss": 3.3955,
"step": 1400
},
{
"epoch": 0.49,
"eval_accuracy": 0.38511794160042556,
"eval_loss": 3.3999252319335938,
"eval_runtime": 150.5869,
"eval_samples_per_second": 33.635,
"eval_steps_per_second": 4.21,
"step": 1400
},
{
"epoch": 0.5,
"learning_rate": 0.00042596051601170143,
"loss": 3.3769,
"step": 1410
},
{
"epoch": 0.5,
"learning_rate": 0.00042137058256614605,
"loss": 3.389,
"step": 1420
},
{
"epoch": 0.5,
"learning_rate": 0.00041677782533276747,
"loss": 3.3465,
"step": 1430
},
{
"epoch": 0.51,
"learning_rate": 0.00041218285117251163,
"loss": 3.3847,
"step": 1440
},
{
"epoch": 0.51,
"learning_rate": 0.0004075862672392566,
"loss": 3.3683,
"step": 1450
},
{
"epoch": 0.52,
"learning_rate": 0.0004029886808995867,
"loss": 3.3386,
"step": 1460
},
{
"epoch": 0.52,
"learning_rate": 0.00039839069965253864,
"loss": 3.3675,
"step": 1470
},
{
"epoch": 0.52,
"learning_rate": 0.0003937929310493297,
"loss": 3.3393,
"step": 1480
},
{
"epoch": 0.53,
"learning_rate": 0.0003891959826130802,
"loss": 3.4105,
"step": 1490
},
{
"epoch": 0.53,
"learning_rate": 0.0003846004617585376,
"loss": 3.3551,
"step": 1500
},
{
"epoch": 0.53,
"eval_accuracy": 0.39192461845543836,
"eval_loss": 3.3431735038757324,
"eval_runtime": 150.786,
"eval_samples_per_second": 33.591,
"eval_steps_per_second": 4.205,
"step": 1500
},
{
"epoch": 0.53,
"learning_rate": 0.00038000697571181723,
"loss": 3.3163,
"step": 1510
},
{
"epoch": 0.54,
"learning_rate": 0.00037541613143016596,
"loss": 3.2978,
"step": 1520
},
{
"epoch": 0.54,
"learning_rate": 0.00037082853552176324,
"loss": 3.3012,
"step": 1530
},
{
"epoch": 0.54,
"learning_rate": 0.0003662447941655669,
"loss": 3.3617,
"step": 1540
},
{
"epoch": 0.55,
"learning_rate": 0.00036166551303121566,
"loss": 3.2746,
"step": 1550
},
{
"epoch": 0.55,
"learning_rate": 0.00035709129719900003,
"loss": 3.312,
"step": 1560
},
{
"epoch": 0.55,
"learning_rate": 0.0003525227510799099,
"loss": 3.3274,
"step": 1570
},
{
"epoch": 0.56,
"learning_rate": 0.0003479604783357719,
"loss": 3.2888,
"step": 1580
},
{
"epoch": 0.56,
"learning_rate": 0.0003434050817994838,
"loss": 3.3067,
"step": 1590
},
{
"epoch": 0.56,
"learning_rate": 0.00033885716339536047,
"loss": 3.2787,
"step": 1600
},
{
"epoch": 0.56,
"eval_accuracy": 0.39735948545797645,
"eval_loss": 3.2980780601501465,
"eval_runtime": 151.955,
"eval_samples_per_second": 33.332,
"eval_steps_per_second": 4.172,
"step": 1600
},
{
"epoch": 0.57,
"learning_rate": 0.00033431732405959886,
"loss": 3.3245,
"step": 1610
},
{
"epoch": 0.57,
"learning_rate": 0.0003297861636608732,
"loss": 3.328,
"step": 1620
},
{
"epoch": 0.58,
"learning_rate": 0.00032526428092107256,
"loss": 3.2773,
"step": 1630
},
{
"epoch": 0.58,
"learning_rate": 0.0003207522733361881,
"loss": 3.2792,
"step": 1640
},
{
"epoch": 0.58,
"learning_rate": 0.00031625073709736444,
"loss": 3.2355,
"step": 1650
},
{
"epoch": 0.59,
"learning_rate": 0.00031176026701212125,
"loss": 3.2635,
"step": 1660
},
{
"epoch": 0.59,
"learning_rate": 0.00030728145642576,
"loss": 3.226,
"step": 1670
},
{
"epoch": 0.59,
"learning_rate": 0.0003028148971429614,
"loss": 3.2433,
"step": 1680
},
{
"epoch": 0.6,
"learning_rate": 0.00029836117934958843,
"loss": 3.2282,
"step": 1690
},
{
"epoch": 0.6,
"learning_rate": 0.00029392089153470243,
"loss": 3.2705,
"step": 1700
},
{
"epoch": 0.6,
"eval_accuracy": 0.4022830704505329,
"eval_loss": 3.2566046714782715,
"eval_runtime": 151.1757,
"eval_samples_per_second": 33.504,
"eval_steps_per_second": 4.194,
"step": 1700
},
{
"epoch": 0.6,
"learning_rate": 0.0002894946204128031,
"loss": 3.2523,
"step": 1710
},
{
"epoch": 0.61,
"learning_rate": 0.00028508295084630423,
"loss": 3.2703,
"step": 1720
},
{
"epoch": 0.61,
"learning_rate": 0.0002806864657682521,
"loss": 3.2855,
"step": 1730
},
{
"epoch": 0.61,
"learning_rate": 0.0002763057461053008,
"loss": 3.2752,
"step": 1740
},
{
"epoch": 0.62,
"learning_rate": 0.00027194137070095224,
"loss": 3.2225,
"step": 1750
},
{
"epoch": 0.62,
"learning_rate": 0.0002675939162390696,
"loss": 3.2595,
"step": 1760
},
{
"epoch": 0.62,
"learning_rate": 0.0002632639571676793,
"loss": 3.2349,
"step": 1770
},
{
"epoch": 0.63,
"learning_rate": 0.0002589520656230653,
"loss": 3.1926,
"step": 1780
},
{
"epoch": 0.63,
"learning_rate": 0.00025465881135417135,
"loss": 3.2271,
"step": 1790
},
{
"epoch": 0.64,
"learning_rate": 0.00025038476164731643,
"loss": 3.2281,
"step": 1800
},
{
"epoch": 0.64,
"eval_accuracy": 0.40748857910186626,
"eval_loss": 3.217235803604126,
"eval_runtime": 151.2208,
"eval_samples_per_second": 33.494,
"eval_steps_per_second": 4.193,
"step": 1800
},
{
"epoch": 0.64,
"learning_rate": 0.00024613048125123803,
"loss": 3.2461,
"step": 1810
},
{
"epoch": 0.64,
"learning_rate": 0.00024189653230246853,
"loss": 3.2236,
"step": 1820
},
{
"epoch": 0.65,
"learning_rate": 0.0002376834742510578,
"loss": 3.2269,
"step": 1830
},
{
"epoch": 0.65,
"learning_rate": 0.00023349186378665126,
"loss": 3.1916,
"step": 1840
},
{
"epoch": 0.65,
"learning_rate": 0.00022932225476493065,
"loss": 3.2088,
"step": 1850
},
{
"epoch": 0.66,
"learning_rate": 0.00022517519813443292,
"loss": 3.212,
"step": 1860
},
{
"epoch": 0.66,
"learning_rate": 0.00022105124186374818,
"loss": 3.2159,
"step": 1870
},
{
"epoch": 0.66,
"learning_rate": 0.0002169509308691171,
"loss": 3.2286,
"step": 1880
},
{
"epoch": 0.67,
"learning_rate": 0.0002128748069424268,
"loss": 3.1553,
"step": 1890
},
{
"epoch": 0.67,
"learning_rate": 0.00020882340867962174,
"loss": 3.1759,
"step": 1900
},
{
"epoch": 0.67,
"eval_accuracy": 0.41184416942232654,
"eval_loss": 3.1826136112213135,
"eval_runtime": 150.8644,
"eval_samples_per_second": 33.573,
"eval_steps_per_second": 4.202,
"step": 1900
},
{
"epoch": 0.67,
"learning_rate": 0.00020479727140953813,
"loss": 3.1996,
"step": 1910
},
{
"epoch": 0.68,
"learning_rate": 0.00020079692712316648,
"loss": 3.207,
"step": 1920
},
{
"epoch": 0.68,
"learning_rate": 0.00019682290440335907,
"loss": 3.1934,
"step": 1930
},
{
"epoch": 0.68,
"learning_rate": 0.00019287572835498522,
"loss": 3.2055,
"step": 1940
},
{
"epoch": 0.69,
"learning_rate": 0.0001889559205355469,
"loss": 3.165,
"step": 1950
},
{
"epoch": 0.69,
"learning_rate": 0.00018506399888626373,
"loss": 3.2182,
"step": 1960
},
{
"epoch": 0.7,
"learning_rate": 0.00018120047766363384,
"loss": 3.173,
"step": 1970
},
{
"epoch": 0.7,
"learning_rate": 0.0001773658673714842,
"loss": 3.1718,
"step": 1980
},
{
"epoch": 0.7,
"learning_rate": 0.0001735606746935151,
"loss": 3.1621,
"step": 1990
},
{
"epoch": 0.71,
"learning_rate": 0.00016978540242634958,
"loss": 3.1603,
"step": 2000
},
{
"epoch": 0.71,
"eval_accuracy": 0.4152190550686701,
"eval_loss": 3.1547319889068604,
"eval_runtime": 150.8119,
"eval_samples_per_second": 33.585,
"eval_steps_per_second": 4.204,
"step": 2000
},
{
"epoch": 0.71,
"learning_rate": 0.00016604054941309713,
"loss": 3.1781,
"step": 2010
},
{
"epoch": 0.71,
"learning_rate": 0.0001623266104774391,
"loss": 3.1261,
"step": 2020
},
{
"epoch": 0.72,
"learning_rate": 0.00015864407635824562,
"loss": 3.1293,
"step": 2030
},
{
"epoch": 0.72,
"learning_rate": 0.0001549934336447321,
"loss": 3.1486,
"step": 2040
},
{
"epoch": 0.72,
"learning_rate": 0.00015137516471216422,
"loss": 3.1247,
"step": 2050
},
{
"epoch": 0.73,
"learning_rate": 0.00014778974765811928,
"loss": 3.0984,
"step": 2060
},
{
"epoch": 0.73,
"learning_rate": 0.00014423765623931364,
"loss": 3.1173,
"step": 2070
},
{
"epoch": 0.73,
"learning_rate": 0.0001407193598090021,
"loss": 3.147,
"step": 2080
},
{
"epoch": 0.74,
"learning_rate": 0.000137235323254962,
"loss": 3.147,
"step": 2090
},
{
"epoch": 0.74,
"learning_rate": 0.00013378600693806378,
"loss": 3.1328,
"step": 2100
},
{
"epoch": 0.74,
"eval_accuracy": 0.4185663559848014,
"eval_loss": 3.1282718181610107,
"eval_runtime": 150.9461,
"eval_samples_per_second": 33.555,
"eval_steps_per_second": 4.2,
"step": 2100
},
{
"epoch": 0.74,
"learning_rate": 0.0001303718666314425,
"loss": 3.1565,
"step": 2110
},
{
"epoch": 0.75,
"learning_rate": 0.00012699335346027447,
"loss": 3.1537,
"step": 2120
},
{
"epoch": 0.75,
"learning_rate": 0.0001236509138421674,
"loss": 3.1268,
"step": 2130
},
{
"epoch": 0.76,
"learning_rate": 0.00012034498942817482,
"loss": 3.1449,
"step": 2140
},
{
"epoch": 0.76,
"learning_rate": 0.0001170760170444369,
"loss": 3.1358,
"step": 2150
},
{
"epoch": 0.76,
"learning_rate": 0.00011384442863446211,
"loss": 3.0987,
"step": 2160
},
{
"epoch": 0.77,
"learning_rate": 0.00011065065120205264,
"loss": 3.1229,
"step": 2170
},
{
"epoch": 0.77,
"learning_rate": 0.00010749510675488115,
"loss": 3.0585,
"step": 2180
},
{
"epoch": 0.77,
"learning_rate": 0.00010437821224873104,
"loss": 3.1092,
"step": 2190
},
{
"epoch": 0.78,
"learning_rate": 0.00010130037953240043,
"loss": 3.0916,
"step": 2200
},
{
"epoch": 0.78,
"eval_accuracy": 0.4215069268054616,
"eval_loss": 3.105459690093994,
"eval_runtime": 151.0796,
"eval_samples_per_second": 33.525,
"eval_steps_per_second": 4.196,
"step": 2200
},
{
"epoch": 0.78,
"learning_rate": 9.826201529328414e-05,
"loss": 3.1225,
"step": 2210
},
{
"epoch": 0.78,
"learning_rate": 9.526352100363562e-05,
"loss": 3.0914,
"step": 2220
},
{
"epoch": 0.79,
"learning_rate": 9.230529286751886e-05,
"loss": 3.1395,
"step": 2230
},
{
"epoch": 0.79,
"learning_rate": 8.938772176845631e-05,
"loss": 3.0953,
"step": 2240
},
{
"epoch": 0.79,
"learning_rate": 8.651119321777952e-05,
"loss": 3.0434,
"step": 2250
},
{
"epoch": 0.8,
"learning_rate": 8.367608730369015e-05,
"loss": 3.0658,
"step": 2260
},
{
"epoch": 0.8,
"learning_rate": 8.088277864103697e-05,
"loss": 3.0928,
"step": 2270
},
{
"epoch": 0.8,
"learning_rate": 7.81316363218167e-05,
"loss": 3.0746,
"step": 2280
},
{
"epoch": 0.81,
"learning_rate": 7.542302386640385e-05,
"loss": 3.0813,
"step": 2290
},
{
"epoch": 0.81,
"learning_rate": 7.27572991755178e-05,
"loss": 3.0939,
"step": 2300
},
{
"epoch": 0.81,
"eval_accuracy": 0.42381410978240375,
"eval_loss": 3.0875043869018555,
"eval_runtime": 152.0694,
"eval_samples_per_second": 33.307,
"eval_steps_per_second": 4.169,
"step": 2300
},
{
"epoch": 0.82,
"learning_rate": 7.013481448293085e-05,
"loss": 3.0575,
"step": 2310
},
{
"epoch": 0.82,
"learning_rate": 6.755591630892744e-05,
"loss": 3.0907,
"step": 2320
},
{
"epoch": 0.82,
"learning_rate": 6.502094541451573e-05,
"loss": 3.0385,
"step": 2330
},
{
"epoch": 0.83,
"learning_rate": 6.253023675640158e-05,
"loss": 3.1125,
"step": 2340
},
{
"epoch": 0.83,
"learning_rate": 6.008411944273e-05,
"loss": 3.0955,
"step": 2350
},
{
"epoch": 0.83,
"learning_rate": 5.7682916689597535e-05,
"loss": 3.0929,
"step": 2360
},
{
"epoch": 0.84,
"learning_rate": 5.5326945778345586e-05,
"loss": 3.0564,
"step": 2370
},
{
"epoch": 0.84,
"learning_rate": 5.301651801363528e-05,
"loss": 3.0774,
"step": 2380
},
{
"epoch": 0.84,
"learning_rate": 5.075193868231454e-05,
"loss": 3.0838,
"step": 2390
},
{
"epoch": 0.85,
"learning_rate": 4.853350701307897e-05,
"loss": 3.0584,
"step": 2400
},
{
"epoch": 0.85,
"eval_accuracy": 0.42569305429031773,
"eval_loss": 3.073211431503296,
"eval_runtime": 151.0975,
"eval_samples_per_second": 33.521,
"eval_steps_per_second": 4.196,
"step": 2400
},
{
"epoch": 0.85,
"learning_rate": 4.636151613693276e-05,
"loss": 3.0236,
"step": 2410
},
{
"epoch": 0.85,
"learning_rate": 4.423625304845702e-05,
"loss": 3.0852,
"step": 2420
},
{
"epoch": 0.86,
"learning_rate": 4.215799856788727e-05,
"loss": 3.1131,
"step": 2430
},
{
"epoch": 0.86,
"learning_rate": 4.012702730400766e-05,
"loss": 3.0559,
"step": 2440
},
{
"epoch": 0.86,
"learning_rate": 3.8143607617865796e-05,
"loss": 3.0686,
"step": 2450
},
{
"epoch": 0.87,
"learning_rate": 3.620800158731288e-05,
"loss": 3.0508,
"step": 2460
},
{
"epoch": 0.87,
"learning_rate": 3.4320464972374246e-05,
"loss": 3.0623,
"step": 2470
},
{
"epoch": 0.88,
"learning_rate": 3.24812471814548e-05,
"loss": 3.0673,
"step": 2480
},
{
"epoch": 0.88,
"learning_rate": 3.069059123838347e-05,
"loss": 3.0689,
"step": 2490
},
{
"epoch": 0.88,
"learning_rate": 2.894873375030156e-05,
"loss": 3.0711,
"step": 2500
},
{
"epoch": 0.88,
"eval_accuracy": 0.42705097532758074,
"eval_loss": 3.0630664825439453,
"eval_runtime": 151.1382,
"eval_samples_per_second": 33.512,
"eval_steps_per_second": 4.195,
"step": 2500
},
{
"epoch": 0.89,
"learning_rate": 2.7255904876398687e-05,
"loss": 3.0574,
"step": 2510
},
{
"epoch": 0.89,
"learning_rate": 2.5612328297500663e-05,
"loss": 3.0566,
"step": 2520
},
{
"epoch": 0.89,
"learning_rate": 2.4018221186514223e-05,
"loss": 3.0702,
"step": 2530
},
{
"epoch": 0.9,
"learning_rate": 2.2473794179730344e-05,
"loss": 3.0446,
"step": 2540
},
{
"epoch": 0.9,
"learning_rate": 2.0979251348992235e-05,
"loss": 3.0475,
"step": 2550
},
{
"epoch": 0.9,
"learning_rate": 1.953479017473052e-05,
"loss": 3.065,
"step": 2560
},
{
"epoch": 0.91,
"learning_rate": 1.8140601519869026e-05,
"loss": 3.0562,
"step": 2570
},
{
"epoch": 0.91,
"learning_rate": 1.6796869604605735e-05,
"loss": 3.0301,
"step": 2580
},
{
"epoch": 0.91,
"learning_rate": 1.5503771982070226e-05,
"loss": 3.0694,
"step": 2590
},
{
"epoch": 0.92,
"learning_rate": 1.4261479514863452e-05,
"loss": 3.0612,
"step": 2600
},
{
"epoch": 0.92,
"eval_accuracy": 0.4279909780571187,
"eval_loss": 3.0565025806427,
"eval_runtime": 151.9486,
"eval_samples_per_second": 33.334,
"eval_steps_per_second": 4.172,
"step": 2600
},
{
"epoch": 0.92,
"learning_rate": 1.3070156352480877e-05,
"loss": 3.0469,
"step": 2610
},
{
"epoch": 0.92,
"learning_rate": 1.1929959909622045e-05,
"loss": 3.0645,
"step": 2620
},
{
"epoch": 0.93,
"learning_rate": 1.084104084539166e-05,
"loss": 3.0639,
"step": 2630
},
{
"epoch": 0.93,
"learning_rate": 9.803543043391417e-06,
"loss": 3.0735,
"step": 2640
},
{
"epoch": 0.94,
"learning_rate": 8.817603592708779e-06,
"loss": 3.02,
"step": 2650
},
{
"epoch": 0.94,
"learning_rate": 7.8833527698023e-06,
"loss": 3.0485,
"step": 2660
},
{
"epoch": 0.94,
"learning_rate": 7.0009140212878854e-06,
"loss": 3.0469,
"step": 2670
},
{
"epoch": 0.95,
"learning_rate": 6.170403947627179e-06,
"loss": 3.033,
"step": 2680
},
{
"epoch": 0.95,
"learning_rate": 5.39193228772068e-06,
"loss": 3.0413,
"step": 2690
},
{
"epoch": 0.95,
"learning_rate": 4.665601904407347e-06,
"loss": 3.081,
"step": 2700
},
{
"epoch": 0.95,
"eval_accuracy": 0.42835112275156717,
"eval_loss": 3.0534024238586426,
"eval_runtime": 151.3942,
"eval_samples_per_second": 33.456,
"eval_steps_per_second": 4.188,
"step": 2700
},
{
"epoch": 0.96,
"learning_rate": 3.99150877087302e-06,
"loss": 3.0195,
"step": 2710
},
{
"epoch": 0.96,
"learning_rate": 3.3697419579690194e-06,
"loss": 3.0411,
"step": 2720
},
{
"epoch": 0.96,
"learning_rate": 2.800383622442837e-06,
"loss": 3.0234,
"step": 2730
},
{
"epoch": 0.97,
"learning_rate": 2.2835089960823395e-06,
"loss": 3.0571,
"step": 2740
},
{
"epoch": 0.97,
"learning_rate": 1.8191863757751392e-06,
"loss": 3.0598,
"step": 2750
},
{
"epoch": 0.97,
"learning_rate": 1.4074771144842568e-06,
"loss": 3.0829,
"step": 2760
},
{
"epoch": 0.98,
"learning_rate": 1.04843561314123e-06,
"loss": 3.0236,
"step": 2770
},
{
"epoch": 0.98,
"learning_rate": 7.421093134578616e-07,
"loss": 3.0504,
"step": 2780
},
{
"epoch": 0.98,
"learning_rate": 4.885386916575474e-07,
"loss": 3.0446,
"step": 2790
},
{
"epoch": 0.99,
"learning_rate": 2.877572531271078e-07,
"loss": 3.0378,
"step": 2800
},
{
"epoch": 0.99,
"eval_accuracy": 0.4284987878632974,
"eval_loss": 3.052541494369507,
"eval_runtime": 151.2777,
"eval_samples_per_second": 33.481,
"eval_steps_per_second": 4.191,
"step": 2800
},
{
"epoch": 0.99,
"learning_rate": 1.3979152798935247e-07,
"loss": 3.0729,
"step": 2810
},
{
"epoch": 1.0,
"learning_rate": 4.4661067597751015e-08,
"loss": 2.9979,
"step": 2820
},
{
"epoch": 1.0,
"learning_rate": 2.3784419529437883e-09,
"loss": 3.1122,
"step": 2830
},
{
"epoch": 1.0,
"step": 2833,
"total_flos": 4.4316720769479475e+17,
"train_loss": 3.7639108537323183,
"train_runtime": 10013.9285,
"train_samples_per_second": 9.052,
"train_steps_per_second": 0.283
}
],
"logging_steps": 10,
"max_steps": 2833,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 1000.0,
"total_flos": 4.4316720769479475e+17,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}