diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,9597 @@ +{ + "best_metric": 1.610386610031128, + "best_model_checkpoint": "/home/nlplab5/Desktop/roberta-pretrain/ckpt/roberta/pretrain/medium/checkpoint-11880", + "epoch": 8.72136323160366, + "eval_steps": 90, + "global_step": 11970, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.01, + "grad_norm": 3.2670376300811768, + "learning_rate": 4.166666666666667e-06, + "loss": 7.2579, + "step": 10 + }, + { + "epoch": 0.02, + "grad_norm": 3.1587345600128174, + "learning_rate": 8.333333333333334e-06, + "loss": 7.2077, + "step": 20 + }, + { + "epoch": 0.03, + "grad_norm": 2.9462554454803467, + "learning_rate": 1.25e-05, + "loss": 7.1099, + "step": 30 + }, + { + "epoch": 0.03, + "grad_norm": 2.7092673778533936, + "learning_rate": 1.6666666666666667e-05, + "loss": 6.9866, + "step": 40 + }, + { + "epoch": 0.04, + "grad_norm": 2.605360269546509, + "learning_rate": 2.0833333333333333e-05, + "loss": 6.87, + "step": 50 + }, + { + "epoch": 0.05, + "grad_norm": 2.5747287273406982, + "learning_rate": 2.5e-05, + "loss": 6.7736, + "step": 60 + }, + { + "epoch": 0.06, + "grad_norm": 2.551903247833252, + "learning_rate": 2.9166666666666666e-05, + "loss": 6.6903, + "step": 70 + }, + { + "epoch": 0.07, + "grad_norm": 2.5231289863586426, + "learning_rate": 3.3333333333333335e-05, + "loss": 6.6151, + "step": 80 + }, + { + "epoch": 0.08, + "grad_norm": 2.503075361251831, + "learning_rate": 3.75e-05, + "loss": 6.5462, + "step": 90 + }, + { + "epoch": 0.08, + "eval_accuracy": 0.2242063046599867, + "eval_loss": 6.500818729400635, + "eval_runtime": 1083.9947, + "eval_samples_per_second": 460.679, + "eval_steps_per_second": 2.399, + "step": 90 + }, + { + "epoch": 0.08, + "grad_norm": 2.4953837394714355, + "learning_rate": 4.1666666666666665e-05, + "loss": 6.4802, + "step": 100 + }, + { + "epoch": 0.09, + "grad_norm": 2.4765946865081787, + "learning_rate": 4.5833333333333334e-05, + "loss": 6.4127, + "step": 110 + }, + { + "epoch": 0.1, + "grad_norm": 2.4805634021759033, + "learning_rate": 5e-05, + "loss": 6.3425, + "step": 120 + }, + { + "epoch": 0.11, + "grad_norm": 2.4720654487609863, + "learning_rate": 5.416666666666667e-05, + "loss": 6.2706, + "step": 130 + }, + { + "epoch": 0.12, + "grad_norm": 2.454899787902832, + "learning_rate": 5.833333333333333e-05, + "loss": 6.1941, + "step": 140 + }, + { + "epoch": 0.13, + "grad_norm": 2.4357142448425293, + "learning_rate": 6.25e-05, + "loss": 6.1169, + "step": 150 + }, + { + "epoch": 0.14, + "grad_norm": 2.4193003177642822, + "learning_rate": 6.666666666666667e-05, + "loss": 6.0351, + "step": 160 + }, + { + "epoch": 0.14, + "grad_norm": 2.396649122238159, + "learning_rate": 7.083333333333334e-05, + "loss": 5.9532, + "step": 170 + }, + { + "epoch": 0.15, + "grad_norm": 2.3566408157348633, + "learning_rate": 7.5e-05, + "loss": 5.8695, + "step": 180 + }, + { + "epoch": 0.15, + "eval_accuracy": 0.22404351306840548, + "eval_loss": 5.819457530975342, + "eval_runtime": 1077.6772, + "eval_samples_per_second": 463.38, + "eval_steps_per_second": 2.414, + "step": 180 + }, + { + "epoch": 0.16, + "grad_norm": 2.310816526412964, + "learning_rate": 7.916666666666666e-05, + "loss": 5.7876, + "step": 190 + }, + { + "epoch": 0.17, + "grad_norm": 2.2797272205352783, + "learning_rate": 8.333333333333333e-05, + "loss": 5.706, + "step": 200 + }, + { + "epoch": 0.18, + "grad_norm": 2.2188355922698975, + "learning_rate": 8.75e-05, + "loss": 5.6255, + "step": 210 + }, + { + "epoch": 0.19, + "grad_norm": 2.142122745513916, + "learning_rate": 9.166666666666667e-05, + "loss": 5.5471, + "step": 220 + }, + { + "epoch": 0.2, + "grad_norm": 2.069880485534668, + "learning_rate": 9.583333333333334e-05, + "loss": 5.4704, + "step": 230 + }, + { + "epoch": 0.2, + "grad_norm": 1.957664966583252, + "learning_rate": 0.0001, + "loss": 5.4003, + "step": 240 + }, + { + "epoch": 0.21, + "grad_norm": 1.8441264629364014, + "learning_rate": 0.00010416666666666667, + "loss": 5.3306, + "step": 250 + }, + { + "epoch": 0.22, + "grad_norm": 1.722961664199829, + "learning_rate": 0.00010833333333333334, + "loss": 5.2648, + "step": 260 + }, + { + "epoch": 0.23, + "grad_norm": 1.5622942447662354, + "learning_rate": 0.00011250000000000001, + "loss": 5.2004, + "step": 270 + }, + { + "epoch": 0.23, + "eval_accuracy": 0.22578753539560809, + "eval_loss": 5.162991046905518, + "eval_runtime": 1079.7321, + "eval_samples_per_second": 462.498, + "eval_steps_per_second": 2.409, + "step": 270 + }, + { + "epoch": 0.24, + "grad_norm": 1.4034879207611084, + "learning_rate": 0.00011666666666666667, + "loss": 5.144, + "step": 280 + }, + { + "epoch": 0.25, + "grad_norm": 1.2136635780334473, + "learning_rate": 0.00012083333333333333, + "loss": 5.087, + "step": 290 + }, + { + "epoch": 0.25, + "grad_norm": 0.9789605140686035, + "learning_rate": 0.000125, + "loss": 5.0347, + "step": 300 + }, + { + "epoch": 0.26, + "grad_norm": 0.7709304094314575, + "learning_rate": 0.00012916666666666667, + "loss": 4.9873, + "step": 310 + }, + { + "epoch": 0.27, + "grad_norm": 0.5693560838699341, + "learning_rate": 0.00013333333333333334, + "loss": 4.95, + "step": 320 + }, + { + "epoch": 0.28, + "grad_norm": 0.42085811495780945, + "learning_rate": 0.0001375, + "loss": 4.9181, + "step": 330 + }, + { + "epoch": 0.29, + "grad_norm": 0.3126681447029114, + "learning_rate": 0.00014166666666666668, + "loss": 4.8959, + "step": 340 + }, + { + "epoch": 0.3, + "grad_norm": 0.24236658215522766, + "learning_rate": 0.00014583333333333335, + "loss": 4.876, + "step": 350 + }, + { + "epoch": 0.31, + "grad_norm": 0.23320983350276947, + "learning_rate": 0.00015, + "loss": 4.8607, + "step": 360 + }, + { + "epoch": 0.31, + "eval_accuracy": 0.2620490039433166, + "eval_loss": 4.841182231903076, + "eval_runtime": 1079.5667, + "eval_samples_per_second": 462.569, + "eval_steps_per_second": 2.409, + "step": 360 + }, + { + "epoch": 0.31, + "grad_norm": 0.23191139101982117, + "learning_rate": 0.00015416666666666668, + "loss": 4.846, + "step": 370 + }, + { + "epoch": 0.32, + "grad_norm": 0.2323000133037567, + "learning_rate": 0.00015833333333333332, + "loss": 4.8304, + "step": 380 + }, + { + "epoch": 0.33, + "grad_norm": 0.2213001251220703, + "learning_rate": 0.00016250000000000002, + "loss": 4.816, + "step": 390 + }, + { + "epoch": 0.34, + "grad_norm": 0.21700094640254974, + "learning_rate": 0.00016666666666666666, + "loss": 4.8044, + "step": 400 + }, + { + "epoch": 0.35, + "grad_norm": 0.27367648482322693, + "learning_rate": 0.00017083333333333333, + "loss": 4.787, + "step": 410 + }, + { + "epoch": 0.36, + "grad_norm": 0.3331514000892639, + "learning_rate": 0.000175, + "loss": 4.775, + "step": 420 + }, + { + "epoch": 0.37, + "grad_norm": 0.3531811833381653, + "learning_rate": 0.00017916666666666667, + "loss": 4.7599, + "step": 430 + }, + { + "epoch": 0.37, + "grad_norm": 0.17425844073295593, + "learning_rate": 0.00018333333333333334, + "loss": 4.7471, + "step": 440 + }, + { + "epoch": 0.38, + "grad_norm": 0.16081774234771729, + "learning_rate": 0.0001875, + "loss": 4.732, + "step": 450 + }, + { + "epoch": 0.38, + "eval_accuracy": 0.2854636107287403, + "eval_loss": 4.713276386260986, + "eval_runtime": 1135.1993, + "eval_samples_per_second": 439.9, + "eval_steps_per_second": 2.291, + "step": 450 + }, + { + "epoch": 0.39, + "grad_norm": 0.5644450783729553, + "learning_rate": 0.00019166666666666667, + "loss": 4.7196, + "step": 460 + }, + { + "epoch": 0.4, + "grad_norm": 0.4822804629802704, + "learning_rate": 0.00019583333333333334, + "loss": 4.7067, + "step": 470 + }, + { + "epoch": 0.41, + "grad_norm": 0.26140686869621277, + "learning_rate": 0.0002, + "loss": 4.6939, + "step": 480 + }, + { + "epoch": 0.42, + "grad_norm": 0.24341146647930145, + "learning_rate": 0.00020416666666666668, + "loss": 4.6797, + "step": 490 + }, + { + "epoch": 0.42, + "grad_norm": 0.1883888840675354, + "learning_rate": 0.00020833333333333335, + "loss": 4.667, + "step": 500 + }, + { + "epoch": 0.43, + "grad_norm": 0.32793405652046204, + "learning_rate": 0.0002125, + "loss": 4.6568, + "step": 510 + }, + { + "epoch": 0.44, + "grad_norm": 0.6900771856307983, + "learning_rate": 0.00021666666666666668, + "loss": 4.6504, + "step": 520 + }, + { + "epoch": 0.45, + "grad_norm": 0.23155897855758667, + "learning_rate": 0.00022083333333333333, + "loss": 4.6371, + "step": 530 + }, + { + "epoch": 0.46, + "grad_norm": 0.15708310902118683, + "learning_rate": 0.00022500000000000002, + "loss": 4.6273, + "step": 540 + }, + { + "epoch": 0.46, + "eval_accuracy": 0.2875459051329938, + "eval_loss": 4.6114654541015625, + "eval_runtime": 1075.2001, + "eval_samples_per_second": 464.447, + "eval_steps_per_second": 2.419, + "step": 540 + }, + { + "epoch": 0.47, + "grad_norm": 0.14138343930244446, + "learning_rate": 0.00022916666666666666, + "loss": 4.6187, + "step": 550 + }, + { + "epoch": 0.48, + "grad_norm": 0.10016636550426483, + "learning_rate": 0.00023333333333333333, + "loss": 4.6101, + "step": 560 + }, + { + "epoch": 0.48, + "grad_norm": 0.09886801242828369, + "learning_rate": 0.0002375, + "loss": 4.6027, + "step": 570 + }, + { + "epoch": 0.49, + "grad_norm": 0.9051061272621155, + "learning_rate": 0.00024166666666666667, + "loss": 4.5993, + "step": 580 + }, + { + "epoch": 0.5, + "grad_norm": 0.8797232508659363, + "learning_rate": 0.0002458333333333333, + "loss": 4.5982, + "step": 590 + }, + { + "epoch": 0.51, + "grad_norm": 0.4441538453102112, + "learning_rate": 0.00025, + "loss": 4.5868, + "step": 600 + }, + { + "epoch": 0.52, + "grad_norm": 0.24725468456745148, + "learning_rate": 0.00025416666666666665, + "loss": 4.5836, + "step": 610 + }, + { + "epoch": 0.53, + "grad_norm": 0.08581159263849258, + "learning_rate": 0.00025833333333333334, + "loss": 4.5773, + "step": 620 + }, + { + "epoch": 0.54, + "grad_norm": 0.10642833262681961, + "learning_rate": 0.00026250000000000004, + "loss": 4.572, + "step": 630 + }, + { + "epoch": 0.54, + "eval_accuracy": 0.28878817310094446, + "eval_loss": 4.563485622406006, + "eval_runtime": 1076.0022, + "eval_samples_per_second": 464.101, + "eval_steps_per_second": 2.417, + "step": 630 + }, + { + "epoch": 0.54, + "grad_norm": 0.17463397979736328, + "learning_rate": 0.0002666666666666667, + "loss": 4.5707, + "step": 640 + }, + { + "epoch": 0.55, + "grad_norm": 0.1806878000497818, + "learning_rate": 0.0002708333333333333, + "loss": 4.5681, + "step": 650 + }, + { + "epoch": 0.56, + "grad_norm": 0.18553458154201508, + "learning_rate": 0.000275, + "loss": 4.5612, + "step": 660 + }, + { + "epoch": 0.57, + "grad_norm": 0.7338638305664062, + "learning_rate": 0.00027916666666666666, + "loss": 4.5601, + "step": 670 + }, + { + "epoch": 0.58, + "grad_norm": 0.3852124512195587, + "learning_rate": 0.00028333333333333335, + "loss": 4.5567, + "step": 680 + }, + { + "epoch": 0.59, + "grad_norm": 0.2558722198009491, + "learning_rate": 0.0002875, + "loss": 4.555, + "step": 690 + }, + { + "epoch": 0.59, + "grad_norm": 0.1996091902256012, + "learning_rate": 0.0002916666666666667, + "loss": 4.553, + "step": 700 + }, + { + "epoch": 0.6, + "grad_norm": 0.414126992225647, + "learning_rate": 0.00029583333333333333, + "loss": 4.551, + "step": 710 + }, + { + "epoch": 0.61, + "grad_norm": 0.6828728914260864, + "learning_rate": 0.0003, + "loss": 4.5485, + "step": 720 + }, + { + "epoch": 0.61, + "eval_accuracy": 0.2897431184302517, + "eval_loss": 4.544471263885498, + "eval_runtime": 1075.524, + "eval_samples_per_second": 464.308, + "eval_steps_per_second": 2.418, + "step": 720 + }, + { + "epoch": 0.62, + "grad_norm": 0.31978073716163635, + "learning_rate": 0.00030416666666666667, + "loss": 4.5478, + "step": 730 + }, + { + "epoch": 0.63, + "grad_norm": 0.23724275827407837, + "learning_rate": 0.00030833333333333337, + "loss": 4.5451, + "step": 740 + }, + { + "epoch": 0.64, + "grad_norm": 1.0507104396820068, + "learning_rate": 0.0003125, + "loss": 4.5441, + "step": 750 + }, + { + "epoch": 0.65, + "grad_norm": 0.38554638624191284, + "learning_rate": 0.00031666666666666665, + "loss": 4.5433, + "step": 760 + }, + { + "epoch": 0.65, + "grad_norm": 0.275704562664032, + "learning_rate": 0.00032083333333333334, + "loss": 4.5415, + "step": 770 + }, + { + "epoch": 0.66, + "grad_norm": 0.2305123656988144, + "learning_rate": 0.00032500000000000004, + "loss": 4.5398, + "step": 780 + }, + { + "epoch": 0.67, + "grad_norm": 0.4813285768032074, + "learning_rate": 0.0003291666666666667, + "loss": 4.5391, + "step": 790 + }, + { + "epoch": 0.68, + "grad_norm": 0.6520434617996216, + "learning_rate": 0.0003333333333333333, + "loss": 4.5361, + "step": 800 + }, + { + "epoch": 0.69, + "grad_norm": 0.3875904381275177, + "learning_rate": 0.0003375, + "loss": 4.5351, + "step": 810 + }, + { + "epoch": 0.69, + "eval_accuracy": 0.29011102141892453, + "eval_loss": 4.53138542175293, + "eval_runtime": 1074.6899, + "eval_samples_per_second": 464.668, + "eval_steps_per_second": 2.42, + "step": 810 + }, + { + "epoch": 0.7, + "grad_norm": 0.6874526143074036, + "learning_rate": 0.00034166666666666666, + "loss": 4.5338, + "step": 820 + }, + { + "epoch": 0.71, + "grad_norm": 0.4180966019630432, + "learning_rate": 0.00034583333333333335, + "loss": 4.5331, + "step": 830 + }, + { + "epoch": 0.71, + "grad_norm": 0.41761451959609985, + "learning_rate": 0.00035, + "loss": 4.5291, + "step": 840 + }, + { + "epoch": 0.72, + "grad_norm": 0.6088799834251404, + "learning_rate": 0.0003541666666666667, + "loss": 4.5303, + "step": 850 + }, + { + "epoch": 0.73, + "grad_norm": 0.20438095927238464, + "learning_rate": 0.00035833333333333333, + "loss": 4.5298, + "step": 860 + }, + { + "epoch": 0.74, + "grad_norm": 0.4336546063423157, + "learning_rate": 0.0003625, + "loss": 4.5283, + "step": 870 + }, + { + "epoch": 0.75, + "grad_norm": 0.40454909205436707, + "learning_rate": 0.00036666666666666667, + "loss": 4.5301, + "step": 880 + }, + { + "epoch": 0.76, + "grad_norm": 0.4893989562988281, + "learning_rate": 0.00037083333333333337, + "loss": 4.5286, + "step": 890 + }, + { + "epoch": 0.76, + "grad_norm": 0.4546484053134918, + "learning_rate": 0.000375, + "loss": 4.5263, + "step": 900 + }, + { + "epoch": 0.76, + "eval_accuracy": 0.2903379261149848, + "eval_loss": 4.5242390632629395, + "eval_runtime": 1074.5238, + "eval_samples_per_second": 464.74, + "eval_steps_per_second": 2.421, + "step": 900 + }, + { + "epoch": 0.77, + "grad_norm": 0.22772909700870514, + "learning_rate": 0.00037916666666666665, + "loss": 4.5251, + "step": 910 + }, + { + "epoch": 0.78, + "grad_norm": 0.48488083481788635, + "learning_rate": 0.00038333333333333334, + "loss": 4.524, + "step": 920 + }, + { + "epoch": 0.79, + "grad_norm": 0.21915870904922485, + "learning_rate": 0.00038750000000000004, + "loss": 4.5276, + "step": 930 + }, + { + "epoch": 0.8, + "grad_norm": 0.4013586938381195, + "learning_rate": 0.0003916666666666667, + "loss": 4.5263, + "step": 940 + }, + { + "epoch": 0.81, + "grad_norm": 0.6083785891532898, + "learning_rate": 0.0003958333333333333, + "loss": 4.5226, + "step": 950 + }, + { + "epoch": 0.82, + "grad_norm": 0.5477403402328491, + "learning_rate": 0.0004, + "loss": 4.5223, + "step": 960 + }, + { + "epoch": 0.82, + "grad_norm": 0.25517192482948303, + "learning_rate": 0.00040416666666666666, + "loss": 4.5213, + "step": 970 + }, + { + "epoch": 0.83, + "grad_norm": 0.5681092739105225, + "learning_rate": 0.00040833333333333336, + "loss": 4.5213, + "step": 980 + }, + { + "epoch": 0.84, + "grad_norm": 0.27979689836502075, + "learning_rate": 0.0004125, + "loss": 4.5223, + "step": 990 + }, + { + "epoch": 0.84, + "eval_accuracy": 0.2904179370099399, + "eval_loss": 4.5171709060668945, + "eval_runtime": 1074.3659, + "eval_samples_per_second": 464.808, + "eval_steps_per_second": 2.421, + "step": 990 + }, + { + "epoch": 0.85, + "grad_norm": 0.278210312128067, + "learning_rate": 0.0004166666666666667, + "loss": 4.5185, + "step": 1000 + }, + { + "epoch": 0.86, + "grad_norm": 0.31496554613113403, + "learning_rate": 0.00042083333333333333, + "loss": 4.5184, + "step": 1010 + }, + { + "epoch": 0.87, + "grad_norm": 0.28795501589775085, + "learning_rate": 0.000425, + "loss": 4.5202, + "step": 1020 + }, + { + "epoch": 0.88, + "grad_norm": 0.1585451066493988, + "learning_rate": 0.00042916666666666667, + "loss": 4.5177, + "step": 1030 + }, + { + "epoch": 0.88, + "grad_norm": 0.25573596358299255, + "learning_rate": 0.00043333333333333337, + "loss": 4.5157, + "step": 1040 + }, + { + "epoch": 0.89, + "grad_norm": 0.9261253476142883, + "learning_rate": 0.0004375, + "loss": 4.5155, + "step": 1050 + }, + { + "epoch": 0.9, + "grad_norm": 0.20488545298576355, + "learning_rate": 0.00044166666666666665, + "loss": 4.5171, + "step": 1060 + }, + { + "epoch": 0.91, + "grad_norm": 0.19982470571994781, + "learning_rate": 0.00044583333333333335, + "loss": 4.5157, + "step": 1070 + }, + { + "epoch": 0.92, + "grad_norm": 0.2195570170879364, + "learning_rate": 0.00045000000000000004, + "loss": 4.511, + "step": 1080 + }, + { + "epoch": 0.92, + "eval_accuracy": 0.29042484814802466, + "eval_loss": 4.511170387268066, + "eval_runtime": 1074.2406, + "eval_samples_per_second": 464.862, + "eval_steps_per_second": 2.421, + "step": 1080 + }, + { + "epoch": 0.93, + "grad_norm": 0.5774135589599609, + "learning_rate": 0.0004541666666666667, + "loss": 4.5159, + "step": 1090 + }, + { + "epoch": 0.93, + "grad_norm": 0.3652968108654022, + "learning_rate": 0.0004583333333333333, + "loss": 4.5152, + "step": 1100 + }, + { + "epoch": 0.94, + "grad_norm": 0.4962700307369232, + "learning_rate": 0.0004625, + "loss": 4.5139, + "step": 1110 + }, + { + "epoch": 0.95, + "grad_norm": 0.3226447105407715, + "learning_rate": 0.00046666666666666666, + "loss": 4.5117, + "step": 1120 + }, + { + "epoch": 0.96, + "grad_norm": 0.5751166939735413, + "learning_rate": 0.00047083333333333336, + "loss": 4.5131, + "step": 1130 + }, + { + "epoch": 0.97, + "grad_norm": 0.181748166680336, + "learning_rate": 0.000475, + "loss": 4.5136, + "step": 1140 + }, + { + "epoch": 0.98, + "grad_norm": 0.6175718903541565, + "learning_rate": 0.0004791666666666667, + "loss": 4.5107, + "step": 1150 + }, + { + "epoch": 0.99, + "grad_norm": 0.261405348777771, + "learning_rate": 0.00048333333333333334, + "loss": 4.5124, + "step": 1160 + }, + { + "epoch": 0.99, + "grad_norm": 0.18674370646476746, + "learning_rate": 0.0004875, + "loss": 4.5093, + "step": 1170 + }, + { + "epoch": 0.99, + "eval_accuracy": 0.2903912366894582, + "eval_loss": 4.507014274597168, + "eval_runtime": 1075.0378, + "eval_samples_per_second": 464.518, + "eval_steps_per_second": 2.419, + "step": 1170 + }, + { + "epoch": 1.0, + "grad_norm": 0.4493379294872284, + "learning_rate": 0.0004916666666666666, + "loss": 4.5107, + "step": 1180 + }, + { + "epoch": 1.01, + "grad_norm": 0.40603315830230713, + "learning_rate": 0.0004958333333333334, + "loss": 4.5103, + "step": 1190 + }, + { + "epoch": 1.02, + "grad_norm": 0.17475590109825134, + "learning_rate": 0.0005, + "loss": 4.507, + "step": 1200 + }, + { + "epoch": 1.03, + "grad_norm": 0.6670963168144226, + "learning_rate": 0.0005041666666666667, + "loss": 4.5076, + "step": 1210 + }, + { + "epoch": 1.04, + "grad_norm": 0.20023925602436066, + "learning_rate": 0.0005083333333333333, + "loss": 4.5092, + "step": 1220 + }, + { + "epoch": 1.05, + "grad_norm": 0.38801464438438416, + "learning_rate": 0.0005124999999999999, + "loss": 4.5047, + "step": 1230 + }, + { + "epoch": 1.05, + "grad_norm": 0.4637294113636017, + "learning_rate": 0.0005166666666666667, + "loss": 4.5088, + "step": 1240 + }, + { + "epoch": 1.06, + "grad_norm": 0.17603175342082977, + "learning_rate": 0.0005208333333333334, + "loss": 4.5057, + "step": 1250 + }, + { + "epoch": 1.07, + "grad_norm": 0.4553210735321045, + "learning_rate": 0.0005250000000000001, + "loss": 4.505, + "step": 1260 + }, + { + "epoch": 1.07, + "eval_accuracy": 0.29054962247372956, + "eval_loss": 4.502260208129883, + "eval_runtime": 1075.7242, + "eval_samples_per_second": 464.221, + "eval_steps_per_second": 2.418, + "step": 1260 + }, + { + "epoch": 1.08, + "grad_norm": 0.30805012583732605, + "learning_rate": 0.0005291666666666667, + "loss": 4.5032, + "step": 1270 + }, + { + "epoch": 1.09, + "grad_norm": 0.5969117283821106, + "learning_rate": 0.0005333333333333334, + "loss": 4.5036, + "step": 1280 + }, + { + "epoch": 1.1, + "grad_norm": 0.24606676399707794, + "learning_rate": 0.0005375, + "loss": 4.5042, + "step": 1290 + }, + { + "epoch": 1.1, + "grad_norm": 0.2750067710876465, + "learning_rate": 0.0005416666666666666, + "loss": 4.5037, + "step": 1300 + }, + { + "epoch": 1.11, + "grad_norm": 0.4421214163303375, + "learning_rate": 0.0005458333333333333, + "loss": 4.5014, + "step": 1310 + }, + { + "epoch": 1.12, + "grad_norm": 0.2441830188035965, + "learning_rate": 0.00055, + "loss": 4.5005, + "step": 1320 + }, + { + "epoch": 1.13, + "grad_norm": 0.25598272681236267, + "learning_rate": 0.0005541666666666667, + "loss": 4.5007, + "step": 1330 + }, + { + "epoch": 1.14, + "grad_norm": 0.17499062418937683, + "learning_rate": 0.0005583333333333333, + "loss": 4.5031, + "step": 1340 + }, + { + "epoch": 1.15, + "grad_norm": 0.6325914263725281, + "learning_rate": 0.0005625000000000001, + "loss": 4.5003, + "step": 1350 + }, + { + "epoch": 1.15, + "eval_accuracy": 0.2903214025778754, + "eval_loss": 4.499546051025391, + "eval_runtime": 1074.5865, + "eval_samples_per_second": 464.713, + "eval_steps_per_second": 2.42, + "step": 1350 + }, + { + "epoch": 1.16, + "grad_norm": 0.29858532547950745, + "learning_rate": 0.0005666666666666667, + "loss": 4.5014, + "step": 1360 + }, + { + "epoch": 1.16, + "grad_norm": 0.3625228703022003, + "learning_rate": 0.0005708333333333333, + "loss": 4.4991, + "step": 1370 + }, + { + "epoch": 1.17, + "grad_norm": 0.271508127450943, + "learning_rate": 0.000575, + "loss": 4.499, + "step": 1380 + }, + { + "epoch": 1.18, + "grad_norm": 0.7316662073135376, + "learning_rate": 0.0005791666666666667, + "loss": 4.498, + "step": 1390 + }, + { + "epoch": 1.19, + "grad_norm": 0.3072379529476166, + "learning_rate": 0.0005833333333333334, + "loss": 4.5021, + "step": 1400 + }, + { + "epoch": 1.2, + "grad_norm": 0.16316668689250946, + "learning_rate": 0.0005875, + "loss": 4.4963, + "step": 1410 + }, + { + "epoch": 1.21, + "grad_norm": 0.6301301717758179, + "learning_rate": 0.0005916666666666667, + "loss": 4.4978, + "step": 1420 + }, + { + "epoch": 1.22, + "grad_norm": 0.22797346115112305, + "learning_rate": 0.0005958333333333333, + "loss": 4.4964, + "step": 1430 + }, + { + "epoch": 1.22, + "grad_norm": 0.3899094760417938, + "learning_rate": 0.0006, + "loss": 4.4939, + "step": 1440 + }, + { + "epoch": 1.22, + "eval_accuracy": 0.29044920062027546, + "eval_loss": 4.489974021911621, + "eval_runtime": 1075.4503, + "eval_samples_per_second": 464.339, + "eval_steps_per_second": 2.419, + "step": 1440 + }, + { + "epoch": 1.23, + "grad_norm": 0.34080126881599426, + "learning_rate": 0.0006041666666666666, + "loss": 4.493, + "step": 1450 + }, + { + "epoch": 1.24, + "grad_norm": 1.12690007686615, + "learning_rate": 0.0006083333333333333, + "loss": 4.4904, + "step": 1460 + }, + { + "epoch": 1.25, + "grad_norm": 0.30394747853279114, + "learning_rate": 0.0006125000000000001, + "loss": 4.489, + "step": 1470 + }, + { + "epoch": 1.26, + "grad_norm": 0.4542248845100403, + "learning_rate": 0.0006166666666666667, + "loss": 4.4841, + "step": 1480 + }, + { + "epoch": 1.27, + "grad_norm": 0.3733484447002411, + "learning_rate": 0.0006208333333333334, + "loss": 4.4738, + "step": 1490 + }, + { + "epoch": 1.27, + "grad_norm": 0.31465083360671997, + "learning_rate": 0.000625, + "loss": 4.4695, + "step": 1500 + }, + { + "epoch": 1.28, + "grad_norm": 0.32257241010665894, + "learning_rate": 0.0006291666666666667, + "loss": 4.461, + "step": 1510 + }, + { + "epoch": 1.29, + "grad_norm": 0.7750576734542847, + "learning_rate": 0.0006333333333333333, + "loss": 4.4636, + "step": 1520 + }, + { + "epoch": 1.3, + "grad_norm": 0.35094720125198364, + "learning_rate": 0.0006374999999999999, + "loss": 4.4569, + "step": 1530 + }, + { + "epoch": 1.3, + "eval_accuracy": 0.2906712538997569, + "eval_loss": 4.445650100708008, + "eval_runtime": 1075.3046, + "eval_samples_per_second": 464.402, + "eval_steps_per_second": 2.419, + "step": 1530 + }, + { + "epoch": 1.31, + "grad_norm": 0.5662222504615784, + "learning_rate": 0.0006416666666666667, + "loss": 4.4542, + "step": 1540 + }, + { + "epoch": 1.32, + "grad_norm": 0.6326726675033569, + "learning_rate": 0.0006458333333333334, + "loss": 4.4509, + "step": 1550 + }, + { + "epoch": 1.33, + "grad_norm": 0.4773523807525635, + "learning_rate": 0.0006500000000000001, + "loss": 4.4468, + "step": 1560 + }, + { + "epoch": 1.33, + "grad_norm": 0.4514019191265106, + "learning_rate": 0.0006541666666666667, + "loss": 4.442, + "step": 1570 + }, + { + "epoch": 1.34, + "grad_norm": 0.5631856918334961, + "learning_rate": 0.0006583333333333334, + "loss": 4.44, + "step": 1580 + }, + { + "epoch": 1.35, + "grad_norm": 0.36760690808296204, + "learning_rate": 0.0006625, + "loss": 4.4317, + "step": 1590 + }, + { + "epoch": 1.36, + "grad_norm": 0.669217586517334, + "learning_rate": 0.0006666666666666666, + "loss": 4.4365, + "step": 1600 + }, + { + "epoch": 1.37, + "grad_norm": 0.4648076891899109, + "learning_rate": 0.0006708333333333333, + "loss": 4.4277, + "step": 1610 + }, + { + "epoch": 1.38, + "grad_norm": 0.45093855261802673, + "learning_rate": 0.000675, + "loss": 4.4215, + "step": 1620 + }, + { + "epoch": 1.38, + "eval_accuracy": 0.29071258845839876, + "eval_loss": 4.40159797668457, + "eval_runtime": 1073.4339, + "eval_samples_per_second": 465.212, + "eval_steps_per_second": 2.423, + "step": 1620 + }, + { + "epoch": 1.39, + "grad_norm": 0.6909874081611633, + "learning_rate": 0.0006791666666666667, + "loss": 4.4243, + "step": 1630 + }, + { + "epoch": 1.39, + "grad_norm": 0.5092808604240417, + "learning_rate": 0.0006833333333333333, + "loss": 4.4162, + "step": 1640 + }, + { + "epoch": 1.4, + "grad_norm": 0.7031335234642029, + "learning_rate": 0.0006875, + "loss": 4.409, + "step": 1650 + }, + { + "epoch": 1.41, + "grad_norm": 0.778090238571167, + "learning_rate": 0.0006916666666666667, + "loss": 4.4091, + "step": 1660 + }, + { + "epoch": 1.42, + "grad_norm": 0.935316801071167, + "learning_rate": 0.0006958333333333334, + "loss": 4.4059, + "step": 1670 + }, + { + "epoch": 1.43, + "grad_norm": 0.9648371934890747, + "learning_rate": 0.0007, + "loss": 4.4033, + "step": 1680 + }, + { + "epoch": 1.44, + "grad_norm": 0.524691641330719, + "learning_rate": 0.0007041666666666667, + "loss": 4.4012, + "step": 1690 + }, + { + "epoch": 1.44, + "grad_norm": 0.5595187544822693, + "learning_rate": 0.0007083333333333334, + "loss": 4.3954, + "step": 1700 + }, + { + "epoch": 1.45, + "grad_norm": 0.5809574723243713, + "learning_rate": 0.0007125, + "loss": 4.3943, + "step": 1710 + }, + { + "epoch": 1.45, + "eval_accuracy": 0.29116029691107925, + "eval_loss": 4.3684492111206055, + "eval_runtime": 1080.2368, + "eval_samples_per_second": 462.282, + "eval_steps_per_second": 2.408, + "step": 1710 + }, + { + "epoch": 1.46, + "grad_norm": 0.4484635293483734, + "learning_rate": 0.0007166666666666667, + "loss": 4.3881, + "step": 1720 + }, + { + "epoch": 1.47, + "grad_norm": 0.6823798418045044, + "learning_rate": 0.0007208333333333333, + "loss": 4.3829, + "step": 1730 + }, + { + "epoch": 1.48, + "grad_norm": 0.7428690791130066, + "learning_rate": 0.000725, + "loss": 4.3843, + "step": 1740 + }, + { + "epoch": 1.49, + "grad_norm": 0.6587589979171753, + "learning_rate": 0.0007291666666666666, + "loss": 4.3828, + "step": 1750 + }, + { + "epoch": 1.5, + "grad_norm": 0.5860837697982788, + "learning_rate": 0.0007333333333333333, + "loss": 4.3764, + "step": 1760 + }, + { + "epoch": 1.5, + "grad_norm": 0.5413070321083069, + "learning_rate": 0.0007375000000000001, + "loss": 4.3752, + "step": 1770 + }, + { + "epoch": 1.51, + "grad_norm": 0.665489137172699, + "learning_rate": 0.0007416666666666667, + "loss": 4.3728, + "step": 1780 + }, + { + "epoch": 1.52, + "grad_norm": 0.711599588394165, + "learning_rate": 0.0007458333333333334, + "loss": 4.373, + "step": 1790 + }, + { + "epoch": 1.53, + "grad_norm": 0.6164100170135498, + "learning_rate": 0.00075, + "loss": 4.3677, + "step": 1800 + }, + { + "epoch": 1.53, + "eval_accuracy": 0.29116748362162354, + "eval_loss": 4.338656902313232, + "eval_runtime": 1078.2447, + "eval_samples_per_second": 463.136, + "eval_steps_per_second": 2.412, + "step": 1800 + }, + { + "epoch": 1.54, + "grad_norm": 0.8174536228179932, + "learning_rate": 0.0007541666666666667, + "loss": 4.3622, + "step": 1810 + }, + { + "epoch": 1.55, + "grad_norm": 0.4686708152294159, + "learning_rate": 0.0007583333333333333, + "loss": 4.3615, + "step": 1820 + }, + { + "epoch": 1.56, + "grad_norm": 0.726311206817627, + "learning_rate": 0.0007624999999999999, + "loss": 4.3553, + "step": 1830 + }, + { + "epoch": 1.56, + "grad_norm": 0.6094339489936829, + "learning_rate": 0.0007666666666666667, + "loss": 4.3584, + "step": 1840 + }, + { + "epoch": 1.57, + "grad_norm": 0.43092453479766846, + "learning_rate": 0.0007708333333333334, + "loss": 4.3515, + "step": 1850 + }, + { + "epoch": 1.58, + "grad_norm": 0.8314465284347534, + "learning_rate": 0.0007750000000000001, + "loss": 4.3503, + "step": 1860 + }, + { + "epoch": 1.59, + "grad_norm": 0.9535554647445679, + "learning_rate": 0.0007791666666666667, + "loss": 4.3459, + "step": 1870 + }, + { + "epoch": 1.6, + "grad_norm": 0.9487770795822144, + "learning_rate": 0.0007833333333333334, + "loss": 4.3438, + "step": 1880 + }, + { + "epoch": 1.61, + "grad_norm": 0.65323406457901, + "learning_rate": 0.0007875, + "loss": 4.3382, + "step": 1890 + }, + { + "epoch": 1.61, + "eval_accuracy": 0.2915768463978657, + "eval_loss": 4.299588203430176, + "eval_runtime": 1078.9106, + "eval_samples_per_second": 462.85, + "eval_steps_per_second": 2.411, + "step": 1890 + }, + { + "epoch": 1.61, + "grad_norm": 0.8234522342681885, + "learning_rate": 0.0007916666666666666, + "loss": 4.337, + "step": 1900 + }, + { + "epoch": 1.62, + "grad_norm": 0.984524667263031, + "learning_rate": 0.0007958333333333333, + "loss": 4.3342, + "step": 1910 + }, + { + "epoch": 1.63, + "grad_norm": 1.087571382522583, + "learning_rate": 0.0008, + "loss": 4.3327, + "step": 1920 + }, + { + "epoch": 1.64, + "grad_norm": 0.8180701732635498, + "learning_rate": 0.0008041666666666667, + "loss": 4.3292, + "step": 1930 + }, + { + "epoch": 1.65, + "grad_norm": 1.209524154663086, + "learning_rate": 0.0008083333333333333, + "loss": 4.3258, + "step": 1940 + }, + { + "epoch": 1.66, + "grad_norm": 1.2440215349197388, + "learning_rate": 0.0008125000000000001, + "loss": 4.322, + "step": 1950 + }, + { + "epoch": 1.67, + "grad_norm": 1.1065999269485474, + "learning_rate": 0.0008166666666666667, + "loss": 4.3207, + "step": 1960 + }, + { + "epoch": 1.67, + "grad_norm": 2.2142093181610107, + "learning_rate": 0.0008208333333333334, + "loss": 4.3183, + "step": 1970 + }, + { + "epoch": 1.68, + "grad_norm": 0.8045121431350708, + "learning_rate": 0.000825, + "loss": 4.3216, + "step": 1980 + }, + { + "epoch": 1.68, + "eval_accuracy": 0.2919771311011085, + "eval_loss": 4.267116069793701, + "eval_runtime": 1079.6166, + "eval_samples_per_second": 462.548, + "eval_steps_per_second": 2.409, + "step": 1980 + }, + { + "epoch": 1.45, + "grad_norm": 0.8329472541809082, + "learning_rate": 0.0008291666666666667, + "loss": 4.3102, + "step": 1990 + }, + { + "epoch": 1.46, + "grad_norm": 1.114058256149292, + "learning_rate": 0.0008333333333333334, + "loss": 4.3138, + "step": 2000 + }, + { + "epoch": 1.46, + "grad_norm": 0.989930272102356, + "learning_rate": 0.0008375, + "loss": 4.3068, + "step": 2010 + }, + { + "epoch": 1.47, + "grad_norm": 0.9290440082550049, + "learning_rate": 0.0008416666666666667, + "loss": 4.3016, + "step": 2020 + }, + { + "epoch": 1.48, + "grad_norm": 1.6841094493865967, + "learning_rate": 0.0008458333333333333, + "loss": 4.3031, + "step": 2030 + }, + { + "epoch": 1.49, + "grad_norm": 1.0473177433013916, + "learning_rate": 0.00085, + "loss": 4.3079, + "step": 2040 + }, + { + "epoch": 1.49, + "grad_norm": 1.1396197080612183, + "learning_rate": 0.0008541666666666666, + "loss": 4.2986, + "step": 2050 + }, + { + "epoch": 1.5, + "grad_norm": 1.2318438291549683, + "learning_rate": 0.0008583333333333333, + "loss": 4.3005, + "step": 2060 + }, + { + "epoch": 1.51, + "grad_norm": 1.749923825263977, + "learning_rate": 0.0008625000000000001, + "loss": 4.2879, + "step": 2070 + }, + { + "epoch": 1.51, + "eval_accuracy": 0.2931822363307358, + "eval_loss": 4.231507778167725, + "eval_runtime": 1089.4507, + "eval_samples_per_second": 458.378, + "eval_steps_per_second": 2.047, + "step": 2070 + }, + { + "epoch": 1.52, + "grad_norm": 0.9980252385139465, + "learning_rate": 0.0008666666666666667, + "loss": 4.2821, + "step": 2080 + }, + { + "epoch": 1.52, + "grad_norm": 1.5635493993759155, + "learning_rate": 0.0008708333333333334, + "loss": 4.29, + "step": 2090 + }, + { + "epoch": 1.53, + "grad_norm": 1.6463395357131958, + "learning_rate": 0.000875, + "loss": 4.2856, + "step": 2100 + }, + { + "epoch": 1.54, + "grad_norm": 1.3602315187454224, + "learning_rate": 0.0008791666666666667, + "loss": 4.2802, + "step": 2110 + }, + { + "epoch": 1.54, + "grad_norm": 0.8902882933616638, + "learning_rate": 0.0008833333333333333, + "loss": 4.2729, + "step": 2120 + }, + { + "epoch": 1.55, + "grad_norm": 1.186219573020935, + "learning_rate": 0.0008874999999999999, + "loss": 4.2691, + "step": 2130 + }, + { + "epoch": 1.56, + "grad_norm": 1.0059683322906494, + "learning_rate": 0.0008916666666666667, + "loss": 4.26, + "step": 2140 + }, + { + "epoch": 1.57, + "grad_norm": 1.2011739015579224, + "learning_rate": 0.0008958333333333334, + "loss": 4.2577, + "step": 2150 + }, + { + "epoch": 1.57, + "grad_norm": 1.4442743062973022, + "learning_rate": 0.0009000000000000001, + "loss": 4.263, + "step": 2160 + }, + { + "epoch": 1.57, + "eval_accuracy": 0.2934964665435206, + "eval_loss": 4.21316385269165, + "eval_runtime": 1096.5373, + "eval_samples_per_second": 455.415, + "eval_steps_per_second": 2.034, + "step": 2160 + }, + { + "epoch": 1.58, + "grad_norm": 1.1617356538772583, + "learning_rate": 0.0009041666666666667, + "loss": 4.2661, + "step": 2170 + }, + { + "epoch": 1.59, + "grad_norm": 1.3990079164505005, + "learning_rate": 0.0009083333333333334, + "loss": 4.2475, + "step": 2180 + }, + { + "epoch": 1.6, + "grad_norm": 1.2959562540054321, + "learning_rate": 0.0009125, + "loss": 4.2415, + "step": 2190 + }, + { + "epoch": 1.6, + "grad_norm": 1.0796222686767578, + "learning_rate": 0.0009166666666666666, + "loss": 4.2337, + "step": 2200 + }, + { + "epoch": 1.61, + "grad_norm": 1.6943458318710327, + "learning_rate": 0.0009208333333333333, + "loss": 4.2281, + "step": 2210 + }, + { + "epoch": 1.62, + "grad_norm": 1.7960783243179321, + "learning_rate": 0.000925, + "loss": 4.2241, + "step": 2220 + }, + { + "epoch": 1.63, + "grad_norm": 2.086534023284912, + "learning_rate": 0.0009291666666666667, + "loss": 4.2269, + "step": 2230 + }, + { + "epoch": 1.63, + "grad_norm": 1.137702226638794, + "learning_rate": 0.0009333333333333333, + "loss": 4.2158, + "step": 2240 + }, + { + "epoch": 1.64, + "grad_norm": 1.1577701568603516, + "learning_rate": 0.0009375, + "loss": 4.2013, + "step": 2250 + }, + { + "epoch": 1.64, + "eval_accuracy": 0.2987269750298371, + "eval_loss": 4.123126029968262, + "eval_runtime": 1097.6256, + "eval_samples_per_second": 454.964, + "eval_steps_per_second": 2.032, + "step": 2250 + }, + { + "epoch": 1.65, + "grad_norm": 1.6455570459365845, + "learning_rate": 0.0009416666666666667, + "loss": 4.1815, + "step": 2260 + }, + { + "epoch": 1.65, + "grad_norm": 1.6025768518447876, + "learning_rate": 0.0009458333333333334, + "loss": 4.1534, + "step": 2270 + }, + { + "epoch": 1.66, + "grad_norm": 1.2223172187805176, + "learning_rate": 0.00095, + "loss": 4.1298, + "step": 2280 + }, + { + "epoch": 1.67, + "grad_norm": 1.766542673110962, + "learning_rate": 0.0009541666666666667, + "loss": 4.1187, + "step": 2290 + }, + { + "epoch": 1.68, + "grad_norm": 2.156003952026367, + "learning_rate": 0.0009583333333333334, + "loss": 4.0858, + "step": 2300 + }, + { + "epoch": 1.68, + "grad_norm": 1.9074057340621948, + "learning_rate": 0.0009625, + "loss": 4.0801, + "step": 2310 + }, + { + "epoch": 1.69, + "grad_norm": 1.6140304803848267, + "learning_rate": 0.0009666666666666667, + "loss": 4.0383, + "step": 2320 + }, + { + "epoch": 1.7, + "grad_norm": 1.5922300815582275, + "learning_rate": 0.0009708333333333333, + "loss": 4.0099, + "step": 2330 + }, + { + "epoch": 1.71, + "grad_norm": 1.9714833498001099, + "learning_rate": 0.000975, + "loss": 3.9757, + "step": 2340 + }, + { + "epoch": 1.71, + "eval_accuracy": 0.3303083702251303, + "eval_loss": 3.764934539794922, + "eval_runtime": 1104.6754, + "eval_samples_per_second": 452.06, + "eval_steps_per_second": 2.019, + "step": 2340 + }, + { + "epoch": 1.71, + "grad_norm": 2.1198415756225586, + "learning_rate": 0.0009791666666666666, + "loss": 3.9507, + "step": 2350 + }, + { + "epoch": 1.72, + "grad_norm": 2.0731935501098633, + "learning_rate": 0.0009833333333333332, + "loss": 3.9258, + "step": 2360 + }, + { + "epoch": 1.73, + "grad_norm": 2.1984808444976807, + "learning_rate": 0.0009875, + "loss": 3.9003, + "step": 2370 + }, + { + "epoch": 1.73, + "grad_norm": 2.033250331878662, + "learning_rate": 0.0009916666666666667, + "loss": 3.8732, + "step": 2380 + }, + { + "epoch": 1.74, + "grad_norm": 1.7183982133865356, + "learning_rate": 0.0009958333333333334, + "loss": 3.8557, + "step": 2390 + }, + { + "epoch": 1.75, + "grad_norm": 2.216938018798828, + "learning_rate": 0.001, + "loss": 3.8376, + "step": 2400 + }, + { + "epoch": 1.76, + "grad_norm": 2.109079599380493, + "learning_rate": 0.000999009900990099, + "loss": 3.8212, + "step": 2410 + }, + { + "epoch": 1.76, + "grad_norm": 1.5402984619140625, + "learning_rate": 0.0009980198019801981, + "loss": 3.8, + "step": 2420 + }, + { + "epoch": 1.77, + "grad_norm": 2.051513433456421, + "learning_rate": 0.000997029702970297, + "loss": 3.7913, + "step": 2430 + }, + { + "epoch": 1.77, + "eval_accuracy": 0.35789052045361985, + "eval_loss": 3.5296359062194824, + "eval_runtime": 1087.9351, + "eval_samples_per_second": 459.016, + "eval_steps_per_second": 2.05, + "step": 2430 + }, + { + "epoch": 1.78, + "grad_norm": 1.8306666612625122, + "learning_rate": 0.000996039603960396, + "loss": 3.7567, + "step": 2440 + }, + { + "epoch": 1.79, + "grad_norm": 1.9114989042282104, + "learning_rate": 0.000995049504950495, + "loss": 3.7491, + "step": 2450 + }, + { + "epoch": 1.79, + "grad_norm": 1.9881885051727295, + "learning_rate": 0.0009940594059405941, + "loss": 3.7297, + "step": 2460 + }, + { + "epoch": 1.8, + "grad_norm": 2.2852580547332764, + "learning_rate": 0.0009930693069306932, + "loss": 3.7073, + "step": 2470 + }, + { + "epoch": 1.81, + "grad_norm": 2.090174913406372, + "learning_rate": 0.000992079207920792, + "loss": 3.6902, + "step": 2480 + }, + { + "epoch": 1.81, + "grad_norm": 2.5586419105529785, + "learning_rate": 0.000991089108910891, + "loss": 3.6792, + "step": 2490 + }, + { + "epoch": 1.82, + "grad_norm": 1.9420301914215088, + "learning_rate": 0.0009900990099009901, + "loss": 3.6728, + "step": 2500 + }, + { + "epoch": 1.83, + "grad_norm": 2.319821834564209, + "learning_rate": 0.0009891089108910892, + "loss": 3.6627, + "step": 2510 + }, + { + "epoch": 1.84, + "grad_norm": 2.134413480758667, + "learning_rate": 0.0009881188118811882, + "loss": 3.6435, + "step": 2520 + }, + { + "epoch": 1.84, + "eval_accuracy": 0.3799301143797497, + "eval_loss": 3.3790884017944336, + "eval_runtime": 1089.5448, + "eval_samples_per_second": 458.338, + "eval_steps_per_second": 2.047, + "step": 2520 + }, + { + "epoch": 1.84, + "grad_norm": 1.8554224967956543, + "learning_rate": 0.000987128712871287, + "loss": 3.6265, + "step": 2530 + }, + { + "epoch": 1.85, + "grad_norm": 2.16987681388855, + "learning_rate": 0.000986138613861386, + "loss": 3.6098, + "step": 2540 + }, + { + "epoch": 1.86, + "grad_norm": 1.9863182306289673, + "learning_rate": 0.0009851485148514852, + "loss": 3.5982, + "step": 2550 + }, + { + "epoch": 1.87, + "grad_norm": 2.0247480869293213, + "learning_rate": 0.0009841584158415842, + "loss": 3.5911, + "step": 2560 + }, + { + "epoch": 1.87, + "grad_norm": 2.2719273567199707, + "learning_rate": 0.0009831683168316833, + "loss": 3.5804, + "step": 2570 + }, + { + "epoch": 1.88, + "grad_norm": 2.0588369369506836, + "learning_rate": 0.000982178217821782, + "loss": 3.5654, + "step": 2580 + }, + { + "epoch": 1.89, + "grad_norm": 1.9666892290115356, + "learning_rate": 0.0009811881188118811, + "loss": 3.558, + "step": 2590 + }, + { + "epoch": 1.89, + "grad_norm": 1.752681016921997, + "learning_rate": 0.0009801980198019802, + "loss": 3.5389, + "step": 2600 + }, + { + "epoch": 1.9, + "grad_norm": 2.821775197982788, + "learning_rate": 0.0009792079207920793, + "loss": 3.5327, + "step": 2610 + }, + { + "epoch": 1.9, + "eval_accuracy": 0.39317171253107736, + "eval_loss": 3.2741596698760986, + "eval_runtime": 1086.2288, + "eval_samples_per_second": 459.737, + "eval_steps_per_second": 2.053, + "step": 2610 + }, + { + "epoch": 1.91, + "grad_norm": 2.4307518005371094, + "learning_rate": 0.0009782178217821783, + "loss": 3.5311, + "step": 2620 + }, + { + "epoch": 1.92, + "grad_norm": 1.8416870832443237, + "learning_rate": 0.0009772277227722771, + "loss": 3.5199, + "step": 2630 + }, + { + "epoch": 1.92, + "grad_norm": 1.7294279336929321, + "learning_rate": 0.0009762376237623762, + "loss": 3.5067, + "step": 2640 + }, + { + "epoch": 1.93, + "grad_norm": 2.0376105308532715, + "learning_rate": 0.0009752475247524752, + "loss": 3.4957, + "step": 2650 + }, + { + "epoch": 1.94, + "grad_norm": 1.845569133758545, + "learning_rate": 0.0009742574257425743, + "loss": 3.4778, + "step": 2660 + }, + { + "epoch": 1.95, + "grad_norm": 2.1370015144348145, + "learning_rate": 0.0009732673267326732, + "loss": 3.4766, + "step": 2670 + }, + { + "epoch": 1.95, + "grad_norm": 2.0046229362487793, + "learning_rate": 0.0009722772277227723, + "loss": 3.4609, + "step": 2680 + }, + { + "epoch": 1.96, + "grad_norm": 1.7367238998413086, + "learning_rate": 0.0009712871287128712, + "loss": 3.4574, + "step": 2690 + }, + { + "epoch": 1.97, + "grad_norm": 2.245299816131592, + "learning_rate": 0.0009702970297029703, + "loss": 3.4402, + "step": 2700 + }, + { + "epoch": 1.97, + "eval_accuracy": 0.40886959953318786, + "eval_loss": 3.1605701446533203, + "eval_runtime": 1086.3963, + "eval_samples_per_second": 459.666, + "eval_steps_per_second": 2.053, + "step": 2700 + }, + { + "epoch": 1.97, + "grad_norm": 1.6792678833007812, + "learning_rate": 0.0009693069306930693, + "loss": 3.4155, + "step": 2710 + }, + { + "epoch": 1.98, + "grad_norm": 2.1290223598480225, + "learning_rate": 0.0009683168316831683, + "loss": 3.3953, + "step": 2720 + }, + { + "epoch": 1.99, + "grad_norm": 1.9963873624801636, + "learning_rate": 0.0009673267326732673, + "loss": 3.3722, + "step": 2730 + }, + { + "epoch": 2.0, + "grad_norm": 2.212454080581665, + "learning_rate": 0.0009663366336633663, + "loss": 3.3532, + "step": 2740 + }, + { + "epoch": 2.0, + "grad_norm": 2.145552396774292, + "learning_rate": 0.0009653465346534653, + "loss": 3.336, + "step": 2750 + }, + { + "epoch": 2.01, + "grad_norm": 2.423874616622925, + "learning_rate": 0.0009643564356435644, + "loss": 3.3111, + "step": 2760 + }, + { + "epoch": 2.02, + "grad_norm": 2.0116701126098633, + "learning_rate": 0.0009633663366336633, + "loss": 3.305, + "step": 2770 + }, + { + "epoch": 2.03, + "grad_norm": 2.243619203567505, + "learning_rate": 0.0009623762376237624, + "loss": 3.2854, + "step": 2780 + }, + { + "epoch": 2.03, + "grad_norm": 1.5583114624023438, + "learning_rate": 0.0009613861386138613, + "loss": 3.2635, + "step": 2790 + }, + { + "epoch": 2.03, + "eval_accuracy": 0.43169227745021366, + "eval_loss": 2.9848363399505615, + "eval_runtime": 1087.8763, + "eval_samples_per_second": 459.041, + "eval_steps_per_second": 2.05, + "step": 2790 + }, + { + "epoch": 2.04, + "grad_norm": 2.156170606613159, + "learning_rate": 0.0009603960396039604, + "loss": 3.2498, + "step": 2800 + }, + { + "epoch": 2.05, + "grad_norm": 1.917297601699829, + "learning_rate": 0.0009594059405940594, + "loss": 3.2343, + "step": 2810 + }, + { + "epoch": 2.05, + "grad_norm": 1.7647627592086792, + "learning_rate": 0.0009584158415841584, + "loss": 3.2206, + "step": 2820 + }, + { + "epoch": 2.06, + "grad_norm": 1.7406831979751587, + "learning_rate": 0.0009574257425742574, + "loss": 3.2023, + "step": 2830 + }, + { + "epoch": 2.07, + "grad_norm": 1.721940040588379, + "learning_rate": 0.0009564356435643564, + "loss": 3.1896, + "step": 2840 + }, + { + "epoch": 2.08, + "grad_norm": 1.5204572677612305, + "learning_rate": 0.0009554455445544554, + "loss": 3.1769, + "step": 2850 + }, + { + "epoch": 2.08, + "grad_norm": 2.20760440826416, + "learning_rate": 0.0009544554455445545, + "loss": 3.1706, + "step": 2860 + }, + { + "epoch": 2.09, + "grad_norm": 1.6796480417251587, + "learning_rate": 0.0009534653465346534, + "loss": 3.1552, + "step": 2870 + }, + { + "epoch": 2.1, + "grad_norm": 2.044858455657959, + "learning_rate": 0.0009524752475247525, + "loss": 3.1385, + "step": 2880 + }, + { + "epoch": 2.1, + "eval_accuracy": 0.4465053493029932, + "eval_loss": 2.872570037841797, + "eval_runtime": 1089.0874, + "eval_samples_per_second": 458.531, + "eval_steps_per_second": 2.048, + "step": 2880 + }, + { + "epoch": 2.11, + "grad_norm": 1.3739113807678223, + "learning_rate": 0.0009514851485148514, + "loss": 3.1217, + "step": 2890 + }, + { + "epoch": 2.11, + "grad_norm": 1.631298303604126, + "learning_rate": 0.0009504950495049505, + "loss": 3.1102, + "step": 2900 + }, + { + "epoch": 2.12, + "grad_norm": 1.8213354349136353, + "learning_rate": 0.0009495049504950495, + "loss": 3.092, + "step": 2910 + }, + { + "epoch": 2.13, + "grad_norm": 1.910646915435791, + "learning_rate": 0.0009485148514851485, + "loss": 3.0768, + "step": 2920 + }, + { + "epoch": 2.14, + "grad_norm": 1.6591072082519531, + "learning_rate": 0.0009475247524752475, + "loss": 3.0721, + "step": 2930 + }, + { + "epoch": 2.14, + "grad_norm": 1.921587347984314, + "learning_rate": 0.0009465346534653465, + "loss": 3.0493, + "step": 2940 + }, + { + "epoch": 2.15, + "grad_norm": 1.576114296913147, + "learning_rate": 0.0009455445544554455, + "loss": 3.0392, + "step": 2950 + }, + { + "epoch": 2.16, + "grad_norm": 1.41093909740448, + "learning_rate": 0.0009445544554455446, + "loss": 3.0204, + "step": 2960 + }, + { + "epoch": 2.16, + "grad_norm": 1.41178297996521, + "learning_rate": 0.0009435643564356435, + "loss": 3.0046, + "step": 2970 + }, + { + "epoch": 2.16, + "eval_accuracy": 0.46085574907280247, + "eval_loss": 2.7754335403442383, + "eval_runtime": 1088.9809, + "eval_samples_per_second": 458.576, + "eval_steps_per_second": 2.048, + "step": 2970 + }, + { + "epoch": 2.17, + "grad_norm": 1.5229026079177856, + "learning_rate": 0.0009425742574257426, + "loss": 2.989, + "step": 2980 + }, + { + "epoch": 2.18, + "grad_norm": 1.3193325996398926, + "learning_rate": 0.0009415841584158415, + "loss": 2.9764, + "step": 2990 + }, + { + "epoch": 2.19, + "grad_norm": 1.1938610076904297, + "learning_rate": 0.0009405940594059406, + "loss": 2.9636, + "step": 3000 + }, + { + "epoch": 2.19, + "grad_norm": 1.1402697563171387, + "learning_rate": 0.0009396039603960396, + "loss": 2.9517, + "step": 3010 + }, + { + "epoch": 2.2, + "grad_norm": 1.4980099201202393, + "learning_rate": 0.0009386138613861386, + "loss": 2.9445, + "step": 3020 + }, + { + "epoch": 2.21, + "grad_norm": 1.4591041803359985, + "learning_rate": 0.0009376237623762376, + "loss": 2.9317, + "step": 3030 + }, + { + "epoch": 2.22, + "grad_norm": 1.4302833080291748, + "learning_rate": 0.0009366336633663367, + "loss": 2.9167, + "step": 3040 + }, + { + "epoch": 2.22, + "grad_norm": 1.2571301460266113, + "learning_rate": 0.0009356435643564357, + "loss": 2.9049, + "step": 3050 + }, + { + "epoch": 2.23, + "grad_norm": 1.2039096355438232, + "learning_rate": 0.0009346534653465348, + "loss": 2.8885, + "step": 3060 + }, + { + "epoch": 2.23, + "eval_accuracy": 0.4740639726753192, + "eval_loss": 2.6853535175323486, + "eval_runtime": 1088.5789, + "eval_samples_per_second": 458.745, + "eval_steps_per_second": 2.049, + "step": 3060 + }, + { + "epoch": 2.24, + "grad_norm": 1.1458439826965332, + "learning_rate": 0.0009336633663366337, + "loss": 2.8844, + "step": 3070 + }, + { + "epoch": 2.24, + "grad_norm": 1.1883801221847534, + "learning_rate": 0.0009326732673267328, + "loss": 2.8777, + "step": 3080 + }, + { + "epoch": 2.25, + "grad_norm": 1.0597162246704102, + "learning_rate": 0.0009316831683168317, + "loss": 2.8647, + "step": 3090 + }, + { + "epoch": 2.26, + "grad_norm": 1.2362898588180542, + "learning_rate": 0.0009306930693069308, + "loss": 2.8565, + "step": 3100 + }, + { + "epoch": 2.27, + "grad_norm": 1.588973879814148, + "learning_rate": 0.0009297029702970298, + "loss": 2.8443, + "step": 3110 + }, + { + "epoch": 2.27, + "grad_norm": 1.7287636995315552, + "learning_rate": 0.0009287128712871288, + "loss": 2.8401, + "step": 3120 + }, + { + "epoch": 2.28, + "grad_norm": 1.2376179695129395, + "learning_rate": 0.0009277227722772278, + "loss": 2.8338, + "step": 3130 + }, + { + "epoch": 2.29, + "grad_norm": 1.1424429416656494, + "learning_rate": 0.0009267326732673268, + "loss": 2.8158, + "step": 3140 + }, + { + "epoch": 2.3, + "grad_norm": 1.0561778545379639, + "learning_rate": 0.0009257425742574258, + "loss": 2.8086, + "step": 3150 + }, + { + "epoch": 2.3, + "eval_accuracy": 0.4839770905517238, + "eval_loss": 2.6107919216156006, + "eval_runtime": 1086.2364, + "eval_samples_per_second": 459.734, + "eval_steps_per_second": 2.053, + "step": 3150 + }, + { + "epoch": 2.3, + "grad_norm": 1.1775144338607788, + "learning_rate": 0.0009247524752475249, + "loss": 2.8002, + "step": 3160 + }, + { + "epoch": 2.31, + "grad_norm": 1.1752933263778687, + "learning_rate": 0.0009237623762376238, + "loss": 2.7913, + "step": 3170 + }, + { + "epoch": 2.32, + "grad_norm": 1.35330331325531, + "learning_rate": 0.0009227722772277229, + "loss": 2.7821, + "step": 3180 + }, + { + "epoch": 2.32, + "grad_norm": 1.163878321647644, + "learning_rate": 0.0009217821782178218, + "loss": 2.7755, + "step": 3190 + }, + { + "epoch": 2.33, + "grad_norm": 1.0859084129333496, + "learning_rate": 0.0009207920792079209, + "loss": 2.7651, + "step": 3200 + }, + { + "epoch": 2.34, + "grad_norm": 1.1351704597473145, + "learning_rate": 0.0009198019801980199, + "loss": 2.7583, + "step": 3210 + }, + { + "epoch": 2.35, + "grad_norm": 1.0613981485366821, + "learning_rate": 0.0009188118811881188, + "loss": 2.7482, + "step": 3220 + }, + { + "epoch": 2.35, + "grad_norm": 1.1925811767578125, + "learning_rate": 0.0009178217821782179, + "loss": 2.7411, + "step": 3230 + }, + { + "epoch": 2.36, + "grad_norm": 1.00603187084198, + "learning_rate": 0.0009168316831683168, + "loss": 2.7357, + "step": 3240 + }, + { + "epoch": 2.36, + "eval_accuracy": 0.4945267646340819, + "eval_loss": 2.5409770011901855, + "eval_runtime": 1086.0357, + "eval_samples_per_second": 459.819, + "eval_steps_per_second": 2.053, + "step": 3240 + }, + { + "epoch": 2.37, + "grad_norm": 1.0638866424560547, + "learning_rate": 0.0009158415841584159, + "loss": 2.7271, + "step": 3250 + }, + { + "epoch": 2.38, + "grad_norm": 1.0579949617385864, + "learning_rate": 0.000914851485148515, + "loss": 2.7192, + "step": 3260 + }, + { + "epoch": 2.38, + "grad_norm": 0.8697578310966492, + "learning_rate": 0.0009138613861386139, + "loss": 2.7114, + "step": 3270 + }, + { + "epoch": 2.39, + "grad_norm": 1.1074854135513306, + "learning_rate": 0.0009128712871287129, + "loss": 2.7044, + "step": 3280 + }, + { + "epoch": 2.4, + "grad_norm": 1.2240349054336548, + "learning_rate": 0.0009118811881188119, + "loss": 2.6981, + "step": 3290 + }, + { + "epoch": 2.4, + "grad_norm": 1.0825715065002441, + "learning_rate": 0.0009108910891089109, + "loss": 2.688, + "step": 3300 + }, + { + "epoch": 2.41, + "grad_norm": 0.9786016941070557, + "learning_rate": 0.00090990099009901, + "loss": 2.681, + "step": 3310 + }, + { + "epoch": 2.42, + "grad_norm": 0.7355318665504456, + "learning_rate": 0.0009089108910891089, + "loss": 2.6771, + "step": 3320 + }, + { + "epoch": 2.43, + "grad_norm": 1.0296987295150757, + "learning_rate": 0.000907920792079208, + "loss": 2.6714, + "step": 3330 + }, + { + "epoch": 2.43, + "eval_accuracy": 0.5031676098849697, + "eval_loss": 2.4873406887054443, + "eval_runtime": 1086.6349, + "eval_samples_per_second": 459.566, + "eval_steps_per_second": 2.052, + "step": 3330 + }, + { + "epoch": 2.43, + "grad_norm": 1.0019482374191284, + "learning_rate": 0.0009069306930693069, + "loss": 2.659, + "step": 3340 + }, + { + "epoch": 2.44, + "grad_norm": 0.95697021484375, + "learning_rate": 0.000905940594059406, + "loss": 2.6534, + "step": 3350 + }, + { + "epoch": 2.45, + "grad_norm": 0.9206619262695312, + "learning_rate": 0.000904950495049505, + "loss": 2.6499, + "step": 3360 + }, + { + "epoch": 2.46, + "grad_norm": 0.9173060059547424, + "learning_rate": 0.000903960396039604, + "loss": 2.6436, + "step": 3370 + }, + { + "epoch": 2.46, + "grad_norm": 0.946976900100708, + "learning_rate": 0.000902970297029703, + "loss": 2.6386, + "step": 3380 + }, + { + "epoch": 2.47, + "grad_norm": 1.1060245037078857, + "learning_rate": 0.000901980198019802, + "loss": 2.6295, + "step": 3390 + }, + { + "epoch": 2.48, + "grad_norm": 0.9128373861312866, + "learning_rate": 0.000900990099009901, + "loss": 2.6207, + "step": 3400 + }, + { + "epoch": 2.48, + "grad_norm": 0.7775394916534424, + "learning_rate": 0.0009000000000000001, + "loss": 2.6152, + "step": 3410 + }, + { + "epoch": 2.49, + "grad_norm": 1.0009465217590332, + "learning_rate": 0.000899009900990099, + "loss": 2.6114, + "step": 3420 + }, + { + "epoch": 2.49, + "eval_accuracy": 0.5107387401188807, + "eval_loss": 2.430750608444214, + "eval_runtime": 1085.7185, + "eval_samples_per_second": 459.953, + "eval_steps_per_second": 2.054, + "step": 3420 + }, + { + "epoch": 2.5, + "grad_norm": 0.9122986197471619, + "learning_rate": 0.0008980198019801981, + "loss": 2.6046, + "step": 3430 + }, + { + "epoch": 2.51, + "grad_norm": 0.9263846278190613, + "learning_rate": 0.000897029702970297, + "loss": 2.6006, + "step": 3440 + }, + { + "epoch": 2.51, + "grad_norm": 0.9440599083900452, + "learning_rate": 0.0008960396039603961, + "loss": 2.5954, + "step": 3450 + }, + { + "epoch": 2.52, + "grad_norm": 1.0791646242141724, + "learning_rate": 0.0008950495049504951, + "loss": 2.5875, + "step": 3460 + }, + { + "epoch": 2.53, + "grad_norm": 0.8801349401473999, + "learning_rate": 0.0008940594059405941, + "loss": 2.5805, + "step": 3470 + }, + { + "epoch": 2.54, + "grad_norm": 0.8976075053215027, + "learning_rate": 0.0008930693069306931, + "loss": 2.5856, + "step": 3480 + }, + { + "epoch": 2.54, + "grad_norm": 0.9874941110610962, + "learning_rate": 0.0008920792079207921, + "loss": 2.5741, + "step": 3490 + }, + { + "epoch": 2.55, + "grad_norm": 0.8185487985610962, + "learning_rate": 0.0008910891089108911, + "loss": 2.5625, + "step": 3500 + }, + { + "epoch": 2.56, + "grad_norm": 1.0372703075408936, + "learning_rate": 0.0008900990099009902, + "loss": 2.5622, + "step": 3510 + }, + { + "epoch": 2.56, + "eval_accuracy": 0.5173882190435195, + "eval_loss": 2.3903918266296387, + "eval_runtime": 1085.7537, + "eval_samples_per_second": 459.939, + "eval_steps_per_second": 2.054, + "step": 3510 + }, + { + "epoch": 2.56, + "grad_norm": 0.7737933397293091, + "learning_rate": 0.0008891089108910891, + "loss": 2.5551, + "step": 3520 + }, + { + "epoch": 2.57, + "grad_norm": 0.8418464660644531, + "learning_rate": 0.0008881188118811882, + "loss": 2.5505, + "step": 3530 + }, + { + "epoch": 2.58, + "grad_norm": 0.9638449549674988, + "learning_rate": 0.0008871287128712871, + "loss": 2.5506, + "step": 3540 + }, + { + "epoch": 2.59, + "grad_norm": 0.8606787919998169, + "learning_rate": 0.0008861386138613862, + "loss": 2.5446, + "step": 3550 + }, + { + "epoch": 2.59, + "grad_norm": 0.9567099213600159, + "learning_rate": 0.0008851485148514852, + "loss": 2.5316, + "step": 3560 + }, + { + "epoch": 2.6, + "grad_norm": 0.9098414182662964, + "learning_rate": 0.0008841584158415842, + "loss": 2.5299, + "step": 3570 + }, + { + "epoch": 2.61, + "grad_norm": 0.9305897951126099, + "learning_rate": 0.0008831683168316832, + "loss": 2.53, + "step": 3580 + }, + { + "epoch": 2.62, + "grad_norm": 0.6542484164237976, + "learning_rate": 0.0008821782178217822, + "loss": 2.5182, + "step": 3590 + }, + { + "epoch": 2.62, + "grad_norm": 0.8789640069007874, + "learning_rate": 0.0008811881188118812, + "loss": 2.5145, + "step": 3600 + }, + { + "epoch": 2.62, + "eval_accuracy": 0.5233957340804408, + "eval_loss": 2.3455302715301514, + "eval_runtime": 1084.7897, + "eval_samples_per_second": 460.347, + "eval_steps_per_second": 2.056, + "step": 3600 + }, + { + "epoch": 2.63, + "grad_norm": 0.7239986062049866, + "learning_rate": 0.0008801980198019803, + "loss": 2.5092, + "step": 3610 + }, + { + "epoch": 2.64, + "grad_norm": 0.7931397557258606, + "learning_rate": 0.0008792079207920792, + "loss": 2.4992, + "step": 3620 + }, + { + "epoch": 2.65, + "grad_norm": 0.9696986675262451, + "learning_rate": 0.0008782178217821783, + "loss": 2.5035, + "step": 3630 + }, + { + "epoch": 2.65, + "grad_norm": 0.7151007056236267, + "learning_rate": 0.0008772277227722772, + "loss": 2.4927, + "step": 3640 + }, + { + "epoch": 2.66, + "grad_norm": 1.054768443107605, + "learning_rate": 0.0008762376237623763, + "loss": 2.4831, + "step": 3650 + }, + { + "epoch": 2.67, + "grad_norm": 0.7492119669914246, + "learning_rate": 0.0008752475247524753, + "loss": 2.4872, + "step": 3660 + }, + { + "epoch": 2.67, + "grad_norm": 0.6677684187889099, + "learning_rate": 0.0008742574257425743, + "loss": 2.4797, + "step": 3670 + }, + { + "epoch": 2.68, + "grad_norm": 0.8682121634483337, + "learning_rate": 0.0008732673267326733, + "loss": 2.4747, + "step": 3680 + }, + { + "epoch": 2.69, + "grad_norm": 0.9361952543258667, + "learning_rate": 0.0008722772277227722, + "loss": 2.4741, + "step": 3690 + }, + { + "epoch": 2.69, + "eval_accuracy": 0.5298733491746213, + "eval_loss": 2.301394462585449, + "eval_runtime": 1087.2464, + "eval_samples_per_second": 459.307, + "eval_steps_per_second": 2.051, + "step": 3690 + }, + { + "epoch": 2.7, + "grad_norm": 0.7424592971801758, + "learning_rate": 0.0008712871287128713, + "loss": 2.4641, + "step": 3700 + }, + { + "epoch": 2.7, + "grad_norm": 0.7794514894485474, + "learning_rate": 0.0008702970297029704, + "loss": 2.4558, + "step": 3710 + }, + { + "epoch": 2.71, + "grad_norm": 0.8285810351371765, + "learning_rate": 0.0008693069306930693, + "loss": 2.4541, + "step": 3720 + }, + { + "epoch": 2.72, + "grad_norm": 0.8555008769035339, + "learning_rate": 0.0008683168316831684, + "loss": 2.456, + "step": 3730 + }, + { + "epoch": 2.73, + "grad_norm": 0.7897722721099854, + "learning_rate": 0.0008673267326732673, + "loss": 2.4447, + "step": 3740 + }, + { + "epoch": 2.73, + "grad_norm": 0.7043498754501343, + "learning_rate": 0.0008663366336633663, + "loss": 2.44, + "step": 3750 + }, + { + "epoch": 2.74, + "grad_norm": 0.7770318984985352, + "learning_rate": 0.0008653465346534654, + "loss": 2.439, + "step": 3760 + }, + { + "epoch": 2.75, + "grad_norm": 0.8721603155136108, + "learning_rate": 0.0008643564356435643, + "loss": 2.4363, + "step": 3770 + }, + { + "epoch": 2.75, + "grad_norm": 0.653965413570404, + "learning_rate": 0.0008633663366336634, + "loss": 2.4298, + "step": 3780 + }, + { + "epoch": 2.75, + "eval_accuracy": 0.5352807916398568, + "eval_loss": 2.2673096656799316, + "eval_runtime": 1086.4494, + "eval_samples_per_second": 459.644, + "eval_steps_per_second": 2.053, + "step": 3780 + }, + { + "epoch": 2.76, + "grad_norm": 0.7826604247093201, + "learning_rate": 0.0008623762376237623, + "loss": 2.4204, + "step": 3790 + }, + { + "epoch": 2.77, + "grad_norm": 0.6523510813713074, + "learning_rate": 0.0008613861386138614, + "loss": 2.4194, + "step": 3800 + }, + { + "epoch": 2.78, + "grad_norm": 0.9408191442489624, + "learning_rate": 0.0008603960396039604, + "loss": 2.4135, + "step": 3810 + }, + { + "epoch": 2.78, + "grad_norm": 0.8393464684486389, + "learning_rate": 0.0008594059405940594, + "loss": 2.4179, + "step": 3820 + }, + { + "epoch": 2.79, + "grad_norm": 0.7277712821960449, + "learning_rate": 0.0008584158415841584, + "loss": 2.4087, + "step": 3830 + }, + { + "epoch": 2.8, + "grad_norm": 0.6864319443702698, + "learning_rate": 0.0008574257425742574, + "loss": 2.405, + "step": 3840 + }, + { + "epoch": 2.81, + "grad_norm": 0.6246985197067261, + "learning_rate": 0.0008564356435643564, + "loss": 2.3962, + "step": 3850 + }, + { + "epoch": 2.81, + "grad_norm": 0.7276294231414795, + "learning_rate": 0.0008554455445544555, + "loss": 2.4043, + "step": 3860 + }, + { + "epoch": 2.82, + "grad_norm": 0.7767272591590881, + "learning_rate": 0.0008544554455445544, + "loss": 2.3947, + "step": 3870 + }, + { + "epoch": 2.82, + "eval_accuracy": 0.5401726205914658, + "eval_loss": 2.2340025901794434, + "eval_runtime": 1089.0379, + "eval_samples_per_second": 458.552, + "eval_steps_per_second": 2.048, + "step": 3870 + }, + { + "epoch": 2.83, + "grad_norm": 0.717089831829071, + "learning_rate": 0.0008534653465346535, + "loss": 2.3934, + "step": 3880 + }, + { + "epoch": 2.83, + "grad_norm": 0.7381496429443359, + "learning_rate": 0.0008524752475247524, + "loss": 2.384, + "step": 3890 + }, + { + "epoch": 2.84, + "grad_norm": 0.8456007838249207, + "learning_rate": 0.0008514851485148515, + "loss": 2.3842, + "step": 3900 + }, + { + "epoch": 2.85, + "grad_norm": 0.6353156566619873, + "learning_rate": 0.0008504950495049505, + "loss": 2.3774, + "step": 3910 + }, + { + "epoch": 2.86, + "grad_norm": 0.6743925213813782, + "learning_rate": 0.0008495049504950495, + "loss": 2.3775, + "step": 3920 + }, + { + "epoch": 2.86, + "grad_norm": 0.6839917898178101, + "learning_rate": 0.0008485148514851485, + "loss": 2.3737, + "step": 3930 + }, + { + "epoch": 2.87, + "grad_norm": 0.6635532379150391, + "learning_rate": 0.0008475247524752475, + "loss": 2.3689, + "step": 3940 + }, + { + "epoch": 2.88, + "grad_norm": 0.7584016919136047, + "learning_rate": 0.0008465346534653465, + "loss": 2.3623, + "step": 3950 + }, + { + "epoch": 2.89, + "grad_norm": 0.9728506803512573, + "learning_rate": 0.0008455445544554456, + "loss": 2.3653, + "step": 3960 + }, + { + "epoch": 2.89, + "eval_accuracy": 0.5444187372461048, + "eval_loss": 2.210555076599121, + "eval_runtime": 1087.1075, + "eval_samples_per_second": 459.366, + "eval_steps_per_second": 2.051, + "step": 3960 + }, + { + "epoch": 2.89, + "grad_norm": 0.7243532538414001, + "learning_rate": 0.0008445544554455445, + "loss": 2.3605, + "step": 3970 + }, + { + "epoch": 2.9, + "grad_norm": 0.5774228572845459, + "learning_rate": 0.0008435643564356436, + "loss": 2.3547, + "step": 3980 + }, + { + "epoch": 2.91, + "grad_norm": 0.9098168611526489, + "learning_rate": 0.0008425742574257425, + "loss": 2.3507, + "step": 3990 + }, + { + "epoch": 2.91, + "grad_norm": 0.8770572543144226, + "learning_rate": 0.0008415841584158416, + "loss": 2.3535, + "step": 4000 + }, + { + "epoch": 2.92, + "grad_norm": 0.5555605292320251, + "learning_rate": 0.0008405940594059406, + "loss": 2.3463, + "step": 4010 + }, + { + "epoch": 2.93, + "grad_norm": 0.8333105444908142, + "learning_rate": 0.0008396039603960396, + "loss": 2.3417, + "step": 4020 + }, + { + "epoch": 2.94, + "grad_norm": 0.724617600440979, + "learning_rate": 0.0008386138613861386, + "loss": 2.3382, + "step": 4030 + }, + { + "epoch": 2.94, + "grad_norm": 0.6441348791122437, + "learning_rate": 0.0008376237623762376, + "loss": 2.3322, + "step": 4040 + }, + { + "epoch": 2.95, + "grad_norm": 0.7889347672462463, + "learning_rate": 0.0008366336633663366, + "loss": 2.332, + "step": 4050 + }, + { + "epoch": 2.95, + "eval_accuracy": 0.5490578933317315, + "eval_loss": 2.1787993907928467, + "eval_runtime": 1089.6908, + "eval_samples_per_second": 458.277, + "eval_steps_per_second": 2.046, + "step": 4050 + }, + { + "epoch": 2.96, + "grad_norm": 0.7708555459976196, + "learning_rate": 0.0008356435643564357, + "loss": 2.3326, + "step": 4060 + }, + { + "epoch": 2.97, + "grad_norm": 0.5825323462486267, + "learning_rate": 0.0008346534653465346, + "loss": 2.3269, + "step": 4070 + }, + { + "epoch": 2.97, + "grad_norm": 0.6252484321594238, + "learning_rate": 0.0008336633663366337, + "loss": 2.3226, + "step": 4080 + }, + { + "epoch": 2.98, + "grad_norm": 0.8860800266265869, + "learning_rate": 0.0008326732673267326, + "loss": 2.3213, + "step": 4090 + }, + { + "epoch": 2.99, + "grad_norm": 0.6472296714782715, + "learning_rate": 0.0008316831683168317, + "loss": 2.3159, + "step": 4100 + }, + { + "epoch": 2.99, + "grad_norm": 0.5913267731666565, + "learning_rate": 0.0008306930693069307, + "loss": 2.31, + "step": 4110 + }, + { + "epoch": 3.0, + "grad_norm": 0.9591660499572754, + "learning_rate": 0.0008297029702970297, + "loss": 2.3105, + "step": 4120 + }, + { + "epoch": 3.01, + "grad_norm": 0.6770280599594116, + "learning_rate": 0.0008287128712871287, + "loss": 2.3043, + "step": 4130 + }, + { + "epoch": 3.02, + "grad_norm": 0.6088559031486511, + "learning_rate": 0.0008277227722772277, + "loss": 2.3006, + "step": 4140 + }, + { + "epoch": 3.02, + "eval_accuracy": 0.5543035745580942, + "eval_loss": 2.146070718765259, + "eval_runtime": 1088.9664, + "eval_samples_per_second": 458.582, + "eval_steps_per_second": 2.048, + "step": 4140 + }, + { + "epoch": 3.02, + "grad_norm": 0.8546580076217651, + "learning_rate": 0.0008267326732673267, + "loss": 2.3008, + "step": 4150 + }, + { + "epoch": 3.03, + "grad_norm": 0.5891646146774292, + "learning_rate": 0.0008257425742574258, + "loss": 2.2996, + "step": 4160 + }, + { + "epoch": 3.04, + "grad_norm": 0.7681787610054016, + "learning_rate": 0.0008247524752475247, + "loss": 2.291, + "step": 4170 + }, + { + "epoch": 3.05, + "grad_norm": 0.6504939198493958, + "learning_rate": 0.0008237623762376238, + "loss": 2.2908, + "step": 4180 + }, + { + "epoch": 3.05, + "grad_norm": 0.7509806752204895, + "learning_rate": 0.0008227722772277227, + "loss": 2.2894, + "step": 4190 + }, + { + "epoch": 3.06, + "grad_norm": 0.9019284844398499, + "learning_rate": 0.0008217821782178218, + "loss": 2.2851, + "step": 4200 + }, + { + "epoch": 3.07, + "grad_norm": 0.5588305592536926, + "learning_rate": 0.0008207920792079208, + "loss": 2.284, + "step": 4210 + }, + { + "epoch": 3.07, + "grad_norm": 0.659537672996521, + "learning_rate": 0.0008198019801980197, + "loss": 2.2758, + "step": 4220 + }, + { + "epoch": 3.08, + "grad_norm": 0.6673635840415955, + "learning_rate": 0.0008188118811881188, + "loss": 2.2731, + "step": 4230 + }, + { + "epoch": 3.08, + "eval_accuracy": 0.5577760802425583, + "eval_loss": 2.1262881755828857, + "eval_runtime": 1086.5663, + "eval_samples_per_second": 459.595, + "eval_steps_per_second": 2.052, + "step": 4230 + }, + { + "epoch": 3.09, + "grad_norm": 0.7020514607429504, + "learning_rate": 0.0008178217821782177, + "loss": 2.2744, + "step": 4240 + }, + { + "epoch": 3.1, + "grad_norm": 0.6836090087890625, + "learning_rate": 0.0008168316831683168, + "loss": 2.2726, + "step": 4250 + }, + { + "epoch": 3.1, + "grad_norm": 0.8297485709190369, + "learning_rate": 0.0008158415841584159, + "loss": 2.2696, + "step": 4260 + }, + { + "epoch": 3.11, + "grad_norm": 0.6747464537620544, + "learning_rate": 0.0008148514851485148, + "loss": 2.2667, + "step": 4270 + }, + { + "epoch": 3.12, + "grad_norm": 0.6610013246536255, + "learning_rate": 0.0008138613861386138, + "loss": 2.261, + "step": 4280 + }, + { + "epoch": 3.13, + "grad_norm": 0.9729331731796265, + "learning_rate": 0.0008128712871287128, + "loss": 2.2634, + "step": 4290 + }, + { + "epoch": 3.13, + "grad_norm": 0.580893874168396, + "learning_rate": 0.000811881188118812, + "loss": 2.2643, + "step": 4300 + }, + { + "epoch": 3.14, + "grad_norm": 0.5239897966384888, + "learning_rate": 0.000810891089108911, + "loss": 2.2533, + "step": 4310 + }, + { + "epoch": 3.15, + "grad_norm": 0.9247382283210754, + "learning_rate": 0.00080990099009901, + "loss": 2.2544, + "step": 4320 + }, + { + "epoch": 3.15, + "eval_accuracy": 0.5607038793304535, + "eval_loss": 2.1080663204193115, + "eval_runtime": 1087.2566, + "eval_samples_per_second": 459.303, + "eval_steps_per_second": 2.051, + "step": 4320 + }, + { + "epoch": 3.16, + "grad_norm": 0.7254253029823303, + "learning_rate": 0.000808910891089109, + "loss": 2.2521, + "step": 4330 + }, + { + "epoch": 3.16, + "grad_norm": 0.47916772961616516, + "learning_rate": 0.0008079207920792079, + "loss": 2.2471, + "step": 4340 + }, + { + "epoch": 3.17, + "grad_norm": 0.8148419260978699, + "learning_rate": 0.000806930693069307, + "loss": 2.2479, + "step": 4350 + }, + { + "epoch": 3.18, + "grad_norm": 0.6527644991874695, + "learning_rate": 0.000805940594059406, + "loss": 2.2446, + "step": 4360 + }, + { + "epoch": 3.18, + "grad_norm": 0.7129182815551758, + "learning_rate": 0.000804950495049505, + "loss": 2.2382, + "step": 4370 + }, + { + "epoch": 3.19, + "grad_norm": 0.6954285502433777, + "learning_rate": 0.000803960396039604, + "loss": 2.2399, + "step": 4380 + }, + { + "epoch": 3.2, + "grad_norm": 0.6172522902488708, + "learning_rate": 0.000802970297029703, + "loss": 2.2395, + "step": 4390 + }, + { + "epoch": 3.21, + "grad_norm": 0.8309088349342346, + "learning_rate": 0.000801980198019802, + "loss": 2.2379, + "step": 4400 + }, + { + "epoch": 3.21, + "grad_norm": 0.6792633533477783, + "learning_rate": 0.0008009900990099011, + "loss": 2.2364, + "step": 4410 + }, + { + "epoch": 3.21, + "eval_accuracy": 0.5646760378560493, + "eval_loss": 2.0840134620666504, + "eval_runtime": 1086.4023, + "eval_samples_per_second": 459.664, + "eval_steps_per_second": 2.053, + "step": 4410 + }, + { + "epoch": 3.22, + "grad_norm": 0.6008450388908386, + "learning_rate": 0.0008, + "loss": 2.2289, + "step": 4420 + }, + { + "epoch": 3.23, + "grad_norm": 0.5826246738433838, + "learning_rate": 0.0007990099009900991, + "loss": 2.2259, + "step": 4430 + }, + { + "epoch": 3.24, + "grad_norm": 0.6360362768173218, + "learning_rate": 0.000798019801980198, + "loss": 2.2262, + "step": 4440 + }, + { + "epoch": 3.24, + "grad_norm": 0.7450495362281799, + "learning_rate": 0.0007970297029702971, + "loss": 2.2241, + "step": 4450 + }, + { + "epoch": 3.25, + "grad_norm": 0.5571395754814148, + "learning_rate": 0.0007960396039603961, + "loss": 2.2245, + "step": 4460 + }, + { + "epoch": 3.26, + "grad_norm": 0.622724175453186, + "learning_rate": 0.0007950495049504951, + "loss": 2.2183, + "step": 4470 + }, + { + "epoch": 3.26, + "grad_norm": 0.621284544467926, + "learning_rate": 0.0007940594059405941, + "loss": 2.2153, + "step": 4480 + }, + { + "epoch": 3.27, + "grad_norm": 0.9469096660614014, + "learning_rate": 0.0007930693069306931, + "loss": 2.2156, + "step": 4490 + }, + { + "epoch": 3.28, + "grad_norm": 0.6318257451057434, + "learning_rate": 0.0007920792079207921, + "loss": 2.2143, + "step": 4500 + }, + { + "epoch": 3.28, + "eval_accuracy": 0.5673093799211185, + "eval_loss": 2.0671584606170654, + "eval_runtime": 1088.5283, + "eval_samples_per_second": 458.766, + "eval_steps_per_second": 2.049, + "step": 4500 + }, + { + "epoch": 3.29, + "grad_norm": 0.7229343056678772, + "learning_rate": 0.0007910891089108912, + "loss": 2.2111, + "step": 4510 + }, + { + "epoch": 3.29, + "grad_norm": 0.8711042404174805, + "learning_rate": 0.0007900990099009901, + "loss": 2.2144, + "step": 4520 + }, + { + "epoch": 3.3, + "grad_norm": 0.5540309548377991, + "learning_rate": 0.0007891089108910892, + "loss": 2.2063, + "step": 4530 + }, + { + "epoch": 3.31, + "grad_norm": 0.6157627105712891, + "learning_rate": 0.0007881188118811881, + "loss": 2.2026, + "step": 4540 + }, + { + "epoch": 3.32, + "grad_norm": 1.0260213613510132, + "learning_rate": 0.0007871287128712872, + "loss": 2.2107, + "step": 4550 + }, + { + "epoch": 3.32, + "grad_norm": 0.5229135155677795, + "learning_rate": 0.0007861386138613862, + "loss": 2.2031, + "step": 4560 + }, + { + "epoch": 3.33, + "grad_norm": 0.460483580827713, + "learning_rate": 0.0007851485148514852, + "loss": 2.1956, + "step": 4570 + }, + { + "epoch": 3.34, + "grad_norm": 0.9283266067504883, + "learning_rate": 0.0007841584158415842, + "loss": 2.1976, + "step": 4580 + }, + { + "epoch": 3.34, + "grad_norm": 0.5621626973152161, + "learning_rate": 0.0007831683168316832, + "loss": 2.1972, + "step": 4590 + }, + { + "epoch": 3.34, + "eval_accuracy": 0.5702843964025183, + "eval_loss": 2.049508810043335, + "eval_runtime": 1086.5875, + "eval_samples_per_second": 459.586, + "eval_steps_per_second": 2.052, + "step": 4590 + }, + { + "epoch": 3.35, + "grad_norm": 0.6998817324638367, + "learning_rate": 0.0007821782178217822, + "loss": 2.1905, + "step": 4600 + }, + { + "epoch": 3.36, + "grad_norm": 0.6161481738090515, + "learning_rate": 0.0007811881188118813, + "loss": 2.1916, + "step": 4610 + }, + { + "epoch": 3.37, + "grad_norm": 0.7690967321395874, + "learning_rate": 0.0007801980198019802, + "loss": 2.1904, + "step": 4620 + }, + { + "epoch": 3.37, + "grad_norm": 0.6058160066604614, + "learning_rate": 0.0007792079207920793, + "loss": 2.1894, + "step": 4630 + }, + { + "epoch": 3.38, + "grad_norm": 0.5859819054603577, + "learning_rate": 0.0007782178217821782, + "loss": 2.1823, + "step": 4640 + }, + { + "epoch": 3.39, + "grad_norm": 0.7922290563583374, + "learning_rate": 0.0007772277227722773, + "loss": 2.1816, + "step": 4650 + }, + { + "epoch": 3.4, + "grad_norm": 0.617785632610321, + "learning_rate": 0.0007762376237623763, + "loss": 2.1855, + "step": 4660 + }, + { + "epoch": 3.4, + "grad_norm": 0.6082860827445984, + "learning_rate": 0.0007752475247524753, + "loss": 2.1811, + "step": 4670 + }, + { + "epoch": 3.41, + "grad_norm": 0.44060420989990234, + "learning_rate": 0.0007742574257425743, + "loss": 2.1738, + "step": 4680 + }, + { + "epoch": 3.41, + "eval_accuracy": 0.5732795510180845, + "eval_loss": 2.031883955001831, + "eval_runtime": 1085.1554, + "eval_samples_per_second": 460.192, + "eval_steps_per_second": 2.055, + "step": 4680 + }, + { + "epoch": 3.42, + "grad_norm": 0.6842211484909058, + "learning_rate": 0.0007732673267326733, + "loss": 2.1746, + "step": 4690 + }, + { + "epoch": 3.42, + "grad_norm": 0.7363536357879639, + "learning_rate": 0.0007722772277227723, + "loss": 2.1756, + "step": 4700 + }, + { + "epoch": 3.43, + "grad_norm": 0.657122015953064, + "learning_rate": 0.0007712871287128714, + "loss": 2.1767, + "step": 4710 + }, + { + "epoch": 3.44, + "grad_norm": 0.525112509727478, + "learning_rate": 0.0007702970297029703, + "loss": 2.1705, + "step": 4720 + }, + { + "epoch": 3.45, + "grad_norm": 0.594642162322998, + "learning_rate": 0.0007693069306930694, + "loss": 2.167, + "step": 4730 + }, + { + "epoch": 3.45, + "grad_norm": 0.7353718280792236, + "learning_rate": 0.0007683168316831683, + "loss": 2.1678, + "step": 4740 + }, + { + "epoch": 3.46, + "grad_norm": 0.7462971806526184, + "learning_rate": 0.0007673267326732674, + "loss": 2.1665, + "step": 4750 + }, + { + "epoch": 3.47, + "grad_norm": 0.543685257434845, + "learning_rate": 0.0007663366336633664, + "loss": 2.1624, + "step": 4760 + }, + { + "epoch": 3.48, + "grad_norm": 0.53340744972229, + "learning_rate": 0.0007653465346534654, + "loss": 2.1587, + "step": 4770 + }, + { + "epoch": 3.48, + "eval_accuracy": 0.5758243970327254, + "eval_loss": 2.01580810546875, + "eval_runtime": 1087.4822, + "eval_samples_per_second": 459.208, + "eval_steps_per_second": 2.051, + "step": 4770 + }, + { + "epoch": 3.48, + "grad_norm": 0.819010853767395, + "learning_rate": 0.0007643564356435644, + "loss": 2.1623, + "step": 4780 + }, + { + "epoch": 3.49, + "grad_norm": 0.6191548109054565, + "learning_rate": 0.0007633663366336634, + "loss": 2.1611, + "step": 4790 + }, + { + "epoch": 3.5, + "grad_norm": 0.5724292993545532, + "learning_rate": 0.0007623762376237624, + "loss": 2.1583, + "step": 4800 + }, + { + "epoch": 3.5, + "grad_norm": 0.6278745532035828, + "learning_rate": 0.0007613861386138615, + "loss": 2.1573, + "step": 4810 + }, + { + "epoch": 3.51, + "grad_norm": 0.6978874802589417, + "learning_rate": 0.0007603960396039604, + "loss": 2.1523, + "step": 4820 + }, + { + "epoch": 3.52, + "grad_norm": 0.9318163990974426, + "learning_rate": 0.0007594059405940595, + "loss": 2.1524, + "step": 4830 + }, + { + "epoch": 3.53, + "grad_norm": 0.5397381782531738, + "learning_rate": 0.0007584158415841584, + "loss": 2.1505, + "step": 4840 + }, + { + "epoch": 3.53, + "grad_norm": 0.6277997493743896, + "learning_rate": 0.0007574257425742574, + "loss": 2.15, + "step": 4850 + }, + { + "epoch": 3.54, + "grad_norm": 0.6128600239753723, + "learning_rate": 0.0007564356435643565, + "loss": 2.1466, + "step": 4860 + }, + { + "epoch": 3.54, + "eval_accuracy": 0.5778634145294884, + "eval_loss": 2.0040018558502197, + "eval_runtime": 1086.3606, + "eval_samples_per_second": 459.682, + "eval_steps_per_second": 2.053, + "step": 4860 + }, + { + "epoch": 3.55, + "grad_norm": 0.602790892124176, + "learning_rate": 0.0007554455445544554, + "loss": 2.1483, + "step": 4870 + }, + { + "epoch": 3.56, + "grad_norm": 0.5856905579566956, + "learning_rate": 0.0007544554455445545, + "loss": 2.1471, + "step": 4880 + }, + { + "epoch": 3.56, + "grad_norm": 0.6018987894058228, + "learning_rate": 0.0007534653465346534, + "loss": 2.141, + "step": 4890 + }, + { + "epoch": 3.57, + "grad_norm": 0.6041855216026306, + "learning_rate": 0.0007524752475247525, + "loss": 2.1385, + "step": 4900 + }, + { + "epoch": 3.58, + "grad_norm": 0.580766499042511, + "learning_rate": 0.0007514851485148515, + "loss": 2.1375, + "step": 4910 + }, + { + "epoch": 3.58, + "grad_norm": 0.5637401342391968, + "learning_rate": 0.0007504950495049505, + "loss": 2.1374, + "step": 4920 + }, + { + "epoch": 3.59, + "grad_norm": 0.6017095446586609, + "learning_rate": 0.0007495049504950495, + "loss": 2.1372, + "step": 4930 + }, + { + "epoch": 3.6, + "grad_norm": 0.766730010509491, + "learning_rate": 0.0007485148514851485, + "loss": 2.1336, + "step": 4940 + }, + { + "epoch": 3.61, + "grad_norm": 0.5679196119308472, + "learning_rate": 0.0007475247524752475, + "loss": 2.1339, + "step": 4950 + }, + { + "epoch": 3.61, + "eval_accuracy": 0.5801526152356328, + "eval_loss": 1.9901340007781982, + "eval_runtime": 1085.9898, + "eval_samples_per_second": 459.839, + "eval_steps_per_second": 2.053, + "step": 4950 + }, + { + "epoch": 3.61, + "grad_norm": 0.5875471234321594, + "learning_rate": 0.0007465346534653466, + "loss": 2.1327, + "step": 4960 + }, + { + "epoch": 3.62, + "grad_norm": 0.885311484336853, + "learning_rate": 0.0007455445544554455, + "loss": 2.1319, + "step": 4970 + }, + { + "epoch": 3.63, + "grad_norm": 0.5480872392654419, + "learning_rate": 0.0007445544554455446, + "loss": 2.1283, + "step": 4980 + }, + { + "epoch": 3.64, + "grad_norm": 0.6052006483078003, + "learning_rate": 0.0007435643564356435, + "loss": 2.1258, + "step": 4990 + }, + { + "epoch": 3.64, + "grad_norm": 0.4672467112541199, + "learning_rate": 0.0007425742574257426, + "loss": 2.1246, + "step": 5000 + }, + { + "epoch": 3.65, + "grad_norm": 0.7137532234191895, + "learning_rate": 0.0007415841584158416, + "loss": 2.1225, + "step": 5010 + }, + { + "epoch": 3.66, + "grad_norm": 0.726308286190033, + "learning_rate": 0.0007405940594059406, + "loss": 2.1247, + "step": 5020 + }, + { + "epoch": 3.67, + "grad_norm": 0.4779931902885437, + "learning_rate": 0.0007396039603960396, + "loss": 2.1215, + "step": 5030 + }, + { + "epoch": 3.67, + "grad_norm": 0.5192296504974365, + "learning_rate": 0.0007386138613861386, + "loss": 2.1151, + "step": 5040 + }, + { + "epoch": 3.67, + "eval_accuracy": 0.5818349975818327, + "eval_loss": 1.981979489326477, + "eval_runtime": 1085.0979, + "eval_samples_per_second": 460.217, + "eval_steps_per_second": 2.055, + "step": 5040 + }, + { + "epoch": 3.68, + "grad_norm": 0.824546754360199, + "learning_rate": 0.0007376237623762376, + "loss": 2.1176, + "step": 5050 + }, + { + "epoch": 3.69, + "grad_norm": 0.5459938049316406, + "learning_rate": 0.0007366336633663367, + "loss": 2.1209, + "step": 5060 + }, + { + "epoch": 3.69, + "grad_norm": 0.657993733882904, + "learning_rate": 0.0007356435643564356, + "loss": 2.1174, + "step": 5070 + }, + { + "epoch": 3.7, + "grad_norm": 0.6625123023986816, + "learning_rate": 0.0007346534653465347, + "loss": 2.114, + "step": 5080 + }, + { + "epoch": 3.71, + "grad_norm": 0.6521473526954651, + "learning_rate": 0.0007336633663366336, + "loss": 2.1126, + "step": 5090 + }, + { + "epoch": 3.72, + "grad_norm": 0.540843665599823, + "learning_rate": 0.0007326732673267327, + "loss": 2.1139, + "step": 5100 + }, + { + "epoch": 3.72, + "grad_norm": 0.5456762313842773, + "learning_rate": 0.0007316831683168317, + "loss": 2.1096, + "step": 5110 + }, + { + "epoch": 3.73, + "grad_norm": 0.6277236938476562, + "learning_rate": 0.0007306930693069307, + "loss": 2.1085, + "step": 5120 + }, + { + "epoch": 3.74, + "grad_norm": 0.5047609210014343, + "learning_rate": 0.0007297029702970297, + "loss": 2.1048, + "step": 5130 + }, + { + "epoch": 3.74, + "eval_accuracy": 0.5844656610858342, + "eval_loss": 1.9673104286193848, + "eval_runtime": 1086.0993, + "eval_samples_per_second": 459.792, + "eval_steps_per_second": 2.053, + "step": 5130 + }, + { + "epoch": 3.75, + "grad_norm": 0.5852828025817871, + "learning_rate": 0.0007287128712871287, + "loss": 2.1091, + "step": 5140 + }, + { + "epoch": 3.75, + "grad_norm": 0.49883949756622314, + "learning_rate": 0.0007277227722772277, + "loss": 2.1059, + "step": 5150 + }, + { + "epoch": 3.76, + "grad_norm": 0.4874211251735687, + "learning_rate": 0.0007267326732673268, + "loss": 2.101, + "step": 5160 + }, + { + "epoch": 3.77, + "grad_norm": 0.807388961315155, + "learning_rate": 0.0007257425742574257, + "loss": 2.1019, + "step": 5170 + }, + { + "epoch": 3.77, + "grad_norm": 0.4876428246498108, + "learning_rate": 0.0007247524752475248, + "loss": 2.101, + "step": 5180 + }, + { + "epoch": 3.78, + "grad_norm": 0.5534060597419739, + "learning_rate": 0.0007237623762376237, + "loss": 2.096, + "step": 5190 + }, + { + "epoch": 3.79, + "grad_norm": 0.5464605093002319, + "learning_rate": 0.0007227722772277228, + "loss": 2.0999, + "step": 5200 + }, + { + "epoch": 3.8, + "grad_norm": 0.6738607883453369, + "learning_rate": 0.0007217821782178218, + "loss": 2.0973, + "step": 5210 + }, + { + "epoch": 3.8, + "grad_norm": 0.5829378366470337, + "learning_rate": 0.0007207920792079208, + "loss": 2.0943, + "step": 5220 + }, + { + "epoch": 3.8, + "eval_accuracy": 0.5862642603086486, + "eval_loss": 1.9560039043426514, + "eval_runtime": 1085.7866, + "eval_samples_per_second": 459.925, + "eval_steps_per_second": 2.054, + "step": 5220 + }, + { + "epoch": 3.81, + "grad_norm": 0.6175411939620972, + "learning_rate": 0.0007198019801980198, + "loss": 2.097, + "step": 5230 + }, + { + "epoch": 3.82, + "grad_norm": 0.4551532566547394, + "learning_rate": 0.0007188118811881188, + "loss": 2.0909, + "step": 5240 + }, + { + "epoch": 3.83, + "grad_norm": 0.6612002849578857, + "learning_rate": 0.0007178217821782178, + "loss": 2.0885, + "step": 5250 + }, + { + "epoch": 3.83, + "grad_norm": 0.5608059763908386, + "learning_rate": 0.0007168316831683169, + "loss": 2.0893, + "step": 5260 + }, + { + "epoch": 3.84, + "grad_norm": 0.6450534462928772, + "learning_rate": 0.0007158415841584158, + "loss": 2.0881, + "step": 5270 + }, + { + "epoch": 3.85, + "grad_norm": 0.5012123584747314, + "learning_rate": 0.0007148514851485149, + "loss": 2.0843, + "step": 5280 + }, + { + "epoch": 3.85, + "grad_norm": 0.5907981395721436, + "learning_rate": 0.0007138613861386138, + "loss": 2.0848, + "step": 5290 + }, + { + "epoch": 3.86, + "grad_norm": 0.6378484964370728, + "learning_rate": 0.0007128712871287129, + "loss": 2.0891, + "step": 5300 + }, + { + "epoch": 3.87, + "grad_norm": 0.5008774399757385, + "learning_rate": 0.0007118811881188119, + "loss": 2.0839, + "step": 5310 + }, + { + "epoch": 3.87, + "eval_accuracy": 0.5886411137424359, + "eval_loss": 1.9411782026290894, + "eval_runtime": 1086.0682, + "eval_samples_per_second": 459.805, + "eval_steps_per_second": 2.053, + "step": 5310 + }, + { + "epoch": 3.88, + "grad_norm": 0.5727280378341675, + "learning_rate": 0.0007108910891089109, + "loss": 2.0834, + "step": 5320 + }, + { + "epoch": 3.88, + "grad_norm": 0.5417036414146423, + "learning_rate": 0.0007099009900990099, + "loss": 2.0806, + "step": 5330 + }, + { + "epoch": 3.89, + "grad_norm": 0.5739684104919434, + "learning_rate": 0.0007089108910891088, + "loss": 2.0809, + "step": 5340 + }, + { + "epoch": 3.9, + "grad_norm": 0.4842034876346588, + "learning_rate": 0.0007079207920792079, + "loss": 2.0787, + "step": 5350 + }, + { + "epoch": 3.91, + "grad_norm": 0.7235381603240967, + "learning_rate": 0.000706930693069307, + "loss": 2.0761, + "step": 5360 + }, + { + "epoch": 3.91, + "grad_norm": 0.5333278775215149, + "learning_rate": 0.0007059405940594059, + "loss": 2.082, + "step": 5370 + }, + { + "epoch": 3.92, + "grad_norm": 0.5933953523635864, + "learning_rate": 0.000704950495049505, + "loss": 2.0711, + "step": 5380 + }, + { + "epoch": 3.93, + "grad_norm": 0.3995033800601959, + "learning_rate": 0.0007039603960396039, + "loss": 2.072, + "step": 5390 + }, + { + "epoch": 3.93, + "grad_norm": 0.5461521744728088, + "learning_rate": 0.0007029702970297029, + "loss": 2.0748, + "step": 5400 + }, + { + "epoch": 3.93, + "eval_accuracy": 0.5892188513716441, + "eval_loss": 1.9380106925964355, + "eval_runtime": 1086.2866, + "eval_samples_per_second": 459.713, + "eval_steps_per_second": 2.053, + "step": 5400 + }, + { + "epoch": 3.94, + "grad_norm": 0.6759417057037354, + "learning_rate": 0.000701980198019802, + "loss": 2.0743, + "step": 5410 + }, + { + "epoch": 3.95, + "grad_norm": 0.4820743203163147, + "learning_rate": 0.0007009900990099009, + "loss": 2.0708, + "step": 5420 + }, + { + "epoch": 3.96, + "grad_norm": 0.5824475884437561, + "learning_rate": 0.0007, + "loss": 2.0676, + "step": 5430 + }, + { + "epoch": 3.96, + "grad_norm": 0.5133311748504639, + "learning_rate": 0.0006990099009900989, + "loss": 2.0688, + "step": 5440 + }, + { + "epoch": 3.97, + "grad_norm": 0.5744913220405579, + "learning_rate": 0.000698019801980198, + "loss": 2.0657, + "step": 5450 + }, + { + "epoch": 3.98, + "grad_norm": 0.5103346705436707, + "learning_rate": 0.000697029702970297, + "loss": 2.0674, + "step": 5460 + }, + { + "epoch": 3.99, + "grad_norm": 0.48300009965896606, + "learning_rate": 0.000696039603960396, + "loss": 2.0649, + "step": 5470 + }, + { + "epoch": 3.99, + "grad_norm": 0.64620441198349, + "learning_rate": 0.000695049504950495, + "loss": 2.0658, + "step": 5480 + }, + { + "epoch": 4.0, + "grad_norm": 0.5096336603164673, + "learning_rate": 0.000694059405940594, + "loss": 2.0671, + "step": 5490 + }, + { + "epoch": 4.0, + "eval_accuracy": 0.5915717983649539, + "eval_loss": 1.9237810373306274, + "eval_runtime": 1088.2291, + "eval_samples_per_second": 458.892, + "eval_steps_per_second": 2.049, + "step": 5490 + }, + { + "epoch": 4.01, + "grad_norm": 0.45002949237823486, + "learning_rate": 0.000693069306930693, + "loss": 2.0612, + "step": 5500 + }, + { + "epoch": 4.01, + "grad_norm": 0.48484668135643005, + "learning_rate": 0.0006920792079207921, + "loss": 2.0598, + "step": 5510 + }, + { + "epoch": 4.02, + "grad_norm": 0.5888765454292297, + "learning_rate": 0.000691089108910891, + "loss": 2.0665, + "step": 5520 + }, + { + "epoch": 4.03, + "grad_norm": 0.6311919689178467, + "learning_rate": 0.0006900990099009901, + "loss": 2.0604, + "step": 5530 + }, + { + "epoch": 4.04, + "grad_norm": 0.4337356686592102, + "learning_rate": 0.000689108910891089, + "loss": 2.0562, + "step": 5540 + }, + { + "epoch": 4.04, + "grad_norm": 0.5855375528335571, + "learning_rate": 0.0006881188118811881, + "loss": 2.0535, + "step": 5550 + }, + { + "epoch": 4.05, + "grad_norm": 0.6715276837348938, + "learning_rate": 0.0006871287128712872, + "loss": 2.0569, + "step": 5560 + }, + { + "epoch": 4.06, + "grad_norm": 0.5453487634658813, + "learning_rate": 0.0006861386138613862, + "loss": 2.0539, + "step": 5570 + }, + { + "epoch": 4.07, + "grad_norm": 0.44611501693725586, + "learning_rate": 0.0006851485148514852, + "loss": 2.0557, + "step": 5580 + }, + { + "epoch": 4.07, + "eval_accuracy": 0.593527753135558, + "eval_loss": 1.9137904644012451, + "eval_runtime": 1092.9474, + "eval_samples_per_second": 456.911, + "eval_steps_per_second": 2.04, + "step": 5580 + }, + { + "epoch": 4.07, + "grad_norm": 0.46945101022720337, + "learning_rate": 0.0006841584158415842, + "loss": 2.0494, + "step": 5590 + }, + { + "epoch": 4.08, + "grad_norm": 0.6355441808700562, + "learning_rate": 0.0006831683168316832, + "loss": 2.0535, + "step": 5600 + }, + { + "epoch": 4.09, + "grad_norm": 0.5079066157341003, + "learning_rate": 0.0006821782178217823, + "loss": 2.0535, + "step": 5610 + }, + { + "epoch": 4.09, + "grad_norm": 0.48156508803367615, + "learning_rate": 0.0006811881188118812, + "loss": 2.0479, + "step": 5620 + }, + { + "epoch": 4.1, + "grad_norm": 0.40759479999542236, + "learning_rate": 0.0006801980198019803, + "loss": 2.0465, + "step": 5630 + }, + { + "epoch": 4.11, + "grad_norm": 0.6267878413200378, + "learning_rate": 0.0006792079207920792, + "loss": 2.047, + "step": 5640 + }, + { + "epoch": 4.12, + "grad_norm": 0.4935464560985565, + "learning_rate": 0.0006782178217821783, + "loss": 2.0457, + "step": 5650 + }, + { + "epoch": 4.12, + "grad_norm": 0.8215575814247131, + "learning_rate": 0.0006772277227722773, + "loss": 2.0459, + "step": 5660 + }, + { + "epoch": 4.13, + "grad_norm": 0.5285871624946594, + "learning_rate": 0.0006762376237623763, + "loss": 2.046, + "step": 5670 + }, + { + "epoch": 4.13, + "eval_accuracy": 0.5947354040081998, + "eval_loss": 1.9072514772415161, + "eval_runtime": 1086.9971, + "eval_samples_per_second": 459.412, + "eval_steps_per_second": 2.052, + "step": 5670 + }, + { + "epoch": 4.14, + "grad_norm": 0.4480656087398529, + "learning_rate": 0.0006752475247524753, + "loss": 2.0419, + "step": 5680 + }, + { + "epoch": 4.15, + "grad_norm": 0.5856757760047913, + "learning_rate": 0.0006742574257425743, + "loss": 2.043, + "step": 5690 + }, + { + "epoch": 4.15, + "grad_norm": 0.5994493365287781, + "learning_rate": 0.0006732673267326733, + "loss": 2.0423, + "step": 5700 + }, + { + "epoch": 4.16, + "grad_norm": 0.5151802897453308, + "learning_rate": 0.0006722772277227724, + "loss": 2.0367, + "step": 5710 + }, + { + "epoch": 4.17, + "grad_norm": 0.5299440622329712, + "learning_rate": 0.0006712871287128713, + "loss": 2.0404, + "step": 5720 + }, + { + "epoch": 4.18, + "grad_norm": 0.5033411979675293, + "learning_rate": 0.0006702970297029704, + "loss": 2.0383, + "step": 5730 + }, + { + "epoch": 4.18, + "grad_norm": 0.5515163540840149, + "learning_rate": 0.0006693069306930693, + "loss": 2.0377, + "step": 5740 + }, + { + "epoch": 4.19, + "grad_norm": 0.6445341110229492, + "learning_rate": 0.0006683168316831684, + "loss": 2.0348, + "step": 5750 + }, + { + "epoch": 4.2, + "grad_norm": 0.48192110657691956, + "learning_rate": 0.0006673267326732674, + "loss": 2.0376, + "step": 5760 + }, + { + "epoch": 4.2, + "eval_accuracy": 0.5963143559474492, + "eval_loss": 1.8983112573623657, + "eval_runtime": 1086.0935, + "eval_samples_per_second": 459.795, + "eval_steps_per_second": 2.053, + "step": 5760 + }, + { + "epoch": 4.2, + "grad_norm": 0.43177658319473267, + "learning_rate": 0.0006663366336633664, + "loss": 2.0304, + "step": 5770 + }, + { + "epoch": 4.21, + "grad_norm": 0.5673606395721436, + "learning_rate": 0.0006653465346534654, + "loss": 2.0342, + "step": 5780 + }, + { + "epoch": 4.22, + "grad_norm": 0.6636048555374146, + "learning_rate": 0.0006643564356435644, + "loss": 2.035, + "step": 5790 + }, + { + "epoch": 4.23, + "grad_norm": 0.5259466171264648, + "learning_rate": 0.0006633663366336634, + "loss": 2.0323, + "step": 5800 + }, + { + "epoch": 4.23, + "grad_norm": 0.561341404914856, + "learning_rate": 0.0006623762376237625, + "loss": 2.0277, + "step": 5810 + }, + { + "epoch": 4.24, + "grad_norm": 0.48961034417152405, + "learning_rate": 0.0006613861386138614, + "loss": 2.0316, + "step": 5820 + }, + { + "epoch": 4.25, + "grad_norm": 0.46912887692451477, + "learning_rate": 0.0006603960396039605, + "loss": 2.0285, + "step": 5830 + }, + { + "epoch": 4.26, + "grad_norm": 0.5009626746177673, + "learning_rate": 0.0006594059405940594, + "loss": 2.0287, + "step": 5840 + }, + { + "epoch": 4.26, + "grad_norm": 0.5485634207725525, + "learning_rate": 0.0006584158415841585, + "loss": 2.0275, + "step": 5850 + }, + { + "epoch": 4.26, + "eval_accuracy": 0.5978907811550054, + "eval_loss": 1.8894693851470947, + "eval_runtime": 1085.9104, + "eval_samples_per_second": 459.872, + "eval_steps_per_second": 2.054, + "step": 5850 + }, + { + "epoch": 4.27, + "grad_norm": 0.5094584822654724, + "learning_rate": 0.0006574257425742575, + "loss": 2.0285, + "step": 5860 + }, + { + "epoch": 4.28, + "grad_norm": 0.5796740651130676, + "learning_rate": 0.0006564356435643565, + "loss": 2.0246, + "step": 5870 + }, + { + "epoch": 4.28, + "grad_norm": 0.5473222136497498, + "learning_rate": 0.0006554455445544555, + "loss": 2.0262, + "step": 5880 + }, + { + "epoch": 4.29, + "grad_norm": 0.514639675617218, + "learning_rate": 0.0006544554455445545, + "loss": 2.0196, + "step": 5890 + }, + { + "epoch": 4.3, + "grad_norm": 0.7184515595436096, + "learning_rate": 0.0006534653465346535, + "loss": 2.0228, + "step": 5900 + }, + { + "epoch": 4.31, + "grad_norm": 0.5729575157165527, + "learning_rate": 0.0006524752475247526, + "loss": 2.0216, + "step": 5910 + }, + { + "epoch": 4.31, + "grad_norm": 0.543946385383606, + "learning_rate": 0.0006514851485148515, + "loss": 2.0221, + "step": 5920 + }, + { + "epoch": 4.32, + "grad_norm": 0.5328618884086609, + "learning_rate": 0.0006504950495049506, + "loss": 2.0205, + "step": 5930 + }, + { + "epoch": 4.33, + "grad_norm": 0.45202726125717163, + "learning_rate": 0.0006495049504950495, + "loss": 2.0171, + "step": 5940 + }, + { + "epoch": 4.33, + "eval_accuracy": 0.5995061137091588, + "eval_loss": 1.880002498626709, + "eval_runtime": 1087.0114, + "eval_samples_per_second": 459.406, + "eval_steps_per_second": 2.051, + "step": 5940 + }, + { + "epoch": 4.34, + "grad_norm": 0.6767913103103638, + "learning_rate": 0.0006485148514851485, + "loss": 2.0196, + "step": 5950 + }, + { + "epoch": 4.34, + "grad_norm": 0.6758072972297668, + "learning_rate": 0.0006475247524752476, + "loss": 2.0187, + "step": 5960 + }, + { + "epoch": 4.35, + "grad_norm": 0.49338895082473755, + "learning_rate": 0.0006465346534653465, + "loss": 2.0203, + "step": 5970 + }, + { + "epoch": 4.36, + "grad_norm": 0.6283939480781555, + "learning_rate": 0.0006455445544554456, + "loss": 2.0141, + "step": 5980 + }, + { + "epoch": 4.36, + "grad_norm": 0.4420575499534607, + "learning_rate": 0.0006445544554455445, + "loss": 2.0169, + "step": 5990 + }, + { + "epoch": 4.37, + "grad_norm": 0.5496084690093994, + "learning_rate": 0.0006435643564356436, + "loss": 2.0132, + "step": 6000 + }, + { + "epoch": 4.38, + "grad_norm": 0.5455350279808044, + "learning_rate": 0.0006425742574257426, + "loss": 2.0135, + "step": 6010 + }, + { + "epoch": 4.39, + "grad_norm": 0.5139738917350769, + "learning_rate": 0.0006415841584158416, + "loss": 2.0165, + "step": 6020 + }, + { + "epoch": 4.39, + "grad_norm": 0.5023753046989441, + "learning_rate": 0.0006405940594059406, + "loss": 2.0107, + "step": 6030 + }, + { + "epoch": 4.39, + "eval_accuracy": 0.6004914418538849, + "eval_loss": 1.8729732036590576, + "eval_runtime": 1086.4568, + "eval_samples_per_second": 459.641, + "eval_steps_per_second": 2.053, + "step": 6030 + }, + { + "epoch": 4.4, + "grad_norm": 0.50832599401474, + "learning_rate": 0.0006396039603960396, + "loss": 2.0125, + "step": 6040 + }, + { + "epoch": 4.41, + "grad_norm": 0.6144891977310181, + "learning_rate": 0.0006386138613861386, + "loss": 2.0074, + "step": 6050 + }, + { + "epoch": 4.42, + "grad_norm": 0.5852723121643066, + "learning_rate": 0.0006376237623762377, + "loss": 2.0116, + "step": 6060 + }, + { + "epoch": 4.42, + "grad_norm": 0.6694257259368896, + "learning_rate": 0.0006366336633663366, + "loss": 2.0068, + "step": 6070 + }, + { + "epoch": 4.43, + "grad_norm": 0.5024294257164001, + "learning_rate": 0.0006356435643564357, + "loss": 2.0097, + "step": 6080 + }, + { + "epoch": 4.44, + "grad_norm": 0.4397622048854828, + "learning_rate": 0.0006346534653465346, + "loss": 2.0058, + "step": 6090 + }, + { + "epoch": 4.44, + "grad_norm": 0.46592214703559875, + "learning_rate": 0.0006336633663366337, + "loss": 2.0036, + "step": 6100 + }, + { + "epoch": 4.45, + "grad_norm": 0.6728220582008362, + "learning_rate": 0.0006326732673267327, + "loss": 2.0027, + "step": 6110 + }, + { + "epoch": 4.46, + "grad_norm": 0.5555120706558228, + "learning_rate": 0.0006316831683168317, + "loss": 2.0037, + "step": 6120 + }, + { + "epoch": 4.46, + "eval_accuracy": 0.6017631778318724, + "eval_loss": 1.8681055307388306, + "eval_runtime": 1084.7583, + "eval_samples_per_second": 460.361, + "eval_steps_per_second": 2.056, + "step": 6120 + }, + { + "epoch": 4.47, + "grad_norm": 0.5361539721488953, + "learning_rate": 0.0006306930693069307, + "loss": 2.0029, + "step": 6130 + }, + { + "epoch": 4.47, + "grad_norm": 0.4616907238960266, + "learning_rate": 0.0006297029702970297, + "loss": 2.004, + "step": 6140 + }, + { + "epoch": 4.48, + "grad_norm": 0.4987693727016449, + "learning_rate": 0.0006287128712871287, + "loss": 2.0023, + "step": 6150 + }, + { + "epoch": 4.49, + "grad_norm": 0.5090926289558411, + "learning_rate": 0.0006277227722772278, + "loss": 1.9998, + "step": 6160 + }, + { + "epoch": 4.5, + "grad_norm": 0.5008625388145447, + "learning_rate": 0.0006267326732673267, + "loss": 1.9983, + "step": 6170 + }, + { + "epoch": 4.5, + "grad_norm": 0.5074314475059509, + "learning_rate": 0.0006257425742574258, + "loss": 1.9971, + "step": 6180 + }, + { + "epoch": 4.51, + "grad_norm": 0.5803602933883667, + "learning_rate": 0.0006247524752475247, + "loss": 1.9974, + "step": 6190 + }, + { + "epoch": 4.52, + "grad_norm": 0.534377932548523, + "learning_rate": 0.0006237623762376238, + "loss": 1.9962, + "step": 6200 + }, + { + "epoch": 4.52, + "grad_norm": 0.5166971683502197, + "learning_rate": 0.0006227722772277228, + "loss": 1.9967, + "step": 6210 + }, + { + "epoch": 4.52, + "eval_accuracy": 0.6030721722960667, + "eval_loss": 1.8595592975616455, + "eval_runtime": 1083.857, + "eval_samples_per_second": 460.743, + "eval_steps_per_second": 2.057, + "step": 6210 + }, + { + "epoch": 4.53, + "grad_norm": 0.610471785068512, + "learning_rate": 0.0006217821782178218, + "loss": 1.9972, + "step": 6220 + }, + { + "epoch": 4.54, + "grad_norm": 0.4919542372226715, + "learning_rate": 0.0006207920792079208, + "loss": 1.9945, + "step": 6230 + }, + { + "epoch": 4.55, + "grad_norm": 0.6607844829559326, + "learning_rate": 0.0006198019801980198, + "loss": 1.9962, + "step": 6240 + }, + { + "epoch": 4.55, + "grad_norm": 0.4905446171760559, + "learning_rate": 0.0006188118811881188, + "loss": 1.9937, + "step": 6250 + }, + { + "epoch": 4.56, + "grad_norm": 0.46790874004364014, + "learning_rate": 0.0006178217821782179, + "loss": 1.99, + "step": 6260 + }, + { + "epoch": 4.57, + "grad_norm": 0.5997541546821594, + "learning_rate": 0.0006168316831683168, + "loss": 1.9928, + "step": 6270 + }, + { + "epoch": 4.58, + "grad_norm": 0.5869884490966797, + "learning_rate": 0.0006158415841584159, + "loss": 1.9913, + "step": 6280 + }, + { + "epoch": 4.58, + "grad_norm": 0.5359517335891724, + "learning_rate": 0.0006148514851485148, + "loss": 1.9892, + "step": 6290 + }, + { + "epoch": 4.59, + "grad_norm": 0.5119579434394836, + "learning_rate": 0.0006138613861386139, + "loss": 1.9892, + "step": 6300 + }, + { + "epoch": 4.59, + "eval_accuracy": 0.6040549869719135, + "eval_loss": 1.8538638353347778, + "eval_runtime": 1084.8597, + "eval_samples_per_second": 460.318, + "eval_steps_per_second": 2.056, + "step": 6300 + }, + { + "epoch": 4.6, + "grad_norm": 0.501132071018219, + "learning_rate": 0.0006128712871287129, + "loss": 1.9874, + "step": 6310 + }, + { + "epoch": 4.6, + "grad_norm": 0.5507422089576721, + "learning_rate": 0.0006118811881188119, + "loss": 1.9904, + "step": 6320 + }, + { + "epoch": 4.61, + "grad_norm": 0.5412635207176208, + "learning_rate": 0.0006108910891089109, + "loss": 1.9873, + "step": 6330 + }, + { + "epoch": 4.62, + "grad_norm": 0.44309449195861816, + "learning_rate": 0.0006099009900990099, + "loss": 1.9869, + "step": 6340 + }, + { + "epoch": 4.63, + "grad_norm": 0.5056418776512146, + "learning_rate": 0.0006089108910891089, + "loss": 1.9855, + "step": 6350 + }, + { + "epoch": 4.63, + "grad_norm": 0.48882943391799927, + "learning_rate": 0.000607920792079208, + "loss": 1.984, + "step": 6360 + }, + { + "epoch": 4.64, + "grad_norm": 0.4717276692390442, + "learning_rate": 0.0006069306930693069, + "loss": 1.9841, + "step": 6370 + }, + { + "epoch": 4.65, + "grad_norm": 0.5123859643936157, + "learning_rate": 0.000605940594059406, + "loss": 1.9847, + "step": 6380 + }, + { + "epoch": 4.66, + "grad_norm": 0.4882528483867645, + "learning_rate": 0.0006049504950495049, + "loss": 1.9824, + "step": 6390 + }, + { + "epoch": 4.66, + "eval_accuracy": 0.6054229902100914, + "eval_loss": 1.8454294204711914, + "eval_runtime": 1084.8895, + "eval_samples_per_second": 460.305, + "eval_steps_per_second": 2.056, + "step": 6390 + }, + { + "epoch": 4.66, + "grad_norm": 0.4354398548603058, + "learning_rate": 0.000603960396039604, + "loss": 1.9806, + "step": 6400 + }, + { + "epoch": 4.67, + "grad_norm": 0.5733498334884644, + "learning_rate": 0.000602970297029703, + "loss": 1.9828, + "step": 6410 + }, + { + "epoch": 4.68, + "grad_norm": 0.5736912488937378, + "learning_rate": 0.000601980198019802, + "loss": 1.9816, + "step": 6420 + }, + { + "epoch": 4.69, + "grad_norm": 0.44418302178382874, + "learning_rate": 0.000600990099009901, + "loss": 1.978, + "step": 6430 + }, + { + "epoch": 4.69, + "grad_norm": 0.5334004759788513, + "learning_rate": 0.0006, + "loss": 1.9814, + "step": 6440 + }, + { + "epoch": 4.7, + "grad_norm": 0.5706362128257751, + "learning_rate": 0.000599009900990099, + "loss": 1.9787, + "step": 6450 + }, + { + "epoch": 4.71, + "grad_norm": 0.8315806984901428, + "learning_rate": 0.000598019801980198, + "loss": 1.9772, + "step": 6460 + }, + { + "epoch": 4.71, + "grad_norm": 0.47482118010520935, + "learning_rate": 0.000597029702970297, + "loss": 1.98, + "step": 6470 + }, + { + "epoch": 4.72, + "grad_norm": 0.43432384729385376, + "learning_rate": 0.000596039603960396, + "loss": 1.9766, + "step": 6480 + }, + { + "epoch": 4.72, + "eval_accuracy": 0.6066947766959975, + "eval_loss": 1.8382798433303833, + "eval_runtime": 1084.0208, + "eval_samples_per_second": 460.674, + "eval_steps_per_second": 2.057, + "step": 6480 + }, + { + "epoch": 4.73, + "grad_norm": 0.4684685170650482, + "learning_rate": 0.000595049504950495, + "loss": 1.9747, + "step": 6490 + }, + { + "epoch": 4.74, + "grad_norm": 0.5347057580947876, + "learning_rate": 0.000594059405940594, + "loss": 1.9773, + "step": 6500 + }, + { + "epoch": 4.74, + "grad_norm": 0.5178421139717102, + "learning_rate": 0.0005930693069306931, + "loss": 1.9754, + "step": 6510 + }, + { + "epoch": 4.75, + "grad_norm": 0.5036115050315857, + "learning_rate": 0.000592079207920792, + "loss": 1.9737, + "step": 6520 + }, + { + "epoch": 4.76, + "grad_norm": 0.48273569345474243, + "learning_rate": 0.0005910891089108911, + "loss": 1.9723, + "step": 6530 + }, + { + "epoch": 4.77, + "grad_norm": 0.4350590109825134, + "learning_rate": 0.00059009900990099, + "loss": 1.9715, + "step": 6540 + }, + { + "epoch": 4.77, + "grad_norm": 0.4483092129230499, + "learning_rate": 0.0005891089108910891, + "loss": 1.9718, + "step": 6550 + }, + { + "epoch": 4.78, + "grad_norm": 0.6620519757270813, + "learning_rate": 0.0005881188118811881, + "loss": 1.9726, + "step": 6560 + }, + { + "epoch": 4.79, + "grad_norm": 0.41184690594673157, + "learning_rate": 0.0005871287128712871, + "loss": 1.9682, + "step": 6570 + }, + { + "epoch": 4.79, + "eval_accuracy": 0.6078188605164281, + "eval_loss": 1.8335860967636108, + "eval_runtime": 1085.8051, + "eval_samples_per_second": 459.917, + "eval_steps_per_second": 2.054, + "step": 6570 + }, + { + "epoch": 4.79, + "grad_norm": 0.6032079458236694, + "learning_rate": 0.0005861386138613861, + "loss": 1.9701, + "step": 6580 + }, + { + "epoch": 4.8, + "grad_norm": 0.5113199949264526, + "learning_rate": 0.0005851485148514851, + "loss": 1.9715, + "step": 6590 + }, + { + "epoch": 4.81, + "grad_norm": 0.43198567628860474, + "learning_rate": 0.0005841584158415841, + "loss": 1.9677, + "step": 6600 + }, + { + "epoch": 4.82, + "grad_norm": 0.511009931564331, + "learning_rate": 0.0005831683168316832, + "loss": 1.9692, + "step": 6610 + }, + { + "epoch": 4.82, + "grad_norm": 0.48394373059272766, + "learning_rate": 0.0005821782178217821, + "loss": 1.9658, + "step": 6620 + }, + { + "epoch": 4.83, + "grad_norm": 0.5180623531341553, + "learning_rate": 0.0005811881188118812, + "loss": 1.9656, + "step": 6630 + }, + { + "epoch": 4.84, + "grad_norm": 0.5282729864120483, + "learning_rate": 0.0005801980198019801, + "loss": 1.9672, + "step": 6640 + }, + { + "epoch": 4.85, + "grad_norm": 0.44400766491889954, + "learning_rate": 0.0005792079207920792, + "loss": 1.9628, + "step": 6650 + }, + { + "epoch": 4.85, + "grad_norm": 0.47033068537712097, + "learning_rate": 0.0005782178217821782, + "loss": 1.9653, + "step": 6660 + }, + { + "epoch": 4.85, + "eval_accuracy": 0.6084373992178661, + "eval_loss": 1.8281679153442383, + "eval_runtime": 1084.5844, + "eval_samples_per_second": 460.434, + "eval_steps_per_second": 2.056, + "step": 6660 + }, + { + "epoch": 4.86, + "grad_norm": 0.7458497881889343, + "learning_rate": 0.0005772277227722772, + "loss": 1.9662, + "step": 6670 + }, + { + "epoch": 4.87, + "grad_norm": 0.4841892421245575, + "learning_rate": 0.0005762376237623762, + "loss": 1.9645, + "step": 6680 + }, + { + "epoch": 4.87, + "grad_norm": 0.472526490688324, + "learning_rate": 0.0005752475247524752, + "loss": 1.9621, + "step": 6690 + }, + { + "epoch": 4.88, + "grad_norm": 0.41672539710998535, + "learning_rate": 0.0005742574257425742, + "loss": 1.9595, + "step": 6700 + }, + { + "epoch": 4.89, + "grad_norm": 0.5024113059043884, + "learning_rate": 0.0005732673267326733, + "loss": 1.9593, + "step": 6710 + }, + { + "epoch": 4.9, + "grad_norm": 0.5023587942123413, + "learning_rate": 0.0005722772277227722, + "loss": 1.9599, + "step": 6720 + }, + { + "epoch": 4.9, + "grad_norm": 0.45752909779548645, + "learning_rate": 0.0005712871287128713, + "loss": 1.9599, + "step": 6730 + }, + { + "epoch": 4.91, + "grad_norm": 0.6170557737350464, + "learning_rate": 0.0005702970297029702, + "loss": 1.9616, + "step": 6740 + }, + { + "epoch": 4.92, + "grad_norm": 0.4267810583114624, + "learning_rate": 0.0005693069306930693, + "loss": 1.9599, + "step": 6750 + }, + { + "epoch": 4.92, + "eval_accuracy": 0.609472718679915, + "eval_loss": 1.8216131925582886, + "eval_runtime": 1085.0974, + "eval_samples_per_second": 460.217, + "eval_steps_per_second": 2.055, + "step": 6750 + }, + { + "epoch": 4.93, + "grad_norm": 0.5765691995620728, + "learning_rate": 0.0005683168316831683, + "loss": 1.9571, + "step": 6760 + }, + { + "epoch": 4.93, + "grad_norm": 0.5143380761146545, + "learning_rate": 0.0005673267326732673, + "loss": 1.9594, + "step": 6770 + }, + { + "epoch": 4.94, + "grad_norm": 0.47696933150291443, + "learning_rate": 0.0005663366336633663, + "loss": 1.9569, + "step": 6780 + }, + { + "epoch": 4.95, + "grad_norm": 0.6473893523216248, + "learning_rate": 0.0005653465346534653, + "loss": 1.959, + "step": 6790 + }, + { + "epoch": 4.95, + "grad_norm": 0.42045238614082336, + "learning_rate": 0.0005643564356435643, + "loss": 1.9537, + "step": 6800 + }, + { + "epoch": 4.96, + "grad_norm": 0.47495463490486145, + "learning_rate": 0.0005633663366336634, + "loss": 1.9539, + "step": 6810 + }, + { + "epoch": 4.97, + "grad_norm": 0.46555668115615845, + "learning_rate": 0.0005623762376237624, + "loss": 1.9532, + "step": 6820 + }, + { + "epoch": 4.98, + "grad_norm": 0.5669355988502502, + "learning_rate": 0.0005613861386138615, + "loss": 1.9539, + "step": 6830 + }, + { + "epoch": 4.98, + "grad_norm": 0.5012803077697754, + "learning_rate": 0.0005603960396039604, + "loss": 1.9516, + "step": 6840 + }, + { + "epoch": 4.98, + "eval_accuracy": 0.6106878911706901, + "eval_loss": 1.8161377906799316, + "eval_runtime": 1086.8974, + "eval_samples_per_second": 459.455, + "eval_steps_per_second": 2.052, + "step": 6840 + }, + { + "epoch": 4.99, + "grad_norm": 0.6031624674797058, + "learning_rate": 0.0005594059405940595, + "loss": 1.951, + "step": 6850 + }, + { + "epoch": 5.0, + "grad_norm": 0.48754164576530457, + "learning_rate": 0.0005584158415841585, + "loss": 1.9512, + "step": 6860 + }, + { + "epoch": 5.01, + "grad_norm": 0.5661942362785339, + "learning_rate": 0.0005574257425742575, + "loss": 1.953, + "step": 6870 + }, + { + "epoch": 5.01, + "grad_norm": 0.5200914144515991, + "learning_rate": 0.0005564356435643565, + "loss": 1.9528, + "step": 6880 + }, + { + "epoch": 5.02, + "grad_norm": 0.504625678062439, + "learning_rate": 0.0005554455445544555, + "loss": 1.9473, + "step": 6890 + }, + { + "epoch": 5.03, + "grad_norm": 0.5114207863807678, + "learning_rate": 0.0005544554455445545, + "loss": 1.9503, + "step": 6900 + }, + { + "epoch": 5.03, + "grad_norm": 0.4563724100589752, + "learning_rate": 0.0005534653465346536, + "loss": 1.9522, + "step": 6910 + }, + { + "epoch": 5.04, + "grad_norm": 0.48981210589408875, + "learning_rate": 0.0005524752475247525, + "loss": 1.9463, + "step": 6920 + }, + { + "epoch": 5.05, + "grad_norm": 0.4641856849193573, + "learning_rate": 0.0005514851485148516, + "loss": 1.9473, + "step": 6930 + }, + { + "epoch": 5.05, + "eval_accuracy": 0.6114587633382502, + "eval_loss": 1.8128423690795898, + "eval_runtime": 1088.997, + "eval_samples_per_second": 458.569, + "eval_steps_per_second": 2.048, + "step": 6930 + }, + { + "epoch": 5.06, + "grad_norm": 0.522405743598938, + "learning_rate": 0.0005504950495049505, + "loss": 1.9464, + "step": 6940 + }, + { + "epoch": 5.06, + "grad_norm": 0.5010780692100525, + "learning_rate": 0.0005495049504950496, + "loss": 1.9462, + "step": 6950 + }, + { + "epoch": 5.07, + "grad_norm": 0.4186078906059265, + "learning_rate": 0.0005485148514851486, + "loss": 1.9454, + "step": 6960 + }, + { + "epoch": 5.08, + "grad_norm": 0.43226584792137146, + "learning_rate": 0.0005475247524752476, + "loss": 1.9429, + "step": 6970 + }, + { + "epoch": 5.09, + "grad_norm": 0.4429096579551697, + "learning_rate": 0.0005465346534653466, + "loss": 1.9439, + "step": 6980 + }, + { + "epoch": 5.09, + "grad_norm": 0.5576241612434387, + "learning_rate": 0.0005455445544554456, + "loss": 1.9465, + "step": 6990 + }, + { + "epoch": 5.1, + "grad_norm": 0.5840058326721191, + "learning_rate": 0.0005445544554455446, + "loss": 1.9465, + "step": 7000 + }, + { + "epoch": 5.11, + "grad_norm": 0.42570897936820984, + "learning_rate": 0.0005435643564356437, + "loss": 1.9423, + "step": 7010 + }, + { + "epoch": 5.11, + "grad_norm": 0.4703156650066376, + "learning_rate": 0.0005425742574257426, + "loss": 1.9445, + "step": 7020 + }, + { + "epoch": 5.11, + "eval_accuracy": 0.6122824776592447, + "eval_loss": 1.8064905405044556, + "eval_runtime": 1089.2916, + "eval_samples_per_second": 458.445, + "eval_steps_per_second": 2.047, + "step": 7020 + }, + { + "epoch": 5.12, + "grad_norm": 0.47701114416122437, + "learning_rate": 0.0005415841584158417, + "loss": 1.9414, + "step": 7030 + }, + { + "epoch": 5.13, + "grad_norm": 0.4756263196468353, + "learning_rate": 0.0005405940594059406, + "loss": 1.9412, + "step": 7040 + }, + { + "epoch": 5.14, + "grad_norm": 0.4438433051109314, + "learning_rate": 0.0005396039603960396, + "loss": 1.9396, + "step": 7050 + }, + { + "epoch": 5.14, + "grad_norm": 0.7217634916305542, + "learning_rate": 0.0005386138613861387, + "loss": 1.9405, + "step": 7060 + }, + { + "epoch": 5.15, + "grad_norm": 0.5862283110618591, + "learning_rate": 0.0005376237623762376, + "loss": 1.9408, + "step": 7070 + }, + { + "epoch": 5.16, + "grad_norm": 0.4042336642742157, + "learning_rate": 0.0005366336633663367, + "loss": 1.9389, + "step": 7080 + }, + { + "epoch": 5.17, + "grad_norm": 0.48928365111351013, + "learning_rate": 0.0005356435643564356, + "loss": 1.9387, + "step": 7090 + }, + { + "epoch": 5.17, + "grad_norm": 0.4354238212108612, + "learning_rate": 0.0005346534653465347, + "loss": 1.9383, + "step": 7100 + }, + { + "epoch": 5.18, + "grad_norm": 0.4175672233104706, + "learning_rate": 0.0005336633663366337, + "loss": 1.9352, + "step": 7110 + }, + { + "epoch": 5.18, + "eval_accuracy": 0.6133205485630702, + "eval_loss": 1.8010112047195435, + "eval_runtime": 1088.7719, + "eval_samples_per_second": 458.664, + "eval_steps_per_second": 2.048, + "step": 7110 + }, + { + "epoch": 5.19, + "grad_norm": 0.49512535333633423, + "learning_rate": 0.0005326732673267327, + "loss": 1.936, + "step": 7120 + }, + { + "epoch": 5.2, + "grad_norm": 0.5564088225364685, + "learning_rate": 0.0005316831683168317, + "loss": 1.9352, + "step": 7130 + }, + { + "epoch": 5.2, + "grad_norm": 0.42951545119285583, + "learning_rate": 0.0005306930693069307, + "loss": 1.9382, + "step": 7140 + }, + { + "epoch": 5.21, + "grad_norm": 0.4925052523612976, + "learning_rate": 0.0005297029702970297, + "loss": 1.9332, + "step": 7150 + }, + { + "epoch": 5.22, + "grad_norm": 0.47808635234832764, + "learning_rate": 0.0005287128712871288, + "loss": 1.9347, + "step": 7160 + }, + { + "epoch": 5.22, + "grad_norm": 0.49086272716522217, + "learning_rate": 0.0005277227722772277, + "loss": 1.9346, + "step": 7170 + }, + { + "epoch": 5.23, + "grad_norm": 0.6410700678825378, + "learning_rate": 0.0005267326732673268, + "loss": 1.9355, + "step": 7180 + }, + { + "epoch": 5.24, + "grad_norm": 0.5207043886184692, + "learning_rate": 0.0005257425742574257, + "loss": 1.9362, + "step": 7190 + }, + { + "epoch": 5.25, + "grad_norm": 0.4774588346481323, + "learning_rate": 0.0005247524752475248, + "loss": 1.9275, + "step": 7200 + }, + { + "epoch": 5.25, + "eval_accuracy": 0.6140941851085354, + "eval_loss": 1.795212745666504, + "eval_runtime": 1086.7491, + "eval_samples_per_second": 459.517, + "eval_steps_per_second": 2.052, + "step": 7200 + }, + { + "epoch": 5.25, + "grad_norm": 0.43084409832954407, + "learning_rate": 0.0005237623762376238, + "loss": 1.9323, + "step": 7210 + }, + { + "epoch": 5.26, + "grad_norm": 0.47473257780075073, + "learning_rate": 0.0005227722772277228, + "loss": 1.9325, + "step": 7220 + }, + { + "epoch": 5.27, + "grad_norm": 0.5835340023040771, + "learning_rate": 0.0005217821782178218, + "loss": 1.9307, + "step": 7230 + }, + { + "epoch": 5.28, + "grad_norm": 0.5061103701591492, + "learning_rate": 0.0005207920792079208, + "loss": 1.9312, + "step": 7240 + }, + { + "epoch": 5.28, + "grad_norm": 0.41326335072517395, + "learning_rate": 0.0005198019801980198, + "loss": 1.9302, + "step": 7250 + }, + { + "epoch": 5.29, + "grad_norm": 0.4128727316856384, + "learning_rate": 0.0005188118811881189, + "loss": 1.9306, + "step": 7260 + }, + { + "epoch": 5.3, + "grad_norm": 0.4514748752117157, + "learning_rate": 0.0005178217821782178, + "loss": 1.9306, + "step": 7270 + }, + { + "epoch": 5.3, + "grad_norm": 0.4459412097930908, + "learning_rate": 0.0005168316831683169, + "loss": 1.9273, + "step": 7280 + }, + { + "epoch": 5.31, + "grad_norm": 0.508747935295105, + "learning_rate": 0.0005158415841584158, + "loss": 1.9288, + "step": 7290 + }, + { + "epoch": 5.31, + "eval_accuracy": 0.6147504784530803, + "eval_loss": 1.7935823202133179, + "eval_runtime": 1086.9403, + "eval_samples_per_second": 459.436, + "eval_steps_per_second": 2.052, + "step": 7290 + }, + { + "epoch": 5.32, + "grad_norm": 0.47734731435775757, + "learning_rate": 0.0005148514851485149, + "loss": 1.9298, + "step": 7300 + }, + { + "epoch": 5.33, + "grad_norm": 0.5174552798271179, + "learning_rate": 0.0005138613861386139, + "loss": 1.9265, + "step": 7310 + }, + { + "epoch": 5.33, + "grad_norm": 0.5077902674674988, + "learning_rate": 0.0005128712871287129, + "loss": 1.9261, + "step": 7320 + }, + { + "epoch": 5.34, + "grad_norm": 0.5650092959403992, + "learning_rate": 0.0005118811881188119, + "loss": 1.9298, + "step": 7330 + }, + { + "epoch": 5.35, + "grad_norm": 0.4757165312767029, + "learning_rate": 0.0005108910891089109, + "loss": 1.924, + "step": 7340 + }, + { + "epoch": 5.36, + "grad_norm": 0.39661648869514465, + "learning_rate": 0.0005099009900990099, + "loss": 1.9246, + "step": 7350 + }, + { + "epoch": 5.36, + "grad_norm": 0.4992424547672272, + "learning_rate": 0.000508910891089109, + "loss": 1.9238, + "step": 7360 + }, + { + "epoch": 5.37, + "grad_norm": 0.5065789222717285, + "learning_rate": 0.0005079207920792079, + "loss": 1.9227, + "step": 7370 + }, + { + "epoch": 5.38, + "grad_norm": 0.4040853977203369, + "learning_rate": 0.000506930693069307, + "loss": 1.9216, + "step": 7380 + }, + { + "epoch": 5.38, + "eval_accuracy": 0.6157344716957146, + "eval_loss": 1.7872822284698486, + "eval_runtime": 1088.1718, + "eval_samples_per_second": 458.917, + "eval_steps_per_second": 2.049, + "step": 7380 + }, + { + "epoch": 5.38, + "grad_norm": 0.5322463512420654, + "learning_rate": 0.0005059405940594059, + "loss": 1.9224, + "step": 7390 + }, + { + "epoch": 5.39, + "grad_norm": 0.7337666153907776, + "learning_rate": 0.000504950495049505, + "loss": 1.9215, + "step": 7400 + }, + { + "epoch": 5.4, + "grad_norm": 0.4274744987487793, + "learning_rate": 0.000503960396039604, + "loss": 1.9204, + "step": 7410 + }, + { + "epoch": 5.41, + "grad_norm": 0.5131354928016663, + "learning_rate": 0.000502970297029703, + "loss": 1.9232, + "step": 7420 + }, + { + "epoch": 5.41, + "grad_norm": 0.46376702189445496, + "learning_rate": 0.000501980198019802, + "loss": 1.9206, + "step": 7430 + }, + { + "epoch": 5.42, + "grad_norm": 0.4123290181159973, + "learning_rate": 0.000500990099009901, + "loss": 1.9195, + "step": 7440 + }, + { + "epoch": 5.43, + "grad_norm": 0.5006473660469055, + "learning_rate": 0.0005, + "loss": 1.9193, + "step": 7450 + }, + { + "epoch": 5.44, + "grad_norm": 0.4529099464416504, + "learning_rate": 0.0004990099009900991, + "loss": 1.9194, + "step": 7460 + }, + { + "epoch": 5.44, + "grad_norm": 0.43744367361068726, + "learning_rate": 0.000498019801980198, + "loss": 1.9215, + "step": 7470 + }, + { + "epoch": 5.44, + "eval_accuracy": 0.6164003583042955, + "eval_loss": 1.7831112146377563, + "eval_runtime": 1087.85, + "eval_samples_per_second": 459.052, + "eval_steps_per_second": 2.05, + "step": 7470 + }, + { + "epoch": 5.45, + "grad_norm": 0.4329184591770172, + "learning_rate": 0.0004970297029702971, + "loss": 1.9184, + "step": 7480 + }, + { + "epoch": 5.46, + "grad_norm": 0.4381932020187378, + "learning_rate": 0.000496039603960396, + "loss": 1.9194, + "step": 7490 + }, + { + "epoch": 5.46, + "grad_norm": 0.4533543586730957, + "learning_rate": 0.0004950495049504951, + "loss": 1.9163, + "step": 7500 + }, + { + "epoch": 5.47, + "grad_norm": 0.50531405210495, + "learning_rate": 0.0004940594059405941, + "loss": 1.9196, + "step": 7510 + }, + { + "epoch": 5.48, + "grad_norm": 0.45237472653388977, + "learning_rate": 0.000493069306930693, + "loss": 1.9148, + "step": 7520 + }, + { + "epoch": 5.49, + "grad_norm": 0.4136326313018799, + "learning_rate": 0.0004920792079207921, + "loss": 1.9152, + "step": 7530 + }, + { + "epoch": 5.49, + "grad_norm": 0.5655802488327026, + "learning_rate": 0.000491089108910891, + "loss": 1.9162, + "step": 7540 + }, + { + "epoch": 5.5, + "grad_norm": 0.4693652391433716, + "learning_rate": 0.0004900990099009901, + "loss": 1.9172, + "step": 7550 + }, + { + "epoch": 5.51, + "grad_norm": 0.43454521894454956, + "learning_rate": 0.0004891089108910892, + "loss": 1.9143, + "step": 7560 + }, + { + "epoch": 5.51, + "eval_accuracy": 0.6173679783746377, + "eval_loss": 1.7775607109069824, + "eval_runtime": 1086.5491, + "eval_samples_per_second": 459.602, + "eval_steps_per_second": 2.052, + "step": 7560 + }, + { + "epoch": 5.52, + "grad_norm": 0.5520709156990051, + "learning_rate": 0.0004881188118811881, + "loss": 1.9129, + "step": 7570 + }, + { + "epoch": 5.52, + "grad_norm": 0.5848320722579956, + "learning_rate": 0.00048712871287128715, + "loss": 1.9157, + "step": 7580 + }, + { + "epoch": 5.53, + "grad_norm": 0.4443696141242981, + "learning_rate": 0.00048613861386138615, + "loss": 1.9129, + "step": 7590 + }, + { + "epoch": 5.54, + "grad_norm": 0.4639554023742676, + "learning_rate": 0.00048514851485148515, + "loss": 1.9118, + "step": 7600 + }, + { + "epoch": 5.54, + "grad_norm": 0.4965130090713501, + "learning_rate": 0.00048415841584158414, + "loss": 1.911, + "step": 7610 + }, + { + "epoch": 5.55, + "grad_norm": 0.5710552334785461, + "learning_rate": 0.00048316831683168314, + "loss": 1.9133, + "step": 7620 + }, + { + "epoch": 5.56, + "grad_norm": 0.5551273226737976, + "learning_rate": 0.0004821782178217822, + "loss": 1.9115, + "step": 7630 + }, + { + "epoch": 5.57, + "grad_norm": 0.4237355887889862, + "learning_rate": 0.0004811881188118812, + "loss": 1.9107, + "step": 7640 + }, + { + "epoch": 5.57, + "grad_norm": 0.5999632477760315, + "learning_rate": 0.0004801980198019802, + "loss": 1.9125, + "step": 7650 + }, + { + "epoch": 5.57, + "eval_accuracy": 0.6181112629292015, + "eval_loss": 1.7737356424331665, + "eval_runtime": 1089.6084, + "eval_samples_per_second": 458.311, + "eval_steps_per_second": 2.047, + "step": 7650 + }, + { + "epoch": 5.58, + "grad_norm": 0.4806526303291321, + "learning_rate": 0.0004792079207920792, + "loss": 1.9102, + "step": 7660 + }, + { + "epoch": 5.59, + "grad_norm": 0.4812794327735901, + "learning_rate": 0.0004782178217821782, + "loss": 1.9071, + "step": 7670 + }, + { + "epoch": 5.6, + "grad_norm": 0.4029771089553833, + "learning_rate": 0.00047722772277227724, + "loss": 1.9081, + "step": 7680 + }, + { + "epoch": 5.6, + "grad_norm": 0.40626466274261475, + "learning_rate": 0.00047623762376237624, + "loss": 1.9113, + "step": 7690 + }, + { + "epoch": 5.61, + "grad_norm": 0.4513317942619324, + "learning_rate": 0.00047524752475247524, + "loss": 1.9088, + "step": 7700 + }, + { + "epoch": 5.62, + "grad_norm": 0.4461658000946045, + "learning_rate": 0.00047425742574257423, + "loss": 1.9054, + "step": 7710 + }, + { + "epoch": 5.62, + "grad_norm": 0.4517589807510376, + "learning_rate": 0.00047326732673267323, + "loss": 1.9082, + "step": 7720 + }, + { + "epoch": 5.63, + "grad_norm": 0.45047953724861145, + "learning_rate": 0.0004722772277227723, + "loss": 1.9064, + "step": 7730 + }, + { + "epoch": 5.64, + "grad_norm": 0.4856921434402466, + "learning_rate": 0.0004712871287128713, + "loss": 1.9075, + "step": 7740 + }, + { + "epoch": 5.64, + "eval_accuracy": 0.6187679819070717, + "eval_loss": 1.7702995538711548, + "eval_runtime": 1084.6666, + "eval_samples_per_second": 460.4, + "eval_steps_per_second": 2.056, + "step": 7740 + }, + { + "epoch": 5.65, + "grad_norm": 0.4753153920173645, + "learning_rate": 0.0004702970297029703, + "loss": 1.9063, + "step": 7750 + }, + { + "epoch": 5.65, + "grad_norm": 0.45308151841163635, + "learning_rate": 0.0004693069306930693, + "loss": 1.9077, + "step": 7760 + }, + { + "epoch": 5.66, + "grad_norm": 0.4464881718158722, + "learning_rate": 0.00046831683168316833, + "loss": 1.9038, + "step": 7770 + }, + { + "epoch": 5.67, + "grad_norm": 0.49109694361686707, + "learning_rate": 0.0004673267326732674, + "loss": 1.9041, + "step": 7780 + }, + { + "epoch": 5.68, + "grad_norm": 0.45286795496940613, + "learning_rate": 0.0004663366336633664, + "loss": 1.9028, + "step": 7790 + }, + { + "epoch": 5.68, + "grad_norm": 0.4543616771697998, + "learning_rate": 0.0004653465346534654, + "loss": 1.9055, + "step": 7800 + }, + { + "epoch": 5.69, + "grad_norm": 0.45107975602149963, + "learning_rate": 0.0004643564356435644, + "loss": 1.905, + "step": 7810 + }, + { + "epoch": 5.7, + "grad_norm": 0.4717351198196411, + "learning_rate": 0.0004633663366336634, + "loss": 1.9035, + "step": 7820 + }, + { + "epoch": 5.71, + "grad_norm": 0.47570666670799255, + "learning_rate": 0.00046237623762376243, + "loss": 1.9023, + "step": 7830 + }, + { + "epoch": 5.71, + "eval_accuracy": 0.619487519380987, + "eval_loss": 1.764754295349121, + "eval_runtime": 1085.4383, + "eval_samples_per_second": 460.072, + "eval_steps_per_second": 2.054, + "step": 7830 + }, + { + "epoch": 5.71, + "grad_norm": 0.4219855070114136, + "learning_rate": 0.00046138613861386143, + "loss": 1.9023, + "step": 7840 + }, + { + "epoch": 5.72, + "grad_norm": 0.43669965863227844, + "learning_rate": 0.0004603960396039604, + "loss": 1.9027, + "step": 7850 + }, + { + "epoch": 5.73, + "grad_norm": 0.3779612183570862, + "learning_rate": 0.0004594059405940594, + "loss": 1.8983, + "step": 7860 + }, + { + "epoch": 5.73, + "grad_norm": 0.42528143525123596, + "learning_rate": 0.0004584158415841584, + "loss": 1.9003, + "step": 7870 + }, + { + "epoch": 5.74, + "grad_norm": 0.5401535630226135, + "learning_rate": 0.0004574257425742575, + "loss": 1.902, + "step": 7880 + }, + { + "epoch": 5.75, + "grad_norm": 0.5040842890739441, + "learning_rate": 0.00045643564356435647, + "loss": 1.898, + "step": 7890 + }, + { + "epoch": 5.76, + "grad_norm": 0.40838822722435, + "learning_rate": 0.00045544554455445547, + "loss": 1.9009, + "step": 7900 + }, + { + "epoch": 5.76, + "grad_norm": 0.4518304169178009, + "learning_rate": 0.00045445544554455447, + "loss": 1.8971, + "step": 7910 + }, + { + "epoch": 5.77, + "grad_norm": 0.4375505745410919, + "learning_rate": 0.00045346534653465347, + "loss": 1.8968, + "step": 7920 + }, + { + "epoch": 5.77, + "eval_accuracy": 0.6199796521907708, + "eval_loss": 1.7632313966751099, + "eval_runtime": 1087.3022, + "eval_samples_per_second": 459.284, + "eval_steps_per_second": 2.051, + "step": 7920 + }, + { + "epoch": 5.78, + "grad_norm": 0.4348089098930359, + "learning_rate": 0.0004524752475247525, + "loss": 1.897, + "step": 7930 + }, + { + "epoch": 5.79, + "grad_norm": 0.4852411150932312, + "learning_rate": 0.0004514851485148515, + "loss": 1.8979, + "step": 7940 + }, + { + "epoch": 5.79, + "grad_norm": 0.3691408932209015, + "learning_rate": 0.0004504950495049505, + "loss": 1.8952, + "step": 7950 + }, + { + "epoch": 5.8, + "grad_norm": 0.40795382857322693, + "learning_rate": 0.0004495049504950495, + "loss": 1.8976, + "step": 7960 + }, + { + "epoch": 5.81, + "grad_norm": 0.5768758058547974, + "learning_rate": 0.0004485148514851485, + "loss": 1.9002, + "step": 7970 + }, + { + "epoch": 5.81, + "grad_norm": 0.47147759795188904, + "learning_rate": 0.00044752475247524756, + "loss": 1.8962, + "step": 7980 + }, + { + "epoch": 5.82, + "grad_norm": 0.4033481478691101, + "learning_rate": 0.00044653465346534656, + "loss": 1.8902, + "step": 7990 + }, + { + "epoch": 5.83, + "grad_norm": 0.474514365196228, + "learning_rate": 0.00044554455445544556, + "loss": 1.8914, + "step": 8000 + }, + { + "epoch": 5.84, + "grad_norm": 0.4343509376049042, + "learning_rate": 0.00044455445544554456, + "loss": 1.8909, + "step": 8010 + }, + { + "epoch": 5.84, + "eval_accuracy": 0.6212259728987508, + "eval_loss": 1.755420207977295, + "eval_runtime": 1084.9224, + "eval_samples_per_second": 460.291, + "eval_steps_per_second": 2.055, + "step": 8010 + }, + { + "epoch": 5.84, + "grad_norm": 0.435395210981369, + "learning_rate": 0.00044356435643564356, + "loss": 1.8907, + "step": 8020 + }, + { + "epoch": 5.85, + "grad_norm": 0.48715683817863464, + "learning_rate": 0.0004425742574257426, + "loss": 1.8894, + "step": 8030 + }, + { + "epoch": 5.86, + "grad_norm": 0.4001710116863251, + "learning_rate": 0.0004415841584158416, + "loss": 1.89, + "step": 8040 + }, + { + "epoch": 5.87, + "grad_norm": 0.38079318404197693, + "learning_rate": 0.0004405940594059406, + "loss": 1.8893, + "step": 8050 + }, + { + "epoch": 5.87, + "grad_norm": 0.5229191780090332, + "learning_rate": 0.0004396039603960396, + "loss": 1.8916, + "step": 8060 + }, + { + "epoch": 5.88, + "grad_norm": 0.4705289900302887, + "learning_rate": 0.0004386138613861386, + "loss": 1.891, + "step": 8070 + }, + { + "epoch": 5.89, + "grad_norm": 0.4324556589126587, + "learning_rate": 0.00043762376237623765, + "loss": 1.8872, + "step": 8080 + }, + { + "epoch": 5.89, + "grad_norm": 0.4106965959072113, + "learning_rate": 0.00043663366336633665, + "loss": 1.8861, + "step": 8090 + }, + { + "epoch": 5.9, + "grad_norm": 0.461008220911026, + "learning_rate": 0.00043564356435643565, + "loss": 1.8903, + "step": 8100 + }, + { + "epoch": 5.9, + "eval_accuracy": 0.6217799147539793, + "eval_loss": 1.751676082611084, + "eval_runtime": 1084.7892, + "eval_samples_per_second": 460.348, + "eval_steps_per_second": 2.056, + "step": 8100 + }, + { + "epoch": 5.91, + "grad_norm": 0.4489499032497406, + "learning_rate": 0.00043465346534653465, + "loss": 1.8894, + "step": 8110 + }, + { + "epoch": 5.92, + "grad_norm": 0.40970441699028015, + "learning_rate": 0.00043366336633663365, + "loss": 1.8858, + "step": 8120 + }, + { + "epoch": 5.92, + "grad_norm": 0.397197425365448, + "learning_rate": 0.0004326732673267327, + "loss": 1.8866, + "step": 8130 + }, + { + "epoch": 5.93, + "grad_norm": 0.4046621322631836, + "learning_rate": 0.0004316831683168317, + "loss": 1.8865, + "step": 8140 + }, + { + "epoch": 5.94, + "grad_norm": 0.4488195478916168, + "learning_rate": 0.0004306930693069307, + "loss": 1.8898, + "step": 8150 + }, + { + "epoch": 5.95, + "grad_norm": 0.37480929493904114, + "learning_rate": 0.0004297029702970297, + "loss": 1.883, + "step": 8160 + }, + { + "epoch": 5.95, + "grad_norm": 0.39327338337898254, + "learning_rate": 0.0004287128712871287, + "loss": 1.8837, + "step": 8170 + }, + { + "epoch": 5.96, + "grad_norm": 0.4443519115447998, + "learning_rate": 0.00042772277227722774, + "loss": 1.8825, + "step": 8180 + }, + { + "epoch": 5.97, + "grad_norm": 0.4902293384075165, + "learning_rate": 0.00042673267326732674, + "loss": 1.8837, + "step": 8190 + }, + { + "epoch": 5.97, + "eval_accuracy": 0.622874142241673, + "eval_loss": 1.7469381093978882, + "eval_runtime": 1082.2695, + "eval_samples_per_second": 461.419, + "eval_steps_per_second": 2.06, + "step": 8190 + }, + { + "epoch": 5.97, + "grad_norm": 0.43381059169769287, + "learning_rate": 0.00042574257425742574, + "loss": 1.8812, + "step": 8200 + }, + { + "epoch": 5.98, + "grad_norm": 0.4675629734992981, + "learning_rate": 0.00042475247524752474, + "loss": 1.8808, + "step": 8210 + }, + { + "epoch": 5.99, + "grad_norm": 0.4100710451602936, + "learning_rate": 0.00042376237623762374, + "loss": 1.882, + "step": 8220 + }, + { + "epoch": 6.0, + "grad_norm": 0.5555077791213989, + "learning_rate": 0.0004227722772277228, + "loss": 1.8824, + "step": 8230 + }, + { + "epoch": 6.0, + "grad_norm": 0.5407902598381042, + "learning_rate": 0.0004217821782178218, + "loss": 1.8813, + "step": 8240 + }, + { + "epoch": 6.01, + "grad_norm": 0.48739898204803467, + "learning_rate": 0.0004207920792079208, + "loss": 1.8782, + "step": 8250 + }, + { + "epoch": 6.02, + "grad_norm": 0.4977239966392517, + "learning_rate": 0.0004198019801980198, + "loss": 1.8801, + "step": 8260 + }, + { + "epoch": 6.03, + "grad_norm": 0.47402721643447876, + "learning_rate": 0.0004188118811881188, + "loss": 1.8811, + "step": 8270 + }, + { + "epoch": 6.03, + "grad_norm": 0.4796026349067688, + "learning_rate": 0.00041782178217821784, + "loss": 1.8801, + "step": 8280 + }, + { + "epoch": 6.03, + "eval_accuracy": 0.6238451571943063, + "eval_loss": 1.741037130355835, + "eval_runtime": 1087.5002, + "eval_samples_per_second": 459.2, + "eval_steps_per_second": 2.051, + "step": 8280 + }, + { + "epoch": 6.04, + "grad_norm": 0.419162392616272, + "learning_rate": 0.00041683168316831683, + "loss": 1.8796, + "step": 8290 + }, + { + "epoch": 6.05, + "grad_norm": 0.409493625164032, + "learning_rate": 0.00041584158415841583, + "loss": 1.8794, + "step": 8300 + }, + { + "epoch": 6.05, + "grad_norm": 0.5155593752861023, + "learning_rate": 0.00041485148514851483, + "loss": 1.8786, + "step": 8310 + }, + { + "epoch": 6.06, + "grad_norm": 0.46643194556236267, + "learning_rate": 0.00041386138613861383, + "loss": 1.8786, + "step": 8320 + }, + { + "epoch": 6.07, + "grad_norm": 0.4528968930244446, + "learning_rate": 0.0004128712871287129, + "loss": 1.8765, + "step": 8330 + }, + { + "epoch": 6.08, + "grad_norm": 0.4780101180076599, + "learning_rate": 0.0004118811881188119, + "loss": 1.8742, + "step": 8340 + }, + { + "epoch": 6.08, + "grad_norm": 0.41187387704849243, + "learning_rate": 0.0004108910891089109, + "loss": 1.8754, + "step": 8350 + }, + { + "epoch": 6.09, + "grad_norm": 0.48807311058044434, + "learning_rate": 0.0004099009900990099, + "loss": 1.877, + "step": 8360 + }, + { + "epoch": 6.1, + "grad_norm": 0.4205549657344818, + "learning_rate": 0.0004089108910891089, + "loss": 1.8769, + "step": 8370 + }, + { + "epoch": 6.1, + "eval_accuracy": 0.6247575321496361, + "eval_loss": 1.7364323139190674, + "eval_runtime": 1085.7284, + "eval_samples_per_second": 459.949, + "eval_steps_per_second": 2.054, + "step": 8370 + }, + { + "epoch": 6.11, + "grad_norm": 0.5422940850257874, + "learning_rate": 0.0004079207920792079, + "loss": 1.8743, + "step": 8380 + }, + { + "epoch": 6.11, + "grad_norm": 0.4061022400856018, + "learning_rate": 0.0004069306930693069, + "loss": 1.8748, + "step": 8390 + }, + { + "epoch": 6.12, + "grad_norm": 0.4819527268409729, + "learning_rate": 0.000405940594059406, + "loss": 1.8734, + "step": 8400 + }, + { + "epoch": 6.13, + "grad_norm": 0.4938518702983856, + "learning_rate": 0.000404950495049505, + "loss": 1.8746, + "step": 8410 + }, + { + "epoch": 6.13, + "grad_norm": 0.39021047949790955, + "learning_rate": 0.00040396039603960397, + "loss": 1.874, + "step": 8420 + }, + { + "epoch": 6.14, + "grad_norm": 0.3815496265888214, + "learning_rate": 0.000402970297029703, + "loss": 1.8722, + "step": 8430 + }, + { + "epoch": 6.15, + "grad_norm": 0.4226173758506775, + "learning_rate": 0.000401980198019802, + "loss": 1.8716, + "step": 8440 + }, + { + "epoch": 6.16, + "grad_norm": 0.4311840236186981, + "learning_rate": 0.000400990099009901, + "loss": 1.8729, + "step": 8450 + }, + { + "epoch": 6.16, + "grad_norm": 0.37644141912460327, + "learning_rate": 0.0004, + "loss": 1.8697, + "step": 8460 + }, + { + "epoch": 6.16, + "eval_accuracy": 0.6258217191475034, + "eval_loss": 1.7313834428787231, + "eval_runtime": 1087.4157, + "eval_samples_per_second": 459.236, + "eval_steps_per_second": 2.051, + "step": 8460 + }, + { + "epoch": 6.17, + "grad_norm": 0.34123849868774414, + "learning_rate": 0.000399009900990099, + "loss": 1.8709, + "step": 8470 + }, + { + "epoch": 6.18, + "grad_norm": 0.5545117259025574, + "learning_rate": 0.00039801980198019807, + "loss": 1.8729, + "step": 8480 + }, + { + "epoch": 6.19, + "grad_norm": 0.44565874338150024, + "learning_rate": 0.00039702970297029707, + "loss": 1.8716, + "step": 8490 + }, + { + "epoch": 6.19, + "grad_norm": 0.4301845133304596, + "learning_rate": 0.00039603960396039607, + "loss": 1.866, + "step": 8500 + }, + { + "epoch": 6.2, + "grad_norm": 0.46877652406692505, + "learning_rate": 0.00039504950495049506, + "loss": 1.8672, + "step": 8510 + }, + { + "epoch": 6.21, + "grad_norm": 0.535370945930481, + "learning_rate": 0.00039405940594059406, + "loss": 1.8712, + "step": 8520 + }, + { + "epoch": 6.22, + "grad_norm": 0.39393341541290283, + "learning_rate": 0.0003930693069306931, + "loss": 1.8671, + "step": 8530 + }, + { + "epoch": 6.22, + "grad_norm": 0.36698782444000244, + "learning_rate": 0.0003920792079207921, + "loss": 1.8685, + "step": 8540 + }, + { + "epoch": 6.23, + "grad_norm": 0.4744206666946411, + "learning_rate": 0.0003910891089108911, + "loss": 1.8673, + "step": 8550 + }, + { + "epoch": 6.23, + "eval_accuracy": 0.6265433755799307, + "eval_loss": 1.7275755405426025, + "eval_runtime": 1085.596, + "eval_samples_per_second": 460.005, + "eval_steps_per_second": 2.054, + "step": 8550 + }, + { + "epoch": 6.24, + "grad_norm": 0.4614261984825134, + "learning_rate": 0.0003900990099009901, + "loss": 1.8631, + "step": 8560 + }, + { + "epoch": 6.24, + "grad_norm": 0.4780130386352539, + "learning_rate": 0.0003891089108910891, + "loss": 1.8658, + "step": 8570 + }, + { + "epoch": 6.25, + "grad_norm": 0.4078359007835388, + "learning_rate": 0.00038811881188118816, + "loss": 1.8644, + "step": 8580 + }, + { + "epoch": 6.26, + "grad_norm": 0.4950817823410034, + "learning_rate": 0.00038712871287128716, + "loss": 1.8631, + "step": 8590 + }, + { + "epoch": 6.27, + "grad_norm": 0.4473728835582733, + "learning_rate": 0.00038613861386138616, + "loss": 1.862, + "step": 8600 + }, + { + "epoch": 6.27, + "grad_norm": 0.41996896266937256, + "learning_rate": 0.00038514851485148515, + "loss": 1.8609, + "step": 8610 + }, + { + "epoch": 6.28, + "grad_norm": 0.4896424114704132, + "learning_rate": 0.00038415841584158415, + "loss": 1.8639, + "step": 8620 + }, + { + "epoch": 6.29, + "grad_norm": 0.5288357138633728, + "learning_rate": 0.0003831683168316832, + "loss": 1.8629, + "step": 8630 + }, + { + "epoch": 6.3, + "grad_norm": 0.414982408285141, + "learning_rate": 0.0003821782178217822, + "loss": 1.8611, + "step": 8640 + }, + { + "epoch": 6.3, + "eval_accuracy": 0.6280281595524545, + "eval_loss": 1.7198432683944702, + "eval_runtime": 1085.3535, + "eval_samples_per_second": 460.108, + "eval_steps_per_second": 2.055, + "step": 8640 + }, + { + "epoch": 6.3, + "grad_norm": 0.45578229427337646, + "learning_rate": 0.0003811881188118812, + "loss": 1.8598, + "step": 8650 + }, + { + "epoch": 6.31, + "grad_norm": 0.4705806374549866, + "learning_rate": 0.0003801980198019802, + "loss": 1.8606, + "step": 8660 + }, + { + "epoch": 6.32, + "grad_norm": 0.40844404697418213, + "learning_rate": 0.0003792079207920792, + "loss": 1.8597, + "step": 8670 + }, + { + "epoch": 6.32, + "grad_norm": 0.3853258490562439, + "learning_rate": 0.00037821782178217825, + "loss": 1.8582, + "step": 8680 + }, + { + "epoch": 6.33, + "grad_norm": 0.4357406198978424, + "learning_rate": 0.00037722772277227725, + "loss": 1.8595, + "step": 8690 + }, + { + "epoch": 6.34, + "grad_norm": 0.5262021422386169, + "learning_rate": 0.00037623762376237625, + "loss": 1.858, + "step": 8700 + }, + { + "epoch": 6.35, + "grad_norm": 0.5264425873756409, + "learning_rate": 0.00037524752475247524, + "loss": 1.8587, + "step": 8710 + }, + { + "epoch": 6.35, + "grad_norm": 0.45019951462745667, + "learning_rate": 0.00037425742574257424, + "loss": 1.8566, + "step": 8720 + }, + { + "epoch": 6.36, + "grad_norm": 0.41189640760421753, + "learning_rate": 0.0003732673267326733, + "loss": 1.8572, + "step": 8730 + }, + { + "epoch": 6.36, + "eval_accuracy": 0.6290112373666048, + "eval_loss": 1.7162350416183472, + "eval_runtime": 1085.09, + "eval_samples_per_second": 460.22, + "eval_steps_per_second": 2.055, + "step": 8730 + }, + { + "epoch": 6.37, + "grad_norm": 0.4982648491859436, + "learning_rate": 0.0003722772277227723, + "loss": 1.8546, + "step": 8740 + }, + { + "epoch": 6.38, + "grad_norm": 0.3992471694946289, + "learning_rate": 0.0003712871287128713, + "loss": 1.8537, + "step": 8750 + }, + { + "epoch": 6.38, + "grad_norm": 0.46979019045829773, + "learning_rate": 0.0003702970297029703, + "loss": 1.8566, + "step": 8760 + }, + { + "epoch": 6.39, + "grad_norm": 0.5312979817390442, + "learning_rate": 0.0003693069306930693, + "loss": 1.8528, + "step": 8770 + }, + { + "epoch": 6.4, + "grad_norm": 0.4439733624458313, + "learning_rate": 0.00036831683168316834, + "loss": 1.855, + "step": 8780 + }, + { + "epoch": 6.4, + "grad_norm": 0.39169180393218994, + "learning_rate": 0.00036732673267326734, + "loss": 1.8528, + "step": 8790 + }, + { + "epoch": 6.41, + "grad_norm": 0.41544708609580994, + "learning_rate": 0.00036633663366336634, + "loss": 1.8508, + "step": 8800 + }, + { + "epoch": 6.42, + "grad_norm": 0.4738489091396332, + "learning_rate": 0.00036534653465346533, + "loss": 1.8515, + "step": 8810 + }, + { + "epoch": 6.43, + "grad_norm": 0.4519752860069275, + "learning_rate": 0.00036435643564356433, + "loss": 1.8507, + "step": 8820 + }, + { + "epoch": 6.43, + "eval_accuracy": 0.6307912179674288, + "eval_loss": 1.707141399383545, + "eval_runtime": 1087.4605, + "eval_samples_per_second": 459.217, + "eval_steps_per_second": 2.051, + "step": 8820 + }, + { + "epoch": 6.43, + "grad_norm": 0.382656991481781, + "learning_rate": 0.0003633663366336634, + "loss": 1.8487, + "step": 8830 + }, + { + "epoch": 6.44, + "grad_norm": 0.4599255919456482, + "learning_rate": 0.0003623762376237624, + "loss": 1.8506, + "step": 8840 + }, + { + "epoch": 6.45, + "grad_norm": 0.4658602476119995, + "learning_rate": 0.0003613861386138614, + "loss": 1.8496, + "step": 8850 + }, + { + "epoch": 6.46, + "grad_norm": 0.43937841057777405, + "learning_rate": 0.0003603960396039604, + "loss": 1.8504, + "step": 8860 + }, + { + "epoch": 6.46, + "grad_norm": 0.4001791477203369, + "learning_rate": 0.0003594059405940594, + "loss": 1.8479, + "step": 8870 + }, + { + "epoch": 6.47, + "grad_norm": 0.47235003113746643, + "learning_rate": 0.00035841584158415843, + "loss": 1.8462, + "step": 8880 + }, + { + "epoch": 6.48, + "grad_norm": 0.42399758100509644, + "learning_rate": 0.00035742574257425743, + "loss": 1.8483, + "step": 8890 + }, + { + "epoch": 6.48, + "grad_norm": 0.3666219413280487, + "learning_rate": 0.0003564356435643564, + "loss": 1.8462, + "step": 8900 + }, + { + "epoch": 6.49, + "grad_norm": 0.4140106737613678, + "learning_rate": 0.0003554455445544554, + "loss": 1.8447, + "step": 8910 + }, + { + "epoch": 6.49, + "eval_accuracy": 0.6324917716108477, + "eval_loss": 1.6985867023468018, + "eval_runtime": 1087.0314, + "eval_samples_per_second": 459.398, + "eval_steps_per_second": 2.051, + "step": 8910 + }, + { + "epoch": 6.5, + "grad_norm": 0.4904990792274475, + "learning_rate": 0.0003544554455445544, + "loss": 1.8451, + "step": 8920 + }, + { + "epoch": 6.51, + "grad_norm": 0.43653395771980286, + "learning_rate": 0.0003534653465346535, + "loss": 1.8434, + "step": 8930 + }, + { + "epoch": 6.51, + "grad_norm": 0.4815143942832947, + "learning_rate": 0.0003524752475247525, + "loss": 1.8405, + "step": 8940 + }, + { + "epoch": 6.52, + "grad_norm": 0.5665289759635925, + "learning_rate": 0.00035148514851485147, + "loss": 1.8426, + "step": 8950 + }, + { + "epoch": 6.53, + "grad_norm": 0.45166435837745667, + "learning_rate": 0.00035049504950495047, + "loss": 1.8431, + "step": 8960 + }, + { + "epoch": 6.54, + "grad_norm": 0.45748040080070496, + "learning_rate": 0.00034950495049504947, + "loss": 1.8434, + "step": 8970 + }, + { + "epoch": 6.54, + "grad_norm": 0.42432862520217896, + "learning_rate": 0.0003485148514851485, + "loss": 1.8408, + "step": 8980 + }, + { + "epoch": 6.55, + "grad_norm": 0.3683374226093292, + "learning_rate": 0.0003475247524752475, + "loss": 1.8388, + "step": 8990 + }, + { + "epoch": 6.56, + "grad_norm": 0.4106079041957855, + "learning_rate": 0.0003465346534653465, + "loss": 1.839, + "step": 9000 + }, + { + "epoch": 6.56, + "eval_accuracy": 0.6338676239837059, + "eval_loss": 1.6945050954818726, + "eval_runtime": 1084.8256, + "eval_samples_per_second": 460.332, + "eval_steps_per_second": 2.056, + "step": 9000 + }, + { + "epoch": 6.56, + "grad_norm": 0.46325406432151794, + "learning_rate": 0.0003455445544554455, + "loss": 1.8383, + "step": 9010 + }, + { + "epoch": 6.57, + "grad_norm": 0.4024347960948944, + "learning_rate": 0.0003445544554455445, + "loss": 1.8397, + "step": 9020 + }, + { + "epoch": 6.58, + "grad_norm": 0.5055080652236938, + "learning_rate": 0.0003435643564356436, + "loss": 1.8386, + "step": 9030 + }, + { + "epoch": 6.59, + "grad_norm": 0.38540130853652954, + "learning_rate": 0.0003425742574257426, + "loss": 1.8381, + "step": 9040 + }, + { + "epoch": 6.59, + "grad_norm": 0.42219218611717224, + "learning_rate": 0.0003415841584158416, + "loss": 1.8379, + "step": 9050 + }, + { + "epoch": 6.6, + "grad_norm": 0.37353622913360596, + "learning_rate": 0.0003405940594059406, + "loss": 1.8354, + "step": 9060 + }, + { + "epoch": 6.61, + "grad_norm": 0.4756326973438263, + "learning_rate": 0.0003396039603960396, + "loss": 1.8345, + "step": 9070 + }, + { + "epoch": 6.62, + "grad_norm": 0.44085556268692017, + "learning_rate": 0.00033861386138613867, + "loss": 1.8355, + "step": 9080 + }, + { + "epoch": 6.62, + "grad_norm": 0.4287554919719696, + "learning_rate": 0.00033762376237623766, + "loss": 1.8345, + "step": 9090 + }, + { + "epoch": 6.62, + "eval_accuracy": 0.6351918568609488, + "eval_loss": 1.6874170303344727, + "eval_runtime": 1085.5989, + "eval_samples_per_second": 460.004, + "eval_steps_per_second": 2.054, + "step": 9090 + }, + { + "epoch": 6.63, + "grad_norm": 0.4309207499027252, + "learning_rate": 0.00033663366336633666, + "loss": 1.8336, + "step": 9100 + }, + { + "epoch": 6.64, + "grad_norm": 0.41971975564956665, + "learning_rate": 0.00033564356435643566, + "loss": 1.83, + "step": 9110 + }, + { + "epoch": 6.64, + "grad_norm": 0.4418039619922638, + "learning_rate": 0.00033465346534653466, + "loss": 1.829, + "step": 9120 + }, + { + "epoch": 6.65, + "grad_norm": 0.44311702251434326, + "learning_rate": 0.0003336633663366337, + "loss": 1.8337, + "step": 9130 + }, + { + "epoch": 6.66, + "grad_norm": 0.46464502811431885, + "learning_rate": 0.0003326732673267327, + "loss": 1.835, + "step": 9140 + }, + { + "epoch": 6.67, + "grad_norm": 0.40370500087738037, + "learning_rate": 0.0003316831683168317, + "loss": 1.8304, + "step": 9150 + }, + { + "epoch": 6.67, + "grad_norm": 0.37608906626701355, + "learning_rate": 0.0003306930693069307, + "loss": 1.8312, + "step": 9160 + }, + { + "epoch": 6.68, + "grad_norm": 0.3990706503391266, + "learning_rate": 0.0003297029702970297, + "loss": 1.8313, + "step": 9170 + }, + { + "epoch": 6.69, + "grad_norm": 0.5069761872291565, + "learning_rate": 0.00032871287128712876, + "loss": 1.8292, + "step": 9180 + }, + { + "epoch": 6.69, + "eval_accuracy": 0.6364002620555558, + "eval_loss": 1.6848387718200684, + "eval_runtime": 1085.7505, + "eval_samples_per_second": 459.94, + "eval_steps_per_second": 2.054, + "step": 9180 + }, + { + "epoch": 6.7, + "grad_norm": 0.450114905834198, + "learning_rate": 0.00032772277227722775, + "loss": 1.8279, + "step": 9190 + }, + { + "epoch": 6.7, + "grad_norm": 0.4129829704761505, + "learning_rate": 0.00032673267326732675, + "loss": 1.8282, + "step": 9200 + }, + { + "epoch": 6.71, + "grad_norm": 0.4841521084308624, + "learning_rate": 0.00032574257425742575, + "loss": 1.8282, + "step": 9210 + }, + { + "epoch": 6.72, + "grad_norm": 0.5160727500915527, + "learning_rate": 0.00032475247524752475, + "loss": 1.8269, + "step": 9220 + }, + { + "epoch": 6.73, + "grad_norm": 0.5048640370368958, + "learning_rate": 0.0003237623762376238, + "loss": 1.8268, + "step": 9230 + }, + { + "epoch": 6.73, + "grad_norm": 0.584237813949585, + "learning_rate": 0.0003227722772277228, + "loss": 1.8295, + "step": 9240 + }, + { + "epoch": 6.74, + "grad_norm": 0.43098556995391846, + "learning_rate": 0.0003217821782178218, + "loss": 1.8249, + "step": 9250 + }, + { + "epoch": 6.75, + "grad_norm": 0.3972054421901703, + "learning_rate": 0.0003207920792079208, + "loss": 1.8246, + "step": 9260 + }, + { + "epoch": 6.75, + "grad_norm": 0.3777482211589813, + "learning_rate": 0.0003198019801980198, + "loss": 1.8243, + "step": 9270 + }, + { + "epoch": 6.75, + "eval_accuracy": 0.6379164581645916, + "eval_loss": 1.6772228479385376, + "eval_runtime": 1085.6894, + "eval_samples_per_second": 459.966, + "eval_steps_per_second": 2.054, + "step": 9270 + }, + { + "epoch": 6.76, + "grad_norm": 0.42633891105651855, + "learning_rate": 0.00031881188118811885, + "loss": 1.8221, + "step": 9280 + }, + { + "epoch": 6.77, + "grad_norm": 0.5893245339393616, + "learning_rate": 0.00031782178217821784, + "loss": 1.8243, + "step": 9290 + }, + { + "epoch": 6.78, + "grad_norm": 0.5304878354072571, + "learning_rate": 0.00031683168316831684, + "loss": 1.8242, + "step": 9300 + }, + { + "epoch": 6.78, + "grad_norm": 0.4657045304775238, + "learning_rate": 0.00031584158415841584, + "loss": 1.8242, + "step": 9310 + }, + { + "epoch": 6.79, + "grad_norm": 0.43276962637901306, + "learning_rate": 0.00031485148514851484, + "loss": 1.8225, + "step": 9320 + }, + { + "epoch": 6.8, + "grad_norm": 0.40515634417533875, + "learning_rate": 0.0003138613861386139, + "loss": 1.8228, + "step": 9330 + }, + { + "epoch": 6.81, + "grad_norm": 0.40448087453842163, + "learning_rate": 0.0003128712871287129, + "loss": 1.8198, + "step": 9340 + }, + { + "epoch": 6.81, + "grad_norm": 0.4380359351634979, + "learning_rate": 0.0003118811881188119, + "loss": 1.8201, + "step": 9350 + }, + { + "epoch": 6.82, + "grad_norm": 0.46539106965065, + "learning_rate": 0.0003108910891089109, + "loss": 1.8217, + "step": 9360 + }, + { + "epoch": 6.82, + "eval_accuracy": 0.6389200973153136, + "eval_loss": 1.6709976196289062, + "eval_runtime": 1085.3788, + "eval_samples_per_second": 460.097, + "eval_steps_per_second": 2.055, + "step": 9360 + }, + { + "epoch": 6.83, + "grad_norm": 0.4772910177707672, + "learning_rate": 0.0003099009900990099, + "loss": 1.8202, + "step": 9370 + }, + { + "epoch": 6.83, + "grad_norm": 0.42799142003059387, + "learning_rate": 0.00030891089108910894, + "loss": 1.819, + "step": 9380 + }, + { + "epoch": 6.84, + "grad_norm": 0.40562522411346436, + "learning_rate": 0.00030792079207920793, + "loss": 1.8208, + "step": 9390 + }, + { + "epoch": 6.85, + "grad_norm": 0.6129370927810669, + "learning_rate": 0.00030693069306930693, + "loss": 1.8158, + "step": 9400 + }, + { + "epoch": 6.86, + "grad_norm": 0.4654337465763092, + "learning_rate": 0.00030594059405940593, + "loss": 1.8175, + "step": 9410 + }, + { + "epoch": 6.86, + "grad_norm": 0.4340899884700775, + "learning_rate": 0.00030495049504950493, + "loss": 1.8156, + "step": 9420 + }, + { + "epoch": 6.87, + "grad_norm": 0.4216438829898834, + "learning_rate": 0.000303960396039604, + "loss": 1.8182, + "step": 9430 + }, + { + "epoch": 6.88, + "grad_norm": 0.3995111584663391, + "learning_rate": 0.000302970297029703, + "loss": 1.8175, + "step": 9440 + }, + { + "epoch": 6.89, + "grad_norm": 0.4166571795940399, + "learning_rate": 0.000301980198019802, + "loss": 1.8171, + "step": 9450 + }, + { + "epoch": 6.89, + "eval_accuracy": 0.6397325747268708, + "eval_loss": 1.6698857545852661, + "eval_runtime": 1086.4276, + "eval_samples_per_second": 459.653, + "eval_steps_per_second": 2.053, + "step": 9450 + }, + { + "epoch": 6.89, + "grad_norm": 0.4397519528865814, + "learning_rate": 0.000300990099009901, + "loss": 1.8171, + "step": 9460 + }, + { + "epoch": 6.9, + "grad_norm": 0.4201519787311554, + "learning_rate": 0.0003, + "loss": 1.8166, + "step": 9470 + }, + { + "epoch": 6.91, + "grad_norm": 0.4194183945655823, + "learning_rate": 0.000299009900990099, + "loss": 1.8137, + "step": 9480 + }, + { + "epoch": 6.91, + "grad_norm": 0.4156358540058136, + "learning_rate": 0.000298019801980198, + "loss": 1.8149, + "step": 9490 + }, + { + "epoch": 6.92, + "grad_norm": 0.43356573581695557, + "learning_rate": 0.000297029702970297, + "loss": 1.8104, + "step": 9500 + }, + { + "epoch": 6.93, + "grad_norm": 0.4354686141014099, + "learning_rate": 0.000296039603960396, + "loss": 1.8131, + "step": 9510 + }, + { + "epoch": 6.94, + "grad_norm": 0.45513659715652466, + "learning_rate": 0.000295049504950495, + "loss": 1.8115, + "step": 9520 + }, + { + "epoch": 6.94, + "grad_norm": 0.39433979988098145, + "learning_rate": 0.00029405940594059407, + "loss": 1.8106, + "step": 9530 + }, + { + "epoch": 6.95, + "grad_norm": 0.5176064372062683, + "learning_rate": 0.00029306930693069307, + "loss": 1.8153, + "step": 9540 + }, + { + "epoch": 6.95, + "eval_accuracy": 0.640691022778346, + "eval_loss": 1.66335129737854, + "eval_runtime": 1082.6564, + "eval_samples_per_second": 461.254, + "eval_steps_per_second": 2.06, + "step": 9540 + }, + { + "epoch": 6.96, + "grad_norm": 0.5153635740280151, + "learning_rate": 0.00029207920792079207, + "loss": 1.8142, + "step": 9550 + }, + { + "epoch": 6.97, + "grad_norm": 0.37104055285453796, + "learning_rate": 0.00029108910891089107, + "loss": 1.812, + "step": 9560 + }, + { + "epoch": 6.97, + "grad_norm": 0.40426042675971985, + "learning_rate": 0.00029009900990099006, + "loss": 1.8119, + "step": 9570 + }, + { + "epoch": 6.98, + "grad_norm": 0.5108228325843811, + "learning_rate": 0.0002891089108910891, + "loss": 1.8131, + "step": 9580 + }, + { + "epoch": 6.99, + "grad_norm": 0.4702747166156769, + "learning_rate": 0.0002881188118811881, + "loss": 1.812, + "step": 9590 + }, + { + "epoch": 6.99, + "grad_norm": 0.3683488965034485, + "learning_rate": 0.0002871287128712871, + "loss": 1.8081, + "step": 9600 + }, + { + "epoch": 7.0, + "grad_norm": 0.42281776666641235, + "learning_rate": 0.0002861386138613861, + "loss": 1.81, + "step": 9610 + }, + { + "epoch": 7.01, + "grad_norm": 0.48128095269203186, + "learning_rate": 0.0002851485148514851, + "loss": 1.81, + "step": 9620 + }, + { + "epoch": 7.02, + "grad_norm": 0.4020933210849762, + "learning_rate": 0.00028415841584158416, + "loss": 1.81, + "step": 9630 + }, + { + "epoch": 7.02, + "eval_accuracy": 0.6416495776980271, + "eval_loss": 1.6598803997039795, + "eval_runtime": 1084.1267, + "eval_samples_per_second": 460.629, + "eval_steps_per_second": 2.057, + "step": 9630 + }, + { + "epoch": 7.02, + "grad_norm": 0.5061802864074707, + "learning_rate": 0.00028316831683168316, + "loss": 1.8091, + "step": 9640 + }, + { + "epoch": 7.03, + "grad_norm": 0.5182695388793945, + "learning_rate": 0.00028217821782178216, + "loss": 1.8101, + "step": 9650 + }, + { + "epoch": 7.04, + "grad_norm": 0.45669490098953247, + "learning_rate": 0.0002811881188118812, + "loss": 1.8059, + "step": 9660 + }, + { + "epoch": 7.05, + "grad_norm": 0.5214717984199524, + "learning_rate": 0.0002801980198019802, + "loss": 1.809, + "step": 9670 + }, + { + "epoch": 7.05, + "grad_norm": 0.46073251962661743, + "learning_rate": 0.00027920792079207926, + "loss": 1.8066, + "step": 9680 + }, + { + "epoch": 7.06, + "grad_norm": 0.3924192190170288, + "learning_rate": 0.00027821782178217826, + "loss": 1.8058, + "step": 9690 + }, + { + "epoch": 7.07, + "grad_norm": 0.42634785175323486, + "learning_rate": 0.00027722772277227726, + "loss": 1.8082, + "step": 9700 + }, + { + "epoch": 7.07, + "grad_norm": 0.46675705909729004, + "learning_rate": 0.00027623762376237626, + "loss": 1.8057, + "step": 9710 + }, + { + "epoch": 7.08, + "grad_norm": 0.43609708547592163, + "learning_rate": 0.00027524752475247525, + "loss": 1.8051, + "step": 9720 + }, + { + "epoch": 7.08, + "eval_accuracy": 0.6424563152815425, + "eval_loss": 1.6557390689849854, + "eval_runtime": 1083.0526, + "eval_samples_per_second": 461.086, + "eval_steps_per_second": 2.059, + "step": 9720 + }, + { + "epoch": 7.09, + "grad_norm": 0.35754847526550293, + "learning_rate": 0.0002742574257425743, + "loss": 1.8053, + "step": 9730 + }, + { + "epoch": 7.1, + "grad_norm": 0.38588428497314453, + "learning_rate": 0.0002732673267326733, + "loss": 1.8059, + "step": 9740 + }, + { + "epoch": 7.1, + "grad_norm": 0.4815811514854431, + "learning_rate": 0.0002722772277227723, + "loss": 1.8053, + "step": 9750 + }, + { + "epoch": 7.11, + "grad_norm": 0.4292014241218567, + "learning_rate": 0.0002712871287128713, + "loss": 1.8018, + "step": 9760 + }, + { + "epoch": 7.12, + "grad_norm": 0.4443877339363098, + "learning_rate": 0.0002702970297029703, + "loss": 1.8022, + "step": 9770 + }, + { + "epoch": 7.13, + "grad_norm": 0.4887067675590515, + "learning_rate": 0.00026930693069306935, + "loss": 1.804, + "step": 9780 + }, + { + "epoch": 7.13, + "grad_norm": 0.4577280282974243, + "learning_rate": 0.00026831683168316835, + "loss": 1.8043, + "step": 9790 + }, + { + "epoch": 7.14, + "grad_norm": 0.38725900650024414, + "learning_rate": 0.00026732673267326735, + "loss": 1.8014, + "step": 9800 + }, + { + "epoch": 7.15, + "grad_norm": 0.45041412115097046, + "learning_rate": 0.00026633663366336635, + "loss": 1.8046, + "step": 9810 + }, + { + "epoch": 7.15, + "eval_accuracy": 0.6431962908340371, + "eval_loss": 1.6528569459915161, + "eval_runtime": 1084.5222, + "eval_samples_per_second": 460.461, + "eval_steps_per_second": 2.056, + "step": 9810 + }, + { + "epoch": 7.15, + "grad_norm": 0.532798707485199, + "learning_rate": 0.00026534653465346534, + "loss": 1.8015, + "step": 9820 + }, + { + "epoch": 7.16, + "grad_norm": 0.43234437704086304, + "learning_rate": 0.0002643564356435644, + "loss": 1.8013, + "step": 9830 + }, + { + "epoch": 7.17, + "grad_norm": 0.4301891624927521, + "learning_rate": 0.0002633663366336634, + "loss": 1.8011, + "step": 9840 + }, + { + "epoch": 7.18, + "grad_norm": 0.41938352584838867, + "learning_rate": 0.0002623762376237624, + "loss": 1.8032, + "step": 9850 + }, + { + "epoch": 7.18, + "grad_norm": 0.4415999948978424, + "learning_rate": 0.0002613861386138614, + "loss": 1.8011, + "step": 9860 + }, + { + "epoch": 7.19, + "grad_norm": 0.46587106585502625, + "learning_rate": 0.0002603960396039604, + "loss": 1.8005, + "step": 9870 + }, + { + "epoch": 7.2, + "grad_norm": 0.3755381107330322, + "learning_rate": 0.00025940594059405944, + "loss": 1.7996, + "step": 9880 + }, + { + "epoch": 7.21, + "grad_norm": 0.37551912665367126, + "learning_rate": 0.00025841584158415844, + "loss": 1.7981, + "step": 9890 + }, + { + "epoch": 7.21, + "grad_norm": 0.4268946349620819, + "learning_rate": 0.00025742574257425744, + "loss": 1.7997, + "step": 9900 + }, + { + "epoch": 7.21, + "eval_accuracy": 0.6438810244549774, + "eval_loss": 1.6489626169204712, + "eval_runtime": 1085.6377, + "eval_samples_per_second": 459.988, + "eval_steps_per_second": 2.054, + "step": 9900 + }, + { + "epoch": 7.22, + "grad_norm": 0.5793518424034119, + "learning_rate": 0.00025643564356435644, + "loss": 1.8, + "step": 9910 + }, + { + "epoch": 7.23, + "grad_norm": 0.37436190247535706, + "learning_rate": 0.00025544554455445543, + "loss": 1.7974, + "step": 9920 + }, + { + "epoch": 7.24, + "grad_norm": 0.5522225499153137, + "learning_rate": 0.0002544554455445545, + "loss": 1.7968, + "step": 9930 + }, + { + "epoch": 7.24, + "grad_norm": 0.4452868402004242, + "learning_rate": 0.0002534653465346535, + "loss": 1.7997, + "step": 9940 + }, + { + "epoch": 7.25, + "grad_norm": 0.41211819648742676, + "learning_rate": 0.0002524752475247525, + "loss": 1.798, + "step": 9950 + }, + { + "epoch": 7.26, + "grad_norm": 0.4052869379520416, + "learning_rate": 0.0002514851485148515, + "loss": 1.7948, + "step": 9960 + }, + { + "epoch": 7.26, + "grad_norm": 0.4514144957065582, + "learning_rate": 0.0002504950495049505, + "loss": 1.7955, + "step": 9970 + }, + { + "epoch": 7.27, + "grad_norm": 0.49351832270622253, + "learning_rate": 0.00024950495049504953, + "loss": 1.7991, + "step": 9980 + }, + { + "epoch": 7.28, + "grad_norm": 0.49827703833580017, + "learning_rate": 0.00024851485148514853, + "loss": 1.7994, + "step": 9990 + }, + { + "epoch": 7.28, + "eval_accuracy": 0.6442858204973027, + "eval_loss": 1.6466220617294312, + "eval_runtime": 1086.2584, + "eval_samples_per_second": 459.725, + "eval_steps_per_second": 2.053, + "step": 9990 + }, + { + "epoch": 7.29, + "grad_norm": 0.4449995160102844, + "learning_rate": 0.00024752475247524753, + "loss": 1.7962, + "step": 10000 + }, + { + "epoch": 7.29, + "grad_norm": 0.3683604300022125, + "learning_rate": 0.0002465346534653465, + "loss": 1.7984, + "step": 10010 + }, + { + "epoch": 7.3, + "grad_norm": 0.48126864433288574, + "learning_rate": 0.0002455445544554455, + "loss": 1.7953, + "step": 10020 + }, + { + "epoch": 7.31, + "grad_norm": 0.3565351068973541, + "learning_rate": 0.0002445544554455446, + "loss": 1.7914, + "step": 10030 + }, + { + "epoch": 7.32, + "grad_norm": 0.4369056820869446, + "learning_rate": 0.00024356435643564357, + "loss": 1.7949, + "step": 10040 + }, + { + "epoch": 7.32, + "grad_norm": 0.5566734671592712, + "learning_rate": 0.00024257425742574257, + "loss": 1.7961, + "step": 10050 + }, + { + "epoch": 7.33, + "grad_norm": 0.4066598117351532, + "learning_rate": 0.00024158415841584157, + "loss": 1.7962, + "step": 10060 + }, + { + "epoch": 7.34, + "grad_norm": 0.4281260669231415, + "learning_rate": 0.0002405940594059406, + "loss": 1.7943, + "step": 10070 + }, + { + "epoch": 7.34, + "grad_norm": 0.34586983919143677, + "learning_rate": 0.0002396039603960396, + "loss": 1.7934, + "step": 10080 + }, + { + "epoch": 7.34, + "eval_accuracy": 0.6448193534487687, + "eval_loss": 1.644710898399353, + "eval_runtime": 1084.6339, + "eval_samples_per_second": 460.413, + "eval_steps_per_second": 2.056, + "step": 10080 + }, + { + "epoch": 7.35, + "grad_norm": 0.3622014820575714, + "learning_rate": 0.00023861386138613862, + "loss": 1.7912, + "step": 10090 + }, + { + "epoch": 7.36, + "grad_norm": 0.456106960773468, + "learning_rate": 0.00023762376237623762, + "loss": 1.7922, + "step": 10100 + }, + { + "epoch": 7.37, + "grad_norm": 0.4329501986503601, + "learning_rate": 0.00023663366336633662, + "loss": 1.7937, + "step": 10110 + }, + { + "epoch": 7.37, + "grad_norm": 0.4640803039073944, + "learning_rate": 0.00023564356435643564, + "loss": 1.793, + "step": 10120 + }, + { + "epoch": 7.38, + "grad_norm": 0.39238548278808594, + "learning_rate": 0.00023465346534653464, + "loss": 1.7916, + "step": 10130 + }, + { + "epoch": 7.39, + "grad_norm": 0.43311530351638794, + "learning_rate": 0.0002336633663366337, + "loss": 1.7943, + "step": 10140 + }, + { + "epoch": 7.4, + "grad_norm": 0.35872432589530945, + "learning_rate": 0.0002326732673267327, + "loss": 1.7918, + "step": 10150 + }, + { + "epoch": 7.4, + "grad_norm": 0.36510738730430603, + "learning_rate": 0.0002316831683168317, + "loss": 1.7907, + "step": 10160 + }, + { + "epoch": 7.41, + "grad_norm": 0.46106651425361633, + "learning_rate": 0.00023069306930693071, + "loss": 1.7917, + "step": 10170 + }, + { + "epoch": 7.41, + "eval_accuracy": 0.6454906168005663, + "eval_loss": 1.641427993774414, + "eval_runtime": 1084.7154, + "eval_samples_per_second": 460.379, + "eval_steps_per_second": 2.056, + "step": 10170 + }, + { + "epoch": 7.42, + "grad_norm": 0.37493211030960083, + "learning_rate": 0.0002297029702970297, + "loss": 1.7921, + "step": 10180 + }, + { + "epoch": 7.42, + "grad_norm": 0.3865686058998108, + "learning_rate": 0.00022871287128712874, + "loss": 1.791, + "step": 10190 + }, + { + "epoch": 7.43, + "grad_norm": 0.3889116942882538, + "learning_rate": 0.00022772277227722774, + "loss": 1.7907, + "step": 10200 + }, + { + "epoch": 7.44, + "grad_norm": 0.6045088768005371, + "learning_rate": 0.00022673267326732673, + "loss": 1.7902, + "step": 10210 + }, + { + "epoch": 7.45, + "grad_norm": 0.4185848832130432, + "learning_rate": 0.00022574257425742576, + "loss": 1.7889, + "step": 10220 + }, + { + "epoch": 7.45, + "grad_norm": 0.3915616571903229, + "learning_rate": 0.00022475247524752476, + "loss": 1.7906, + "step": 10230 + }, + { + "epoch": 7.46, + "grad_norm": 0.3687775433063507, + "learning_rate": 0.00022376237623762378, + "loss": 1.7897, + "step": 10240 + }, + { + "epoch": 7.47, + "grad_norm": 0.39049315452575684, + "learning_rate": 0.00022277227722772278, + "loss": 1.789, + "step": 10250 + }, + { + "epoch": 7.48, + "grad_norm": 0.37289920449256897, + "learning_rate": 0.00022178217821782178, + "loss": 1.7887, + "step": 10260 + }, + { + "epoch": 7.48, + "eval_accuracy": 0.645869271741179, + "eval_loss": 1.639427661895752, + "eval_runtime": 1118.4989, + "eval_samples_per_second": 446.473, + "eval_steps_per_second": 1.994, + "step": 10260 + }, + { + "epoch": 7.48, + "grad_norm": 0.45301392674446106, + "learning_rate": 0.0002207920792079208, + "loss": 1.7911, + "step": 10270 + }, + { + "epoch": 7.49, + "grad_norm": 0.42282310128211975, + "learning_rate": 0.0002198019801980198, + "loss": 1.788, + "step": 10280 + }, + { + "epoch": 7.5, + "grad_norm": 0.34825190901756287, + "learning_rate": 0.00021881188118811883, + "loss": 1.7888, + "step": 10290 + }, + { + "epoch": 7.5, + "grad_norm": 0.4688248038291931, + "learning_rate": 0.00021782178217821783, + "loss": 1.7878, + "step": 10300 + }, + { + "epoch": 7.51, + "grad_norm": 0.4295547902584076, + "learning_rate": 0.00021683168316831682, + "loss": 1.7877, + "step": 10310 + }, + { + "epoch": 7.52, + "grad_norm": 0.3433161675930023, + "learning_rate": 0.00021584158415841585, + "loss": 1.7884, + "step": 10320 + }, + { + "epoch": 7.53, + "grad_norm": 0.34638333320617676, + "learning_rate": 0.00021485148514851485, + "loss": 1.7891, + "step": 10330 + }, + { + "epoch": 7.53, + "grad_norm": 0.40277931094169617, + "learning_rate": 0.00021386138613861387, + "loss": 1.7856, + "step": 10340 + }, + { + "epoch": 7.54, + "grad_norm": 0.42518341541290283, + "learning_rate": 0.00021287128712871287, + "loss": 1.7861, + "step": 10350 + }, + { + "epoch": 7.54, + "eval_accuracy": 0.6466240342094058, + "eval_loss": 1.637886881828308, + "eval_runtime": 1094.5797, + "eval_samples_per_second": 456.23, + "eval_steps_per_second": 2.037, + "step": 10350 + }, + { + "epoch": 7.55, + "grad_norm": 0.3844136893749237, + "learning_rate": 0.00021188118811881187, + "loss": 1.7862, + "step": 10360 + }, + { + "epoch": 7.56, + "grad_norm": 0.34440210461616516, + "learning_rate": 0.0002108910891089109, + "loss": 1.7864, + "step": 10370 + }, + { + "epoch": 7.56, + "grad_norm": 0.501716136932373, + "learning_rate": 0.0002099009900990099, + "loss": 1.7843, + "step": 10380 + }, + { + "epoch": 7.57, + "grad_norm": 0.3695526421070099, + "learning_rate": 0.00020891089108910892, + "loss": 1.7855, + "step": 10390 + }, + { + "epoch": 7.58, + "grad_norm": 0.38437628746032715, + "learning_rate": 0.00020792079207920792, + "loss": 1.7847, + "step": 10400 + }, + { + "epoch": 7.58, + "grad_norm": 0.4197578430175781, + "learning_rate": 0.00020693069306930691, + "loss": 1.7833, + "step": 10410 + }, + { + "epoch": 7.59, + "grad_norm": 0.39093175530433655, + "learning_rate": 0.00020594059405940594, + "loss": 1.786, + "step": 10420 + }, + { + "epoch": 7.6, + "grad_norm": 0.36000731587409973, + "learning_rate": 0.00020495049504950494, + "loss": 1.7872, + "step": 10430 + }, + { + "epoch": 7.61, + "grad_norm": 0.45473846793174744, + "learning_rate": 0.00020396039603960396, + "loss": 1.7853, + "step": 10440 + }, + { + "epoch": 7.61, + "eval_accuracy": 0.6470880404015521, + "eval_loss": 1.6340434551239014, + "eval_runtime": 1091.9524, + "eval_samples_per_second": 457.328, + "eval_steps_per_second": 2.042, + "step": 10440 + }, + { + "epoch": 7.61, + "grad_norm": 0.3739522099494934, + "learning_rate": 0.000202970297029703, + "loss": 1.7824, + "step": 10450 + }, + { + "epoch": 7.62, + "grad_norm": 0.40237516164779663, + "learning_rate": 0.00020198019801980199, + "loss": 1.7837, + "step": 10460 + }, + { + "epoch": 7.63, + "grad_norm": 0.44717445969581604, + "learning_rate": 0.000200990099009901, + "loss": 1.7828, + "step": 10470 + }, + { + "epoch": 7.64, + "grad_norm": 0.4383144676685333, + "learning_rate": 0.0002, + "loss": 1.7841, + "step": 10480 + }, + { + "epoch": 7.64, + "grad_norm": 0.368528813123703, + "learning_rate": 0.00019900990099009903, + "loss": 1.7818, + "step": 10490 + }, + { + "epoch": 7.65, + "grad_norm": 0.40288105607032776, + "learning_rate": 0.00019801980198019803, + "loss": 1.7832, + "step": 10500 + }, + { + "epoch": 7.66, + "grad_norm": 0.3775827884674072, + "learning_rate": 0.00019702970297029703, + "loss": 1.7814, + "step": 10510 + }, + { + "epoch": 7.66, + "grad_norm": 0.4067000448703766, + "learning_rate": 0.00019603960396039606, + "loss": 1.7815, + "step": 10520 + }, + { + "epoch": 7.67, + "grad_norm": 0.45529502630233765, + "learning_rate": 0.00019504950495049505, + "loss": 1.7847, + "step": 10530 + }, + { + "epoch": 7.67, + "eval_accuracy": 0.647348903103301, + "eval_loss": 1.6344057321548462, + "eval_runtime": 1092.9918, + "eval_samples_per_second": 456.893, + "eval_steps_per_second": 2.04, + "step": 10530 + }, + { + "epoch": 7.68, + "grad_norm": 0.4383666217327118, + "learning_rate": 0.00019405940594059408, + "loss": 1.7799, + "step": 10540 + }, + { + "epoch": 7.69, + "grad_norm": 0.41071295738220215, + "learning_rate": 0.00019306930693069308, + "loss": 1.7826, + "step": 10550 + }, + { + "epoch": 7.69, + "grad_norm": 0.4367865324020386, + "learning_rate": 0.00019207920792079208, + "loss": 1.7819, + "step": 10560 + }, + { + "epoch": 7.7, + "grad_norm": 0.3609165549278259, + "learning_rate": 0.0001910891089108911, + "loss": 1.7802, + "step": 10570 + }, + { + "epoch": 7.71, + "grad_norm": 0.39005783200263977, + "learning_rate": 0.0001900990099009901, + "loss": 1.78, + "step": 10580 + }, + { + "epoch": 7.72, + "grad_norm": 0.3287705183029175, + "learning_rate": 0.00018910891089108913, + "loss": 1.7797, + "step": 10590 + }, + { + "epoch": 7.72, + "grad_norm": 0.3266151547431946, + "learning_rate": 0.00018811881188118812, + "loss": 1.7795, + "step": 10600 + }, + { + "epoch": 7.73, + "grad_norm": 0.35796740651130676, + "learning_rate": 0.00018712871287128712, + "loss": 1.7782, + "step": 10610 + }, + { + "epoch": 7.74, + "grad_norm": 0.39394471049308777, + "learning_rate": 0.00018613861386138615, + "loss": 1.7799, + "step": 10620 + }, + { + "epoch": 7.74, + "eval_accuracy": 0.647908722477522, + "eval_loss": 1.6299968957901, + "eval_runtime": 1086.7095, + "eval_samples_per_second": 459.534, + "eval_steps_per_second": 2.052, + "step": 10620 + }, + { + "epoch": 7.75, + "grad_norm": 0.4640734791755676, + "learning_rate": 0.00018514851485148514, + "loss": 1.7794, + "step": 10630 + }, + { + "epoch": 7.75, + "grad_norm": 0.3890862464904785, + "learning_rate": 0.00018415841584158417, + "loss": 1.7794, + "step": 10640 + }, + { + "epoch": 7.76, + "grad_norm": 0.3503568768501282, + "learning_rate": 0.00018316831683168317, + "loss": 1.7776, + "step": 10650 + }, + { + "epoch": 7.77, + "grad_norm": 0.39593997597694397, + "learning_rate": 0.00018217821782178217, + "loss": 1.7798, + "step": 10660 + }, + { + "epoch": 7.77, + "grad_norm": 0.3677063286304474, + "learning_rate": 0.0001811881188118812, + "loss": 1.7769, + "step": 10670 + }, + { + "epoch": 7.78, + "grad_norm": 0.3558836877346039, + "learning_rate": 0.0001801980198019802, + "loss": 1.7803, + "step": 10680 + }, + { + "epoch": 7.79, + "grad_norm": 0.4983728229999542, + "learning_rate": 0.00017920792079207922, + "loss": 1.7774, + "step": 10690 + }, + { + "epoch": 7.8, + "grad_norm": 0.3735315203666687, + "learning_rate": 0.0001782178217821782, + "loss": 1.778, + "step": 10700 + }, + { + "epoch": 7.8, + "grad_norm": 0.3440467417240143, + "learning_rate": 0.0001772277227722772, + "loss": 1.7773, + "step": 10710 + }, + { + "epoch": 7.8, + "eval_accuracy": 0.6480254558570787, + "eval_loss": 1.6308239698410034, + "eval_runtime": 1084.4491, + "eval_samples_per_second": 460.492, + "eval_steps_per_second": 2.056, + "step": 10710 + }, + { + "epoch": 7.81, + "grad_norm": 0.3812803626060486, + "learning_rate": 0.00017623762376237624, + "loss": 1.7776, + "step": 10720 + }, + { + "epoch": 7.82, + "grad_norm": 0.3772016763687134, + "learning_rate": 0.00017524752475247524, + "loss": 1.7751, + "step": 10730 + }, + { + "epoch": 7.83, + "grad_norm": 0.3638882339000702, + "learning_rate": 0.00017425742574257426, + "loss": 1.7794, + "step": 10740 + }, + { + "epoch": 7.83, + "grad_norm": 0.43426576256752014, + "learning_rate": 0.00017326732673267326, + "loss": 1.7787, + "step": 10750 + }, + { + "epoch": 7.84, + "grad_norm": 0.41420796513557434, + "learning_rate": 0.00017227722772277226, + "loss": 1.7766, + "step": 10760 + }, + { + "epoch": 7.85, + "grad_norm": 0.3958096206188202, + "learning_rate": 0.0001712871287128713, + "loss": 1.7765, + "step": 10770 + }, + { + "epoch": 7.85, + "grad_norm": 0.3829286992549896, + "learning_rate": 0.0001702970297029703, + "loss": 1.7773, + "step": 10780 + }, + { + "epoch": 7.86, + "grad_norm": 0.41278573870658875, + "learning_rate": 0.00016930693069306933, + "loss": 1.7769, + "step": 10790 + }, + { + "epoch": 7.87, + "grad_norm": 0.337071031332016, + "learning_rate": 0.00016831683168316833, + "loss": 1.7766, + "step": 10800 + }, + { + "epoch": 7.87, + "eval_accuracy": 0.6483471740815172, + "eval_loss": 1.6272797584533691, + "eval_runtime": 1083.9003, + "eval_samples_per_second": 460.725, + "eval_steps_per_second": 2.057, + "step": 10800 + }, + { + "epoch": 7.88, + "grad_norm": 0.3459130823612213, + "learning_rate": 0.00016732673267326733, + "loss": 1.7781, + "step": 10810 + }, + { + "epoch": 7.88, + "grad_norm": 0.3339349925518036, + "learning_rate": 0.00016633663366336635, + "loss": 1.7754, + "step": 10820 + }, + { + "epoch": 7.89, + "grad_norm": 0.3156519830226898, + "learning_rate": 0.00016534653465346535, + "loss": 1.776, + "step": 10830 + }, + { + "epoch": 7.9, + "grad_norm": 0.3628999590873718, + "learning_rate": 0.00016435643564356438, + "loss": 1.7744, + "step": 10840 + }, + { + "epoch": 7.91, + "grad_norm": 0.350087434053421, + "learning_rate": 0.00016336633663366338, + "loss": 1.7752, + "step": 10850 + }, + { + "epoch": 7.91, + "grad_norm": 0.40664511919021606, + "learning_rate": 0.00016237623762376237, + "loss": 1.7781, + "step": 10860 + }, + { + "epoch": 7.92, + "grad_norm": 0.3364078104496002, + "learning_rate": 0.0001613861386138614, + "loss": 1.776, + "step": 10870 + }, + { + "epoch": 7.93, + "grad_norm": 0.32285594940185547, + "learning_rate": 0.0001603960396039604, + "loss": 1.772, + "step": 10880 + }, + { + "epoch": 7.93, + "grad_norm": 0.39036986231803894, + "learning_rate": 0.00015940594059405942, + "loss": 1.7755, + "step": 10890 + }, + { + "epoch": 7.93, + "eval_accuracy": 0.6488458576263911, + "eval_loss": 1.6268614530563354, + "eval_runtime": 1085.4437, + "eval_samples_per_second": 460.07, + "eval_steps_per_second": 2.054, + "step": 10890 + }, + { + "epoch": 7.94, + "grad_norm": 0.41111525893211365, + "learning_rate": 0.00015841584158415842, + "loss": 1.7747, + "step": 10900 + }, + { + "epoch": 7.95, + "grad_norm": 0.4492338299751282, + "learning_rate": 0.00015742574257425742, + "loss": 1.7753, + "step": 10910 + }, + { + "epoch": 7.96, + "grad_norm": 0.3836340606212616, + "learning_rate": 0.00015643564356435644, + "loss": 1.7727, + "step": 10920 + }, + { + "epoch": 7.96, + "grad_norm": 0.33723685145378113, + "learning_rate": 0.00015544554455445544, + "loss": 1.775, + "step": 10930 + }, + { + "epoch": 7.97, + "grad_norm": 0.4088629484176636, + "learning_rate": 0.00015445544554455447, + "loss": 1.7741, + "step": 10940 + }, + { + "epoch": 7.98, + "grad_norm": 0.3302168548107147, + "learning_rate": 0.00015346534653465347, + "loss": 1.7732, + "step": 10950 + }, + { + "epoch": 7.99, + "grad_norm": 0.3605554401874542, + "learning_rate": 0.00015247524752475246, + "loss": 1.7722, + "step": 10960 + }, + { + "epoch": 7.99, + "grad_norm": 0.355826735496521, + "learning_rate": 0.0001514851485148515, + "loss": 1.7715, + "step": 10970 + }, + { + "epoch": 8.0, + "grad_norm": 0.3708418607711792, + "learning_rate": 0.0001504950495049505, + "loss": 1.7721, + "step": 10980 + }, + { + "epoch": 8.0, + "eval_accuracy": 0.6491409467366662, + "eval_loss": 1.6255041360855103, + "eval_runtime": 1085.4323, + "eval_samples_per_second": 460.075, + "eval_steps_per_second": 2.054, + "step": 10980 + }, + { + "epoch": 8.01, + "grad_norm": 0.3321845233440399, + "learning_rate": 0.0001495049504950495, + "loss": 1.7719, + "step": 10990 + }, + { + "epoch": 8.01, + "grad_norm": 0.3251964747905731, + "learning_rate": 0.0001485148514851485, + "loss": 1.7731, + "step": 11000 + }, + { + "epoch": 8.02, + "grad_norm": 0.4384077787399292, + "learning_rate": 0.0001475247524752475, + "loss": 1.7753, + "step": 11010 + }, + { + "epoch": 8.03, + "grad_norm": 0.3885464370250702, + "learning_rate": 0.00014653465346534653, + "loss": 1.7707, + "step": 11020 + }, + { + "epoch": 8.04, + "grad_norm": 0.42948633432388306, + "learning_rate": 0.00014554455445544553, + "loss": 1.7716, + "step": 11030 + }, + { + "epoch": 8.04, + "grad_norm": 0.33700281381607056, + "learning_rate": 0.00014455445544554456, + "loss": 1.7711, + "step": 11040 + }, + { + "epoch": 8.05, + "grad_norm": 0.37887144088745117, + "learning_rate": 0.00014356435643564356, + "loss": 1.7707, + "step": 11050 + }, + { + "epoch": 8.06, + "grad_norm": 0.2920531630516052, + "learning_rate": 0.00014257425742574255, + "loss": 1.7717, + "step": 11060 + }, + { + "epoch": 8.07, + "grad_norm": 0.41969195008277893, + "learning_rate": 0.00014158415841584158, + "loss": 1.77, + "step": 11070 + }, + { + "epoch": 8.07, + "eval_accuracy": 0.6496409109474853, + "eval_loss": 1.6225236654281616, + "eval_runtime": 1085.9823, + "eval_samples_per_second": 459.842, + "eval_steps_per_second": 2.053, + "step": 11070 + }, + { + "epoch": 8.07, + "grad_norm": 0.3497646152973175, + "learning_rate": 0.0001405940594059406, + "loss": 1.7712, + "step": 11080 + }, + { + "epoch": 8.08, + "grad_norm": 0.38377320766448975, + "learning_rate": 0.00013960396039603963, + "loss": 1.77, + "step": 11090 + }, + { + "epoch": 8.09, + "grad_norm": 0.3549109101295471, + "learning_rate": 0.00013861386138613863, + "loss": 1.7736, + "step": 11100 + }, + { + "epoch": 8.09, + "grad_norm": 0.3375445604324341, + "learning_rate": 0.00013762376237623763, + "loss": 1.7715, + "step": 11110 + }, + { + "epoch": 8.1, + "grad_norm": 0.30400267243385315, + "learning_rate": 0.00013663366336633665, + "loss": 1.7708, + "step": 11120 + }, + { + "epoch": 8.11, + "grad_norm": 0.36075320839881897, + "learning_rate": 0.00013564356435643565, + "loss": 1.7706, + "step": 11130 + }, + { + "epoch": 8.12, + "grad_norm": 0.40548309683799744, + "learning_rate": 0.00013465346534653468, + "loss": 1.7677, + "step": 11140 + }, + { + "epoch": 8.12, + "grad_norm": 0.45684516429901123, + "learning_rate": 0.00013366336633663367, + "loss": 1.7682, + "step": 11150 + }, + { + "epoch": 8.13, + "grad_norm": 0.35352224111557007, + "learning_rate": 0.00013267326732673267, + "loss": 1.7708, + "step": 11160 + }, + { + "epoch": 8.13, + "eval_accuracy": 0.6498058850737081, + "eval_loss": 1.6215531826019287, + "eval_runtime": 1090.6922, + "eval_samples_per_second": 457.856, + "eval_steps_per_second": 2.045, + "step": 11160 + }, + { + "epoch": 8.14, + "grad_norm": 0.32690539956092834, + "learning_rate": 0.0001316831683168317, + "loss": 1.7701, + "step": 11170 + }, + { + "epoch": 8.15, + "grad_norm": 0.3275192975997925, + "learning_rate": 0.0001306930693069307, + "loss": 1.7664, + "step": 11180 + }, + { + "epoch": 8.15, + "grad_norm": 0.35621440410614014, + "learning_rate": 0.00012970297029702972, + "loss": 1.7708, + "step": 11190 + }, + { + "epoch": 8.16, + "grad_norm": 0.3187929093837738, + "learning_rate": 0.00012871287128712872, + "loss": 1.7686, + "step": 11200 + }, + { + "epoch": 8.17, + "grad_norm": 0.30944034457206726, + "learning_rate": 0.00012772277227722772, + "loss": 1.7705, + "step": 11210 + }, + { + "epoch": 8.17, + "grad_norm": 0.3147297501564026, + "learning_rate": 0.00012673267326732674, + "loss": 1.7668, + "step": 11220 + }, + { + "epoch": 8.18, + "grad_norm": 0.31606265902519226, + "learning_rate": 0.00012574257425742574, + "loss": 1.7664, + "step": 11230 + }, + { + "epoch": 8.19, + "grad_norm": 0.3430984914302826, + "learning_rate": 0.00012475247524752477, + "loss": 1.7688, + "step": 11240 + }, + { + "epoch": 8.2, + "grad_norm": 0.36714431643486023, + "learning_rate": 0.00012376237623762376, + "loss": 1.7686, + "step": 11250 + }, + { + "epoch": 8.2, + "eval_accuracy": 0.6500912899825928, + "eval_loss": 1.6193368434906006, + "eval_runtime": 1084.4098, + "eval_samples_per_second": 460.509, + "eval_steps_per_second": 2.056, + "step": 11250 + }, + { + "epoch": 8.2, + "grad_norm": 0.33857467770576477, + "learning_rate": 0.00012277227722772276, + "loss": 1.7706, + "step": 11260 + }, + { + "epoch": 8.21, + "grad_norm": 0.33994871377944946, + "learning_rate": 0.00012178217821782179, + "loss": 1.7657, + "step": 11270 + }, + { + "epoch": 8.22, + "grad_norm": 0.3522297143936157, + "learning_rate": 0.00012079207920792079, + "loss": 1.7671, + "step": 11280 + }, + { + "epoch": 8.23, + "grad_norm": 0.3409149646759033, + "learning_rate": 0.0001198019801980198, + "loss": 1.7683, + "step": 11290 + }, + { + "epoch": 8.23, + "grad_norm": 0.34772610664367676, + "learning_rate": 0.00011881188118811881, + "loss": 1.7694, + "step": 11300 + }, + { + "epoch": 8.24, + "grad_norm": 0.3905799388885498, + "learning_rate": 0.00011782178217821782, + "loss": 1.7676, + "step": 11310 + }, + { + "epoch": 8.25, + "grad_norm": 0.3478334844112396, + "learning_rate": 0.00011683168316831685, + "loss": 1.7656, + "step": 11320 + }, + { + "epoch": 8.26, + "grad_norm": 0.3372560143470764, + "learning_rate": 0.00011584158415841584, + "loss": 1.7662, + "step": 11330 + }, + { + "epoch": 8.26, + "grad_norm": 0.31676506996154785, + "learning_rate": 0.00011485148514851486, + "loss": 1.7673, + "step": 11340 + }, + { + "epoch": 8.26, + "eval_accuracy": 0.6503277550921033, + "eval_loss": 1.6178277730941772, + "eval_runtime": 1085.1752, + "eval_samples_per_second": 460.184, + "eval_steps_per_second": 2.055, + "step": 11340 + }, + { + "epoch": 8.27, + "grad_norm": 0.31384769082069397, + "learning_rate": 0.00011386138613861387, + "loss": 1.7657, + "step": 11350 + }, + { + "epoch": 8.28, + "grad_norm": 0.3267647922039032, + "learning_rate": 0.00011287128712871288, + "loss": 1.766, + "step": 11360 + }, + { + "epoch": 8.28, + "grad_norm": 0.32966649532318115, + "learning_rate": 0.00011188118811881189, + "loss": 1.7636, + "step": 11370 + }, + { + "epoch": 8.29, + "grad_norm": 0.34511563181877136, + "learning_rate": 0.00011089108910891089, + "loss": 1.767, + "step": 11380 + }, + { + "epoch": 8.3, + "grad_norm": 0.3151010274887085, + "learning_rate": 0.0001099009900990099, + "loss": 1.7663, + "step": 11390 + }, + { + "epoch": 8.31, + "grad_norm": 0.33481037616729736, + "learning_rate": 0.00010891089108910891, + "loss": 1.7687, + "step": 11400 + }, + { + "epoch": 8.31, + "grad_norm": 0.35512518882751465, + "learning_rate": 0.00010792079207920792, + "loss": 1.7681, + "step": 11410 + }, + { + "epoch": 8.32, + "grad_norm": 0.3472909927368164, + "learning_rate": 0.00010693069306930694, + "loss": 1.7653, + "step": 11420 + }, + { + "epoch": 8.33, + "grad_norm": 0.3452986180782318, + "learning_rate": 0.00010594059405940593, + "loss": 1.7666, + "step": 11430 + }, + { + "epoch": 8.33, + "eval_accuracy": 0.6505906465254327, + "eval_loss": 1.6169500350952148, + "eval_runtime": 1083.6278, + "eval_samples_per_second": 460.841, + "eval_steps_per_second": 2.058, + "step": 11430 + }, + { + "epoch": 8.34, + "grad_norm": 0.34459248185157776, + "learning_rate": 0.00010495049504950495, + "loss": 1.7661, + "step": 11440 + }, + { + "epoch": 8.34, + "grad_norm": 0.3042079508304596, + "learning_rate": 0.00010396039603960396, + "loss": 1.761, + "step": 11450 + }, + { + "epoch": 8.35, + "grad_norm": 0.32908689975738525, + "learning_rate": 0.00010297029702970297, + "loss": 1.7657, + "step": 11460 + }, + { + "epoch": 8.36, + "grad_norm": 0.34110862016677856, + "learning_rate": 0.00010198019801980198, + "loss": 1.7644, + "step": 11470 + }, + { + "epoch": 8.36, + "grad_norm": 0.2914797365665436, + "learning_rate": 0.00010099009900990099, + "loss": 1.7659, + "step": 11480 + }, + { + "epoch": 8.37, + "grad_norm": 0.32843562960624695, + "learning_rate": 0.0001, + "loss": 1.7634, + "step": 11490 + }, + { + "epoch": 8.38, + "grad_norm": 0.3443434536457062, + "learning_rate": 9.900990099009902e-05, + "loss": 1.7647, + "step": 11500 + }, + { + "epoch": 8.39, + "grad_norm": 0.3086354732513428, + "learning_rate": 9.801980198019803e-05, + "loss": 1.7633, + "step": 11510 + }, + { + "epoch": 8.39, + "grad_norm": 0.2858017086982727, + "learning_rate": 9.702970297029704e-05, + "loss": 1.7635, + "step": 11520 + }, + { + "epoch": 8.39, + "eval_accuracy": 0.6506746559931541, + "eval_loss": 1.61593496799469, + "eval_runtime": 1085.1265, + "eval_samples_per_second": 460.204, + "eval_steps_per_second": 2.055, + "step": 11520 + }, + { + "epoch": 8.4, + "grad_norm": 0.32728368043899536, + "learning_rate": 9.603960396039604e-05, + "loss": 1.7637, + "step": 11530 + }, + { + "epoch": 8.41, + "grad_norm": 0.3133088946342468, + "learning_rate": 9.504950495049505e-05, + "loss": 1.7629, + "step": 11540 + }, + { + "epoch": 8.42, + "grad_norm": 0.2798636853694916, + "learning_rate": 9.405940594059406e-05, + "loss": 1.766, + "step": 11550 + }, + { + "epoch": 8.42, + "grad_norm": 0.3476144075393677, + "learning_rate": 9.306930693069307e-05, + "loss": 1.7658, + "step": 11560 + }, + { + "epoch": 8.43, + "grad_norm": 0.2828819751739502, + "learning_rate": 9.207920792079209e-05, + "loss": 1.7624, + "step": 11570 + }, + { + "epoch": 8.44, + "grad_norm": 0.27723389863967896, + "learning_rate": 9.108910891089108e-05, + "loss": 1.7635, + "step": 11580 + }, + { + "epoch": 8.44, + "grad_norm": 0.32631412148475647, + "learning_rate": 9.00990099009901e-05, + "loss": 1.7621, + "step": 11590 + }, + { + "epoch": 8.45, + "grad_norm": 0.3203299641609192, + "learning_rate": 8.91089108910891e-05, + "loss": 1.7629, + "step": 11600 + }, + { + "epoch": 8.46, + "grad_norm": 0.31430014967918396, + "learning_rate": 8.811881188118812e-05, + "loss": 1.7631, + "step": 11610 + }, + { + "epoch": 8.46, + "eval_accuracy": 0.6510713372187361, + "eval_loss": 1.6139030456542969, + "eval_runtime": 1084.8096, + "eval_samples_per_second": 460.339, + "eval_steps_per_second": 2.056, + "step": 11610 + }, + { + "epoch": 8.47, + "grad_norm": 0.302937775850296, + "learning_rate": 8.712871287128713e-05, + "loss": 1.7628, + "step": 11620 + }, + { + "epoch": 8.47, + "grad_norm": 0.2944415807723999, + "learning_rate": 8.613861386138613e-05, + "loss": 1.7642, + "step": 11630 + }, + { + "epoch": 8.48, + "grad_norm": 0.3318140506744385, + "learning_rate": 8.514851485148515e-05, + "loss": 1.7626, + "step": 11640 + }, + { + "epoch": 8.49, + "grad_norm": 0.2729699909687042, + "learning_rate": 8.415841584158417e-05, + "loss": 1.7623, + "step": 11650 + }, + { + "epoch": 8.5, + "grad_norm": 0.37606656551361084, + "learning_rate": 8.316831683168318e-05, + "loss": 1.7609, + "step": 11660 + }, + { + "epoch": 8.5, + "grad_norm": 0.30239492654800415, + "learning_rate": 8.217821782178219e-05, + "loss": 1.7621, + "step": 11670 + }, + { + "epoch": 8.51, + "grad_norm": 0.2841242253780365, + "learning_rate": 8.118811881188119e-05, + "loss": 1.7629, + "step": 11680 + }, + { + "epoch": 8.52, + "grad_norm": 0.3027147054672241, + "learning_rate": 8.01980198019802e-05, + "loss": 1.7618, + "step": 11690 + }, + { + "epoch": 8.52, + "grad_norm": 0.2852645218372345, + "learning_rate": 7.920792079207921e-05, + "loss": 1.7633, + "step": 11700 + }, + { + "epoch": 8.52, + "eval_accuracy": 0.6512795497031492, + "eval_loss": 1.6128230094909668, + "eval_runtime": 1083.3765, + "eval_samples_per_second": 460.948, + "eval_steps_per_second": 2.058, + "step": 11700 + }, + { + "epoch": 8.53, + "grad_norm": 0.30832818150520325, + "learning_rate": 7.821782178217822e-05, + "loss": 1.7606, + "step": 11710 + }, + { + "epoch": 8.54, + "grad_norm": 0.30394139885902405, + "learning_rate": 7.722772277227723e-05, + "loss": 1.7609, + "step": 11720 + }, + { + "epoch": 8.55, + "grad_norm": 0.2910577952861786, + "learning_rate": 7.623762376237623e-05, + "loss": 1.7607, + "step": 11730 + }, + { + "epoch": 8.55, + "grad_norm": 0.27040547132492065, + "learning_rate": 7.524752475247524e-05, + "loss": 1.7607, + "step": 11740 + }, + { + "epoch": 8.56, + "grad_norm": 0.2972952723503113, + "learning_rate": 7.425742574257426e-05, + "loss": 1.7615, + "step": 11750 + }, + { + "epoch": 8.57, + "grad_norm": 0.26889026165008545, + "learning_rate": 7.326732673267327e-05, + "loss": 1.7631, + "step": 11760 + }, + { + "epoch": 8.58, + "grad_norm": 0.3071919083595276, + "learning_rate": 7.227722772277228e-05, + "loss": 1.7572, + "step": 11770 + }, + { + "epoch": 8.58, + "grad_norm": 0.30390483140945435, + "learning_rate": 7.128712871287128e-05, + "loss": 1.7593, + "step": 11780 + }, + { + "epoch": 8.59, + "grad_norm": 0.2942393124103546, + "learning_rate": 7.02970297029703e-05, + "loss": 1.7616, + "step": 11790 + }, + { + "epoch": 8.59, + "eval_accuracy": 0.6516749418564994, + "eval_loss": 1.6112834215164185, + "eval_runtime": 1083.6459, + "eval_samples_per_second": 460.833, + "eval_steps_per_second": 2.058, + "step": 11790 + }, + { + "epoch": 8.6, + "grad_norm": 0.2823123037815094, + "learning_rate": 6.930693069306931e-05, + "loss": 1.7615, + "step": 11800 + }, + { + "epoch": 8.6, + "grad_norm": 0.3058627247810364, + "learning_rate": 6.831683168316833e-05, + "loss": 1.7612, + "step": 11810 + }, + { + "epoch": 8.61, + "grad_norm": 0.2954027056694031, + "learning_rate": 6.732673267326734e-05, + "loss": 1.7623, + "step": 11820 + }, + { + "epoch": 8.62, + "grad_norm": 0.32210031151771545, + "learning_rate": 6.633663366336634e-05, + "loss": 1.7607, + "step": 11830 + }, + { + "epoch": 8.63, + "grad_norm": 0.2638227343559265, + "learning_rate": 6.534653465346535e-05, + "loss": 1.7591, + "step": 11840 + }, + { + "epoch": 8.63, + "grad_norm": 0.2716045379638672, + "learning_rate": 6.435643564356436e-05, + "loss": 1.7602, + "step": 11850 + }, + { + "epoch": 8.64, + "grad_norm": 0.2823101282119751, + "learning_rate": 6.336633663366337e-05, + "loss": 1.7602, + "step": 11860 + }, + { + "epoch": 8.65, + "grad_norm": 0.26111429929733276, + "learning_rate": 6.237623762376238e-05, + "loss": 1.7608, + "step": 11870 + }, + { + "epoch": 8.66, + "grad_norm": 0.28957730531692505, + "learning_rate": 6.138613861386138e-05, + "loss": 1.7602, + "step": 11880 + }, + { + "epoch": 8.66, + "eval_accuracy": 0.6518121780771471, + "eval_loss": 1.610386610031128, + "eval_runtime": 1085.801, + "eval_samples_per_second": 459.919, + "eval_steps_per_second": 2.054, + "step": 11880 + }, + { + "epoch": 8.66, + "grad_norm": 0.31074461340904236, + "learning_rate": 6.039603960396039e-05, + "loss": 1.7599, + "step": 11890 + }, + { + "epoch": 8.67, + "grad_norm": 0.2974682152271271, + "learning_rate": 5.9405940594059404e-05, + "loss": 1.7604, + "step": 11900 + }, + { + "epoch": 8.68, + "grad_norm": 0.3034124970436096, + "learning_rate": 5.841584158415842e-05, + "loss": 1.7605, + "step": 11910 + }, + { + "epoch": 8.68, + "grad_norm": 0.28555795550346375, + "learning_rate": 5.742574257425743e-05, + "loss": 1.7568, + "step": 11920 + }, + { + "epoch": 8.69, + "grad_norm": 0.2668933868408203, + "learning_rate": 5.643564356435644e-05, + "loss": 1.7576, + "step": 11930 + }, + { + "epoch": 8.7, + "grad_norm": 0.2799495458602905, + "learning_rate": 5.5445544554455445e-05, + "loss": 1.7595, + "step": 11940 + }, + { + "epoch": 8.71, + "grad_norm": 0.28266316652297974, + "learning_rate": 5.4455445544554456e-05, + "loss": 1.7602, + "step": 11950 + }, + { + "epoch": 8.71, + "grad_norm": 0.27878817915916443, + "learning_rate": 5.346534653465347e-05, + "loss": 1.7607, + "step": 11960 + }, + { + "epoch": 8.72, + "grad_norm": 0.2904963493347168, + "learning_rate": 5.247524752475247e-05, + "loss": 1.7578, + "step": 11970 + }, + { + "epoch": 8.72, + "eval_accuracy": 0.6515499792766575, + "eval_loss": 1.6111468076705933, + "eval_runtime": 1090.6667, + "eval_samples_per_second": 457.867, + "eval_steps_per_second": 2.045, + "step": 11970 + } + ], + "logging_steps": 10, + "max_steps": 12500, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 90, + "total_flos": 1.2438771143111148e+18, + "train_batch_size": 192, + "trial_name": null, + "trial_params": null +}