{ "best_metric": 1.004758358001709, "best_model_checkpoint": "output/morgenshtern/checkpoint-1300", "epoch": 13.0, "global_step": 1300, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.05, "learning_rate": 0.00013638815138477438, "loss": 2.534, "step": 5 }, { "epoch": 0.1, "learning_rate": 0.00013397182122930294, "loss": 2.1207, "step": 10 }, { "epoch": 0.15, "learning_rate": 0.0001300082017869573, "loss": 1.9437, "step": 15 }, { "epoch": 0.2, "learning_rate": 0.00012459110818763681, "loss": 2.0463, "step": 20 }, { "epoch": 0.2, "eval_loss": 1.9048744440078735, "eval_runtime": 2.5437, "eval_samples_per_second": 44.03, "eval_steps_per_second": 5.504, "step": 20 }, { "epoch": 0.25, "learning_rate": 0.00011784875792222071, "loss": 1.953, "step": 25 }, { "epoch": 0.29, "learning_rate": 0.000109940736055617, "loss": 2.0751, "step": 30 }, { "epoch": 0.34, "learning_rate": 0.0001010542179989503, "loss": 1.9436, "step": 35 }, { "epoch": 0.39, "learning_rate": 9.139953924430466e-05, "loss": 1.8745, "step": 40 }, { "epoch": 0.39, "eval_loss": 1.7966899871826172, "eval_runtime": 2.5987, "eval_samples_per_second": 43.099, "eval_steps_per_second": 5.387, "step": 40 }, { "epoch": 0.44, "learning_rate": 8.120521692221673e-05, "loss": 1.8989, "step": 45 }, { "epoch": 0.49, "learning_rate": 7.071254101695329e-05, "loss": 1.8346, "step": 50 }, { "epoch": 0.54, "learning_rate": 6.016986326040062e-05, "loss": 1.8576, "step": 55 }, { "epoch": 0.59, "learning_rate": 4.982671888105512e-05, "loss": 1.8876, "step": 60 }, { "epoch": 0.59, "eval_loss": 1.760327935218811, "eval_runtime": 2.6932, "eval_samples_per_second": 41.586, "eval_steps_per_second": 5.198, "step": 60 }, { "epoch": 0.64, "learning_rate": 3.992792034076668e-05, "loss": 1.7386, "step": 65 }, { "epoch": 0.69, "learning_rate": 3.0707762854909304e-05, "loss": 1.9309, "step": 70 }, { "epoch": 0.74, "learning_rate": 2.2384478845846175e-05, "loss": 1.6283, "step": 75 }, { "epoch": 0.78, "learning_rate": 1.5155072587539005e-05, "loss": 1.9742, "step": 80 }, { "epoch": 0.78, "eval_loss": 1.7276561260223389, "eval_runtime": 2.6936, "eval_samples_per_second": 41.581, "eval_steps_per_second": 5.198, "step": 80 }, { "epoch": 0.83, "learning_rate": 9.190657300387505e-06, "loss": 1.8267, "step": 85 }, { "epoch": 0.88, "learning_rate": 4.6324050628611986e-06, "loss": 1.7887, "step": 90 }, { "epoch": 0.93, "learning_rate": 1.5882054016913933e-06, "loss": 1.8381, "step": 95 }, { "epoch": 0.98, "learning_rate": 1.3011164863877445e-07, "loss": 1.8954, "step": 100 }, { "epoch": 0.98, "eval_loss": 1.7208999395370483, "eval_runtime": 2.7574, "eval_samples_per_second": 40.618, "eval_steps_per_second": 5.077, "step": 100 }, { "epoch": 0.91, "learning_rate": 3.0216830127274476e-06, "loss": 1.8376, "step": 105 }, { "epoch": 0.95, "learning_rate": 9.037005536513067e-07, "loss": 1.7024, "step": 110 }, { "epoch": 0.99, "learning_rate": 2.515656508272057e-08, "loss": 1.7911, "step": 115 }, { "epoch": 1.03, "learning_rate": 4.0213613921093164e-07, "loss": 1.8512, "step": 120 }, { "epoch": 1.08, "learning_rate": 2.0277372298297e-06, "loss": 1.7573, "step": 125 }, { "epoch": 1.12, "learning_rate": 4.8721970205680935e-06, "loss": 1.7902, "step": 130 }, { "epoch": 1.16, "learning_rate": 8.88343684654658e-06, "loss": 1.7602, "step": 135 }, { "epoch": 1.21, "learning_rate": 1.3988015692592823e-05, "loss": 1.8606, "step": 140 }, { "epoch": 1.25, "learning_rate": 2.009247481060283e-05, "loss": 1.6102, "step": 145 }, { "epoch": 1.29, "learning_rate": 2.708504883770769e-05, "loss": 1.8574, "step": 150 }, { "epoch": 1.34, "learning_rate": 3.483771208671411e-05, "loss": 1.6927, "step": 155 }, { "epoch": 1.38, "learning_rate": 4.320852254368187e-05, "loss": 1.7203, "step": 160 }, { "epoch": 1.42, "learning_rate": 5.204422065684016e-05, "loss": 1.8592, "step": 165 }, { "epoch": 1.47, "learning_rate": 6.118303533611755e-05, "loss": 1.7338, "step": 170 }, { "epoch": 1.51, "learning_rate": 7.045764578878282e-05, "loss": 1.7386, "step": 175 }, { "epoch": 1.55, "learning_rate": 7.969824496351964e-05, "loss": 1.6874, "step": 180 }, { "epoch": 1.59, "learning_rate": 8.873564851492995e-05, "loss": 1.8691, "step": 185 }, { "epoch": 1.64, "learning_rate": 9.740439236703416e-05, "loss": 1.7808, "step": 190 }, { "epoch": 1.68, "learning_rate": 0.00010554576216307802, "loss": 1.8296, "step": 195 }, { "epoch": 1.72, "learning_rate": 0.00011301069913603334, "loss": 1.737, "step": 200 }, { "epoch": 1.77, "learning_rate": 0.0001196625291967717, "loss": 1.8, "step": 205 }, { "epoch": 1.81, "learning_rate": 0.00012537946527356269, "loss": 1.6787, "step": 210 }, { "epoch": 1.85, "learning_rate": 0.000130056837088046, "loss": 1.664, "step": 215 }, { "epoch": 1.9, "learning_rate": 0.00013360900754314024, "loss": 1.5839, "step": 220 }, { "epoch": 1.94, "learning_rate": 0.0001359709406361119, "loss": 1.8525, "step": 225 }, { "epoch": 1.98, "learning_rate": 0.0001370993921901871, "loss": 1.7228, "step": 230 }, { "epoch": 2.4, "learning_rate": 9.021642375642038e-05, "loss": 1.6079, "step": 235 }, { "epoch": 2.45, "learning_rate": 7.954855279928984e-05, "loss": 1.6691, "step": 240 }, { "epoch": 2.5, "learning_rate": 6.860000000000001e-05, "loss": 1.7047, "step": 245 }, { "epoch": 2.55, "learning_rate": 5.765144720071019e-05, "loss": 1.6921, "step": 250 }, { "epoch": 2.6, "learning_rate": 4.698357624357961e-05, "loss": 1.5894, "step": 255 }, { "epoch": 2.65, "learning_rate": 3.686987328947878e-05, "loss": 1.6388, "step": 260 }, { "epoch": 2.7, "learning_rate": 2.7569617608302645e-05, "loss": 1.6748, "step": 265 }, { "epoch": 2.76, "learning_rate": 1.932123458329584e-05, "loss": 1.6765, "step": 270 }, { "epoch": 2.81, "learning_rate": 1.233618333464885e-05, "loss": 1.6658, "step": 275 }, { "epoch": 2.86, "learning_rate": 6.793535661894062e-06, "loss": 1.5677, "step": 280 }, { "epoch": 2.91, "learning_rate": 2.8353852816850843e-06, "loss": 1.6118, "step": 285 }, { "epoch": 2.96, "learning_rate": 5.632050517253132e-07, "loss": 1.552, "step": 290 }, { "epoch": 3.0, "eval_loss": 1.546966552734375, "eval_runtime": 6.307, "eval_samples_per_second": 22.99, "eval_steps_per_second": 3.013, "step": 294 }, { "epoch": 3.01, "learning_rate": 3.52455686328105e-08, "loss": 1.4805, "step": 295 }, { "epoch": 3.06, "learning_rate": 1.2650418304129032e-06, "loss": 1.6327, "step": 300 }, { "epoch": 3.11, "learning_rate": 4.221066247386418e-06, "loss": 1.7416, "step": 305 }, { "epoch": 3.16, "learning_rate": 8.827536897135471e-06, "loss": 1.6078, "step": 310 }, { "epoch": 3.21, "learning_rate": 1.496636030269314e-05, "loss": 1.5727, "step": 315 }, { "epoch": 3.27, "learning_rate": 2.2480158928073662e-05, "loss": 1.6548, "step": 320 }, { "epoch": 3.32, "learning_rate": 3.117630577695637e-05, "loss": 1.596, "step": 325 }, { "epoch": 3.37, "learning_rate": 4.08318626618038e-05, "loss": 1.4367, "step": 330 }, { "epoch": 3.42, "learning_rate": 5.119929554380771e-05, "loss": 1.621, "step": 335 }, { "epoch": 3.47, "learning_rate": 6.201282042273297e-05, "loss": 1.7111, "step": 340 }, { "epoch": 3.52, "learning_rate": 7.299521709067686e-05, "loss": 1.6959, "step": 345 }, { "epoch": 3.57, "learning_rate": 8.386493606940314e-05, "loss": 1.7452, "step": 350 }, { "epoch": 3.62, "learning_rate": 9.434331653472505e-05, "loss": 1.6352, "step": 355 }, { "epoch": 3.67, "learning_rate": 0.00010416173018610202, "loss": 1.5285, "step": 360 }, { "epoch": 3.72, "learning_rate": 0.00011306846791811431, "loss": 1.7379, "step": 365 }, { "epoch": 3.78, "learning_rate": 0.00012083519274412256, "loss": 1.7212, "step": 370 }, { "epoch": 3.83, "learning_rate": 0.0001272627935421667, "loss": 1.6984, "step": 375 }, { "epoch": 3.88, "learning_rate": 0.00013218648955393709, "loss": 1.5563, "step": 380 }, { "epoch": 3.93, "learning_rate": 0.0001354800547756731, "loss": 1.6133, "step": 385 }, { "epoch": 3.98, "learning_rate": 0.00013705905394267309, "loss": 1.6524, "step": 390 }, { "epoch": 4.0, "eval_loss": 1.4850682020187378, "eval_runtime": 6.9944, "eval_samples_per_second": 21.303, "eval_steps_per_second": 2.716, "step": 392 }, { "epoch": 4.03, "learning_rate": 0.000136883007148315, "loss": 1.6487, "step": 395 }, { "epoch": 4.08, "learning_rate": 0.00013495642760447742, "loss": 1.5179, "step": 400 }, { "epoch": 4.13, "learning_rate": 0.00013132870593888477, "loss": 1.3698, "step": 405 }, { "epoch": 4.18, "learning_rate": 0.00012609284399558033, "loss": 1.6439, "step": 410 }, { "epoch": 4.23, "learning_rate": 0.00011938307059936668, "loss": 1.5471, "step": 415 }, { "epoch": 4.29, "learning_rate": 0.00011137140040750914, "loss": 1.6368, "step": 420 }, { "epoch": 4.34, "learning_rate": 0.00010226322406747016, "loss": 1.5678, "step": 425 }, { "epoch": 4.39, "learning_rate": 9.229204273330182e-05, "loss": 1.6766, "step": 430 }, { "epoch": 4.44, "learning_rate": 8.171348192891424e-05, "loss": 1.6611, "step": 435 }, { "epoch": 4.49, "learning_rate": 7.079873822141565e-05, "loss": 1.6183, "step": 440 }, { "epoch": 4.54, "learning_rate": 5.982762670844296e-05, "loss": 1.5438, "step": 445 }, { "epoch": 4.59, "learning_rate": 4.908140755711112e-05, "loss": 1.6215, "step": 450 }, { "epoch": 4.64, "learning_rate": 3.883557549653544e-05, "loss": 1.5207, "step": 455 }, { "epoch": 4.69, "learning_rate": 2.9352797115619177e-05, "loss": 1.4362, "step": 460 }, { "epoch": 4.74, "learning_rate": 2.0876177028600835e-05, "loss": 1.5917, "step": 465 }, { "epoch": 4.8, "learning_rate": 1.3623025539858168e-05, "loss": 1.4801, "step": 470 }, { "epoch": 4.85, "learning_rate": 7.779287582812185e-06, "loss": 1.4835, "step": 475 }, { "epoch": 4.9, "learning_rate": 3.494775755068154e-06, "loss": 1.5351, "step": 480 }, { "epoch": 4.95, "learning_rate": 8.793296577687332e-07, "loss": 1.4665, "step": 485 }, { "epoch": 5.0, "learning_rate": 0.0, "loss": 1.5628, "step": 490 }, { "epoch": 5.0, "eval_loss": 1.417140007019043, "eval_runtime": 6.9215, "eval_samples_per_second": 21.527, "eval_steps_per_second": 2.745, "step": 490 }, { "epoch": 5.05, "learning_rate": 8.79329657768718e-07, "loss": 1.3775, "step": 495 }, { "epoch": 5.1, "learning_rate": 3.4947757550681237e-06, "loss": 1.5294, "step": 500 }, { "epoch": 5.15, "learning_rate": 7.7792875828122e-06, "loss": 1.436, "step": 505 }, { "epoch": 5.2, "learning_rate": 1.3623025539858192e-05, "loss": 1.5282, "step": 510 }, { "epoch": 5.26, "learning_rate": 2.087617702860069e-05, "loss": 1.5277, "step": 515 }, { "epoch": 5.31, "learning_rate": 2.9352797115619008e-05, "loss": 1.6218, "step": 520 }, { "epoch": 5.36, "learning_rate": 3.8835575496535365e-05, "loss": 1.4628, "step": 525 }, { "epoch": 5.41, "learning_rate": 4.9081407557111025e-05, "loss": 1.4702, "step": 530 }, { "epoch": 5.46, "learning_rate": 5.9827626708442996e-05, "loss": 1.3757, "step": 535 }, { "epoch": 5.51, "learning_rate": 7.079873822141567e-05, "loss": 1.3535, "step": 540 }, { "epoch": 5.56, "learning_rate": 8.171348192891405e-05, "loss": 1.4668, "step": 545 }, { "epoch": 5.61, "learning_rate": 9.229204273330163e-05, "loss": 1.4463, "step": 550 }, { "epoch": 5.66, "learning_rate": 0.00010226322406747008, "loss": 1.3922, "step": 555 }, { "epoch": 5.71, "learning_rate": 0.00011137140040750908, "loss": 1.4607, "step": 560 }, { "epoch": 5.77, "learning_rate": 0.00011938307059936662, "loss": 1.533, "step": 565 }, { "epoch": 5.82, "learning_rate": 0.00012609284399558025, "loss": 1.5392, "step": 570 }, { "epoch": 5.87, "learning_rate": 0.00013132870593888474, "loss": 1.6347, "step": 575 }, { "epoch": 5.92, "learning_rate": 0.00013495642760447742, "loss": 1.6022, "step": 580 }, { "epoch": 5.97, "learning_rate": 0.000136883007148315, "loss": 1.4927, "step": 585 }, { "epoch": 6.0, "eval_loss": 1.4285378456115723, "eval_runtime": 6.9515, "eval_samples_per_second": 21.434, "eval_steps_per_second": 2.733, "step": 588 }, { "epoch": 6.02, "learning_rate": 0.00013705905394267309, "loss": 1.415, "step": 590 }, { "epoch": 6.07, "learning_rate": 0.00013548005477567314, "loss": 1.4139, "step": 595 }, { "epoch": 6.12, "learning_rate": 0.00013218648955393714, "loss": 1.522, "step": 600 }, { "epoch": 6.17, "learning_rate": 0.00012726279354216682, "loss": 1.4313, "step": 605 }, { "epoch": 6.22, "learning_rate": 0.0001208351927441227, "loss": 1.3831, "step": 610 }, { "epoch": 6.28, "learning_rate": 0.00011306846791811419, "loss": 1.4225, "step": 615 }, { "epoch": 6.33, "learning_rate": 0.0001041617301861021, "loss": 1.5253, "step": 620 }, { "epoch": 6.38, "learning_rate": 9.434331653472514e-05, "loss": 1.3459, "step": 625 }, { "epoch": 6.43, "learning_rate": 8.386493606940322e-05, "loss": 1.3962, "step": 630 }, { "epoch": 6.48, "learning_rate": 7.299521709067695e-05, "loss": 1.3858, "step": 635 }, { "epoch": 6.53, "learning_rate": 6.201282042273305e-05, "loss": 1.4634, "step": 640 }, { "epoch": 6.58, "learning_rate": 5.11992955438078e-05, "loss": 1.4289, "step": 645 }, { "epoch": 6.63, "learning_rate": 4.0831862661803776e-05, "loss": 1.4365, "step": 650 }, { "epoch": 6.68, "learning_rate": 3.1176305776956335e-05, "loss": 1.4483, "step": 655 }, { "epoch": 6.73, "learning_rate": 2.248015892807363e-05, "loss": 1.387, "step": 660 }, { "epoch": 6.79, "learning_rate": 1.496636030269327e-05, "loss": 1.4562, "step": 665 }, { "epoch": 6.84, "learning_rate": 8.827536897135571e-06, "loss": 1.4136, "step": 670 }, { "epoch": 6.89, "learning_rate": 4.221066247386487e-06, "loss": 1.3972, "step": 675 }, { "epoch": 6.94, "learning_rate": 1.2650418304129413e-06, "loss": 1.4283, "step": 680 }, { "epoch": 6.99, "learning_rate": 3.5245568632818114e-08, "loss": 1.4148, "step": 685 }, { "epoch": 7.0, "eval_loss": 1.388408899307251, "eval_runtime": 6.9772, "eval_samples_per_second": 21.355, "eval_steps_per_second": 2.723, "step": 686 }, { "epoch": 7.04, "learning_rate": 5.632050517253056e-07, "loss": 1.4365, "step": 690 }, { "epoch": 7.09, "learning_rate": 2.8353852816850615e-06, "loss": 1.2923, "step": 695 }, { "epoch": 7.14, "learning_rate": 6.793535661894024e-06, "loss": 1.3917, "step": 700 }, { "epoch": 7.19, "learning_rate": 1.2336183334648805e-05, "loss": 1.4588, "step": 705 }, { "epoch": 7.24, "learning_rate": 1.932123458329587e-05, "loss": 1.3238, "step": 710 }, { "epoch": 7.3, "learning_rate": 2.7569617608302577e-05, "loss": 1.3604, "step": 715 }, { "epoch": 7.35, "learning_rate": 3.686987328947871e-05, "loss": 1.3916, "step": 720 }, { "epoch": 7.4, "learning_rate": 4.698357624357965e-05, "loss": 1.3839, "step": 725 }, { "epoch": 7.45, "learning_rate": 5.7651447200710234e-05, "loss": 1.3886, "step": 730 }, { "epoch": 7.5, "learning_rate": 6.859999999999982e-05, "loss": 1.3475, "step": 735 }, { "epoch": 7.55, "learning_rate": 7.954855279928965e-05, "loss": 1.3304, "step": 740 }, { "epoch": 7.6, "learning_rate": 9.021642375642024e-05, "loss": 1.3287, "step": 745 }, { "epoch": 7.65, "learning_rate": 0.00010033012671052118, "loss": 1.4354, "step": 750 }, { "epoch": 7.7, "learning_rate": 0.00010963038239169733, "loss": 1.3597, "step": 755 }, { "epoch": 7.76, "learning_rate": 0.00011787876541670406, "loss": 1.3221, "step": 760 }, { "epoch": 7.81, "learning_rate": 0.00012486381666535114, "loss": 1.3458, "step": 765 }, { "epoch": 7.86, "learning_rate": 0.00013040646433810593, "loss": 1.4032, "step": 770 }, { "epoch": 7.91, "learning_rate": 0.00013436461471831492, "loss": 1.403, "step": 775 }, { "epoch": 7.96, "learning_rate": 0.00013663679494827467, "loss": 1.4234, "step": 780 }, { "epoch": 8.0, "eval_loss": 1.4104630947113037, "eval_runtime": 6.9539, "eval_samples_per_second": 21.427, "eval_steps_per_second": 2.732, "step": 784 }, { "epoch": 8.01, "learning_rate": 0.0001371647544313672, "loss": 1.1334, "step": 785 }, { "epoch": 8.06, "learning_rate": 0.0001359349581695871, "loss": 1.2244, "step": 790 }, { "epoch": 8.11, "learning_rate": 0.00013297893375261365, "loss": 1.361, "step": 795 }, { "epoch": 8.16, "learning_rate": 0.00012837246310286448, "loss": 1.3544, "step": 800 }, { "epoch": 8.21, "learning_rate": 0.00012223363969730697, "loss": 1.1642, "step": 805 }, { "epoch": 8.27, "learning_rate": 0.00011471984107192647, "loss": 1.3799, "step": 810 }, { "epoch": 8.32, "learning_rate": 0.00010602369422304377, "loss": 1.3705, "step": 815 }, { "epoch": 8.37, "learning_rate": 9.636813733819635e-05, "loss": 1.2818, "step": 820 }, { "epoch": 8.42, "learning_rate": 8.600070445619209e-05, "loss": 1.3989, "step": 825 }, { "epoch": 8.47, "learning_rate": 7.518717957726708e-05, "loss": 1.2671, "step": 830 }, { "epoch": 8.52, "learning_rate": 6.420478290932294e-05, "loss": 1.359, "step": 835 }, { "epoch": 8.57, "learning_rate": 5.33350639305969e-05, "loss": 1.3868, "step": 840 }, { "epoch": 8.62, "learning_rate": 4.285668346527499e-05, "loss": 1.4007, "step": 845 }, { "epoch": 8.67, "learning_rate": 3.3038269813898015e-05, "loss": 1.351, "step": 850 }, { "epoch": 8.72, "learning_rate": 2.413153208188573e-05, "loss": 1.317, "step": 855 }, { "epoch": 8.78, "learning_rate": 1.636480725587754e-05, "loss": 1.2524, "step": 860 }, { "epoch": 8.83, "learning_rate": 9.937206457833243e-06, "loss": 1.3045, "step": 865 }, { "epoch": 8.88, "learning_rate": 5.013510446062984e-06, "loss": 1.4154, "step": 870 }, { "epoch": 8.93, "learning_rate": 1.719945224326892e-06, "loss": 1.2452, "step": 875 }, { "epoch": 8.98, "learning_rate": 1.4094605732693502e-07, "loss": 1.3688, "step": 880 }, { "epoch": 9.0, "eval_loss": 1.3758981227874756, "eval_runtime": 6.9963, "eval_samples_per_second": 21.297, "eval_steps_per_second": 2.716, "step": 882 }, { "epoch": 9.03, "learning_rate": 3.169928516849862e-07, "loss": 1.4731, "step": 885 }, { "epoch": 9.08, "learning_rate": 2.2435723955225417e-06, "loss": 1.217, "step": 890 }, { "epoch": 9.13, "learning_rate": 5.8712940611152096e-06, "loss": 1.2492, "step": 895 }, { "epoch": 9.18, "learning_rate": 1.1107156004419803e-05, "loss": 1.2326, "step": 900 }, { "epoch": 9.23, "learning_rate": 1.7816929400633287e-05, "loss": 1.2444, "step": 905 }, { "epoch": 9.29, "learning_rate": 2.582859959249101e-05, "loss": 1.2759, "step": 910 }, { "epoch": 9.34, "learning_rate": 3.493677593252981e-05, "loss": 1.2315, "step": 915 }, { "epoch": 9.39, "learning_rate": 4.490795726669825e-05, "loss": 1.2353, "step": 920 }, { "epoch": 9.44, "learning_rate": 5.548651807108583e-05, "loss": 1.2733, "step": 925 }, { "epoch": 9.49, "learning_rate": 6.640126177858445e-05, "loss": 1.205, "step": 930 }, { "epoch": 9.54, "learning_rate": 7.737237329155688e-05, "loss": 1.2441, "step": 935 }, { "epoch": 9.59, "learning_rate": 8.811859244288885e-05, "loss": 1.2469, "step": 940 }, { "epoch": 9.64, "learning_rate": 9.83644245034643e-05, "loss": 1.2763, "step": 945 }, { "epoch": 9.69, "learning_rate": 0.00010784720288438088, "loss": 1.1781, "step": 950 }, { "epoch": 9.74, "learning_rate": 0.00011632382297139905, "loss": 1.2822, "step": 955 }, { "epoch": 9.8, "learning_rate": 0.00012357697446014173, "loss": 1.2663, "step": 960 }, { "epoch": 9.85, "learning_rate": 0.00012942071241718773, "loss": 1.3906, "step": 965 }, { "epoch": 9.9, "learning_rate": 0.00013370522424493184, "loss": 1.3015, "step": 970 }, { "epoch": 9.95, "learning_rate": 0.00013632067034223124, "loss": 1.2348, "step": 975 }, { "epoch": 10.0, "learning_rate": 0.0001372, "loss": 1.2793, "step": 980 }, { "epoch": 10.0, "eval_loss": 1.3976633548736572, "eval_runtime": 7.0041, "eval_samples_per_second": 21.273, "eval_steps_per_second": 2.713, "step": 980 }, { "epoch": 10.05, "learning_rate": 0.00013632067034223126, "loss": 1.2203, "step": 985 }, { "epoch": 10.1, "learning_rate": 0.0001337052242449319, "loss": 1.3016, "step": 990 }, { "epoch": 10.15, "learning_rate": 0.00012942071241718781, "loss": 1.2078, "step": 995 }, { "epoch": 10.2, "learning_rate": 0.00012357697446014183, "loss": 1.2408, "step": 1000 }, { "epoch": 10.26, "learning_rate": 0.00011632382297139899, "loss": 1.2247, "step": 1005 }, { "epoch": 10.31, "learning_rate": 0.00010784720288438083, "loss": 1.0939, "step": 1010 }, { "epoch": 10.36, "learning_rate": 9.836442450346467e-05, "loss": 1.1944, "step": 1015 }, { "epoch": 10.41, "learning_rate": 8.811859244288877e-05, "loss": 1.2171, "step": 1020 }, { "epoch": 10.46, "learning_rate": 7.737237329155728e-05, "loss": 1.2546, "step": 1025 }, { "epoch": 10.51, "learning_rate": 6.640126177858486e-05, "loss": 1.1607, "step": 1030 }, { "epoch": 10.56, "learning_rate": 5.5486518071086e-05, "loss": 1.2547, "step": 1035 }, { "epoch": 10.61, "learning_rate": 4.4907957266698644e-05, "loss": 1.2295, "step": 1040 }, { "epoch": 10.66, "learning_rate": 3.493677593252996e-05, "loss": 1.2566, "step": 1045 }, { "epoch": 10.71, "learning_rate": 2.5828599592491143e-05, "loss": 1.2949, "step": 1050 }, { "epoch": 10.77, "learning_rate": 1.7816929400633402e-05, "loss": 1.2886, "step": 1055 }, { "epoch": 10.82, "learning_rate": 1.1107156004419895e-05, "loss": 1.1858, "step": 1060 }, { "epoch": 10.87, "learning_rate": 5.871294061115278e-06, "loss": 1.2322, "step": 1065 }, { "epoch": 10.92, "learning_rate": 2.2435723955225265e-06, "loss": 1.0434, "step": 1070 }, { "epoch": 10.97, "learning_rate": 3.169928516850014e-07, "loss": 1.2505, "step": 1075 }, { "epoch": 11.0, "eval_loss": 1.3747466802597046, "eval_runtime": 7.0127, "eval_samples_per_second": 21.247, "eval_steps_per_second": 2.709, "step": 1078 }, { "epoch": 11.37, "learning_rate": 4.1043693868806304e-05, "loss": 1.1875, "step": 1080 }, { "epoch": 11.42, "learning_rate": 5.17596955821411e-05, "loss": 1.3125, "step": 1085 }, { "epoch": 11.47, "learning_rate": 6.293505690059783e-05, "loss": 1.2444, "step": 1090 }, { "epoch": 11.53, "learning_rate": 7.426494309940222e-05, "loss": 1.3268, "step": 1095 }, { "epoch": 11.58, "learning_rate": 8.544030441785894e-05, "loss": 1.3066, "step": 1100 }, { "epoch": 11.63, "learning_rate": 9.615630613119375e-05, "loss": 1.2542, "step": 1105 }, { "epoch": 11.68, "learning_rate": 0.00010612064364719844, "loss": 1.246, "step": 1110 }, { "epoch": 11.74, "learning_rate": 0.00011506151581352574, "loss": 1.2672, "step": 1115 }, { "epoch": 11.79, "learning_rate": 0.00012273503894459246, "loss": 1.2702, "step": 1120 }, { "epoch": 11.84, "learning_rate": 0.00012893189933276523, "loss": 1.3417, "step": 1125 }, { "epoch": 11.89, "learning_rate": 0.00013348306278066356, "loss": 1.2473, "step": 1130 }, { "epoch": 11.95, "learning_rate": 0.00013626438541342674, "loss": 1.1708, "step": 1135 }, { "epoch": 12.0, "learning_rate": 0.0001372, "loss": 1.1564, "step": 1140 }, { "epoch": 12.0, "eval_loss": 1.1428929567337036, "eval_runtime": 7.9608, "eval_samples_per_second": 21.48, "eval_steps_per_second": 2.764, "step": 1140 }, { "epoch": 11.8, "learning_rate": 0.00012461626728572456, "loss": 1.2708, "step": 1145 }, { "epoch": 11.86, "learning_rate": 0.000130268089438458, "loss": 1.2385, "step": 1150 }, { "epoch": 11.91, "learning_rate": 0.00013430626843929596, "loss": 1.2608, "step": 1155 }, { "epoch": 11.96, "learning_rate": 0.00013662513894413278, "loss": 1.2132, "step": 1160 }, { "epoch": 12.0, "eval_loss": 1.1082079410552979, "eval_runtime": 6.721, "eval_samples_per_second": 23.211, "eval_steps_per_second": 2.976, "step": 1164 }, { "epoch": 12.01, "learning_rate": 0.00013716402403652231, "loss": 1.2701, "step": 1165 }, { "epoch": 12.06, "learning_rate": 0.0001359088229352192, "loss": 1.2274, "step": 1170 }, { "epoch": 12.11, "learning_rate": 0.0001328923799634352, "loss": 1.1978, "step": 1175 }, { "epoch": 12.16, "learning_rate": 0.0001281936251251452, "loss": 1.0879, "step": 1180 }, { "epoch": 12.22, "learning_rate": 0.00012193550877662404, "loss": 1.2841, "step": 1185 }, { "epoch": 12.27, "learning_rate": 0.00011428178443580113, "loss": 1.166, "step": 1190 }, { "epoch": 12.32, "learning_rate": 0.0001054327239123201, "loss": 1.2385, "step": 1195 }, { "epoch": 12.37, "learning_rate": 9.561987687870095e-05, "loss": 1.1758, "step": 1200 }, { "epoch": 12.42, "learning_rate": 8.51000120067249e-05, "loss": 1.1698, "step": 1205 }, { "epoch": 12.47, "learning_rate": 7.414839820879227e-05, "loss": 1.3722, "step": 1210 }, { "epoch": 12.53, "learning_rate": 6.305160179120769e-05, "loss": 1.1787, "step": 1215 }, { "epoch": 12.58, "learning_rate": 5.209998799327507e-05, "loss": 1.2814, "step": 1220 }, { "epoch": 12.63, "learning_rate": 4.158012312129902e-05, "loss": 1.2154, "step": 1225 }, { "epoch": 12.68, "learning_rate": 3.176727608767987e-05, "loss": 1.2798, "step": 1230 }, { "epoch": 12.73, "learning_rate": 2.291821556419886e-05, "loss": 1.2241, "step": 1235 }, { "epoch": 12.78, "learning_rate": 1.5264491223375942e-05, "loss": 1.1194, "step": 1240 }, { "epoch": 12.84, "learning_rate": 9.006374874854777e-06, "loss": 1.1957, "step": 1245 }, { "epoch": 12.89, "learning_rate": 4.3076200365648044e-06, "loss": 1.1553, "step": 1250 }, { "epoch": 12.94, "learning_rate": 1.2911770647808012e-06, "loss": 1.1644, "step": 1255 }, { "epoch": 12.99, "learning_rate": 3.597596347767558e-08, "loss": 1.1646, "step": 1260 }, { "epoch": 13.0, "eval_loss": 1.0746197700500488, "eval_runtime": 7.0141, "eval_samples_per_second": 22.241, "eval_steps_per_second": 2.851, "step": 1261 }, { "epoch": 12.91, "learning_rate": 2.835385281685176e-06, "loss": 1.0594, "step": 1265 }, { "epoch": 12.96, "learning_rate": 5.632050517253284e-07, "loss": 1.1527, "step": 1270 }, { "epoch": 13.0, "eval_loss": 1.029100775718689, "eval_runtime": 6.9675, "eval_samples_per_second": 21.242, "eval_steps_per_second": 2.727, "step": 1274 }, { "epoch": 12.75, "learning_rate": 2.0092474810602958e-05, "loss": 1.2194, "step": 1275 }, { "epoch": 12.8, "learning_rate": 1.3101434185878674e-05, "loss": 1.0397, "step": 1280 }, { "epoch": 12.85, "learning_rate": 7.476952440677985e-06, "loss": 1.0537, "step": 1285 }, { "epoch": 12.9, "learning_rate": 3.35752298215246e-06, "loss": 1.2597, "step": 1290 }, { "epoch": 12.95, "learning_rate": 8.445798351736176e-07, "loss": 1.0949, "step": 1295 }, { "epoch": 13.0, "learning_rate": 0.0, "loss": 1.1523, "step": 1300 }, { "epoch": 13.0, "eval_loss": 1.004758358001709, "eval_runtime": 6.2971, "eval_samples_per_second": 20.803, "eval_steps_per_second": 2.7, "step": 1300 } ], "max_steps": 1400, "num_train_epochs": 14, "total_flos": 1346568486912000.0, "trial_name": null, "trial_params": null }