{ "best_metric": 0.5917085427135679, "best_model_checkpoint": "swin-tiny-patch4-window7-224-finetuned-eurosat/checkpoint-224", "epoch": 50.0, "eval_steps": 500, "global_step": 1400, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.36, "grad_norm": 2.619962215423584, "learning_rate": 3.5714285714285714e-06, "loss": 1.0415, "step": 10 }, { "epoch": 0.71, "grad_norm": 2.402837038040161, "learning_rate": 7.142857142857143e-06, "loss": 1.0339, "step": 20 }, { "epoch": 1.0, "eval_accuracy": 0.5640703517587939, "eval_loss": 1.0541082620620728, "eval_runtime": 5.8961, "eval_samples_per_second": 270.009, "eval_steps_per_second": 2.205, "step": 28 }, { "epoch": 1.07, "grad_norm": 2.223407745361328, "learning_rate": 1.0714285714285714e-05, "loss": 1.025, "step": 30 }, { "epoch": 1.43, "grad_norm": 2.5425541400909424, "learning_rate": 1.4285714285714285e-05, "loss": 1.0072, "step": 40 }, { "epoch": 1.79, "grad_norm": 3.219869375228882, "learning_rate": 1.785714285714286e-05, "loss": 1.0193, "step": 50 }, { "epoch": 2.0, "eval_accuracy": 0.5621859296482412, "eval_loss": 1.0463842153549194, "eval_runtime": 5.8033, "eval_samples_per_second": 274.326, "eval_steps_per_second": 2.24, "step": 56 }, { "epoch": 2.14, "grad_norm": 3.941281318664551, "learning_rate": 2.1428571428571428e-05, "loss": 1.0133, "step": 60 }, { "epoch": 2.5, "grad_norm": 4.169382095336914, "learning_rate": 2.5e-05, "loss": 1.0182, "step": 70 }, { "epoch": 2.86, "grad_norm": 3.475947380065918, "learning_rate": 2.857142857142857e-05, "loss": 1.0348, "step": 80 }, { "epoch": 3.0, "eval_accuracy": 0.5690954773869347, "eval_loss": 1.0330955982208252, "eval_runtime": 5.8594, "eval_samples_per_second": 271.702, "eval_steps_per_second": 2.219, "step": 84 }, { "epoch": 3.21, "grad_norm": 3.1382639408111572, "learning_rate": 3.2142857142857144e-05, "loss": 1.0266, "step": 90 }, { "epoch": 3.57, "grad_norm": 2.344207525253296, "learning_rate": 3.571428571428572e-05, "loss": 1.0075, "step": 100 }, { "epoch": 3.93, "grad_norm": 2.2122068405151367, "learning_rate": 3.928571428571429e-05, "loss": 1.0072, "step": 110 }, { "epoch": 4.0, "eval_accuracy": 0.5847989949748744, "eval_loss": 1.0253527164459229, "eval_runtime": 5.8726, "eval_samples_per_second": 271.091, "eval_steps_per_second": 2.214, "step": 112 }, { "epoch": 4.29, "grad_norm": 2.9957728385925293, "learning_rate": 4.2857142857142856e-05, "loss": 1.0024, "step": 120 }, { "epoch": 4.64, "grad_norm": 2.9330532550811768, "learning_rate": 4.642857142857143e-05, "loss": 0.991, "step": 130 }, { "epoch": 5.0, "grad_norm": 2.355830430984497, "learning_rate": 5e-05, "loss": 0.9892, "step": 140 }, { "epoch": 5.0, "eval_accuracy": 0.5753768844221105, "eval_loss": 1.012104868888855, "eval_runtime": 5.622, "eval_samples_per_second": 283.175, "eval_steps_per_second": 2.312, "step": 140 }, { "epoch": 5.36, "grad_norm": 3.8320775032043457, "learning_rate": 4.960317460317461e-05, "loss": 0.9517, "step": 150 }, { "epoch": 5.71, "grad_norm": 3.0432510375976562, "learning_rate": 4.9206349206349204e-05, "loss": 0.9379, "step": 160 }, { "epoch": 6.0, "eval_accuracy": 0.5810301507537688, "eval_loss": 1.017525553703308, "eval_runtime": 6.0814, "eval_samples_per_second": 261.781, "eval_steps_per_second": 2.138, "step": 168 }, { "epoch": 6.07, "grad_norm": 3.3289706707000732, "learning_rate": 4.880952380952381e-05, "loss": 0.9475, "step": 170 }, { "epoch": 6.43, "grad_norm": 3.335624933242798, "learning_rate": 4.841269841269841e-05, "loss": 0.9103, "step": 180 }, { "epoch": 6.79, "grad_norm": 3.2209532260894775, "learning_rate": 4.801587301587302e-05, "loss": 0.9123, "step": 190 }, { "epoch": 7.0, "eval_accuracy": 0.5866834170854272, "eval_loss": 1.0119962692260742, "eval_runtime": 5.8208, "eval_samples_per_second": 273.502, "eval_steps_per_second": 2.233, "step": 196 }, { "epoch": 7.14, "grad_norm": 3.194115400314331, "learning_rate": 4.761904761904762e-05, "loss": 0.896, "step": 200 }, { "epoch": 7.5, "grad_norm": 2.89758563041687, "learning_rate": 4.722222222222222e-05, "loss": 0.9013, "step": 210 }, { "epoch": 7.86, "grad_norm": 3.0895237922668457, "learning_rate": 4.682539682539683e-05, "loss": 0.8865, "step": 220 }, { "epoch": 8.0, "eval_accuracy": 0.5917085427135679, "eval_loss": 1.0104140043258667, "eval_runtime": 6.0762, "eval_samples_per_second": 262.008, "eval_steps_per_second": 2.14, "step": 224 }, { "epoch": 8.21, "grad_norm": 2.715590715408325, "learning_rate": 4.642857142857143e-05, "loss": 0.8758, "step": 230 }, { "epoch": 8.57, "grad_norm": 3.639813184738159, "learning_rate": 4.603174603174603e-05, "loss": 0.8516, "step": 240 }, { "epoch": 8.93, "grad_norm": 3.1830241680145264, "learning_rate": 4.563492063492064e-05, "loss": 0.8668, "step": 250 }, { "epoch": 9.0, "eval_accuracy": 0.5873115577889447, "eval_loss": 1.0236343145370483, "eval_runtime": 6.0393, "eval_samples_per_second": 263.608, "eval_steps_per_second": 2.153, "step": 252 }, { "epoch": 9.29, "grad_norm": 3.1714346408843994, "learning_rate": 4.523809523809524e-05, "loss": 0.8352, "step": 260 }, { "epoch": 9.64, "grad_norm": 3.9194183349609375, "learning_rate": 4.4841269841269846e-05, "loss": 0.8207, "step": 270 }, { "epoch": 10.0, "grad_norm": 4.183494567871094, "learning_rate": 4.4444444444444447e-05, "loss": 0.8189, "step": 280 }, { "epoch": 10.0, "eval_accuracy": 0.5829145728643216, "eval_loss": 1.0359796285629272, "eval_runtime": 6.1118, "eval_samples_per_second": 260.481, "eval_steps_per_second": 2.127, "step": 280 }, { "epoch": 10.36, "grad_norm": 3.281873941421509, "learning_rate": 4.404761904761905e-05, "loss": 0.8051, "step": 290 }, { "epoch": 10.71, "grad_norm": 2.9959723949432373, "learning_rate": 4.3650793650793655e-05, "loss": 0.7933, "step": 300 }, { "epoch": 11.0, "eval_accuracy": 0.5835427135678392, "eval_loss": 1.039525032043457, "eval_runtime": 5.9823, "eval_samples_per_second": 266.118, "eval_steps_per_second": 2.173, "step": 308 }, { "epoch": 11.07, "grad_norm": 3.470358371734619, "learning_rate": 4.3253968253968256e-05, "loss": 0.797, "step": 310 }, { "epoch": 11.43, "grad_norm": 3.285288095474243, "learning_rate": 4.2857142857142856e-05, "loss": 0.7723, "step": 320 }, { "epoch": 11.79, "grad_norm": 4.59849739074707, "learning_rate": 4.2460317460317464e-05, "loss": 0.7765, "step": 330 }, { "epoch": 12.0, "eval_accuracy": 0.5728643216080402, "eval_loss": 1.0594085454940796, "eval_runtime": 5.9856, "eval_samples_per_second": 265.972, "eval_steps_per_second": 2.172, "step": 336 }, { "epoch": 12.14, "grad_norm": 4.7830305099487305, "learning_rate": 4.2063492063492065e-05, "loss": 0.7567, "step": 340 }, { "epoch": 12.5, "grad_norm": 3.7676315307617188, "learning_rate": 4.166666666666667e-05, "loss": 0.738, "step": 350 }, { "epoch": 12.86, "grad_norm": 4.092833518981934, "learning_rate": 4.126984126984127e-05, "loss": 0.7538, "step": 360 }, { "epoch": 13.0, "eval_accuracy": 0.5879396984924623, "eval_loss": 1.0552036762237549, "eval_runtime": 5.8071, "eval_samples_per_second": 274.146, "eval_steps_per_second": 2.239, "step": 364 }, { "epoch": 13.21, "grad_norm": 4.582472801208496, "learning_rate": 4.0873015873015874e-05, "loss": 0.7175, "step": 370 }, { "epoch": 13.57, "grad_norm": 4.479797840118408, "learning_rate": 4.047619047619048e-05, "loss": 0.7101, "step": 380 }, { "epoch": 13.93, "grad_norm": 4.223387241363525, "learning_rate": 4.007936507936508e-05, "loss": 0.7146, "step": 390 }, { "epoch": 14.0, "eval_accuracy": 0.5829145728643216, "eval_loss": 1.0619994401931763, "eval_runtime": 5.8703, "eval_samples_per_second": 271.194, "eval_steps_per_second": 2.215, "step": 392 }, { "epoch": 14.29, "grad_norm": 4.616257190704346, "learning_rate": 3.968253968253968e-05, "loss": 0.7031, "step": 400 }, { "epoch": 14.64, "grad_norm": 3.6395843029022217, "learning_rate": 3.928571428571429e-05, "loss": 0.6901, "step": 410 }, { "epoch": 15.0, "grad_norm": 3.426710367202759, "learning_rate": 3.888888888888889e-05, "loss": 0.6885, "step": 420 }, { "epoch": 15.0, "eval_accuracy": 0.5841708542713567, "eval_loss": 1.0782768726348877, "eval_runtime": 6.0171, "eval_samples_per_second": 264.578, "eval_steps_per_second": 2.16, "step": 420 }, { "epoch": 15.36, "grad_norm": 3.5050458908081055, "learning_rate": 3.84920634920635e-05, "loss": 0.6697, "step": 430 }, { "epoch": 15.71, "grad_norm": 4.314873218536377, "learning_rate": 3.809523809523809e-05, "loss": 0.6556, "step": 440 }, { "epoch": 16.0, "eval_accuracy": 0.5816582914572864, "eval_loss": 1.1010228395462036, "eval_runtime": 6.002, "eval_samples_per_second": 265.244, "eval_steps_per_second": 2.166, "step": 448 }, { "epoch": 16.07, "grad_norm": 3.6204850673675537, "learning_rate": 3.76984126984127e-05, "loss": 0.6621, "step": 450 }, { "epoch": 16.43, "grad_norm": 5.87465238571167, "learning_rate": 3.730158730158731e-05, "loss": 0.6388, "step": 460 }, { "epoch": 16.79, "grad_norm": 3.773968458175659, "learning_rate": 3.690476190476191e-05, "loss": 0.6453, "step": 470 }, { "epoch": 17.0, "eval_accuracy": 0.5734924623115578, "eval_loss": 1.1130640506744385, "eval_runtime": 5.9355, "eval_samples_per_second": 268.215, "eval_steps_per_second": 2.19, "step": 476 }, { "epoch": 17.14, "grad_norm": 3.7962214946746826, "learning_rate": 3.650793650793651e-05, "loss": 0.6504, "step": 480 }, { "epoch": 17.5, "grad_norm": 4.2159013748168945, "learning_rate": 3.611111111111111e-05, "loss": 0.6058, "step": 490 }, { "epoch": 17.86, "grad_norm": 4.258510112762451, "learning_rate": 3.571428571428572e-05, "loss": 0.6175, "step": 500 }, { "epoch": 18.0, "eval_accuracy": 0.5891959798994975, "eval_loss": 1.1074261665344238, "eval_runtime": 5.4208, "eval_samples_per_second": 293.686, "eval_steps_per_second": 2.398, "step": 504 }, { "epoch": 18.21, "grad_norm": 4.8816914558410645, "learning_rate": 3.5317460317460324e-05, "loss": 0.5942, "step": 510 }, { "epoch": 18.57, "grad_norm": 4.609433650970459, "learning_rate": 3.492063492063492e-05, "loss": 0.5943, "step": 520 }, { "epoch": 18.93, "grad_norm": 7.7561211585998535, "learning_rate": 3.4523809523809526e-05, "loss": 0.5993, "step": 530 }, { "epoch": 19.0, "eval_accuracy": 0.5741206030150754, "eval_loss": 1.1327540874481201, "eval_runtime": 5.7963, "eval_samples_per_second": 274.658, "eval_steps_per_second": 2.243, "step": 532 }, { "epoch": 19.29, "grad_norm": 4.787009239196777, "learning_rate": 3.412698412698413e-05, "loss": 0.5741, "step": 540 }, { "epoch": 19.64, "grad_norm": 7.622641086578369, "learning_rate": 3.3730158730158734e-05, "loss": 0.5803, "step": 550 }, { "epoch": 20.0, "grad_norm": 6.530401706695557, "learning_rate": 3.3333333333333335e-05, "loss": 0.5683, "step": 560 }, { "epoch": 20.0, "eval_accuracy": 0.5791457286432161, "eval_loss": 1.1423227787017822, "eval_runtime": 5.7458, "eval_samples_per_second": 277.073, "eval_steps_per_second": 2.263, "step": 560 }, { "epoch": 20.36, "grad_norm": 4.021312236785889, "learning_rate": 3.2936507936507936e-05, "loss": 0.5457, "step": 570 }, { "epoch": 20.71, "grad_norm": 5.022521018981934, "learning_rate": 3.253968253968254e-05, "loss": 0.5524, "step": 580 }, { "epoch": 21.0, "eval_accuracy": 0.5873115577889447, "eval_loss": 1.1516565084457397, "eval_runtime": 6.4553, "eval_samples_per_second": 246.619, "eval_steps_per_second": 2.014, "step": 588 }, { "epoch": 21.07, "grad_norm": 4.585721969604492, "learning_rate": 3.2142857142857144e-05, "loss": 0.5493, "step": 590 }, { "epoch": 21.43, "grad_norm": 3.6502797603607178, "learning_rate": 3.1746031746031745e-05, "loss": 0.5393, "step": 600 }, { "epoch": 21.79, "grad_norm": 4.494349002838135, "learning_rate": 3.134920634920635e-05, "loss": 0.5151, "step": 610 }, { "epoch": 22.0, "eval_accuracy": 0.5766331658291457, "eval_loss": 1.1673014163970947, "eval_runtime": 6.0498, "eval_samples_per_second": 263.15, "eval_steps_per_second": 2.149, "step": 616 }, { "epoch": 22.14, "grad_norm": 4.292243957519531, "learning_rate": 3.095238095238095e-05, "loss": 0.5241, "step": 620 }, { "epoch": 22.5, "grad_norm": 4.669267177581787, "learning_rate": 3.055555555555556e-05, "loss": 0.5095, "step": 630 }, { "epoch": 22.86, "grad_norm": 4.2560882568359375, "learning_rate": 3.0158730158730158e-05, "loss": 0.5096, "step": 640 }, { "epoch": 23.0, "eval_accuracy": 0.5797738693467337, "eval_loss": 1.17599618434906, "eval_runtime": 5.8279, "eval_samples_per_second": 273.169, "eval_steps_per_second": 2.231, "step": 644 }, { "epoch": 23.21, "grad_norm": 4.331119537353516, "learning_rate": 2.9761904761904762e-05, "loss": 0.5092, "step": 650 }, { "epoch": 23.57, "grad_norm": 5.246380805969238, "learning_rate": 2.9365079365079366e-05, "loss": 0.5016, "step": 660 }, { "epoch": 23.93, "grad_norm": 4.598259449005127, "learning_rate": 2.8968253968253974e-05, "loss": 0.4937, "step": 670 }, { "epoch": 24.0, "eval_accuracy": 0.5816582914572864, "eval_loss": 1.193081259727478, "eval_runtime": 5.7093, "eval_samples_per_second": 278.844, "eval_steps_per_second": 2.277, "step": 672 }, { "epoch": 24.29, "grad_norm": 4.1697893142700195, "learning_rate": 2.857142857142857e-05, "loss": 0.4713, "step": 680 }, { "epoch": 24.64, "grad_norm": 3.761561870574951, "learning_rate": 2.8174603174603175e-05, "loss": 0.469, "step": 690 }, { "epoch": 25.0, "grad_norm": 3.7730281352996826, "learning_rate": 2.777777777777778e-05, "loss": 0.487, "step": 700 }, { "epoch": 25.0, "eval_accuracy": 0.5734924623115578, "eval_loss": 1.2083638906478882, "eval_runtime": 6.0189, "eval_samples_per_second": 264.498, "eval_steps_per_second": 2.16, "step": 700 }, { "epoch": 25.36, "grad_norm": 4.0658674240112305, "learning_rate": 2.7380952380952383e-05, "loss": 0.4587, "step": 710 }, { "epoch": 25.71, "grad_norm": 3.6383256912231445, "learning_rate": 2.6984126984126984e-05, "loss": 0.4597, "step": 720 }, { "epoch": 26.0, "eval_accuracy": 0.571608040201005, "eval_loss": 1.2270249128341675, "eval_runtime": 6.2134, "eval_samples_per_second": 256.222, "eval_steps_per_second": 2.092, "step": 728 }, { "epoch": 26.07, "grad_norm": 4.403080940246582, "learning_rate": 2.6587301587301588e-05, "loss": 0.4461, "step": 730 }, { "epoch": 26.43, "grad_norm": 4.59806489944458, "learning_rate": 2.6190476190476192e-05, "loss": 0.4487, "step": 740 }, { "epoch": 26.79, "grad_norm": 3.9561307430267334, "learning_rate": 2.5793650793650796e-05, "loss": 0.4482, "step": 750 }, { "epoch": 27.0, "eval_accuracy": 0.5829145728643216, "eval_loss": 1.2389401197433472, "eval_runtime": 5.923, "eval_samples_per_second": 268.783, "eval_steps_per_second": 2.195, "step": 756 }, { "epoch": 27.14, "grad_norm": 4.864688396453857, "learning_rate": 2.5396825396825397e-05, "loss": 0.4678, "step": 760 }, { "epoch": 27.5, "grad_norm": 11.65251636505127, "learning_rate": 2.5e-05, "loss": 0.4347, "step": 770 }, { "epoch": 27.86, "grad_norm": 4.065871715545654, "learning_rate": 2.4603174603174602e-05, "loss": 0.4183, "step": 780 }, { "epoch": 28.0, "eval_accuracy": 0.5772613065326633, "eval_loss": 1.2430446147918701, "eval_runtime": 5.8698, "eval_samples_per_second": 271.218, "eval_steps_per_second": 2.215, "step": 784 }, { "epoch": 28.21, "grad_norm": 3.8666417598724365, "learning_rate": 2.4206349206349206e-05, "loss": 0.42, "step": 790 }, { "epoch": 28.57, "grad_norm": 4.105574131011963, "learning_rate": 2.380952380952381e-05, "loss": 0.4359, "step": 800 }, { "epoch": 28.93, "grad_norm": 4.677915573120117, "learning_rate": 2.3412698412698414e-05, "loss": 0.4228, "step": 810 }, { "epoch": 29.0, "eval_accuracy": 0.5741206030150754, "eval_loss": 1.2637208700180054, "eval_runtime": 5.5801, "eval_samples_per_second": 285.302, "eval_steps_per_second": 2.33, "step": 812 }, { "epoch": 29.29, "grad_norm": 4.680498123168945, "learning_rate": 2.3015873015873015e-05, "loss": 0.4268, "step": 820 }, { "epoch": 29.64, "grad_norm": 3.9155895709991455, "learning_rate": 2.261904761904762e-05, "loss": 0.3908, "step": 830 }, { "epoch": 30.0, "grad_norm": 4.745033264160156, "learning_rate": 2.2222222222222223e-05, "loss": 0.4116, "step": 840 }, { "epoch": 30.0, "eval_accuracy": 0.5778894472361809, "eval_loss": 1.2687839269638062, "eval_runtime": 5.7938, "eval_samples_per_second": 274.776, "eval_steps_per_second": 2.244, "step": 840 }, { "epoch": 30.36, "grad_norm": 4.2474236488342285, "learning_rate": 2.1825396825396827e-05, "loss": 0.3889, "step": 850 }, { "epoch": 30.71, "grad_norm": 4.243285655975342, "learning_rate": 2.1428571428571428e-05, "loss": 0.3942, "step": 860 }, { "epoch": 31.0, "eval_accuracy": 0.5879396984924623, "eval_loss": 1.2986161708831787, "eval_runtime": 5.7249, "eval_samples_per_second": 278.081, "eval_steps_per_second": 2.271, "step": 868 }, { "epoch": 31.07, "grad_norm": 4.418118000030518, "learning_rate": 2.1031746031746032e-05, "loss": 0.3868, "step": 870 }, { "epoch": 31.43, "grad_norm": 4.63828182220459, "learning_rate": 2.0634920634920636e-05, "loss": 0.3935, "step": 880 }, { "epoch": 31.79, "grad_norm": 4.932127952575684, "learning_rate": 2.023809523809524e-05, "loss": 0.3815, "step": 890 }, { "epoch": 32.0, "eval_accuracy": 0.5766331658291457, "eval_loss": 1.2911109924316406, "eval_runtime": 5.5086, "eval_samples_per_second": 289.004, "eval_steps_per_second": 2.36, "step": 896 }, { "epoch": 32.14, "grad_norm": 5.120588779449463, "learning_rate": 1.984126984126984e-05, "loss": 0.3814, "step": 900 }, { "epoch": 32.5, "grad_norm": 4.348486423492432, "learning_rate": 1.9444444444444445e-05, "loss": 0.3891, "step": 910 }, { "epoch": 32.86, "grad_norm": 5.042387008666992, "learning_rate": 1.9047619047619046e-05, "loss": 0.3828, "step": 920 }, { "epoch": 33.0, "eval_accuracy": 0.5772613065326633, "eval_loss": 1.311328411102295, "eval_runtime": 5.8619, "eval_samples_per_second": 271.584, "eval_steps_per_second": 2.218, "step": 924 }, { "epoch": 33.21, "grad_norm": 4.366072177886963, "learning_rate": 1.8650793650793654e-05, "loss": 0.3692, "step": 930 }, { "epoch": 33.57, "grad_norm": 3.8073909282684326, "learning_rate": 1.8253968253968254e-05, "loss": 0.3617, "step": 940 }, { "epoch": 33.93, "grad_norm": 4.39243221282959, "learning_rate": 1.785714285714286e-05, "loss": 0.3791, "step": 950 }, { "epoch": 34.0, "eval_accuracy": 0.5766331658291457, "eval_loss": 1.3316831588745117, "eval_runtime": 5.7518, "eval_samples_per_second": 276.782, "eval_steps_per_second": 2.26, "step": 952 }, { "epoch": 34.29, "grad_norm": 4.339027404785156, "learning_rate": 1.746031746031746e-05, "loss": 0.3716, "step": 960 }, { "epoch": 34.64, "grad_norm": 4.211394786834717, "learning_rate": 1.7063492063492063e-05, "loss": 0.3644, "step": 970 }, { "epoch": 35.0, "grad_norm": 4.398725509643555, "learning_rate": 1.6666666666666667e-05, "loss": 0.3701, "step": 980 }, { "epoch": 35.0, "eval_accuracy": 0.5772613065326633, "eval_loss": 1.3383643627166748, "eval_runtime": 5.7692, "eval_samples_per_second": 275.948, "eval_steps_per_second": 2.253, "step": 980 }, { "epoch": 35.36, "grad_norm": 5.39536714553833, "learning_rate": 1.626984126984127e-05, "loss": 0.3625, "step": 990 }, { "epoch": 35.71, "grad_norm": 4.096094131469727, "learning_rate": 1.5873015873015872e-05, "loss": 0.3566, "step": 1000 }, { "epoch": 36.0, "eval_accuracy": 0.5753768844221105, "eval_loss": 1.3406319618225098, "eval_runtime": 5.7402, "eval_samples_per_second": 277.342, "eval_steps_per_second": 2.265, "step": 1008 }, { "epoch": 36.07, "grad_norm": 4.44684362411499, "learning_rate": 1.5476190476190476e-05, "loss": 0.3621, "step": 1010 }, { "epoch": 36.43, "grad_norm": 4.413670063018799, "learning_rate": 1.5079365079365079e-05, "loss": 0.3435, "step": 1020 }, { "epoch": 36.79, "grad_norm": 4.409304618835449, "learning_rate": 1.4682539682539683e-05, "loss": 0.3551, "step": 1030 }, { "epoch": 37.0, "eval_accuracy": 0.5766331658291457, "eval_loss": 1.34104323387146, "eval_runtime": 5.5746, "eval_samples_per_second": 285.58, "eval_steps_per_second": 2.332, "step": 1036 }, { "epoch": 37.14, "grad_norm": 4.187081813812256, "learning_rate": 1.4285714285714285e-05, "loss": 0.3558, "step": 1040 }, { "epoch": 37.5, "grad_norm": 4.235673427581787, "learning_rate": 1.388888888888889e-05, "loss": 0.3392, "step": 1050 }, { "epoch": 37.86, "grad_norm": 4.1030497550964355, "learning_rate": 1.3492063492063492e-05, "loss": 0.3487, "step": 1060 }, { "epoch": 38.0, "eval_accuracy": 0.5866834170854272, "eval_loss": 1.3364226818084717, "eval_runtime": 5.6497, "eval_samples_per_second": 281.783, "eval_steps_per_second": 2.301, "step": 1064 }, { "epoch": 38.21, "grad_norm": 5.5322394371032715, "learning_rate": 1.3095238095238096e-05, "loss": 0.3359, "step": 1070 }, { "epoch": 38.57, "grad_norm": 4.478665828704834, "learning_rate": 1.2698412698412699e-05, "loss": 0.3445, "step": 1080 }, { "epoch": 38.93, "grad_norm": 4.051063060760498, "learning_rate": 1.2301587301587301e-05, "loss": 0.3463, "step": 1090 }, { "epoch": 39.0, "eval_accuracy": 0.5810301507537688, "eval_loss": 1.3495649099349976, "eval_runtime": 5.6668, "eval_samples_per_second": 280.934, "eval_steps_per_second": 2.294, "step": 1092 }, { "epoch": 39.29, "grad_norm": 3.2469401359558105, "learning_rate": 1.1904761904761905e-05, "loss": 0.3348, "step": 1100 }, { "epoch": 39.64, "grad_norm": 3.4843826293945312, "learning_rate": 1.1507936507936508e-05, "loss": 0.3162, "step": 1110 }, { "epoch": 40.0, "grad_norm": 4.144495010375977, "learning_rate": 1.1111111111111112e-05, "loss": 0.3242, "step": 1120 }, { "epoch": 40.0, "eval_accuracy": 0.574748743718593, "eval_loss": 1.3639838695526123, "eval_runtime": 5.8819, "eval_samples_per_second": 270.661, "eval_steps_per_second": 2.21, "step": 1120 }, { "epoch": 40.36, "grad_norm": 4.064784526824951, "learning_rate": 1.0714285714285714e-05, "loss": 0.3376, "step": 1130 }, { "epoch": 40.71, "grad_norm": 3.94600248336792, "learning_rate": 1.0317460317460318e-05, "loss": 0.3308, "step": 1140 }, { "epoch": 41.0, "eval_accuracy": 0.571608040201005, "eval_loss": 1.3626537322998047, "eval_runtime": 5.5788, "eval_samples_per_second": 285.366, "eval_steps_per_second": 2.33, "step": 1148 }, { "epoch": 41.07, "grad_norm": 3.8364109992980957, "learning_rate": 9.92063492063492e-06, "loss": 0.3196, "step": 1150 }, { "epoch": 41.43, "grad_norm": 3.929502010345459, "learning_rate": 9.523809523809523e-06, "loss": 0.3224, "step": 1160 }, { "epoch": 41.79, "grad_norm": 4.359261989593506, "learning_rate": 9.126984126984127e-06, "loss": 0.3255, "step": 1170 }, { "epoch": 42.0, "eval_accuracy": 0.5804020100502513, "eval_loss": 1.379508376121521, "eval_runtime": 5.7679, "eval_samples_per_second": 276.011, "eval_steps_per_second": 2.254, "step": 1176 }, { "epoch": 42.14, "grad_norm": 4.006887912750244, "learning_rate": 8.73015873015873e-06, "loss": 0.3268, "step": 1180 }, { "epoch": 42.5, "grad_norm": 3.5152969360351562, "learning_rate": 8.333333333333334e-06, "loss": 0.3166, "step": 1190 }, { "epoch": 42.86, "grad_norm": 3.868173122406006, "learning_rate": 7.936507936507936e-06, "loss": 0.3295, "step": 1200 }, { "epoch": 43.0, "eval_accuracy": 0.5797738693467337, "eval_loss": 1.374683141708374, "eval_runtime": 5.5853, "eval_samples_per_second": 285.032, "eval_steps_per_second": 2.328, "step": 1204 }, { "epoch": 43.21, "grad_norm": 3.3808319568634033, "learning_rate": 7.5396825396825394e-06, "loss": 0.3201, "step": 1210 }, { "epoch": 43.57, "grad_norm": 4.094415187835693, "learning_rate": 7.142857142857143e-06, "loss": 0.3223, "step": 1220 }, { "epoch": 43.93, "grad_norm": 3.389286518096924, "learning_rate": 6.746031746031746e-06, "loss": 0.3147, "step": 1230 }, { "epoch": 44.0, "eval_accuracy": 0.5860552763819096, "eval_loss": 1.3746650218963623, "eval_runtime": 6.0707, "eval_samples_per_second": 262.244, "eval_steps_per_second": 2.141, "step": 1232 }, { "epoch": 44.29, "grad_norm": 3.764704942703247, "learning_rate": 6.349206349206349e-06, "loss": 0.3107, "step": 1240 }, { "epoch": 44.64, "grad_norm": 4.65610933303833, "learning_rate": 5.9523809523809525e-06, "loss": 0.3058, "step": 1250 }, { "epoch": 45.0, "grad_norm": 4.179033279418945, "learning_rate": 5.555555555555556e-06, "loss": 0.3125, "step": 1260 }, { "epoch": 45.0, "eval_accuracy": 0.5816582914572864, "eval_loss": 1.3839383125305176, "eval_runtime": 5.8047, "eval_samples_per_second": 274.262, "eval_steps_per_second": 2.24, "step": 1260 }, { "epoch": 45.36, "grad_norm": 7.927102088928223, "learning_rate": 5.158730158730159e-06, "loss": 0.3077, "step": 1270 }, { "epoch": 45.71, "grad_norm": 3.8612356185913086, "learning_rate": 4.7619047619047615e-06, "loss": 0.3276, "step": 1280 }, { "epoch": 46.0, "eval_accuracy": 0.5841708542713567, "eval_loss": 1.3805787563323975, "eval_runtime": 5.5264, "eval_samples_per_second": 288.072, "eval_steps_per_second": 2.352, "step": 1288 }, { "epoch": 46.07, "grad_norm": 4.099473476409912, "learning_rate": 4.365079365079365e-06, "loss": 0.3076, "step": 1290 }, { "epoch": 46.43, "grad_norm": 3.8270063400268555, "learning_rate": 3.968253968253968e-06, "loss": 0.3097, "step": 1300 }, { "epoch": 46.79, "grad_norm": 3.907658576965332, "learning_rate": 3.5714285714285714e-06, "loss": 0.2989, "step": 1310 }, { "epoch": 47.0, "eval_accuracy": 0.5885678391959799, "eval_loss": 1.3905527591705322, "eval_runtime": 5.6646, "eval_samples_per_second": 281.046, "eval_steps_per_second": 2.295, "step": 1316 }, { "epoch": 47.14, "grad_norm": 4.716944694519043, "learning_rate": 3.1746031746031746e-06, "loss": 0.3173, "step": 1320 }, { "epoch": 47.5, "grad_norm": 3.6047604084014893, "learning_rate": 2.777777777777778e-06, "loss": 0.3172, "step": 1330 }, { "epoch": 47.86, "grad_norm": 4.362003803253174, "learning_rate": 2.3809523809523808e-06, "loss": 0.2941, "step": 1340 }, { "epoch": 48.0, "eval_accuracy": 0.5866834170854272, "eval_loss": 1.3876359462738037, "eval_runtime": 5.5384, "eval_samples_per_second": 287.447, "eval_steps_per_second": 2.347, "step": 1344 }, { "epoch": 48.21, "grad_norm": 4.251241207122803, "learning_rate": 1.984126984126984e-06, "loss": 0.3138, "step": 1350 }, { "epoch": 48.57, "grad_norm": 3.9117441177368164, "learning_rate": 1.5873015873015873e-06, "loss": 0.3042, "step": 1360 }, { "epoch": 48.93, "grad_norm": 3.703327178955078, "learning_rate": 1.1904761904761904e-06, "loss": 0.3131, "step": 1370 }, { "epoch": 49.0, "eval_accuracy": 0.582286432160804, "eval_loss": 1.3895643949508667, "eval_runtime": 5.5003, "eval_samples_per_second": 289.44, "eval_steps_per_second": 2.364, "step": 1372 }, { "epoch": 49.29, "grad_norm": 6.085123538970947, "learning_rate": 7.936507936507937e-07, "loss": 0.2984, "step": 1380 }, { "epoch": 49.64, "grad_norm": 3.707207679748535, "learning_rate": 3.9682539682539683e-07, "loss": 0.3092, "step": 1390 }, { "epoch": 50.0, "grad_norm": 4.588694095611572, "learning_rate": 0.0, "loss": 0.2975, "step": 1400 }, { "epoch": 50.0, "eval_accuracy": 0.5835427135678392, "eval_loss": 1.3905625343322754, "eval_runtime": 5.7976, "eval_samples_per_second": 274.595, "eval_steps_per_second": 2.242, "step": 1400 }, { "epoch": 50.0, "step": 1400, "total_flos": 1.7807825640923136e+19, "train_loss": 0.5565686808313642, "train_runtime": 3248.0342, "train_samples_per_second": 220.564, "train_steps_per_second": 0.431 } ], "logging_steps": 10, "max_steps": 1400, "num_input_tokens_seen": 0, "num_train_epochs": 50, "save_steps": 500, "total_flos": 1.7807825640923136e+19, "train_batch_size": 128, "trial_name": null, "trial_params": null }