{ "best_metric": null, "best_model_checkpoint": null, "epoch": 5.0, "global_step": 122720, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.04, "learning_rate": 2.975700782268579e-05, "loss": 0.5382, "step": 1000 }, { "epoch": 0.08, "learning_rate": 2.951254889178618e-05, "loss": 0.4368, "step": 2000 }, { "epoch": 0.12, "learning_rate": 2.9268334419817473e-05, "loss": 0.4277, "step": 3000 }, { "epoch": 0.16, "learning_rate": 2.9023875488917863e-05, "loss": 0.4026, "step": 4000 }, { "epoch": 0.2, "learning_rate": 2.877966101694915e-05, "loss": 0.4137, "step": 5000 }, { "epoch": 0.24, "learning_rate": 2.8535202086049545e-05, "loss": 0.4048, "step": 6000 }, { "epoch": 0.29, "learning_rate": 2.8290743155149935e-05, "loss": 0.3949, "step": 7000 }, { "epoch": 0.33, "learning_rate": 2.804628422425033e-05, "loss": 0.3883, "step": 8000 }, { "epoch": 0.37, "learning_rate": 2.780182529335072e-05, "loss": 0.3759, "step": 9000 }, { "epoch": 0.41, "learning_rate": 2.755736636245111e-05, "loss": 0.3868, "step": 10000 }, { "epoch": 0.45, "learning_rate": 2.73129074315515e-05, "loss": 0.3809, "step": 11000 }, { "epoch": 0.49, "learning_rate": 2.7068448500651893e-05, "loss": 0.3754, "step": 12000 }, { "epoch": 0.53, "learning_rate": 2.6823989569752283e-05, "loss": 0.3733, "step": 13000 }, { "epoch": 0.57, "learning_rate": 2.6579530638852673e-05, "loss": 0.3825, "step": 14000 }, { "epoch": 0.61, "learning_rate": 2.6335071707953063e-05, "loss": 0.3719, "step": 15000 }, { "epoch": 0.65, "learning_rate": 2.6091101694915253e-05, "loss": 0.3725, "step": 16000 }, { "epoch": 0.69, "learning_rate": 2.5846642764015644e-05, "loss": 0.3694, "step": 17000 }, { "epoch": 0.73, "learning_rate": 2.5602428292046935e-05, "loss": 0.3714, "step": 18000 }, { "epoch": 0.77, "learning_rate": 2.5358213820078227e-05, "loss": 0.3818, "step": 19000 }, { "epoch": 0.81, "learning_rate": 2.511399934810952e-05, "loss": 0.374, "step": 20000 }, { "epoch": 0.86, "learning_rate": 2.486954041720991e-05, "loss": 0.4574, "step": 21000 }, { "epoch": 0.9, "learning_rate": 2.46250814863103e-05, "loss": 0.426, "step": 22000 }, { "epoch": 0.94, "learning_rate": 2.4380622555410693e-05, "loss": 0.3714, "step": 23000 }, { "epoch": 0.98, "learning_rate": 2.4136163624511083e-05, "loss": 0.3676, "step": 24000 }, { "epoch": 1.0, "eval_accuracy": 0.8680590932246561, "eval_loss": 0.3760605454444885, "eval_runtime": 67.1189, "eval_samples_per_second": 146.233, "eval_steps_per_second": 18.281, "step": 24544 }, { "epoch": 1.02, "learning_rate": 2.3892193611473273e-05, "loss": 0.3246, "step": 25000 }, { "epoch": 1.06, "learning_rate": 2.3647734680573663e-05, "loss": 0.2698, "step": 26000 }, { "epoch": 1.1, "learning_rate": 2.3403275749674057e-05, "loss": 0.2783, "step": 27000 }, { "epoch": 1.14, "learning_rate": 2.3158816818774447e-05, "loss": 0.2761, "step": 28000 }, { "epoch": 1.18, "learning_rate": 2.2914357887874837e-05, "loss": 0.2831, "step": 29000 }, { "epoch": 1.22, "learning_rate": 2.2669898956975228e-05, "loss": 0.2819, "step": 30000 }, { "epoch": 1.26, "learning_rate": 2.2425440026075618e-05, "loss": 0.2822, "step": 31000 }, { "epoch": 1.3, "learning_rate": 2.2180981095176008e-05, "loss": 0.2769, "step": 32000 }, { "epoch": 1.34, "learning_rate": 2.1936522164276405e-05, "loss": 0.2853, "step": 33000 }, { "epoch": 1.39, "learning_rate": 2.1692063233376795e-05, "loss": 0.2891, "step": 34000 }, { "epoch": 1.43, "learning_rate": 2.1447848761408084e-05, "loss": 0.281, "step": 35000 }, { "epoch": 1.47, "learning_rate": 2.1203389830508474e-05, "loss": 0.2733, "step": 36000 }, { "epoch": 1.51, "learning_rate": 2.0959175358539766e-05, "loss": 0.2779, "step": 37000 }, { "epoch": 1.55, "learning_rate": 2.0714716427640156e-05, "loss": 0.2799, "step": 38000 }, { "epoch": 1.59, "learning_rate": 2.0470257496740546e-05, "loss": 0.2824, "step": 39000 }, { "epoch": 1.63, "learning_rate": 2.022628748370274e-05, "loss": 0.2823, "step": 40000 }, { "epoch": 1.67, "learning_rate": 1.998182855280313e-05, "loss": 0.2789, "step": 41000 }, { "epoch": 1.71, "learning_rate": 1.973736962190352e-05, "loss": 0.2806, "step": 42000 }, { "epoch": 1.75, "learning_rate": 1.9492910691003913e-05, "loss": 0.2808, "step": 43000 }, { "epoch": 1.79, "learning_rate": 1.9248696219035202e-05, "loss": 0.2745, "step": 44000 }, { "epoch": 1.83, "learning_rate": 1.9004481747066494e-05, "loss": 0.2918, "step": 45000 }, { "epoch": 1.87, "learning_rate": 1.8760511734028684e-05, "loss": 0.2789, "step": 46000 }, { "epoch": 1.91, "learning_rate": 1.8516052803129074e-05, "loss": 0.2754, "step": 47000 }, { "epoch": 1.96, "learning_rate": 1.8271593872229467e-05, "loss": 0.2709, "step": 48000 }, { "epoch": 2.0, "learning_rate": 1.8027134941329858e-05, "loss": 0.2782, "step": 49000 }, { "epoch": 2.0, "eval_accuracy": 0.8881304126337239, "eval_loss": 0.36052629351615906, "eval_runtime": 68.3607, "eval_samples_per_second": 143.577, "eval_steps_per_second": 17.949, "step": 49088 }, { "epoch": 2.04, "learning_rate": 1.7782676010430248e-05, "loss": 0.2017, "step": 50000 }, { "epoch": 2.08, "learning_rate": 1.7538217079530638e-05, "loss": 0.1921, "step": 51000 }, { "epoch": 2.12, "learning_rate": 1.7293758148631028e-05, "loss": 0.2017, "step": 52000 }, { "epoch": 2.16, "learning_rate": 1.704929921773142e-05, "loss": 0.1894, "step": 53000 }, { "epoch": 2.2, "learning_rate": 1.6805084745762714e-05, "loss": 0.1978, "step": 54000 }, { "epoch": 2.24, "learning_rate": 1.6560625814863104e-05, "loss": 0.1983, "step": 55000 }, { "epoch": 2.28, "learning_rate": 1.6316166883963494e-05, "loss": 0.2003, "step": 56000 }, { "epoch": 2.32, "learning_rate": 1.6071707953063884e-05, "loss": 0.2059, "step": 57000 }, { "epoch": 2.36, "learning_rate": 1.5827249022164274e-05, "loss": 0.1971, "step": 58000 }, { "epoch": 2.4, "learning_rate": 1.5582790091264668e-05, "loss": 0.2014, "step": 59000 }, { "epoch": 2.44, "learning_rate": 1.533833116036506e-05, "loss": 0.1922, "step": 60000 }, { "epoch": 2.49, "learning_rate": 1.5094116688396348e-05, "loss": 0.1927, "step": 61000 }, { "epoch": 2.53, "learning_rate": 1.4849657757496742e-05, "loss": 0.1965, "step": 62000 }, { "epoch": 2.57, "learning_rate": 1.4605443285528032e-05, "loss": 0.1971, "step": 63000 }, { "epoch": 2.61, "learning_rate": 1.4360984354628422e-05, "loss": 0.2095, "step": 64000 }, { "epoch": 2.65, "learning_rate": 1.4116769882659712e-05, "loss": 0.199, "step": 65000 }, { "epoch": 2.69, "learning_rate": 1.3872555410691004e-05, "loss": 0.2072, "step": 66000 }, { "epoch": 2.73, "learning_rate": 1.3628096479791396e-05, "loss": 0.1955, "step": 67000 }, { "epoch": 2.77, "learning_rate": 1.3383637548891786e-05, "loss": 0.1924, "step": 68000 }, { "epoch": 2.81, "learning_rate": 1.3139178617992178e-05, "loss": 0.2055, "step": 69000 }, { "epoch": 2.85, "learning_rate": 1.2894719687092568e-05, "loss": 0.2016, "step": 70000 }, { "epoch": 2.89, "learning_rate": 1.265026075619296e-05, "loss": 0.2011, "step": 71000 }, { "epoch": 2.93, "learning_rate": 1.2405801825293352e-05, "loss": 0.1915, "step": 72000 }, { "epoch": 2.97, "learning_rate": 1.2161587353324642e-05, "loss": 0.1986, "step": 73000 }, { "epoch": 3.0, "eval_accuracy": 0.8893530310748854, "eval_loss": 0.4672204256057739, "eval_runtime": 67.0424, "eval_samples_per_second": 146.4, "eval_steps_per_second": 18.302, "step": 73632 }, { "epoch": 3.01, "learning_rate": 1.1917128422425032e-05, "loss": 0.1697, "step": 74000 }, { "epoch": 3.06, "learning_rate": 1.1672669491525423e-05, "loss": 0.1246, "step": 75000 }, { "epoch": 3.1, "learning_rate": 1.1428455019556716e-05, "loss": 0.1231, "step": 76000 }, { "epoch": 3.14, "learning_rate": 1.1183996088657106e-05, "loss": 0.1314, "step": 77000 }, { "epoch": 3.18, "learning_rate": 1.0939537157757497e-05, "loss": 0.1272, "step": 78000 }, { "epoch": 3.22, "learning_rate": 1.0695567144719687e-05, "loss": 0.1257, "step": 79000 }, { "epoch": 3.26, "learning_rate": 1.0451108213820078e-05, "loss": 0.1278, "step": 80000 }, { "epoch": 3.3, "learning_rate": 1.0206893741851369e-05, "loss": 0.1365, "step": 81000 }, { "epoch": 3.34, "learning_rate": 9.96243481095176e-06, "loss": 0.1284, "step": 82000 }, { "epoch": 3.38, "learning_rate": 9.717975880052152e-06, "loss": 0.1435, "step": 83000 }, { "epoch": 3.42, "learning_rate": 9.473761408083443e-06, "loss": 0.1386, "step": 84000 }, { "epoch": 3.46, "learning_rate": 9.229546936114733e-06, "loss": 0.1301, "step": 85000 }, { "epoch": 3.5, "learning_rate": 8.985088005215123e-06, "loss": 0.1261, "step": 86000 }, { "epoch": 3.54, "learning_rate": 8.740629074315516e-06, "loss": 0.1291, "step": 87000 }, { "epoch": 3.59, "learning_rate": 8.496170143415907e-06, "loss": 0.1281, "step": 88000 }, { "epoch": 3.63, "learning_rate": 8.251711212516297e-06, "loss": 0.1166, "step": 89000 }, { "epoch": 3.67, "learning_rate": 8.007252281616689e-06, "loss": 0.124, "step": 90000 }, { "epoch": 3.71, "learning_rate": 7.762793350717079e-06, "loss": 0.1257, "step": 91000 }, { "epoch": 3.75, "learning_rate": 7.518334419817472e-06, "loss": 0.1232, "step": 92000 }, { "epoch": 3.79, "learning_rate": 7.274119947848762e-06, "loss": 0.1271, "step": 93000 }, { "epoch": 3.83, "learning_rate": 7.029905475880053e-06, "loss": 0.128, "step": 94000 }, { "epoch": 3.87, "learning_rate": 6.785446544980443e-06, "loss": 0.1212, "step": 95000 }, { "epoch": 3.91, "learning_rate": 6.540987614080835e-06, "loss": 0.1193, "step": 96000 }, { "epoch": 3.95, "learning_rate": 6.296773142112125e-06, "loss": 0.1199, "step": 97000 }, { "epoch": 3.99, "learning_rate": 6.052314211212517e-06, "loss": 0.1299, "step": 98000 }, { "epoch": 4.0, "eval_accuracy": 0.8966887417218543, "eval_loss": 0.5247582197189331, "eval_runtime": 66.9471, "eval_samples_per_second": 146.608, "eval_steps_per_second": 18.328, "step": 98176 }, { "epoch": 4.03, "learning_rate": 5.807855280312908e-06, "loss": 0.0766, "step": 99000 }, { "epoch": 4.07, "learning_rate": 5.563640808344198e-06, "loss": 0.0642, "step": 100000 }, { "epoch": 4.12, "learning_rate": 5.31918187744459e-06, "loss": 0.0744, "step": 101000 }, { "epoch": 4.16, "learning_rate": 5.07496740547588e-06, "loss": 0.0684, "step": 102000 }, { "epoch": 4.2, "learning_rate": 4.830508474576271e-06, "loss": 0.0732, "step": 103000 }, { "epoch": 4.24, "learning_rate": 4.586294002607562e-06, "loss": 0.0637, "step": 104000 }, { "epoch": 4.28, "learning_rate": 4.342323989569753e-06, "loss": 0.071, "step": 105000 }, { "epoch": 4.32, "learning_rate": 4.097865058670143e-06, "loss": 0.066, "step": 106000 }, { "epoch": 4.36, "learning_rate": 3.853406127770535e-06, "loss": 0.0715, "step": 107000 }, { "epoch": 4.4, "learning_rate": 3.6089471968709256e-06, "loss": 0.069, "step": 108000 }, { "epoch": 4.44, "learning_rate": 3.3647327249022166e-06, "loss": 0.0707, "step": 109000 }, { "epoch": 4.48, "learning_rate": 3.1202737940026076e-06, "loss": 0.0702, "step": 110000 }, { "epoch": 4.52, "learning_rate": 2.875814863102999e-06, "loss": 0.0632, "step": 111000 }, { "epoch": 4.56, "learning_rate": 2.63135593220339e-06, "loss": 0.0672, "step": 112000 }, { "epoch": 4.6, "learning_rate": 2.3871414602346807e-06, "loss": 0.0665, "step": 113000 }, { "epoch": 4.64, "learning_rate": 2.1426825293350717e-06, "loss": 0.0637, "step": 114000 }, { "epoch": 4.69, "learning_rate": 1.898223598435463e-06, "loss": 0.0712, "step": 115000 }, { "epoch": 4.73, "learning_rate": 1.653764667535854e-06, "loss": 0.064, "step": 116000 }, { "epoch": 4.77, "learning_rate": 1.4093057366362453e-06, "loss": 0.0644, "step": 117000 }, { "epoch": 4.81, "learning_rate": 1.1653357235984355e-06, "loss": 0.0683, "step": 118000 }, { "epoch": 4.85, "learning_rate": 9.208767926988267e-07, "loss": 0.0622, "step": 119000 }, { "epoch": 4.89, "learning_rate": 6.766623207301173e-07, "loss": 0.0678, "step": 120000 }, { "epoch": 4.93, "learning_rate": 4.322033898305085e-07, "loss": 0.0578, "step": 121000 }, { "epoch": 4.97, "learning_rate": 1.877444589308996e-07, "loss": 0.0643, "step": 122000 }, { "epoch": 5.0, "eval_accuracy": 0.8999490575649516, "eval_loss": 0.6488531231880188, "eval_runtime": 67.1753, "eval_samples_per_second": 146.11, "eval_steps_per_second": 18.266, "step": 122720 }, { "epoch": 5.0, "step": 122720, "total_flos": 4.5746877181130496e+17, "train_loss": 0.2137734965519613, "train_runtime": 32769.5489, "train_samples_per_second": 59.919, "train_steps_per_second": 3.745 } ], "max_steps": 122720, "num_train_epochs": 5, "total_flos": 4.5746877181130496e+17, "trial_name": null, "trial_params": null }