{ "best_metric": 0.0002666182699613273, "best_model_checkpoint": "autotrain-3b9gd-7t9lc/checkpoint-494", "epoch": 2.0, "eval_steps": 500, "global_step": 494, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02, "grad_norm": 8.240466117858887, "learning_rate": 3.3333333333333333e-06, "loss": 1.1781, "step": 5 }, { "epoch": 0.04, "grad_norm": 14.987759590148926, "learning_rate": 6.666666666666667e-06, "loss": 1.1781, "step": 10 }, { "epoch": 0.06, "grad_norm": 8.519774436950684, "learning_rate": 1e-05, "loss": 1.0057, "step": 15 }, { "epoch": 0.08, "grad_norm": 6.639293193817139, "learning_rate": 1.3333333333333333e-05, "loss": 0.909, "step": 20 }, { "epoch": 0.1, "grad_norm": 8.727118492126465, "learning_rate": 1.6666666666666667e-05, "loss": 0.8319, "step": 25 }, { "epoch": 0.12, "grad_norm": 7.4621148109436035, "learning_rate": 2e-05, "loss": 0.6461, "step": 30 }, { "epoch": 0.14, "grad_norm": 6.218155860900879, "learning_rate": 2.3333333333333336e-05, "loss": 0.5342, "step": 35 }, { "epoch": 0.16, "grad_norm": 2.2607550621032715, "learning_rate": 2.6666666666666667e-05, "loss": 0.3212, "step": 40 }, { "epoch": 0.18, "grad_norm": 7.647438049316406, "learning_rate": 3e-05, "loss": 0.2912, "step": 45 }, { "epoch": 0.2, "grad_norm": 3.327169418334961, "learning_rate": 3.3333333333333335e-05, "loss": 0.1935, "step": 50 }, { "epoch": 0.22, "grad_norm": 10.839067459106445, "learning_rate": 3.6666666666666666e-05, "loss": 0.2682, "step": 55 }, { "epoch": 0.24, "grad_norm": 5.581568717956543, "learning_rate": 4e-05, "loss": 0.3104, "step": 60 }, { "epoch": 0.26, "grad_norm": 8.139863967895508, "learning_rate": 4.3333333333333334e-05, "loss": 0.1683, "step": 65 }, { "epoch": 0.28, "grad_norm": 7.741869926452637, "learning_rate": 4.666666666666667e-05, "loss": 0.4055, "step": 70 }, { "epoch": 0.3, "grad_norm": 23.573352813720703, "learning_rate": 5e-05, "loss": 0.3234, "step": 75 }, { "epoch": 0.32, "grad_norm": 8.024917602539062, "learning_rate": 4.9624624624624625e-05, "loss": 0.5331, "step": 80 }, { "epoch": 0.34, "grad_norm": 0.1634410321712494, "learning_rate": 4.9249249249249253e-05, "loss": 0.0505, "step": 85 }, { "epoch": 0.36, "grad_norm": 1.8996868133544922, "learning_rate": 4.8873873873873876e-05, "loss": 0.1691, "step": 90 }, { "epoch": 0.38, "grad_norm": 6.981603622436523, "learning_rate": 4.8498498498498504e-05, "loss": 0.2798, "step": 95 }, { "epoch": 0.4, "grad_norm": 5.159909248352051, "learning_rate": 4.812312312312313e-05, "loss": 0.2114, "step": 100 }, { "epoch": 0.43, "grad_norm": 4.676971435546875, "learning_rate": 4.774774774774775e-05, "loss": 0.2634, "step": 105 }, { "epoch": 0.45, "grad_norm": 6.247714042663574, "learning_rate": 4.737237237237238e-05, "loss": 0.1426, "step": 110 }, { "epoch": 0.47, "grad_norm": 1.8779627084732056, "learning_rate": 4.6996996996997e-05, "loss": 0.4049, "step": 115 }, { "epoch": 0.49, "grad_norm": 0.6359833478927612, "learning_rate": 4.662162162162162e-05, "loss": 0.0321, "step": 120 }, { "epoch": 0.51, "grad_norm": 0.4598715007305145, "learning_rate": 4.624624624624625e-05, "loss": 0.0668, "step": 125 }, { "epoch": 0.53, "grad_norm": 0.07359237223863602, "learning_rate": 4.587087087087087e-05, "loss": 0.2143, "step": 130 }, { "epoch": 0.55, "grad_norm": 8.209424018859863, "learning_rate": 4.54954954954955e-05, "loss": 0.1671, "step": 135 }, { "epoch": 0.57, "grad_norm": 0.7971270084381104, "learning_rate": 4.5120120120120124e-05, "loss": 0.2135, "step": 140 }, { "epoch": 0.59, "grad_norm": 12.232625961303711, "learning_rate": 4.4744744744744746e-05, "loss": 0.1478, "step": 145 }, { "epoch": 0.61, "grad_norm": 13.341341972351074, "learning_rate": 4.4369369369369375e-05, "loss": 0.3675, "step": 150 }, { "epoch": 0.63, "grad_norm": 22.468730926513672, "learning_rate": 4.3993993993994e-05, "loss": 0.1616, "step": 155 }, { "epoch": 0.65, "grad_norm": 6.675795078277588, "learning_rate": 4.3618618618618626e-05, "loss": 0.1661, "step": 160 }, { "epoch": 0.67, "grad_norm": 1.265025019645691, "learning_rate": 4.324324324324325e-05, "loss": 0.1427, "step": 165 }, { "epoch": 0.69, "grad_norm": 0.23774471879005432, "learning_rate": 4.286786786786787e-05, "loss": 0.0339, "step": 170 }, { "epoch": 0.71, "grad_norm": 9.141915321350098, "learning_rate": 4.24924924924925e-05, "loss": 0.4787, "step": 175 }, { "epoch": 0.73, "grad_norm": 14.628870964050293, "learning_rate": 4.2117117117117115e-05, "loss": 0.0911, "step": 180 }, { "epoch": 0.75, "grad_norm": 8.062237739562988, "learning_rate": 4.1741741741741744e-05, "loss": 0.0447, "step": 185 }, { "epoch": 0.77, "grad_norm": 16.428728103637695, "learning_rate": 4.1366366366366366e-05, "loss": 0.3648, "step": 190 }, { "epoch": 0.79, "grad_norm": 0.037159789353609085, "learning_rate": 4.099099099099099e-05, "loss": 0.249, "step": 195 }, { "epoch": 0.81, "grad_norm": 8.085445404052734, "learning_rate": 4.061561561561562e-05, "loss": 0.0837, "step": 200 }, { "epoch": 0.83, "grad_norm": 2.010667085647583, "learning_rate": 4.024024024024024e-05, "loss": 0.0347, "step": 205 }, { "epoch": 0.85, "grad_norm": 0.7477858662605286, "learning_rate": 3.986486486486487e-05, "loss": 0.1399, "step": 210 }, { "epoch": 0.87, "grad_norm": 7.963656425476074, "learning_rate": 3.948948948948949e-05, "loss": 0.09, "step": 215 }, { "epoch": 0.89, "grad_norm": 0.43208637833595276, "learning_rate": 3.911411411411411e-05, "loss": 0.1049, "step": 220 }, { "epoch": 0.91, "grad_norm": 5.871271133422852, "learning_rate": 3.873873873873874e-05, "loss": 0.322, "step": 225 }, { "epoch": 0.93, "grad_norm": 0.027932079508900642, "learning_rate": 3.836336336336336e-05, "loss": 0.0024, "step": 230 }, { "epoch": 0.95, "grad_norm": 0.017657727003097534, "learning_rate": 3.7987987987987985e-05, "loss": 0.1332, "step": 235 }, { "epoch": 0.97, "grad_norm": 0.7482669949531555, "learning_rate": 3.7612612612612614e-05, "loss": 0.0361, "step": 240 }, { "epoch": 0.99, "grad_norm": 0.05272696912288666, "learning_rate": 3.7237237237237236e-05, "loss": 0.0071, "step": 245 }, { "epoch": 1.0, "eval_accuracy": 0.9954128440366973, "eval_f1_macro": 0.9953370832314339, "eval_f1_micro": 0.9954128440366973, "eval_f1_weighted": 0.9954123953053728, "eval_loss": 0.008562826551496983, "eval_precision_macro": 0.9954337899543378, "eval_precision_micro": 0.9954128440366973, "eval_precision_weighted": 0.9954756817896192, "eval_recall_macro": 0.9953051643192489, "eval_recall_micro": 0.9954128440366973, "eval_recall_weighted": 0.9954128440366973, "eval_runtime": 61.5673, "eval_samples_per_second": 3.541, "eval_steps_per_second": 0.227, "step": 247 }, { "epoch": 1.01, "grad_norm": 2.773138999938965, "learning_rate": 3.6861861861861865e-05, "loss": 0.257, "step": 250 }, { "epoch": 1.03, "grad_norm": 4.275286674499512, "learning_rate": 3.648648648648649e-05, "loss": 0.382, "step": 255 }, { "epoch": 1.05, "grad_norm": 24.60214614868164, "learning_rate": 3.611111111111111e-05, "loss": 0.3858, "step": 260 }, { "epoch": 1.07, "grad_norm": 8.784448623657227, "learning_rate": 3.573573573573574e-05, "loss": 0.5298, "step": 265 }, { "epoch": 1.09, "grad_norm": 1.1353392601013184, "learning_rate": 3.536036036036036e-05, "loss": 0.153, "step": 270 }, { "epoch": 1.11, "grad_norm": 0.023551814258098602, "learning_rate": 3.498498498498499e-05, "loss": 0.0052, "step": 275 }, { "epoch": 1.13, "grad_norm": 0.15712416172027588, "learning_rate": 3.460960960960961e-05, "loss": 0.3094, "step": 280 }, { "epoch": 1.15, "grad_norm": 0.023001644760370255, "learning_rate": 3.4234234234234234e-05, "loss": 0.0407, "step": 285 }, { "epoch": 1.17, "grad_norm": 0.011525897309184074, "learning_rate": 3.385885885885886e-05, "loss": 0.0422, "step": 290 }, { "epoch": 1.19, "grad_norm": 0.9765132665634155, "learning_rate": 3.3483483483483485e-05, "loss": 0.1107, "step": 295 }, { "epoch": 1.21, "grad_norm": 0.193386971950531, "learning_rate": 3.310810810810811e-05, "loss": 0.1098, "step": 300 }, { "epoch": 1.23, "grad_norm": 0.0348842591047287, "learning_rate": 3.2732732732732736e-05, "loss": 0.0901, "step": 305 }, { "epoch": 1.26, "grad_norm": 0.02413288690149784, "learning_rate": 3.235735735735736e-05, "loss": 0.0014, "step": 310 }, { "epoch": 1.28, "grad_norm": 1.470839023590088, "learning_rate": 3.198198198198199e-05, "loss": 0.0368, "step": 315 }, { "epoch": 1.3, "grad_norm": 0.6575441360473633, "learning_rate": 3.160660660660661e-05, "loss": 0.3384, "step": 320 }, { "epoch": 1.32, "grad_norm": 0.11603262275457382, "learning_rate": 3.123123123123123e-05, "loss": 0.2287, "step": 325 }, { "epoch": 1.34, "grad_norm": 0.09468206018209457, "learning_rate": 3.085585585585586e-05, "loss": 0.0053, "step": 330 }, { "epoch": 1.36, "grad_norm": 0.18447920680046082, "learning_rate": 3.0480480480480482e-05, "loss": 0.0894, "step": 335 }, { "epoch": 1.38, "grad_norm": 22.559310913085938, "learning_rate": 3.0105105105105108e-05, "loss": 0.2127, "step": 340 }, { "epoch": 1.4, "grad_norm": 0.36875221133232117, "learning_rate": 2.9729729729729733e-05, "loss": 0.0123, "step": 345 }, { "epoch": 1.42, "grad_norm": 0.3973434269428253, "learning_rate": 2.935435435435436e-05, "loss": 0.0268, "step": 350 }, { "epoch": 1.44, "grad_norm": 0.054519250988960266, "learning_rate": 2.897897897897898e-05, "loss": 0.1053, "step": 355 }, { "epoch": 1.46, "grad_norm": 5.856131076812744, "learning_rate": 2.8603603603603606e-05, "loss": 0.105, "step": 360 }, { "epoch": 1.48, "grad_norm": 2.6053731441497803, "learning_rate": 2.8228228228228232e-05, "loss": 0.0072, "step": 365 }, { "epoch": 1.5, "grad_norm": 14.849920272827148, "learning_rate": 2.7852852852852857e-05, "loss": 0.256, "step": 370 }, { "epoch": 1.52, "grad_norm": 0.036322303116321564, "learning_rate": 2.7477477477477483e-05, "loss": 0.1195, "step": 375 }, { "epoch": 1.54, "grad_norm": 7.0286431312561035, "learning_rate": 2.7102102102102105e-05, "loss": 0.2874, "step": 380 }, { "epoch": 1.56, "grad_norm": 0.007908895611763, "learning_rate": 2.672672672672673e-05, "loss": 0.1749, "step": 385 }, { "epoch": 1.58, "grad_norm": 0.2694431245326996, "learning_rate": 2.635135135135135e-05, "loss": 0.0078, "step": 390 }, { "epoch": 1.6, "grad_norm": 14.261417388916016, "learning_rate": 2.5975975975975975e-05, "loss": 0.0251, "step": 395 }, { "epoch": 1.62, "grad_norm": 0.8614434003829956, "learning_rate": 2.56006006006006e-05, "loss": 0.01, "step": 400 }, { "epoch": 1.64, "grad_norm": 10.162217140197754, "learning_rate": 2.5225225225225222e-05, "loss": 0.0645, "step": 405 }, { "epoch": 1.66, "grad_norm": 0.010031257756054401, "learning_rate": 2.484984984984985e-05, "loss": 0.1714, "step": 410 }, { "epoch": 1.68, "grad_norm": 1.1147041320800781, "learning_rate": 2.4474474474474477e-05, "loss": 0.062, "step": 415 }, { "epoch": 1.7, "grad_norm": 14.216385841369629, "learning_rate": 2.4099099099099102e-05, "loss": 0.2521, "step": 420 }, { "epoch": 1.72, "grad_norm": 5.073903560638428, "learning_rate": 2.3723723723723725e-05, "loss": 0.1764, "step": 425 }, { "epoch": 1.74, "grad_norm": 0.5860055088996887, "learning_rate": 2.334834834834835e-05, "loss": 0.0095, "step": 430 }, { "epoch": 1.76, "grad_norm": 6.916937828063965, "learning_rate": 2.2972972972972976e-05, "loss": 0.2236, "step": 435 }, { "epoch": 1.78, "grad_norm": 0.4593968689441681, "learning_rate": 2.2597597597597598e-05, "loss": 0.2062, "step": 440 }, { "epoch": 1.8, "grad_norm": 10.855243682861328, "learning_rate": 2.2222222222222223e-05, "loss": 0.0295, "step": 445 }, { "epoch": 1.82, "grad_norm": 8.221331596374512, "learning_rate": 2.1846846846846845e-05, "loss": 0.2937, "step": 450 }, { "epoch": 1.84, "grad_norm": 5.3991265296936035, "learning_rate": 2.147147147147147e-05, "loss": 0.1518, "step": 455 }, { "epoch": 1.86, "grad_norm": 2.2880308628082275, "learning_rate": 2.1096096096096096e-05, "loss": 0.0107, "step": 460 }, { "epoch": 1.88, "grad_norm": 0.38851749897003174, "learning_rate": 2.0720720720720722e-05, "loss": 0.0047, "step": 465 }, { "epoch": 1.9, "grad_norm": 0.007677082903683186, "learning_rate": 2.0345345345345344e-05, "loss": 0.1443, "step": 470 }, { "epoch": 1.92, "grad_norm": 0.004121085163205862, "learning_rate": 1.996996996996997e-05, "loss": 0.0866, "step": 475 }, { "epoch": 1.94, "grad_norm": 4.384052276611328, "learning_rate": 1.9594594594594595e-05, "loss": 0.1988, "step": 480 }, { "epoch": 1.96, "grad_norm": 13.786602973937988, "learning_rate": 1.921921921921922e-05, "loss": 0.2474, "step": 485 }, { "epoch": 1.98, "grad_norm": 0.005240537691861391, "learning_rate": 1.8843843843843846e-05, "loss": 0.0088, "step": 490 }, { "epoch": 2.0, "eval_accuracy": 1.0, "eval_f1_macro": 1.0, "eval_f1_micro": 1.0, "eval_f1_weighted": 1.0, "eval_loss": 0.0002666182699613273, "eval_precision_macro": 1.0, "eval_precision_micro": 1.0, "eval_precision_weighted": 1.0, "eval_recall_macro": 1.0, "eval_recall_micro": 1.0, "eval_recall_weighted": 1.0, "eval_runtime": 57.4601, "eval_samples_per_second": 3.794, "eval_steps_per_second": 0.244, "step": 494 } ], "logging_steps": 5, "max_steps": 741, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "total_flos": 3.0532117555408896e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }