{ "best_metric": null, "best_model_checkpoint": null, "epoch": 10.0, "eval_steps": 1, "global_step": 40, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.25, "grad_norm": 32.508975982666016, "learning_rate": 2.5e-05, "loss": 1.9242, "step": 1 }, { "epoch": 0.25, "eval_accuracy": 0.4838709677419355, "eval_f1": 0.25513196480938416, "eval_loss": 1.845262050628662, "eval_runtime": 1.664, "eval_samples_per_second": 37.261, "eval_steps_per_second": 1.202, "step": 1 }, { "epoch": 0.5, "grad_norm": 31.633163452148438, "learning_rate": 5e-05, "loss": 1.8427, "step": 2 }, { "epoch": 0.5, "eval_accuracy": 0.4838709677419355, "eval_f1": 0.25513196480938416, "eval_loss": 1.8311964273452759, "eval_runtime": 1.6638, "eval_samples_per_second": 37.264, "eval_steps_per_second": 1.202, "step": 2 }, { "epoch": 0.75, "grad_norm": 32.75844192504883, "learning_rate": 4.868421052631579e-05, "loss": 1.811, "step": 3 }, { "epoch": 0.75, "eval_accuracy": 0.4838709677419355, "eval_f1": 0.25513196480938416, "eval_loss": 1.7797694206237793, "eval_runtime": 1.6596, "eval_samples_per_second": 37.359, "eval_steps_per_second": 1.205, "step": 3 }, { "epoch": 1.0, "grad_norm": 33.511817932128906, "learning_rate": 4.736842105263158e-05, "loss": 1.9235, "step": 4 }, { "epoch": 1.0, "eval_accuracy": 0.4838709677419355, "eval_f1": 0.25513196480938416, "eval_loss": 1.7310200929641724, "eval_runtime": 1.6611, "eval_samples_per_second": 37.325, "eval_steps_per_second": 1.204, "step": 4 }, { "epoch": 1.25, "grad_norm": 30.517168045043945, "learning_rate": 4.605263157894737e-05, "loss": 1.6903, "step": 5 }, { "epoch": 1.25, "eval_accuracy": 0.4838709677419355, "eval_f1": 0.25513196480938416, "eval_loss": 1.6759703159332275, "eval_runtime": 1.6597, "eval_samples_per_second": 37.357, "eval_steps_per_second": 1.205, "step": 5 }, { "epoch": 1.5, "grad_norm": 29.271242141723633, "learning_rate": 4.473684210526316e-05, "loss": 1.6032, "step": 6 }, { "epoch": 1.5, "eval_accuracy": 0.4838709677419355, "eval_f1": 0.25513196480938416, "eval_loss": 1.620038390159607, "eval_runtime": 1.6099, "eval_samples_per_second": 38.511, "eval_steps_per_second": 1.242, "step": 6 }, { "epoch": 1.75, "grad_norm": 31.15843963623047, "learning_rate": 4.342105263157895e-05, "loss": 1.7522, "step": 7 }, { "epoch": 1.75, "eval_accuracy": 0.4838709677419355, "eval_f1": 0.25513196480938416, "eval_loss": 1.5599325895309448, "eval_runtime": 1.6096, "eval_samples_per_second": 38.518, "eval_steps_per_second": 1.243, "step": 7 }, { "epoch": 2.0, "grad_norm": 32.795101165771484, "learning_rate": 4.210526315789474e-05, "loss": 1.665, "step": 8 }, { "epoch": 2.0, "eval_accuracy": 0.4838709677419355, "eval_f1": 0.25513196480938416, "eval_loss": 1.5007245540618896, "eval_runtime": 1.6593, "eval_samples_per_second": 37.365, "eval_steps_per_second": 1.205, "step": 8 }, { "epoch": 2.25, "grad_norm": 24.919414520263672, "learning_rate": 4.078947368421053e-05, "loss": 1.4334, "step": 9 }, { "epoch": 2.25, "eval_accuracy": 0.4838709677419355, "eval_f1": 0.25513196480938416, "eval_loss": 1.4376260042190552, "eval_runtime": 1.6592, "eval_samples_per_second": 37.367, "eval_steps_per_second": 1.205, "step": 9 }, { "epoch": 2.5, "grad_norm": 26.314369201660156, "learning_rate": 3.9473684210526316e-05, "loss": 1.4499, "step": 10 }, { "epoch": 2.5, "eval_accuracy": 0.4838709677419355, "eval_f1": 0.25513196480938416, "eval_loss": 1.3746062517166138, "eval_runtime": 1.659, "eval_samples_per_second": 37.372, "eval_steps_per_second": 1.206, "step": 10 }, { "epoch": 2.75, "grad_norm": 29.581968307495117, "learning_rate": 3.815789473684211e-05, "loss": 1.4751, "step": 11 }, { "epoch": 2.75, "eval_accuracy": 0.4838709677419355, "eval_f1": 0.25513196480938416, "eval_loss": 1.3099955320358276, "eval_runtime": 1.6591, "eval_samples_per_second": 37.37, "eval_steps_per_second": 1.205, "step": 11 }, { "epoch": 3.0, "grad_norm": 23.594392776489258, "learning_rate": 3.6842105263157895e-05, "loss": 1.2184, "step": 12 }, { "epoch": 3.0, "eval_accuracy": 0.4838709677419355, "eval_f1": 0.25513196480938416, "eval_loss": 1.2490864992141724, "eval_runtime": 1.6592, "eval_samples_per_second": 37.368, "eval_steps_per_second": 1.205, "step": 12 }, { "epoch": 3.25, "grad_norm": 21.464380264282227, "learning_rate": 3.5526315789473684e-05, "loss": 1.1918, "step": 13 }, { "epoch": 3.25, "eval_accuracy": 0.4838709677419355, "eval_f1": 0.25513196480938416, "eval_loss": 1.1868385076522827, "eval_runtime": 1.6597, "eval_samples_per_second": 37.355, "eval_steps_per_second": 1.205, "step": 13 }, { "epoch": 3.5, "grad_norm": 23.22657585144043, "learning_rate": 3.421052631578947e-05, "loss": 1.2961, "step": 14 }, { "epoch": 3.5, "eval_accuracy": 0.4838709677419355, "eval_f1": 0.25513196480938416, "eval_loss": 1.1285125017166138, "eval_runtime": 1.6596, "eval_samples_per_second": 37.359, "eval_steps_per_second": 1.205, "step": 14 }, { "epoch": 3.75, "grad_norm": 20.130626678466797, "learning_rate": 3.289473684210527e-05, "loss": 1.058, "step": 15 }, { "epoch": 3.75, "eval_accuracy": 0.4838709677419355, "eval_f1": 0.25513196480938416, "eval_loss": 1.0760616064071655, "eval_runtime": 1.6629, "eval_samples_per_second": 37.284, "eval_steps_per_second": 1.203, "step": 15 }, { "epoch": 4.0, "grad_norm": 19.751522064208984, "learning_rate": 3.157894736842105e-05, "loss": 1.0431, "step": 16 }, { "epoch": 4.0, "eval_accuracy": 0.4838709677419355, "eval_f1": 0.25513196480938416, "eval_loss": 1.027753233909607, "eval_runtime": 1.6593, "eval_samples_per_second": 37.365, "eval_steps_per_second": 1.205, "step": 16 }, { "epoch": 4.25, "grad_norm": 21.63219451904297, "learning_rate": 3.0263157894736844e-05, "loss": 1.1025, "step": 17 }, { "epoch": 4.25, "eval_accuracy": 0.4838709677419355, "eval_f1": 0.25513196480938416, "eval_loss": 0.9908801913261414, "eval_runtime": 1.6593, "eval_samples_per_second": 37.364, "eval_steps_per_second": 1.205, "step": 17 }, { "epoch": 4.5, "grad_norm": 10.896172523498535, "learning_rate": 2.8947368421052634e-05, "loss": 1.027, "step": 18 }, { "epoch": 4.5, "eval_accuracy": 0.4838709677419355, "eval_f1": 0.25513196480938416, "eval_loss": 0.9616305232048035, "eval_runtime": 1.6609, "eval_samples_per_second": 37.328, "eval_steps_per_second": 1.204, "step": 18 }, { "epoch": 4.75, "grad_norm": 10.560508728027344, "learning_rate": 2.7631578947368426e-05, "loss": 0.9494, "step": 19 }, { "epoch": 4.75, "eval_accuracy": 0.4838709677419355, "eval_f1": 0.26990838618745594, "eval_loss": 0.9417527914047241, "eval_runtime": 1.6591, "eval_samples_per_second": 37.369, "eval_steps_per_second": 1.205, "step": 19 }, { "epoch": 5.0, "grad_norm": 7.201696395874023, "learning_rate": 2.6315789473684212e-05, "loss": 0.8845, "step": 20 }, { "epoch": 5.0, "eval_accuracy": 0.532258064516129, "eval_f1": 0.32762030323005936, "eval_loss": 0.9292937517166138, "eval_runtime": 1.6591, "eval_samples_per_second": 37.371, "eval_steps_per_second": 1.206, "step": 20 }, { "epoch": 5.25, "grad_norm": 3.4469192028045654, "learning_rate": 2.5e-05, "loss": 1.0096, "step": 21 }, { "epoch": 5.25, "eval_accuracy": 0.5967741935483871, "eval_f1": 0.4066193853427896, "eval_loss": 0.9209141731262207, "eval_runtime": 1.6594, "eval_samples_per_second": 37.363, "eval_steps_per_second": 1.205, "step": 21 }, { "epoch": 5.5, "grad_norm": 5.117708683013916, "learning_rate": 2.368421052631579e-05, "loss": 0.8488, "step": 22 }, { "epoch": 5.5, "eval_accuracy": 0.6290322580645161, "eval_f1": 0.434640522875817, "eval_loss": 0.9170079231262207, "eval_runtime": 1.6597, "eval_samples_per_second": 37.356, "eval_steps_per_second": 1.205, "step": 22 }, { "epoch": 5.75, "grad_norm": 4.267374515533447, "learning_rate": 2.236842105263158e-05, "loss": 0.847, "step": 23 }, { "epoch": 5.75, "eval_accuracy": 0.6129032258064516, "eval_f1": 0.41979655712050085, "eval_loss": 0.9126449227333069, "eval_runtime": 1.6593, "eval_samples_per_second": 37.364, "eval_steps_per_second": 1.205, "step": 23 }, { "epoch": 6.0, "grad_norm": 4.23488712310791, "learning_rate": 2.105263157894737e-05, "loss": 0.8127, "step": 24 }, { "epoch": 6.0, "eval_accuracy": 0.5483870967741935, "eval_f1": 0.3648484848484848, "eval_loss": 0.9107705354690552, "eval_runtime": 1.6606, "eval_samples_per_second": 37.337, "eval_steps_per_second": 1.204, "step": 24 }, { "epoch": 6.25, "grad_norm": 6.90617561340332, "learning_rate": 1.9736842105263158e-05, "loss": 0.8793, "step": 25 }, { "epoch": 6.25, "eval_accuracy": 0.5161290322580645, "eval_f1": 0.34080808080808084, "eval_loss": 0.9092269539833069, "eval_runtime": 1.6621, "eval_samples_per_second": 37.302, "eval_steps_per_second": 1.203, "step": 25 }, { "epoch": 6.5, "grad_norm": 3.975435733795166, "learning_rate": 1.8421052631578947e-05, "loss": 0.8219, "step": 26 }, { "epoch": 6.5, "eval_accuracy": 0.532258064516129, "eval_f1": 0.35006119951040393, "eval_loss": 0.9109280705451965, "eval_runtime": 1.6614, "eval_samples_per_second": 37.318, "eval_steps_per_second": 1.204, "step": 26 }, { "epoch": 6.75, "grad_norm": 4.9610748291015625, "learning_rate": 1.7105263157894737e-05, "loss": 0.8406, "step": 27 }, { "epoch": 6.75, "eval_accuracy": 0.5161290322580645, "eval_f1": 0.34080808080808084, "eval_loss": 0.9092269539833069, "eval_runtime": 1.6609, "eval_samples_per_second": 37.33, "eval_steps_per_second": 1.204, "step": 27 }, { "epoch": 7.0, "grad_norm": 2.729252815246582, "learning_rate": 1.5789473684210526e-05, "loss": 0.9318, "step": 28 }, { "epoch": 7.0, "eval_accuracy": 0.532258064516129, "eval_f1": 0.3553553553553554, "eval_loss": 0.9079668521881104, "eval_runtime": 1.6613, "eval_samples_per_second": 37.321, "eval_steps_per_second": 1.204, "step": 28 }, { "epoch": 7.25, "grad_norm": 5.042109489440918, "learning_rate": 1.4473684210526317e-05, "loss": 0.9478, "step": 29 }, { "epoch": 7.25, "eval_accuracy": 0.532258064516129, "eval_f1": 0.3553553553553554, "eval_loss": 0.9087701439857483, "eval_runtime": 1.661, "eval_samples_per_second": 37.327, "eval_steps_per_second": 1.204, "step": 29 }, { "epoch": 7.5, "grad_norm": 4.453703880310059, "learning_rate": 1.3157894736842106e-05, "loss": 0.8433, "step": 30 }, { "epoch": 7.5, "eval_accuracy": 0.5483870967741935, "eval_f1": 0.36946595195552906, "eval_loss": 0.9084787964820862, "eval_runtime": 1.6611, "eval_samples_per_second": 37.325, "eval_steps_per_second": 1.204, "step": 30 }, { "epoch": 7.75, "grad_norm": 3.1456220149993896, "learning_rate": 1.1842105263157895e-05, "loss": 0.794, "step": 31 }, { "epoch": 7.75, "eval_accuracy": 0.5483870967741935, "eval_f1": 0.36946595195552906, "eval_loss": 0.9089512825012207, "eval_runtime": 1.6607, "eval_samples_per_second": 37.334, "eval_steps_per_second": 1.204, "step": 31 }, { "epoch": 8.0, "grad_norm": 12.549348831176758, "learning_rate": 1.0526315789473684e-05, "loss": 0.9746, "step": 32 }, { "epoch": 8.0, "eval_accuracy": 0.5483870967741935, "eval_f1": 0.36946595195552906, "eval_loss": 0.9065335392951965, "eval_runtime": 1.6609, "eval_samples_per_second": 37.329, "eval_steps_per_second": 1.204, "step": 32 }, { "epoch": 8.25, "grad_norm": 4.388298511505127, "learning_rate": 9.210526315789474e-06, "loss": 0.9246, "step": 33 }, { "epoch": 8.25, "eval_accuracy": 0.5483870967741935, "eval_f1": 0.36946595195552906, "eval_loss": 0.9052813053131104, "eval_runtime": 1.661, "eval_samples_per_second": 37.326, "eval_steps_per_second": 1.204, "step": 33 }, { "epoch": 8.5, "grad_norm": 2.7072877883911133, "learning_rate": 7.894736842105263e-06, "loss": 0.7426, "step": 34 }, { "epoch": 8.5, "eval_accuracy": 0.5645161290322581, "eval_f1": 0.3831757289204098, "eval_loss": 0.9058404564857483, "eval_runtime": 1.661, "eval_samples_per_second": 37.328, "eval_steps_per_second": 1.204, "step": 34 }, { "epoch": 8.75, "grad_norm": 12.431721687316895, "learning_rate": 6.578947368421053e-06, "loss": 0.8438, "step": 35 }, { "epoch": 8.75, "eval_accuracy": 0.5483870967741935, "eval_f1": 0.3732394366197183, "eval_loss": 0.9043520092964172, "eval_runtime": 1.6608, "eval_samples_per_second": 37.332, "eval_steps_per_second": 1.204, "step": 35 }, { "epoch": 9.0, "grad_norm": 4.142194747924805, "learning_rate": 5.263157894736842e-06, "loss": 0.8591, "step": 36 }, { "epoch": 9.0, "eval_accuracy": 0.5483870967741935, "eval_f1": 0.3732394366197183, "eval_loss": 0.9042260050773621, "eval_runtime": 1.6606, "eval_samples_per_second": 37.336, "eval_steps_per_second": 1.204, "step": 36 }, { "epoch": 9.25, "grad_norm": 2.2469961643218994, "learning_rate": 3.9473684210526315e-06, "loss": 0.8005, "step": 37 }, { "epoch": 9.25, "eval_accuracy": 0.5483870967741935, "eval_f1": 0.3732394366197183, "eval_loss": 0.9043677449226379, "eval_runtime": 1.6601, "eval_samples_per_second": 37.347, "eval_steps_per_second": 1.205, "step": 37 }, { "epoch": 9.5, "grad_norm": 2.831308126449585, "learning_rate": 2.631578947368421e-06, "loss": 0.8354, "step": 38 }, { "epoch": 9.5, "eval_accuracy": 0.5483870967741935, "eval_f1": 0.3732394366197183, "eval_loss": 0.9049898982048035, "eval_runtime": 1.6604, "eval_samples_per_second": 37.341, "eval_steps_per_second": 1.205, "step": 38 }, { "epoch": 9.75, "grad_norm": 4.90074348449707, "learning_rate": 1.3157894736842106e-06, "loss": 0.8728, "step": 39 }, { "epoch": 9.75, "eval_accuracy": 0.5483870967741935, "eval_f1": 0.3732394366197183, "eval_loss": 0.9037298560142517, "eval_runtime": 1.6599, "eval_samples_per_second": 37.352, "eval_steps_per_second": 1.205, "step": 39 }, { "epoch": 10.0, "grad_norm": 2.8960094451904297, "learning_rate": 0.0, "loss": 0.8464, "step": 40 }, { "epoch": 10.0, "eval_accuracy": 0.5483870967741935, "eval_f1": 0.3732394366197183, "eval_loss": 0.9036431908607483, "eval_runtime": 1.6603, "eval_samples_per_second": 37.343, "eval_steps_per_second": 1.205, "step": 40 }, { "epoch": 10.0, "step": 40, "total_flos": 50446463926272.0, "train_loss": 1.1404520988464355, "train_runtime": 238.9505, "train_samples_per_second": 10.211, "train_steps_per_second": 0.167 } ], "logging_steps": 1, "max_steps": 40, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "total_flos": 50446463926272.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }