{ "best_metric": null, "best_model_checkpoint": null, "epoch": 9.859154929577464, "eval_steps": 500, "global_step": 350, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.28169014084507044, "grad_norm": 6.4921875, "learning_rate": 0.00019959742939952392, "loss": 1.0479, "step": 10 }, { "epoch": 0.5633802816901409, "grad_norm": 7.4765625, "learning_rate": 0.00019839295885986296, "loss": 1.0329, "step": 20 }, { "epoch": 0.8450704225352113, "grad_norm": 8.5546875, "learning_rate": 0.00019639628606958533, "loss": 0.9256, "step": 30 }, { "epoch": 1.1267605633802817, "grad_norm": 7.9453125, "learning_rate": 0.00019362348706397373, "loss": 0.8794, "step": 40 }, { "epoch": 1.408450704225352, "grad_norm": 9.7578125, "learning_rate": 0.0001900968867902419, "loss": 0.8596, "step": 50 }, { "epoch": 1.6901408450704225, "grad_norm": 9.765625, "learning_rate": 0.00018584487936018661, "loss": 0.8522, "step": 60 }, { "epoch": 1.971830985915493, "grad_norm": 10.0859375, "learning_rate": 0.00018090169943749476, "loss": 0.8563, "step": 70 }, { "epoch": 2.2535211267605635, "grad_norm": 8.421875, "learning_rate": 0.00017530714660036112, "loss": 0.826, "step": 80 }, { "epoch": 2.535211267605634, "grad_norm": 9.7265625, "learning_rate": 0.00016910626489868649, "loss": 0.8266, "step": 90 }, { "epoch": 2.816901408450704, "grad_norm": 11.2265625, "learning_rate": 0.00016234898018587337, "loss": 0.8513, "step": 100 }, { "epoch": 3.0985915492957745, "grad_norm": 10.9765625, "learning_rate": 0.00015508969814521025, "loss": 0.8051, "step": 110 }, { "epoch": 3.380281690140845, "grad_norm": 12.53125, "learning_rate": 0.00014738686624729986, "loss": 0.8422, "step": 120 }, { "epoch": 3.6619718309859155, "grad_norm": 10.234375, "learning_rate": 0.00013930250316539238, "loss": 0.8595, "step": 130 }, { "epoch": 3.943661971830986, "grad_norm": 12.8671875, "learning_rate": 0.00013090169943749476, "loss": 0.8358, "step": 140 }, { "epoch": 4.225352112676056, "grad_norm": 11.7109375, "learning_rate": 0.00012225209339563145, "loss": 0.8135, "step": 150 }, { "epoch": 4.507042253521127, "grad_norm": 9.9296875, "learning_rate": 0.00011342332658176555, "loss": 0.7967, "step": 160 }, { "epoch": 4.788732394366197, "grad_norm": 10.546875, "learning_rate": 0.00010448648303505151, "loss": 0.7877, "step": 170 }, { "epoch": 5.070422535211268, "grad_norm": 14.2265625, "learning_rate": 9.551351696494854e-05, "loss": 0.7516, "step": 180 }, { "epoch": 5.352112676056338, "grad_norm": 11.4375, "learning_rate": 8.657667341823448e-05, "loss": 0.7581, "step": 190 }, { "epoch": 5.633802816901408, "grad_norm": 10.5859375, "learning_rate": 7.774790660436858e-05, "loss": 0.7491, "step": 200 }, { "epoch": 5.915492957746479, "grad_norm": 9.6796875, "learning_rate": 6.909830056250527e-05, "loss": 0.7307, "step": 210 }, { "epoch": 6.197183098591549, "grad_norm": 8.890625, "learning_rate": 6.069749683460765e-05, "loss": 0.7323, "step": 220 }, { "epoch": 6.47887323943662, "grad_norm": 11.2734375, "learning_rate": 5.261313375270014e-05, "loss": 0.7145, "step": 230 }, { "epoch": 6.76056338028169, "grad_norm": 10.4375, "learning_rate": 4.491030185478976e-05, "loss": 0.7525, "step": 240 }, { "epoch": 7.042253521126761, "grad_norm": 10.9765625, "learning_rate": 3.7651019814126654e-05, "loss": 0.7138, "step": 250 }, { "epoch": 7.323943661971831, "grad_norm": 9.625, "learning_rate": 3.089373510131354e-05, "loss": 0.7266, "step": 260 }, { "epoch": 7.605633802816901, "grad_norm": 9.6484375, 
"learning_rate": 2.4692853399638917e-05, "loss": 0.7136, "step": 270 }, { "epoch": 7.887323943661972, "grad_norm": 10.25, "learning_rate": 1.9098300562505266e-05, "loss": 0.7194, "step": 280 }, { "epoch": 8.169014084507042, "grad_norm": 11.3203125, "learning_rate": 1.415512063981339e-05, "loss": 0.7137, "step": 290 }, { "epoch": 8.450704225352112, "grad_norm": 13.578125, "learning_rate": 9.903113209758096e-06, "loss": 0.7186, "step": 300 }, { "epoch": 8.732394366197184, "grad_norm": 8.828125, "learning_rate": 6.37651293602628e-06, "loss": 0.7232, "step": 310 }, { "epoch": 9.014084507042254, "grad_norm": 11.9140625, "learning_rate": 3.6037139304146762e-06, "loss": 0.7116, "step": 320 }, { "epoch": 9.295774647887324, "grad_norm": 9.4453125, "learning_rate": 1.6070411401370334e-06, "loss": 0.7106, "step": 330 }, { "epoch": 9.577464788732394, "grad_norm": 12.828125, "learning_rate": 4.025706004760932e-07, "loss": 0.727, "step": 340 }, { "epoch": 9.859154929577464, "grad_norm": 10.4453125, "learning_rate": 0.0, "loss": 0.704, "step": 350 }, { "epoch": 9.859154929577464, "step": 350, "total_flos": 2.2849945337856e+16, "train_loss": 0.7934087766919817, "train_runtime": 317.5625, "train_samples_per_second": 4.472, "train_steps_per_second": 1.102 } ], "logging_steps": 10, "max_steps": 350, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "total_flos": 2.2849945337856e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }