{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.3192612137203166, "eval_steps": 25, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.07, "grad_norm": 1.4790619611740112, "learning_rate": 2.3797595190380762e-05, "loss": 0.9713, "step": 25 }, { "epoch": 0.07, "eval_loss": 0.7396946549415588, "eval_runtime": 75.3443, "eval_samples_per_second": 2.535, "eval_steps_per_second": 0.319, "step": 25 }, { "epoch": 0.13, "grad_norm": 1.359966516494751, "learning_rate": 2.2545090180360722e-05, "loss": 0.6963, "step": 50 }, { "epoch": 0.13, "eval_loss": 0.6846508979797363, "eval_runtime": 75.2251, "eval_samples_per_second": 2.539, "eval_steps_per_second": 0.319, "step": 50 }, { "epoch": 0.2, "grad_norm": 1.3525006771087646, "learning_rate": 2.1292585170340683e-05, "loss": 0.6809, "step": 75 }, { "epoch": 0.2, "eval_loss": 0.6628683805465698, "eval_runtime": 75.0453, "eval_samples_per_second": 2.545, "eval_steps_per_second": 0.32, "step": 75 }, { "epoch": 0.26, "grad_norm": 1.3805192708969116, "learning_rate": 2.0040080160320643e-05, "loss": 0.6596, "step": 100 }, { "epoch": 0.26, "eval_loss": 0.6511640548706055, "eval_runtime": 75.0686, "eval_samples_per_second": 2.544, "eval_steps_per_second": 0.32, "step": 100 }, { "epoch": 0.33, "grad_norm": 1.2107349634170532, "learning_rate": 1.87875751503006e-05, "loss": 0.6552, "step": 125 }, { "epoch": 0.33, "eval_loss": 0.6428065896034241, "eval_runtime": 75.0266, "eval_samples_per_second": 2.546, "eval_steps_per_second": 0.32, "step": 125 }, { "epoch": 0.4, "grad_norm": 1.327737808227539, "learning_rate": 1.7535070140280564e-05, "loss": 0.6363, "step": 150 }, { "epoch": 0.4, "eval_loss": 0.6354109048843384, "eval_runtime": 75.1344, "eval_samples_per_second": 2.542, "eval_steps_per_second": 0.319, "step": 150 }, { "epoch": 0.46, "grad_norm": 1.2277601957321167, "learning_rate": 1.628256513026052e-05, "loss": 0.6441, "step": 175 }, { "epoch": 0.46, "eval_loss": 0.6307269930839539, "eval_runtime": 75.0535, "eval_samples_per_second": 2.545, "eval_steps_per_second": 0.32, "step": 175 }, { "epoch": 0.53, "grad_norm": 1.3676594495773315, "learning_rate": 1.5030060120240483e-05, "loss": 0.6317, "step": 200 }, { "epoch": 0.53, "eval_loss": 0.6252032518386841, "eval_runtime": 75.0153, "eval_samples_per_second": 2.546, "eval_steps_per_second": 0.32, "step": 200 }, { "epoch": 0.59, "grad_norm": 1.2495057582855225, "learning_rate": 1.3777555110220442e-05, "loss": 0.6222, "step": 225 }, { "epoch": 0.59, "eval_loss": 0.6214942336082458, "eval_runtime": 75.0103, "eval_samples_per_second": 2.546, "eval_steps_per_second": 0.32, "step": 225 }, { "epoch": 0.66, "grad_norm": 1.2535713911056519, "learning_rate": 1.25250501002004e-05, "loss": 0.6127, "step": 250 }, { "epoch": 0.66, "eval_loss": 0.6172995567321777, "eval_runtime": 75.023, "eval_samples_per_second": 2.546, "eval_steps_per_second": 0.32, "step": 250 }, { "epoch": 0.73, "grad_norm": 1.3162541389465332, "learning_rate": 1.1272545090180361e-05, "loss": 0.6017, "step": 275 }, { "epoch": 0.73, "eval_loss": 0.614778459072113, "eval_runtime": 75.3906, "eval_samples_per_second": 2.533, "eval_steps_per_second": 0.318, "step": 275 }, { "epoch": 0.79, "grad_norm": 1.3146350383758545, "learning_rate": 1.0020040080160322e-05, "loss": 0.6201, "step": 300 }, { "epoch": 0.79, "eval_loss": 0.6113688945770264, "eval_runtime": 75.4374, "eval_samples_per_second": 2.532, "eval_steps_per_second": 0.318, "step": 300 }, { "epoch": 0.86, "grad_norm": 1.311963438987732, "learning_rate": 8.767535070140282e-06, "loss": 0.5961, "step": 325 }, { "epoch": 0.86, "eval_loss": 0.6093372702598572, "eval_runtime": 75.3949, "eval_samples_per_second": 2.533, "eval_steps_per_second": 0.318, "step": 325 }, { "epoch": 0.92, "grad_norm": 1.3103692531585693, "learning_rate": 7.515030060120242e-06, "loss": 0.6044, "step": 350 }, { "epoch": 0.92, "eval_loss": 0.606224775314331, "eval_runtime": 75.3575, "eval_samples_per_second": 2.535, "eval_steps_per_second": 0.318, "step": 350 }, { "epoch": 0.99, "grad_norm": 1.4562299251556396, "learning_rate": 6.2625250501002e-06, "loss": 0.6064, "step": 375 }, { "epoch": 0.99, "eval_loss": 0.6035271286964417, "eval_runtime": 75.3932, "eval_samples_per_second": 2.533, "eval_steps_per_second": 0.318, "step": 375 }, { "epoch": 1.06, "grad_norm": 1.3848822116851807, "learning_rate": 5.010020040080161e-06, "loss": 0.5763, "step": 400 }, { "epoch": 1.06, "eval_loss": 0.6053218841552734, "eval_runtime": 75.3983, "eval_samples_per_second": 2.533, "eval_steps_per_second": 0.318, "step": 400 }, { "epoch": 1.12, "grad_norm": 1.466248631477356, "learning_rate": 3.757515030060121e-06, "loss": 0.5589, "step": 425 }, { "epoch": 1.12, "eval_loss": 0.6024670600891113, "eval_runtime": 75.3786, "eval_samples_per_second": 2.534, "eval_steps_per_second": 0.318, "step": 425 }, { "epoch": 1.19, "grad_norm": 1.4298962354660034, "learning_rate": 2.5050100200400804e-06, "loss": 0.5595, "step": 450 }, { "epoch": 1.19, "eval_loss": 0.6021297574043274, "eval_runtime": 75.3606, "eval_samples_per_second": 2.534, "eval_steps_per_second": 0.318, "step": 450 }, { "epoch": 1.25, "grad_norm": 1.395114779472351, "learning_rate": 1.2525050100200402e-06, "loss": 0.5525, "step": 475 }, { "epoch": 1.25, "eval_loss": 0.6009101867675781, "eval_runtime": 75.3723, "eval_samples_per_second": 2.534, "eval_steps_per_second": 0.318, "step": 475 }, { "epoch": 1.32, "grad_norm": 1.4652175903320312, "learning_rate": 0.0, "loss": 0.5541, "step": 500 }, { "epoch": 1.32, "eval_loss": 0.6004253625869751, "eval_runtime": 75.3761, "eval_samples_per_second": 2.534, "eval_steps_per_second": 0.318, "step": 500 } ], "logging_steps": 25, "max_steps": 500, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 250, "total_flos": 7.87141357369344e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }