{ "best_metric": 2.269272565841675, "best_model_checkpoint": "/kaggle/working/models/checkpoint-3750", "epoch": 4.8475055544334475, "eval_steps": 750, "global_step": 3750, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.06, "grad_norm": 0.10441142320632935, "learning_rate": 5e-05, "loss": 2.7397, "step": 50 }, { "epoch": 0.13, "grad_norm": 0.15981321036815643, "learning_rate": 0.0001, "loss": 2.6589, "step": 100 }, { "epoch": 0.19, "grad_norm": 0.20967262983322144, "learning_rate": 9.889819303658e-05, "loss": 2.5718, "step": 150 }, { "epoch": 0.26, "grad_norm": 0.20717187225818634, "learning_rate": 9.779638607315998e-05, "loss": 2.5054, "step": 200 }, { "epoch": 0.32, "grad_norm": 0.22402256727218628, "learning_rate": 9.669457910973999e-05, "loss": 2.4717, "step": 250 }, { "epoch": 0.39, "grad_norm": 0.27481311559677124, "learning_rate": 9.559277214631996e-05, "loss": 2.4487, "step": 300 }, { "epoch": 0.45, "grad_norm": 0.21997925639152527, "learning_rate": 9.449096518289996e-05, "loss": 2.4595, "step": 350 }, { "epoch": 0.52, "grad_norm": 0.23128151893615723, "learning_rate": 9.338915821947995e-05, "loss": 2.4498, "step": 400 }, { "epoch": 0.58, "grad_norm": 0.2886131703853607, "learning_rate": 9.228735125605994e-05, "loss": 2.4342, "step": 450 }, { "epoch": 0.65, "grad_norm": 0.2664533257484436, "learning_rate": 9.118554429263993e-05, "loss": 2.41, "step": 500 }, { "epoch": 0.71, "grad_norm": 0.2600548267364502, "learning_rate": 9.008373732921994e-05, "loss": 2.4136, "step": 550 }, { "epoch": 0.78, "grad_norm": 0.2559927701950073, "learning_rate": 8.898193036579991e-05, "loss": 2.4235, "step": 600 }, { "epoch": 0.84, "grad_norm": 0.2909291982650757, "learning_rate": 8.788012340237991e-05, "loss": 2.4128, "step": 650 }, { "epoch": 0.9, "grad_norm": 0.3658187687397003, "learning_rate": 8.677831643895989e-05, "loss": 2.4026, "step": 700 }, { "epoch": 0.97, "grad_norm": 0.2849361002445221, "learning_rate": 8.567650947553989e-05, "loss": 2.4121, "step": 750 }, { "epoch": 0.97, "eval_loss": 2.3384921550750732, "eval_runtime": 952.2726, "eval_samples_per_second": 8.658, "eval_steps_per_second": 8.658, "step": 750 }, { "epoch": 1.03, "grad_norm": 0.28176233172416687, "learning_rate": 8.457470251211988e-05, "loss": 2.4043, "step": 800 }, { "epoch": 1.1, "grad_norm": 0.3647988736629486, "learning_rate": 8.347289554869987e-05, "loss": 2.3877, "step": 850 }, { "epoch": 1.16, "grad_norm": 0.2923799455165863, "learning_rate": 8.237108858527986e-05, "loss": 2.4004, "step": 900 }, { "epoch": 1.23, "grad_norm": 0.2779831290245056, "learning_rate": 8.126928162185985e-05, "loss": 2.3928, "step": 950 }, { "epoch": 1.29, "grad_norm": 0.29313695430755615, "learning_rate": 8.016747465843984e-05, "loss": 2.389, "step": 1000 }, { "epoch": 1.36, "grad_norm": 0.31519418954849243, "learning_rate": 7.906566769501984e-05, "loss": 2.3865, "step": 1050 }, { "epoch": 1.42, "grad_norm": 0.2636606693267822, "learning_rate": 7.796386073159983e-05, "loss": 2.3931, "step": 1100 }, { "epoch": 1.49, "grad_norm": 0.33571264147758484, "learning_rate": 7.686205376817982e-05, "loss": 2.3772, "step": 1150 }, { "epoch": 1.55, "grad_norm": 0.31720587611198425, "learning_rate": 7.576024680475981e-05, "loss": 2.3778, "step": 1200 }, { "epoch": 1.62, "grad_norm": 0.2712741196155548, "learning_rate": 7.46584398413398e-05, "loss": 2.3711, "step": 1250 }, { "epoch": 1.68, "grad_norm": 0.3407454192638397, "learning_rate": 7.35566328779198e-05, "loss": 2.3688, "step": 1300 }, { "epoch": 1.75, "grad_norm": 0.27811819314956665, "learning_rate": 7.245482591449978e-05, "loss": 2.3761, "step": 1350 }, { "epoch": 1.81, "grad_norm": 0.2932997941970825, "learning_rate": 7.135301895107978e-05, "loss": 2.385, "step": 1400 }, { "epoch": 1.87, "grad_norm": 0.2541429102420807, "learning_rate": 7.025121198765977e-05, "loss": 2.374, "step": 1450 }, { "epoch": 1.94, "grad_norm": 0.2891751527786255, "learning_rate": 6.914940502423976e-05, "loss": 2.3634, "step": 1500 }, { "epoch": 1.94, "eval_loss": 2.3060712814331055, "eval_runtime": 954.8022, "eval_samples_per_second": 8.635, "eval_steps_per_second": 8.635, "step": 1500 }, { "epoch": 2.0, "grad_norm": 0.3406051695346832, "learning_rate": 6.804759806081975e-05, "loss": 2.3682, "step": 1550 }, { "epoch": 2.07, "grad_norm": 0.2978401184082031, "learning_rate": 6.694579109739974e-05, "loss": 2.3646, "step": 1600 }, { "epoch": 2.13, "grad_norm": 0.3196316361427307, "learning_rate": 6.584398413397973e-05, "loss": 2.3536, "step": 1650 }, { "epoch": 2.2, "grad_norm": 0.3379887044429779, "learning_rate": 6.474217717055973e-05, "loss": 2.3637, "step": 1700 }, { "epoch": 2.26, "grad_norm": 0.31465980410575867, "learning_rate": 6.36403702071397e-05, "loss": 2.3652, "step": 1750 }, { "epoch": 2.33, "grad_norm": 0.28737229108810425, "learning_rate": 6.253856324371971e-05, "loss": 2.3477, "step": 1800 }, { "epoch": 2.39, "grad_norm": 0.2794209420681, "learning_rate": 6.14367562802997e-05, "loss": 2.3557, "step": 1850 }, { "epoch": 2.46, "grad_norm": 0.2747984230518341, "learning_rate": 6.0334949316879686e-05, "loss": 2.3772, "step": 1900 }, { "epoch": 2.52, "grad_norm": 0.3119751811027527, "learning_rate": 5.923314235345968e-05, "loss": 2.3551, "step": 1950 }, { "epoch": 2.59, "grad_norm": 0.2791976034641266, "learning_rate": 5.8131335390039664e-05, "loss": 2.3438, "step": 2000 }, { "epoch": 2.65, "grad_norm": 0.2925887703895569, "learning_rate": 5.702952842661966e-05, "loss": 2.3584, "step": 2050 }, { "epoch": 2.71, "grad_norm": 0.28244686126708984, "learning_rate": 5.592772146319964e-05, "loss": 2.352, "step": 2100 }, { "epoch": 2.78, "grad_norm": 0.32431092858314514, "learning_rate": 5.482591449977964e-05, "loss": 2.3538, "step": 2150 }, { "epoch": 2.84, "grad_norm": 0.3006184995174408, "learning_rate": 5.3724107536359635e-05, "loss": 2.3444, "step": 2200 }, { "epoch": 2.91, "grad_norm": 0.33126187324523926, "learning_rate": 5.262230057293962e-05, "loss": 2.3428, "step": 2250 }, { "epoch": 2.91, "eval_loss": 2.286065101623535, "eval_runtime": 952.0984, "eval_samples_per_second": 8.66, "eval_steps_per_second": 8.66, "step": 2250 }, { "epoch": 2.97, "grad_norm": 0.36009636521339417, "learning_rate": 5.152049360951961e-05, "loss": 2.357, "step": 2300 }, { "epoch": 3.04, "grad_norm": 0.29686203598976135, "learning_rate": 5.041868664609961e-05, "loss": 2.3503, "step": 2350 }, { "epoch": 3.1, "grad_norm": 0.343845397233963, "learning_rate": 4.93168796826796e-05, "loss": 2.341, "step": 2400 }, { "epoch": 3.17, "grad_norm": 0.35624146461486816, "learning_rate": 4.821507271925959e-05, "loss": 2.3489, "step": 2450 }, { "epoch": 3.23, "grad_norm": 0.39963939785957336, "learning_rate": 4.7113265755839584e-05, "loss": 2.3406, "step": 2500 }, { "epoch": 3.3, "grad_norm": 0.31882843375205994, "learning_rate": 4.601145879241957e-05, "loss": 2.3426, "step": 2550 }, { "epoch": 3.36, "grad_norm": 0.30565398931503296, "learning_rate": 4.490965182899956e-05, "loss": 2.3518, "step": 2600 }, { "epoch": 3.43, "grad_norm": 0.3145510256290436, "learning_rate": 4.380784486557955e-05, "loss": 2.3465, "step": 2650 }, { "epoch": 3.49, "grad_norm": 0.31365010142326355, "learning_rate": 4.270603790215955e-05, "loss": 2.3441, "step": 2700 }, { "epoch": 3.55, "grad_norm": 0.2947095036506653, "learning_rate": 4.160423093873954e-05, "loss": 2.3259, "step": 2750 }, { "epoch": 3.62, "grad_norm": 0.32206296920776367, "learning_rate": 4.0502423975319526e-05, "loss": 2.339, "step": 2800 }, { "epoch": 3.68, "grad_norm": 0.31289926171302795, "learning_rate": 3.9400617011899515e-05, "loss": 2.3377, "step": 2850 }, { "epoch": 3.75, "grad_norm": 0.33143824338912964, "learning_rate": 3.829881004847951e-05, "loss": 2.3388, "step": 2900 }, { "epoch": 3.81, "grad_norm": 0.3358709216117859, "learning_rate": 3.71970030850595e-05, "loss": 2.3288, "step": 2950 }, { "epoch": 3.88, "grad_norm": 0.3498934209346771, "learning_rate": 3.609519612163949e-05, "loss": 2.3438, "step": 3000 }, { "epoch": 3.88, "eval_loss": 2.2748169898986816, "eval_runtime": 954.0213, "eval_samples_per_second": 8.642, "eval_steps_per_second": 8.642, "step": 3000 }, { "epoch": 3.94, "grad_norm": 0.3428861200809479, "learning_rate": 3.499338915821948e-05, "loss": 2.3304, "step": 3050 }, { "epoch": 4.01, "grad_norm": 0.3780589699745178, "learning_rate": 3.389158219479947e-05, "loss": 2.3373, "step": 3100 }, { "epoch": 4.07, "grad_norm": 0.35940733551979065, "learning_rate": 3.2789775231379464e-05, "loss": 2.327, "step": 3150 }, { "epoch": 4.14, "grad_norm": 0.32163000106811523, "learning_rate": 3.1687968267959454e-05, "loss": 2.3354, "step": 3200 }, { "epoch": 4.2, "grad_norm": 0.4193963408470154, "learning_rate": 3.058616130453944e-05, "loss": 2.3155, "step": 3250 }, { "epoch": 4.27, "grad_norm": 0.32936742901802063, "learning_rate": 2.9484354341119435e-05, "loss": 2.334, "step": 3300 }, { "epoch": 4.33, "grad_norm": 0.34648725390434265, "learning_rate": 2.838254737769943e-05, "loss": 2.3321, "step": 3350 }, { "epoch": 4.4, "grad_norm": 0.3279941976070404, "learning_rate": 2.728074041427942e-05, "loss": 2.3381, "step": 3400 }, { "epoch": 4.46, "grad_norm": 0.29082515835762024, "learning_rate": 2.617893345085941e-05, "loss": 2.3339, "step": 3450 }, { "epoch": 4.52, "grad_norm": 0.33501705527305603, "learning_rate": 2.50771264874394e-05, "loss": 2.3425, "step": 3500 }, { "epoch": 4.59, "grad_norm": 0.3124564290046692, "learning_rate": 2.3975319524019392e-05, "loss": 2.3257, "step": 3550 }, { "epoch": 4.65, "grad_norm": 0.3001866638660431, "learning_rate": 2.2873512560599385e-05, "loss": 2.3381, "step": 3600 }, { "epoch": 4.72, "grad_norm": 0.299400269985199, "learning_rate": 2.1771705597179377e-05, "loss": 2.3404, "step": 3650 }, { "epoch": 4.78, "grad_norm": 0.3362495005130768, "learning_rate": 2.0669898633759366e-05, "loss": 2.3131, "step": 3700 }, { "epoch": 4.85, "grad_norm": 0.31044018268585205, "learning_rate": 1.956809167033936e-05, "loss": 2.332, "step": 3750 }, { "epoch": 4.85, "eval_loss": 2.269272565841675, "eval_runtime": 952.7148, "eval_samples_per_second": 8.654, "eval_steps_per_second": 8.654, "step": 3750 } ], "logging_steps": 50, "max_steps": 4638, "num_input_tokens_seen": 0, "num_train_epochs": 6, "save_steps": 750, "total_flos": 1.4703137390592e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }