|
{ |
|
"best_metric": 2.269272565841675, |
|
"best_model_checkpoint": "/kaggle/working/models/checkpoint-3750", |
|
"epoch": 4.8475055544334475, |
|
"eval_steps": 750, |
|
"global_step": 3750, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 0.10441142320632935, |
|
"learning_rate": 5e-05, |
|
"loss": 2.7397, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 0.15981321036815643, |
|
"learning_rate": 0.0001, |
|
"loss": 2.6589, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 0.20967262983322144, |
|
"learning_rate": 9.889819303658e-05, |
|
"loss": 2.5718, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 0.20717187225818634, |
|
"learning_rate": 9.779638607315998e-05, |
|
"loss": 2.5054, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 0.22402256727218628, |
|
"learning_rate": 9.669457910973999e-05, |
|
"loss": 2.4717, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 0.27481311559677124, |
|
"learning_rate": 9.559277214631996e-05, |
|
"loss": 2.4487, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 0.21997925639152527, |
|
"learning_rate": 9.449096518289996e-05, |
|
"loss": 2.4595, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 0.23128151893615723, |
|
"learning_rate": 9.338915821947995e-05, |
|
"loss": 2.4498, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 0.2886131703853607, |
|
"learning_rate": 9.228735125605994e-05, |
|
"loss": 2.4342, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 0.2664533257484436, |
|
"learning_rate": 9.118554429263993e-05, |
|
"loss": 2.41, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 0.2600548267364502, |
|
"learning_rate": 9.008373732921994e-05, |
|
"loss": 2.4136, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 0.2559927701950073, |
|
"learning_rate": 8.898193036579991e-05, |
|
"loss": 2.4235, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 0.2909291982650757, |
|
"learning_rate": 8.788012340237991e-05, |
|
"loss": 2.4128, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 0.3658187687397003, |
|
"learning_rate": 8.677831643895989e-05, |
|
"loss": 2.4026, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 0.2849361002445221, |
|
"learning_rate": 8.567650947553989e-05, |
|
"loss": 2.4121, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"eval_loss": 2.3384921550750732, |
|
"eval_runtime": 952.2726, |
|
"eval_samples_per_second": 8.658, |
|
"eval_steps_per_second": 8.658, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 1.03, |
|
"grad_norm": 0.28176233172416687, |
|
"learning_rate": 8.457470251211988e-05, |
|
"loss": 2.4043, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 1.1, |
|
"grad_norm": 0.3647988736629486, |
|
"learning_rate": 8.347289554869987e-05, |
|
"loss": 2.3877, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 1.16, |
|
"grad_norm": 0.2923799455165863, |
|
"learning_rate": 8.237108858527986e-05, |
|
"loss": 2.4004, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 1.23, |
|
"grad_norm": 0.2779831290245056, |
|
"learning_rate": 8.126928162185985e-05, |
|
"loss": 2.3928, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 1.29, |
|
"grad_norm": 0.29313695430755615, |
|
"learning_rate": 8.016747465843984e-05, |
|
"loss": 2.389, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 1.36, |
|
"grad_norm": 0.31519418954849243, |
|
"learning_rate": 7.906566769501984e-05, |
|
"loss": 2.3865, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 1.42, |
|
"grad_norm": 0.2636606693267822, |
|
"learning_rate": 7.796386073159983e-05, |
|
"loss": 2.3931, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 1.49, |
|
"grad_norm": 0.33571264147758484, |
|
"learning_rate": 7.686205376817982e-05, |
|
"loss": 2.3772, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 1.55, |
|
"grad_norm": 0.31720587611198425, |
|
"learning_rate": 7.576024680475981e-05, |
|
"loss": 2.3778, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 1.62, |
|
"grad_norm": 0.2712741196155548, |
|
"learning_rate": 7.46584398413398e-05, |
|
"loss": 2.3711, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 1.68, |
|
"grad_norm": 0.3407454192638397, |
|
"learning_rate": 7.35566328779198e-05, |
|
"loss": 2.3688, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 1.75, |
|
"grad_norm": 0.27811819314956665, |
|
"learning_rate": 7.245482591449978e-05, |
|
"loss": 2.3761, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 1.81, |
|
"grad_norm": 0.2932997941970825, |
|
"learning_rate": 7.135301895107978e-05, |
|
"loss": 2.385, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 1.87, |
|
"grad_norm": 0.2541429102420807, |
|
"learning_rate": 7.025121198765977e-05, |
|
"loss": 2.374, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 1.94, |
|
"grad_norm": 0.2891751527786255, |
|
"learning_rate": 6.914940502423976e-05, |
|
"loss": 2.3634, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 1.94, |
|
"eval_loss": 2.3060712814331055, |
|
"eval_runtime": 954.8022, |
|
"eval_samples_per_second": 8.635, |
|
"eval_steps_per_second": 8.635, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 0.3406051695346832, |
|
"learning_rate": 6.804759806081975e-05, |
|
"loss": 2.3682, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 2.07, |
|
"grad_norm": 0.2978401184082031, |
|
"learning_rate": 6.694579109739974e-05, |
|
"loss": 2.3646, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 2.13, |
|
"grad_norm": 0.3196316361427307, |
|
"learning_rate": 6.584398413397973e-05, |
|
"loss": 2.3536, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 2.2, |
|
"grad_norm": 0.3379887044429779, |
|
"learning_rate": 6.474217717055973e-05, |
|
"loss": 2.3637, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 2.26, |
|
"grad_norm": 0.31465980410575867, |
|
"learning_rate": 6.36403702071397e-05, |
|
"loss": 2.3652, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 2.33, |
|
"grad_norm": 0.28737229108810425, |
|
"learning_rate": 6.253856324371971e-05, |
|
"loss": 2.3477, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 2.39, |
|
"grad_norm": 0.2794209420681, |
|
"learning_rate": 6.14367562802997e-05, |
|
"loss": 2.3557, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 2.46, |
|
"grad_norm": 0.2747984230518341, |
|
"learning_rate": 6.0334949316879686e-05, |
|
"loss": 2.3772, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 2.52, |
|
"grad_norm": 0.3119751811027527, |
|
"learning_rate": 5.923314235345968e-05, |
|
"loss": 2.3551, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 2.59, |
|
"grad_norm": 0.2791976034641266, |
|
"learning_rate": 5.8131335390039664e-05, |
|
"loss": 2.3438, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 2.65, |
|
"grad_norm": 0.2925887703895569, |
|
"learning_rate": 5.702952842661966e-05, |
|
"loss": 2.3584, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 2.71, |
|
"grad_norm": 0.28244686126708984, |
|
"learning_rate": 5.592772146319964e-05, |
|
"loss": 2.352, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 2.78, |
|
"grad_norm": 0.32431092858314514, |
|
"learning_rate": 5.482591449977964e-05, |
|
"loss": 2.3538, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 2.84, |
|
"grad_norm": 0.3006184995174408, |
|
"learning_rate": 5.3724107536359635e-05, |
|
"loss": 2.3444, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 2.91, |
|
"grad_norm": 0.33126187324523926, |
|
"learning_rate": 5.262230057293962e-05, |
|
"loss": 2.3428, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 2.91, |
|
"eval_loss": 2.286065101623535, |
|
"eval_runtime": 952.0984, |
|
"eval_samples_per_second": 8.66, |
|
"eval_steps_per_second": 8.66, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 2.97, |
|
"grad_norm": 0.36009636521339417, |
|
"learning_rate": 5.152049360951961e-05, |
|
"loss": 2.357, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 3.04, |
|
"grad_norm": 0.29686203598976135, |
|
"learning_rate": 5.041868664609961e-05, |
|
"loss": 2.3503, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 3.1, |
|
"grad_norm": 0.343845397233963, |
|
"learning_rate": 4.93168796826796e-05, |
|
"loss": 2.341, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 3.17, |
|
"grad_norm": 0.35624146461486816, |
|
"learning_rate": 4.821507271925959e-05, |
|
"loss": 2.3489, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 3.23, |
|
"grad_norm": 0.39963939785957336, |
|
"learning_rate": 4.7113265755839584e-05, |
|
"loss": 2.3406, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 3.3, |
|
"grad_norm": 0.31882843375205994, |
|
"learning_rate": 4.601145879241957e-05, |
|
"loss": 2.3426, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 3.36, |
|
"grad_norm": 0.30565398931503296, |
|
"learning_rate": 4.490965182899956e-05, |
|
"loss": 2.3518, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 3.43, |
|
"grad_norm": 0.3145510256290436, |
|
"learning_rate": 4.380784486557955e-05, |
|
"loss": 2.3465, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 3.49, |
|
"grad_norm": 0.31365010142326355, |
|
"learning_rate": 4.270603790215955e-05, |
|
"loss": 2.3441, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 3.55, |
|
"grad_norm": 0.2947095036506653, |
|
"learning_rate": 4.160423093873954e-05, |
|
"loss": 2.3259, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 3.62, |
|
"grad_norm": 0.32206296920776367, |
|
"learning_rate": 4.0502423975319526e-05, |
|
"loss": 2.339, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 3.68, |
|
"grad_norm": 0.31289926171302795, |
|
"learning_rate": 3.9400617011899515e-05, |
|
"loss": 2.3377, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 3.75, |
|
"grad_norm": 0.33143824338912964, |
|
"learning_rate": 3.829881004847951e-05, |
|
"loss": 2.3388, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 3.81, |
|
"grad_norm": 0.3358709216117859, |
|
"learning_rate": 3.71970030850595e-05, |
|
"loss": 2.3288, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 3.88, |
|
"grad_norm": 0.3498934209346771, |
|
"learning_rate": 3.609519612163949e-05, |
|
"loss": 2.3438, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 3.88, |
|
"eval_loss": 2.2748169898986816, |
|
"eval_runtime": 954.0213, |
|
"eval_samples_per_second": 8.642, |
|
"eval_steps_per_second": 8.642, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 3.94, |
|
"grad_norm": 0.3428861200809479, |
|
"learning_rate": 3.499338915821948e-05, |
|
"loss": 2.3304, |
|
"step": 3050 |
|
}, |
|
{ |
|
"epoch": 4.01, |
|
"grad_norm": 0.3780589699745178, |
|
"learning_rate": 3.389158219479947e-05, |
|
"loss": 2.3373, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 4.07, |
|
"grad_norm": 0.35940733551979065, |
|
"learning_rate": 3.2789775231379464e-05, |
|
"loss": 2.327, |
|
"step": 3150 |
|
}, |
|
{ |
|
"epoch": 4.14, |
|
"grad_norm": 0.32163000106811523, |
|
"learning_rate": 3.1687968267959454e-05, |
|
"loss": 2.3354, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 4.2, |
|
"grad_norm": 0.4193963408470154, |
|
"learning_rate": 3.058616130453944e-05, |
|
"loss": 2.3155, |
|
"step": 3250 |
|
}, |
|
{ |
|
"epoch": 4.27, |
|
"grad_norm": 0.32936742901802063, |
|
"learning_rate": 2.9484354341119435e-05, |
|
"loss": 2.334, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 4.33, |
|
"grad_norm": 0.34648725390434265, |
|
"learning_rate": 2.838254737769943e-05, |
|
"loss": 2.3321, |
|
"step": 3350 |
|
}, |
|
{ |
|
"epoch": 4.4, |
|
"grad_norm": 0.3279941976070404, |
|
"learning_rate": 2.728074041427942e-05, |
|
"loss": 2.3381, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 4.46, |
|
"grad_norm": 0.29082515835762024, |
|
"learning_rate": 2.617893345085941e-05, |
|
"loss": 2.3339, |
|
"step": 3450 |
|
}, |
|
{ |
|
"epoch": 4.52, |
|
"grad_norm": 0.33501705527305603, |
|
"learning_rate": 2.50771264874394e-05, |
|
"loss": 2.3425, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 4.59, |
|
"grad_norm": 0.3124564290046692, |
|
"learning_rate": 2.3975319524019392e-05, |
|
"loss": 2.3257, |
|
"step": 3550 |
|
}, |
|
{ |
|
"epoch": 4.65, |
|
"grad_norm": 0.3001866638660431, |
|
"learning_rate": 2.2873512560599385e-05, |
|
"loss": 2.3381, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 4.72, |
|
"grad_norm": 0.299400269985199, |
|
"learning_rate": 2.1771705597179377e-05, |
|
"loss": 2.3404, |
|
"step": 3650 |
|
}, |
|
{ |
|
"epoch": 4.78, |
|
"grad_norm": 0.3362495005130768, |
|
"learning_rate": 2.0669898633759366e-05, |
|
"loss": 2.3131, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 4.85, |
|
"grad_norm": 0.31044018268585205, |
|
"learning_rate": 1.956809167033936e-05, |
|
"loss": 2.332, |
|
"step": 3750 |
|
}, |
|
{ |
|
"epoch": 4.85, |
|
"eval_loss": 2.269272565841675, |
|
"eval_runtime": 952.7148, |
|
"eval_samples_per_second": 8.654, |
|
"eval_steps_per_second": 8.654, |
|
"step": 3750 |
|
} |
|
], |
|
"logging_steps": 50, |
|
"max_steps": 4638, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 6, |
|
"save_steps": 750, |
|
"total_flos": 1.4703137390592e+17, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|