|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 2.9978431433840766, |
|
"eval_steps": 500, |
|
"global_step": 2997, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.001000281329123816, |
|
"grad_norm": 62.25, |
|
"learning_rate": 6.666666666666668e-08, |
|
"loss": 2.3405, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.0500140664561908, |
|
"grad_norm": 6.875, |
|
"learning_rate": 3.3333333333333333e-06, |
|
"loss": 1.3787, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.1000281329123816, |
|
"grad_norm": 3.9375, |
|
"learning_rate": 6.666666666666667e-06, |
|
"loss": 0.6171, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.1500421993685724, |
|
"grad_norm": 3.703125, |
|
"learning_rate": 1e-05, |
|
"loss": 0.5743, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.2000562658247632, |
|
"grad_norm": 2.96875, |
|
"learning_rate": 1.3333333333333333e-05, |
|
"loss": 0.5609, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.25007033228095404, |
|
"grad_norm": 3.171875, |
|
"learning_rate": 1.6666666666666667e-05, |
|
"loss": 0.6189, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.3000843987371448, |
|
"grad_norm": 5.90625, |
|
"learning_rate": 2e-05, |
|
"loss": 0.5975, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.35009846519333565, |
|
"grad_norm": 2.140625, |
|
"learning_rate": 1.9983043934122208e-05, |
|
"loss": 0.5851, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.4001125316495264, |
|
"grad_norm": 2.671875, |
|
"learning_rate": 1.9932233238122834e-05, |
|
"loss": 0.567, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.45012659810571726, |
|
"grad_norm": 2.046875, |
|
"learning_rate": 1.984774022190361e-05, |
|
"loss": 0.5503, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.5001406645619081, |
|
"grad_norm": 1.734375, |
|
"learning_rate": 1.972985141929439e-05, |
|
"loss": 0.5386, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.5501547310180989, |
|
"grad_norm": 1.8203125, |
|
"learning_rate": 1.9578966616355823e-05, |
|
"loss": 0.5262, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.6001687974742896, |
|
"grad_norm": 1.921875, |
|
"learning_rate": 1.9395597495619634e-05, |
|
"loss": 0.5219, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.6501828639304804, |
|
"grad_norm": 1.6875, |
|
"learning_rate": 1.918036590086405e-05, |
|
"loss": 0.5065, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.7001969303866713, |
|
"grad_norm": 1.703125, |
|
"learning_rate": 1.8934001728309003e-05, |
|
"loss": 0.5041, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.7502109968428621, |
|
"grad_norm": 1.7734375, |
|
"learning_rate": 1.865734045138245e-05, |
|
"loss": 0.4946, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.8002250632990529, |
|
"grad_norm": 1.5234375, |
|
"learning_rate": 1.8351320287451865e-05, |
|
"loss": 0.4906, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.8502391297552436, |
|
"grad_norm": 1.5078125, |
|
"learning_rate": 1.8016979016129164e-05, |
|
"loss": 0.4824, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.9002531962114345, |
|
"grad_norm": 1.4375, |
|
"learning_rate": 1.7655450459938786e-05, |
|
"loss": 0.4736, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.9502672626676253, |
|
"grad_norm": 1.5234375, |
|
"learning_rate": 1.726796063928382e-05, |
|
"loss": 0.4677, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 1.0002813291238162, |
|
"grad_norm": 1.421875, |
|
"learning_rate": 1.6855823614749474e-05, |
|
"loss": 0.4654, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 1.050295395580007, |
|
"grad_norm": 1.453125, |
|
"learning_rate": 1.6420437030843482e-05, |
|
"loss": 0.3223, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 1.1003094620361977, |
|
"grad_norm": 1.375, |
|
"learning_rate": 1.5963277376285646e-05, |
|
"loss": 0.319, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 1.1503235284923885, |
|
"grad_norm": 1.2578125, |
|
"learning_rate": 1.5485894976919836e-05, |
|
"loss": 0.3218, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 1.2003375949485793, |
|
"grad_norm": 1.3984375, |
|
"learning_rate": 1.4989908738228567e-05, |
|
"loss": 0.3174, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 1.25035166140477, |
|
"grad_norm": 1.4453125, |
|
"learning_rate": 1.4477000655279376e-05, |
|
"loss": 0.3185, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 1.3003657278609608, |
|
"grad_norm": 1.5625, |
|
"learning_rate": 1.394891010872102e-05, |
|
"loss": 0.3153, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 1.3503797943171518, |
|
"grad_norm": 1.4375, |
|
"learning_rate": 1.3407427966172866e-05, |
|
"loss": 0.3161, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 1.4003938607733426, |
|
"grad_norm": 1.359375, |
|
"learning_rate": 1.2854390509011061e-05, |
|
"loss": 0.3117, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 1.4504079272295334, |
|
"grad_norm": 1.375, |
|
"learning_rate": 1.2291673205146908e-05, |
|
"loss": 0.307, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 1.5004219936857242, |
|
"grad_norm": 1.3046875, |
|
"learning_rate": 1.1721184348915384e-05, |
|
"loss": 0.3057, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 1.550436060141915, |
|
"grad_norm": 1.2890625, |
|
"learning_rate": 1.1144858589642251e-05, |
|
"loss": 0.3023, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 1.6004501265981057, |
|
"grad_norm": 1.34375, |
|
"learning_rate": 1.0564650370835772e-05, |
|
"loss": 0.2997, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 1.6504641930542965, |
|
"grad_norm": 1.3125, |
|
"learning_rate": 9.982527302252135e-06, |
|
"loss": 0.2989, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 1.7004782595104873, |
|
"grad_norm": 1.390625, |
|
"learning_rate": 9.40046348731131e-06, |
|
"loss": 0.2947, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 1.750492325966678, |
|
"grad_norm": 1.40625, |
|
"learning_rate": 8.820432828491542e-06, |
|
"loss": 0.2935, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 1.8005063924228688, |
|
"grad_norm": 1.359375, |
|
"learning_rate": 8.244402333405252e-06, |
|
"loss": 0.289, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 1.8505204588790596, |
|
"grad_norm": 1.40625, |
|
"learning_rate": 7.674325444256899e-06, |
|
"loss": 0.2874, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 1.9005345253352506, |
|
"grad_norm": 1.40625, |
|
"learning_rate": 7.112135413304042e-06, |
|
"loss": 0.2839, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 1.9505485917914414, |
|
"grad_norm": 1.2421875, |
|
"learning_rate": 6.55973874678682e-06, |
|
"loss": 0.2832, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 2.0005626582476324, |
|
"grad_norm": 1.15625, |
|
"learning_rate": 6.0190087395588596e-06, |
|
"loss": 0.2765, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 2.050576724703823, |
|
"grad_norm": 1.2578125, |
|
"learning_rate": 5.491779122345093e-06, |
|
"loss": 0.1509, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 2.100590791160014, |
|
"grad_norm": 1.203125, |
|
"learning_rate": 4.979837843169959e-06, |
|
"loss": 0.1491, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 2.1506048576162047, |
|
"grad_norm": 1.296875, |
|
"learning_rate": 4.484921004044509e-06, |
|
"loss": 0.149, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 2.2006189240723955, |
|
"grad_norm": 1.1875, |
|
"learning_rate": 4.008706973474391e-06, |
|
"loss": 0.1492, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 2.2506329905285862, |
|
"grad_norm": 1.1640625, |
|
"learning_rate": 3.5528106947544626e-06, |
|
"loss": 0.1477, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 2.300647056984777, |
|
"grad_norm": 1.2421875, |
|
"learning_rate": 3.118778209351808e-06, |
|
"loss": 0.1478, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 2.350661123440968, |
|
"grad_norm": 1.3046875, |
|
"learning_rate": 2.7080814139495402e-06, |
|
"loss": 0.1471, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 2.4006751898971586, |
|
"grad_norm": 1.265625, |
|
"learning_rate": 2.322113068931391e-06, |
|
"loss": 0.1472, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 2.4506892563533493, |
|
"grad_norm": 1.2421875, |
|
"learning_rate": 1.9621820752343324e-06, |
|
"loss": 0.1467, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 2.50070332280954, |
|
"grad_norm": 1.1875, |
|
"learning_rate": 1.629509035586484e-06, |
|
"loss": 0.1449, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 2.550717389265731, |
|
"grad_norm": 1.28125, |
|
"learning_rate": 1.3252221151830513e-06, |
|
"loss": 0.1457, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 2.6007314557219217, |
|
"grad_norm": 1.2890625, |
|
"learning_rate": 1.0503532158376584e-06, |
|
"loss": 0.1447, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 2.6507455221781124, |
|
"grad_norm": 1.28125, |
|
"learning_rate": 8.058344765833171e-07, |
|
"loss": 0.146, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 2.7007595886343037, |
|
"grad_norm": 1.3359375, |
|
"learning_rate": 5.924951125902545e-07, |
|
"loss": 0.1461, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 2.7507736550904944, |
|
"grad_norm": 1.1875, |
|
"learning_rate": 4.11058603120511e-07, |
|
"loss": 0.144, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 2.800787721546685, |
|
"grad_norm": 1.171875, |
|
"learning_rate": 2.6214023805552826e-07, |
|
"loss": 0.1447, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 2.850801788002876, |
|
"grad_norm": 1.28125, |
|
"learning_rate": 1.462450313169983e-07, |
|
"loss": 0.1443, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 2.9008158544590668, |
|
"grad_norm": 1.2421875, |
|
"learning_rate": 6.376600825699463e-08, |
|
"loss": 0.1443, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 2.9508299209152575, |
|
"grad_norm": 1.21875, |
|
"learning_rate": 1.49828728252277e-08, |
|
"loss": 0.1456, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 2.9978431433840766, |
|
"step": 2997, |
|
"total_flos": 1.9278929080237425e+18, |
|
"train_loss": 0.3424536967062735, |
|
"train_runtime": 31340.7737, |
|
"train_samples_per_second": 6.124, |
|
"train_steps_per_second": 0.096 |
|
} |
|
], |
|
"logging_steps": 50, |
|
"max_steps": 2997, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 100, |
|
"total_flos": 1.9278929080237425e+18, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|