|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.3480985118788617, |
|
"eval_steps": 500, |
|
"global_step": 500, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0006961970237577234, |
|
"grad_norm": 74.93397158603199, |
|
"learning_rate": 6.944444444444445e-07, |
|
"loss": 2.6763, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.006961970237577234, |
|
"grad_norm": 42.51151469703164, |
|
"learning_rate": 6.944444444444445e-06, |
|
"loss": 2.5014, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.013923940475154469, |
|
"grad_norm": 3.9060174491324595, |
|
"learning_rate": 1.388888888888889e-05, |
|
"loss": 2.1436, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.020885910712731704, |
|
"grad_norm": 2.4653356578163312, |
|
"learning_rate": 2.0833333333333336e-05, |
|
"loss": 2.1299, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.027847880950308938, |
|
"grad_norm": 1.9758176342444926, |
|
"learning_rate": 2.777777777777778e-05, |
|
"loss": 2.0629, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.034809851187886175, |
|
"grad_norm": 2.24304363639927, |
|
"learning_rate": 3.472222222222222e-05, |
|
"loss": 2.0022, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.04177182142546341, |
|
"grad_norm": 2.137058600643756, |
|
"learning_rate": 4.166666666666667e-05, |
|
"loss": 1.9324, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.04873379166304064, |
|
"grad_norm": 1.8905774575553251, |
|
"learning_rate": 4.8611111111111115e-05, |
|
"loss": 1.8735, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.055695761900617875, |
|
"grad_norm": 2.277356627020573, |
|
"learning_rate": 5.555555555555556e-05, |
|
"loss": 1.8459, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.06265773213819512, |
|
"grad_norm": 2.1850479551940736, |
|
"learning_rate": 6.25e-05, |
|
"loss": 1.8657, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.06961970237577235, |
|
"grad_norm": 2.2821298235140066, |
|
"learning_rate": 6.944444444444444e-05, |
|
"loss": 1.8105, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.07658167261334958, |
|
"grad_norm": 1.7409612334315907, |
|
"learning_rate": 7.638888888888889e-05, |
|
"loss": 1.7874, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.08354364285092682, |
|
"grad_norm": 1.9712592588798674, |
|
"learning_rate": 8.333333333333334e-05, |
|
"loss": 1.7892, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.09050561308850405, |
|
"grad_norm": 2.509099517180512, |
|
"learning_rate": 9.027777777777779e-05, |
|
"loss": 1.7808, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.09746758332608128, |
|
"grad_norm": 2.0842685380596038, |
|
"learning_rate": 9.722222222222223e-05, |
|
"loss": 1.803, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.10442955356365852, |
|
"grad_norm": 1.8429004845744987, |
|
"learning_rate": 0.00010416666666666667, |
|
"loss": 1.7588, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.11139152380123575, |
|
"grad_norm": 2.8327524674330524, |
|
"learning_rate": 0.00011111111111111112, |
|
"loss": 1.7772, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.11835349403881298, |
|
"grad_norm": 2.0729891765954154, |
|
"learning_rate": 0.00011805555555555556, |
|
"loss": 1.7641, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.12531546427639023, |
|
"grad_norm": 7.58810403295457, |
|
"learning_rate": 0.000125, |
|
"loss": 1.833, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.13227743451396745, |
|
"grad_norm": 19.362231963576104, |
|
"learning_rate": 0.00013194444444444446, |
|
"loss": 1.7874, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.1392394047515447, |
|
"grad_norm": 3.96368325072897, |
|
"learning_rate": 0.0001388888888888889, |
|
"loss": 1.846, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.14620137498912192, |
|
"grad_norm": 2.7386120910604945, |
|
"learning_rate": 0.00014583333333333335, |
|
"loss": 1.8197, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.15316334522669917, |
|
"grad_norm": 2.110144128803464, |
|
"learning_rate": 0.00015277777777777777, |
|
"loss": 1.7561, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.16012531546427639, |
|
"grad_norm": 2.5725492351480352, |
|
"learning_rate": 0.00015972222222222223, |
|
"loss": 1.7749, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.16708728570185363, |
|
"grad_norm": 1.121359657360383, |
|
"learning_rate": 0.0001666666666666667, |
|
"loss": 1.7614, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.17404925593943085, |
|
"grad_norm": 2.155597643370339, |
|
"learning_rate": 0.00017361111111111112, |
|
"loss": 1.7587, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.1810112261770081, |
|
"grad_norm": 2.5773482568660393, |
|
"learning_rate": 0.00018055555555555557, |
|
"loss": 1.7934, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.18797319641458532, |
|
"grad_norm": 1.6740595013333657, |
|
"learning_rate": 0.0001875, |
|
"loss": 1.7504, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.19493516665216257, |
|
"grad_norm": 2.2858392200215696, |
|
"learning_rate": 0.00019444444444444446, |
|
"loss": 1.7499, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.2018971368897398, |
|
"grad_norm": 1.7434751943536266, |
|
"learning_rate": 0.0001999999336897035, |
|
"loss": 1.7543, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.20885910712731703, |
|
"grad_norm": 1.3619486244331678, |
|
"learning_rate": 0.00019999761283856016, |
|
"loss": 1.7123, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.21582107736489425, |
|
"grad_norm": 1.6373278228861707, |
|
"learning_rate": 0.00019999197656053288, |
|
"loss": 1.7195, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.2227830476024715, |
|
"grad_norm": 1.281241393641898, |
|
"learning_rate": 0.00019998302504249278, |
|
"loss": 1.7336, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.22974501784004872, |
|
"grad_norm": 1.4989445469875955, |
|
"learning_rate": 0.000199970758581228, |
|
"loss": 1.7219, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.23670698807762597, |
|
"grad_norm": 1.321518251513136, |
|
"learning_rate": 0.00019995517758343386, |
|
"loss": 1.7302, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.24366895831520322, |
|
"grad_norm": 1.2671299687318527, |
|
"learning_rate": 0.0001999362825656992, |
|
"loss": 1.7159, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.25063092855278046, |
|
"grad_norm": 1.0577637068118013, |
|
"learning_rate": 0.00019991407415448947, |
|
"loss": 1.7203, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.2575928987903577, |
|
"grad_norm": 1.134207084144878, |
|
"learning_rate": 0.00019988855308612595, |
|
"loss": 1.7114, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.2645548690279349, |
|
"grad_norm": 1.2266368964891576, |
|
"learning_rate": 0.00019985972020676116, |
|
"loss": 1.7238, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.2715168392655121, |
|
"grad_norm": 1.1990301126734546, |
|
"learning_rate": 0.00019982757647235094, |
|
"loss": 1.704, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.2784788095030894, |
|
"grad_norm": 0.7719332777254844, |
|
"learning_rate": 0.0001997921229486228, |
|
"loss": 1.7127, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.2854407797406666, |
|
"grad_norm": 0.9054141958817113, |
|
"learning_rate": 0.00019975336081104038, |
|
"loss": 1.6892, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.29240274997824384, |
|
"grad_norm": 0.7201089451408423, |
|
"learning_rate": 0.00019971129134476473, |
|
"loss": 1.689, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.29936472021582106, |
|
"grad_norm": 0.6596449276890678, |
|
"learning_rate": 0.00019966591594461157, |
|
"loss": 1.7046, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.30632669045339833, |
|
"grad_norm": 0.9894358407915632, |
|
"learning_rate": 0.000199617236115005, |
|
"loss": 1.7063, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.31328866069097555, |
|
"grad_norm": 0.6354863462129365, |
|
"learning_rate": 0.00019956525346992768, |
|
"loss": 1.6896, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.32025063092855277, |
|
"grad_norm": 1.004907546559211, |
|
"learning_rate": 0.0001995099697328674, |
|
"loss": 1.6662, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.32721260116613, |
|
"grad_norm": 0.5536272096656565, |
|
"learning_rate": 0.00019945138673675973, |
|
"loss": 1.7043, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.33417457140370727, |
|
"grad_norm": 1.0951977004377467, |
|
"learning_rate": 0.00019938950642392746, |
|
"loss": 1.6751, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.3411365416412845, |
|
"grad_norm": 2.3040787232248343, |
|
"learning_rate": 0.00019932433084601613, |
|
"loss": 1.7194, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.3480985118788617, |
|
"grad_norm": 1.484568533076407, |
|
"learning_rate": 0.00019925586216392596, |
|
"loss": 1.6743, |
|
"step": 500 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 5744, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 4, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 418654621532160.0, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|