|
{
  "best_metric": 0.34750062227249146,
  "best_model_checkpoint": "../../saves/LLaMA3-70B-qlora-bnb/lora/sft/A61K/checkpoint-100",
  "epoch": 2.9925925925925925,
  "eval_steps": 100,
  "global_step": 606,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.04938271604938271,
      "grad_norm": 15.011398315429688,
      "learning_rate": 1.1999999999999999e-05,
      "loss": 12.2879,
      "step": 10
    },
    {
      "epoch": 0.09876543209876543,
      "grad_norm": 17.572187423706055,
      "learning_rate": 3.5999999999999994e-05,
      "loss": 11.973,
      "step": 20
    },
    {
      "epoch": 0.14814814814814814,
      "grad_norm": 45.80781936645508,
      "learning_rate": 6.599999999999999e-05,
      "loss": 9.264,
      "step": 30
    },
    {
      "epoch": 0.19753086419753085,
      "grad_norm": 14.461894989013672,
      "learning_rate": 9.599999999999999e-05,
      "loss": 3.6258,
      "step": 40
    },
    {
      "epoch": 0.24691358024691357,
      "grad_norm": 10.816905975341797,
      "learning_rate": 0.00012599999999999997,
      "loss": 0.5542,
      "step": 50
    },
    {
      "epoch": 0.2962962962962963,
      "grad_norm": 4.389413356781006,
      "learning_rate": 0.000156,
      "loss": 0.4193,
      "step": 60
    },
    {
      "epoch": 0.345679012345679,
      "grad_norm": 6.623525619506836,
      "learning_rate": 0.000186,
      "loss": 0.5134,
      "step": 70
    },
    {
      "epoch": 0.3950617283950617,
      "grad_norm": 2.265923023223877,
      "learning_rate": 0.00021599999999999996,
      "loss": 0.4182,
      "step": 80
    },
    {
      "epoch": 0.4444444444444444,
      "grad_norm": 2.2424216270446777,
      "learning_rate": 0.00024599999999999996,
      "loss": 0.3526,
      "step": 90
    },
    {
      "epoch": 0.49382716049382713,
      "grad_norm": 4.1003804206848145,
      "learning_rate": 0.000276,
      "loss": 0.3573,
      "step": 100
    },
    {
      "epoch": 0.49382716049382713,
      "eval_loss": 0.34750062227249146,
      "eval_runtime": 641.1941,
      "eval_samples_per_second": 0.281,
      "eval_steps_per_second": 0.281,
      "step": 100
    },
    {
      "epoch": 0.5432098765432098,
      "grad_norm": 4.202010154724121,
      "learning_rate": 0.00029998843583216637,
      "loss": 0.3412,
      "step": 110
    },
    {
      "epoch": 0.5925925925925926,
      "grad_norm": 7.416792869567871,
      "learning_rate": 0.000299583877149169,
      "loss": 0.4443,
      "step": 120
    },
    {
      "epoch": 0.6419753086419753,
      "grad_norm": 4.8874101638793945,
      "learning_rate": 0.0002986028919054496,
      "loss": 0.3789,
      "step": 130
    },
    {
      "epoch": 0.691358024691358,
      "grad_norm": 17.96492576599121,
      "learning_rate": 0.0002970492603610264,
      "loss": 0.4358,
      "step": 140
    },
    {
      "epoch": 0.7407407407407407,
      "grad_norm": 15.250304222106934,
      "learning_rate": 0.0002949289694879236,
      "loss": 0.4073,
      "step": 150
    },
    {
      "epoch": 0.7901234567901234,
      "grad_norm": 2.9382853507995605,
      "learning_rate": 0.00029225018989917134,
      "loss": 0.3782,
      "step": 160
    },
    {
      "epoch": 0.8395061728395061,
      "grad_norm": 9.713154792785645,
      "learning_rate": 0.00028902324436306994,
      "loss": 0.3941,
      "step": 170
    },
    {
      "epoch": 0.8888888888888888,
      "grad_norm": 2.8005290031433105,
      "learning_rate": 0.00028526056802405104,
      "loss": 0.345,
      "step": 180
    },
    {
      "epoch": 0.9382716049382716,
      "grad_norm": 179.18141174316406,
      "learning_rate": 0.0002809766604834258,
      "loss": 0.5958,
      "step": 190
    },
    {
      "epoch": 0.9876543209876543,
      "grad_norm": 3.8531386852264404,
      "learning_rate": 0.0002761880299246772,
      "loss": 0.3841,
      "step": 200
    },
    {
      "epoch": 0.9876543209876543,
      "eval_loss": 0.3884469270706177,
      "eval_runtime": 641.004,
      "eval_samples_per_second": 0.281,
      "eval_steps_per_second": 0.281,
      "step": 200
    },
    {
      "epoch": 1.037037037037037,
      "grad_norm": 5.21042537689209,
      "learning_rate": 0.0002709131294986136,
      "loss": 0.38,
      "step": 210
    },
    {
      "epoch": 1.0864197530864197,
      "grad_norm": 2.059401035308838,
      "learning_rate": 0.0002651722862135245,
      "loss": 0.3495,
      "step": 220
    },
    {
      "epoch": 1.1358024691358024,
      "grad_norm": 9.867389678955078,
      "learning_rate": 0.00025898762260436153,
      "loss": 0.4585,
      "step": 230
    },
    {
      "epoch": 1.1851851851851851,
      "grad_norm": 82.30119323730469,
      "learning_rate": 0.0002523829714827981,
      "loss": 0.4752,
      "step": 240
    },
    {
      "epoch": 1.2345679012345678,
      "grad_norm": 3.674161672592163,
      "learning_rate": 0.000245383784096678,
      "loss": 0.4348,
      "step": 250
    },
    {
      "epoch": 1.2839506172839505,
      "grad_norm": 18.10048484802246,
      "learning_rate": 0.00023801703205276613,
      "loss": 0.4606,
      "step": 260
    },
    {
      "epoch": 1.3333333333333333,
      "grad_norm": 4.7857232093811035,
      "learning_rate": 0.00023031110338074388,
      "loss": 0.4063,
      "step": 270
    },
    {
      "epoch": 1.382716049382716,
      "grad_norm": 4.155650615692139,
      "learning_rate": 0.00022229569313897066,
      "loss": 0.4185,
      "step": 280
    },
    {
      "epoch": 1.4320987654320987,
      "grad_norm": 19.67816734313965,
      "learning_rate": 0.00021400168898356626,
      "loss": 0.4242,
      "step": 290
    },
    {
      "epoch": 1.4814814814814814,
      "grad_norm": 6.764120101928711,
      "learning_rate": 0.00020546105214177678,
      "loss": 0.3648,
      "step": 300
    },
    {
      "epoch": 1.4814814814814814,
      "eval_loss": 0.36776283383369446,
      "eval_runtime": 640.9052,
      "eval_samples_per_second": 0.281,
      "eval_steps_per_second": 0.281,
      "step": 300
    },
    {
      "epoch": 1.5308641975308643,
      "grad_norm": 1.9953991174697876,
      "learning_rate": 0.0001967066942482978,
      "loss": 0.3639,
      "step": 310
    },
    {
      "epoch": 1.5802469135802468,
      "grad_norm": 4.738420486450195,
      "learning_rate": 0.00018777235051917025,
      "loss": 0.3831,
      "step": 320
    },
    {
      "epoch": 1.6296296296296298,
      "grad_norm": 4.011212348937988,
      "learning_rate": 0.00017869244975197748,
      "loss": 0.3603,
      "step": 330
    },
    {
      "epoch": 1.6790123456790123,
      "grad_norm": 1.691756248474121,
      "learning_rate": 0.00016950198165330198,
      "loss": 0.3657,
      "step": 340
    },
    {
      "epoch": 1.7283950617283952,
      "grad_norm": 4.096136093139648,
      "learning_rate": 0.00016023636200470065,
      "loss": 0.3608,
      "step": 350
    },
    {
      "epoch": 1.7777777777777777,
      "grad_norm": 3.374934673309326,
      "learning_rate": 0.00015093129618678526,
      "loss": 0.3644,
      "step": 360
    },
    {
      "epoch": 1.8271604938271606,
      "grad_norm": 2.8539466857910156,
      "learning_rate": 0.0001416226415873234,
      "loss": 0.393,
      "step": 370
    },
    {
      "epoch": 1.876543209876543,
      "grad_norm": 6.184209823608398,
      "learning_rate": 0.00013234626942357447,
      "loss": 0.3979,
      "step": 380
    },
    {
      "epoch": 1.925925925925926,
      "grad_norm": 8.18782901763916,
      "learning_rate": 0.00012313792651133325,
      "loss": 0.5438,
      "step": 390
    },
    {
      "epoch": 1.9753086419753085,
      "grad_norm": 0.582004964351654,
      "learning_rate": 0.00011403309751335898,
      "loss": 0.3604,
      "step": 400
    },
    {
      "epoch": 1.9753086419753085,
      "eval_loss": 0.3583581745624542,
      "eval_runtime": 640.2927,
      "eval_samples_per_second": 0.281,
      "eval_steps_per_second": 0.281,
      "step": 400
    },
    {
      "epoch": 2.0246913580246915,
      "grad_norm": 3.920786142349243,
      "learning_rate": 0.00010506686819801978,
      "loss": 0.3926,
      "step": 410
    },
    {
      "epoch": 2.074074074074074,
      "grad_norm": 2.3339221477508545,
      "learning_rate": 9.627379023509041e-05,
      "loss": 0.3697,
      "step": 420
    },
    {
      "epoch": 2.123456790123457,
      "grad_norm": 0.7499105334281921,
      "learning_rate": 8.768774804971705e-05,
      "loss": 0.3496,
      "step": 430
    },
    {
      "epoch": 2.1728395061728394,
      "grad_norm": 0.7011772990226746,
      "learning_rate": 7.934182824763187e-05,
      "loss": 0.3602,
      "step": 440
    },
    {
      "epoch": 2.2222222222222223,
      "grad_norm": 0.9641762375831604,
      "learning_rate": 7.126819211479209e-05,
      "loss": 0.3549,
      "step": 450
    },
    {
      "epoch": 2.271604938271605,
      "grad_norm": 0.6399113535881042,
      "learning_rate": 6.349795168276994e-05,
      "loss": 0.3675,
      "step": 460
    },
    {
      "epoch": 2.3209876543209877,
      "grad_norm": 0.7869608998298645,
      "learning_rate": 5.6061049837480616e-05,
      "loss": 0.352,
      "step": 470
    },
    {
      "epoch": 2.3703703703703702,
      "grad_norm": 0.6829902529716492,
      "learning_rate": 4.898614493325209e-05,
      "loss": 0.3455,
      "step": 480
    },
    {
      "epoch": 2.419753086419753,
      "grad_norm": 1.2732656002044678,
      "learning_rate": 4.2300500356881895e-05,
      "loss": 0.3496,
      "step": 490
    },
    {
      "epoch": 2.4691358024691357,
      "grad_norm": 0.9332154989242554,
      "learning_rate": 3.602987946724803e-05,
      "loss": 0.3435,
      "step": 500
    },
    {
      "epoch": 2.4691358024691357,
      "eval_loss": 0.3569886386394501,
      "eval_runtime": 640.5499,
      "eval_samples_per_second": 0.281,
      "eval_steps_per_second": 0.281,
      "step": 500
    },
    {
      "epoch": 2.5185185185185186,
      "grad_norm": 1.809404969215393,
      "learning_rate": 3.0198446315329134e-05,
      "loss": 0.3585,
      "step": 510
    },
    {
      "epoch": 2.567901234567901,
      "grad_norm": 0.32072120904922485,
      "learning_rate": 2.482867252721145e-05,
      "loss": 0.346,
      "step": 520
    },
    {
      "epoch": 2.617283950617284,
      "grad_norm": 1.431264042854309,
      "learning_rate": 1.9941250708913388e-05,
      "loss": 0.3658,
      "step": 530
    },
    {
      "epoch": 2.6666666666666665,
      "grad_norm": 2.0725672245025635,
      "learning_rate": 1.5555014706723723e-05,
      "loss": 0.3526,
      "step": 540
    },
    {
      "epoch": 2.7160493827160495,
      "grad_norm": 1.5036245584487915,
      "learning_rate": 1.1686867030334379e-05,
      "loss": 0.3455,
      "step": 550
    },
    {
      "epoch": 2.765432098765432,
      "grad_norm": 0.9321132898330688,
      "learning_rate": 8.351713718443865e-06,
      "loss": 0.3625,
      "step": 560
    },
    {
      "epoch": 2.814814814814815,
      "grad_norm": 0.8139396905899048,
      "learning_rate": 5.56240689783013e-06,
      "loss": 0.3428,
      "step": 570
    },
    {
      "epoch": 2.8641975308641974,
      "grad_norm": 0.6848800778388977,
      "learning_rate": 3.3296952572425205e-06,
      "loss": 0.3431,
      "step": 580
    },
    {
      "epoch": 2.9135802469135803,
      "grad_norm": 1.2402377128601074,
      "learning_rate": 1.6621826269641315e-06,
      "loss": 0.3438,
      "step": 590
    },
    {
      "epoch": 2.962962962962963,
      "grad_norm": 0.6294095516204834,
      "learning_rate": 5.662948236587972e-07,
      "loss": 0.3379,
      "step": 600
    },
    {
      "epoch": 2.962962962962963,
      "eval_loss": 0.34963592886924744,
      "eval_runtime": 640.5576,
      "eval_samples_per_second": 0.281,
      "eval_steps_per_second": 0.281,
      "step": 600
    },
    {
      "epoch": 2.9925925925925925,
      "step": 606,
      "total_flos": 1.4871334589840622e+19,
      "train_loss": 0.9772530333830578,
      "train_runtime": 45545.0998,
      "train_samples_per_second": 0.107,
      "train_steps_per_second": 0.013
    }
  ],
  "logging_steps": 10,
  "max_steps": 606,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 100,
  "total_flos": 1.4871334589840622e+19,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}
|
|