{ "best_metric": null, "best_model_checkpoint": null, "epoch": 9.963369963369964, "eval_steps": 500, "global_step": 680, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.14652014652014653, "grad_norm": 0.301513671875, "learning_rate": 0.00019989329748023725, "loss": 0.8924, "step": 10 }, { "epoch": 0.29304029304029305, "grad_norm": 0.32275390625, "learning_rate": 0.00019957341762950344, "loss": 0.8199, "step": 20 }, { "epoch": 0.43956043956043955, "grad_norm": 0.353515625, "learning_rate": 0.0001990410430875205, "loss": 0.8061, "step": 30 }, { "epoch": 0.5860805860805861, "grad_norm": 0.445068359375, "learning_rate": 0.0001982973099683902, "loss": 0.8483, "step": 40 }, { "epoch": 0.7326007326007326, "grad_norm": 0.396728515625, "learning_rate": 0.0001973438054360693, "loss": 0.8471, "step": 50 }, { "epoch": 0.8791208791208791, "grad_norm": 0.35693359375, "learning_rate": 0.00019618256431728194, "loss": 0.8433, "step": 60 }, { "epoch": 1.0256410256410255, "grad_norm": 0.360595703125, "learning_rate": 0.0001948160647590966, "loss": 0.8381, "step": 70 }, { "epoch": 1.1721611721611722, "grad_norm": 0.37109375, "learning_rate": 0.00019324722294043558, "loss": 0.7955, "step": 80 }, { "epoch": 1.3186813186813187, "grad_norm": 0.388671875, "learning_rate": 0.0001914793868488021, "loss": 0.8182, "step": 90 }, { "epoch": 1.4652014652014653, "grad_norm": 0.45458984375, "learning_rate": 0.00018951632913550626, "loss": 0.7914, "step": 100 }, { "epoch": 1.6117216117216118, "grad_norm": 0.51025390625, "learning_rate": 0.00018736223906463696, "loss": 0.8114, "step": 110 }, { "epoch": 1.7582417582417582, "grad_norm": 0.52001953125, "learning_rate": 0.00018502171357296144, "loss": 0.8434, "step": 120 }, { "epoch": 1.9047619047619047, "grad_norm": 0.433349609375, "learning_rate": 0.00018249974745983023, "loss": 0.7869, "step": 130 }, { "epoch": 2.051282051282051, "grad_norm": 0.62255859375, "learning_rate": 0.000179801722728024, "loss": 0.7767, "step": 140 }, { "epoch": 2.197802197802198, "grad_norm": 0.469970703125, "learning_rate": 0.00017693339709828792, "loss": 0.7711, "step": 150 }, { "epoch": 2.3443223443223444, "grad_norm": 0.447998046875, "learning_rate": 0.00017390089172206592, "loss": 0.7823, "step": 160 }, { "epoch": 2.490842490842491, "grad_norm": 0.41015625, "learning_rate": 0.00017071067811865476, "loss": 0.7747, "step": 170 }, { "epoch": 2.6373626373626373, "grad_norm": 0.466796875, "learning_rate": 0.00016736956436465573, "loss": 0.7877, "step": 180 }, { "epoch": 2.7838827838827838, "grad_norm": 0.48046875, "learning_rate": 0.00016388468056519612, "loss": 0.7778, "step": 190 }, { "epoch": 2.9304029304029307, "grad_norm": 0.47998046875, "learning_rate": 0.00016026346363792567, "loss": 0.754, "step": 200 }, { "epoch": 3.076923076923077, "grad_norm": 0.50244140625, "learning_rate": 0.0001565136414422592, "loss": 0.8089, "step": 210 }, { "epoch": 3.2234432234432235, "grad_norm": 0.475830078125, "learning_rate": 0.0001526432162877356, "loss": 0.75, "step": 220 }, { "epoch": 3.36996336996337, "grad_norm": 0.464111328125, "learning_rate": 0.00014866044785668563, "loss": 0.762, "step": 230 }, { "epoch": 3.5164835164835164, "grad_norm": 0.61474609375, "learning_rate": 0.00014457383557765386, "loss": 0.7423, "step": 240 }, { "epoch": 3.663003663003663, "grad_norm": 0.4873046875, "learning_rate": 0.00014039210048718949, "loss": 0.764, "step": 250 }, { "epoch": 3.8095238095238093, "grad_norm": 0.5546875, "learning_rate": 0.00013612416661871533, "loss": 0.7422, "step": 260 }, { "epoch": 3.956043956043956, "grad_norm": 0.473388671875, "learning_rate": 0.00013177914195819016, "loss": 0.7702, "step": 270 }, { "epoch": 4.102564102564102, "grad_norm": 0.476318359375, "learning_rate": 0.0001273662990072083, "loss": 0.7237, "step": 280 }, { "epoch": 4.249084249084249, "grad_norm": 0.471923828125, "learning_rate": 0.0001228950549950134, "loss": 0.7334, "step": 290 }, { "epoch": 4.395604395604396, "grad_norm": 0.50537109375, "learning_rate": 0.00011837495178165706, "loss": 0.7274, "step": 300 }, { "epoch": 4.542124542124542, "grad_norm": 0.5, "learning_rate": 0.00011381563549518823, "loss": 0.7142, "step": 310 }, { "epoch": 4.688644688644689, "grad_norm": 0.48779296875, "learning_rate": 0.00010922683594633021, "loss": 0.7166, "step": 320 }, { "epoch": 4.835164835164835, "grad_norm": 0.513671875, "learning_rate": 0.00010461834586457398, "loss": 0.76, "step": 330 }, { "epoch": 4.981684981684982, "grad_norm": 0.5048828125, "learning_rate": 0.0001, "loss": 0.7571, "step": 340 }, { "epoch": 5.128205128205128, "grad_norm": 0.5166015625, "learning_rate": 9.538165413542607e-05, "loss": 0.7046, "step": 350 }, { "epoch": 5.274725274725275, "grad_norm": 0.51416015625, "learning_rate": 9.077316405366981e-05, "loss": 0.6916, "step": 360 }, { "epoch": 5.4212454212454215, "grad_norm": 0.61279296875, "learning_rate": 8.61843645048118e-05, "loss": 0.7281, "step": 370 }, { "epoch": 5.5677655677655675, "grad_norm": 0.54736328125, "learning_rate": 8.162504821834295e-05, "loss": 0.6963, "step": 380 }, { "epoch": 5.714285714285714, "grad_norm": 0.6103515625, "learning_rate": 7.710494500498662e-05, "loss": 0.7323, "step": 390 }, { "epoch": 5.860805860805861, "grad_norm": 0.60595703125, "learning_rate": 7.263370099279172e-05, "loss": 0.7342, "step": 400 }, { "epoch": 6.007326007326007, "grad_norm": 0.55078125, "learning_rate": 6.822085804180984e-05, "loss": 0.7018, "step": 410 }, { "epoch": 6.153846153846154, "grad_norm": 0.55078125, "learning_rate": 6.387583338128471e-05, "loss": 0.6998, "step": 420 }, { "epoch": 6.3003663003663, "grad_norm": 0.591796875, "learning_rate": 5.960789951281052e-05, "loss": 0.6639, "step": 430 }, { "epoch": 6.446886446886447, "grad_norm": 0.60107421875, "learning_rate": 5.542616442234618e-05, "loss": 0.7004, "step": 440 }, { "epoch": 6.593406593406593, "grad_norm": 0.55810546875, "learning_rate": 5.1339552143314384e-05, "loss": 0.7155, "step": 450 }, { "epoch": 6.73992673992674, "grad_norm": 0.609375, "learning_rate": 4.735678371226441e-05, "loss": 0.7091, "step": 460 }, { "epoch": 6.886446886446887, "grad_norm": 0.53662109375, "learning_rate": 4.3486358557740814e-05, "loss": 0.6888, "step": 470 }, { "epoch": 7.032967032967033, "grad_norm": 0.5029296875, "learning_rate": 3.973653636207437e-05, "loss": 0.6888, "step": 480 }, { "epoch": 7.17948717948718, "grad_norm": 0.53662109375, "learning_rate": 3.6115319434803894e-05, "loss": 0.6845, "step": 490 }, { "epoch": 7.326007326007326, "grad_norm": 0.59130859375, "learning_rate": 3.263043563534428e-05, "loss": 0.6731, "step": 500 }, { "epoch": 7.472527472527473, "grad_norm": 0.564453125, "learning_rate": 2.9289321881345254e-05, "loss": 0.6708, "step": 510 }, { "epoch": 7.619047619047619, "grad_norm": 0.62158203125, "learning_rate": 2.6099108277934103e-05, "loss": 0.6906, "step": 520 }, { "epoch": 7.7655677655677655, "grad_norm": 0.572265625, "learning_rate": 2.3066602901712108e-05, "loss": 0.6841, "step": 530 }, { "epoch": 7.912087912087912, "grad_norm": 0.5771484375, "learning_rate": 2.0198277271976052e-05, "loss": 0.6664, "step": 540 }, { "epoch": 8.058608058608058, "grad_norm": 0.587890625, "learning_rate": 1.750025254016978e-05, "loss": 0.6899, "step": 550 }, { "epoch": 8.205128205128204, "grad_norm": 0.55810546875, "learning_rate": 1.4978286427038601e-05, "loss": 0.6723, "step": 560 }, { "epoch": 8.351648351648352, "grad_norm": 0.560546875, "learning_rate": 1.2637760935363053e-05, "loss": 0.6494, "step": 570 }, { "epoch": 8.498168498168498, "grad_norm": 0.568359375, "learning_rate": 1.0483670864493778e-05, "loss": 0.6958, "step": 580 }, { "epoch": 8.644688644688644, "grad_norm": 0.5947265625, "learning_rate": 8.520613151197898e-06, "loss": 0.6637, "step": 590 }, { "epoch": 8.791208791208792, "grad_norm": 0.5556640625, "learning_rate": 6.75277705956443e-06, "loss": 0.6876, "step": 600 }, { "epoch": 8.937728937728938, "grad_norm": 0.60791015625, "learning_rate": 5.183935240903414e-06, "loss": 0.6656, "step": 610 }, { "epoch": 9.084249084249084, "grad_norm": 0.51416015625, "learning_rate": 3.817435682718096e-06, "loss": 0.6786, "step": 620 }, { "epoch": 9.23076923076923, "grad_norm": 0.56689453125, "learning_rate": 2.656194563930714e-06, "loss": 0.668, "step": 630 }, { "epoch": 9.377289377289378, "grad_norm": 0.5615234375, "learning_rate": 1.7026900316098215e-06, "loss": 0.6681, "step": 640 }, { "epoch": 9.523809523809524, "grad_norm": 0.587890625, "learning_rate": 9.589569124794916e-07, "loss": 0.6496, "step": 650 }, { "epoch": 9.67032967032967, "grad_norm": 0.57373046875, "learning_rate": 4.2658237049655323e-07, "loss": 0.6891, "step": 660 }, { "epoch": 9.816849816849818, "grad_norm": 0.572265625, "learning_rate": 1.0670251976275803e-07, "loss": 0.6689, "step": 670 }, { "epoch": 9.963369963369964, "grad_norm": 0.57470703125, "learning_rate": 0.0, "loss": 0.6729, "step": 680 }, { "epoch": 9.963369963369964, "step": 680, "total_flos": 3.318222409433088e+16, "train_loss": 0.735039118458243, "train_runtime": 593.4308, "train_samples_per_second": 4.6, "train_steps_per_second": 1.146 } ], "logging_steps": 10, "max_steps": 680, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "total_flos": 3.318222409433088e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }