|
{ |
|
"best_metric": 0.26116234064102173, |
|
"best_model_checkpoint": "outputs/checkpoint-108", |
|
"epoch": 5.0, |
|
"eval_steps": 6, |
|
"global_step": 110, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.045454545454545456, |
|
"grad_norm": 12.375, |
|
"learning_rate": 4e-05, |
|
"loss": 8.7425, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.09090909090909091, |
|
"grad_norm": 17.125, |
|
"learning_rate": 8e-05, |
|
"loss": 8.6536, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.13636363636363635, |
|
"grad_norm": 12.0625, |
|
"learning_rate": 0.00012, |
|
"loss": 8.602, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.18181818181818182, |
|
"grad_norm": 8.8125, |
|
"learning_rate": 0.00016, |
|
"loss": 8.3064, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.22727272727272727, |
|
"grad_norm": 10.125, |
|
"learning_rate": 0.0002, |
|
"loss": 7.7739, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.2727272727272727, |
|
"grad_norm": 7.25, |
|
"learning_rate": 0.00024, |
|
"loss": 7.1622, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.2727272727272727, |
|
"eval_loss": 6.408970832824707, |
|
"eval_runtime": 1.4799, |
|
"eval_samples_per_second": 95.274, |
|
"eval_steps_per_second": 12.163, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.3181818181818182, |
|
"grad_norm": 6.78125, |
|
"learning_rate": 0.00028000000000000003, |
|
"loss": 6.4402, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.36363636363636365, |
|
"grad_norm": 10.1875, |
|
"learning_rate": 0.00032, |
|
"loss": 5.616, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.4090909090909091, |
|
"grad_norm": 4.4375, |
|
"learning_rate": 0.00035999999999999997, |
|
"loss": 4.8702, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.45454545454545453, |
|
"grad_norm": 3.765625, |
|
"learning_rate": 0.0004, |
|
"loss": 4.2606, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 4.6875, |
|
"learning_rate": 0.00044, |
|
"loss": 3.7622, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.5454545454545454, |
|
"grad_norm": 5.3125, |
|
"learning_rate": 0.00048, |
|
"loss": 3.5397, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.5454545454545454, |
|
"eval_loss": 2.9970808029174805, |
|
"eval_runtime": 1.4818, |
|
"eval_samples_per_second": 95.155, |
|
"eval_steps_per_second": 12.147, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.5909090909090909, |
|
"grad_norm": 3.390625, |
|
"learning_rate": 0.0005200000000000001, |
|
"loss": 2.9788, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.6363636363636364, |
|
"grad_norm": 3.1875, |
|
"learning_rate": 0.0005600000000000001, |
|
"loss": 2.5801, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.6818181818181818, |
|
"grad_norm": 2.578125, |
|
"learning_rate": 0.0006, |
|
"loss": 2.3461, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.7272727272727273, |
|
"grad_norm": 1.6171875, |
|
"learning_rate": 0.00064, |
|
"loss": 2.0174, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.7727272727272727, |
|
"grad_norm": 1.9609375, |
|
"learning_rate": 0.00068, |
|
"loss": 1.889, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.8181818181818182, |
|
"grad_norm": 1.5625, |
|
"learning_rate": 0.0007199999999999999, |
|
"loss": 1.7954, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.8181818181818182, |
|
"eval_loss": 1.589725136756897, |
|
"eval_runtime": 1.4857, |
|
"eval_samples_per_second": 94.907, |
|
"eval_steps_per_second": 12.116, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.8636363636363636, |
|
"grad_norm": 2.140625, |
|
"learning_rate": 0.00076, |
|
"loss": 1.6871, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.9090909090909091, |
|
"grad_norm": 2.28125, |
|
"learning_rate": 0.0008, |
|
"loss": 1.5787, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.9545454545454546, |
|
"grad_norm": 1.703125, |
|
"learning_rate": 0.00084, |
|
"loss": 1.4167, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 1.984375, |
|
"learning_rate": 0.00088, |
|
"loss": 1.3882, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 1.0454545454545454, |
|
"grad_norm": 1.3671875, |
|
"learning_rate": 0.00092, |
|
"loss": 1.2626, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 1.0909090909090908, |
|
"grad_norm": 1.140625, |
|
"learning_rate": 0.00096, |
|
"loss": 1.1743, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 1.0909090909090908, |
|
"eval_loss": 1.088915467262268, |
|
"eval_runtime": 1.4822, |
|
"eval_samples_per_second": 95.126, |
|
"eval_steps_per_second": 12.144, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 1.1363636363636362, |
|
"grad_norm": 1.2734375, |
|
"learning_rate": 0.001, |
|
"loss": 1.1278, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 1.1818181818181819, |
|
"grad_norm": 1.3515625, |
|
"learning_rate": 0.0009996585300715115, |
|
"loss": 1.1023, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 1.2272727272727273, |
|
"grad_norm": 1.3359375, |
|
"learning_rate": 0.0009986345866928941, |
|
"loss": 1.0403, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 1.2727272727272727, |
|
"grad_norm": 1.40625, |
|
"learning_rate": 0.000996929568447637, |
|
"loss": 1.0496, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 1.3181818181818181, |
|
"grad_norm": 1.1796875, |
|
"learning_rate": 0.000994545804185573, |
|
"loss": 0.8593, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 1.3636363636363638, |
|
"grad_norm": 0.94140625, |
|
"learning_rate": 0.000991486549841951, |
|
"loss": 0.9413, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 1.3636363636363638, |
|
"eval_loss": 0.7773878574371338, |
|
"eval_runtime": 1.491, |
|
"eval_samples_per_second": 94.566, |
|
"eval_steps_per_second": 12.072, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 1.4090909090909092, |
|
"grad_norm": 1.15625, |
|
"learning_rate": 0.0009877559839902184, |
|
"loss": 0.7758, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 1.4545454545454546, |
|
"grad_norm": 0.93359375, |
|
"learning_rate": 0.0009833592021345938, |
|
"loss": 0.8344, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 1.5, |
|
"grad_norm": 0.99609375, |
|
"learning_rate": 0.0009783022097502204, |
|
"loss": 0.6183, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 1.5454545454545454, |
|
"grad_norm": 0.8046875, |
|
"learning_rate": 0.0009725919140804099, |
|
"loss": 0.7497, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 1.5909090909090908, |
|
"grad_norm": 0.9140625, |
|
"learning_rate": 0.0009662361147021779, |
|
"loss": 0.7042, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 1.6363636363636362, |
|
"grad_norm": 0.95703125, |
|
"learning_rate": 0.0009592434928729616, |
|
"loss": 0.7236, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 1.6363636363636362, |
|
"eval_loss": 0.6186583042144775, |
|
"eval_runtime": 1.4853, |
|
"eval_samples_per_second": 94.931, |
|
"eval_steps_per_second": 12.119, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 1.6818181818181817, |
|
"grad_norm": 0.84375, |
|
"learning_rate": 0.0009516235996730644, |
|
"loss": 0.6119, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 1.7272727272727273, |
|
"grad_norm": 0.87109375, |
|
"learning_rate": 0.0009433868429600309, |
|
"loss": 0.606, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 1.7727272727272727, |
|
"grad_norm": 17.625, |
|
"learning_rate": 0.0009345444731527642, |
|
"loss": 0.6787, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 1.8181818181818183, |
|
"grad_norm": 0.859375, |
|
"learning_rate": 0.0009251085678648072, |
|
"loss": 0.6607, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 1.8636363636363638, |
|
"grad_norm": 0.84765625, |
|
"learning_rate": 0.0009150920154077753, |
|
"loss": 0.6514, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 1.9090909090909092, |
|
"grad_norm": 0.64453125, |
|
"learning_rate": 0.0009045084971874737, |
|
"loss": 0.6084, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 1.9090909090909092, |
|
"eval_loss": 0.555232048034668, |
|
"eval_runtime": 1.4852, |
|
"eval_samples_per_second": 94.934, |
|
"eval_steps_per_second": 12.119, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 1.9545454545454546, |
|
"grad_norm": 1.078125, |
|
"learning_rate": 0.0008933724690167416, |
|
"loss": 0.5991, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 0.83984375, |
|
"learning_rate": 0.0008816991413705516, |
|
"loss": 0.6085, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 2.0454545454545454, |
|
"grad_norm": 0.92578125, |
|
"learning_rate": 0.0008695044586103295, |
|
"loss": 0.4946, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 2.090909090909091, |
|
"grad_norm": 1.0703125, |
|
"learning_rate": 0.0008568050772058762, |
|
"loss": 0.4987, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 2.1363636363636362, |
|
"grad_norm": 0.6953125, |
|
"learning_rate": 0.0008436183429846313, |
|
"loss": 0.4656, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 2.1818181818181817, |
|
"grad_norm": 0.98046875, |
|
"learning_rate": 0.0008299622674393614, |
|
"loss": 0.5454, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 2.1818181818181817, |
|
"eval_loss": 0.4745166301727295, |
|
"eval_runtime": 1.4854, |
|
"eval_samples_per_second": 94.921, |
|
"eval_steps_per_second": 12.118, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 2.227272727272727, |
|
"grad_norm": 0.71484375, |
|
"learning_rate": 0.0008158555031266255, |
|
"loss": 0.4058, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 2.2727272727272725, |
|
"grad_norm": 0.84375, |
|
"learning_rate": 0.0008013173181896282, |
|
"loss": 0.5267, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 2.3181818181818183, |
|
"grad_norm": 0.87109375, |
|
"learning_rate": 0.0007863675700402526, |
|
"loss": 0.524, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 2.3636363636363638, |
|
"grad_norm": 0.73046875, |
|
"learning_rate": 0.0007710266782362247, |
|
"loss": 0.5331, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 2.409090909090909, |
|
"grad_norm": 0.79296875, |
|
"learning_rate": 0.0007553155965904535, |
|
"loss": 0.4235, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 2.4545454545454546, |
|
"grad_norm": 0.828125, |
|
"learning_rate": 0.0007392557845506433, |
|
"loss": 0.5147, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 2.4545454545454546, |
|
"eval_loss": 0.437049001455307, |
|
"eval_runtime": 1.4804, |
|
"eval_samples_per_second": 95.243, |
|
"eval_steps_per_second": 12.159, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 2.5, |
|
"grad_norm": 0.640625, |
|
"learning_rate": 0.0007228691778882692, |
|
"loss": 0.4376, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 2.5454545454545454, |
|
"grad_norm": 0.71875, |
|
"learning_rate": 0.0007061781587369518, |
|
"loss": 0.4396, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 2.590909090909091, |
|
"grad_norm": 0.81640625, |
|
"learning_rate": 0.0006892055250211552, |
|
"loss": 0.4257, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 2.6363636363636362, |
|
"grad_norm": 0.66015625, |
|
"learning_rate": 0.0006719744593169641, |
|
"loss": 0.4447, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 2.6818181818181817, |
|
"grad_norm": 0.75, |
|
"learning_rate": 0.0006545084971874737, |
|
"loss": 0.4591, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 2.7272727272727275, |
|
"grad_norm": 0.7890625, |
|
"learning_rate": 0.0006368314950360416, |
|
"loss": 0.4645, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 2.7272727272727275, |
|
"eval_loss": 0.3943060338497162, |
|
"eval_runtime": 1.4805, |
|
"eval_samples_per_second": 95.235, |
|
"eval_steps_per_second": 12.158, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 2.7727272727272725, |
|
"grad_norm": 0.64453125, |
|
"learning_rate": 0.0006189675975213093, |
|
"loss": 0.4733, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 2.8181818181818183, |
|
"grad_norm": 0.64453125, |
|
"learning_rate": 0.0006009412045785051, |
|
"loss": 0.4227, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 2.8636363636363638, |
|
"grad_norm": 0.7265625, |
|
"learning_rate": 0.000582776938092065, |
|
"loss": 0.485, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 2.909090909090909, |
|
"grad_norm": 0.58984375, |
|
"learning_rate": 0.0005644996082651017, |
|
"loss": 0.4154, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 2.9545454545454546, |
|
"grad_norm": 0.625, |
|
"learning_rate": 0.000546134179731651, |
|
"loss": 0.4602, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"grad_norm": 0.6640625, |
|
"learning_rate": 0.000527705737457985, |
|
"loss": 0.4371, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"eval_loss": 0.35816648602485657, |
|
"eval_runtime": 1.4795, |
|
"eval_samples_per_second": 95.3, |
|
"eval_steps_per_second": 12.166, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 3.0454545454545454, |
|
"grad_norm": 0.59375, |
|
"learning_rate": 0.000509239452479565, |
|
"loss": 0.3674, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 3.090909090909091, |
|
"grad_norm": 0.578125, |
|
"learning_rate": 0.0004907605475204352, |
|
"loss": 0.3405, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 3.1363636363636362, |
|
"grad_norm": 0.4921875, |
|
"learning_rate": 0.00047229426254201504, |
|
"loss": 0.3669, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 3.1818181818181817, |
|
"grad_norm": 0.61328125, |
|
"learning_rate": 0.00045386582026834903, |
|
"loss": 0.3333, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 3.227272727272727, |
|
"grad_norm": 0.5234375, |
|
"learning_rate": 0.0004355003917348985, |
|
"loss": 0.3032, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 3.2727272727272725, |
|
"grad_norm": 0.66015625, |
|
"learning_rate": 0.000417223061907935, |
|
"loss": 0.3557, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 3.2727272727272725, |
|
"eval_loss": 0.3237670361995697, |
|
"eval_runtime": 1.4942, |
|
"eval_samples_per_second": 94.367, |
|
"eval_steps_per_second": 12.047, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 3.3181818181818183, |
|
"grad_norm": 0.59765625, |
|
"learning_rate": 0.000399058795421495, |
|
"loss": 0.3774, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 3.3636363636363638, |
|
"grad_norm": 0.52734375, |
|
"learning_rate": 0.00038103240247869074, |
|
"loss": 0.3433, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 3.409090909090909, |
|
"grad_norm": 0.609375, |
|
"learning_rate": 0.0003631685049639586, |
|
"loss": 0.3872, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 3.4545454545454546, |
|
"grad_norm": 0.59375, |
|
"learning_rate": 0.00034549150281252633, |
|
"loss": 0.3675, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 3.5, |
|
"grad_norm": 0.55859375, |
|
"learning_rate": 0.0003280255406830359, |
|
"loss": 0.3581, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 3.5454545454545454, |
|
"grad_norm": 0.48828125, |
|
"learning_rate": 0.00031079447497884486, |
|
"loss": 0.3062, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 3.5454545454545454, |
|
"eval_loss": 0.3086094558238983, |
|
"eval_runtime": 1.4971, |
|
"eval_samples_per_second": 94.182, |
|
"eval_steps_per_second": 12.023, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 3.590909090909091, |
|
"grad_norm": 0.546875, |
|
"learning_rate": 0.00029382184126304836, |
|
"loss": 0.3324, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 3.6363636363636362, |
|
"grad_norm": 0.55078125, |
|
"learning_rate": 0.0002771308221117309, |
|
"loss": 0.338, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 3.6818181818181817, |
|
"grad_norm": 0.51171875, |
|
"learning_rate": 0.0002607442154493568, |
|
"loss": 0.3319, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 3.7272727272727275, |
|
"grad_norm": 0.63671875, |
|
"learning_rate": 0.0002446844034095466, |
|
"loss": 0.3577, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 3.7727272727272725, |
|
"grad_norm": 0.55078125, |
|
"learning_rate": 0.00022897332176377528, |
|
"loss": 0.3463, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 3.8181818181818183, |
|
"grad_norm": 0.52734375, |
|
"learning_rate": 0.00021363242995974742, |
|
"loss": 0.3065, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 3.8181818181818183, |
|
"eval_loss": 0.2896404266357422, |
|
"eval_runtime": 1.4869, |
|
"eval_samples_per_second": 94.829, |
|
"eval_steps_per_second": 12.106, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 3.8636363636363638, |
|
"grad_norm": 0.58203125, |
|
"learning_rate": 0.00019868268181037185, |
|
"loss": 0.339, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 3.909090909090909, |
|
"grad_norm": 0.5, |
|
"learning_rate": 0.00018414449687337466, |
|
"loss": 0.3104, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 3.9545454545454546, |
|
"grad_norm": 0.52734375, |
|
"learning_rate": 0.0001700377325606388, |
|
"loss": 0.3248, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"grad_norm": 0.57421875, |
|
"learning_rate": 0.00015638165701536866, |
|
"loss": 0.3155, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 4.045454545454546, |
|
"grad_norm": 0.455078125, |
|
"learning_rate": 0.00014319492279412388, |
|
"loss": 0.2769, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 4.090909090909091, |
|
"grad_norm": 0.48828125, |
|
"learning_rate": 0.0001304955413896705, |
|
"loss": 0.2873, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 4.090909090909091, |
|
"eval_loss": 0.274143785238266, |
|
"eval_runtime": 1.5006, |
|
"eval_samples_per_second": 93.962, |
|
"eval_steps_per_second": 11.995, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 4.136363636363637, |
|
"grad_norm": 0.478515625, |
|
"learning_rate": 0.00011830085862944851, |
|
"loss": 0.2952, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 4.181818181818182, |
|
"grad_norm": 0.46484375, |
|
"learning_rate": 0.00010662753098325839, |
|
"loss": 0.2559, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 4.2272727272727275, |
|
"grad_norm": 0.451171875, |
|
"learning_rate": 9.549150281252633e-05, |
|
"loss": 0.2737, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 4.2727272727272725, |
|
"grad_norm": 0.5234375, |
|
"learning_rate": 8.490798459222476e-05, |
|
"loss": 0.2822, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 4.318181818181818, |
|
"grad_norm": 0.55078125, |
|
"learning_rate": 7.489143213519301e-05, |
|
"loss": 0.3014, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 4.363636363636363, |
|
"grad_norm": 0.51171875, |
|
"learning_rate": 6.545552684723583e-05, |
|
"loss": 0.2827, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 4.363636363636363, |
|
"eval_loss": 0.26471802592277527, |
|
"eval_runtime": 1.4885, |
|
"eval_samples_per_second": 94.724, |
|
"eval_steps_per_second": 12.092, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 4.409090909090909, |
|
"grad_norm": 0.46875, |
|
"learning_rate": 5.6613157039969057e-05, |
|
"loss": 0.2638, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 4.454545454545454, |
|
"grad_norm": 0.451171875, |
|
"learning_rate": 4.8376400326935575e-05, |
|
"loss": 0.2592, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 4.5, |
|
"grad_norm": 0.50390625, |
|
"learning_rate": 4.075650712703849e-05, |
|
"loss": 0.298, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 4.545454545454545, |
|
"grad_norm": 0.50390625, |
|
"learning_rate": 3.376388529782215e-05, |
|
"loss": 0.2632, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 4.590909090909091, |
|
"grad_norm": 0.453125, |
|
"learning_rate": 2.7408085919590266e-05, |
|
"loss": 0.2404, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 4.636363636363637, |
|
"grad_norm": 0.4765625, |
|
"learning_rate": 2.1697790249779635e-05, |
|
"loss": 0.265, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 4.636363636363637, |
|
"eval_loss": 0.26171576976776123, |
|
"eval_runtime": 1.4803, |
|
"eval_samples_per_second": 95.248, |
|
"eval_steps_per_second": 12.159, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 4.681818181818182, |
|
"grad_norm": 0.53125, |
|
"learning_rate": 1.6640797865406288e-05, |
|
"loss": 0.3012, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 4.7272727272727275, |
|
"grad_norm": 0.51171875, |
|
"learning_rate": 1.22440160097817e-05, |
|
"loss": 0.3019, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 4.7727272727272725, |
|
"grad_norm": 0.47265625, |
|
"learning_rate": 8.513450158049108e-06, |
|
"loss": 0.2667, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 4.818181818181818, |
|
"grad_norm": 0.5078125, |
|
"learning_rate": 5.454195814427021e-06, |
|
"loss": 0.2781, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 4.863636363636363, |
|
"grad_norm": 0.5234375, |
|
"learning_rate": 3.0704315523631954e-06, |
|
"loss": 0.281, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 4.909090909090909, |
|
"grad_norm": 0.52734375, |
|
"learning_rate": 1.3654133071059894e-06, |
|
"loss": 0.2935, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 4.909090909090909, |
|
"eval_loss": 0.26116234064102173, |
|
"eval_runtime": 1.4806, |
|
"eval_samples_per_second": 95.232, |
|
"eval_steps_per_second": 12.157, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 4.954545454545455, |
|
"grad_norm": 0.5390625, |
|
"learning_rate": 3.4146992848854695e-07, |
|
"loss": 0.2806, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"grad_norm": 0.5703125, |
|
"learning_rate": 0.0, |
|
"loss": 0.2668, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"step": 110, |
|
"total_flos": 5704372783549440.0, |
|
"train_loss": 1.2665127342397517, |
|
"train_runtime": 277.3258, |
|
"train_samples_per_second": 25.349, |
|
"train_steps_per_second": 0.397 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 110, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 5, |
|
"save_steps": 6, |
|
"stateful_callbacks": { |
|
"EarlyStoppingCallback": { |
|
"args": { |
|
"early_stopping_patience": 5, |
|
"early_stopping_threshold": 0.0 |
|
}, |
|
"attributes": { |
|
"early_stopping_patience_counter": 0 |
|
} |
|
}, |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 5704372783549440.0, |
|
"train_batch_size": 64, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|