|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 2.9528795811518327, |
|
"eval_steps": 500, |
|
"global_step": 141, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.041884816753926704, |
|
"grad_norm": 0.21733999252319336, |
|
"learning_rate": 1.3333333333333333e-05, |
|
"loss": 1.0015, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.08376963350785341, |
|
"grad_norm": 0.21422453224658966, |
|
"learning_rate": 2.6666666666666667e-05, |
|
"loss": 0.9283, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.1256544502617801, |
|
"grad_norm": 0.18293505907058716, |
|
"learning_rate": 4e-05, |
|
"loss": 0.8643, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.16753926701570682, |
|
"grad_norm": 0.2287076711654663, |
|
"learning_rate": 5.333333333333333e-05, |
|
"loss": 1.0013, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.2094240837696335, |
|
"grad_norm": 0.2509159445762634, |
|
"learning_rate": 6.666666666666667e-05, |
|
"loss": 0.9341, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.2513089005235602, |
|
"grad_norm": 0.21400025486946106, |
|
"learning_rate": 8e-05, |
|
"loss": 0.9068, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.2931937172774869, |
|
"grad_norm": 0.20343424379825592, |
|
"learning_rate": 9.333333333333334e-05, |
|
"loss": 0.8083, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.33507853403141363, |
|
"grad_norm": 0.25129085779190063, |
|
"learning_rate": 9.998445910004082e-05, |
|
"loss": 0.8985, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.3769633507853403, |
|
"grad_norm": 0.28799620270729065, |
|
"learning_rate": 9.986018985905901e-05, |
|
"loss": 0.9362, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.418848167539267, |
|
"grad_norm": 0.29551103711128235, |
|
"learning_rate": 9.961196033000861e-05, |
|
"loss": 0.9174, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.4607329842931937, |
|
"grad_norm": 0.31457847356796265, |
|
"learning_rate": 9.924038765061042e-05, |
|
"loss": 0.8732, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.5026178010471204, |
|
"grad_norm": 0.2715758979320526, |
|
"learning_rate": 9.874639560909117e-05, |
|
"loss": 0.8653, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.5445026178010471, |
|
"grad_norm": 0.25824904441833496, |
|
"learning_rate": 9.81312123475006e-05, |
|
"loss": 0.8592, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.5863874345549738, |
|
"grad_norm": 0.22251802682876587, |
|
"learning_rate": 9.73963673083566e-05, |
|
"loss": 0.805, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.6282722513089005, |
|
"grad_norm": 0.22290602326393127, |
|
"learning_rate": 9.654368743221022e-05, |
|
"loss": 0.7464, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.6701570680628273, |
|
"grad_norm": 0.2132934331893921, |
|
"learning_rate": 9.557529261558367e-05, |
|
"loss": 0.7796, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.7120418848167539, |
|
"grad_norm": 0.19897978007793427, |
|
"learning_rate": 9.449359044057345e-05, |
|
"loss": 0.7768, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.7539267015706806, |
|
"grad_norm": 0.21434754133224487, |
|
"learning_rate": 9.330127018922194e-05, |
|
"loss": 0.8751, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.7958115183246073, |
|
"grad_norm": 0.2214685082435608, |
|
"learning_rate": 9.200129615753859e-05, |
|
"loss": 0.7966, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.837696335078534, |
|
"grad_norm": 0.20432740449905396, |
|
"learning_rate": 9.059690028579284e-05, |
|
"loss": 0.7963, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.8795811518324608, |
|
"grad_norm": 0.23610389232635498, |
|
"learning_rate": 8.90915741234015e-05, |
|
"loss": 0.8332, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.9214659685863874, |
|
"grad_norm": 0.23847244679927826, |
|
"learning_rate": 8.748906014838672e-05, |
|
"loss": 0.7869, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.9633507853403142, |
|
"grad_norm": 0.23536360263824463, |
|
"learning_rate": 8.579334246298593e-05, |
|
"loss": 0.8517, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 1.0052356020942408, |
|
"grad_norm": 0.24705368280410767, |
|
"learning_rate": 8.400863688854597e-05, |
|
"loss": 0.8694, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 1.0471204188481675, |
|
"grad_norm": 0.22067400813102722, |
|
"learning_rate": 8.213938048432697e-05, |
|
"loss": 0.8002, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 1.0890052356020943, |
|
"grad_norm": 0.21238695085048676, |
|
"learning_rate": 8.019022051627388e-05, |
|
"loss": 0.7798, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 1.130890052356021, |
|
"grad_norm": 0.2280968427658081, |
|
"learning_rate": 7.81660029031811e-05, |
|
"loss": 0.8114, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 1.1727748691099475, |
|
"grad_norm": 0.2327934205532074, |
|
"learning_rate": 7.60717601689749e-05, |
|
"loss": 0.8206, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 1.2146596858638743, |
|
"grad_norm": 0.2437446266412735, |
|
"learning_rate": 7.391269893106592e-05, |
|
"loss": 0.8477, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 1.256544502617801, |
|
"grad_norm": 0.2429206371307373, |
|
"learning_rate": 7.169418695587791e-05, |
|
"loss": 0.8213, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 1.2984293193717278, |
|
"grad_norm": 0.23616257309913635, |
|
"learning_rate": 6.942173981373474e-05, |
|
"loss": 0.8287, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 1.3403141361256545, |
|
"grad_norm": 0.2597900629043579, |
|
"learning_rate": 6.710100716628344e-05, |
|
"loss": 0.7643, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 1.3821989528795813, |
|
"grad_norm": 0.2668094038963318, |
|
"learning_rate": 6.473775872054521e-05, |
|
"loss": 0.7315, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 1.4240837696335078, |
|
"grad_norm": 0.2583785355091095, |
|
"learning_rate": 6.233786988451468e-05, |
|
"loss": 0.7811, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 1.4659685863874345, |
|
"grad_norm": 0.2756960391998291, |
|
"learning_rate": 5.9907307159969884e-05, |
|
"loss": 0.754, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 1.5078534031413613, |
|
"grad_norm": 0.28554120659828186, |
|
"learning_rate": 5.745211330880872e-05, |
|
"loss": 0.7809, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 1.5497382198952878, |
|
"grad_norm": 0.3146248459815979, |
|
"learning_rate": 5.497839232979084e-05, |
|
"loss": 0.8319, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 1.5916230366492146, |
|
"grad_norm": 0.2715282142162323, |
|
"learning_rate": 5.249229428303486e-05, |
|
"loss": 0.7745, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 1.6335078534031413, |
|
"grad_norm": 0.28730642795562744, |
|
"learning_rate": 5e-05, |
|
"loss": 0.8002, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 1.675392670157068, |
|
"grad_norm": 0.3208574652671814, |
|
"learning_rate": 4.750770571696514e-05, |
|
"loss": 0.7718, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 1.7172774869109948, |
|
"grad_norm": 0.307959645986557, |
|
"learning_rate": 4.502160767020918e-05, |
|
"loss": 0.742, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 1.7591623036649215, |
|
"grad_norm": 0.34215685725212097, |
|
"learning_rate": 4.254788669119127e-05, |
|
"loss": 0.7687, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 1.8010471204188483, |
|
"grad_norm": 0.28537240624427795, |
|
"learning_rate": 4.0092692840030134e-05, |
|
"loss": 0.7066, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 1.8429319371727748, |
|
"grad_norm": 0.31295880675315857, |
|
"learning_rate": 3.7662130115485314e-05, |
|
"loss": 0.7098, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 1.8848167539267016, |
|
"grad_norm": 0.2994559705257416, |
|
"learning_rate": 3.5262241279454785e-05, |
|
"loss": 0.6961, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 1.9267015706806283, |
|
"grad_norm": 0.3853859603404999, |
|
"learning_rate": 3.289899283371657e-05, |
|
"loss": 0.7948, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 1.9685863874345548, |
|
"grad_norm": 0.3592422306537628, |
|
"learning_rate": 3.0578260186265265e-05, |
|
"loss": 0.7264, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 2.0104712041884816, |
|
"grad_norm": 0.42988094687461853, |
|
"learning_rate": 2.8305813044122097e-05, |
|
"loss": 0.6696, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 2.0523560209424083, |
|
"grad_norm": 0.3221912086009979, |
|
"learning_rate": 2.6087301068934106e-05, |
|
"loss": 0.7135, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 2.094240837696335, |
|
"grad_norm": 0.3392656743526459, |
|
"learning_rate": 2.39282398310251e-05, |
|
"loss": 0.7865, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 2.136125654450262, |
|
"grad_norm": 0.32586315274238586, |
|
"learning_rate": 2.1833997096818898e-05, |
|
"loss": 0.7411, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 2.1780104712041886, |
|
"grad_norm": 0.36464083194732666, |
|
"learning_rate": 1.980977948372612e-05, |
|
"loss": 0.772, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 2.2198952879581153, |
|
"grad_norm": 0.3270440697669983, |
|
"learning_rate": 1.7860619515673033e-05, |
|
"loss": 0.7096, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 2.261780104712042, |
|
"grad_norm": 0.3391498923301697, |
|
"learning_rate": 1.599136311145402e-05, |
|
"loss": 0.7378, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 2.303664921465969, |
|
"grad_norm": 0.3948444128036499, |
|
"learning_rate": 1.4206657537014079e-05, |
|
"loss": 0.6471, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 2.345549738219895, |
|
"grad_norm": 0.38464757800102234, |
|
"learning_rate": 1.2510939851613285e-05, |
|
"loss": 0.7359, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 2.387434554973822, |
|
"grad_norm": 0.38830411434173584, |
|
"learning_rate": 1.090842587659851e-05, |
|
"loss": 0.7572, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 2.4293193717277486, |
|
"grad_norm": 0.37749606370925903, |
|
"learning_rate": 9.403099714207175e-06, |
|
"loss": 0.7411, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 2.4712041884816753, |
|
"grad_norm": 0.3769814968109131, |
|
"learning_rate": 7.998703842461431e-06, |
|
"loss": 0.6692, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 2.513089005235602, |
|
"grad_norm": 0.4309244453907013, |
|
"learning_rate": 6.698729810778065e-06, |
|
"loss": 0.7619, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 2.554973821989529, |
|
"grad_norm": 0.37709513306617737, |
|
"learning_rate": 5.506409559426573e-06, |
|
"loss": 0.708, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 2.5968586387434556, |
|
"grad_norm": 0.40647101402282715, |
|
"learning_rate": 4.424707384416344e-06, |
|
"loss": 0.7279, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 2.6387434554973823, |
|
"grad_norm": 0.3731394112110138, |
|
"learning_rate": 3.4563125677897932e-06, |
|
"loss": 0.7629, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 2.680628272251309, |
|
"grad_norm": 0.40869101881980896, |
|
"learning_rate": 2.603632691643415e-06, |
|
"loss": 0.7334, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 2.7225130890052354, |
|
"grad_norm": 0.33611902594566345, |
|
"learning_rate": 1.8687876524993987e-06, |
|
"loss": 0.6643, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 2.7643979057591626, |
|
"grad_norm": 0.38377344608306885, |
|
"learning_rate": 1.2536043909088191e-06, |
|
"loss": 0.7363, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 2.806282722513089, |
|
"grad_norm": 0.38260236382484436, |
|
"learning_rate": 7.596123493895991e-07, |
|
"loss": 0.7018, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 2.8481675392670156, |
|
"grad_norm": 0.3747893273830414, |
|
"learning_rate": 3.8803966999139684e-07, |
|
"loss": 0.8282, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 2.8900523560209423, |
|
"grad_norm": 0.40073326230049133, |
|
"learning_rate": 1.3981014094099353e-07, |
|
"loss": 0.6387, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 2.931937172774869, |
|
"grad_norm": 0.4021676480770111, |
|
"learning_rate": 1.5540899959187727e-08, |
|
"loss": 0.7076, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 2.9528795811518327, |
|
"step": 141, |
|
"total_flos": 5.744674604829082e+16, |
|
"train_loss": 0.7884446775659602, |
|
"train_runtime": 510.788, |
|
"train_samples_per_second": 8.957, |
|
"train_steps_per_second": 0.276 |
|
} |
|
], |
|
"logging_steps": 2, |
|
"max_steps": 141, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": false, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 5.744674604829082e+16, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|