{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.9988193624557261,
  "eval_steps": 500,
  "global_step": 423,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0023612750885478157,
      "grad_norm": 1.5131582373299473,
      "learning_rate": 2.3255813953488374e-07,
      "loss": 1.3255,
      "step": 1
    },
    {
      "epoch": 0.011806375442739079,
      "grad_norm": 1.4970470206944162,
      "learning_rate": 1.1627906976744188e-06,
      "loss": 1.3229,
      "step": 5
    },
    {
      "epoch": 0.023612750885478158,
      "grad_norm": 1.4534560068903173,
      "learning_rate": 2.3255813953488376e-06,
      "loss": 1.3249,
      "step": 10
    },
    {
      "epoch": 0.03541912632821724,
      "grad_norm": 1.3529390744329275,
      "learning_rate": 3.4883720930232564e-06,
      "loss": 1.3105,
      "step": 15
    },
    {
      "epoch": 0.047225501770956316,
      "grad_norm": 1.1377914835607748,
      "learning_rate": 4.651162790697675e-06,
      "loss": 1.2795,
      "step": 20
    },
    {
      "epoch": 0.0590318772136954,
      "grad_norm": 0.9300447218970413,
      "learning_rate": 5.8139534883720935e-06,
      "loss": 1.2542,
      "step": 25
    },
    {
      "epoch": 0.07083825265643448,
      "grad_norm": 0.7033484136291787,
      "learning_rate": 6.976744186046513e-06,
      "loss": 1.2254,
      "step": 30
    },
    {
      "epoch": 0.08264462809917356,
      "grad_norm": 0.5519511550883764,
      "learning_rate": 8.139534883720931e-06,
      "loss": 1.1956,
      "step": 35
    },
    {
      "epoch": 0.09445100354191263,
      "grad_norm": 0.530461549075621,
      "learning_rate": 9.30232558139535e-06,
      "loss": 1.1609,
      "step": 40
    },
    {
      "epoch": 0.10625737898465171,
      "grad_norm": 0.4882723618310303,
      "learning_rate": 9.999316524962347e-06,
      "loss": 1.1353,
      "step": 45
    },
    {
      "epoch": 0.1180637544273908,
      "grad_norm": 0.43070884824119315,
      "learning_rate": 9.991629576543164e-06,
      "loss": 1.1108,
      "step": 50
    },
    {
      "epoch": 0.12987012987012986,
      "grad_norm": 0.4326223460490506,
      "learning_rate": 9.975414512725058e-06,
      "loss": 1.0912,
      "step": 55
    },
    {
      "epoch": 0.14167650531286896,
      "grad_norm": 0.3990410735697587,
      "learning_rate": 9.95069903667256e-06,
      "loss": 1.0684,
      "step": 60
    },
    {
      "epoch": 0.15348288075560804,
      "grad_norm": 0.4102324061429566,
      "learning_rate": 9.917525374361913e-06,
      "loss": 1.0513,
      "step": 65
    },
    {
      "epoch": 0.1652892561983471,
      "grad_norm": 0.4553698218636962,
      "learning_rate": 9.8759502024387e-06,
      "loss": 1.0205,
      "step": 70
    },
    {
      "epoch": 0.1770956316410862,
      "grad_norm": 0.4487232032461124,
      "learning_rate": 9.826044551386743e-06,
      "loss": 0.9933,
      "step": 75
    },
    {
      "epoch": 0.18890200708382526,
      "grad_norm": 0.37710863024094676,
      "learning_rate": 9.767893684173722e-06,
      "loss": 0.9569,
      "step": 80
    },
    {
      "epoch": 0.20070838252656434,
      "grad_norm": 0.29780121644785873,
      "learning_rate": 9.701596950580807e-06,
      "loss": 0.944,
      "step": 85
    },
    {
      "epoch": 0.21251475796930341,
      "grad_norm": 0.2779699178964329,
      "learning_rate": 9.627267617465243e-06,
      "loss": 0.9348,
      "step": 90
    },
    {
      "epoch": 0.2243211334120425,
      "grad_norm": 0.25856375372275114,
      "learning_rate": 9.545032675245814e-06,
      "loss": 0.931,
      "step": 95
    },
    {
      "epoch": 0.2361275088547816,
      "grad_norm": 0.2613892946414014,
      "learning_rate": 9.45503262094184e-06,
      "loss": 0.9177,
      "step": 100
    },
    {
      "epoch": 0.24793388429752067,
      "grad_norm": 0.2354999119122035,
      "learning_rate": 9.357421218136387e-06,
      "loss": 0.9148,
      "step": 105
    },
    {
      "epoch": 0.2597402597402597,
      "grad_norm": 0.2162870999711504,
      "learning_rate": 9.252365234273754e-06,
      "loss": 0.9164,
      "step": 110
    },
    {
      "epoch": 0.2715466351829988,
      "grad_norm": 0.22180996969883549,
      "learning_rate": 9.140044155740102e-06,
      "loss": 0.9011,
      "step": 115
    },
    {
      "epoch": 0.2833530106257379,
      "grad_norm": 0.22437103522632068,
      "learning_rate": 9.02064988121396e-06,
      "loss": 0.9006,
      "step": 120
    },
    {
      "epoch": 0.29515938606847697,
      "grad_norm": 0.21463636431913852,
      "learning_rate": 8.894386393810563e-06,
      "loss": 0.8936,
      "step": 125
    },
    {
      "epoch": 0.3069657615112161,
      "grad_norm": 0.21659671484135407,
      "learning_rate": 8.761469412580126e-06,
      "loss": 0.892,
      "step": 130
    },
    {
      "epoch": 0.3187721369539551,
      "grad_norm": 0.22355054167698388,
      "learning_rate": 8.622126023955446e-06,
      "loss": 0.892,
      "step": 135
    },
    {
      "epoch": 0.3305785123966942,
      "grad_norm": 0.21653458201632308,
      "learning_rate": 8.476594293778561e-06,
      "loss": 0.8925,
      "step": 140
    },
    {
      "epoch": 0.34238488783943327,
      "grad_norm": 0.222501084583819,
      "learning_rate": 8.325122860569241e-06,
      "loss": 0.8864,
      "step": 145
    },
    {
      "epoch": 0.3541912632821724,
      "grad_norm": 0.21898064700237985,
      "learning_rate": 8.167970510730254e-06,
      "loss": 0.8849,
      "step": 150
    },
    {
      "epoch": 0.3659976387249115,
      "grad_norm": 0.22115057529251225,
      "learning_rate": 8.005405736415127e-06,
      "loss": 0.8815,
      "step": 155
    },
    {
      "epoch": 0.3778040141676505,
      "grad_norm": 0.21778385690198585,
      "learning_rate": 7.837706276813819e-06,
      "loss": 0.8806,
      "step": 160
    },
    {
      "epoch": 0.38961038961038963,
      "grad_norm": 0.2293476838871522,
      "learning_rate": 7.66515864363997e-06,
      "loss": 0.8781,
      "step": 165
    },
    {
      "epoch": 0.4014167650531287,
      "grad_norm": 0.23331956197796366,
      "learning_rate": 7.488057631630438e-06,
      "loss": 0.8742,
      "step": 170
    },
    {
      "epoch": 0.4132231404958678,
      "grad_norm": 0.21721254042866397,
      "learning_rate": 7.30670581489344e-06,
      "loss": 0.8805,
      "step": 175
    },
    {
      "epoch": 0.42502951593860683,
      "grad_norm": 0.23010261349577163,
      "learning_rate": 7.121413029965769e-06,
      "loss": 0.8742,
      "step": 180
    },
    {
      "epoch": 0.43683589138134593,
      "grad_norm": 0.2206787567833532,
      "learning_rate": 6.932495846462262e-06,
      "loss": 0.876,
      "step": 185
    },
    {
      "epoch": 0.448642266824085,
      "grad_norm": 0.2398992468500764,
      "learning_rate": 6.7402770262219234e-06,
      "loss": 0.8703,
      "step": 190
    },
    {
      "epoch": 0.4604486422668241,
      "grad_norm": 0.2448460311968614,
      "learning_rate": 6.545084971874738e-06,
      "loss": 0.873,
      "step": 195
    },
    {
      "epoch": 0.4722550177095632,
      "grad_norm": 0.2612008536174653,
      "learning_rate": 6.34725316577129e-06,
      "loss": 0.8746,
      "step": 200
    },
    {
      "epoch": 0.48406139315230223,
      "grad_norm": 0.24693316413096483,
      "learning_rate": 6.147119600233758e-06,
      "loss": 0.8736,
      "step": 205
    },
    {
      "epoch": 0.49586776859504134,
      "grad_norm": 0.24534026805051296,
      "learning_rate": 5.945026200101702e-06,
      "loss": 0.8652,
      "step": 210
    },
    {
      "epoch": 0.5076741440377804,
      "grad_norm": 0.23005356270231522,
      "learning_rate": 5.74131823855921e-06,
      "loss": 0.8624,
      "step": 215
    },
    {
      "epoch": 0.5194805194805194,
      "grad_norm": 0.24466967197626655,
      "learning_rate": 5.53634374724146e-06,
      "loss": 0.8639,
      "step": 220
    },
    {
      "epoch": 0.5312868949232585,
      "grad_norm": 0.24160067474761546,
      "learning_rate": 5.3304529216284974e-06,
      "loss": 0.8632,
      "step": 225
    },
    {
      "epoch": 0.5430932703659976,
      "grad_norm": 0.26220069503373417,
      "learning_rate": 5.123997522742151e-06,
      "loss": 0.8639,
      "step": 230
    },
    {
      "epoch": 0.5548996458087367,
      "grad_norm": 0.264066001522196,
      "learning_rate": 4.917330276168208e-06,
      "loss": 0.8623,
      "step": 235
    },
    {
      "epoch": 0.5667060212514758,
      "grad_norm": 0.24428345928123874,
      "learning_rate": 4.710804269430681e-06,
      "loss": 0.8692,
      "step": 240
    },
    {
      "epoch": 0.5785123966942148,
      "grad_norm": 0.24362157236851636,
      "learning_rate": 4.504772348747687e-06,
      "loss": 0.8617,
      "step": 245
    },
    {
      "epoch": 0.5903187721369539,
      "grad_norm": 0.2605786039459921,
      "learning_rate": 4.299586516199611e-06,
      "loss": 0.8631,
      "step": 250
    },
    {
      "epoch": 0.602125147579693,
      "grad_norm": 0.24469411057849033,
      "learning_rate": 4.0955973283394525e-06,
      "loss": 0.8519,
      "step": 255
    },
    {
      "epoch": 0.6139315230224321,
      "grad_norm": 0.24265909990188317,
      "learning_rate": 3.893153297272829e-06,
      "loss": 0.858,
      "step": 260
    },
    {
      "epoch": 0.6257378984651711,
      "grad_norm": 0.2431981393362141,
      "learning_rate": 3.6926002952309015e-06,
      "loss": 0.8574,
      "step": 265
    },
    {
      "epoch": 0.6375442739079102,
      "grad_norm": 0.25137244252341084,
      "learning_rate": 3.4942809636534637e-06,
      "loss": 0.8584,
      "step": 270
    },
    {
      "epoch": 0.6493506493506493,
      "grad_norm": 0.2520911165703089,
      "learning_rate": 3.298534127791785e-06,
      "loss": 0.8578,
      "step": 275
    },
    {
      "epoch": 0.6611570247933884,
      "grad_norm": 0.2651721936087408,
      "learning_rate": 3.105694217831361e-06,
      "loss": 0.8589,
      "step": 280
    },
    {
      "epoch": 0.6729634002361276,
      "grad_norm": 0.2546800394311146,
      "learning_rate": 2.9160906975235493e-06,
      "loss": 0.8523,
      "step": 285
    },
    {
      "epoch": 0.6847697756788665,
      "grad_norm": 0.25316819030742643,
      "learning_rate": 2.7300475013022666e-06,
      "loss": 0.8552,
      "step": 290
    },
    {
      "epoch": 0.6965761511216056,
      "grad_norm": 0.2509477836308168,
      "learning_rate": 2.5478824808474613e-06,
      "loss": 0.8574,
      "step": 295
    },
    {
      "epoch": 0.7083825265643447,
      "grad_norm": 0.22985580266469613,
      "learning_rate": 2.3699068620408305e-06,
      "loss": 0.8525,
      "step": 300
    },
    {
      "epoch": 0.7201889020070839,
      "grad_norm": 0.24633435787531885,
      "learning_rate": 2.1964247132416373e-06,
      "loss": 0.8541,
      "step": 305
    },
    {
      "epoch": 0.731995277449823,
      "grad_norm": 0.2550223994617769,
      "learning_rate": 2.027732425791011e-06,
      "loss": 0.8575,
      "step": 310
    },
    {
      "epoch": 0.743801652892562,
      "grad_norm": 0.250598594634277,
      "learning_rate": 1.864118207632315e-06,
      "loss": 0.8584,
      "step": 315
    },
    {
      "epoch": 0.755608028335301,
      "grad_norm": 0.2547611444118788,
      "learning_rate": 1.7058615909127102e-06,
      "loss": 0.8578,
      "step": 320
    },
    {
      "epoch": 0.7674144037780402,
      "grad_norm": 0.2551344265566289,
      "learning_rate": 1.5532329544071712e-06,
      "loss": 0.8568,
      "step": 325
    },
    {
      "epoch": 0.7792207792207793,
      "grad_norm": 0.24284035631227585,
      "learning_rate": 1.406493061580881e-06,
      "loss": 0.854,
      "step": 330
    },
    {
      "epoch": 0.7910271546635183,
      "grad_norm": 0.2454427828284572,
      "learning_rate": 1.2658926150792321e-06,
      "loss": 0.8543,
      "step": 335
    },
    {
      "epoch": 0.8028335301062574,
      "grad_norm": 0.24171894401645216,
      "learning_rate": 1.1316718284065536e-06,
      "loss": 0.8601,
      "step": 340
    },
    {
      "epoch": 0.8146399055489965,
      "grad_norm": 0.24890059165826273,
      "learning_rate": 1.0040600155253766e-06,
      "loss": 0.8588,
      "step": 345
    },
    {
      "epoch": 0.8264462809917356,
      "grad_norm": 0.24128541650834764,
      "learning_rate": 8.832751990773714e-07,
      "loss": 0.8547,
      "step": 350
    },
    {
      "epoch": 0.8382526564344747,
      "grad_norm": 0.25646382446616656,
      "learning_rate": 7.695237378953224e-07,
      "loss": 0.859,
      "step": 355
    },
    {
      "epoch": 0.8500590318772137,
      "grad_norm": 0.2460481416324142,
      "learning_rate": 6.629999744425236e-07,
      "loss": 0.8567,
      "step": 360
    },
    {
      "epoch": 0.8618654073199528,
      "grad_norm": 0.24513983983677975,
      "learning_rate": 5.63885902781941e-07,
      "loss": 0.8596,
      "step": 365
    },
    {
      "epoch": 0.8736717827626919,
      "grad_norm": 0.2343335791217957,
      "learning_rate": 4.723508576424063e-07,
      "loss": 0.8552,
      "step": 370
    },
    {
      "epoch": 0.885478158205431,
      "grad_norm": 0.2519214484705661,
      "learning_rate": 3.885512251130763e-07,
      "loss": 0.8527,
      "step": 375
    },
    {
      "epoch": 0.89728453364817,
      "grad_norm": 0.2440125511079392,
      "learning_rate": 3.126301754604233e-07,
      "loss": 0.8548,
      "step": 380
    },
    {
      "epoch": 0.9090909090909091,
      "grad_norm": 0.2532464399771227,
      "learning_rate": 2.447174185242324e-07,
      "loss": 0.854,
      "step": 385
    },
    {
      "epoch": 0.9208972845336482,
      "grad_norm": 0.23622376914066623,
      "learning_rate": 1.849289821105199e-07,
      "loss": 0.8521,
      "step": 390
    },
    {
      "epoch": 0.9327036599763873,
      "grad_norm": 0.2444798272238538,
      "learning_rate": 1.333670137599713e-07,
      "loss": 0.8532,
      "step": 395
    },
    {
      "epoch": 0.9445100354191264,
      "grad_norm": 0.23429527809735437,
      "learning_rate": 9.011960623058202e-08,
      "loss": 0.8532,
      "step": 400
    },
    {
      "epoch": 0.9563164108618654,
      "grad_norm": 0.23623057232372727,
      "learning_rate": 5.526064699265754e-08,
      "loss": 0.8506,
      "step": 405
    },
    {
      "epoch": 0.9681227863046045,
      "grad_norm": 0.2482774375940229,
      "learning_rate": 2.884969199331178e-08,
      "loss": 0.8587,
      "step": 410
    },
    {
      "epoch": 0.9799291617473436,
      "grad_norm": 0.23526019800815218,
      "learning_rate": 1.0931863906127327e-08,
      "loss": 0.8558,
      "step": 415
    },
    {
      "epoch": 0.9917355371900827,
      "grad_norm": 0.24074172289610016,
      "learning_rate": 1.53777503982655e-09,
      "loss": 0.853,
      "step": 420
    },
    {
      "epoch": 0.9988193624557261,
      "eval_loss": 0.8673050403594971,
      "eval_runtime": 1.109,
      "eval_samples_per_second": 1.804,
      "eval_steps_per_second": 0.902,
      "step": 423
    },
    {
      "epoch": 0.9988193624557261,
      "step": 423,
      "total_flos": 1701364321222656.0,
      "train_loss": 0.0,
      "train_runtime": 0.0113,
      "train_samples_per_second": 2398604.87,
      "train_steps_per_second": 37460.213
    }
  ],
  "logging_steps": 5,
  "max_steps": 423,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 100,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1701364321222656.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}