|
{ |
|
"best_metric": 7.317912578582764, |
|
"best_model_checkpoint": "bert_tiny_lda_5_v1/checkpoint-30000", |
|
"epoch": 25.0, |
|
"eval_steps": 10000, |
|
"global_step": 35725, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.34989503149055284, |
|
"grad_norm": 2.1514995098114014, |
|
"learning_rate": 5e-06, |
|
"loss": 11.3174, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.6997900629811057, |
|
"grad_norm": 0.9805640578269958, |
|
"learning_rate": 1e-05, |
|
"loss": 9.6408, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 1.0496850944716585, |
|
"grad_norm": 0.583168625831604, |
|
"learning_rate": 1.5e-05, |
|
"loss": 8.6938, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 1.3995801259622114, |
|
"grad_norm": 0.581084132194519, |
|
"learning_rate": 2e-05, |
|
"loss": 8.503, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 1.749475157452764, |
|
"grad_norm": 0.6091183423995972, |
|
"learning_rate": 2.5e-05, |
|
"loss": 8.3913, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 2.099370188943317, |
|
"grad_norm": 0.9693360924720764, |
|
"learning_rate": 3e-05, |
|
"loss": 8.2919, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 2.44926522043387, |
|
"grad_norm": 0.822096049785614, |
|
"learning_rate": 3.5e-05, |
|
"loss": 8.2003, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 2.7991602519244227, |
|
"grad_norm": 0.9631165266036987, |
|
"learning_rate": 4e-05, |
|
"loss": 8.1238, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 3.1490552834149756, |
|
"grad_norm": 0.9502329230308533, |
|
"learning_rate": 4.5e-05, |
|
"loss": 8.0682, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 3.498950314905528, |
|
"grad_norm": 0.8137854933738708, |
|
"learning_rate": 5e-05, |
|
"loss": 8.0245, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 3.8488453463960814, |
|
"grad_norm": 0.6269208788871765, |
|
"learning_rate": 5.500000000000001e-05, |
|
"loss": 7.9804, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 4.198740377886634, |
|
"grad_norm": 0.7888797521591187, |
|
"learning_rate": 6e-05, |
|
"loss": 7.9445, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 4.548635409377187, |
|
"grad_norm": 0.6673307418823242, |
|
"learning_rate": 6.500000000000001e-05, |
|
"loss": 7.909, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 4.89853044086774, |
|
"grad_norm": 0.6339680552482605, |
|
"learning_rate": 7e-05, |
|
"loss": 7.8781, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 5.248425472358292, |
|
"grad_norm": 0.7484241724014282, |
|
"learning_rate": 7.500000000000001e-05, |
|
"loss": 7.8479, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 5.5983205038488455, |
|
"grad_norm": 0.705053448677063, |
|
"learning_rate": 8e-05, |
|
"loss": 7.8187, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 5.948215535339398, |
|
"grad_norm": 0.7108335494995117, |
|
"learning_rate": 8.5e-05, |
|
"loss": 7.798, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 6.298110566829951, |
|
"grad_norm": 0.9193771481513977, |
|
"learning_rate": 9e-05, |
|
"loss": 7.7733, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 6.648005598320504, |
|
"grad_norm": 0.7022958993911743, |
|
"learning_rate": 9.5e-05, |
|
"loss": 7.7537, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 6.997900629811056, |
|
"grad_norm": 0.5968444347381592, |
|
"learning_rate": 0.0001, |
|
"loss": 7.735, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 6.997900629811056, |
|
"eval_accuracy": 0.1533309145711814, |
|
"eval_loss": 7.668768882751465, |
|
"eval_runtime": 0.9954, |
|
"eval_samples_per_second": 481.211, |
|
"eval_steps_per_second": 3.014, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 7.3477956613016095, |
|
"grad_norm": 0.5823611617088318, |
|
"learning_rate": 9.805636540330418e-05, |
|
"loss": 7.7113, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 7.697690692792162, |
|
"grad_norm": 0.838610827922821, |
|
"learning_rate": 9.611273080660836e-05, |
|
"loss": 7.6992, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 8.047585724282715, |
|
"grad_norm": 0.6564937829971313, |
|
"learning_rate": 9.416909620991254e-05, |
|
"loss": 7.6777, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 8.397480755773268, |
|
"grad_norm": 0.8518803119659424, |
|
"learning_rate": 9.222546161321672e-05, |
|
"loss": 7.6481, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 8.74737578726382, |
|
"grad_norm": 0.8578371405601501, |
|
"learning_rate": 9.02818270165209e-05, |
|
"loss": 7.6016, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 9.097270818754374, |
|
"grad_norm": 0.8438256978988647, |
|
"learning_rate": 8.833819241982508e-05, |
|
"loss": 7.5425, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 9.447165850244927, |
|
"grad_norm": 1.0309187173843384, |
|
"learning_rate": 8.639455782312925e-05, |
|
"loss": 7.5037, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 9.79706088173548, |
|
"grad_norm": 0.640387773513794, |
|
"learning_rate": 8.445092322643343e-05, |
|
"loss": 7.4909, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 10.146955913226032, |
|
"grad_norm": 0.6895781755447388, |
|
"learning_rate": 8.250728862973761e-05, |
|
"loss": 7.4791, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 10.496850944716584, |
|
"grad_norm": 0.6688145995140076, |
|
"learning_rate": 8.056365403304179e-05, |
|
"loss": 7.4638, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 10.846745976207139, |
|
"grad_norm": 0.6284395456314087, |
|
"learning_rate": 7.862001943634597e-05, |
|
"loss": 7.4597, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 11.196641007697691, |
|
"grad_norm": 0.685321569442749, |
|
"learning_rate": 7.667638483965015e-05, |
|
"loss": 7.4469, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 11.546536039188243, |
|
"grad_norm": 0.6994627714157104, |
|
"learning_rate": 7.473275024295433e-05, |
|
"loss": 7.4372, |
|
"step": 16500 |
|
}, |
|
{ |
|
"epoch": 11.896431070678796, |
|
"grad_norm": 0.7122758030891418, |
|
"learning_rate": 7.27891156462585e-05, |
|
"loss": 7.4293, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 12.246326102169348, |
|
"grad_norm": 0.6678953170776367, |
|
"learning_rate": 7.08454810495627e-05, |
|
"loss": 7.4228, |
|
"step": 17500 |
|
}, |
|
{ |
|
"epoch": 12.596221133659903, |
|
"grad_norm": 0.6669552326202393, |
|
"learning_rate": 6.890184645286687e-05, |
|
"loss": 7.4132, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 12.946116165150455, |
|
"grad_norm": 0.8246549367904663, |
|
"learning_rate": 6.695821185617104e-05, |
|
"loss": 7.4135, |
|
"step": 18500 |
|
}, |
|
{ |
|
"epoch": 13.296011196641008, |
|
"grad_norm": 0.6137415170669556, |
|
"learning_rate": 6.501457725947522e-05, |
|
"loss": 7.3995, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 13.64590622813156, |
|
"grad_norm": 0.6450156569480896, |
|
"learning_rate": 6.30709426627794e-05, |
|
"loss": 7.3942, |
|
"step": 19500 |
|
}, |
|
{ |
|
"epoch": 13.995801259622114, |
|
"grad_norm": 0.712520956993103, |
|
"learning_rate": 6.112730806608357e-05, |
|
"loss": 7.3886, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 13.995801259622114, |
|
"eval_accuracy": 0.15518117168980697, |
|
"eval_loss": 7.352290630340576, |
|
"eval_runtime": 0.9817, |
|
"eval_samples_per_second": 487.916, |
|
"eval_steps_per_second": 3.056, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 14.345696291112667, |
|
"grad_norm": 0.7041005492210388, |
|
"learning_rate": 5.918367346938776e-05, |
|
"loss": 7.3793, |
|
"step": 20500 |
|
}, |
|
{ |
|
"epoch": 14.695591322603219, |
|
"grad_norm": 0.7062017917633057, |
|
"learning_rate": 5.724003887269194e-05, |
|
"loss": 7.3775, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 15.045486354093772, |
|
"grad_norm": 0.6913180351257324, |
|
"learning_rate": 5.529640427599612e-05, |
|
"loss": 7.3728, |
|
"step": 21500 |
|
}, |
|
{ |
|
"epoch": 15.395381385584324, |
|
"grad_norm": 0.6736568212509155, |
|
"learning_rate": 5.3352769679300295e-05, |
|
"loss": 7.3656, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 15.745276417074878, |
|
"grad_norm": 0.788135826587677, |
|
"learning_rate": 5.1409135082604474e-05, |
|
"loss": 7.3602, |
|
"step": 22500 |
|
}, |
|
{ |
|
"epoch": 16.09517144856543, |
|
"grad_norm": 0.6803423762321472, |
|
"learning_rate": 4.946550048590865e-05, |
|
"loss": 7.3586, |
|
"step": 23000 |
|
}, |
|
{ |
|
"epoch": 16.445066480055985, |
|
"grad_norm": 0.7166891694068909, |
|
"learning_rate": 4.752186588921283e-05, |
|
"loss": 7.351, |
|
"step": 23500 |
|
}, |
|
{ |
|
"epoch": 16.794961511546536, |
|
"grad_norm": 0.629996120929718, |
|
"learning_rate": 4.557823129251701e-05, |
|
"loss": 7.3483, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 17.14485654303709, |
|
"grad_norm": 0.7230992913246155, |
|
"learning_rate": 4.363459669582119e-05, |
|
"loss": 7.3422, |
|
"step": 24500 |
|
}, |
|
{ |
|
"epoch": 17.49475157452764, |
|
"grad_norm": 0.669651985168457, |
|
"learning_rate": 4.1690962099125366e-05, |
|
"loss": 7.3404, |
|
"step": 25000 |
|
}, |
|
{ |
|
"epoch": 17.844646606018195, |
|
"grad_norm": 0.6792581677436829, |
|
"learning_rate": 3.9747327502429545e-05, |
|
"loss": 7.3377, |
|
"step": 25500 |
|
}, |
|
{ |
|
"epoch": 18.19454163750875, |
|
"grad_norm": 0.6232681274414062, |
|
"learning_rate": 3.780369290573372e-05, |
|
"loss": 7.3318, |
|
"step": 26000 |
|
}, |
|
{ |
|
"epoch": 18.5444366689993, |
|
"grad_norm": 0.7704149484634399, |
|
"learning_rate": 3.58600583090379e-05, |
|
"loss": 7.3276, |
|
"step": 26500 |
|
}, |
|
{ |
|
"epoch": 18.894331700489854, |
|
"grad_norm": 0.6254093050956726, |
|
"learning_rate": 3.391642371234208e-05, |
|
"loss": 7.3284, |
|
"step": 27000 |
|
}, |
|
{ |
|
"epoch": 19.244226731980405, |
|
"grad_norm": 0.683228075504303, |
|
"learning_rate": 3.1972789115646265e-05, |
|
"loss": 7.3249, |
|
"step": 27500 |
|
}, |
|
{ |
|
"epoch": 19.59412176347096, |
|
"grad_norm": 0.7169561982154846, |
|
"learning_rate": 3.0029154518950437e-05, |
|
"loss": 7.3192, |
|
"step": 28000 |
|
}, |
|
{ |
|
"epoch": 19.944016794961513, |
|
"grad_norm": 0.7218673825263977, |
|
"learning_rate": 2.8085519922254615e-05, |
|
"loss": 7.3195, |
|
"step": 28500 |
|
}, |
|
{ |
|
"epoch": 20.293911826452064, |
|
"grad_norm": 0.629692554473877, |
|
"learning_rate": 2.6141885325558797e-05, |
|
"loss": 7.3133, |
|
"step": 29000 |
|
}, |
|
{ |
|
"epoch": 20.643806857942618, |
|
"grad_norm": 0.6370089054107666, |
|
"learning_rate": 2.4198250728862976e-05, |
|
"loss": 7.312, |
|
"step": 29500 |
|
}, |
|
{ |
|
"epoch": 20.99370188943317, |
|
"grad_norm": 0.686647891998291, |
|
"learning_rate": 2.225461613216715e-05, |
|
"loss": 7.31, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 20.99370188943317, |
|
"eval_accuracy": 0.15583133549569878, |
|
"eval_loss": 7.317912578582764, |
|
"eval_runtime": 0.9857, |
|
"eval_samples_per_second": 485.95, |
|
"eval_steps_per_second": 3.044, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 21.343596920923723, |
|
"grad_norm": 0.713712215423584, |
|
"learning_rate": 2.0310981535471333e-05, |
|
"loss": 7.3014, |
|
"step": 30500 |
|
}, |
|
{ |
|
"epoch": 21.693491952414277, |
|
"grad_norm": 0.6833124756813049, |
|
"learning_rate": 1.836734693877551e-05, |
|
"loss": 7.2863, |
|
"step": 31000 |
|
}, |
|
{ |
|
"epoch": 22.043386983904828, |
|
"grad_norm": 0.8253185153007507, |
|
"learning_rate": 1.642371234207969e-05, |
|
"loss": 7.264, |
|
"step": 31500 |
|
}, |
|
{ |
|
"epoch": 22.393282015395382, |
|
"grad_norm": 0.8029682636260986, |
|
"learning_rate": 1.4480077745383868e-05, |
|
"loss": 7.2417, |
|
"step": 32000 |
|
}, |
|
{ |
|
"epoch": 22.743177046885933, |
|
"grad_norm": 0.7757593393325806, |
|
"learning_rate": 1.2536443148688048e-05, |
|
"loss": 7.2277, |
|
"step": 32500 |
|
}, |
|
{ |
|
"epoch": 23.093072078376487, |
|
"grad_norm": 0.8712273240089417, |
|
"learning_rate": 1.0592808551992225e-05, |
|
"loss": 7.2148, |
|
"step": 33000 |
|
}, |
|
{ |
|
"epoch": 23.44296710986704, |
|
"grad_norm": 0.7989441752433777, |
|
"learning_rate": 8.649173955296405e-06, |
|
"loss": 7.2004, |
|
"step": 33500 |
|
}, |
|
{ |
|
"epoch": 23.792862141357592, |
|
"grad_norm": 0.870869517326355, |
|
"learning_rate": 6.705539358600584e-06, |
|
"loss": 7.1886, |
|
"step": 34000 |
|
}, |
|
{ |
|
"epoch": 24.142757172848146, |
|
"grad_norm": 0.7762761116027832, |
|
"learning_rate": 4.7619047619047615e-06, |
|
"loss": 7.1794, |
|
"step": 34500 |
|
}, |
|
{ |
|
"epoch": 24.492652204338697, |
|
"grad_norm": 0.7940042614936829, |
|
"learning_rate": 2.818270165208941e-06, |
|
"loss": 7.1725, |
|
"step": 35000 |
|
}, |
|
{ |
|
"epoch": 24.84254723582925, |
|
"grad_norm": 0.7887886166572571, |
|
"learning_rate": 8.746355685131196e-07, |
|
"loss": 7.1711, |
|
"step": 35500 |
|
}, |
|
{ |
|
"epoch": 25.0, |
|
"step": 35725, |
|
"total_flos": 3.049796905526016e+17, |
|
"train_loss": 7.630663348358774, |
|
"train_runtime": 20886.7559, |
|
"train_samples_per_second": 273.665, |
|
"train_steps_per_second": 1.71 |
|
} |
|
], |
|
"logging_steps": 500, |
|
"max_steps": 35725, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 25, |
|
"save_steps": 10000, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 3.049796905526016e+17, |
|
"train_batch_size": 160, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|