{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.9987628865979383,
"eval_steps": 500,
"global_step": 909,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.032989690721649485,
"grad_norm": 3.190283465974621,
"learning_rate": 5e-06,
"loss": 1.0361,
"step": 10
},
{
"epoch": 0.06597938144329897,
"grad_norm": 1.2412136578455273,
"learning_rate": 5e-06,
"loss": 0.9152,
"step": 20
},
{
"epoch": 0.09896907216494845,
"grad_norm": 0.9937983329085553,
"learning_rate": 5e-06,
"loss": 0.8771,
"step": 30
},
{
"epoch": 0.13195876288659794,
"grad_norm": 0.9565722646796444,
"learning_rate": 5e-06,
"loss": 0.855,
"step": 40
},
{
"epoch": 0.16494845360824742,
"grad_norm": 0.8422015410274669,
"learning_rate": 5e-06,
"loss": 0.834,
"step": 50
},
{
"epoch": 0.1979381443298969,
"grad_norm": 0.9927761059685116,
"learning_rate": 5e-06,
"loss": 0.8249,
"step": 60
},
{
"epoch": 0.2309278350515464,
"grad_norm": 1.0124625468461579,
"learning_rate": 5e-06,
"loss": 0.8092,
"step": 70
},
{
"epoch": 0.2639175257731959,
"grad_norm": 0.9162707322543721,
"learning_rate": 5e-06,
"loss": 0.8002,
"step": 80
},
{
"epoch": 0.29690721649484536,
"grad_norm": 0.7438936667406614,
"learning_rate": 5e-06,
"loss": 0.7953,
"step": 90
},
{
"epoch": 0.32989690721649484,
"grad_norm": 0.74943987165883,
"learning_rate": 5e-06,
"loss": 0.7912,
"step": 100
},
{
"epoch": 0.3628865979381443,
"grad_norm": 0.6393536784375358,
"learning_rate": 5e-06,
"loss": 0.7835,
"step": 110
},
{
"epoch": 0.3958762886597938,
"grad_norm": 0.6732184370236527,
"learning_rate": 5e-06,
"loss": 0.7846,
"step": 120
},
{
"epoch": 0.4288659793814433,
"grad_norm": 0.675715599863121,
"learning_rate": 5e-06,
"loss": 0.7835,
"step": 130
},
{
"epoch": 0.4618556701030928,
"grad_norm": 0.638472159519646,
"learning_rate": 5e-06,
"loss": 0.7797,
"step": 140
},
{
"epoch": 0.4948453608247423,
"grad_norm": 1.1212007213917934,
"learning_rate": 5e-06,
"loss": 0.7759,
"step": 150
},
{
"epoch": 0.5278350515463918,
"grad_norm": 0.5929058650136099,
"learning_rate": 5e-06,
"loss": 0.7758,
"step": 160
},
{
"epoch": 0.5608247422680412,
"grad_norm": 0.7325664578284696,
"learning_rate": 5e-06,
"loss": 0.7746,
"step": 170
},
{
"epoch": 0.5938144329896907,
"grad_norm": 0.7146407893100764,
"learning_rate": 5e-06,
"loss": 0.772,
"step": 180
},
{
"epoch": 0.6268041237113402,
"grad_norm": 0.6297828498939105,
"learning_rate": 5e-06,
"loss": 0.7686,
"step": 190
},
{
"epoch": 0.6597938144329897,
"grad_norm": 0.650337063259678,
"learning_rate": 5e-06,
"loss": 0.7668,
"step": 200
},
{
"epoch": 0.6927835051546392,
"grad_norm": 0.577352278155154,
"learning_rate": 5e-06,
"loss": 0.7633,
"step": 210
},
{
"epoch": 0.7257731958762886,
"grad_norm": 0.6351053699389445,
"learning_rate": 5e-06,
"loss": 0.7606,
"step": 220
},
{
"epoch": 0.7587628865979381,
"grad_norm": 0.6179445706530043,
"learning_rate": 5e-06,
"loss": 0.7644,
"step": 230
},
{
"epoch": 0.7917525773195876,
"grad_norm": 0.7772047208925177,
"learning_rate": 5e-06,
"loss": 0.7585,
"step": 240
},
{
"epoch": 0.8247422680412371,
"grad_norm": 0.6393400921262609,
"learning_rate": 5e-06,
"loss": 0.7597,
"step": 250
},
{
"epoch": 0.8577319587628865,
"grad_norm": 0.5366628092052804,
"learning_rate": 5e-06,
"loss": 0.7559,
"step": 260
},
{
"epoch": 0.8907216494845361,
"grad_norm": 0.6897451596502111,
"learning_rate": 5e-06,
"loss": 0.757,
"step": 270
},
{
"epoch": 0.9237113402061856,
"grad_norm": 0.683076031456689,
"learning_rate": 5e-06,
"loss": 0.7595,
"step": 280
},
{
"epoch": 0.9567010309278351,
"grad_norm": 0.6342586759859082,
"learning_rate": 5e-06,
"loss": 0.7576,
"step": 290
},
{
"epoch": 0.9896907216494846,
"grad_norm": 0.6339977397184522,
"learning_rate": 5e-06,
"loss": 0.7548,
"step": 300
},
{
"epoch": 0.9995876288659794,
"eval_loss": 0.7519278526306152,
"eval_runtime": 322.6674,
"eval_samples_per_second": 25.314,
"eval_steps_per_second": 0.397,
"step": 303
},
{
"epoch": 1.022680412371134,
"grad_norm": 0.810990287818241,
"learning_rate": 5e-06,
"loss": 0.7934,
"step": 310
},
{
"epoch": 1.0556701030927835,
"grad_norm": 0.5989713675220099,
"learning_rate": 5e-06,
"loss": 0.7044,
"step": 320
},
{
"epoch": 1.088659793814433,
"grad_norm": 0.584782182855064,
"learning_rate": 5e-06,
"loss": 0.7115,
"step": 330
},
{
"epoch": 1.1216494845360825,
"grad_norm": 0.7858920415247334,
"learning_rate": 5e-06,
"loss": 0.7094,
"step": 340
},
{
"epoch": 1.1546391752577319,
"grad_norm": 0.9401995798606461,
"learning_rate": 5e-06,
"loss": 0.7079,
"step": 350
},
{
"epoch": 1.1876288659793814,
"grad_norm": 0.6150009311102699,
"learning_rate": 5e-06,
"loss": 0.7073,
"step": 360
},
{
"epoch": 1.220618556701031,
"grad_norm": 0.6009149100944755,
"learning_rate": 5e-06,
"loss": 0.7096,
"step": 370
},
{
"epoch": 1.2536082474226804,
"grad_norm": 0.6115518108906659,
"learning_rate": 5e-06,
"loss": 0.7066,
"step": 380
},
{
"epoch": 1.2865979381443298,
"grad_norm": 0.7496882281145417,
"learning_rate": 5e-06,
"loss": 0.7076,
"step": 390
},
{
"epoch": 1.3195876288659794,
"grad_norm": 0.6685224897984725,
"learning_rate": 5e-06,
"loss": 0.7062,
"step": 400
},
{
"epoch": 1.352577319587629,
"grad_norm": 0.641185927057492,
"learning_rate": 5e-06,
"loss": 0.7117,
"step": 410
},
{
"epoch": 1.3855670103092783,
"grad_norm": 0.5361388827305237,
"learning_rate": 5e-06,
"loss": 0.7094,
"step": 420
},
{
"epoch": 1.418556701030928,
"grad_norm": 1.002359631516242,
"learning_rate": 5e-06,
"loss": 0.7054,
"step": 430
},
{
"epoch": 1.4515463917525773,
"grad_norm": 0.8431450479727091,
"learning_rate": 5e-06,
"loss": 0.7075,
"step": 440
},
{
"epoch": 1.4845360824742269,
"grad_norm": 0.6447323729739957,
"learning_rate": 5e-06,
"loss": 0.7099,
"step": 450
},
{
"epoch": 1.5175257731958762,
"grad_norm": 0.8431314429320579,
"learning_rate": 5e-06,
"loss": 0.7018,
"step": 460
},
{
"epoch": 1.5505154639175258,
"grad_norm": 0.6273662519128372,
"learning_rate": 5e-06,
"loss": 0.7051,
"step": 470
},
{
"epoch": 1.5835051546391754,
"grad_norm": 0.8396735090007554,
"learning_rate": 5e-06,
"loss": 0.7106,
"step": 480
},
{
"epoch": 1.6164948453608248,
"grad_norm": 0.5802654475284174,
"learning_rate": 5e-06,
"loss": 0.7043,
"step": 490
},
{
"epoch": 1.6494845360824741,
"grad_norm": 0.6224806599884348,
"learning_rate": 5e-06,
"loss": 0.7086,
"step": 500
},
{
"epoch": 1.6824742268041237,
"grad_norm": 0.6154446076130442,
"learning_rate": 5e-06,
"loss": 0.7026,
"step": 510
},
{
"epoch": 1.7154639175257733,
"grad_norm": 0.5857753449684375,
"learning_rate": 5e-06,
"loss": 0.7037,
"step": 520
},
{
"epoch": 1.7484536082474227,
"grad_norm": 0.5716099691987403,
"learning_rate": 5e-06,
"loss": 0.7066,
"step": 530
},
{
"epoch": 1.781443298969072,
"grad_norm": 0.6774790897099987,
"learning_rate": 5e-06,
"loss": 0.707,
"step": 540
},
{
"epoch": 1.8144329896907216,
"grad_norm": 0.6117062221128381,
"learning_rate": 5e-06,
"loss": 0.6996,
"step": 550
},
{
"epoch": 1.8474226804123712,
"grad_norm": 0.5362825787566358,
"learning_rate": 5e-06,
"loss": 0.7036,
"step": 560
},
{
"epoch": 1.8804123711340206,
"grad_norm": 0.7851595485494056,
"learning_rate": 5e-06,
"loss": 0.7066,
"step": 570
},
{
"epoch": 1.91340206185567,
"grad_norm": 0.642752104749906,
"learning_rate": 5e-06,
"loss": 0.705,
"step": 580
},
{
"epoch": 1.9463917525773196,
"grad_norm": 0.6663997639727156,
"learning_rate": 5e-06,
"loss": 0.7051,
"step": 590
},
{
"epoch": 1.9793814432989691,
"grad_norm": 0.6435653630361237,
"learning_rate": 5e-06,
"loss": 0.7026,
"step": 600
},
{
"epoch": 1.9991752577319588,
"eval_loss": 0.7376570701599121,
"eval_runtime": 321.676,
"eval_samples_per_second": 25.392,
"eval_steps_per_second": 0.398,
"step": 606
},
{
"epoch": 2.0123711340206185,
"grad_norm": 0.7966992322635892,
"learning_rate": 5e-06,
"loss": 0.7418,
"step": 610
},
{
"epoch": 2.045360824742268,
"grad_norm": 0.5919842716689093,
"learning_rate": 5e-06,
"loss": 0.6581,
"step": 620
},
{
"epoch": 2.0783505154639177,
"grad_norm": 0.6225831303900108,
"learning_rate": 5e-06,
"loss": 0.6494,
"step": 630
},
{
"epoch": 2.111340206185567,
"grad_norm": 0.5720666970317613,
"learning_rate": 5e-06,
"loss": 0.6557,
"step": 640
},
{
"epoch": 2.1443298969072164,
"grad_norm": 0.625621284764116,
"learning_rate": 5e-06,
"loss": 0.6562,
"step": 650
},
{
"epoch": 2.177319587628866,
"grad_norm": 0.722621026378947,
"learning_rate": 5e-06,
"loss": 0.6592,
"step": 660
},
{
"epoch": 2.2103092783505156,
"grad_norm": 0.6611874958125228,
"learning_rate": 5e-06,
"loss": 0.6576,
"step": 670
},
{
"epoch": 2.243298969072165,
"grad_norm": 0.624720046082098,
"learning_rate": 5e-06,
"loss": 0.6534,
"step": 680
},
{
"epoch": 2.2762886597938143,
"grad_norm": 0.6227890769590231,
"learning_rate": 5e-06,
"loss": 0.6561,
"step": 690
},
{
"epoch": 2.3092783505154637,
"grad_norm": 0.6353543358518403,
"learning_rate": 5e-06,
"loss": 0.6564,
"step": 700
},
{
"epoch": 2.3422680412371135,
"grad_norm": 0.616682251013517,
"learning_rate": 5e-06,
"loss": 0.6558,
"step": 710
},
{
"epoch": 2.375257731958763,
"grad_norm": 0.5785627398529801,
"learning_rate": 5e-06,
"loss": 0.6579,
"step": 720
},
{
"epoch": 2.4082474226804123,
"grad_norm": 0.7087632640527876,
"learning_rate": 5e-06,
"loss": 0.6578,
"step": 730
},
{
"epoch": 2.441237113402062,
"grad_norm": 0.7221097669514308,
"learning_rate": 5e-06,
"loss": 0.6555,
"step": 740
},
{
"epoch": 2.4742268041237114,
"grad_norm": 0.6845092133296887,
"learning_rate": 5e-06,
"loss": 0.6589,
"step": 750
},
{
"epoch": 2.507216494845361,
"grad_norm": 0.6131735355128494,
"learning_rate": 5e-06,
"loss": 0.6597,
"step": 760
},
{
"epoch": 2.54020618556701,
"grad_norm": 0.74499117668607,
"learning_rate": 5e-06,
"loss": 0.6604,
"step": 770
},
{
"epoch": 2.5731958762886595,
"grad_norm": 0.6953072761863929,
"learning_rate": 5e-06,
"loss": 0.6599,
"step": 780
},
{
"epoch": 2.6061855670103093,
"grad_norm": 0.7683634702318719,
"learning_rate": 5e-06,
"loss": 0.6584,
"step": 790
},
{
"epoch": 2.6391752577319587,
"grad_norm": 0.9202931242949187,
"learning_rate": 5e-06,
"loss": 0.6599,
"step": 800
},
{
"epoch": 2.6721649484536085,
"grad_norm": 0.6785534766587453,
"learning_rate": 5e-06,
"loss": 0.6605,
"step": 810
},
{
"epoch": 2.705154639175258,
"grad_norm": 0.9373759072613878,
"learning_rate": 5e-06,
"loss": 0.6633,
"step": 820
},
{
"epoch": 2.7381443298969073,
"grad_norm": 0.5316447851690145,
"learning_rate": 5e-06,
"loss": 0.6582,
"step": 830
},
{
"epoch": 2.7711340206185566,
"grad_norm": 0.7810499110998566,
"learning_rate": 5e-06,
"loss": 0.6638,
"step": 840
},
{
"epoch": 2.804123711340206,
"grad_norm": 0.5581690358208933,
"learning_rate": 5e-06,
"loss": 0.6572,
"step": 850
},
{
"epoch": 2.837113402061856,
"grad_norm": 0.5757480690524878,
"learning_rate": 5e-06,
"loss": 0.6629,
"step": 860
},
{
"epoch": 2.870103092783505,
"grad_norm": 0.6570394054126519,
"learning_rate": 5e-06,
"loss": 0.6603,
"step": 870
},
{
"epoch": 2.9030927835051545,
"grad_norm": 0.5532161107989387,
"learning_rate": 5e-06,
"loss": 0.6596,
"step": 880
},
{
"epoch": 2.9360824742268044,
"grad_norm": 0.6779485831959426,
"learning_rate": 5e-06,
"loss": 0.6615,
"step": 890
},
{
"epoch": 2.9690721649484537,
"grad_norm": 0.6105580266011457,
"learning_rate": 5e-06,
"loss": 0.6573,
"step": 900
},
{
"epoch": 2.9987628865979383,
"eval_loss": 0.737443208694458,
"eval_runtime": 321.3627,
"eval_samples_per_second": 25.417,
"eval_steps_per_second": 0.398,
"step": 909
},
{
"epoch": 2.9987628865979383,
"step": 909,
"total_flos": 1522399476449280.0,
"train_loss": 0.7216839113644641,
"train_runtime": 53747.0627,
"train_samples_per_second": 8.662,
"train_steps_per_second": 0.017
}
],
"logging_steps": 10,
"max_steps": 909,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1522399476449280.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}