{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 27.888001784832113,
  "eval_steps": 50000,
  "global_step": 1750000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.8,
      "learning_rate": 4.9920355054102726e-05,
      "loss": 1.1412,
      "step": 50000
    },
    {
      "epoch": 0.8,
      "eval_loss": 1.011365294456482,
      "eval_runtime": 2006.5534,
      "eval_samples_per_second": 111.193,
      "eval_steps_per_second": 1.738,
      "step": 50000
    },
    {
      "epoch": 1.59,
      "learning_rate": 4.992035346050262e-05,
      "loss": 1.0123,
      "step": 100000
    },
    {
      "epoch": 1.59,
      "eval_loss": 0.9693423509597778,
      "eval_runtime": 1995.4615,
      "eval_samples_per_second": 111.811,
      "eval_steps_per_second": 1.747,
      "step": 100000
    },
    {
      "epoch": 2.39,
      "learning_rate": 4.992034708610221e-05,
      "loss": 0.9754,
      "step": 150000
    },
    {
      "epoch": 2.39,
      "eval_loss": 0.9472731351852417,
      "eval_runtime": 1997.2157,
      "eval_samples_per_second": 111.713,
      "eval_steps_per_second": 1.746,
      "step": 150000
    },
    {
      "epoch": 3.19,
      "learning_rate": 4.992034549250211e-05,
      "loss": 0.9539,
      "step": 200000
    },
    {
      "epoch": 3.19,
      "eval_loss": 0.9325647354125977,
      "eval_runtime": 2015.1208,
      "eval_samples_per_second": 110.72,
      "eval_steps_per_second": 1.73,
      "step": 200000
    },
    {
      "epoch": 3.98,
      "learning_rate": 4.992035027330242e-05,
      "loss": 0.9387,
      "step": 250000
    },
    {
      "epoch": 3.98,
      "eval_loss": 0.9212433099746704,
      "eval_runtime": 2011.4269,
      "eval_samples_per_second": 110.924,
      "eval_steps_per_second": 1.734,
      "step": 250000
    },
    {
      "epoch": 4.78,
      "learning_rate": 4.992035027330242e-05,
      "loss": 0.9243,
      "step": 300000
    },
    {
      "epoch": 4.78,
      "eval_loss": 0.9138051271438599,
      "eval_runtime": 2011.3973,
      "eval_samples_per_second": 110.925,
      "eval_steps_per_second": 1.734,
      "step": 300000
    },
    {
      "epoch": 5.58,
      "learning_rate": 4.992035186690252e-05,
      "loss": 0.9144,
      "step": 350000
    },
    {
      "epoch": 5.58,
      "eval_loss": 0.9093130826950073,
      "eval_runtime": 1998.4573,
      "eval_samples_per_second": 111.644,
      "eval_steps_per_second": 1.745,
      "step": 350000
    },
    {
      "epoch": 6.37,
      "learning_rate": 4.9920355054102726e-05,
      "loss": 0.906,
      "step": 400000
    },
    {
      "epoch": 6.37,
      "eval_loss": 0.9041373133659363,
      "eval_runtime": 1998.7351,
      "eval_samples_per_second": 111.628,
      "eval_steps_per_second": 1.745,
      "step": 400000
    },
    {
      "epoch": 7.17,
      "learning_rate": 4.9920355054102726e-05,
      "loss": 0.8994,
      "step": 450000
    },
    {
      "epoch": 7.17,
      "eval_loss": 0.9003444910049438,
      "eval_runtime": 1982.6092,
      "eval_samples_per_second": 112.536,
      "eval_steps_per_second": 1.759,
      "step": 450000
    },
    {
      "epoch": 7.97,
      "learning_rate": 4.992035186690252e-05,
      "loss": 0.8933,
      "step": 500000
    },
    {
      "epoch": 7.97,
      "eval_loss": 0.8956149220466614,
      "eval_runtime": 2002.7479,
      "eval_samples_per_second": 111.404,
      "eval_steps_per_second": 1.741,
      "step": 500000
    },
    {
      "epoch": 8.76,
      "learning_rate": 4.9920355054102726e-05,
      "loss": 0.8856,
      "step": 550000
    },
    {
      "epoch": 8.76,
      "eval_loss": 0.8930546045303345,
      "eval_runtime": 1996.5839,
      "eval_samples_per_second": 111.748,
      "eval_steps_per_second": 1.746,
      "step": 550000
    },
    {
      "epoch": 9.56,
      "learning_rate": 4.992035346050262e-05,
      "loss": 0.8802,
      "step": 600000
    },
    {
      "epoch": 9.56,
      "eval_loss": 0.89084392786026,
      "eval_runtime": 1991.4984,
      "eval_samples_per_second": 112.034,
      "eval_steps_per_second": 1.751,
      "step": 600000
    },
    {
      "epoch": 10.36,
      "learning_rate": 4.992035346050262e-05,
      "loss": 0.8763,
      "step": 650000
    },
    {
      "epoch": 10.36,
      "eval_loss": 0.8895950317382812,
      "eval_runtime": 2006.0064,
      "eval_samples_per_second": 111.223,
      "eval_steps_per_second": 1.738,
      "step": 650000
    },
    {
      "epoch": 11.16,
      "learning_rate": 4.992035186690252e-05,
      "loss": 0.8725,
      "step": 700000
    },
    {
      "epoch": 11.16,
      "eval_loss": 0.8886296153068542,
      "eval_runtime": 2027.9335,
      "eval_samples_per_second": 110.021,
      "eval_steps_per_second": 1.719,
      "step": 700000
    },
    {
      "epoch": 11.95,
      "learning_rate": 4.992035186690252e-05,
      "loss": 0.8688,
      "step": 750000
    },
    {
      "epoch": 11.95,
      "eval_loss": 0.885003924369812,
      "eval_runtime": 1989.8507,
      "eval_samples_per_second": 112.127,
      "eval_steps_per_second": 1.752,
      "step": 750000
    },
    {
      "epoch": 12.75,
      "learning_rate": 4.992035186690252e-05,
      "loss": 0.8628,
      "step": 800000
    },
    {
      "epoch": 12.75,
      "eval_loss": 0.8833887577056885,
      "eval_runtime": 2010.1701,
      "eval_samples_per_second": 110.993,
      "eval_steps_per_second": 1.735,
      "step": 800000
    },
    {
      "epoch": 13.55,
      "learning_rate": 4.9920348679702315e-05,
      "loss": 0.8599,
      "step": 850000
    },
    {
      "epoch": 13.55,
      "eval_loss": 0.883805513381958,
      "eval_runtime": 2005.9725,
      "eval_samples_per_second": 111.225,
      "eval_steps_per_second": 1.738,
      "step": 850000
    },
    {
      "epoch": 14.34,
      "learning_rate": 4.9920348679702315e-05,
      "loss": 0.8572,
      "step": 900000
    },
    {
      "epoch": 14.34,
      "eval_loss": 0.8837567567825317,
      "eval_runtime": 2005.535,
      "eval_samples_per_second": 111.25,
      "eval_steps_per_second": 1.739,
      "step": 900000
    },
    {
      "epoch": 15.14,
      "learning_rate": 4.9920348679702315e-05,
      "loss": 0.8548,
      "step": 950000
    },
    {
      "epoch": 15.14,
      "eval_loss": 0.8825677037239075,
      "eval_runtime": 1984.894,
      "eval_samples_per_second": 112.407,
      "eval_steps_per_second": 1.757,
      "step": 950000
    },
    {
      "epoch": 15.94,
      "learning_rate": 4.992034549250211e-05,
      "loss": 0.8502,
      "step": 1000000
    },
    {
      "epoch": 15.94,
      "eval_loss": 0.8808427453041077,
      "eval_runtime": 2006.4913,
      "eval_samples_per_second": 111.197,
      "eval_steps_per_second": 1.738,
      "step": 1000000
    },
    {
      "epoch": 16.73,
      "learning_rate": 4.992034708610221e-05,
      "loss": 0.8471,
      "step": 1050000
    },
    {
      "epoch": 16.73,
      "eval_loss": 0.8812766075134277,
      "eval_runtime": 1998.6292,
      "eval_samples_per_second": 111.634,
      "eval_steps_per_second": 1.745,
      "step": 1050000
    },
    {
      "epoch": 17.53,
      "learning_rate": 4.992034708610221e-05,
      "loss": 0.8427,
      "step": 1100000
    },
    {
      "epoch": 17.53,
      "eval_loss": 0.8817498683929443,
      "eval_runtime": 1994.6872,
      "eval_samples_per_second": 111.855,
      "eval_steps_per_second": 1.748,
      "step": 1100000
    },
    {
      "epoch": 18.33,
      "learning_rate": 4.992034549250211e-05,
      "loss": 0.841,
      "step": 1150000
    },
    {
      "epoch": 18.33,
      "eval_loss": 0.8802331686019897,
      "eval_runtime": 1993.2844,
      "eval_samples_per_second": 111.933,
      "eval_steps_per_second": 1.749,
      "step": 1150000
    },
    {
      "epoch": 19.12,
      "learning_rate": 4.992034549250211e-05,
      "loss": 0.8399,
      "step": 1200000
    },
    {
      "epoch": 19.12,
      "eval_loss": 0.8813353180885315,
      "eval_runtime": 2003.661,
      "eval_samples_per_second": 111.354,
      "eval_steps_per_second": 1.74,
      "step": 1200000
    },
    {
      "epoch": 19.92,
      "learning_rate": 4.9920348679702315e-05,
      "loss": 0.8382,
      "step": 1250000
    },
    {
      "epoch": 19.92,
      "eval_loss": 0.8779821991920471,
      "eval_runtime": 1999.9414,
      "eval_samples_per_second": 111.561,
      "eval_steps_per_second": 1.744,
      "step": 1250000
    },
    {
      "epoch": 20.72,
      "learning_rate": 4.992035186690252e-05,
      "loss": 0.8356,
      "step": 1300000
    },
    {
      "epoch": 20.72,
      "eval_loss": 0.878333330154419,
      "eval_runtime": 1991.7989,
      "eval_samples_per_second": 112.017,
      "eval_steps_per_second": 1.751,
      "step": 1300000
    },
    {
      "epoch": 21.51,
      "learning_rate": 4.992034708610221e-05,
      "loss": 0.8311,
      "step": 1350000
    },
    {
      "epoch": 21.51,
      "eval_loss": 0.8799993991851807,
      "eval_runtime": 2009.116,
      "eval_samples_per_second": 111.051,
      "eval_steps_per_second": 1.736,
      "step": 1350000
    },
    {
      "epoch": 22.31,
      "learning_rate": 4.992035027330242e-05,
      "loss": 0.8297,
      "step": 1400000
    },
    {
      "epoch": 22.31,
      "eval_loss": 0.8792157769203186,
      "eval_runtime": 1985.6931,
      "eval_samples_per_second": 112.361,
      "eval_steps_per_second": 1.756,
      "step": 1400000
    },
    {
      "epoch": 23.11,
      "learning_rate": 4.9920348679702315e-05,
      "loss": 0.8294,
      "step": 1450000
    },
    {
      "epoch": 23.11,
      "eval_loss": 0.8799900412559509,
      "eval_runtime": 2021.1167,
      "eval_samples_per_second": 110.392,
      "eval_steps_per_second": 1.725,
      "step": 1450000
    },
    {
      "epoch": 23.9,
      "learning_rate": 4.9920348679702315e-05,
      "loss": 0.8277,
      "step": 1500000
    },
    {
      "epoch": 23.9,
      "eval_loss": 0.8771235346794128,
      "eval_runtime": 1995.2079,
      "eval_samples_per_second": 111.825,
      "eval_steps_per_second": 1.748,
      "step": 1500000
    },
    {
      "epoch": 24.7,
      "learning_rate": 4.9920348679702315e-05,
      "loss": 0.8254,
      "step": 1550000
    },
    {
      "epoch": 24.7,
      "eval_loss": 0.8784825205802917,
      "eval_runtime": 1994.3392,
      "eval_samples_per_second": 111.874,
      "eval_steps_per_second": 1.748,
      "step": 1550000
    },
    {
      "epoch": 25.5,
      "learning_rate": 4.992034708610221e-05,
      "loss": 0.821,
      "step": 1600000
    },
    {
      "epoch": 25.5,
      "eval_loss": 0.8786540627479553,
      "eval_runtime": 1998.0285,
      "eval_samples_per_second": 111.668,
      "eval_steps_per_second": 1.745,
      "step": 1600000
    },
    {
      "epoch": 26.29,
      "learning_rate": 4.9920348679702315e-05,
      "loss": 0.8201,
      "step": 1650000
    },
    {
      "epoch": 26.29,
      "eval_loss": 0.8791316151618958,
      "eval_runtime": 2008.615,
      "eval_samples_per_second": 111.079,
      "eval_steps_per_second": 1.736,
      "step": 1650000
    },
    {
      "epoch": 27.09,
      "learning_rate": 4.9920348679702315e-05,
      "loss": 0.8204,
      "step": 1700000
    },
    {
      "epoch": 27.09,
      "eval_loss": 0.8804346323013306,
      "eval_runtime": 2001.1642,
      "eval_samples_per_second": 111.493,
      "eval_steps_per_second": 1.742,
      "step": 1700000
    },
    {
      "epoch": 27.89,
      "learning_rate": 4.992035186690252e-05,
      "loss": 0.8188,
      "step": 1750000
    },
    {
      "epoch": 27.89,
      "eval_loss": 0.875482976436615,
      "eval_runtime": 2011.2204,
      "eval_samples_per_second": 110.935,
      "eval_steps_per_second": 1.734,
      "step": 1750000
    }
  ],
  "logging_steps": 50000,
  "max_steps": 31375500,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 500,
  "save_steps": 50000,
  "total_flos": 3.65807962939392e+18,
  "train_batch_size": 32,
  "trial_name": null,
  "trial_params": null
}