{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 24, "global_step": 94, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.010638297872340425, "grad_norm": 0.10508890450000763, "learning_rate": 3.0000000000000004e-08, "loss": 1.2333, "step": 1 }, { "epoch": 0.010638297872340425, "eval_loss": 1.5895557403564453, "eval_runtime": 2.9603, "eval_samples_per_second": 35.469, "eval_steps_per_second": 17.903, "step": 1 }, { "epoch": 0.02127659574468085, "grad_norm": 0.12071573734283447, "learning_rate": 6.000000000000001e-08, "loss": 1.3986, "step": 2 }, { "epoch": 0.031914893617021274, "grad_norm": 0.10401547700166702, "learning_rate": 9e-08, "loss": 1.2058, "step": 3 }, { "epoch": 0.0425531914893617, "grad_norm": 0.12298297882080078, "learning_rate": 1.2000000000000002e-07, "loss": 1.3298, "step": 4 }, { "epoch": 0.05319148936170213, "grad_norm": 0.1327030211687088, "learning_rate": 1.5000000000000002e-07, "loss": 1.4031, "step": 5 }, { "epoch": 0.06382978723404255, "grad_norm": 0.12036772817373276, "learning_rate": 1.8e-07, "loss": 1.5335, "step": 6 }, { "epoch": 0.07446808510638298, "grad_norm": 0.14979703724384308, "learning_rate": 2.1000000000000003e-07, "loss": 1.3224, "step": 7 }, { "epoch": 0.0851063829787234, "grad_norm": 0.13082227110862732, "learning_rate": 2.4000000000000003e-07, "loss": 1.4035, "step": 8 }, { "epoch": 0.09574468085106383, "grad_norm": 0.09265327453613281, "learning_rate": 2.7e-07, "loss": 1.1546, "step": 9 }, { "epoch": 0.10638297872340426, "grad_norm": 0.10080434381961823, "learning_rate": 3.0000000000000004e-07, "loss": 1.2652, "step": 10 }, { "epoch": 0.11702127659574468, "grad_norm": 0.10915568470954895, "learning_rate": 3.3e-07, "loss": 1.1438, "step": 11 }, { "epoch": 0.1276595744680851, "grad_norm": 0.1307348906993866, "learning_rate": 3.6e-07, "loss": 1.2821, "step": 12 }, { "epoch": 0.13829787234042554, "grad_norm": 0.09814529865980148, "learning_rate": 3.9e-07, "loss": 1.3905, "step": 13 }, { "epoch": 0.14893617021276595, "grad_norm": 0.11842218786478043, "learning_rate": 4.2000000000000006e-07, "loss": 1.3085, "step": 14 }, { "epoch": 0.1595744680851064, "grad_norm": 0.10207124054431915, "learning_rate": 4.5e-07, "loss": 1.19, "step": 15 }, { "epoch": 0.1702127659574468, "grad_norm": 0.1150127425789833, "learning_rate": 4.800000000000001e-07, "loss": 1.386, "step": 16 }, { "epoch": 0.18085106382978725, "grad_norm": 0.11641352623701096, "learning_rate": 5.100000000000001e-07, "loss": 1.4357, "step": 17 }, { "epoch": 0.19148936170212766, "grad_norm": 0.15035435557365417, "learning_rate": 5.4e-07, "loss": 1.2549, "step": 18 }, { "epoch": 0.20212765957446807, "grad_norm": 0.0984087809920311, "learning_rate": 5.7e-07, "loss": 0.9723, "step": 19 }, { "epoch": 0.2127659574468085, "grad_norm": 0.11582670360803604, "learning_rate": 6.000000000000001e-07, "loss": 1.1402, "step": 20 }, { "epoch": 0.22340425531914893, "grad_norm": 0.10151291638612747, "learning_rate": 6.3e-07, "loss": 1.1533, "step": 21 }, { "epoch": 0.23404255319148937, "grad_norm": 0.09040653705596924, "learning_rate": 6.6e-07, "loss": 1.0438, "step": 22 }, { "epoch": 0.24468085106382978, "grad_norm": 0.10666865110397339, "learning_rate": 6.900000000000001e-07, "loss": 1.3195, "step": 23 }, { "epoch": 0.2553191489361702, "grad_norm": 0.12293969094753265, "learning_rate": 7.2e-07, "loss": 1.7286, "step": 24 }, { "epoch": 0.2553191489361702, "eval_loss": 1.5890721082687378, "eval_runtime": 2.9908, "eval_samples_per_second": 35.108, "eval_steps_per_second": 17.721, "step": 24 }, { "epoch": 0.26595744680851063, "grad_norm": 0.11052387952804565, "learning_rate": 7.5e-07, "loss": 1.1413, "step": 25 }, { "epoch": 0.2765957446808511, "grad_norm": 0.09323304146528244, "learning_rate": 7.8e-07, "loss": 1.2906, "step": 26 }, { "epoch": 0.2872340425531915, "grad_norm": 0.12542971968650818, "learning_rate": 8.100000000000001e-07, "loss": 1.2809, "step": 27 }, { "epoch": 0.2978723404255319, "grad_norm": 0.1080215722322464, "learning_rate": 8.400000000000001e-07, "loss": 1.2735, "step": 28 }, { "epoch": 0.30851063829787234, "grad_norm": 0.11304887384176254, "learning_rate": 8.699999999999999e-07, "loss": 1.3767, "step": 29 }, { "epoch": 0.3191489361702128, "grad_norm": 0.15650290250778198, "learning_rate": 9e-07, "loss": 1.4713, "step": 30 }, { "epoch": 0.32978723404255317, "grad_norm": 0.12139321118593216, "learning_rate": 9.3e-07, "loss": 1.3479, "step": 31 }, { "epoch": 0.3404255319148936, "grad_norm": 0.10141867399215698, "learning_rate": 9.600000000000001e-07, "loss": 1.1512, "step": 32 }, { "epoch": 0.35106382978723405, "grad_norm": 0.15045498311519623, "learning_rate": 9.9e-07, "loss": 1.5059, "step": 33 }, { "epoch": 0.3617021276595745, "grad_norm": 0.12956956028938293, "learning_rate": 1.0200000000000002e-06, "loss": 1.2849, "step": 34 }, { "epoch": 0.3723404255319149, "grad_norm": 0.12963886559009552, "learning_rate": 1.05e-06, "loss": 1.5025, "step": 35 }, { "epoch": 0.3829787234042553, "grad_norm": 0.11268144845962524, "learning_rate": 1.08e-06, "loss": 1.2987, "step": 36 }, { "epoch": 0.39361702127659576, "grad_norm": 0.12941108644008636, "learning_rate": 1.11e-06, "loss": 1.3432, "step": 37 }, { "epoch": 0.40425531914893614, "grad_norm": 0.11319927126169205, "learning_rate": 1.14e-06, "loss": 1.1665, "step": 38 }, { "epoch": 0.4148936170212766, "grad_norm": 0.11748912930488586, "learning_rate": 1.17e-06, "loss": 1.423, "step": 39 }, { "epoch": 0.425531914893617, "grad_norm": 0.11666171997785568, "learning_rate": 1.2000000000000002e-06, "loss": 1.4391, "step": 40 }, { "epoch": 0.43617021276595747, "grad_norm": 0.12274409830570221, "learning_rate": 1.2299999999999999e-06, "loss": 1.4865, "step": 41 }, { "epoch": 0.44680851063829785, "grad_norm": 0.09922561049461365, "learning_rate": 1.26e-06, "loss": 1.2193, "step": 42 }, { "epoch": 0.4574468085106383, "grad_norm": 0.12003930658102036, "learning_rate": 1.29e-06, "loss": 1.3024, "step": 43 }, { "epoch": 0.46808510638297873, "grad_norm": 0.12094161659479141, "learning_rate": 1.32e-06, "loss": 1.4654, "step": 44 }, { "epoch": 0.4787234042553192, "grad_norm": 0.12934774160385132, "learning_rate": 1.35e-06, "loss": 1.3713, "step": 45 }, { "epoch": 0.48936170212765956, "grad_norm": 0.09754550457000732, "learning_rate": 1.3800000000000001e-06, "loss": 1.1991, "step": 46 }, { "epoch": 0.5, "grad_norm": 0.11549004167318344, "learning_rate": 1.41e-06, "loss": 1.4859, "step": 47 }, { "epoch": 0.5106382978723404, "grad_norm": 0.12035688012838364, "learning_rate": 1.44e-06, "loss": 1.2823, "step": 48 }, { "epoch": 0.5106382978723404, "eval_loss": 1.5874611139297485, "eval_runtime": 2.9787, "eval_samples_per_second": 35.251, "eval_steps_per_second": 17.793, "step": 48 }, { "epoch": 0.5212765957446809, "grad_norm": 0.12258938699960709, "learning_rate": 1.4700000000000001e-06, "loss": 1.2966, "step": 49 }, { "epoch": 0.5319148936170213, "grad_norm": 0.12217017263174057, "learning_rate": 1.5e-06, "loss": 1.2988, "step": 50 }, { "epoch": 0.5425531914893617, "grad_norm": 0.12793436646461487, "learning_rate": 1.53e-06, "loss": 1.4233, "step": 51 }, { "epoch": 0.5531914893617021, "grad_norm": 0.11145548522472382, "learning_rate": 1.56e-06, "loss": 1.2792, "step": 52 }, { "epoch": 0.5638297872340425, "grad_norm": 0.13195408880710602, "learning_rate": 1.59e-06, "loss": 1.4481, "step": 53 }, { "epoch": 0.574468085106383, "grad_norm": 0.10663347691297531, "learning_rate": 1.6200000000000002e-06, "loss": 1.2331, "step": 54 }, { "epoch": 0.5851063829787234, "grad_norm": 0.10975392907857895, "learning_rate": 1.65e-06, "loss": 1.1839, "step": 55 }, { "epoch": 0.5957446808510638, "grad_norm": 0.13139477372169495, "learning_rate": 1.6800000000000002e-06, "loss": 1.436, "step": 56 }, { "epoch": 0.6063829787234043, "grad_norm": 0.0924743041396141, "learning_rate": 1.71e-06, "loss": 1.1291, "step": 57 }, { "epoch": 0.6170212765957447, "grad_norm": 0.12475109100341797, "learning_rate": 1.7399999999999999e-06, "loss": 1.3444, "step": 58 }, { "epoch": 0.6276595744680851, "grad_norm": 0.08960220962762833, "learning_rate": 1.77e-06, "loss": 1.2511, "step": 59 }, { "epoch": 0.6382978723404256, "grad_norm": 0.09909304976463318, "learning_rate": 1.8e-06, "loss": 1.1281, "step": 60 }, { "epoch": 0.648936170212766, "grad_norm": 0.11598234623670578, "learning_rate": 1.83e-06, "loss": 1.486, "step": 61 }, { "epoch": 0.6595744680851063, "grad_norm": 0.1404409557580948, "learning_rate": 1.86e-06, "loss": 1.4557, "step": 62 }, { "epoch": 0.6702127659574468, "grad_norm": 0.11349129676818848, "learning_rate": 1.8900000000000001e-06, "loss": 1.3969, "step": 63 }, { "epoch": 0.6808510638297872, "grad_norm": 0.10858353972434998, "learning_rate": 1.9200000000000003e-06, "loss": 1.3515, "step": 64 }, { "epoch": 0.6914893617021277, "grad_norm": 0.11054569482803345, "learning_rate": 1.95e-06, "loss": 1.248, "step": 65 }, { "epoch": 0.7021276595744681, "grad_norm": 0.11826737225055695, "learning_rate": 1.98e-06, "loss": 1.3439, "step": 66 }, { "epoch": 0.7127659574468085, "grad_norm": 0.12291310727596283, "learning_rate": 2.0100000000000002e-06, "loss": 1.4226, "step": 67 }, { "epoch": 0.723404255319149, "grad_norm": 0.1383126825094223, "learning_rate": 2.0400000000000004e-06, "loss": 1.194, "step": 68 }, { "epoch": 0.7340425531914894, "grad_norm": 0.10981890559196472, "learning_rate": 2.07e-06, "loss": 1.2502, "step": 69 }, { "epoch": 0.7446808510638298, "grad_norm": 0.10639657825231552, "learning_rate": 2.1e-06, "loss": 1.2262, "step": 70 }, { "epoch": 0.7553191489361702, "grad_norm": 0.46177396178245544, "learning_rate": 2.13e-06, "loss": 5.063, "step": 71 }, { "epoch": 0.7659574468085106, "grad_norm": 0.11061427742242813, "learning_rate": 2.16e-06, "loss": 1.3856, "step": 72 }, { "epoch": 0.7659574468085106, "eval_loss": 1.5843762159347534, "eval_runtime": 2.9943, "eval_samples_per_second": 35.066, "eval_steps_per_second": 17.7, "step": 72 }, { "epoch": 0.776595744680851, "grad_norm": 0.13233324885368347, "learning_rate": 2.19e-06, "loss": 1.526, "step": 73 }, { "epoch": 0.7872340425531915, "grad_norm": 0.14500053226947784, "learning_rate": 2.22e-06, "loss": 1.2554, "step": 74 }, { "epoch": 0.7978723404255319, "grad_norm": 0.10629269480705261, "learning_rate": 2.25e-06, "loss": 1.2917, "step": 75 }, { "epoch": 0.8085106382978723, "grad_norm": 0.09674028307199478, "learning_rate": 2.28e-06, "loss": 1.2169, "step": 76 }, { "epoch": 0.8191489361702128, "grad_norm": 0.11965947598218918, "learning_rate": 2.31e-06, "loss": 1.2445, "step": 77 }, { "epoch": 0.8297872340425532, "grad_norm": 0.10441229492425919, "learning_rate": 2.34e-06, "loss": 1.2392, "step": 78 }, { "epoch": 0.8404255319148937, "grad_norm": 0.13795869052410126, "learning_rate": 2.37e-06, "loss": 1.5335, "step": 79 }, { "epoch": 0.851063829787234, "grad_norm": 0.10789927840232849, "learning_rate": 2.4000000000000003e-06, "loss": 1.2309, "step": 80 }, { "epoch": 0.8617021276595744, "grad_norm": 0.10697130113840103, "learning_rate": 2.43e-06, "loss": 1.276, "step": 81 }, { "epoch": 0.8723404255319149, "grad_norm": 0.11484125256538391, "learning_rate": 2.4599999999999997e-06, "loss": 1.2163, "step": 82 }, { "epoch": 0.8829787234042553, "grad_norm": 0.09692610800266266, "learning_rate": 2.49e-06, "loss": 1.1554, "step": 83 }, { "epoch": 0.8936170212765957, "grad_norm": 0.10697747021913528, "learning_rate": 2.52e-06, "loss": 1.211, "step": 84 }, { "epoch": 0.9042553191489362, "grad_norm": 0.10578318685293198, "learning_rate": 2.55e-06, "loss": 1.2634, "step": 85 }, { "epoch": 0.9148936170212766, "grad_norm": 0.10587752610445023, "learning_rate": 2.58e-06, "loss": 1.369, "step": 86 }, { "epoch": 0.925531914893617, "grad_norm": 0.11430489271879196, "learning_rate": 2.61e-06, "loss": 1.3048, "step": 87 }, { "epoch": 0.9361702127659575, "grad_norm": 0.11116154491901398, "learning_rate": 2.64e-06, "loss": 1.1883, "step": 88 }, { "epoch": 0.9468085106382979, "grad_norm": 0.12686476111412048, "learning_rate": 2.6700000000000003e-06, "loss": 1.2605, "step": 89 }, { "epoch": 0.9574468085106383, "grad_norm": 0.10976041853427887, "learning_rate": 2.7e-06, "loss": 1.2345, "step": 90 }, { "epoch": 0.9680851063829787, "grad_norm": 0.12391550838947296, "learning_rate": 2.73e-06, "loss": 1.2233, "step": 91 }, { "epoch": 0.9787234042553191, "grad_norm": 0.1277073174715042, "learning_rate": 2.7600000000000003e-06, "loss": 1.2437, "step": 92 }, { "epoch": 0.9893617021276596, "grad_norm": 0.10421616584062576, "learning_rate": 2.7900000000000004e-06, "loss": 1.1432, "step": 93 }, { "epoch": 1.0, "grad_norm": 0.12696842849254608, "learning_rate": 2.82e-06, "loss": 1.3558, "step": 94 } ], "logging_steps": 1, "max_steps": 376, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 94, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3164980962656256.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }