{ "best_metric": null, "best_model_checkpoint": null, "epoch": 9.0625, "eval_steps": 500, "global_step": 1160, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.078125, "grad_norm": 6.125, "learning_rate": 1e-05, "loss": 12.2019, "step": 10 }, { "epoch": 0.15625, "grad_norm": 6.5, "learning_rate": 2e-05, "loss": 11.8976, "step": 20 }, { "epoch": 0.234375, "grad_norm": 7.46875, "learning_rate": 3e-05, "loss": 11.1732, "step": 30 }, { "epoch": 0.3125, "grad_norm": 7.0625, "learning_rate": 4e-05, "loss": 9.8831, "step": 40 }, { "epoch": 0.390625, "grad_norm": 7.375, "learning_rate": 5e-05, "loss": 8.4578, "step": 50 }, { "epoch": 0.46875, "grad_norm": 8.9375, "learning_rate": 6e-05, "loss": 7.2717, "step": 60 }, { "epoch": 0.546875, "grad_norm": 12.25, "learning_rate": 7e-05, "loss": 6.1268, "step": 70 }, { "epoch": 0.625, "grad_norm": 13.4375, "learning_rate": 8e-05, "loss": 4.9462, "step": 80 }, { "epoch": 0.703125, "grad_norm": 15.375, "learning_rate": 9e-05, "loss": 3.5646, "step": 90 }, { "epoch": 0.78125, "grad_norm": 15.375, "learning_rate": 0.0001, "loss": 2.0469, "step": 100 }, { "epoch": 0.859375, "grad_norm": 3.71875, "learning_rate": 9.915254237288136e-05, "loss": 0.6762, "step": 110 }, { "epoch": 0.9375, "grad_norm": 3.734375, "learning_rate": 9.830508474576272e-05, "loss": 0.3449, "step": 120 }, { "epoch": 1.015625, "grad_norm": 2.609375, "learning_rate": 9.745762711864407e-05, "loss": 0.2995, "step": 130 }, { "epoch": 1.09375, "grad_norm": 2.375, "learning_rate": 9.661016949152543e-05, "loss": 0.2842, "step": 140 }, { "epoch": 1.171875, "grad_norm": 1.703125, "learning_rate": 9.576271186440679e-05, "loss": 0.2791, "step": 150 }, { "epoch": 1.25, "grad_norm": 1.0390625, "learning_rate": 9.491525423728815e-05, "loss": 0.2459, "step": 160 }, { "epoch": 1.328125, "grad_norm": 1.6484375, "learning_rate": 9.40677966101695e-05, "loss": 0.2479, "step": 170 }, { "epoch": 1.40625, "grad_norm": 1.0, "learning_rate": 9.322033898305085e-05, "loss": 0.2573, "step": 180 }, { "epoch": 1.484375, "grad_norm": 1.1171875, "learning_rate": 9.237288135593221e-05, "loss": 0.2424, "step": 190 }, { "epoch": 1.5625, "grad_norm": 0.8125, "learning_rate": 9.152542372881357e-05, "loss": 0.2339, "step": 200 }, { "epoch": 1.640625, "grad_norm": 0.95703125, "learning_rate": 9.067796610169493e-05, "loss": 0.2326, "step": 210 }, { "epoch": 1.71875, "grad_norm": 1.90625, "learning_rate": 8.983050847457629e-05, "loss": 0.2267, "step": 220 }, { "epoch": 1.796875, "grad_norm": 0.88671875, "learning_rate": 8.898305084745763e-05, "loss": 0.2233, "step": 230 }, { "epoch": 1.875, "grad_norm": 1.109375, "learning_rate": 8.813559322033899e-05, "loss": 0.2211, "step": 240 }, { "epoch": 1.953125, "grad_norm": 0.80859375, "learning_rate": 8.728813559322035e-05, "loss": 0.2287, "step": 250 }, { "epoch": 2.03125, "grad_norm": 0.87109375, "learning_rate": 8.644067796610171e-05, "loss": 0.2169, "step": 260 }, { "epoch": 2.109375, "grad_norm": 1.2890625, "learning_rate": 8.559322033898305e-05, "loss": 0.2161, "step": 270 }, { "epoch": 2.1875, "grad_norm": 0.75390625, "learning_rate": 8.474576271186441e-05, "loss": 0.2192, "step": 280 }, { "epoch": 2.265625, "grad_norm": 0.56640625, "learning_rate": 8.389830508474577e-05, "loss": 0.2136, "step": 290 }, { "epoch": 2.34375, "grad_norm": 0.7265625, "learning_rate": 8.305084745762712e-05, "loss": 0.1974, "step": 300 }, { "epoch": 2.421875, "grad_norm": 0.78515625, "learning_rate": 8.220338983050848e-05, "loss": 0.2046, "step": 310 }, { "epoch": 2.5, "grad_norm": 0.8984375, "learning_rate": 8.135593220338983e-05, "loss": 0.1961, "step": 320 }, { "epoch": 2.578125, "grad_norm": 0.51171875, "learning_rate": 8.050847457627118e-05, "loss": 0.196, "step": 330 }, { "epoch": 2.65625, "grad_norm": 0.5234375, "learning_rate": 7.966101694915254e-05, "loss": 0.1955, "step": 340 }, { "epoch": 2.734375, "grad_norm": 0.8125, "learning_rate": 7.88135593220339e-05, "loss": 0.1867, "step": 350 }, { "epoch": 2.8125, "grad_norm": 0.57421875, "learning_rate": 7.796610169491526e-05, "loss": 0.184, "step": 360 }, { "epoch": 2.890625, "grad_norm": 0.5, "learning_rate": 7.711864406779662e-05, "loss": 0.1911, "step": 370 }, { "epoch": 2.96875, "grad_norm": 0.72265625, "learning_rate": 7.627118644067796e-05, "loss": 0.185, "step": 380 }, { "epoch": 3.046875, "grad_norm": 0.87109375, "learning_rate": 7.542372881355932e-05, "loss": 0.1682, "step": 390 }, { "epoch": 3.125, "grad_norm": 0.94140625, "learning_rate": 7.457627118644068e-05, "loss": 0.166, "step": 400 }, { "epoch": 3.203125, "grad_norm": 0.73828125, "learning_rate": 7.372881355932204e-05, "loss": 0.1683, "step": 410 }, { "epoch": 3.28125, "grad_norm": 0.640625, "learning_rate": 7.288135593220338e-05, "loss": 0.1672, "step": 420 }, { "epoch": 3.359375, "grad_norm": 0.478515625, "learning_rate": 7.203389830508474e-05, "loss": 0.1518, "step": 430 }, { "epoch": 3.4375, "grad_norm": 0.6796875, "learning_rate": 7.11864406779661e-05, "loss": 0.1555, "step": 440 }, { "epoch": 3.515625, "grad_norm": 0.5, "learning_rate": 7.033898305084746e-05, "loss": 0.1632, "step": 450 }, { "epoch": 3.59375, "grad_norm": 15.8125, "learning_rate": 6.949152542372882e-05, "loss": 0.1647, "step": 460 }, { "epoch": 3.671875, "grad_norm": 0.71484375, "learning_rate": 6.864406779661017e-05, "loss": 0.156, "step": 470 }, { "epoch": 3.75, "grad_norm": 1.28125, "learning_rate": 6.779661016949152e-05, "loss": 0.1445, "step": 480 }, { "epoch": 3.828125, "grad_norm": 0.6171875, "learning_rate": 6.694915254237288e-05, "loss": 0.125, "step": 490 }, { "epoch": 3.90625, "grad_norm": 2.015625, "learning_rate": 6.610169491525424e-05, "loss": 0.1408, "step": 500 }, { "epoch": 3.984375, "grad_norm": 8.9375, "learning_rate": 6.52542372881356e-05, "loss": 0.1295, "step": 510 }, { "epoch": 4.0625, "grad_norm": 0.96484375, "learning_rate": 6.440677966101695e-05, "loss": 0.127, "step": 520 }, { "epoch": 4.140625, "grad_norm": 0.71875, "learning_rate": 6.35593220338983e-05, "loss": 0.1191, "step": 530 }, { "epoch": 4.21875, "grad_norm": 0.447265625, "learning_rate": 6.271186440677966e-05, "loss": 0.1288, "step": 540 }, { "epoch": 4.296875, "grad_norm": 0.8046875, "learning_rate": 6.186440677966102e-05, "loss": 0.105, "step": 550 }, { "epoch": 4.375, "grad_norm": 0.83203125, "learning_rate": 6.101694915254238e-05, "loss": 0.1171, "step": 560 }, { "epoch": 4.453125, "grad_norm": 1.3515625, "learning_rate": 6.016949152542373e-05, "loss": 0.1279, "step": 570 }, { "epoch": 4.53125, "grad_norm": 0.8203125, "learning_rate": 5.932203389830509e-05, "loss": 0.1049, "step": 580 }, { "epoch": 4.609375, "grad_norm": 31.0, "learning_rate": 5.8474576271186446e-05, "loss": 0.1511, "step": 590 }, { "epoch": 4.6875, "grad_norm": 0.54296875, "learning_rate": 5.76271186440678e-05, "loss": 0.1048, "step": 600 }, { "epoch": 4.765625, "grad_norm": 1.328125, "learning_rate": 5.677966101694916e-05, "loss": 0.1087, "step": 610 }, { "epoch": 4.84375, "grad_norm": 0.81640625, "learning_rate": 5.593220338983051e-05, "loss": 0.1018, "step": 620 }, { "epoch": 4.921875, "grad_norm": 1.25, "learning_rate": 5.508474576271186e-05, "loss": 0.1116, "step": 630 }, { "epoch": 5.0, "grad_norm": 0.7734375, "learning_rate": 5.423728813559322e-05, "loss": 0.1077, "step": 640 }, { "epoch": 5.078125, "grad_norm": 0.52734375, "learning_rate": 5.338983050847458e-05, "loss": 0.1047, "step": 650 }, { "epoch": 5.15625, "grad_norm": 0.71875, "learning_rate": 5.254237288135594e-05, "loss": 0.101, "step": 660 }, { "epoch": 5.234375, "grad_norm": 0.66796875, "learning_rate": 5.1694915254237284e-05, "loss": 0.0999, "step": 670 }, { "epoch": 5.3125, "grad_norm": 0.498046875, "learning_rate": 5.0847457627118643e-05, "loss": 0.0964, "step": 680 }, { "epoch": 5.390625, "grad_norm": 0.52734375, "learning_rate": 5e-05, "loss": 0.1011, "step": 690 }, { "epoch": 5.46875, "grad_norm": 0.78515625, "learning_rate": 4.915254237288136e-05, "loss": 0.1019, "step": 700 }, { "epoch": 5.546875, "grad_norm": 0.7734375, "learning_rate": 4.8305084745762714e-05, "loss": 0.1089, "step": 710 }, { "epoch": 5.625, "grad_norm": 0.6953125, "learning_rate": 4.745762711864407e-05, "loss": 0.0998, "step": 720 }, { "epoch": 5.703125, "grad_norm": 10.375, "learning_rate": 4.6610169491525425e-05, "loss": 0.1138, "step": 730 }, { "epoch": 5.78125, "grad_norm": 0.67578125, "learning_rate": 4.5762711864406784e-05, "loss": 0.1189, "step": 740 }, { "epoch": 5.859375, "grad_norm": 0.6328125, "learning_rate": 4.491525423728814e-05, "loss": 0.1006, "step": 750 }, { "epoch": 5.9375, "grad_norm": 0.90625, "learning_rate": 4.4067796610169495e-05, "loss": 0.1049, "step": 760 }, { "epoch": 6.015625, "grad_norm": 0.46875, "learning_rate": 4.3220338983050854e-05, "loss": 0.0988, "step": 770 }, { "epoch": 6.09375, "grad_norm": 0.953125, "learning_rate": 4.2372881355932206e-05, "loss": 0.1061, "step": 780 }, { "epoch": 6.171875, "grad_norm": 0.51171875, "learning_rate": 4.152542372881356e-05, "loss": 0.0999, "step": 790 }, { "epoch": 6.25, "grad_norm": 0.52734375, "learning_rate": 4.067796610169492e-05, "loss": 0.102, "step": 800 }, { "epoch": 6.328125, "grad_norm": 0.98828125, "learning_rate": 3.983050847457627e-05, "loss": 0.0933, "step": 810 }, { "epoch": 6.40625, "grad_norm": 0.45703125, "learning_rate": 3.898305084745763e-05, "loss": 0.0963, "step": 820 }, { "epoch": 6.484375, "grad_norm": 0.93359375, "learning_rate": 3.813559322033898e-05, "loss": 0.1018, "step": 830 }, { "epoch": 6.5625, "grad_norm": 0.6484375, "learning_rate": 3.728813559322034e-05, "loss": 0.1038, "step": 840 }, { "epoch": 6.640625, "grad_norm": 0.51953125, "learning_rate": 3.644067796610169e-05, "loss": 0.0978, "step": 850 }, { "epoch": 6.71875, "grad_norm": 0.515625, "learning_rate": 3.559322033898305e-05, "loss": 0.1033, "step": 860 }, { "epoch": 6.796875, "grad_norm": 0.58203125, "learning_rate": 3.474576271186441e-05, "loss": 0.0998, "step": 870 }, { "epoch": 6.875, "grad_norm": 0.546875, "learning_rate": 3.389830508474576e-05, "loss": 0.0943, "step": 880 }, { "epoch": 6.953125, "grad_norm": 0.796875, "learning_rate": 3.305084745762712e-05, "loss": 0.1026, "step": 890 }, { "epoch": 7.03125, "grad_norm": 0.60546875, "learning_rate": 3.2203389830508473e-05, "loss": 0.1009, "step": 900 }, { "epoch": 7.109375, "grad_norm": 0.82421875, "learning_rate": 3.135593220338983e-05, "loss": 0.0949, "step": 910 }, { "epoch": 7.1875, "grad_norm": 0.51953125, "learning_rate": 3.050847457627119e-05, "loss": 0.0969, "step": 920 }, { "epoch": 7.265625, "grad_norm": 0.60546875, "learning_rate": 2.9661016949152544e-05, "loss": 0.0965, "step": 930 }, { "epoch": 7.34375, "grad_norm": 0.546875, "learning_rate": 2.88135593220339e-05, "loss": 0.0929, "step": 940 }, { "epoch": 7.421875, "grad_norm": 0.796875, "learning_rate": 2.7966101694915255e-05, "loss": 0.0982, "step": 950 }, { "epoch": 7.5, "grad_norm": 0.63671875, "learning_rate": 2.711864406779661e-05, "loss": 0.1018, "step": 960 }, { "epoch": 7.578125, "grad_norm": 0.7265625, "learning_rate": 2.627118644067797e-05, "loss": 0.0926, "step": 970 }, { "epoch": 7.65625, "grad_norm": 0.8203125, "learning_rate": 2.5423728813559322e-05, "loss": 0.0944, "step": 980 }, { "epoch": 7.734375, "grad_norm": 0.59375, "learning_rate": 2.457627118644068e-05, "loss": 0.0957, "step": 990 }, { "epoch": 7.8125, "grad_norm": 2.703125, "learning_rate": 2.3728813559322036e-05, "loss": 0.0974, "step": 1000 }, { "epoch": 7.890625, "grad_norm": 0.77734375, "learning_rate": 2.2881355932203392e-05, "loss": 0.1005, "step": 1010 }, { "epoch": 7.96875, "grad_norm": 0.70703125, "learning_rate": 2.2033898305084748e-05, "loss": 0.0988, "step": 1020 }, { "epoch": 8.046875, "grad_norm": 0.73046875, "learning_rate": 2.1186440677966103e-05, "loss": 0.0968, "step": 1030 }, { "epoch": 8.125, "grad_norm": 0.7265625, "learning_rate": 2.033898305084746e-05, "loss": 0.0951, "step": 1040 }, { "epoch": 8.203125, "grad_norm": 0.6953125, "learning_rate": 1.9491525423728814e-05, "loss": 0.0925, "step": 1050 }, { "epoch": 8.28125, "grad_norm": 0.498046875, "learning_rate": 1.864406779661017e-05, "loss": 0.0953, "step": 1060 }, { "epoch": 8.359375, "grad_norm": 0.546875, "learning_rate": 1.7796610169491526e-05, "loss": 0.0966, "step": 1070 }, { "epoch": 8.4375, "grad_norm": 0.6796875, "learning_rate": 1.694915254237288e-05, "loss": 0.1083, "step": 1080 }, { "epoch": 8.515625, "grad_norm": 0.70703125, "learning_rate": 1.6101694915254237e-05, "loss": 0.0992, "step": 1090 }, { "epoch": 8.59375, "grad_norm": 0.52734375, "learning_rate": 1.5254237288135596e-05, "loss": 0.11, "step": 1100 }, { "epoch": 8.671875, "grad_norm": 0.58984375, "learning_rate": 1.440677966101695e-05, "loss": 0.1163, "step": 1110 }, { "epoch": 8.75, "grad_norm": 0.51953125, "learning_rate": 1.3559322033898305e-05, "loss": 0.1086, "step": 1120 }, { "epoch": 8.828125, "grad_norm": 0.93359375, "learning_rate": 1.2711864406779661e-05, "loss": 0.0908, "step": 1130 }, { "epoch": 8.90625, "grad_norm": 0.765625, "learning_rate": 1.1864406779661018e-05, "loss": 0.0967, "step": 1140 }, { "epoch": 8.984375, "grad_norm": 0.64453125, "learning_rate": 1.1016949152542374e-05, "loss": 0.0959, "step": 1150 }, { "epoch": 9.0625, "grad_norm": 0.5546875, "learning_rate": 1.016949152542373e-05, "loss": 0.0855, "step": 1160 } ], "logging_steps": 10, "max_steps": 1280, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 10, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 4466660675659776.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }