diff --git a/adapters/saved-alpaca-13b/adapter_config.json b/adapters/saved-alpaca-13b/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..4ba49948a8f8232ee95452e47fcf9bd523635048 --- /dev/null +++ b/adapters/saved-alpaca-13b/adapter_config.json @@ -0,0 +1,18 @@ +{ + "base_model_name_or_path": "/mnt/bn/qingyi-bn-lq/llama/llama-13b-hf", + "bias": "none", + "enable_lora": null, + "fan_in_fan_out": false, + "inference_mode": true, + "lora_alpha": 16, + "lora_dropout": 0.05, + "merge_weights": false, + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/adapters/saved-alpaca-13b/adapter_model.bin b/adapters/saved-alpaca-13b/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..fe59620d99423116237f88507d383130a771ec15 --- /dev/null +++ b/adapters/saved-alpaca-13b/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:17f520b2585949c23c374ad167d169b4fc21b3cc8411305b18ae1b7bd1d49002 +size 26271757 diff --git a/adapters/saved-alpaca-13b/checkpoint-1000/optimizer.pt b/adapters/saved-alpaca-13b/checkpoint-1000/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..9bea23d71ab7d8941495b192c263d658441cd8ac --- /dev/null +++ b/adapters/saved-alpaca-13b/checkpoint-1000/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4e7c503114ad958265cb0b9937aa6f04e6b3e5ebed55f402be0dfe2608728e1b +size 52523141 diff --git a/adapters/saved-alpaca-13b/checkpoint-1000/pytorch_model.bin b/adapters/saved-alpaca-13b/checkpoint-1000/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..22aec9d48a4e933a403baffbbe03531ea69915f2 --- /dev/null +++ b/adapters/saved-alpaca-13b/checkpoint-1000/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9a485291ae756d9ce2a9f1e587dca5a7f2e956eb0d2ac36a0538fd9ee8fe8568 +size 26271757 diff --git a/adapters/saved-alpaca-13b/checkpoint-1000/rng_state_0.pth b/adapters/saved-alpaca-13b/checkpoint-1000/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..43af9775a693dc0b8a7f29178e97bc3f4b9bed65 --- /dev/null +++ b/adapters/saved-alpaca-13b/checkpoint-1000/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9b2d4f40efc056e291f0e7bc62df73b1723ac76b464e4dfa2950f67d2923899f +size 14583 diff --git a/adapters/saved-alpaca-13b/checkpoint-1000/rng_state_1.pth b/adapters/saved-alpaca-13b/checkpoint-1000/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..3e07ba837bba8650c4029a055f0be9e762081640 --- /dev/null +++ b/adapters/saved-alpaca-13b/checkpoint-1000/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fdb44bc4712cb02da41900ef02cb5e2115d133414246f5122c59a215c7fe47e9 +size 14583 diff --git a/adapters/saved-alpaca-13b/checkpoint-1000/rng_state_2.pth b/adapters/saved-alpaca-13b/checkpoint-1000/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..f33b7c29dae11335092246b98d2da3d6cf20f76e --- /dev/null +++ b/adapters/saved-alpaca-13b/checkpoint-1000/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:502133cf23fc1f7cde440bff04cdc89a4fd570c0158d3b2ae9f6012edcf11ba3 +size 14583 diff --git a/adapters/saved-alpaca-13b/checkpoint-1000/rng_state_3.pth b/adapters/saved-alpaca-13b/checkpoint-1000/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..1c9c6e925d992ed110fff77c0f161d0cd5870632 --- /dev/null +++ b/adapters/saved-alpaca-13b/checkpoint-1000/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cfea5e1888cf75341567d2f38c227d33ec186ecd19e89de39809a43d9f05a0e1 +size 14583 diff --git a/adapters/saved-alpaca-13b/checkpoint-1000/rng_state_4.pth b/adapters/saved-alpaca-13b/checkpoint-1000/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..79042a801bcb66171d70af2788cdd7f553b92490 --- /dev/null +++ b/adapters/saved-alpaca-13b/checkpoint-1000/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8d53cb797ccdf76e13be23cc92a67cb626bc8920b6455ae36c4554d606c59f38 +size 14583 diff --git a/adapters/saved-alpaca-13b/checkpoint-1000/rng_state_5.pth b/adapters/saved-alpaca-13b/checkpoint-1000/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..9b1a58e65e1a2f16582630640b46e3bd7ec6d38e --- /dev/null +++ b/adapters/saved-alpaca-13b/checkpoint-1000/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b6c980ec98f6a650a1f1406bf0637c97c375a37ccb6617e7c9214eb5329e587b +size 14583 diff --git a/adapters/saved-alpaca-13b/checkpoint-1000/rng_state_6.pth b/adapters/saved-alpaca-13b/checkpoint-1000/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..2caad348803c57ab34baf7a429b9d8a7b819864b --- /dev/null +++ b/adapters/saved-alpaca-13b/checkpoint-1000/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:19beeabb3d073c196d42ab61f0ea7ddee10e55c26f725f3343c43d2eb06e7226 +size 14583 diff --git a/adapters/saved-alpaca-13b/checkpoint-1000/rng_state_7.pth b/adapters/saved-alpaca-13b/checkpoint-1000/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..11c3905910782ab4352098b0b8e0e723c8b53945 --- /dev/null +++ b/adapters/saved-alpaca-13b/checkpoint-1000/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5e8deea3980223c29fe97f5415e158a42ed7c6e93ed515673cc65d3adaf369bb +size 14583 diff --git a/adapters/saved-alpaca-13b/checkpoint-1000/scaler.pt b/adapters/saved-alpaca-13b/checkpoint-1000/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..efdbd3c795f6b0d4144e68355e99c220ccdedd09 --- /dev/null +++ b/adapters/saved-alpaca-13b/checkpoint-1000/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:68cff80b680ddf6e7abbef98b5f336b97f9b5963e2209307f639383870e8cc71 +size 557 diff --git a/adapters/saved-alpaca-13b/checkpoint-1000/scheduler.pt b/adapters/saved-alpaca-13b/checkpoint-1000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..483d11399557624504dab1563e0154444b1c9dd8 --- /dev/null +++ b/adapters/saved-alpaca-13b/checkpoint-1000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:988fa96f15b0bcf68e96b1a4d321f89c5a5aca28eeae640fe236375758ba5304 +size 627 diff --git a/adapters/saved-alpaca-13b/checkpoint-1000/trainer_state.json b/adapters/saved-alpaca-13b/checkpoint-1000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..24fec7443a5c86598e1547569e8365c841a6d4f9 --- /dev/null +++ b/adapters/saved-alpaca-13b/checkpoint-1000/trainer_state.json @@ -0,0 +1,356 @@ +{ + "best_metric": 0.8150926828384399, + "best_model_checkpoint": "/mnt/bn/qingyi-bn-lq/llama/saved-alpaca-13b/checkpoint-1000", + "epoch": 2.5624599615631007, + "global_step": 1000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.05, + "learning_rate": 5.9999999999999995e-05, + "loss": 1.9024, + "step": 20 + }, + { + "epoch": 0.1, + "learning_rate": 0.00011999999999999999, + "loss": 1.4401, + "step": 40 + }, + { + "epoch": 0.15, + "learning_rate": 0.00017999999999999998, + "loss": 0.9439, + "step": 60 + }, + { + "epoch": 0.2, + "learning_rate": 0.00023999999999999998, + "loss": 0.8693, + "step": 80 + }, + { + "epoch": 0.26, + "learning_rate": 0.0003, + "loss": 0.8598, + "step": 100 + }, + { + "epoch": 0.31, + "learning_rate": 0.00029439252336448596, + "loss": 0.8485, + "step": 120 + }, + { + "epoch": 0.36, + "learning_rate": 0.00028878504672897194, + "loss": 0.8323, + "step": 140 + }, + { + "epoch": 0.41, + "learning_rate": 0.0002831775700934579, + "loss": 0.8364, + "step": 160 + }, + { + "epoch": 0.46, + "learning_rate": 0.0002775700934579439, + "loss": 0.8364, + "step": 180 + }, + { + "epoch": 0.51, + "learning_rate": 0.0002719626168224299, + "loss": 0.842, + "step": 200 + }, + { + "epoch": 0.51, + "eval_loss": 0.8336600661277771, + "eval_runtime": 14.4551, + "eval_samples_per_second": 138.359, + "eval_steps_per_second": 2.214, + "step": 200 + }, + { + "epoch": 0.56, + "learning_rate": 0.00026635514018691586, + "loss": 0.8289, + "step": 220 + }, + { + "epoch": 0.61, + "learning_rate": 0.00026074766355140184, + "loss": 0.8383, + "step": 240 + }, + { + "epoch": 0.67, + "learning_rate": 0.0002551401869158878, + "loss": 0.822, + "step": 260 + }, + { + "epoch": 0.72, + "learning_rate": 0.0002495327102803738, + "loss": 0.8378, + "step": 280 + }, + { + "epoch": 0.77, + "learning_rate": 0.0002439252336448598, + "loss": 0.8275, + "step": 300 + }, + { + "epoch": 0.82, + "learning_rate": 0.00023831775700934577, + "loss": 0.8225, + "step": 320 + }, + { + "epoch": 0.87, + "learning_rate": 0.00023271028037383175, + "loss": 0.8188, + "step": 340 + }, + { + "epoch": 0.92, + "learning_rate": 0.00022710280373831773, + "loss": 0.8251, + "step": 360 + }, + { + "epoch": 0.97, + "learning_rate": 0.0002214953271028037, + "loss": 0.8107, + "step": 380 + }, + { + "epoch": 1.02, + "learning_rate": 0.0002158878504672897, + "loss": 0.806, + "step": 400 + }, + { + "epoch": 1.02, + "eval_loss": 0.8227179050445557, + "eval_runtime": 14.4306, + "eval_samples_per_second": 138.594, + "eval_steps_per_second": 2.218, + "step": 400 + }, + { + "epoch": 1.08, + "learning_rate": 0.00021028037383177567, + "loss": 0.8157, + "step": 420 + }, + { + "epoch": 1.13, + "learning_rate": 0.00020467289719626166, + "loss": 0.8139, + "step": 440 + }, + { + "epoch": 1.18, + "learning_rate": 0.00019906542056074764, + "loss": 0.8203, + "step": 460 + }, + { + "epoch": 1.23, + "learning_rate": 0.00019345794392523362, + "loss": 0.8183, + "step": 480 + }, + { + "epoch": 1.28, + "learning_rate": 0.0001878504672897196, + "loss": 0.8046, + "step": 500 + }, + { + "epoch": 1.33, + "learning_rate": 0.00018224299065420558, + "loss": 0.8053, + "step": 520 + }, + { + "epoch": 1.38, + "learning_rate": 0.00017663551401869156, + "loss": 0.8037, + "step": 540 + }, + { + "epoch": 1.43, + "learning_rate": 0.00017102803738317754, + "loss": 0.8036, + "step": 560 + }, + { + "epoch": 1.49, + "learning_rate": 0.00016542056074766352, + "loss": 0.7971, + "step": 580 + }, + { + "epoch": 1.54, + "learning_rate": 0.0001598130841121495, + "loss": 0.8024, + "step": 600 + }, + { + "epoch": 1.54, + "eval_loss": 0.8188450932502747, + "eval_runtime": 14.5635, + "eval_samples_per_second": 137.329, + "eval_steps_per_second": 2.197, + "step": 600 + }, + { + "epoch": 1.59, + "learning_rate": 0.0001542056074766355, + "loss": 0.8042, + "step": 620 + }, + { + "epoch": 1.64, + "learning_rate": 0.00014859813084112147, + "loss": 0.8148, + "step": 640 + }, + { + "epoch": 1.69, + "learning_rate": 0.00014299065420560745, + "loss": 0.7976, + "step": 660 + }, + { + "epoch": 1.74, + "learning_rate": 0.00013738317757009343, + "loss": 0.8112, + "step": 680 + }, + { + "epoch": 1.79, + "learning_rate": 0.0001317757009345794, + "loss": 0.805, + "step": 700 + }, + { + "epoch": 1.84, + "learning_rate": 0.0001261682242990654, + "loss": 0.797, + "step": 720 + }, + { + "epoch": 1.9, + "learning_rate": 0.00012056074766355139, + "loss": 0.7883, + "step": 740 + }, + { + "epoch": 1.95, + "learning_rate": 0.00011495327102803737, + "loss": 0.8026, + "step": 760 + }, + { + "epoch": 2.0, + "learning_rate": 0.00010934579439252335, + "loss": 0.8098, + "step": 780 + }, + { + "epoch": 2.05, + "learning_rate": 0.00010373831775700933, + "loss": 0.8021, + "step": 800 + }, + { + "epoch": 2.05, + "eval_loss": 0.8164276480674744, + "eval_runtime": 14.5011, + "eval_samples_per_second": 137.921, + "eval_steps_per_second": 2.207, + "step": 800 + }, + { + "epoch": 2.1, + "learning_rate": 9.813084112149531e-05, + "loss": 0.7967, + "step": 820 + }, + { + "epoch": 2.15, + "learning_rate": 9.25233644859813e-05, + "loss": 0.793, + "step": 840 + }, + { + "epoch": 2.2, + "learning_rate": 8.691588785046728e-05, + "loss": 0.8026, + "step": 860 + }, + { + "epoch": 2.25, + "learning_rate": 8.130841121495326e-05, + "loss": 0.7911, + "step": 880 + }, + { + "epoch": 2.31, + "learning_rate": 7.570093457943924e-05, + "loss": 0.8042, + "step": 900 + }, + { + "epoch": 2.36, + "learning_rate": 7.009345794392522e-05, + "loss": 0.7994, + "step": 920 + }, + { + "epoch": 2.41, + "learning_rate": 6.44859813084112e-05, + "loss": 0.8056, + "step": 940 + }, + { + "epoch": 2.46, + "learning_rate": 5.887850467289719e-05, + "loss": 0.7943, + "step": 960 + }, + { + "epoch": 2.51, + "learning_rate": 5.327102803738317e-05, + "loss": 0.7987, + "step": 980 + }, + { + "epoch": 2.56, + "learning_rate": 4.766355140186915e-05, + "loss": 0.8006, + "step": 1000 + }, + { + "epoch": 2.56, + "eval_loss": 0.8150926828384399, + "eval_runtime": 14.519, + "eval_samples_per_second": 137.751, + "eval_steps_per_second": 2.204, + "step": 1000 + } + ], + "max_steps": 1170, + "num_train_epochs": 3, + "total_flos": 2.527783339700519e+18, + "trial_name": null, + "trial_params": null +} diff --git a/adapters/saved-alpaca-13b/checkpoint-1000/training_args.bin b/adapters/saved-alpaca-13b/checkpoint-1000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..0dd9d576e81f123903a3d80b6b40a5e98dcac191 --- /dev/null +++ b/adapters/saved-alpaca-13b/checkpoint-1000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:569cffde1178ce7cb3b47607b0b6b1b562a8816ca51d5d79948ec2815c618c99 +size 3579 diff --git a/adapters/saved-alpaca-13b/checkpoint-600/optimizer.pt b/adapters/saved-alpaca-13b/checkpoint-600/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..4ce53dd7abcc62543ff18c842930f5091c726226 --- /dev/null +++ b/adapters/saved-alpaca-13b/checkpoint-600/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f0e445eed20c1696cd10624c265dfe0008908b629a4c3b68d091dbdd6d6d15bb +size 52523141 diff --git a/adapters/saved-alpaca-13b/checkpoint-600/pytorch_model.bin b/adapters/saved-alpaca-13b/checkpoint-600/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..efaaf1142cf1cdd973c19ff54b197e66586eaec4 --- /dev/null +++ b/adapters/saved-alpaca-13b/checkpoint-600/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3388b0e66a46f5f58661b8842f81cc59c7403c62035ba064e3a82985570fc045 +size 26271757 diff --git a/adapters/saved-alpaca-13b/checkpoint-600/rng_state_0.pth b/adapters/saved-alpaca-13b/checkpoint-600/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..614eeb249be24512a1a8daedcf40ee5e2ed19e90 --- /dev/null +++ b/adapters/saved-alpaca-13b/checkpoint-600/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4a401151419ac788509518fecd7290393f6661cbb814b9d23e0507968245b6dd +size 14583 diff --git a/adapters/saved-alpaca-13b/checkpoint-600/rng_state_1.pth b/adapters/saved-alpaca-13b/checkpoint-600/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..b0e21283b80917863c1c1f57ba67b114bf0ab006 --- /dev/null +++ b/adapters/saved-alpaca-13b/checkpoint-600/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:39d31064b0c52dea5c8b99aaca805fad14c4188d2f37b7f3f99c5b4329ef0fe1 +size 14583 diff --git a/adapters/saved-alpaca-13b/checkpoint-600/rng_state_2.pth b/adapters/saved-alpaca-13b/checkpoint-600/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..ed9d745c42633e4f5044fb0ea65f395e6c39bc75 --- /dev/null +++ b/adapters/saved-alpaca-13b/checkpoint-600/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:82500c71ca1a097dc82fa54a6a17ac633526f2ce40d060dde7da01abca5a1530 +size 14583 diff --git a/adapters/saved-alpaca-13b/checkpoint-600/rng_state_3.pth b/adapters/saved-alpaca-13b/checkpoint-600/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..6360dab6a2c06d160081d77012797854f922566d --- /dev/null +++ b/adapters/saved-alpaca-13b/checkpoint-600/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c317a21316a208880af8f21030493d2ffa5f332154ffae8970e90d6d3b9baa7f +size 14583 diff --git a/adapters/saved-alpaca-13b/checkpoint-600/rng_state_4.pth b/adapters/saved-alpaca-13b/checkpoint-600/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..17a897d5bf9bcd09f37ba323e8748160be4d0e69 --- /dev/null +++ b/adapters/saved-alpaca-13b/checkpoint-600/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:df6fd724264e10e653e99e7343342410999eca369fecbf183329b923a782797e +size 14583 diff --git a/adapters/saved-alpaca-13b/checkpoint-600/rng_state_5.pth b/adapters/saved-alpaca-13b/checkpoint-600/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..7a755a6c6a3f5748b931618403a5eaa76f4519dd --- /dev/null +++ b/adapters/saved-alpaca-13b/checkpoint-600/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7a6b3bfc5317511071ea71fe365af9b93c488be84bd421286a3e319c75ae1ed7 +size 14583 diff --git a/adapters/saved-alpaca-13b/checkpoint-600/rng_state_6.pth b/adapters/saved-alpaca-13b/checkpoint-600/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..49ab0ba10de9f90e7281ec5da87230a56f0c1c2a --- /dev/null +++ b/adapters/saved-alpaca-13b/checkpoint-600/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:09b5773348f2c99824a5b650d0bd7cbae32b7808ed38e8264121e8b4ebc7c33c +size 14583 diff --git a/adapters/saved-alpaca-13b/checkpoint-600/rng_state_7.pth b/adapters/saved-alpaca-13b/checkpoint-600/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..9eb378b4f828f2342bf850809776f9fd5ca65c4f --- /dev/null +++ b/adapters/saved-alpaca-13b/checkpoint-600/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1e4a1e6d9da1d2200d29265401dbae1332be7010031641244ec5316bc187a6cd +size 14583 diff --git a/adapters/saved-alpaca-13b/checkpoint-600/scaler.pt b/adapters/saved-alpaca-13b/checkpoint-600/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..842791b612283ceb8e68b64ed8e40e81c5a97bce --- /dev/null +++ b/adapters/saved-alpaca-13b/checkpoint-600/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5dc9eacfeb00bd0bfeb98934a2309be01be65b288e0d747bbfc423b32679169f +size 557 diff --git a/adapters/saved-alpaca-13b/checkpoint-600/scheduler.pt b/adapters/saved-alpaca-13b/checkpoint-600/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..4c7bc11bb45bdbf773b1e9ce6b5f7314df71b760 --- /dev/null +++ b/adapters/saved-alpaca-13b/checkpoint-600/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ab0b5eaefe5317a6f29d9c670ecd5644d66afb60156c841d18e022a62f983d66 +size 627 diff --git a/adapters/saved-alpaca-13b/checkpoint-600/trainer_state.json b/adapters/saved-alpaca-13b/checkpoint-600/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..165b13aa4166b2e1578a58a7795d35f9e4178f5c --- /dev/null +++ b/adapters/saved-alpaca-13b/checkpoint-600/trainer_state.json @@ -0,0 +1,220 @@ +{ + "best_metric": 0.8188450932502747, + "best_model_checkpoint": "/mnt/bn/qingyi-bn-lq/llama/saved-alpaca-13b/checkpoint-600", + "epoch": 1.5374759769378603, + "global_step": 600, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.05, + "learning_rate": 5.9999999999999995e-05, + "loss": 1.9024, + "step": 20 + }, + { + "epoch": 0.1, + "learning_rate": 0.00011999999999999999, + "loss": 1.4401, + "step": 40 + }, + { + "epoch": 0.15, + "learning_rate": 0.00017999999999999998, + "loss": 0.9439, + "step": 60 + }, + { + "epoch": 0.2, + "learning_rate": 0.00023999999999999998, + "loss": 0.8693, + "step": 80 + }, + { + "epoch": 0.26, + "learning_rate": 0.0003, + "loss": 0.8598, + "step": 100 + }, + { + "epoch": 0.31, + "learning_rate": 0.00029439252336448596, + "loss": 0.8485, + "step": 120 + }, + { + "epoch": 0.36, + "learning_rate": 0.00028878504672897194, + "loss": 0.8323, + "step": 140 + }, + { + "epoch": 0.41, + "learning_rate": 0.0002831775700934579, + "loss": 0.8364, + "step": 160 + }, + { + "epoch": 0.46, + "learning_rate": 0.0002775700934579439, + "loss": 0.8364, + "step": 180 + }, + { + "epoch": 0.51, + "learning_rate": 0.0002719626168224299, + "loss": 0.842, + "step": 200 + }, + { + "epoch": 0.51, + "eval_loss": 0.8336600661277771, + "eval_runtime": 14.4551, + "eval_samples_per_second": 138.359, + "eval_steps_per_second": 2.214, + "step": 200 + }, + { + "epoch": 0.56, + "learning_rate": 0.00026635514018691586, + "loss": 0.8289, + "step": 220 + }, + { + "epoch": 0.61, + "learning_rate": 0.00026074766355140184, + "loss": 0.8383, + "step": 240 + }, + { + "epoch": 0.67, + "learning_rate": 0.0002551401869158878, + "loss": 0.822, + "step": 260 + }, + { + "epoch": 0.72, + "learning_rate": 0.0002495327102803738, + "loss": 0.8378, + "step": 280 + }, + { + "epoch": 0.77, + "learning_rate": 0.0002439252336448598, + "loss": 0.8275, + "step": 300 + }, + { + "epoch": 0.82, + "learning_rate": 0.00023831775700934577, + "loss": 0.8225, + "step": 320 + }, + { + "epoch": 0.87, + "learning_rate": 0.00023271028037383175, + "loss": 0.8188, + "step": 340 + }, + { + "epoch": 0.92, + "learning_rate": 0.00022710280373831773, + "loss": 0.8251, + "step": 360 + }, + { + "epoch": 0.97, + "learning_rate": 0.0002214953271028037, + "loss": 0.8107, + "step": 380 + }, + { + "epoch": 1.02, + "learning_rate": 0.0002158878504672897, + "loss": 0.806, + "step": 400 + }, + { + "epoch": 1.02, + "eval_loss": 0.8227179050445557, + "eval_runtime": 14.4306, + "eval_samples_per_second": 138.594, + "eval_steps_per_second": 2.218, + "step": 400 + }, + { + "epoch": 1.08, + "learning_rate": 0.00021028037383177567, + "loss": 0.8157, + "step": 420 + }, + { + "epoch": 1.13, + "learning_rate": 0.00020467289719626166, + "loss": 0.8139, + "step": 440 + }, + { + "epoch": 1.18, + "learning_rate": 0.00019906542056074764, + "loss": 0.8203, + "step": 460 + }, + { + "epoch": 1.23, + "learning_rate": 0.00019345794392523362, + "loss": 0.8183, + "step": 480 + }, + { + "epoch": 1.28, + "learning_rate": 0.0001878504672897196, + "loss": 0.8046, + "step": 500 + }, + { + "epoch": 1.33, + "learning_rate": 0.00018224299065420558, + "loss": 0.8053, + "step": 520 + }, + { + "epoch": 1.38, + "learning_rate": 0.00017663551401869156, + "loss": 0.8037, + "step": 540 + }, + { + "epoch": 1.43, + "learning_rate": 0.00017102803738317754, + "loss": 0.8036, + "step": 560 + }, + { + "epoch": 1.49, + "learning_rate": 0.00016542056074766352, + "loss": 0.7971, + "step": 580 + }, + { + "epoch": 1.54, + "learning_rate": 0.0001598130841121495, + "loss": 0.8024, + "step": 600 + }, + { + "epoch": 1.54, + "eval_loss": 0.8188450932502747, + "eval_runtime": 14.5635, + "eval_samples_per_second": 137.329, + "eval_steps_per_second": 2.197, + "step": 600 + } + ], + "max_steps": 1170, + "num_train_epochs": 3, + "total_flos": 1.5167016053306819e+18, + "trial_name": null, + "trial_params": null +} diff --git a/adapters/saved-alpaca-13b/checkpoint-600/training_args.bin b/adapters/saved-alpaca-13b/checkpoint-600/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..0dd9d576e81f123903a3d80b6b40a5e98dcac191 --- /dev/null +++ b/adapters/saved-alpaca-13b/checkpoint-600/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:569cffde1178ce7cb3b47607b0b6b1b562a8816ca51d5d79948ec2815c618c99 +size 3579 diff --git a/adapters/saved-alpaca-13b/checkpoint-800/optimizer.pt b/adapters/saved-alpaca-13b/checkpoint-800/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..c945ca1fdecb733225f306b431473caa073b9e19 --- /dev/null +++ b/adapters/saved-alpaca-13b/checkpoint-800/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6a0106565c0a3f46bd8586f3992cc17ea683b95bd5b565fe1ce96fd1667082e3 +size 52523141 diff --git a/adapters/saved-alpaca-13b/checkpoint-800/pytorch_model.bin b/adapters/saved-alpaca-13b/checkpoint-800/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..7b0c45e024a699d7cde0217658c614c916757d1f --- /dev/null +++ b/adapters/saved-alpaca-13b/checkpoint-800/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a6d5cf675ef9b1bb62432ca7fe7defdd413bc528b7b88c703059361c0592a5c1 +size 26271757 diff --git a/adapters/saved-alpaca-13b/checkpoint-800/rng_state_0.pth b/adapters/saved-alpaca-13b/checkpoint-800/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..97d896f7aa9de377437d716dd61e3bfb29abf746 --- /dev/null +++ b/adapters/saved-alpaca-13b/checkpoint-800/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8abd48d76a5f0f966df413c2c82c2c9e03343968546fb321f0e4dad0af2c5688 +size 14583 diff --git a/adapters/saved-alpaca-13b/checkpoint-800/rng_state_1.pth b/adapters/saved-alpaca-13b/checkpoint-800/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..be3d6bef65a0d319b051acd5df22822a6dfe218a --- /dev/null +++ b/adapters/saved-alpaca-13b/checkpoint-800/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f05d613698fda587a8e1836e4405f3a8a8b51e0d2f1657924ccf69f123b43daa +size 14583 diff --git a/adapters/saved-alpaca-13b/checkpoint-800/rng_state_2.pth b/adapters/saved-alpaca-13b/checkpoint-800/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..eb764d5af412de3eb8b573376a7ce17e04452b43 --- /dev/null +++ b/adapters/saved-alpaca-13b/checkpoint-800/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:001a14f29826dd590373d0e9c1051359997d64ec822fb50b1cabbd25868ef872 +size 14583 diff --git a/adapters/saved-alpaca-13b/checkpoint-800/rng_state_3.pth b/adapters/saved-alpaca-13b/checkpoint-800/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..f29175f236824e58f389e4789b1d75385cd1ebba --- /dev/null +++ b/adapters/saved-alpaca-13b/checkpoint-800/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:25ca8f826bd679c698d40e7c7b6421876a004d7ee0c7c1f4627142b8470543a3 +size 14583 diff --git a/adapters/saved-alpaca-13b/checkpoint-800/rng_state_4.pth b/adapters/saved-alpaca-13b/checkpoint-800/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..b721359a621d56f12fe05167c59380ce81d1e90f --- /dev/null +++ b/adapters/saved-alpaca-13b/checkpoint-800/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cec3482573f68be8a59c79516d457f452e08e29a2dcb57de4b901670d361d37f +size 14583 diff --git a/adapters/saved-alpaca-13b/checkpoint-800/rng_state_5.pth b/adapters/saved-alpaca-13b/checkpoint-800/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..43254ab4b63f863ac15f8342b5a8ad9784329bd7 --- /dev/null +++ b/adapters/saved-alpaca-13b/checkpoint-800/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:059585222586276099098bf816da3a60abf4e41c1131c0bb6b9dc17ce74ff48d +size 14583 diff --git a/adapters/saved-alpaca-13b/checkpoint-800/rng_state_6.pth b/adapters/saved-alpaca-13b/checkpoint-800/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..25516fd4c154af3d61556ae807bc8b963a3a8fc0 --- /dev/null +++ b/adapters/saved-alpaca-13b/checkpoint-800/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:726e2dd1d9070a86366cb043486560fe6cd90fad3fb89344beffec018587ddce +size 14583 diff --git a/adapters/saved-alpaca-13b/checkpoint-800/rng_state_7.pth b/adapters/saved-alpaca-13b/checkpoint-800/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..c35d47387477a6f956e56e3696d82d12e3c63524 --- /dev/null +++ b/adapters/saved-alpaca-13b/checkpoint-800/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:af8d9d6dc0510fc91cbb94adfe6c2322f7fc807b649b3bb74ea2a584ca9b9cdb +size 14583 diff --git a/adapters/saved-alpaca-13b/checkpoint-800/scaler.pt b/adapters/saved-alpaca-13b/checkpoint-800/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..5e01dd7b5d3a8968bb4c73a805f08f0f65c9b57f --- /dev/null +++ b/adapters/saved-alpaca-13b/checkpoint-800/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:27ec07a12731ae6f9765d05fe7c8495505f1d0f90b4cc6255a0853fec3970808 +size 557 diff --git a/adapters/saved-alpaca-13b/checkpoint-800/scheduler.pt b/adapters/saved-alpaca-13b/checkpoint-800/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..b5d40f5c3578c7ebe095681d6f7bd8fecbbf7407 --- /dev/null +++ b/adapters/saved-alpaca-13b/checkpoint-800/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:98c4b9b4bff768da63a57f06e31324664e9c442c12b99f05f7bf2cd746d192e9 +size 627 diff --git a/adapters/saved-alpaca-13b/checkpoint-800/trainer_state.json b/adapters/saved-alpaca-13b/checkpoint-800/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..8bb38cdf739bd547953672c60904d65e0b74aba3 --- /dev/null +++ b/adapters/saved-alpaca-13b/checkpoint-800/trainer_state.json @@ -0,0 +1,288 @@ +{ + "best_metric": 0.8164276480674744, + "best_model_checkpoint": "/mnt/bn/qingyi-bn-lq/llama/saved-alpaca-13b/checkpoint-800", + "epoch": 2.0499679692504804, + "global_step": 800, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.05, + "learning_rate": 5.9999999999999995e-05, + "loss": 1.9024, + "step": 20 + }, + { + "epoch": 0.1, + "learning_rate": 0.00011999999999999999, + "loss": 1.4401, + "step": 40 + }, + { + "epoch": 0.15, + "learning_rate": 0.00017999999999999998, + "loss": 0.9439, + "step": 60 + }, + { + "epoch": 0.2, + "learning_rate": 0.00023999999999999998, + "loss": 0.8693, + "step": 80 + }, + { + "epoch": 0.26, + "learning_rate": 0.0003, + "loss": 0.8598, + "step": 100 + }, + { + "epoch": 0.31, + "learning_rate": 0.00029439252336448596, + "loss": 0.8485, + "step": 120 + }, + { + "epoch": 0.36, + "learning_rate": 0.00028878504672897194, + "loss": 0.8323, + "step": 140 + }, + { + "epoch": 0.41, + "learning_rate": 0.0002831775700934579, + "loss": 0.8364, + "step": 160 + }, + { + "epoch": 0.46, + "learning_rate": 0.0002775700934579439, + "loss": 0.8364, + "step": 180 + }, + { + "epoch": 0.51, + "learning_rate": 0.0002719626168224299, + "loss": 0.842, + "step": 200 + }, + { + "epoch": 0.51, + "eval_loss": 0.8336600661277771, + "eval_runtime": 14.4551, + "eval_samples_per_second": 138.359, + "eval_steps_per_second": 2.214, + "step": 200 + }, + { + "epoch": 0.56, + "learning_rate": 0.00026635514018691586, + "loss": 0.8289, + "step": 220 + }, + { + "epoch": 0.61, + "learning_rate": 0.00026074766355140184, + "loss": 0.8383, + "step": 240 + }, + { + "epoch": 0.67, + "learning_rate": 0.0002551401869158878, + "loss": 0.822, + "step": 260 + }, + { + "epoch": 0.72, + "learning_rate": 0.0002495327102803738, + "loss": 0.8378, + "step": 280 + }, + { + "epoch": 0.77, + "learning_rate": 0.0002439252336448598, + "loss": 0.8275, + "step": 300 + }, + { + "epoch": 0.82, + "learning_rate": 0.00023831775700934577, + "loss": 0.8225, + "step": 320 + }, + { + "epoch": 0.87, + "learning_rate": 0.00023271028037383175, + "loss": 0.8188, + "step": 340 + }, + { + "epoch": 0.92, + "learning_rate": 0.00022710280373831773, + "loss": 0.8251, + "step": 360 + }, + { + "epoch": 0.97, + "learning_rate": 0.0002214953271028037, + "loss": 0.8107, + "step": 380 + }, + { + "epoch": 1.02, + "learning_rate": 0.0002158878504672897, + "loss": 0.806, + "step": 400 + }, + { + "epoch": 1.02, + "eval_loss": 0.8227179050445557, + "eval_runtime": 14.4306, + "eval_samples_per_second": 138.594, + "eval_steps_per_second": 2.218, + "step": 400 + }, + { + "epoch": 1.08, + "learning_rate": 0.00021028037383177567, + "loss": 0.8157, + "step": 420 + }, + { + "epoch": 1.13, + "learning_rate": 0.00020467289719626166, + "loss": 0.8139, + "step": 440 + }, + { + "epoch": 1.18, + "learning_rate": 0.00019906542056074764, + "loss": 0.8203, + "step": 460 + }, + { + "epoch": 1.23, + "learning_rate": 0.00019345794392523362, + "loss": 0.8183, + "step": 480 + }, + { + "epoch": 1.28, + "learning_rate": 0.0001878504672897196, + "loss": 0.8046, + "step": 500 + }, + { + "epoch": 1.33, + "learning_rate": 0.00018224299065420558, + "loss": 0.8053, + "step": 520 + }, + { + "epoch": 1.38, + "learning_rate": 0.00017663551401869156, + "loss": 0.8037, + "step": 540 + }, + { + "epoch": 1.43, + "learning_rate": 0.00017102803738317754, + "loss": 0.8036, + "step": 560 + }, + { + "epoch": 1.49, + "learning_rate": 0.00016542056074766352, + "loss": 0.7971, + "step": 580 + }, + { + "epoch": 1.54, + "learning_rate": 0.0001598130841121495, + "loss": 0.8024, + "step": 600 + }, + { + "epoch": 1.54, + "eval_loss": 0.8188450932502747, + "eval_runtime": 14.5635, + "eval_samples_per_second": 137.329, + "eval_steps_per_second": 2.197, + "step": 600 + }, + { + "epoch": 1.59, + "learning_rate": 0.0001542056074766355, + "loss": 0.8042, + "step": 620 + }, + { + "epoch": 1.64, + "learning_rate": 0.00014859813084112147, + "loss": 0.8148, + "step": 640 + }, + { + "epoch": 1.69, + "learning_rate": 0.00014299065420560745, + "loss": 0.7976, + "step": 660 + }, + { + "epoch": 1.74, + "learning_rate": 0.00013738317757009343, + "loss": 0.8112, + "step": 680 + }, + { + "epoch": 1.79, + "learning_rate": 0.0001317757009345794, + "loss": 0.805, + "step": 700 + }, + { + "epoch": 1.84, + "learning_rate": 0.0001261682242990654, + "loss": 0.797, + "step": 720 + }, + { + "epoch": 1.9, + "learning_rate": 0.00012056074766355139, + "loss": 0.7883, + "step": 740 + }, + { + "epoch": 1.95, + "learning_rate": 0.00011495327102803737, + "loss": 0.8026, + "step": 760 + }, + { + "epoch": 2.0, + "learning_rate": 0.00010934579439252335, + "loss": 0.8098, + "step": 780 + }, + { + "epoch": 2.05, + "learning_rate": 0.00010373831775700933, + "loss": 0.8021, + "step": 800 + }, + { + "epoch": 2.05, + "eval_loss": 0.8164276480674744, + "eval_runtime": 14.5011, + "eval_samples_per_second": 137.921, + "eval_steps_per_second": 2.207, + "step": 800 + } + ], + "max_steps": 1170, + "num_train_epochs": 3, + "total_flos": 2.022163468739674e+18, + "trial_name": null, + "trial_params": null +} diff --git a/adapters/saved-alpaca-13b/checkpoint-800/training_args.bin b/adapters/saved-alpaca-13b/checkpoint-800/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..0dd9d576e81f123903a3d80b6b40a5e98dcac191 --- /dev/null +++ b/adapters/saved-alpaca-13b/checkpoint-800/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:569cffde1178ce7cb3b47607b0b6b1b562a8816ca51d5d79948ec2815c618c99 +size 3579 diff --git a/adapters/saved-alpaca-30b/adapter_config.json b/adapters/saved-alpaca-30b/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..1c63d3d52c8ae53700f4a81186f7cb93f50a2d78 --- /dev/null +++ b/adapters/saved-alpaca-30b/adapter_config.json @@ -0,0 +1,18 @@ +{ + "base_model_name_or_path": "/mnt/bn/qingyi-bn-lq/llama/llama-30b-hf", + "bias": "none", + "enable_lora": null, + "fan_in_fan_out": false, + "inference_mode": true, + "lora_alpha": 16, + "lora_dropout": 0.05, + "merge_weights": false, + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/adapters/saved-alpaca-30b/adapter_model.bin b/adapters/saved-alpaca-30b/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..023d5cf937a53c78d2bf9e2d4f3162c3d60a495a --- /dev/null +++ b/adapters/saved-alpaca-30b/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7ce3914959790fb662ef96782a3dbf51d0389d5e6dc788ba1b4c3168000ba8e1 +size 51204365 diff --git a/adapters/saved-alpaca-30b/checkpoint-1000/optimizer.pt b/adapters/saved-alpaca-30b/checkpoint-1000/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..e704e0026a158b22f3ba8561ae3159228c669209 --- /dev/null +++ b/adapters/saved-alpaca-30b/checkpoint-1000/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1003515bf84f62eb769e2ef0bb929c6db3457b3c0810df2976efbeaa82dc1c7a +size 102377669 diff --git a/adapters/saved-alpaca-30b/checkpoint-1000/pytorch_model.bin b/adapters/saved-alpaca-30b/checkpoint-1000/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..5b507906355c1d73a138ed75a787e644bd5e70fb --- /dev/null +++ b/adapters/saved-alpaca-30b/checkpoint-1000/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:06013f5d13fcd882f339d95b43b55ca7ec455466da5a5ec2721ca81617174771 +size 51204365 diff --git a/adapters/saved-alpaca-30b/checkpoint-1000/rng_state_0.pth b/adapters/saved-alpaca-30b/checkpoint-1000/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..0c6fa5d8c3d186c776d21a33e46e22441485ad0a --- /dev/null +++ b/adapters/saved-alpaca-30b/checkpoint-1000/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0b324516799aef68bd0b329d375bc92c3d156fb72d07e9939b8d80339153d370 +size 14583 diff --git a/adapters/saved-alpaca-30b/checkpoint-1000/rng_state_1.pth b/adapters/saved-alpaca-30b/checkpoint-1000/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..bf85bde100be1aff073274f831b8a188c6561429 --- /dev/null +++ b/adapters/saved-alpaca-30b/checkpoint-1000/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:36b8d42f588c9287e71758c87a240aef55cc53f75d82f8ec0bea88228494b72d +size 14583 diff --git a/adapters/saved-alpaca-30b/checkpoint-1000/rng_state_2.pth b/adapters/saved-alpaca-30b/checkpoint-1000/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..60e8d280f04db029e1c91a405c72ab026f49decc --- /dev/null +++ b/adapters/saved-alpaca-30b/checkpoint-1000/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d63c651b99dfa5b6f7f1a8b5ebea9eeb23f217f583525f1da522d601a422ac6a +size 14583 diff --git a/adapters/saved-alpaca-30b/checkpoint-1000/rng_state_3.pth b/adapters/saved-alpaca-30b/checkpoint-1000/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..aa65e662f00de46e91e88bca86c7d69193061add --- /dev/null +++ b/adapters/saved-alpaca-30b/checkpoint-1000/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e32be0fa29843e2103f1c18fe378e57074c910b9890a445211d0c75fe3b9fb70 +size 14583 diff --git a/adapters/saved-alpaca-30b/checkpoint-1000/rng_state_4.pth b/adapters/saved-alpaca-30b/checkpoint-1000/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..1512c04ecaa36c571726dcb29594b405e3e8fa26 --- /dev/null +++ b/adapters/saved-alpaca-30b/checkpoint-1000/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ae1fb5b360df6e73feb69e99b337f464381da1308996d4463b0c9f28717ec03f +size 14583 diff --git a/adapters/saved-alpaca-30b/checkpoint-1000/rng_state_5.pth b/adapters/saved-alpaca-30b/checkpoint-1000/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..2018261fb2df52994e3864a6bb62974f25fecb7b --- /dev/null +++ b/adapters/saved-alpaca-30b/checkpoint-1000/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cd6c0876b11ca5d9ba93d5ea11467ab8680ac9302195e463dd78f471cf7e33ec +size 14583 diff --git a/adapters/saved-alpaca-30b/checkpoint-1000/rng_state_6.pth b/adapters/saved-alpaca-30b/checkpoint-1000/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..d4a36656aa5c7aecccf065e28e9fde7cf51d9506 --- /dev/null +++ b/adapters/saved-alpaca-30b/checkpoint-1000/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c849a99dd8a5119da9e6060b2c92eddd3de90af838c28bca7d11c19b2c9c1386 +size 14583 diff --git a/adapters/saved-alpaca-30b/checkpoint-1000/rng_state_7.pth b/adapters/saved-alpaca-30b/checkpoint-1000/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..553f1184a52f2550c221d031ba32aa8b89f3a5db --- /dev/null +++ b/adapters/saved-alpaca-30b/checkpoint-1000/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e4e81ed4ed5b3e0d6dfedeb61c4f22fe6f877fd71a17bb4280f5825bf53107c3 +size 14583 diff --git a/adapters/saved-alpaca-30b/checkpoint-1000/scaler.pt b/adapters/saved-alpaca-30b/checkpoint-1000/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..efdbd3c795f6b0d4144e68355e99c220ccdedd09 --- /dev/null +++ b/adapters/saved-alpaca-30b/checkpoint-1000/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:68cff80b680ddf6e7abbef98b5f336b97f9b5963e2209307f639383870e8cc71 +size 557 diff --git a/adapters/saved-alpaca-30b/checkpoint-1000/scheduler.pt b/adapters/saved-alpaca-30b/checkpoint-1000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..483d11399557624504dab1563e0154444b1c9dd8 --- /dev/null +++ b/adapters/saved-alpaca-30b/checkpoint-1000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:988fa96f15b0bcf68e96b1a4d321f89c5a5aca28eeae640fe236375758ba5304 +size 627 diff --git a/adapters/saved-alpaca-30b/checkpoint-1000/trainer_state.json b/adapters/saved-alpaca-30b/checkpoint-1000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..c6d33abced2866eac1a1b31a3ef88555aba8570a --- /dev/null +++ b/adapters/saved-alpaca-30b/checkpoint-1000/trainer_state.json @@ -0,0 +1,356 @@ +{ + "best_metric": 0.8037025332450867, + "best_model_checkpoint": "/mnt/bn/qingyi-bn-lq/llama/saved-alpaca-30b/checkpoint-1000", + "epoch": 2.5624599615631007, + "global_step": 1000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.05, + "learning_rate": 5.9999999999999995e-05, + "loss": 1.9598, + "step": 20 + }, + { + "epoch": 0.1, + "learning_rate": 0.00011999999999999999, + "loss": 1.372, + "step": 40 + }, + { + "epoch": 0.15, + "learning_rate": 0.00017999999999999998, + "loss": 0.9191, + "step": 60 + }, + { + "epoch": 0.2, + "learning_rate": 0.00023999999999999998, + "loss": 0.8468, + "step": 80 + }, + { + "epoch": 0.26, + "learning_rate": 0.0003, + "loss": 0.8366, + "step": 100 + }, + { + "epoch": 0.31, + "learning_rate": 0.00029439252336448596, + "loss": 0.819, + "step": 120 + }, + { + "epoch": 0.36, + "learning_rate": 0.00028878504672897194, + "loss": 0.8205, + "step": 140 + }, + { + "epoch": 0.41, + "learning_rate": 0.0002831775700934579, + "loss": 0.8168, + "step": 160 + }, + { + "epoch": 0.46, + "learning_rate": 0.0002775700934579439, + "loss": 0.8019, + "step": 180 + }, + { + "epoch": 0.51, + "learning_rate": 0.0002719626168224299, + "loss": 0.8035, + "step": 200 + }, + { + "epoch": 0.51, + "eval_loss": 0.8181663751602173, + "eval_runtime": 27.5256, + "eval_samples_per_second": 72.66, + "eval_steps_per_second": 1.163, + "step": 200 + }, + { + "epoch": 0.56, + "learning_rate": 0.00026635514018691586, + "loss": 0.8029, + "step": 220 + }, + { + "epoch": 0.61, + "learning_rate": 0.00026074766355140184, + "loss": 0.8026, + "step": 240 + }, + { + "epoch": 0.67, + "learning_rate": 0.0002551401869158878, + "loss": 0.8023, + "step": 260 + }, + { + "epoch": 0.72, + "learning_rate": 0.0002495327102803738, + "loss": 0.8077, + "step": 280 + }, + { + "epoch": 0.77, + "learning_rate": 0.0002439252336448598, + "loss": 0.7959, + "step": 300 + }, + { + "epoch": 0.82, + "learning_rate": 0.00023831775700934577, + "loss": 0.797, + "step": 320 + }, + { + "epoch": 0.87, + "learning_rate": 0.00023271028037383175, + "loss": 0.7937, + "step": 340 + }, + { + "epoch": 0.92, + "learning_rate": 0.00022710280373831773, + "loss": 0.7939, + "step": 360 + }, + { + "epoch": 0.97, + "learning_rate": 0.0002214953271028037, + "loss": 0.7904, + "step": 380 + }, + { + "epoch": 1.02, + "learning_rate": 0.0002158878504672897, + "loss": 0.7992, + "step": 400 + }, + { + "epoch": 1.02, + "eval_loss": 0.8100255131721497, + "eval_runtime": 27.5094, + "eval_samples_per_second": 72.702, + "eval_steps_per_second": 1.163, + "step": 400 + }, + { + "epoch": 1.08, + "learning_rate": 0.00021028037383177567, + "loss": 0.7886, + "step": 420 + }, + { + "epoch": 1.13, + "learning_rate": 0.00020467289719626166, + "loss": 0.7959, + "step": 440 + }, + { + "epoch": 1.18, + "learning_rate": 0.00019906542056074764, + "loss": 0.7736, + "step": 460 + }, + { + "epoch": 1.23, + "learning_rate": 0.00019345794392523362, + "loss": 0.7814, + "step": 480 + }, + { + "epoch": 1.28, + "learning_rate": 0.0001878504672897196, + "loss": 0.7876, + "step": 500 + }, + { + "epoch": 1.33, + "learning_rate": 0.00018224299065420558, + "loss": 0.7823, + "step": 520 + }, + { + "epoch": 1.38, + "learning_rate": 0.00017663551401869156, + "loss": 0.7913, + "step": 540 + }, + { + "epoch": 1.43, + "learning_rate": 0.00017102803738317754, + "loss": 0.7804, + "step": 560 + }, + { + "epoch": 1.49, + "learning_rate": 0.00016542056074766352, + "loss": 0.7902, + "step": 580 + }, + { + "epoch": 1.54, + "learning_rate": 0.0001598130841121495, + "loss": 0.7916, + "step": 600 + }, + { + "epoch": 1.54, + "eval_loss": 0.8062512278556824, + "eval_runtime": 27.5287, + "eval_samples_per_second": 72.651, + "eval_steps_per_second": 1.162, + "step": 600 + }, + { + "epoch": 1.59, + "learning_rate": 0.0001542056074766355, + "loss": 0.7736, + "step": 620 + }, + { + "epoch": 1.64, + "learning_rate": 0.00014859813084112147, + "loss": 0.7774, + "step": 640 + }, + { + "epoch": 1.69, + "learning_rate": 0.00014299065420560745, + "loss": 0.7783, + "step": 660 + }, + { + "epoch": 1.74, + "learning_rate": 0.00013738317757009343, + "loss": 0.7644, + "step": 680 + }, + { + "epoch": 1.79, + "learning_rate": 0.0001317757009345794, + "loss": 0.776, + "step": 700 + }, + { + "epoch": 1.84, + "learning_rate": 0.0001261682242990654, + "loss": 0.7763, + "step": 720 + }, + { + "epoch": 1.9, + "learning_rate": 0.00012056074766355139, + "loss": 0.7737, + "step": 740 + }, + { + "epoch": 1.95, + "learning_rate": 0.00011495327102803737, + "loss": 0.7834, + "step": 760 + }, + { + "epoch": 2.0, + "learning_rate": 0.00010934579439252335, + "loss": 0.7858, + "step": 780 + }, + { + "epoch": 2.05, + "learning_rate": 0.00010373831775700933, + "loss": 0.7706, + "step": 800 + }, + { + "epoch": 2.05, + "eval_loss": 0.8045574426651001, + "eval_runtime": 27.455, + "eval_samples_per_second": 72.846, + "eval_steps_per_second": 1.166, + "step": 800 + }, + { + "epoch": 2.1, + "learning_rate": 9.813084112149531e-05, + "loss": 0.7733, + "step": 820 + }, + { + "epoch": 2.15, + "learning_rate": 9.25233644859813e-05, + "loss": 0.7761, + "step": 840 + }, + { + "epoch": 2.2, + "learning_rate": 8.691588785046728e-05, + "loss": 0.7811, + "step": 860 + }, + { + "epoch": 2.25, + "learning_rate": 8.130841121495326e-05, + "loss": 0.7734, + "step": 880 + }, + { + "epoch": 2.31, + "learning_rate": 7.570093457943924e-05, + "loss": 0.7768, + "step": 900 + }, + { + "epoch": 2.36, + "learning_rate": 7.009345794392522e-05, + "loss": 0.7714, + "step": 920 + }, + { + "epoch": 2.41, + "learning_rate": 6.44859813084112e-05, + "loss": 0.7679, + "step": 940 + }, + { + "epoch": 2.46, + "learning_rate": 5.887850467289719e-05, + "loss": 0.776, + "step": 960 + }, + { + "epoch": 2.51, + "learning_rate": 5.327102803738317e-05, + "loss": 0.7705, + "step": 980 + }, + { + "epoch": 2.56, + "learning_rate": 4.766355140186915e-05, + "loss": 0.7798, + "step": 1000 + }, + { + "epoch": 2.56, + "eval_loss": 0.8037025332450867, + "eval_runtime": 27.5024, + "eval_samples_per_second": 72.721, + "eval_steps_per_second": 1.164, + "step": 1000 + } + ], + "max_steps": 1170, + "num_train_epochs": 3, + "total_flos": 6.35529279470515e+18, + "trial_name": null, + "trial_params": null +} diff --git a/adapters/saved-alpaca-30b/checkpoint-1000/training_args.bin b/adapters/saved-alpaca-30b/checkpoint-1000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..32b818b9981353cd2065885222cd8ae8420c1a85 --- /dev/null +++ b/adapters/saved-alpaca-30b/checkpoint-1000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:835a42b791aaa6c71e746b3ba649333fac139644f2e09e0d8032d840744cfffa +size 3579 diff --git a/adapters/saved-alpaca-30b/checkpoint-600/optimizer.pt b/adapters/saved-alpaca-30b/checkpoint-600/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..f6aef25df196ccf3f74e898c359ffdba1bc24701 --- /dev/null +++ b/adapters/saved-alpaca-30b/checkpoint-600/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b97ca11a7084692a3de43476a1a81580a5d408da419d681b812b3e2f88a0f0b7 +size 102377669 diff --git a/adapters/saved-alpaca-30b/checkpoint-600/pytorch_model.bin b/adapters/saved-alpaca-30b/checkpoint-600/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..6ec56af75b1536dc759d96418ebcc1fa7d40ef88 --- /dev/null +++ b/adapters/saved-alpaca-30b/checkpoint-600/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c00fdc411ec89e0d9fca1e5eb7c76f92b2efff061583bee4e8673269cb937392 +size 51204365 diff --git a/adapters/saved-alpaca-30b/checkpoint-600/rng_state_0.pth b/adapters/saved-alpaca-30b/checkpoint-600/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..a912055b1eda7af055b02ef0f25551c8d2ae8e6c --- /dev/null +++ b/adapters/saved-alpaca-30b/checkpoint-600/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6547b1d4cd5699574fbe410f6df62d371bfa548e1b5ff84df44eeff52be7ab2c +size 14583 diff --git a/adapters/saved-alpaca-30b/checkpoint-600/rng_state_1.pth b/adapters/saved-alpaca-30b/checkpoint-600/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..fab47a3e0ee4c312342f26c3f81c23a0cf28a7a4 --- /dev/null +++ b/adapters/saved-alpaca-30b/checkpoint-600/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:46bf6aaf20831fb1084eff71ea84069a8f1debd07a981db7a2a8fa403efc85f4 +size 14583 diff --git a/adapters/saved-alpaca-30b/checkpoint-600/rng_state_2.pth b/adapters/saved-alpaca-30b/checkpoint-600/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..a766a3372f9fe238a2ad9424cb0313da6b2c9662 --- /dev/null +++ b/adapters/saved-alpaca-30b/checkpoint-600/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4a5fb60dc0a030cbef7d6692b598fff0db134bae5d7be8f68991b1169dd6980e +size 14583 diff --git a/adapters/saved-alpaca-30b/checkpoint-600/rng_state_3.pth b/adapters/saved-alpaca-30b/checkpoint-600/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..272ded940f8253ca12cb1103e04ca87274349449 --- /dev/null +++ b/adapters/saved-alpaca-30b/checkpoint-600/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2bb4a7bd7e9486507cdd01b272a01cb74aee1c88207b2c6fcf6001390b3c05f2 +size 14583 diff --git a/adapters/saved-alpaca-30b/checkpoint-600/rng_state_4.pth b/adapters/saved-alpaca-30b/checkpoint-600/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..caebfd4dde4d9ce9146ca0a2bc5b4c05c710bbe7 --- /dev/null +++ b/adapters/saved-alpaca-30b/checkpoint-600/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:14ea8490382b5ed64dfa594c0b8f36e10e440d2185a47594c7435a56600a02df +size 14583 diff --git a/adapters/saved-alpaca-30b/checkpoint-600/rng_state_5.pth b/adapters/saved-alpaca-30b/checkpoint-600/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..a01a5e2264241c0147907a92c1fbc9e0e90d6d41 --- /dev/null +++ b/adapters/saved-alpaca-30b/checkpoint-600/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4fe8f097f0d7dfa698dec19425476939d6c744e1156f67ce30069314e6932f58 +size 14583 diff --git a/adapters/saved-alpaca-30b/checkpoint-600/rng_state_6.pth b/adapters/saved-alpaca-30b/checkpoint-600/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..005cfb27de6a9427ae2bff80a40d94d9c4c8cfa0 --- /dev/null +++ b/adapters/saved-alpaca-30b/checkpoint-600/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7e32225e06d30059b2787ee9315b5bc852117527b8d8c7082d805dc5fe79b8d0 +size 14583 diff --git a/adapters/saved-alpaca-30b/checkpoint-600/rng_state_7.pth b/adapters/saved-alpaca-30b/checkpoint-600/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..a46d8d1232499409581047a052d887645c285190 --- /dev/null +++ b/adapters/saved-alpaca-30b/checkpoint-600/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f63c700b0b7a0a2942fc2f53363e591b666f168e09a554f768ec2b413092ce6f +size 14583 diff --git a/adapters/saved-alpaca-30b/checkpoint-600/scaler.pt b/adapters/saved-alpaca-30b/checkpoint-600/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..842791b612283ceb8e68b64ed8e40e81c5a97bce --- /dev/null +++ b/adapters/saved-alpaca-30b/checkpoint-600/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5dc9eacfeb00bd0bfeb98934a2309be01be65b288e0d747bbfc423b32679169f +size 557 diff --git a/adapters/saved-alpaca-30b/checkpoint-600/scheduler.pt b/adapters/saved-alpaca-30b/checkpoint-600/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..4c7bc11bb45bdbf773b1e9ce6b5f7314df71b760 --- /dev/null +++ b/adapters/saved-alpaca-30b/checkpoint-600/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ab0b5eaefe5317a6f29d9c670ecd5644d66afb60156c841d18e022a62f983d66 +size 627 diff --git a/adapters/saved-alpaca-30b/checkpoint-600/trainer_state.json b/adapters/saved-alpaca-30b/checkpoint-600/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..3cd79b5c1882c1f52a39572adcecfc931f37cb3f --- /dev/null +++ b/adapters/saved-alpaca-30b/checkpoint-600/trainer_state.json @@ -0,0 +1,220 @@ +{ + "best_metric": 0.8062512278556824, + "best_model_checkpoint": "/mnt/bn/qingyi-bn-lq/llama/saved-alpaca-30b/checkpoint-600", + "epoch": 1.5374759769378603, + "global_step": 600, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.05, + "learning_rate": 5.9999999999999995e-05, + "loss": 1.9598, + "step": 20 + }, + { + "epoch": 0.1, + "learning_rate": 0.00011999999999999999, + "loss": 1.372, + "step": 40 + }, + { + "epoch": 0.15, + "learning_rate": 0.00017999999999999998, + "loss": 0.9191, + "step": 60 + }, + { + "epoch": 0.2, + "learning_rate": 0.00023999999999999998, + "loss": 0.8468, + "step": 80 + }, + { + "epoch": 0.26, + "learning_rate": 0.0003, + "loss": 0.8366, + "step": 100 + }, + { + "epoch": 0.31, + "learning_rate": 0.00029439252336448596, + "loss": 0.819, + "step": 120 + }, + { + "epoch": 0.36, + "learning_rate": 0.00028878504672897194, + "loss": 0.8205, + "step": 140 + }, + { + "epoch": 0.41, + "learning_rate": 0.0002831775700934579, + "loss": 0.8168, + "step": 160 + }, + { + "epoch": 0.46, + "learning_rate": 0.0002775700934579439, + "loss": 0.8019, + "step": 180 + }, + { + "epoch": 0.51, + "learning_rate": 0.0002719626168224299, + "loss": 0.8035, + "step": 200 + }, + { + "epoch": 0.51, + "eval_loss": 0.8181663751602173, + "eval_runtime": 27.5256, + "eval_samples_per_second": 72.66, + "eval_steps_per_second": 1.163, + "step": 200 + }, + { + "epoch": 0.56, + "learning_rate": 0.00026635514018691586, + "loss": 0.8029, + "step": 220 + }, + { + "epoch": 0.61, + "learning_rate": 0.00026074766355140184, + "loss": 0.8026, + "step": 240 + }, + { + "epoch": 0.67, + "learning_rate": 0.0002551401869158878, + "loss": 0.8023, + "step": 260 + }, + { + "epoch": 0.72, + "learning_rate": 0.0002495327102803738, + "loss": 0.8077, + "step": 280 + }, + { + "epoch": 0.77, + "learning_rate": 0.0002439252336448598, + "loss": 0.7959, + "step": 300 + }, + { + "epoch": 0.82, + "learning_rate": 0.00023831775700934577, + "loss": 0.797, + "step": 320 + }, + { + "epoch": 0.87, + "learning_rate": 0.00023271028037383175, + "loss": 0.7937, + "step": 340 + }, + { + "epoch": 0.92, + "learning_rate": 0.00022710280373831773, + "loss": 0.7939, + "step": 360 + }, + { + "epoch": 0.97, + "learning_rate": 0.0002214953271028037, + "loss": 0.7904, + "step": 380 + }, + { + "epoch": 1.02, + "learning_rate": 0.0002158878504672897, + "loss": 0.7992, + "step": 400 + }, + { + "epoch": 1.02, + "eval_loss": 0.8100255131721497, + "eval_runtime": 27.5094, + "eval_samples_per_second": 72.702, + "eval_steps_per_second": 1.163, + "step": 400 + }, + { + "epoch": 1.08, + "learning_rate": 0.00021028037383177567, + "loss": 0.7886, + "step": 420 + }, + { + "epoch": 1.13, + "learning_rate": 0.00020467289719626166, + "loss": 0.7959, + "step": 440 + }, + { + "epoch": 1.18, + "learning_rate": 0.00019906542056074764, + "loss": 0.7736, + "step": 460 + }, + { + "epoch": 1.23, + "learning_rate": 0.00019345794392523362, + "loss": 0.7814, + "step": 480 + }, + { + "epoch": 1.28, + "learning_rate": 0.0001878504672897196, + "loss": 0.7876, + "step": 500 + }, + { + "epoch": 1.33, + "learning_rate": 0.00018224299065420558, + "loss": 0.7823, + "step": 520 + }, + { + "epoch": 1.38, + "learning_rate": 0.00017663551401869156, + "loss": 0.7913, + "step": 540 + }, + { + "epoch": 1.43, + "learning_rate": 0.00017102803738317754, + "loss": 0.7804, + "step": 560 + }, + { + "epoch": 1.49, + "learning_rate": 0.00016542056074766352, + "loss": 0.7902, + "step": 580 + }, + { + "epoch": 1.54, + "learning_rate": 0.0001598130841121495, + "loss": 0.7916, + "step": 600 + }, + { + "epoch": 1.54, + "eval_loss": 0.8062512278556824, + "eval_runtime": 27.5287, + "eval_samples_per_second": 72.651, + "eval_steps_per_second": 1.162, + "step": 600 + } + ], + "max_steps": 1170, + "num_train_epochs": 3, + "total_flos": 3.813255128564105e+18, + "trial_name": null, + "trial_params": null +} diff --git a/adapters/saved-alpaca-30b/checkpoint-600/training_args.bin b/adapters/saved-alpaca-30b/checkpoint-600/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..32b818b9981353cd2065885222cd8ae8420c1a85 --- /dev/null +++ b/adapters/saved-alpaca-30b/checkpoint-600/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:835a42b791aaa6c71e746b3ba649333fac139644f2e09e0d8032d840744cfffa +size 3579 diff --git a/adapters/saved-alpaca-30b/checkpoint-800/optimizer.pt b/adapters/saved-alpaca-30b/checkpoint-800/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..aae26f3198a058f7f88e13a524c55f5b5c45bb56 --- /dev/null +++ b/adapters/saved-alpaca-30b/checkpoint-800/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:931941303a9b752dcc6b6cf11f1b4ec3f5806b0c66c01d4153ea679426d7f31d +size 102377669 diff --git a/adapters/saved-alpaca-30b/checkpoint-800/pytorch_model.bin b/adapters/saved-alpaca-30b/checkpoint-800/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..21dade2d2b40e4e073dfb10f44d3d81ca28f7931 --- /dev/null +++ b/adapters/saved-alpaca-30b/checkpoint-800/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d68022c0a7e1317dbad41d4086cc50e75a8c28765b8bfb24d49752a052196905 +size 51204365 diff --git a/adapters/saved-alpaca-30b/checkpoint-800/rng_state_0.pth b/adapters/saved-alpaca-30b/checkpoint-800/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..aa7c3c6deaf4bf4181ca720949e9c06a14ecbd5d --- /dev/null +++ b/adapters/saved-alpaca-30b/checkpoint-800/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f29aa0778a8aa1cb1ed24f3265fcbb5fbeb2cf62eef9789c3f6f59bf918a9e54 +size 14583 diff --git a/adapters/saved-alpaca-30b/checkpoint-800/rng_state_1.pth b/adapters/saved-alpaca-30b/checkpoint-800/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..24f372fbf9e4ad17cbbcf7e8f544211d2475b965 --- /dev/null +++ b/adapters/saved-alpaca-30b/checkpoint-800/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:01bdff18a0ee51c923eaa862385965862740bf4bed63af9bb340a61cbb2d79e2 +size 14583 diff --git a/adapters/saved-alpaca-30b/checkpoint-800/rng_state_2.pth b/adapters/saved-alpaca-30b/checkpoint-800/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..3398ab8a27ff25915cd312cd6bcc1d4a1c79850f --- /dev/null +++ b/adapters/saved-alpaca-30b/checkpoint-800/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f1b87863f946b9653b49e806a69911d8f458c87f79df0e3d93e04e574e818e8a +size 14583 diff --git a/adapters/saved-alpaca-30b/checkpoint-800/rng_state_3.pth b/adapters/saved-alpaca-30b/checkpoint-800/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..81ecdeea90913bcc466b26c018f0d65ee736c4ce --- /dev/null +++ b/adapters/saved-alpaca-30b/checkpoint-800/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bba32f673b994753a4d8f37272a7e7e5a0eeb61116121145092f6a521ce21957 +size 14583 diff --git a/adapters/saved-alpaca-30b/checkpoint-800/rng_state_4.pth b/adapters/saved-alpaca-30b/checkpoint-800/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..658459df6ce6a916a21cc2fc7ac6073d90f07f60 --- /dev/null +++ b/adapters/saved-alpaca-30b/checkpoint-800/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fce83170d7aed7e47b94dca979c88307deaffbeadee315db2e9af1571238d04b +size 14583 diff --git a/adapters/saved-alpaca-30b/checkpoint-800/rng_state_5.pth b/adapters/saved-alpaca-30b/checkpoint-800/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..11421a36cccfa77dcf424a9da58de655c7b3e1e1 --- /dev/null +++ b/adapters/saved-alpaca-30b/checkpoint-800/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9feb3d91cbd7227760fbe320c548ba8b100bfce0ee28938cef88c8aa869cbc82 +size 14583 diff --git a/adapters/saved-alpaca-30b/checkpoint-800/rng_state_6.pth b/adapters/saved-alpaca-30b/checkpoint-800/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..65ae8eb8b2e5ad921d332d15a5efdfc0038b75c9 --- /dev/null +++ b/adapters/saved-alpaca-30b/checkpoint-800/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0e61b2c8ec325e0bd9a77929c51211742f87124d93e927c42376aa08683b26df +size 14583 diff --git a/adapters/saved-alpaca-30b/checkpoint-800/rng_state_7.pth b/adapters/saved-alpaca-30b/checkpoint-800/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..e4c71ce7f7eaed0a10c62b3a54e553d08636b8c7 --- /dev/null +++ b/adapters/saved-alpaca-30b/checkpoint-800/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ad269acf647315cefce6102a644885b257654aabfba1bb57fcc6f97f9b92116d +size 14583 diff --git a/adapters/saved-alpaca-30b/checkpoint-800/scaler.pt b/adapters/saved-alpaca-30b/checkpoint-800/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..5e01dd7b5d3a8968bb4c73a805f08f0f65c9b57f --- /dev/null +++ b/adapters/saved-alpaca-30b/checkpoint-800/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:27ec07a12731ae6f9765d05fe7c8495505f1d0f90b4cc6255a0853fec3970808 +size 557 diff --git a/adapters/saved-alpaca-30b/checkpoint-800/scheduler.pt b/adapters/saved-alpaca-30b/checkpoint-800/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..b5d40f5c3578c7ebe095681d6f7bd8fecbbf7407 --- /dev/null +++ b/adapters/saved-alpaca-30b/checkpoint-800/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:98c4b9b4bff768da63a57f06e31324664e9c442c12b99f05f7bf2cd746d192e9 +size 627 diff --git a/adapters/saved-alpaca-30b/checkpoint-800/trainer_state.json b/adapters/saved-alpaca-30b/checkpoint-800/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..a5d1236b6f5aa11f01301c21aa48e87896d41ee8 --- /dev/null +++ b/adapters/saved-alpaca-30b/checkpoint-800/trainer_state.json @@ -0,0 +1,288 @@ +{ + "best_metric": 0.8045574426651001, + "best_model_checkpoint": "/mnt/bn/qingyi-bn-lq/llama/saved-alpaca-30b/checkpoint-800", + "epoch": 2.0499679692504804, + "global_step": 800, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.05, + "learning_rate": 5.9999999999999995e-05, + "loss": 1.9598, + "step": 20 + }, + { + "epoch": 0.1, + "learning_rate": 0.00011999999999999999, + "loss": 1.372, + "step": 40 + }, + { + "epoch": 0.15, + "learning_rate": 0.00017999999999999998, + "loss": 0.9191, + "step": 60 + }, + { + "epoch": 0.2, + "learning_rate": 0.00023999999999999998, + "loss": 0.8468, + "step": 80 + }, + { + "epoch": 0.26, + "learning_rate": 0.0003, + "loss": 0.8366, + "step": 100 + }, + { + "epoch": 0.31, + "learning_rate": 0.00029439252336448596, + "loss": 0.819, + "step": 120 + }, + { + "epoch": 0.36, + "learning_rate": 0.00028878504672897194, + "loss": 0.8205, + "step": 140 + }, + { + "epoch": 0.41, + "learning_rate": 0.0002831775700934579, + "loss": 0.8168, + "step": 160 + }, + { + "epoch": 0.46, + "learning_rate": 0.0002775700934579439, + "loss": 0.8019, + "step": 180 + }, + { + "epoch": 0.51, + "learning_rate": 0.0002719626168224299, + "loss": 0.8035, + "step": 200 + }, + { + "epoch": 0.51, + "eval_loss": 0.8181663751602173, + "eval_runtime": 27.5256, + "eval_samples_per_second": 72.66, + "eval_steps_per_second": 1.163, + "step": 200 + }, + { + "epoch": 0.56, + "learning_rate": 0.00026635514018691586, + "loss": 0.8029, + "step": 220 + }, + { + "epoch": 0.61, + "learning_rate": 0.00026074766355140184, + "loss": 0.8026, + "step": 240 + }, + { + "epoch": 0.67, + "learning_rate": 0.0002551401869158878, + "loss": 0.8023, + "step": 260 + }, + { + "epoch": 0.72, + "learning_rate": 0.0002495327102803738, + "loss": 0.8077, + "step": 280 + }, + { + "epoch": 0.77, + "learning_rate": 0.0002439252336448598, + "loss": 0.7959, + "step": 300 + }, + { + "epoch": 0.82, + "learning_rate": 0.00023831775700934577, + "loss": 0.797, + "step": 320 + }, + { + "epoch": 0.87, + "learning_rate": 0.00023271028037383175, + "loss": 0.7937, + "step": 340 + }, + { + "epoch": 0.92, + "learning_rate": 0.00022710280373831773, + "loss": 0.7939, + "step": 360 + }, + { + "epoch": 0.97, + "learning_rate": 0.0002214953271028037, + "loss": 0.7904, + "step": 380 + }, + { + "epoch": 1.02, + "learning_rate": 0.0002158878504672897, + "loss": 0.7992, + "step": 400 + }, + { + "epoch": 1.02, + "eval_loss": 0.8100255131721497, + "eval_runtime": 27.5094, + "eval_samples_per_second": 72.702, + "eval_steps_per_second": 1.163, + "step": 400 + }, + { + "epoch": 1.08, + "learning_rate": 0.00021028037383177567, + "loss": 0.7886, + "step": 420 + }, + { + "epoch": 1.13, + "learning_rate": 0.00020467289719626166, + "loss": 0.7959, + "step": 440 + }, + { + "epoch": 1.18, + "learning_rate": 0.00019906542056074764, + "loss": 0.7736, + "step": 460 + }, + { + "epoch": 1.23, + "learning_rate": 0.00019345794392523362, + "loss": 0.7814, + "step": 480 + }, + { + "epoch": 1.28, + "learning_rate": 0.0001878504672897196, + "loss": 0.7876, + "step": 500 + }, + { + "epoch": 1.33, + "learning_rate": 0.00018224299065420558, + "loss": 0.7823, + "step": 520 + }, + { + "epoch": 1.38, + "learning_rate": 0.00017663551401869156, + "loss": 0.7913, + "step": 540 + }, + { + "epoch": 1.43, + "learning_rate": 0.00017102803738317754, + "loss": 0.7804, + "step": 560 + }, + { + "epoch": 1.49, + "learning_rate": 0.00016542056074766352, + "loss": 0.7902, + "step": 580 + }, + { + "epoch": 1.54, + "learning_rate": 0.0001598130841121495, + "loss": 0.7916, + "step": 600 + }, + { + "epoch": 1.54, + "eval_loss": 0.8062512278556824, + "eval_runtime": 27.5287, + "eval_samples_per_second": 72.651, + "eval_steps_per_second": 1.162, + "step": 600 + }, + { + "epoch": 1.59, + "learning_rate": 0.0001542056074766355, + "loss": 0.7736, + "step": 620 + }, + { + "epoch": 1.64, + "learning_rate": 0.00014859813084112147, + "loss": 0.7774, + "step": 640 + }, + { + "epoch": 1.69, + "learning_rate": 0.00014299065420560745, + "loss": 0.7783, + "step": 660 + }, + { + "epoch": 1.74, + "learning_rate": 0.00013738317757009343, + "loss": 0.7644, + "step": 680 + }, + { + "epoch": 1.79, + "learning_rate": 0.0001317757009345794, + "loss": 0.776, + "step": 700 + }, + { + "epoch": 1.84, + "learning_rate": 0.0001261682242990654, + "loss": 0.7763, + "step": 720 + }, + { + "epoch": 1.9, + "learning_rate": 0.00012056074766355139, + "loss": 0.7737, + "step": 740 + }, + { + "epoch": 1.95, + "learning_rate": 0.00011495327102803737, + "loss": 0.7834, + "step": 760 + }, + { + "epoch": 2.0, + "learning_rate": 0.00010934579439252335, + "loss": 0.7858, + "step": 780 + }, + { + "epoch": 2.05, + "learning_rate": 0.00010373831775700933, + "loss": 0.7706, + "step": 800 + }, + { + "epoch": 2.05, + "eval_loss": 0.8045574426651001, + "eval_runtime": 27.455, + "eval_samples_per_second": 72.846, + "eval_steps_per_second": 1.166, + "step": 800 + } + ], + "max_steps": 1170, + "num_train_epochs": 3, + "total_flos": 5.084075332282089e+18, + "trial_name": null, + "trial_params": null +} diff --git a/adapters/saved-alpaca-30b/checkpoint-800/training_args.bin b/adapters/saved-alpaca-30b/checkpoint-800/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..32b818b9981353cd2065885222cd8ae8420c1a85 --- /dev/null +++ b/adapters/saved-alpaca-30b/checkpoint-800/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:835a42b791aaa6c71e746b3ba649333fac139644f2e09e0d8032d840744cfffa +size 3579 diff --git a/adapters/saved-alpaca-7b/adapter_config.json b/adapters/saved-alpaca-7b/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..6e56f2ae8f10fadfeec6c730ac6b119025824443 --- /dev/null +++ b/adapters/saved-alpaca-7b/adapter_config.json @@ -0,0 +1,18 @@ +{ + "base_model_name_or_path": "/mnt/bn/qingyi-bn-lq/llama/llama-7b-hf", + "bias": "none", + "enable_lora": null, + "fan_in_fan_out": false, + "inference_mode": true, + "lora_alpha": 16, + "lora_dropout": 0.05, + "merge_weights": false, + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/adapters/saved-alpaca-7b/adapter_model.bin b/adapters/saved-alpaca-7b/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..55a13689b3de4e3c5820065f7b7809bbcb5aadb2 --- /dev/null +++ b/adapters/saved-alpaca-7b/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d74503c719494275053bae6ccf2376e785a4d7cddbe419d31de735ed35aae67a +size 16822989 diff --git a/adapters/saved-alpaca-7b/checkpoint-1000/optimizer.pt b/adapters/saved-alpaca-7b/checkpoint-1000/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..34f913ef25f78a97933384e4c7637801cb548e43 --- /dev/null +++ b/adapters/saved-alpaca-7b/checkpoint-1000/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c89db1db221d404783cc0bca70178864041b7f95e28cfcca49e8df19411dd411 +size 33629893 diff --git a/adapters/saved-alpaca-7b/checkpoint-1000/pytorch_model.bin b/adapters/saved-alpaca-7b/checkpoint-1000/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..62dbaf767b1a2e2755aafbec29c3126f20956931 --- /dev/null +++ b/adapters/saved-alpaca-7b/checkpoint-1000/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d19881a321385802a44c5bc6eaf8b414e0be6cfde66159846ec80fe825a6ea8c +size 16822989 diff --git a/adapters/saved-alpaca-7b/checkpoint-1000/rng_state_0.pth b/adapters/saved-alpaca-7b/checkpoint-1000/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..61bd7f807fbd3b93dadecc20a7511f5ef96bd3ac --- /dev/null +++ b/adapters/saved-alpaca-7b/checkpoint-1000/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e57ccdb14d90ae5ac4d61a9778f88b7a95c1607a78440743a8c881f4f56763c9 +size 14583 diff --git a/adapters/saved-alpaca-7b/checkpoint-1000/rng_state_1.pth b/adapters/saved-alpaca-7b/checkpoint-1000/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..f80a5665d1507665d076ff489f393daa3125e3b7 --- /dev/null +++ b/adapters/saved-alpaca-7b/checkpoint-1000/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eae139b345732d5e871798395ad3f04174592fde06d3ed50c562d475d655fbe4 +size 14583 diff --git a/adapters/saved-alpaca-7b/checkpoint-1000/rng_state_2.pth b/adapters/saved-alpaca-7b/checkpoint-1000/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..9977fdd0d4ed9c5dd3f01d23ada7180c796e31a3 --- /dev/null +++ b/adapters/saved-alpaca-7b/checkpoint-1000/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ba8d7df607d4fda8aac37b533d2b78f5318d65eef29769d31d72764563f15db7 +size 14583 diff --git a/adapters/saved-alpaca-7b/checkpoint-1000/rng_state_3.pth b/adapters/saved-alpaca-7b/checkpoint-1000/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..28f12da7a8da4b6cdf6dcaa30d8adb7c321a6318 --- /dev/null +++ b/adapters/saved-alpaca-7b/checkpoint-1000/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:da0692f522368705868f7d5a48995ed7348f32ebe424cc27c9d3768ad0528123 +size 14583 diff --git a/adapters/saved-alpaca-7b/checkpoint-1000/scaler.pt b/adapters/saved-alpaca-7b/checkpoint-1000/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..efdbd3c795f6b0d4144e68355e99c220ccdedd09 --- /dev/null +++ b/adapters/saved-alpaca-7b/checkpoint-1000/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:68cff80b680ddf6e7abbef98b5f336b97f9b5963e2209307f639383870e8cc71 +size 557 diff --git a/adapters/saved-alpaca-7b/checkpoint-1000/scheduler.pt b/adapters/saved-alpaca-7b/checkpoint-1000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..483d11399557624504dab1563e0154444b1c9dd8 --- /dev/null +++ b/adapters/saved-alpaca-7b/checkpoint-1000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:988fa96f15b0bcf68e96b1a4d321f89c5a5aca28eeae640fe236375758ba5304 +size 627 diff --git a/adapters/saved-alpaca-7b/checkpoint-1000/trainer_state.json b/adapters/saved-alpaca-7b/checkpoint-1000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..be590fd94e7d477b1f2a548b4c28084480b8140e --- /dev/null +++ b/adapters/saved-alpaca-7b/checkpoint-1000/trainer_state.json @@ -0,0 +1,356 @@ +{ + "best_metric": 0.8493452668190002, + "best_model_checkpoint": "/mnt/bn/qingyi-bn-lq/llama/lora-alpaca/checkpoint-1000", + "epoch": 2.5624599615631007, + "global_step": 1000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.05, + "learning_rate": 5.9999999999999995e-05, + "loss": 1.9913, + "step": 20 + }, + { + "epoch": 0.1, + "learning_rate": 0.00011999999999999999, + "loss": 1.5044, + "step": 40 + }, + { + "epoch": 0.15, + "learning_rate": 0.00017999999999999998, + "loss": 0.98, + "step": 60 + }, + { + "epoch": 0.2, + "learning_rate": 0.00023999999999999998, + "loss": 0.9036, + "step": 80 + }, + { + "epoch": 0.26, + "learning_rate": 0.0003, + "loss": 0.8966, + "step": 100 + }, + { + "epoch": 0.31, + "learning_rate": 0.00029439252336448596, + "loss": 0.8743, + "step": 120 + }, + { + "epoch": 0.36, + "learning_rate": 0.00028878504672897194, + "loss": 0.8711, + "step": 140 + }, + { + "epoch": 0.41, + "learning_rate": 0.0002831775700934579, + "loss": 0.8505, + "step": 160 + }, + { + "epoch": 0.46, + "learning_rate": 0.0002775700934579439, + "loss": 0.8546, + "step": 180 + }, + { + "epoch": 0.51, + "learning_rate": 0.0002719626168224299, + "loss": 0.8521, + "step": 200 + }, + { + "epoch": 0.51, + "eval_loss": 0.8707928657531738, + "eval_runtime": 19.2615, + "eval_samples_per_second": 103.834, + "eval_steps_per_second": 3.271, + "step": 200 + }, + { + "epoch": 0.56, + "learning_rate": 0.00026635514018691586, + "loss": 0.8523, + "step": 220 + }, + { + "epoch": 0.61, + "learning_rate": 0.00026074766355140184, + "loss": 0.8463, + "step": 240 + }, + { + "epoch": 0.67, + "learning_rate": 0.0002551401869158878, + "loss": 0.8443, + "step": 260 + }, + { + "epoch": 0.72, + "learning_rate": 0.0002495327102803738, + "loss": 0.8557, + "step": 280 + }, + { + "epoch": 0.77, + "learning_rate": 0.0002439252336448598, + "loss": 0.8607, + "step": 300 + }, + { + "epoch": 0.82, + "learning_rate": 0.00023831775700934577, + "loss": 0.8479, + "step": 320 + }, + { + "epoch": 0.87, + "learning_rate": 0.00023271028037383175, + "loss": 0.8416, + "step": 340 + }, + { + "epoch": 0.92, + "learning_rate": 0.00022710280373831773, + "loss": 0.8494, + "step": 360 + }, + { + "epoch": 0.97, + "learning_rate": 0.0002214953271028037, + "loss": 0.8414, + "step": 380 + }, + { + "epoch": 1.02, + "learning_rate": 0.0002158878504672897, + "loss": 0.8476, + "step": 400 + }, + { + "epoch": 1.02, + "eval_loss": 0.8598244190216064, + "eval_runtime": 74.9831, + "eval_samples_per_second": 26.673, + "eval_steps_per_second": 0.84, + "step": 400 + }, + { + "epoch": 1.08, + "learning_rate": 0.00021028037383177567, + "loss": 0.8443, + "step": 420 + }, + { + "epoch": 1.13, + "learning_rate": 0.00020467289719626166, + "loss": 0.8382, + "step": 440 + }, + { + "epoch": 1.18, + "learning_rate": 0.00019906542056074764, + "loss": 0.8467, + "step": 460 + }, + { + "epoch": 1.23, + "learning_rate": 0.00019345794392523362, + "loss": 0.8384, + "step": 480 + }, + { + "epoch": 1.28, + "learning_rate": 0.0001878504672897196, + "loss": 0.8346, + "step": 500 + }, + { + "epoch": 1.33, + "learning_rate": 0.00018224299065420558, + "loss": 0.8347, + "step": 520 + }, + { + "epoch": 1.38, + "learning_rate": 0.00017663551401869156, + "loss": 0.8363, + "step": 540 + }, + { + "epoch": 1.43, + "learning_rate": 0.00017102803738317754, + "loss": 0.8284, + "step": 560 + }, + { + "epoch": 1.49, + "learning_rate": 0.00016542056074766352, + "loss": 0.837, + "step": 580 + }, + { + "epoch": 1.54, + "learning_rate": 0.0001598130841121495, + "loss": 0.8385, + "step": 600 + }, + { + "epoch": 1.54, + "eval_loss": 0.8547194600105286, + "eval_runtime": 19.3656, + "eval_samples_per_second": 103.276, + "eval_steps_per_second": 3.253, + "step": 600 + }, + { + "epoch": 1.59, + "learning_rate": 0.0001542056074766355, + "loss": 0.8359, + "step": 620 + }, + { + "epoch": 1.64, + "learning_rate": 0.00014859813084112147, + "loss": 0.8315, + "step": 640 + }, + { + "epoch": 1.69, + "learning_rate": 0.00014299065420560745, + "loss": 0.8379, + "step": 660 + }, + { + "epoch": 1.74, + "learning_rate": 0.00013738317757009343, + "loss": 0.8372, + "step": 680 + }, + { + "epoch": 1.79, + "learning_rate": 0.0001317757009345794, + "loss": 0.8277, + "step": 700 + }, + { + "epoch": 1.84, + "learning_rate": 0.0001261682242990654, + "loss": 0.8321, + "step": 720 + }, + { + "epoch": 1.9, + "learning_rate": 0.00012056074766355139, + "loss": 0.8299, + "step": 740 + }, + { + "epoch": 1.95, + "learning_rate": 0.00011495327102803737, + "loss": 0.8289, + "step": 760 + }, + { + "epoch": 2.0, + "learning_rate": 0.00010934579439252335, + "loss": 0.8255, + "step": 780 + }, + { + "epoch": 2.05, + "learning_rate": 0.00010373831775700933, + "loss": 0.8319, + "step": 800 + }, + { + "epoch": 2.05, + "eval_loss": 0.8514933586120605, + "eval_runtime": 19.1889, + "eval_samples_per_second": 104.227, + "eval_steps_per_second": 3.283, + "step": 800 + }, + { + "epoch": 2.1, + "learning_rate": 9.813084112149531e-05, + "loss": 0.8241, + "step": 820 + }, + { + "epoch": 2.15, + "learning_rate": 9.25233644859813e-05, + "loss": 0.8214, + "step": 840 + }, + { + "epoch": 2.2, + "learning_rate": 8.691588785046728e-05, + "loss": 0.828, + "step": 860 + }, + { + "epoch": 2.25, + "learning_rate": 8.130841121495326e-05, + "loss": 0.8263, + "step": 880 + }, + { + "epoch": 2.31, + "learning_rate": 7.570093457943924e-05, + "loss": 0.8275, + "step": 900 + }, + { + "epoch": 2.36, + "learning_rate": 7.009345794392522e-05, + "loss": 0.8277, + "step": 920 + }, + { + "epoch": 2.41, + "learning_rate": 6.44859813084112e-05, + "loss": 0.8315, + "step": 940 + }, + { + "epoch": 2.46, + "learning_rate": 5.887850467289719e-05, + "loss": 0.8239, + "step": 960 + }, + { + "epoch": 2.51, + "learning_rate": 5.327102803738317e-05, + "loss": 0.8243, + "step": 980 + }, + { + "epoch": 2.56, + "learning_rate": 4.766355140186915e-05, + "loss": 0.8189, + "step": 1000 + }, + { + "epoch": 2.56, + "eval_loss": 0.8493452668190002, + "eval_runtime": 19.4174, + "eval_samples_per_second": 103.0, + "eval_steps_per_second": 3.245, + "step": 1000 + } + ], + "max_steps": 1170, + "num_train_epochs": 3, + "total_flos": 1.299718813859709e+18, + "trial_name": null, + "trial_params": null +} diff --git a/adapters/saved-alpaca-7b/checkpoint-1000/training_args.bin b/adapters/saved-alpaca-7b/checkpoint-1000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..be326b13a80d8a5eb96f40f3f9dac914f4249047 --- /dev/null +++ b/adapters/saved-alpaca-7b/checkpoint-1000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:af95e626b78bae76450b5ed2778ec700c6bb4219252ae040fac9ddc05600a526 +size 3579 diff --git a/adapters/saved-alpaca-7b/checkpoint-600/optimizer.pt b/adapters/saved-alpaca-7b/checkpoint-600/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..1b5765ae9ac12bf394c417e2efcfa027771f5718 --- /dev/null +++ b/adapters/saved-alpaca-7b/checkpoint-600/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1734d011b86e605ba1c4cc28ba9c6f5280cb548c4c7bc84cb105d5cdfde8a9b4 +size 33629893 diff --git a/adapters/saved-alpaca-7b/checkpoint-600/pytorch_model.bin b/adapters/saved-alpaca-7b/checkpoint-600/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..6b97163acf54dfe0e450849d017089b6c9f45b51 --- /dev/null +++ b/adapters/saved-alpaca-7b/checkpoint-600/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ef6a066b1d13d203065dceb4d00843d2561f0f9a93aa5da4e3460ed5ecc2ae88 +size 16822989 diff --git a/adapters/saved-alpaca-7b/checkpoint-600/rng_state_0.pth b/adapters/saved-alpaca-7b/checkpoint-600/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..fd9e6c7724723a33feaf4b2ecd94daa08216f621 --- /dev/null +++ b/adapters/saved-alpaca-7b/checkpoint-600/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:70263c18966be34bf5cb56f073489df50956d92c8667826d127a902f62bda577 +size 14583 diff --git a/adapters/saved-alpaca-7b/checkpoint-600/rng_state_1.pth b/adapters/saved-alpaca-7b/checkpoint-600/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..9c716bb79ba54e0b227387f078079d8119cb8ef2 --- /dev/null +++ b/adapters/saved-alpaca-7b/checkpoint-600/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e3891e15a3b8346d86c0a2c936d85a2504e40290669367d6e8d33b0ee6312c23 +size 14583 diff --git a/adapters/saved-alpaca-7b/checkpoint-600/rng_state_2.pth b/adapters/saved-alpaca-7b/checkpoint-600/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..dd07f6c01ead94814b837c2609cfa2023ddba92c --- /dev/null +++ b/adapters/saved-alpaca-7b/checkpoint-600/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ac5aaa1d745db4da342bac54e8324169d92282130a48536270e6eea3b69fce86 +size 14583 diff --git a/adapters/saved-alpaca-7b/checkpoint-600/rng_state_3.pth b/adapters/saved-alpaca-7b/checkpoint-600/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..0b556bdec95ba5511b4fe8414131fe0040aef0b0 --- /dev/null +++ b/adapters/saved-alpaca-7b/checkpoint-600/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:98326ea60f7ba2a0a9dbcfd9099f37fa45f6e814558e07cb29bbd26671f17fc1 +size 14583 diff --git a/adapters/saved-alpaca-7b/checkpoint-600/scaler.pt b/adapters/saved-alpaca-7b/checkpoint-600/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..842791b612283ceb8e68b64ed8e40e81c5a97bce --- /dev/null +++ b/adapters/saved-alpaca-7b/checkpoint-600/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5dc9eacfeb00bd0bfeb98934a2309be01be65b288e0d747bbfc423b32679169f +size 557 diff --git a/adapters/saved-alpaca-7b/checkpoint-600/scheduler.pt b/adapters/saved-alpaca-7b/checkpoint-600/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..4c7bc11bb45bdbf773b1e9ce6b5f7314df71b760 --- /dev/null +++ b/adapters/saved-alpaca-7b/checkpoint-600/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ab0b5eaefe5317a6f29d9c670ecd5644d66afb60156c841d18e022a62f983d66 +size 627 diff --git a/adapters/saved-alpaca-7b/checkpoint-600/trainer_state.json b/adapters/saved-alpaca-7b/checkpoint-600/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..ef1edb829d89d0ef86f8e556047f16be9d0e04ad --- /dev/null +++ b/adapters/saved-alpaca-7b/checkpoint-600/trainer_state.json @@ -0,0 +1,220 @@ +{ + "best_metric": 0.8547194600105286, + "best_model_checkpoint": "/mnt/bn/qingyi-bn-lq/llama/lora-alpaca/checkpoint-600", + "epoch": 1.5374759769378603, + "global_step": 600, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.05, + "learning_rate": 5.9999999999999995e-05, + "loss": 1.9913, + "step": 20 + }, + { + "epoch": 0.1, + "learning_rate": 0.00011999999999999999, + "loss": 1.5044, + "step": 40 + }, + { + "epoch": 0.15, + "learning_rate": 0.00017999999999999998, + "loss": 0.98, + "step": 60 + }, + { + "epoch": 0.2, + "learning_rate": 0.00023999999999999998, + "loss": 0.9036, + "step": 80 + }, + { + "epoch": 0.26, + "learning_rate": 0.0003, + "loss": 0.8966, + "step": 100 + }, + { + "epoch": 0.31, + "learning_rate": 0.00029439252336448596, + "loss": 0.8743, + "step": 120 + }, + { + "epoch": 0.36, + "learning_rate": 0.00028878504672897194, + "loss": 0.8711, + "step": 140 + }, + { + "epoch": 0.41, + "learning_rate": 0.0002831775700934579, + "loss": 0.8505, + "step": 160 + }, + { + "epoch": 0.46, + "learning_rate": 0.0002775700934579439, + "loss": 0.8546, + "step": 180 + }, + { + "epoch": 0.51, + "learning_rate": 0.0002719626168224299, + "loss": 0.8521, + "step": 200 + }, + { + "epoch": 0.51, + "eval_loss": 0.8707928657531738, + "eval_runtime": 19.2615, + "eval_samples_per_second": 103.834, + "eval_steps_per_second": 3.271, + "step": 200 + }, + { + "epoch": 0.56, + "learning_rate": 0.00026635514018691586, + "loss": 0.8523, + "step": 220 + }, + { + "epoch": 0.61, + "learning_rate": 0.00026074766355140184, + "loss": 0.8463, + "step": 240 + }, + { + "epoch": 0.67, + "learning_rate": 0.0002551401869158878, + "loss": 0.8443, + "step": 260 + }, + { + "epoch": 0.72, + "learning_rate": 0.0002495327102803738, + "loss": 0.8557, + "step": 280 + }, + { + "epoch": 0.77, + "learning_rate": 0.0002439252336448598, + "loss": 0.8607, + "step": 300 + }, + { + "epoch": 0.82, + "learning_rate": 0.00023831775700934577, + "loss": 0.8479, + "step": 320 + }, + { + "epoch": 0.87, + "learning_rate": 0.00023271028037383175, + "loss": 0.8416, + "step": 340 + }, + { + "epoch": 0.92, + "learning_rate": 0.00022710280373831773, + "loss": 0.8494, + "step": 360 + }, + { + "epoch": 0.97, + "learning_rate": 0.0002214953271028037, + "loss": 0.8414, + "step": 380 + }, + { + "epoch": 1.02, + "learning_rate": 0.0002158878504672897, + "loss": 0.8476, + "step": 400 + }, + { + "epoch": 1.02, + "eval_loss": 0.8598244190216064, + "eval_runtime": 74.9831, + "eval_samples_per_second": 26.673, + "eval_steps_per_second": 0.84, + "step": 400 + }, + { + "epoch": 1.08, + "learning_rate": 0.00021028037383177567, + "loss": 0.8443, + "step": 420 + }, + { + "epoch": 1.13, + "learning_rate": 0.00020467289719626166, + "loss": 0.8382, + "step": 440 + }, + { + "epoch": 1.18, + "learning_rate": 0.00019906542056074764, + "loss": 0.8467, + "step": 460 + }, + { + "epoch": 1.23, + "learning_rate": 0.00019345794392523362, + "loss": 0.8384, + "step": 480 + }, + { + "epoch": 1.28, + "learning_rate": 0.0001878504672897196, + "loss": 0.8346, + "step": 500 + }, + { + "epoch": 1.33, + "learning_rate": 0.00018224299065420558, + "loss": 0.8347, + "step": 520 + }, + { + "epoch": 1.38, + "learning_rate": 0.00017663551401869156, + "loss": 0.8363, + "step": 540 + }, + { + "epoch": 1.43, + "learning_rate": 0.00017102803738317754, + "loss": 0.8284, + "step": 560 + }, + { + "epoch": 1.49, + "learning_rate": 0.00016542056074766352, + "loss": 0.837, + "step": 580 + }, + { + "epoch": 1.54, + "learning_rate": 0.0001598130841121495, + "loss": 0.8385, + "step": 600 + }, + { + "epoch": 1.54, + "eval_loss": 0.8547194600105286, + "eval_runtime": 19.3656, + "eval_samples_per_second": 103.276, + "eval_steps_per_second": 3.253, + "step": 600 + } + ], + "max_steps": 1170, + "num_train_epochs": 3, + "total_flos": 7.798475370360996e+17, + "trial_name": null, + "trial_params": null +} diff --git a/adapters/saved-alpaca-7b/checkpoint-600/training_args.bin b/adapters/saved-alpaca-7b/checkpoint-600/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..be326b13a80d8a5eb96f40f3f9dac914f4249047 --- /dev/null +++ b/adapters/saved-alpaca-7b/checkpoint-600/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:af95e626b78bae76450b5ed2778ec700c6bb4219252ae040fac9ddc05600a526 +size 3579 diff --git a/adapters/saved-alpaca-7b/checkpoint-800/optimizer.pt b/adapters/saved-alpaca-7b/checkpoint-800/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..2c3a458d04db0e56c24ff1ba7e3b3d657941402e --- /dev/null +++ b/adapters/saved-alpaca-7b/checkpoint-800/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:72363f47eda4fa826f8d14b3d136c91966e933e5cf50b8243ed7492221d2be62 +size 33629893 diff --git a/adapters/saved-alpaca-7b/checkpoint-800/pytorch_model.bin b/adapters/saved-alpaca-7b/checkpoint-800/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..3206cd160260a1c705522a9e613fec37256b388a --- /dev/null +++ b/adapters/saved-alpaca-7b/checkpoint-800/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7826dee4cfcbb5913ac34c2b110e83e26103c07e513f5a5bb6aa1005ee83e3af +size 16822989 diff --git a/adapters/saved-alpaca-7b/checkpoint-800/rng_state_0.pth b/adapters/saved-alpaca-7b/checkpoint-800/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..1b6df2efcebe955ca0ef40b6bdeba8f6c0514e07 --- /dev/null +++ b/adapters/saved-alpaca-7b/checkpoint-800/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1c6e3df2550da99f9a14bf92d6c5e57cc564ccbe17c2892d297d9674972b271e +size 14583 diff --git a/adapters/saved-alpaca-7b/checkpoint-800/rng_state_1.pth b/adapters/saved-alpaca-7b/checkpoint-800/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..5a9a334bbb4fdc272d7372af6965427b7659b248 --- /dev/null +++ b/adapters/saved-alpaca-7b/checkpoint-800/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e160f59bc18f0ecc440557dad88776f67fa42755e59a78b31efffae6d21e4938 +size 14583 diff --git a/adapters/saved-alpaca-7b/checkpoint-800/rng_state_2.pth b/adapters/saved-alpaca-7b/checkpoint-800/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..5cd665ace20b1f68c95a502974d85196c7e8cfa2 --- /dev/null +++ b/adapters/saved-alpaca-7b/checkpoint-800/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7653151ccf6b2ba85b1beb395a8eef06fafe6f5e33a883d03302f43ff835e5d4 +size 14583 diff --git a/adapters/saved-alpaca-7b/checkpoint-800/rng_state_3.pth b/adapters/saved-alpaca-7b/checkpoint-800/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..8015fbeb8e12a849e3b97dffb0c0821cc4b27eca --- /dev/null +++ b/adapters/saved-alpaca-7b/checkpoint-800/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9b3f890cb7e84dd3d570008319336823bcf970f6d03a1ee770c1b03abd6e0da8 +size 14583 diff --git a/adapters/saved-alpaca-7b/checkpoint-800/scaler.pt b/adapters/saved-alpaca-7b/checkpoint-800/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..5e01dd7b5d3a8968bb4c73a805f08f0f65c9b57f --- /dev/null +++ b/adapters/saved-alpaca-7b/checkpoint-800/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:27ec07a12731ae6f9765d05fe7c8495505f1d0f90b4cc6255a0853fec3970808 +size 557 diff --git a/adapters/saved-alpaca-7b/checkpoint-800/scheduler.pt b/adapters/saved-alpaca-7b/checkpoint-800/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..b5d40f5c3578c7ebe095681d6f7bd8fecbbf7407 --- /dev/null +++ b/adapters/saved-alpaca-7b/checkpoint-800/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:98c4b9b4bff768da63a57f06e31324664e9c442c12b99f05f7bf2cd746d192e9 +size 627 diff --git a/adapters/saved-alpaca-7b/checkpoint-800/trainer_state.json b/adapters/saved-alpaca-7b/checkpoint-800/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..636c02b8cd5a6838ee66235cf344020037037a5e --- /dev/null +++ b/adapters/saved-alpaca-7b/checkpoint-800/trainer_state.json @@ -0,0 +1,288 @@ +{ + "best_metric": 0.8514933586120605, + "best_model_checkpoint": "/mnt/bn/qingyi-bn-lq/llama/lora-alpaca/checkpoint-800", + "epoch": 2.0499679692504804, + "global_step": 800, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.05, + "learning_rate": 5.9999999999999995e-05, + "loss": 1.9913, + "step": 20 + }, + { + "epoch": 0.1, + "learning_rate": 0.00011999999999999999, + "loss": 1.5044, + "step": 40 + }, + { + "epoch": 0.15, + "learning_rate": 0.00017999999999999998, + "loss": 0.98, + "step": 60 + }, + { + "epoch": 0.2, + "learning_rate": 0.00023999999999999998, + "loss": 0.9036, + "step": 80 + }, + { + "epoch": 0.26, + "learning_rate": 0.0003, + "loss": 0.8966, + "step": 100 + }, + { + "epoch": 0.31, + "learning_rate": 0.00029439252336448596, + "loss": 0.8743, + "step": 120 + }, + { + "epoch": 0.36, + "learning_rate": 0.00028878504672897194, + "loss": 0.8711, + "step": 140 + }, + { + "epoch": 0.41, + "learning_rate": 0.0002831775700934579, + "loss": 0.8505, + "step": 160 + }, + { + "epoch": 0.46, + "learning_rate": 0.0002775700934579439, + "loss": 0.8546, + "step": 180 + }, + { + "epoch": 0.51, + "learning_rate": 0.0002719626168224299, + "loss": 0.8521, + "step": 200 + }, + { + "epoch": 0.51, + "eval_loss": 0.8707928657531738, + "eval_runtime": 19.2615, + "eval_samples_per_second": 103.834, + "eval_steps_per_second": 3.271, + "step": 200 + }, + { + "epoch": 0.56, + "learning_rate": 0.00026635514018691586, + "loss": 0.8523, + "step": 220 + }, + { + "epoch": 0.61, + "learning_rate": 0.00026074766355140184, + "loss": 0.8463, + "step": 240 + }, + { + "epoch": 0.67, + "learning_rate": 0.0002551401869158878, + "loss": 0.8443, + "step": 260 + }, + { + "epoch": 0.72, + "learning_rate": 0.0002495327102803738, + "loss": 0.8557, + "step": 280 + }, + { + "epoch": 0.77, + "learning_rate": 0.0002439252336448598, + "loss": 0.8607, + "step": 300 + }, + { + "epoch": 0.82, + "learning_rate": 0.00023831775700934577, + "loss": 0.8479, + "step": 320 + }, + { + "epoch": 0.87, + "learning_rate": 0.00023271028037383175, + "loss": 0.8416, + "step": 340 + }, + { + "epoch": 0.92, + "learning_rate": 0.00022710280373831773, + "loss": 0.8494, + "step": 360 + }, + { + "epoch": 0.97, + "learning_rate": 0.0002214953271028037, + "loss": 0.8414, + "step": 380 + }, + { + "epoch": 1.02, + "learning_rate": 0.0002158878504672897, + "loss": 0.8476, + "step": 400 + }, + { + "epoch": 1.02, + "eval_loss": 0.8598244190216064, + "eval_runtime": 74.9831, + "eval_samples_per_second": 26.673, + "eval_steps_per_second": 0.84, + "step": 400 + }, + { + "epoch": 1.08, + "learning_rate": 0.00021028037383177567, + "loss": 0.8443, + "step": 420 + }, + { + "epoch": 1.13, + "learning_rate": 0.00020467289719626166, + "loss": 0.8382, + "step": 440 + }, + { + "epoch": 1.18, + "learning_rate": 0.00019906542056074764, + "loss": 0.8467, + "step": 460 + }, + { + "epoch": 1.23, + "learning_rate": 0.00019345794392523362, + "loss": 0.8384, + "step": 480 + }, + { + "epoch": 1.28, + "learning_rate": 0.0001878504672897196, + "loss": 0.8346, + "step": 500 + }, + { + "epoch": 1.33, + "learning_rate": 0.00018224299065420558, + "loss": 0.8347, + "step": 520 + }, + { + "epoch": 1.38, + "learning_rate": 0.00017663551401869156, + "loss": 0.8363, + "step": 540 + }, + { + "epoch": 1.43, + "learning_rate": 0.00017102803738317754, + "loss": 0.8284, + "step": 560 + }, + { + "epoch": 1.49, + "learning_rate": 0.00016542056074766352, + "loss": 0.837, + "step": 580 + }, + { + "epoch": 1.54, + "learning_rate": 0.0001598130841121495, + "loss": 0.8385, + "step": 600 + }, + { + "epoch": 1.54, + "eval_loss": 0.8547194600105286, + "eval_runtime": 19.3656, + "eval_samples_per_second": 103.276, + "eval_steps_per_second": 3.253, + "step": 600 + }, + { + "epoch": 1.59, + "learning_rate": 0.0001542056074766355, + "loss": 0.8359, + "step": 620 + }, + { + "epoch": 1.64, + "learning_rate": 0.00014859813084112147, + "loss": 0.8315, + "step": 640 + }, + { + "epoch": 1.69, + "learning_rate": 0.00014299065420560745, + "loss": 0.8379, + "step": 660 + }, + { + "epoch": 1.74, + "learning_rate": 0.00013738317757009343, + "loss": 0.8372, + "step": 680 + }, + { + "epoch": 1.79, + "learning_rate": 0.0001317757009345794, + "loss": 0.8277, + "step": 700 + }, + { + "epoch": 1.84, + "learning_rate": 0.0001261682242990654, + "loss": 0.8321, + "step": 720 + }, + { + "epoch": 1.9, + "learning_rate": 0.00012056074766355139, + "loss": 0.8299, + "step": 740 + }, + { + "epoch": 1.95, + "learning_rate": 0.00011495327102803737, + "loss": 0.8289, + "step": 760 + }, + { + "epoch": 2.0, + "learning_rate": 0.00010934579439252335, + "loss": 0.8255, + "step": 780 + }, + { + "epoch": 2.05, + "learning_rate": 0.00010373831775700933, + "loss": 0.8319, + "step": 800 + }, + { + "epoch": 2.05, + "eval_loss": 0.8514933586120605, + "eval_runtime": 19.1889, + "eval_samples_per_second": 104.227, + "eval_steps_per_second": 3.283, + "step": 800 + } + ], + "max_steps": 1170, + "num_train_epochs": 3, + "total_flos": 1.0397425536472187e+18, + "trial_name": null, + "trial_params": null +} diff --git a/adapters/saved-alpaca-7b/checkpoint-800/training_args.bin b/adapters/saved-alpaca-7b/checkpoint-800/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..be326b13a80d8a5eb96f40f3f9dac914f4249047 --- /dev/null +++ b/adapters/saved-alpaca-7b/checkpoint-800/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:af95e626b78bae76450b5ed2778ec700c6bb4219252ae040fac9ddc05600a526 +size 3579 diff --git a/adapters/saved-alpaca-belle-cot13b/adapter_config.json b/adapters/saved-alpaca-belle-cot13b/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..4ba49948a8f8232ee95452e47fcf9bd523635048 --- /dev/null +++ b/adapters/saved-alpaca-belle-cot13b/adapter_config.json @@ -0,0 +1,18 @@ +{ + "base_model_name_or_path": "/mnt/bn/qingyi-bn-lq/llama/llama-13b-hf", + "bias": "none", + "enable_lora": null, + "fan_in_fan_out": false, + "inference_mode": true, + "lora_alpha": 16, + "lora_dropout": 0.05, + "merge_weights": false, + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/adapters/saved-alpaca-belle-cot13b/adapter_model.bin b/adapters/saved-alpaca-belle-cot13b/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..32023aedd2d74bbb643d1ac0bbd9a3c1840ba2c0 --- /dev/null +++ b/adapters/saved-alpaca-belle-cot13b/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f728283e1f50ee8ebc71ab4766efee77c48dff6b9cfe13b1f0ce0921d909f83 +size 26271757 diff --git a/adapters/saved-alpaca-belle-cot13b/checkpoint-15200/optimizer.pt b/adapters/saved-alpaca-belle-cot13b/checkpoint-15200/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..f13bf777c6a40bb55350fd825d8bebfdd100ab2c --- /dev/null +++ b/adapters/saved-alpaca-belle-cot13b/checkpoint-15200/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:efd5b8272f8c01b080217b19fdfa2ecb24f805fa6f82c368ecd5829d26b7947a +size 52523141 diff --git a/adapters/saved-alpaca-belle-cot13b/checkpoint-15200/pytorch_model.bin b/adapters/saved-alpaca-belle-cot13b/checkpoint-15200/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..be9bb1d7b90f3513ff073143c39ed3d0e18f9628 --- /dev/null +++ b/adapters/saved-alpaca-belle-cot13b/checkpoint-15200/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fb7c0b1d3b6c265a7c915e672c3160a2b277bfce535363f6422fc047950d1cdc +size 26271757 diff --git a/adapters/saved-alpaca-belle-cot13b/checkpoint-15200/rng_state_0.pth b/adapters/saved-alpaca-belle-cot13b/checkpoint-15200/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..46f88664600e34348d6e2bc9aaeb7699faaa4042 --- /dev/null +++ b/adapters/saved-alpaca-belle-cot13b/checkpoint-15200/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d566b6abbf439d1d67a448c01c1a77b576aef33f879df8588d39ca768464b65 +size 14583 diff --git a/adapters/saved-alpaca-belle-cot13b/checkpoint-15200/rng_state_1.pth b/adapters/saved-alpaca-belle-cot13b/checkpoint-15200/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..4d26c42d8a1a7a2cc7b451f2ce91734dba440891 --- /dev/null +++ b/adapters/saved-alpaca-belle-cot13b/checkpoint-15200/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:97bc9b9d6886fa367e3b3bbf47c896fbb423cb103fd09f501cd8df89e36fba91 +size 14583 diff --git a/adapters/saved-alpaca-belle-cot13b/checkpoint-15200/rng_state_2.pth b/adapters/saved-alpaca-belle-cot13b/checkpoint-15200/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..188f7c3dcbf326ebee8dd21b5dfa731f67b65cc4 --- /dev/null +++ b/adapters/saved-alpaca-belle-cot13b/checkpoint-15200/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3445af3344013da2d2507d83e4828ce3d11996483f99a5df3b32da76fe7519d2 +size 14583 diff --git a/adapters/saved-alpaca-belle-cot13b/checkpoint-15200/rng_state_3.pth b/adapters/saved-alpaca-belle-cot13b/checkpoint-15200/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..fb6645d9ceda14a5d4bef10b5801c6304c8f71a2 --- /dev/null +++ b/adapters/saved-alpaca-belle-cot13b/checkpoint-15200/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bca7ed7cf7a83e2798882a8881bbac4104bf35a48fc6e6d937fa2151e801e59d +size 14583 diff --git a/adapters/saved-alpaca-belle-cot13b/checkpoint-15200/rng_state_4.pth b/adapters/saved-alpaca-belle-cot13b/checkpoint-15200/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..e065ac4f6f14a95d7b6a9e42f7254b646e92dc55 --- /dev/null +++ b/adapters/saved-alpaca-belle-cot13b/checkpoint-15200/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d00b6ddf0fdf1c7fbdabb18ec4be2679014eaeb30c51dab24acaa6bfeb5ed9ee +size 14583 diff --git a/adapters/saved-alpaca-belle-cot13b/checkpoint-15200/rng_state_5.pth b/adapters/saved-alpaca-belle-cot13b/checkpoint-15200/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..ae64b92de2a9aae3a043aa971deecc901ddeb0e6 --- /dev/null +++ b/adapters/saved-alpaca-belle-cot13b/checkpoint-15200/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:00f0c577736fc463d72cb9788fa737a2ceb189111270d2616160755116b1cfe6 +size 14583 diff --git a/adapters/saved-alpaca-belle-cot13b/checkpoint-15200/rng_state_6.pth b/adapters/saved-alpaca-belle-cot13b/checkpoint-15200/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..14bbd13ccd08401abfffea368c0d24596bf92e23 --- /dev/null +++ b/adapters/saved-alpaca-belle-cot13b/checkpoint-15200/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:481f9ddade0aedda7f67fa14d01b57fe89f880b665a6f2ae6023e02fdcb509e8 +size 14583 diff --git a/adapters/saved-alpaca-belle-cot13b/checkpoint-15200/rng_state_7.pth b/adapters/saved-alpaca-belle-cot13b/checkpoint-15200/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..dedcbd8e8b80217627e09238170fd620fc54bdde --- /dev/null +++ b/adapters/saved-alpaca-belle-cot13b/checkpoint-15200/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:877374d3dfd2c93a8fdc1f10e9b3b822ad5e3936c862094872c94bdf2dd42232 +size 14583 diff --git a/adapters/saved-alpaca-belle-cot13b/checkpoint-15200/scaler.pt b/adapters/saved-alpaca-belle-cot13b/checkpoint-15200/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..aa628d389d3b52090d83d73cda5115d3991db98b --- /dev/null +++ b/adapters/saved-alpaca-belle-cot13b/checkpoint-15200/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bb766af1e74932ee122c6422e6e29961de079aa0bf11e8d955248844482af2aa +size 557 diff --git a/adapters/saved-alpaca-belle-cot13b/checkpoint-15200/scheduler.pt b/adapters/saved-alpaca-belle-cot13b/checkpoint-15200/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..dc42a7b073141bdad7a73b36a944d4285a8269d2 --- /dev/null +++ b/adapters/saved-alpaca-belle-cot13b/checkpoint-15200/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:547444d0b1563c08f71c58494e0eeb4c8d495498b015407edc5402412a53a467 +size 627 diff --git a/adapters/saved-alpaca-belle-cot13b/checkpoint-15200/trainer_state.json b/adapters/saved-alpaca-belle-cot13b/checkpoint-15200/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..1e12a1c17282fc05fe853a25ce3bc021c4dd5f6e --- /dev/null +++ b/adapters/saved-alpaca-belle-cot13b/checkpoint-15200/trainer_state.json @@ -0,0 +1,5184 @@ +{ + "best_metric": 0.733613908290863, + "best_model_checkpoint": "/mnt/bn/qingyi-bn-lq/llama/saved-alpaca-belle-cot13b/checkpoint-15200", + "epoch": 2.9124353324391645, + "global_step": 15200, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 5.9999999999999995e-05, + "loss": 1.7259, + "step": 20 + }, + { + "epoch": 0.01, + "learning_rate": 0.00011999999999999999, + "loss": 1.4365, + "step": 40 + }, + { + "epoch": 0.01, + "learning_rate": 0.00017999999999999998, + "loss": 1.0988, + "step": 60 + }, + { + "epoch": 0.02, + "learning_rate": 0.00023999999999999998, + "loss": 1.0373, + "step": 80 + }, + { + "epoch": 0.02, + "learning_rate": 0.0003, + "loss": 0.9935, + "step": 100 + }, + { + "epoch": 0.02, + "learning_rate": 0.00029961432152728675, + "loss": 0.9734, + "step": 120 + }, + { + "epoch": 0.03, + "learning_rate": 0.0002992286430545735, + "loss": 0.9538, + "step": 140 + }, + { + "epoch": 0.03, + "learning_rate": 0.00029884296458186025, + "loss": 0.9304, + "step": 160 + }, + { + "epoch": 0.03, + "learning_rate": 0.00029845728610914697, + "loss": 0.9159, + "step": 180 + }, + { + "epoch": 0.04, + "learning_rate": 0.00029807160763643375, + "loss": 0.9056, + "step": 200 + }, + { + "epoch": 0.04, + "eval_loss": 0.9217711091041565, + "eval_runtime": 25.3617, + "eval_samples_per_second": 78.859, + "eval_steps_per_second": 1.262, + "step": 200 + }, + { + "epoch": 0.04, + "learning_rate": 0.00029768592916372047, + "loss": 0.9028, + "step": 220 + }, + { + "epoch": 0.05, + "learning_rate": 0.00029730025069100725, + "loss": 0.8939, + "step": 240 + }, + { + "epoch": 0.05, + "learning_rate": 0.00029691457221829397, + "loss": 0.8839, + "step": 260 + }, + { + "epoch": 0.05, + "learning_rate": 0.00029652889374558075, + "loss": 0.8929, + "step": 280 + }, + { + "epoch": 0.06, + "learning_rate": 0.00029614321527286747, + "loss": 0.8708, + "step": 300 + }, + { + "epoch": 0.06, + "learning_rate": 0.00029575753680015425, + "loss": 0.8824, + "step": 320 + }, + { + "epoch": 0.07, + "learning_rate": 0.000295371858327441, + "loss": 0.8705, + "step": 340 + }, + { + "epoch": 0.07, + "learning_rate": 0.00029498617985472775, + "loss": 0.8678, + "step": 360 + }, + { + "epoch": 0.07, + "learning_rate": 0.0002946005013820145, + "loss": 0.8687, + "step": 380 + }, + { + "epoch": 0.08, + "learning_rate": 0.00029421482290930125, + "loss": 0.8609, + "step": 400 + }, + { + "epoch": 0.08, + "eval_loss": 0.8822715878486633, + "eval_runtime": 25.3188, + "eval_samples_per_second": 78.993, + "eval_steps_per_second": 1.264, + "step": 400 + }, + { + "epoch": 0.08, + "learning_rate": 0.000293829144436588, + "loss": 0.8603, + "step": 420 + }, + { + "epoch": 0.08, + "learning_rate": 0.00029344346596387475, + "loss": 0.8662, + "step": 440 + }, + { + "epoch": 0.09, + "learning_rate": 0.0002930577874911615, + "loss": 0.8591, + "step": 460 + }, + { + "epoch": 0.09, + "learning_rate": 0.00029267210901844825, + "loss": 0.8442, + "step": 480 + }, + { + "epoch": 0.1, + "learning_rate": 0.000292286430545735, + "loss": 0.8482, + "step": 500 + }, + { + "epoch": 0.1, + "learning_rate": 0.00029190075207302175, + "loss": 0.8458, + "step": 520 + }, + { + "epoch": 0.1, + "learning_rate": 0.0002915150736003085, + "loss": 0.8377, + "step": 540 + }, + { + "epoch": 0.11, + "learning_rate": 0.00029112939512759525, + "loss": 0.8372, + "step": 560 + }, + { + "epoch": 0.11, + "learning_rate": 0.000290743716654882, + "loss": 0.8444, + "step": 580 + }, + { + "epoch": 0.11, + "learning_rate": 0.0002903580381821688, + "loss": 0.84, + "step": 600 + }, + { + "epoch": 0.11, + "eval_loss": 0.8602503538131714, + "eval_runtime": 25.3547, + "eval_samples_per_second": 78.881, + "eval_steps_per_second": 1.262, + "step": 600 + }, + { + "epoch": 0.12, + "learning_rate": 0.0002899723597094555, + "loss": 0.8428, + "step": 620 + }, + { + "epoch": 0.12, + "learning_rate": 0.0002895866812367423, + "loss": 0.8366, + "step": 640 + }, + { + "epoch": 0.13, + "learning_rate": 0.000289201002764029, + "loss": 0.8408, + "step": 660 + }, + { + "epoch": 0.13, + "learning_rate": 0.0002888153242913158, + "loss": 0.8445, + "step": 680 + }, + { + "epoch": 0.13, + "learning_rate": 0.0002884296458186025, + "loss": 0.8335, + "step": 700 + }, + { + "epoch": 0.14, + "learning_rate": 0.0002880439673458893, + "loss": 0.8316, + "step": 720 + }, + { + "epoch": 0.14, + "learning_rate": 0.000287658288873176, + "loss": 0.8449, + "step": 740 + }, + { + "epoch": 0.15, + "learning_rate": 0.0002872726104004628, + "loss": 0.836, + "step": 760 + }, + { + "epoch": 0.15, + "learning_rate": 0.0002868869319277495, + "loss": 0.8257, + "step": 780 + }, + { + "epoch": 0.15, + "learning_rate": 0.0002865012534550363, + "loss": 0.8252, + "step": 800 + }, + { + "epoch": 0.15, + "eval_loss": 0.8450831174850464, + "eval_runtime": 25.4039, + "eval_samples_per_second": 78.728, + "eval_steps_per_second": 1.26, + "step": 800 + }, + { + "epoch": 0.16, + "learning_rate": 0.0002861155749823231, + "loss": 0.8227, + "step": 820 + }, + { + "epoch": 0.16, + "learning_rate": 0.0002857298965096098, + "loss": 0.8274, + "step": 840 + }, + { + "epoch": 0.16, + "learning_rate": 0.00028534421803689657, + "loss": 0.8197, + "step": 860 + }, + { + "epoch": 0.17, + "learning_rate": 0.0002849585395641833, + "loss": 0.823, + "step": 880 + }, + { + "epoch": 0.17, + "learning_rate": 0.00028457286109147007, + "loss": 0.8176, + "step": 900 + }, + { + "epoch": 0.18, + "learning_rate": 0.0002841871826187568, + "loss": 0.8092, + "step": 920 + }, + { + "epoch": 0.18, + "learning_rate": 0.00028380150414604357, + "loss": 0.8171, + "step": 940 + }, + { + "epoch": 0.18, + "learning_rate": 0.0002834158256733303, + "loss": 0.816, + "step": 960 + }, + { + "epoch": 0.19, + "learning_rate": 0.00028303014720061707, + "loss": 0.816, + "step": 980 + }, + { + "epoch": 0.19, + "learning_rate": 0.0002826444687279038, + "loss": 0.8066, + "step": 1000 + }, + { + "epoch": 0.19, + "eval_loss": 0.8338332176208496, + "eval_runtime": 25.3851, + "eval_samples_per_second": 78.786, + "eval_steps_per_second": 1.261, + "step": 1000 + }, + { + "epoch": 0.2, + "learning_rate": 0.00028225879025519057, + "loss": 0.82, + "step": 1020 + }, + { + "epoch": 0.2, + "learning_rate": 0.0002818731117824773, + "loss": 0.8116, + "step": 1040 + }, + { + "epoch": 0.2, + "learning_rate": 0.00028148743330976407, + "loss": 0.8156, + "step": 1060 + }, + { + "epoch": 0.21, + "learning_rate": 0.00028110175483705085, + "loss": 0.8135, + "step": 1080 + }, + { + "epoch": 0.21, + "learning_rate": 0.00028071607636433757, + "loss": 0.8055, + "step": 1100 + }, + { + "epoch": 0.21, + "learning_rate": 0.00028033039789162435, + "loss": 0.8062, + "step": 1120 + }, + { + "epoch": 0.22, + "learning_rate": 0.00027994471941891107, + "loss": 0.8082, + "step": 1140 + }, + { + "epoch": 0.22, + "learning_rate": 0.00027955904094619785, + "loss": 0.8144, + "step": 1160 + }, + { + "epoch": 0.23, + "learning_rate": 0.00027917336247348457, + "loss": 0.8067, + "step": 1180 + }, + { + "epoch": 0.23, + "learning_rate": 0.0002787876840007713, + "loss": 0.8042, + "step": 1200 + }, + { + "epoch": 0.23, + "eval_loss": 0.8253737688064575, + "eval_runtime": 25.4089, + "eval_samples_per_second": 78.713, + "eval_steps_per_second": 1.259, + "step": 1200 + }, + { + "epoch": 0.23, + "learning_rate": 0.00027840200552805807, + "loss": 0.8093, + "step": 1220 + }, + { + "epoch": 0.24, + "learning_rate": 0.00027801632705534485, + "loss": 0.801, + "step": 1240 + }, + { + "epoch": 0.24, + "learning_rate": 0.00027763064858263157, + "loss": 0.8043, + "step": 1260 + }, + { + "epoch": 0.25, + "learning_rate": 0.00027724497010991834, + "loss": 0.8027, + "step": 1280 + }, + { + "epoch": 0.25, + "learning_rate": 0.0002768592916372051, + "loss": 0.7979, + "step": 1300 + }, + { + "epoch": 0.25, + "learning_rate": 0.00027647361316449184, + "loss": 0.7988, + "step": 1320 + }, + { + "epoch": 0.26, + "learning_rate": 0.0002760879346917786, + "loss": 0.8051, + "step": 1340 + }, + { + "epoch": 0.26, + "learning_rate": 0.00027570225621906534, + "loss": 0.7962, + "step": 1360 + }, + { + "epoch": 0.26, + "learning_rate": 0.0002753165777463521, + "loss": 0.8034, + "step": 1380 + }, + { + "epoch": 0.27, + "learning_rate": 0.00027493089927363884, + "loss": 0.7994, + "step": 1400 + }, + { + "epoch": 0.27, + "eval_loss": 0.8166970014572144, + "eval_runtime": 25.3787, + "eval_samples_per_second": 78.806, + "eval_steps_per_second": 1.261, + "step": 1400 + }, + { + "epoch": 0.27, + "learning_rate": 0.00027454522080092557, + "loss": 0.7949, + "step": 1420 + }, + { + "epoch": 0.28, + "learning_rate": 0.00027415954232821234, + "loss": 0.7919, + "step": 1440 + }, + { + "epoch": 0.28, + "learning_rate": 0.0002737738638554991, + "loss": 0.7983, + "step": 1460 + }, + { + "epoch": 0.28, + "learning_rate": 0.00027338818538278584, + "loss": 0.7828, + "step": 1480 + }, + { + "epoch": 0.29, + "learning_rate": 0.0002730025069100726, + "loss": 0.7926, + "step": 1500 + }, + { + "epoch": 0.29, + "learning_rate": 0.0002726168284373594, + "loss": 0.7837, + "step": 1520 + }, + { + "epoch": 0.3, + "learning_rate": 0.0002722311499646461, + "loss": 0.7922, + "step": 1540 + }, + { + "epoch": 0.3, + "learning_rate": 0.0002718454714919329, + "loss": 0.7852, + "step": 1560 + }, + { + "epoch": 0.3, + "learning_rate": 0.0002714597930192196, + "loss": 0.7846, + "step": 1580 + }, + { + "epoch": 0.31, + "learning_rate": 0.0002710741145465064, + "loss": 0.782, + "step": 1600 + }, + { + "epoch": 0.31, + "eval_loss": 0.8094187378883362, + "eval_runtime": 25.4544, + "eval_samples_per_second": 78.572, + "eval_steps_per_second": 1.257, + "step": 1600 + }, + { + "epoch": 0.31, + "learning_rate": 0.0002706884360737931, + "loss": 0.7822, + "step": 1620 + }, + { + "epoch": 0.31, + "learning_rate": 0.00027030275760107984, + "loss": 0.7787, + "step": 1640 + }, + { + "epoch": 0.32, + "learning_rate": 0.0002699170791283666, + "loss": 0.7913, + "step": 1660 + }, + { + "epoch": 0.32, + "learning_rate": 0.0002695314006556534, + "loss": 0.79, + "step": 1680 + }, + { + "epoch": 0.33, + "learning_rate": 0.0002691457221829401, + "loss": 0.7934, + "step": 1700 + }, + { + "epoch": 0.33, + "learning_rate": 0.0002687600437102269, + "loss": 0.7816, + "step": 1720 + }, + { + "epoch": 0.33, + "learning_rate": 0.0002683743652375136, + "loss": 0.7825, + "step": 1740 + }, + { + "epoch": 0.34, + "learning_rate": 0.0002679886867648004, + "loss": 0.7903, + "step": 1760 + }, + { + "epoch": 0.34, + "learning_rate": 0.00026760300829208717, + "loss": 0.7906, + "step": 1780 + }, + { + "epoch": 0.34, + "learning_rate": 0.0002672173298193739, + "loss": 0.7778, + "step": 1800 + }, + { + "epoch": 0.34, + "eval_loss": 0.8045867681503296, + "eval_runtime": 25.4351, + "eval_samples_per_second": 78.632, + "eval_steps_per_second": 1.258, + "step": 1800 + }, + { + "epoch": 0.35, + "learning_rate": 0.00026683165134666067, + "loss": 0.7815, + "step": 1820 + }, + { + "epoch": 0.35, + "learning_rate": 0.0002664459728739474, + "loss": 0.7851, + "step": 1840 + }, + { + "epoch": 0.36, + "learning_rate": 0.00026606029440123417, + "loss": 0.7807, + "step": 1860 + }, + { + "epoch": 0.36, + "learning_rate": 0.0002656746159285209, + "loss": 0.7856, + "step": 1880 + }, + { + "epoch": 0.36, + "learning_rate": 0.0002652889374558076, + "loss": 0.7798, + "step": 1900 + }, + { + "epoch": 0.37, + "learning_rate": 0.0002649032589830944, + "loss": 0.7777, + "step": 1920 + }, + { + "epoch": 0.37, + "learning_rate": 0.00026451758051038117, + "loss": 0.7798, + "step": 1940 + }, + { + "epoch": 0.38, + "learning_rate": 0.0002641319020376679, + "loss": 0.7783, + "step": 1960 + }, + { + "epoch": 0.38, + "learning_rate": 0.00026374622356495467, + "loss": 0.7739, + "step": 1980 + }, + { + "epoch": 0.38, + "learning_rate": 0.00026336054509224144, + "loss": 0.7823, + "step": 2000 + }, + { + "epoch": 0.38, + "eval_loss": 0.7984708547592163, + "eval_runtime": 25.4598, + "eval_samples_per_second": 78.555, + "eval_steps_per_second": 1.257, + "step": 2000 + }, + { + "epoch": 0.39, + "learning_rate": 0.00026297486661952817, + "loss": 0.7774, + "step": 2020 + }, + { + "epoch": 0.39, + "learning_rate": 0.00026258918814681494, + "loss": 0.7701, + "step": 2040 + }, + { + "epoch": 0.39, + "learning_rate": 0.00026220350967410167, + "loss": 0.7777, + "step": 2060 + }, + { + "epoch": 0.4, + "learning_rate": 0.00026181783120138844, + "loss": 0.781, + "step": 2080 + }, + { + "epoch": 0.4, + "learning_rate": 0.00026143215272867517, + "loss": 0.779, + "step": 2100 + }, + { + "epoch": 0.41, + "learning_rate": 0.0002610464742559619, + "loss": 0.7703, + "step": 2120 + }, + { + "epoch": 0.41, + "learning_rate": 0.00026066079578324867, + "loss": 0.7749, + "step": 2140 + }, + { + "epoch": 0.41, + "learning_rate": 0.00026027511731053544, + "loss": 0.772, + "step": 2160 + }, + { + "epoch": 0.42, + "learning_rate": 0.00025988943883782216, + "loss": 0.771, + "step": 2180 + }, + { + "epoch": 0.42, + "learning_rate": 0.00025950376036510894, + "loss": 0.7757, + "step": 2200 + }, + { + "epoch": 0.42, + "eval_loss": 0.7949528694152832, + "eval_runtime": 25.4504, + "eval_samples_per_second": 78.584, + "eval_steps_per_second": 1.257, + "step": 2200 + }, + { + "epoch": 0.43, + "learning_rate": 0.00025911808189239566, + "loss": 0.7776, + "step": 2220 + }, + { + "epoch": 0.43, + "learning_rate": 0.00025873240341968244, + "loss": 0.7689, + "step": 2240 + }, + { + "epoch": 0.43, + "learning_rate": 0.0002583467249469692, + "loss": 0.7646, + "step": 2260 + }, + { + "epoch": 0.44, + "learning_rate": 0.00025796104647425594, + "loss": 0.7805, + "step": 2280 + }, + { + "epoch": 0.44, + "learning_rate": 0.0002575753680015427, + "loss": 0.7717, + "step": 2300 + }, + { + "epoch": 0.44, + "learning_rate": 0.00025718968952882944, + "loss": 0.7672, + "step": 2320 + }, + { + "epoch": 0.45, + "learning_rate": 0.00025680401105611616, + "loss": 0.7716, + "step": 2340 + }, + { + "epoch": 0.45, + "learning_rate": 0.00025641833258340294, + "loss": 0.7661, + "step": 2360 + }, + { + "epoch": 0.46, + "learning_rate": 0.00025603265411068966, + "loss": 0.7659, + "step": 2380 + }, + { + "epoch": 0.46, + "learning_rate": 0.00025564697563797644, + "loss": 0.7697, + "step": 2400 + }, + { + "epoch": 0.46, + "eval_loss": 0.7915205359458923, + "eval_runtime": 25.4326, + "eval_samples_per_second": 78.639, + "eval_steps_per_second": 1.258, + "step": 2400 + }, + { + "epoch": 0.46, + "learning_rate": 0.0002552612971652632, + "loss": 0.7686, + "step": 2420 + }, + { + "epoch": 0.47, + "learning_rate": 0.00025487561869254994, + "loss": 0.7691, + "step": 2440 + }, + { + "epoch": 0.47, + "learning_rate": 0.0002544899402198367, + "loss": 0.768, + "step": 2460 + }, + { + "epoch": 0.48, + "learning_rate": 0.0002541042617471235, + "loss": 0.7663, + "step": 2480 + }, + { + "epoch": 0.48, + "learning_rate": 0.0002537185832744102, + "loss": 0.767, + "step": 2500 + }, + { + "epoch": 0.48, + "learning_rate": 0.000253332904801697, + "loss": 0.769, + "step": 2520 + }, + { + "epoch": 0.49, + "learning_rate": 0.0002529472263289837, + "loss": 0.7686, + "step": 2540 + }, + { + "epoch": 0.49, + "learning_rate": 0.00025256154785627044, + "loss": 0.7722, + "step": 2560 + }, + { + "epoch": 0.49, + "learning_rate": 0.0002521758693835572, + "loss": 0.7691, + "step": 2580 + }, + { + "epoch": 0.5, + "learning_rate": 0.00025179019091084394, + "loss": 0.7742, + "step": 2600 + }, + { + "epoch": 0.5, + "eval_loss": 0.7875179648399353, + "eval_runtime": 25.4595, + "eval_samples_per_second": 78.556, + "eval_steps_per_second": 1.257, + "step": 2600 + }, + { + "epoch": 0.5, + "learning_rate": 0.0002514045124381307, + "loss": 0.7682, + "step": 2620 + }, + { + "epoch": 0.51, + "learning_rate": 0.0002510188339654175, + "loss": 0.7574, + "step": 2640 + }, + { + "epoch": 0.51, + "learning_rate": 0.0002506331554927042, + "loss": 0.77, + "step": 2660 + }, + { + "epoch": 0.51, + "learning_rate": 0.000250247477019991, + "loss": 0.7638, + "step": 2680 + }, + { + "epoch": 0.52, + "learning_rate": 0.00024986179854727777, + "loss": 0.7517, + "step": 2700 + }, + { + "epoch": 0.52, + "learning_rate": 0.0002494761200745645, + "loss": 0.7596, + "step": 2720 + }, + { + "epoch": 0.53, + "learning_rate": 0.00024909044160185127, + "loss": 0.7608, + "step": 2740 + }, + { + "epoch": 0.53, + "learning_rate": 0.000248704763129138, + "loss": 0.7571, + "step": 2760 + }, + { + "epoch": 0.53, + "learning_rate": 0.0002483190846564247, + "loss": 0.7597, + "step": 2780 + }, + { + "epoch": 0.54, + "learning_rate": 0.0002479334061837115, + "loss": 0.7659, + "step": 2800 + }, + { + "epoch": 0.54, + "eval_loss": 0.7841727137565613, + "eval_runtime": 25.4853, + "eval_samples_per_second": 78.477, + "eval_steps_per_second": 1.256, + "step": 2800 + }, + { + "epoch": 0.54, + "learning_rate": 0.0002475477277109982, + "loss": 0.7694, + "step": 2820 + }, + { + "epoch": 0.54, + "learning_rate": 0.000247162049238285, + "loss": 0.7722, + "step": 2840 + }, + { + "epoch": 0.55, + "learning_rate": 0.00024677637076557176, + "loss": 0.7513, + "step": 2860 + }, + { + "epoch": 0.55, + "learning_rate": 0.0002463906922928585, + "loss": 0.7553, + "step": 2880 + }, + { + "epoch": 0.56, + "learning_rate": 0.00024600501382014526, + "loss": 0.7611, + "step": 2900 + }, + { + "epoch": 0.56, + "learning_rate": 0.000245619335347432, + "loss": 0.7614, + "step": 2920 + }, + { + "epoch": 0.56, + "learning_rate": 0.00024523365687471876, + "loss": 0.761, + "step": 2940 + }, + { + "epoch": 0.57, + "learning_rate": 0.00024484797840200554, + "loss": 0.7568, + "step": 2960 + }, + { + "epoch": 0.57, + "learning_rate": 0.00024446229992929226, + "loss": 0.7571, + "step": 2980 + }, + { + "epoch": 0.57, + "learning_rate": 0.000244076621456579, + "loss": 0.7514, + "step": 3000 + }, + { + "epoch": 0.57, + "eval_loss": 0.7828710675239563, + "eval_runtime": 25.4475, + "eval_samples_per_second": 78.593, + "eval_steps_per_second": 1.257, + "step": 3000 + }, + { + "epoch": 0.58, + "learning_rate": 0.0002436909429838658, + "loss": 0.7564, + "step": 3020 + }, + { + "epoch": 0.58, + "learning_rate": 0.0002433052645111525, + "loss": 0.7593, + "step": 3040 + }, + { + "epoch": 0.59, + "learning_rate": 0.00024291958603843926, + "loss": 0.7533, + "step": 3060 + }, + { + "epoch": 0.59, + "learning_rate": 0.000242533907565726, + "loss": 0.7566, + "step": 3080 + }, + { + "epoch": 0.59, + "learning_rate": 0.00024214822909301276, + "loss": 0.7667, + "step": 3100 + }, + { + "epoch": 0.6, + "learning_rate": 0.00024176255062029954, + "loss": 0.7638, + "step": 3120 + }, + { + "epoch": 0.6, + "learning_rate": 0.00024137687214758626, + "loss": 0.7613, + "step": 3140 + }, + { + "epoch": 0.61, + "learning_rate": 0.00024099119367487304, + "loss": 0.755, + "step": 3160 + }, + { + "epoch": 0.61, + "learning_rate": 0.0002406055152021598, + "loss": 0.7547, + "step": 3180 + }, + { + "epoch": 0.61, + "learning_rate": 0.0002402198367294465, + "loss": 0.7611, + "step": 3200 + }, + { + "epoch": 0.61, + "eval_loss": 0.7789185643196106, + "eval_runtime": 25.4744, + "eval_samples_per_second": 78.51, + "eval_steps_per_second": 1.256, + "step": 3200 + }, + { + "epoch": 0.62, + "learning_rate": 0.0002398341582567333, + "loss": 0.7498, + "step": 3220 + }, + { + "epoch": 0.62, + "learning_rate": 0.00023944847978402, + "loss": 0.757, + "step": 3240 + }, + { + "epoch": 0.62, + "learning_rate": 0.0002390628013113068, + "loss": 0.7472, + "step": 3260 + }, + { + "epoch": 0.63, + "learning_rate": 0.00023867712283859354, + "loss": 0.7557, + "step": 3280 + }, + { + "epoch": 0.63, + "learning_rate": 0.0002382914443658803, + "loss": 0.7602, + "step": 3300 + }, + { + "epoch": 0.64, + "learning_rate": 0.00023790576589316704, + "loss": 0.7573, + "step": 3320 + }, + { + "epoch": 0.64, + "learning_rate": 0.0002375200874204538, + "loss": 0.7565, + "step": 3340 + }, + { + "epoch": 0.64, + "learning_rate": 0.00023713440894774054, + "loss": 0.7517, + "step": 3360 + }, + { + "epoch": 0.65, + "learning_rate": 0.0002367487304750273, + "loss": 0.7521, + "step": 3380 + }, + { + "epoch": 0.65, + "learning_rate": 0.00023636305200231404, + "loss": 0.7575, + "step": 3400 + }, + { + "epoch": 0.65, + "eval_loss": 0.7771645784378052, + "eval_runtime": 25.4832, + "eval_samples_per_second": 78.483, + "eval_steps_per_second": 1.256, + "step": 3400 + }, + { + "epoch": 0.66, + "learning_rate": 0.0002359773735296008, + "loss": 0.7605, + "step": 3420 + }, + { + "epoch": 0.66, + "learning_rate": 0.00023559169505688756, + "loss": 0.7547, + "step": 3440 + }, + { + "epoch": 0.66, + "learning_rate": 0.00023520601658417428, + "loss": 0.7522, + "step": 3460 + }, + { + "epoch": 0.67, + "learning_rate": 0.00023482033811146106, + "loss": 0.757, + "step": 3480 + }, + { + "epoch": 0.67, + "learning_rate": 0.0002344346596387478, + "loss": 0.7561, + "step": 3500 + }, + { + "epoch": 0.67, + "learning_rate": 0.00023404898116603456, + "loss": 0.7486, + "step": 3520 + }, + { + "epoch": 0.68, + "learning_rate": 0.0002336633026933213, + "loss": 0.7519, + "step": 3540 + }, + { + "epoch": 0.68, + "learning_rate": 0.00023327762422060806, + "loss": 0.7487, + "step": 3560 + }, + { + "epoch": 0.69, + "learning_rate": 0.0002328919457478948, + "loss": 0.747, + "step": 3580 + }, + { + "epoch": 0.69, + "learning_rate": 0.0002325062672751816, + "loss": 0.7523, + "step": 3600 + }, + { + "epoch": 0.69, + "eval_loss": 0.7746226787567139, + "eval_runtime": 25.4795, + "eval_samples_per_second": 78.494, + "eval_steps_per_second": 1.256, + "step": 3600 + }, + { + "epoch": 0.69, + "learning_rate": 0.0002321205888024683, + "loss": 0.7427, + "step": 3620 + }, + { + "epoch": 0.7, + "learning_rate": 0.0002317349103297551, + "loss": 0.7442, + "step": 3640 + }, + { + "epoch": 0.7, + "learning_rate": 0.00023134923185704184, + "loss": 0.7587, + "step": 3660 + }, + { + "epoch": 0.71, + "learning_rate": 0.00023096355338432856, + "loss": 0.7506, + "step": 3680 + }, + { + "epoch": 0.71, + "learning_rate": 0.00023057787491161534, + "loss": 0.7514, + "step": 3700 + }, + { + "epoch": 0.71, + "learning_rate": 0.00023019219643890206, + "loss": 0.7475, + "step": 3720 + }, + { + "epoch": 0.72, + "learning_rate": 0.00022980651796618884, + "loss": 0.7601, + "step": 3740 + }, + { + "epoch": 0.72, + "learning_rate": 0.00022942083949347559, + "loss": 0.7474, + "step": 3760 + }, + { + "epoch": 0.72, + "learning_rate": 0.00022903516102076233, + "loss": 0.7529, + "step": 3780 + }, + { + "epoch": 0.73, + "learning_rate": 0.00022864948254804908, + "loss": 0.7458, + "step": 3800 + }, + { + "epoch": 0.73, + "eval_loss": 0.7719505429267883, + "eval_runtime": 25.4724, + "eval_samples_per_second": 78.516, + "eval_steps_per_second": 1.256, + "step": 3800 + }, + { + "epoch": 0.73, + "learning_rate": 0.00022826380407533586, + "loss": 0.7584, + "step": 3820 + }, + { + "epoch": 0.74, + "learning_rate": 0.00022787812560262258, + "loss": 0.7416, + "step": 3840 + }, + { + "epoch": 0.74, + "learning_rate": 0.00022749244712990936, + "loss": 0.7444, + "step": 3860 + }, + { + "epoch": 0.74, + "learning_rate": 0.0002271067686571961, + "loss": 0.7459, + "step": 3880 + }, + { + "epoch": 0.75, + "learning_rate": 0.00022672109018448283, + "loss": 0.7476, + "step": 3900 + }, + { + "epoch": 0.75, + "learning_rate": 0.0002263354117117696, + "loss": 0.7473, + "step": 3920 + }, + { + "epoch": 0.75, + "learning_rate": 0.00022594973323905633, + "loss": 0.7434, + "step": 3940 + }, + { + "epoch": 0.76, + "learning_rate": 0.0002255640547663431, + "loss": 0.7463, + "step": 3960 + }, + { + "epoch": 0.76, + "learning_rate": 0.00022517837629362986, + "loss": 0.7435, + "step": 3980 + }, + { + "epoch": 0.77, + "learning_rate": 0.0002247926978209166, + "loss": 0.7445, + "step": 4000 + }, + { + "epoch": 0.77, + "eval_loss": 0.7707083821296692, + "eval_runtime": 25.4747, + "eval_samples_per_second": 78.509, + "eval_steps_per_second": 1.256, + "step": 4000 + }, + { + "epoch": 0.77, + "learning_rate": 0.00022440701934820336, + "loss": 0.7321, + "step": 4020 + }, + { + "epoch": 0.77, + "learning_rate": 0.00022402134087549014, + "loss": 0.7525, + "step": 4040 + }, + { + "epoch": 0.78, + "learning_rate": 0.00022363566240277686, + "loss": 0.7494, + "step": 4060 + }, + { + "epoch": 0.78, + "learning_rate": 0.00022324998393006364, + "loss": 0.7533, + "step": 4080 + }, + { + "epoch": 0.79, + "learning_rate": 0.00022286430545735036, + "loss": 0.7442, + "step": 4100 + }, + { + "epoch": 0.79, + "learning_rate": 0.0002224786269846371, + "loss": 0.7423, + "step": 4120 + }, + { + "epoch": 0.79, + "learning_rate": 0.00022209294851192388, + "loss": 0.7443, + "step": 4140 + }, + { + "epoch": 0.8, + "learning_rate": 0.0002217072700392106, + "loss": 0.7388, + "step": 4160 + }, + { + "epoch": 0.8, + "learning_rate": 0.00022132159156649738, + "loss": 0.7425, + "step": 4180 + }, + { + "epoch": 0.8, + "learning_rate": 0.00022093591309378413, + "loss": 0.7507, + "step": 4200 + }, + { + "epoch": 0.8, + "eval_loss": 0.7691813111305237, + "eval_runtime": 25.5011, + "eval_samples_per_second": 78.428, + "eval_steps_per_second": 1.255, + "step": 4200 + }, + { + "epoch": 0.81, + "learning_rate": 0.00022055023462107088, + "loss": 0.7276, + "step": 4220 + }, + { + "epoch": 0.81, + "learning_rate": 0.00022016455614835763, + "loss": 0.7399, + "step": 4240 + }, + { + "epoch": 0.82, + "learning_rate": 0.00021977887767564438, + "loss": 0.7409, + "step": 4260 + }, + { + "epoch": 0.82, + "learning_rate": 0.00021939319920293113, + "loss": 0.7391, + "step": 4280 + }, + { + "epoch": 0.82, + "learning_rate": 0.0002190075207302179, + "loss": 0.741, + "step": 4300 + }, + { + "epoch": 0.83, + "learning_rate": 0.00021862184225750463, + "loss": 0.7404, + "step": 4320 + }, + { + "epoch": 0.83, + "learning_rate": 0.00021823616378479138, + "loss": 0.7356, + "step": 4340 + }, + { + "epoch": 0.84, + "learning_rate": 0.00021785048531207816, + "loss": 0.7458, + "step": 4360 + }, + { + "epoch": 0.84, + "learning_rate": 0.00021746480683936488, + "loss": 0.7373, + "step": 4380 + }, + { + "epoch": 0.84, + "learning_rate": 0.00021707912836665166, + "loss": 0.7455, + "step": 4400 + }, + { + "epoch": 0.84, + "eval_loss": 0.7680566310882568, + "eval_runtime": 25.4479, + "eval_samples_per_second": 78.592, + "eval_steps_per_second": 1.257, + "step": 4400 + }, + { + "epoch": 0.85, + "learning_rate": 0.00021669344989393838, + "loss": 0.7376, + "step": 4420 + }, + { + "epoch": 0.85, + "learning_rate": 0.00021630777142122516, + "loss": 0.7396, + "step": 4440 + }, + { + "epoch": 0.85, + "learning_rate": 0.0002159220929485119, + "loss": 0.7367, + "step": 4460 + }, + { + "epoch": 0.86, + "learning_rate": 0.00021553641447579866, + "loss": 0.7354, + "step": 4480 + }, + { + "epoch": 0.86, + "learning_rate": 0.0002151507360030854, + "loss": 0.7337, + "step": 4500 + }, + { + "epoch": 0.87, + "learning_rate": 0.00021476505753037218, + "loss": 0.7384, + "step": 4520 + }, + { + "epoch": 0.87, + "learning_rate": 0.0002143793790576589, + "loss": 0.7334, + "step": 4540 + }, + { + "epoch": 0.87, + "learning_rate": 0.00021399370058494568, + "loss": 0.742, + "step": 4560 + }, + { + "epoch": 0.88, + "learning_rate": 0.0002136080221122324, + "loss": 0.7408, + "step": 4580 + }, + { + "epoch": 0.88, + "learning_rate": 0.00021322234363951916, + "loss": 0.7466, + "step": 4600 + }, + { + "epoch": 0.88, + "eval_loss": 0.7663780450820923, + "eval_runtime": 25.53, + "eval_samples_per_second": 78.339, + "eval_steps_per_second": 1.253, + "step": 4600 + }, + { + "epoch": 0.89, + "learning_rate": 0.00021283666516680593, + "loss": 0.7399, + "step": 4620 + }, + { + "epoch": 0.89, + "learning_rate": 0.00021245098669409266, + "loss": 0.746, + "step": 4640 + }, + { + "epoch": 0.89, + "learning_rate": 0.00021206530822137943, + "loss": 0.7397, + "step": 4660 + }, + { + "epoch": 0.9, + "learning_rate": 0.00021167962974866618, + "loss": 0.7349, + "step": 4680 + }, + { + "epoch": 0.9, + "learning_rate": 0.00021129395127595293, + "loss": 0.7334, + "step": 4700 + }, + { + "epoch": 0.9, + "learning_rate": 0.00021090827280323968, + "loss": 0.738, + "step": 4720 + }, + { + "epoch": 0.91, + "learning_rate": 0.0002105225943305264, + "loss": 0.7398, + "step": 4740 + }, + { + "epoch": 0.91, + "learning_rate": 0.00021013691585781318, + "loss": 0.7465, + "step": 4760 + }, + { + "epoch": 0.92, + "learning_rate": 0.00020975123738509996, + "loss": 0.7388, + "step": 4780 + }, + { + "epoch": 0.92, + "learning_rate": 0.00020936555891238668, + "loss": 0.7462, + "step": 4800 + }, + { + "epoch": 0.92, + "eval_loss": 0.7650267481803894, + "eval_runtime": 25.5463, + "eval_samples_per_second": 78.289, + "eval_steps_per_second": 1.253, + "step": 4800 + }, + { + "epoch": 0.92, + "learning_rate": 0.00020897988043967343, + "loss": 0.7463, + "step": 4820 + }, + { + "epoch": 0.93, + "learning_rate": 0.0002085942019669602, + "loss": 0.7389, + "step": 4840 + }, + { + "epoch": 0.93, + "learning_rate": 0.00020820852349424693, + "loss": 0.7316, + "step": 4860 + }, + { + "epoch": 0.94, + "learning_rate": 0.0002078228450215337, + "loss": 0.73, + "step": 4880 + }, + { + "epoch": 0.94, + "learning_rate": 0.00020743716654882043, + "loss": 0.7472, + "step": 4900 + }, + { + "epoch": 0.94, + "learning_rate": 0.0002070514880761072, + "loss": 0.7494, + "step": 4920 + }, + { + "epoch": 0.95, + "learning_rate": 0.00020666580960339396, + "loss": 0.7424, + "step": 4940 + }, + { + "epoch": 0.95, + "learning_rate": 0.0002062801311306807, + "loss": 0.7443, + "step": 4960 + }, + { + "epoch": 0.95, + "learning_rate": 0.00020589445265796746, + "loss": 0.7355, + "step": 4980 + }, + { + "epoch": 0.96, + "learning_rate": 0.00020550877418525423, + "loss": 0.7388, + "step": 5000 + }, + { + "epoch": 0.96, + "eval_loss": 0.7630622386932373, + "eval_runtime": 25.8654, + "eval_samples_per_second": 77.323, + "eval_steps_per_second": 1.237, + "step": 5000 + }, + { + "epoch": 0.96, + "learning_rate": 0.00020512309571254096, + "loss": 0.7317, + "step": 5020 + }, + { + "epoch": 0.97, + "learning_rate": 0.0002047374172398277, + "loss": 0.7385, + "step": 5040 + }, + { + "epoch": 0.97, + "learning_rate": 0.00020435173876711445, + "loss": 0.7369, + "step": 5060 + }, + { + "epoch": 0.97, + "learning_rate": 0.0002039660602944012, + "loss": 0.7243, + "step": 5080 + }, + { + "epoch": 0.98, + "learning_rate": 0.00020358038182168798, + "loss": 0.7334, + "step": 5100 + }, + { + "epoch": 0.98, + "learning_rate": 0.0002031947033489747, + "loss": 0.7433, + "step": 5120 + }, + { + "epoch": 0.98, + "learning_rate": 0.00020280902487626148, + "loss": 0.7202, + "step": 5140 + }, + { + "epoch": 0.99, + "learning_rate": 0.00020242334640354823, + "loss": 0.7336, + "step": 5160 + }, + { + "epoch": 0.99, + "learning_rate": 0.00020203766793083498, + "loss": 0.7324, + "step": 5180 + }, + { + "epoch": 1.0, + "learning_rate": 0.00020165198945812173, + "loss": 0.7363, + "step": 5200 + }, + { + "epoch": 1.0, + "eval_loss": 0.7617191076278687, + "eval_runtime": 25.4884, + "eval_samples_per_second": 78.467, + "eval_steps_per_second": 1.255, + "step": 5200 + }, + { + "epoch": 1.0, + "learning_rate": 0.0002012663109854085, + "loss": 0.7359, + "step": 5220 + }, + { + "epoch": 1.0, + "learning_rate": 0.00020088063251269523, + "loss": 0.7347, + "step": 5240 + }, + { + "epoch": 1.01, + "learning_rate": 0.00020049495403998198, + "loss": 0.732, + "step": 5260 + }, + { + "epoch": 1.01, + "learning_rate": 0.00020010927556726873, + "loss": 0.7385, + "step": 5280 + }, + { + "epoch": 1.02, + "learning_rate": 0.00019972359709455548, + "loss": 0.7313, + "step": 5300 + }, + { + "epoch": 1.02, + "learning_rate": 0.00019933791862184226, + "loss": 0.7337, + "step": 5320 + }, + { + "epoch": 1.02, + "learning_rate": 0.00019895224014912898, + "loss": 0.733, + "step": 5340 + }, + { + "epoch": 1.03, + "learning_rate": 0.00019856656167641576, + "loss": 0.7226, + "step": 5360 + }, + { + "epoch": 1.03, + "learning_rate": 0.0001981808832037025, + "loss": 0.7363, + "step": 5380 + }, + { + "epoch": 1.03, + "learning_rate": 0.00019779520473098925, + "loss": 0.7296, + "step": 5400 + }, + { + "epoch": 1.03, + "eval_loss": 0.7608480453491211, + "eval_runtime": 25.5307, + "eval_samples_per_second": 78.337, + "eval_steps_per_second": 1.253, + "step": 5400 + }, + { + "epoch": 1.04, + "learning_rate": 0.000197409526258276, + "loss": 0.7237, + "step": 5420 + }, + { + "epoch": 1.04, + "learning_rate": 0.00019702384778556273, + "loss": 0.735, + "step": 5440 + }, + { + "epoch": 1.05, + "learning_rate": 0.0001966381693128495, + "loss": 0.7379, + "step": 5460 + }, + { + "epoch": 1.05, + "learning_rate": 0.00019625249084013628, + "loss": 0.7372, + "step": 5480 + }, + { + "epoch": 1.05, + "learning_rate": 0.000195866812367423, + "loss": 0.7332, + "step": 5500 + }, + { + "epoch": 1.06, + "learning_rate": 0.00019548113389470975, + "loss": 0.7375, + "step": 5520 + }, + { + "epoch": 1.06, + "learning_rate": 0.00019509545542199653, + "loss": 0.7352, + "step": 5540 + }, + { + "epoch": 1.07, + "learning_rate": 0.00019470977694928325, + "loss": 0.7336, + "step": 5560 + }, + { + "epoch": 1.07, + "learning_rate": 0.00019432409847657003, + "loss": 0.7266, + "step": 5580 + }, + { + "epoch": 1.07, + "learning_rate": 0.00019393842000385675, + "loss": 0.7325, + "step": 5600 + }, + { + "epoch": 1.07, + "eval_loss": 0.7595871686935425, + "eval_runtime": 25.4845, + "eval_samples_per_second": 78.479, + "eval_steps_per_second": 1.256, + "step": 5600 + }, + { + "epoch": 1.08, + "learning_rate": 0.00019355274153114353, + "loss": 0.7259, + "step": 5620 + }, + { + "epoch": 1.08, + "learning_rate": 0.00019316706305843028, + "loss": 0.7274, + "step": 5640 + }, + { + "epoch": 1.08, + "learning_rate": 0.000192781384585717, + "loss": 0.7254, + "step": 5660 + }, + { + "epoch": 1.09, + "learning_rate": 0.00019239570611300378, + "loss": 0.7332, + "step": 5680 + }, + { + "epoch": 1.09, + "learning_rate": 0.00019201002764029056, + "loss": 0.73, + "step": 5700 + }, + { + "epoch": 1.1, + "learning_rate": 0.00019162434916757728, + "loss": 0.7365, + "step": 5720 + }, + { + "epoch": 1.1, + "learning_rate": 0.00019123867069486403, + "loss": 0.7261, + "step": 5740 + }, + { + "epoch": 1.1, + "learning_rate": 0.00019085299222215078, + "loss": 0.7331, + "step": 5760 + }, + { + "epoch": 1.11, + "learning_rate": 0.00019046731374943753, + "loss": 0.7272, + "step": 5780 + }, + { + "epoch": 1.11, + "learning_rate": 0.0001900816352767243, + "loss": 0.7325, + "step": 5800 + }, + { + "epoch": 1.11, + "eval_loss": 0.7583591341972351, + "eval_runtime": 25.49, + "eval_samples_per_second": 78.462, + "eval_steps_per_second": 1.255, + "step": 5800 + }, + { + "epoch": 1.12, + "learning_rate": 0.00018969595680401103, + "loss": 0.7277, + "step": 5820 + }, + { + "epoch": 1.12, + "learning_rate": 0.0001893102783312978, + "loss": 0.7352, + "step": 5840 + }, + { + "epoch": 1.12, + "learning_rate": 0.00018892459985858455, + "loss": 0.7312, + "step": 5860 + }, + { + "epoch": 1.13, + "learning_rate": 0.00018853892138587128, + "loss": 0.7296, + "step": 5880 + }, + { + "epoch": 1.13, + "learning_rate": 0.00018815324291315805, + "loss": 0.7275, + "step": 5900 + }, + { + "epoch": 1.13, + "learning_rate": 0.00018776756444044478, + "loss": 0.7345, + "step": 5920 + }, + { + "epoch": 1.14, + "learning_rate": 0.00018738188596773155, + "loss": 0.7322, + "step": 5940 + }, + { + "epoch": 1.14, + "learning_rate": 0.0001869962074950183, + "loss": 0.737, + "step": 5960 + }, + { + "epoch": 1.15, + "learning_rate": 0.00018661052902230505, + "loss": 0.7243, + "step": 5980 + }, + { + "epoch": 1.15, + "learning_rate": 0.0001862248505495918, + "loss": 0.7303, + "step": 6000 + }, + { + "epoch": 1.15, + "eval_loss": 0.7572018504142761, + "eval_runtime": 25.7126, + "eval_samples_per_second": 77.783, + "eval_steps_per_second": 1.245, + "step": 6000 + }, + { + "epoch": 1.15, + "learning_rate": 0.00018583917207687858, + "loss": 0.7237, + "step": 6020 + }, + { + "epoch": 1.16, + "learning_rate": 0.0001854534936041653, + "loss": 0.735, + "step": 6040 + }, + { + "epoch": 1.16, + "learning_rate": 0.00018506781513145208, + "loss": 0.727, + "step": 6060 + }, + { + "epoch": 1.16, + "learning_rate": 0.0001846821366587388, + "loss": 0.7226, + "step": 6080 + }, + { + "epoch": 1.17, + "learning_rate": 0.00018429645818602558, + "loss": 0.7213, + "step": 6100 + }, + { + "epoch": 1.17, + "learning_rate": 0.000183930063636948, + "loss": 0.7206, + "step": 6120 + }, + { + "epoch": 1.18, + "learning_rate": 0.0001835443851642347, + "loss": 0.7292, + "step": 6140 + }, + { + "epoch": 1.18, + "learning_rate": 0.0001831587066915215, + "loss": 0.7316, + "step": 6160 + }, + { + "epoch": 1.18, + "learning_rate": 0.00018277302821880824, + "loss": 0.7318, + "step": 6180 + }, + { + "epoch": 1.19, + "learning_rate": 0.000182387349746095, + "loss": 0.7302, + "step": 6200 + }, + { + "epoch": 1.19, + "eval_loss": 0.755982518196106, + "eval_runtime": 25.5055, + "eval_samples_per_second": 78.415, + "eval_steps_per_second": 1.255, + "step": 6200 + }, + { + "epoch": 1.19, + "learning_rate": 0.00018200167127338174, + "loss": 0.728, + "step": 6220 + }, + { + "epoch": 1.2, + "learning_rate": 0.00018161599280066851, + "loss": 0.7334, + "step": 6240 + }, + { + "epoch": 1.2, + "learning_rate": 0.00018123031432795524, + "loss": 0.7303, + "step": 6260 + }, + { + "epoch": 1.2, + "learning_rate": 0.000180844635855242, + "loss": 0.7274, + "step": 6280 + }, + { + "epoch": 1.21, + "learning_rate": 0.00018045895738252874, + "loss": 0.7368, + "step": 6300 + }, + { + "epoch": 1.21, + "learning_rate": 0.00018007327890981549, + "loss": 0.724, + "step": 6320 + }, + { + "epoch": 1.21, + "learning_rate": 0.00017968760043710226, + "loss": 0.7229, + "step": 6340 + }, + { + "epoch": 1.22, + "learning_rate": 0.00017930192196438899, + "loss": 0.7216, + "step": 6360 + }, + { + "epoch": 1.22, + "learning_rate": 0.00017891624349167576, + "loss": 0.7292, + "step": 6380 + }, + { + "epoch": 1.23, + "learning_rate": 0.0001785305650189625, + "loss": 0.7226, + "step": 6400 + }, + { + "epoch": 1.23, + "eval_loss": 0.7554095387458801, + "eval_runtime": 25.5062, + "eval_samples_per_second": 78.412, + "eval_steps_per_second": 1.255, + "step": 6400 + }, + { + "epoch": 1.23, + "learning_rate": 0.00017814488654624926, + "loss": 0.7262, + "step": 6420 + }, + { + "epoch": 1.23, + "learning_rate": 0.000177759208073536, + "loss": 0.7274, + "step": 6440 + }, + { + "epoch": 1.24, + "learning_rate": 0.00017737352960082276, + "loss": 0.7271, + "step": 6460 + }, + { + "epoch": 1.24, + "learning_rate": 0.0001769878511281095, + "loss": 0.7299, + "step": 6480 + }, + { + "epoch": 1.25, + "learning_rate": 0.0001766021726553963, + "loss": 0.7264, + "step": 6500 + }, + { + "epoch": 1.25, + "learning_rate": 0.000176216494182683, + "loss": 0.7285, + "step": 6520 + }, + { + "epoch": 1.25, + "learning_rate": 0.0001758308157099698, + "loss": 0.7216, + "step": 6540 + }, + { + "epoch": 1.26, + "learning_rate": 0.00017544513723725654, + "loss": 0.7215, + "step": 6560 + }, + { + "epoch": 1.26, + "learning_rate": 0.00017505945876454326, + "loss": 0.7253, + "step": 6580 + }, + { + "epoch": 1.26, + "learning_rate": 0.00017467378029183004, + "loss": 0.7246, + "step": 6600 + }, + { + "epoch": 1.26, + "eval_loss": 0.7540405988693237, + "eval_runtime": 25.4725, + "eval_samples_per_second": 78.516, + "eval_steps_per_second": 1.256, + "step": 6600 + }, + { + "epoch": 1.27, + "learning_rate": 0.00017428810181911676, + "loss": 0.7166, + "step": 6620 + }, + { + "epoch": 1.27, + "learning_rate": 0.00017390242334640354, + "loss": 0.7213, + "step": 6640 + }, + { + "epoch": 1.28, + "learning_rate": 0.00017351674487369029, + "loss": 0.7305, + "step": 6660 + }, + { + "epoch": 1.28, + "learning_rate": 0.00017313106640097704, + "loss": 0.7347, + "step": 6680 + }, + { + "epoch": 1.28, + "learning_rate": 0.00017274538792826379, + "loss": 0.7272, + "step": 6700 + }, + { + "epoch": 1.29, + "learning_rate": 0.00017235970945555056, + "loss": 0.7224, + "step": 6720 + }, + { + "epoch": 1.29, + "learning_rate": 0.00017197403098283728, + "loss": 0.7327, + "step": 6740 + }, + { + "epoch": 1.3, + "learning_rate": 0.00017158835251012406, + "loss": 0.7228, + "step": 6760 + }, + { + "epoch": 1.3, + "learning_rate": 0.00017120267403741078, + "loss": 0.7344, + "step": 6780 + }, + { + "epoch": 1.3, + "learning_rate": 0.00017081699556469753, + "loss": 0.7269, + "step": 6800 + }, + { + "epoch": 1.3, + "eval_loss": 0.7531024813652039, + "eval_runtime": 25.6796, + "eval_samples_per_second": 77.883, + "eval_steps_per_second": 1.246, + "step": 6800 + }, + { + "epoch": 1.31, + "learning_rate": 0.0001704313170919843, + "loss": 0.7362, + "step": 6820 + }, + { + "epoch": 1.31, + "learning_rate": 0.00017004563861927103, + "loss": 0.7293, + "step": 6840 + }, + { + "epoch": 1.31, + "learning_rate": 0.0001696599601465578, + "loss": 0.7286, + "step": 6860 + }, + { + "epoch": 1.32, + "learning_rate": 0.00016927428167384456, + "loss": 0.7148, + "step": 6880 + }, + { + "epoch": 1.32, + "learning_rate": 0.0001688886032011313, + "loss": 0.72, + "step": 6900 + }, + { + "epoch": 1.33, + "learning_rate": 0.00016850292472841806, + "loss": 0.7239, + "step": 6920 + }, + { + "epoch": 1.33, + "learning_rate": 0.0001681172462557048, + "loss": 0.726, + "step": 6940 + }, + { + "epoch": 1.33, + "learning_rate": 0.00016773156778299156, + "loss": 0.7286, + "step": 6960 + }, + { + "epoch": 1.34, + "learning_rate": 0.00016734588931027834, + "loss": 0.7276, + "step": 6980 + }, + { + "epoch": 1.34, + "learning_rate": 0.00016696021083756506, + "loss": 0.7258, + "step": 7000 + }, + { + "epoch": 1.34, + "eval_loss": 0.7521764636039734, + "eval_runtime": 25.5237, + "eval_samples_per_second": 78.359, + "eval_steps_per_second": 1.254, + "step": 7000 + }, + { + "epoch": 1.35, + "learning_rate": 0.0001665745323648518, + "loss": 0.7326, + "step": 7020 + }, + { + "epoch": 1.35, + "learning_rate": 0.00016618885389213859, + "loss": 0.7311, + "step": 7040 + }, + { + "epoch": 1.35, + "learning_rate": 0.0001658031754194253, + "loss": 0.7295, + "step": 7060 + }, + { + "epoch": 1.36, + "learning_rate": 0.00016541749694671208, + "loss": 0.7279, + "step": 7080 + }, + { + "epoch": 1.36, + "learning_rate": 0.00016503181847399883, + "loss": 0.7293, + "step": 7100 + }, + { + "epoch": 1.36, + "learning_rate": 0.00016464614000128558, + "loss": 0.7256, + "step": 7120 + }, + { + "epoch": 1.37, + "learning_rate": 0.00016426046152857233, + "loss": 0.7204, + "step": 7140 + }, + { + "epoch": 1.37, + "learning_rate": 0.00016387478305585908, + "loss": 0.718, + "step": 7160 + }, + { + "epoch": 1.38, + "learning_rate": 0.00016348910458314583, + "loss": 0.7206, + "step": 7180 + }, + { + "epoch": 1.38, + "learning_rate": 0.0001631034261104326, + "loss": 0.7282, + "step": 7200 + }, + { + "epoch": 1.38, + "eval_loss": 0.7514960765838623, + "eval_runtime": 25.5249, + "eval_samples_per_second": 78.355, + "eval_steps_per_second": 1.254, + "step": 7200 + }, + { + "epoch": 1.38, + "learning_rate": 0.00016271774763771933, + "loss": 0.7162, + "step": 7220 + }, + { + "epoch": 1.39, + "learning_rate": 0.00016233206916500608, + "loss": 0.7277, + "step": 7240 + }, + { + "epoch": 1.39, + "learning_rate": 0.00016194639069229286, + "loss": 0.7147, + "step": 7260 + }, + { + "epoch": 1.39, + "learning_rate": 0.00016156071221957958, + "loss": 0.7339, + "step": 7280 + }, + { + "epoch": 1.4, + "learning_rate": 0.00016117503374686636, + "loss": 0.7257, + "step": 7300 + }, + { + "epoch": 1.4, + "learning_rate": 0.00016078935527415308, + "loss": 0.728, + "step": 7320 + }, + { + "epoch": 1.41, + "learning_rate": 0.00016040367680143986, + "loss": 0.7139, + "step": 7340 + }, + { + "epoch": 1.41, + "learning_rate": 0.0001600179983287266, + "loss": 0.7202, + "step": 7360 + }, + { + "epoch": 1.41, + "learning_rate": 0.00015963231985601336, + "loss": 0.7323, + "step": 7380 + }, + { + "epoch": 1.42, + "learning_rate": 0.0001592466413833001, + "loss": 0.7198, + "step": 7400 + }, + { + "epoch": 1.42, + "eval_loss": 0.750492513179779, + "eval_runtime": 25.8887, + "eval_samples_per_second": 77.254, + "eval_steps_per_second": 1.236, + "step": 7400 + }, + { + "epoch": 1.42, + "learning_rate": 0.00015886096291058688, + "loss": 0.7138, + "step": 7420 + }, + { + "epoch": 1.43, + "learning_rate": 0.0001584752844378736, + "loss": 0.7205, + "step": 7440 + }, + { + "epoch": 1.43, + "learning_rate": 0.00015808960596516038, + "loss": 0.7178, + "step": 7460 + }, + { + "epoch": 1.43, + "learning_rate": 0.0001577039274924471, + "loss": 0.7251, + "step": 7480 + }, + { + "epoch": 1.44, + "learning_rate": 0.00015731824901973386, + "loss": 0.7187, + "step": 7500 + }, + { + "epoch": 1.44, + "learning_rate": 0.00015693257054702063, + "loss": 0.7238, + "step": 7520 + }, + { + "epoch": 1.44, + "learning_rate": 0.00015654689207430736, + "loss": 0.7283, + "step": 7540 + }, + { + "epoch": 1.45, + "learning_rate": 0.00015616121360159413, + "loss": 0.7189, + "step": 7560 + }, + { + "epoch": 1.45, + "learning_rate": 0.00015577553512888088, + "loss": 0.7216, + "step": 7580 + }, + { + "epoch": 1.46, + "learning_rate": 0.00015538985665616763, + "loss": 0.7219, + "step": 7600 + }, + { + "epoch": 1.46, + "eval_loss": 0.7496184706687927, + "eval_runtime": 25.4957, + "eval_samples_per_second": 78.445, + "eval_steps_per_second": 1.255, + "step": 7600 + }, + { + "epoch": 1.46, + "learning_rate": 0.00015500417818345438, + "loss": 0.7233, + "step": 7620 + }, + { + "epoch": 1.46, + "learning_rate": 0.0001546184997107411, + "loss": 0.7241, + "step": 7640 + }, + { + "epoch": 1.47, + "learning_rate": 0.00015423282123802788, + "loss": 0.7194, + "step": 7660 + }, + { + "epoch": 1.47, + "learning_rate": 0.00015384714276531466, + "loss": 0.7229, + "step": 7680 + }, + { + "epoch": 1.48, + "learning_rate": 0.00015346146429260138, + "loss": 0.7219, + "step": 7700 + }, + { + "epoch": 1.48, + "learning_rate": 0.00015307578581988813, + "loss": 0.7027, + "step": 7720 + }, + { + "epoch": 1.48, + "learning_rate": 0.0001526901073471749, + "loss": 0.7171, + "step": 7740 + }, + { + "epoch": 1.49, + "learning_rate": 0.00015230442887446163, + "loss": 0.7193, + "step": 7760 + }, + { + "epoch": 1.49, + "learning_rate": 0.0001519187504017484, + "loss": 0.7269, + "step": 7780 + }, + { + "epoch": 1.49, + "learning_rate": 0.00015153307192903513, + "loss": 0.7171, + "step": 7800 + }, + { + "epoch": 1.49, + "eval_loss": 0.7494381070137024, + "eval_runtime": 25.5318, + "eval_samples_per_second": 78.334, + "eval_steps_per_second": 1.253, + "step": 7800 + }, + { + "epoch": 1.5, + "learning_rate": 0.0001511473934563219, + "loss": 0.7186, + "step": 7820 + }, + { + "epoch": 1.5, + "learning_rate": 0.00015076171498360866, + "loss": 0.7137, + "step": 7840 + }, + { + "epoch": 1.51, + "learning_rate": 0.00015037603651089538, + "loss": 0.7212, + "step": 7860 + }, + { + "epoch": 1.51, + "learning_rate": 0.00014999035803818216, + "loss": 0.7167, + "step": 7880 + }, + { + "epoch": 1.51, + "learning_rate": 0.0001496046795654689, + "loss": 0.7203, + "step": 7900 + }, + { + "epoch": 1.52, + "learning_rate": 0.00014921900109275566, + "loss": 0.714, + "step": 7920 + }, + { + "epoch": 1.52, + "learning_rate": 0.0001488333226200424, + "loss": 0.7153, + "step": 7940 + }, + { + "epoch": 1.53, + "learning_rate": 0.00014844764414732916, + "loss": 0.7176, + "step": 7960 + }, + { + "epoch": 1.53, + "learning_rate": 0.0001480619656746159, + "loss": 0.7049, + "step": 7980 + }, + { + "epoch": 1.53, + "learning_rate": 0.00014767628720190265, + "loss": 0.7204, + "step": 8000 + }, + { + "epoch": 1.53, + "eval_loss": 0.7486086487770081, + "eval_runtime": 25.5275, + "eval_samples_per_second": 78.347, + "eval_steps_per_second": 1.254, + "step": 8000 + }, + { + "epoch": 1.54, + "learning_rate": 0.00014729060872918943, + "loss": 0.7167, + "step": 8020 + }, + { + "epoch": 1.54, + "learning_rate": 0.00014690493025647618, + "loss": 0.72, + "step": 8040 + }, + { + "epoch": 1.54, + "learning_rate": 0.00014651925178376293, + "loss": 0.7203, + "step": 8060 + }, + { + "epoch": 1.55, + "learning_rate": 0.00014613357331104968, + "loss": 0.7258, + "step": 8080 + }, + { + "epoch": 1.55, + "learning_rate": 0.00014574789483833643, + "loss": 0.715, + "step": 8100 + }, + { + "epoch": 1.56, + "learning_rate": 0.00014536221636562318, + "loss": 0.7245, + "step": 8120 + }, + { + "epoch": 1.56, + "learning_rate": 0.00014497653789290993, + "loss": 0.7258, + "step": 8140 + }, + { + "epoch": 1.56, + "learning_rate": 0.00014459085942019668, + "loss": 0.7234, + "step": 8160 + }, + { + "epoch": 1.57, + "learning_rate": 0.00014420518094748343, + "loss": 0.7128, + "step": 8180 + }, + { + "epoch": 1.57, + "learning_rate": 0.00014381950247477018, + "loss": 0.7181, + "step": 8200 + }, + { + "epoch": 1.57, + "eval_loss": 0.7475513219833374, + "eval_runtime": 25.5412, + "eval_samples_per_second": 78.305, + "eval_steps_per_second": 1.253, + "step": 8200 + }, + { + "epoch": 1.58, + "learning_rate": 0.00014343382400205693, + "loss": 0.7236, + "step": 8220 + }, + { + "epoch": 1.58, + "learning_rate": 0.0001430481455293437, + "loss": 0.7125, + "step": 8240 + }, + { + "epoch": 1.58, + "learning_rate": 0.00014266246705663046, + "loss": 0.7186, + "step": 8260 + }, + { + "epoch": 1.59, + "learning_rate": 0.0001422767885839172, + "loss": 0.7203, + "step": 8280 + }, + { + "epoch": 1.59, + "learning_rate": 0.00014189111011120396, + "loss": 0.7156, + "step": 8300 + }, + { + "epoch": 1.59, + "learning_rate": 0.0001415054316384907, + "loss": 0.714, + "step": 8320 + }, + { + "epoch": 1.6, + "learning_rate": 0.00014111975316577745, + "loss": 0.7129, + "step": 8340 + }, + { + "epoch": 1.6, + "learning_rate": 0.0001407340746930642, + "loss": 0.7179, + "step": 8360 + }, + { + "epoch": 1.61, + "learning_rate": 0.00014036768014398662, + "loss": 0.7197, + "step": 8380 + }, + { + "epoch": 1.61, + "learning_rate": 0.00013998200167127337, + "loss": 0.7287, + "step": 8400 + }, + { + "epoch": 1.61, + "eval_loss": 0.7470650672912598, + "eval_runtime": 25.5238, + "eval_samples_per_second": 78.358, + "eval_steps_per_second": 1.254, + "step": 8400 + }, + { + "epoch": 1.61, + "learning_rate": 0.00013959632319856011, + "loss": 0.718, + "step": 8420 + }, + { + "epoch": 1.62, + "learning_rate": 0.00013921064472584686, + "loss": 0.7166, + "step": 8440 + }, + { + "epoch": 1.62, + "learning_rate": 0.00013882496625313361, + "loss": 0.7218, + "step": 8460 + }, + { + "epoch": 1.62, + "learning_rate": 0.0001384392877804204, + "loss": 0.723, + "step": 8480 + }, + { + "epoch": 1.63, + "learning_rate": 0.00013805360930770714, + "loss": 0.7104, + "step": 8500 + }, + { + "epoch": 1.63, + "learning_rate": 0.0001376679308349939, + "loss": 0.7136, + "step": 8520 + }, + { + "epoch": 1.64, + "learning_rate": 0.0001372822523622806, + "loss": 0.7237, + "step": 8540 + }, + { + "epoch": 1.64, + "learning_rate": 0.0001368965738895674, + "loss": 0.7196, + "step": 8560 + }, + { + "epoch": 1.64, + "learning_rate": 0.00013651089541685414, + "loss": 0.7218, + "step": 8580 + }, + { + "epoch": 1.65, + "learning_rate": 0.0001361252169441409, + "loss": 0.7132, + "step": 8600 + }, + { + "epoch": 1.65, + "eval_loss": 0.7465201020240784, + "eval_runtime": 25.542, + "eval_samples_per_second": 78.302, + "eval_steps_per_second": 1.253, + "step": 8600 + }, + { + "epoch": 1.65, + "learning_rate": 0.00013573953847142764, + "loss": 0.7139, + "step": 8620 + }, + { + "epoch": 1.66, + "learning_rate": 0.0001353538599987144, + "loss": 0.7093, + "step": 8640 + }, + { + "epoch": 1.66, + "learning_rate": 0.00013496818152600114, + "loss": 0.7243, + "step": 8660 + }, + { + "epoch": 1.66, + "learning_rate": 0.0001345825030532879, + "loss": 0.7127, + "step": 8680 + }, + { + "epoch": 1.67, + "learning_rate": 0.00013419682458057464, + "loss": 0.7148, + "step": 8700 + }, + { + "epoch": 1.67, + "learning_rate": 0.00013381114610786142, + "loss": 0.7236, + "step": 8720 + }, + { + "epoch": 1.67, + "learning_rate": 0.00013342546763514817, + "loss": 0.7103, + "step": 8740 + }, + { + "epoch": 1.68, + "learning_rate": 0.0001330397891624349, + "loss": 0.7133, + "step": 8760 + }, + { + "epoch": 1.68, + "learning_rate": 0.00013265411068972164, + "loss": 0.7182, + "step": 8780 + }, + { + "epoch": 1.69, + "learning_rate": 0.00013226843221700841, + "loss": 0.7198, + "step": 8800 + }, + { + "epoch": 1.69, + "eval_loss": 0.7450763583183289, + "eval_runtime": 25.4725, + "eval_samples_per_second": 78.516, + "eval_steps_per_second": 1.256, + "step": 8800 + }, + { + "epoch": 1.69, + "learning_rate": 0.00013188275374429516, + "loss": 0.7073, + "step": 8820 + }, + { + "epoch": 1.69, + "learning_rate": 0.00013149707527158191, + "loss": 0.7208, + "step": 8840 + }, + { + "epoch": 1.7, + "learning_rate": 0.00013111139679886866, + "loss": 0.7067, + "step": 8860 + }, + { + "epoch": 1.7, + "learning_rate": 0.0001307257183261554, + "loss": 0.7149, + "step": 8880 + }, + { + "epoch": 1.71, + "learning_rate": 0.00013034003985344216, + "loss": 0.7133, + "step": 8900 + }, + { + "epoch": 1.71, + "learning_rate": 0.0001299543613807289, + "loss": 0.7137, + "step": 8920 + }, + { + "epoch": 1.71, + "learning_rate": 0.0001295686829080157, + "loss": 0.719, + "step": 8940 + }, + { + "epoch": 1.72, + "learning_rate": 0.00012918300443530244, + "loss": 0.7198, + "step": 8960 + }, + { + "epoch": 1.72, + "learning_rate": 0.0001287973259625892, + "loss": 0.7074, + "step": 8980 + }, + { + "epoch": 1.72, + "learning_rate": 0.0001284116474898759, + "loss": 0.7216, + "step": 9000 + }, + { + "epoch": 1.72, + "eval_loss": 0.7454459071159363, + "eval_runtime": 25.5717, + "eval_samples_per_second": 78.211, + "eval_steps_per_second": 1.251, + "step": 9000 + }, + { + "epoch": 1.73, + "learning_rate": 0.0001280259690171627, + "loss": 0.7203, + "step": 9020 + }, + { + "epoch": 1.73, + "learning_rate": 0.00012764029054444944, + "loss": 0.7133, + "step": 9040 + }, + { + "epoch": 1.74, + "learning_rate": 0.0001272546120717362, + "loss": 0.7081, + "step": 9060 + }, + { + "epoch": 1.74, + "learning_rate": 0.00012686893359902294, + "loss": 0.7153, + "step": 9080 + }, + { + "epoch": 1.74, + "learning_rate": 0.0001264832551263097, + "loss": 0.7108, + "step": 9100 + }, + { + "epoch": 1.75, + "learning_rate": 0.00012609757665359644, + "loss": 0.7106, + "step": 9120 + }, + { + "epoch": 1.75, + "learning_rate": 0.0001257118981808832, + "loss": 0.7117, + "step": 9140 + }, + { + "epoch": 1.76, + "learning_rate": 0.00012532621970816994, + "loss": 0.7171, + "step": 9160 + }, + { + "epoch": 1.76, + "learning_rate": 0.00012494054123545671, + "loss": 0.7148, + "step": 9180 + }, + { + "epoch": 1.76, + "learning_rate": 0.00012455486276274346, + "loss": 0.714, + "step": 9200 + }, + { + "epoch": 1.76, + "eval_loss": 0.7446411848068237, + "eval_runtime": 25.5622, + "eval_samples_per_second": 78.24, + "eval_steps_per_second": 1.252, + "step": 9200 + }, + { + "epoch": 1.77, + "learning_rate": 0.00012416918429003019, + "loss": 0.7133, + "step": 9220 + }, + { + "epoch": 1.77, + "learning_rate": 0.00012378350581731694, + "loss": 0.7108, + "step": 9240 + }, + { + "epoch": 1.77, + "learning_rate": 0.0001233978273446037, + "loss": 0.7147, + "step": 9260 + }, + { + "epoch": 1.78, + "learning_rate": 0.00012301214887189046, + "loss": 0.715, + "step": 9280 + }, + { + "epoch": 1.78, + "learning_rate": 0.0001226264703991772, + "loss": 0.7255, + "step": 9300 + }, + { + "epoch": 1.79, + "learning_rate": 0.00012224079192646396, + "loss": 0.7168, + "step": 9320 + }, + { + "epoch": 1.79, + "learning_rate": 0.00012185511345375073, + "loss": 0.7155, + "step": 9340 + }, + { + "epoch": 1.79, + "learning_rate": 0.00012146943498103746, + "loss": 0.7064, + "step": 9360 + }, + { + "epoch": 1.8, + "learning_rate": 0.00012108375650832421, + "loss": 0.716, + "step": 9380 + }, + { + "epoch": 1.8, + "learning_rate": 0.00012069807803561096, + "loss": 0.7145, + "step": 9400 + }, + { + "epoch": 1.8, + "eval_loss": 0.7441000938415527, + "eval_runtime": 25.7378, + "eval_samples_per_second": 77.707, + "eval_steps_per_second": 1.243, + "step": 9400 + }, + { + "epoch": 1.8, + "learning_rate": 0.00012031239956289772, + "loss": 0.7135, + "step": 9420 + }, + { + "epoch": 1.81, + "learning_rate": 0.00011992672109018447, + "loss": 0.7164, + "step": 9440 + }, + { + "epoch": 1.81, + "learning_rate": 0.00011954104261747122, + "loss": 0.714, + "step": 9460 + }, + { + "epoch": 1.82, + "learning_rate": 0.00011915536414475797, + "loss": 0.7173, + "step": 9480 + }, + { + "epoch": 1.82, + "learning_rate": 0.00011876968567204474, + "loss": 0.7102, + "step": 9500 + }, + { + "epoch": 1.82, + "learning_rate": 0.00011838400719933149, + "loss": 0.7122, + "step": 9520 + }, + { + "epoch": 1.83, + "learning_rate": 0.00011799832872661822, + "loss": 0.7197, + "step": 9540 + }, + { + "epoch": 1.83, + "learning_rate": 0.00011761265025390497, + "loss": 0.7132, + "step": 9560 + }, + { + "epoch": 1.84, + "learning_rate": 0.00011722697178119174, + "loss": 0.7255, + "step": 9580 + }, + { + "epoch": 1.84, + "learning_rate": 0.00011684129330847849, + "loss": 0.7175, + "step": 9600 + }, + { + "epoch": 1.84, + "eval_loss": 0.7432481646537781, + "eval_runtime": 25.5076, + "eval_samples_per_second": 78.408, + "eval_steps_per_second": 1.255, + "step": 9600 + }, + { + "epoch": 1.84, + "learning_rate": 0.00011645561483576524, + "loss": 0.7125, + "step": 9620 + }, + { + "epoch": 1.85, + "learning_rate": 0.00011606993636305199, + "loss": 0.7119, + "step": 9640 + }, + { + "epoch": 1.85, + "learning_rate": 0.00011568425789033875, + "loss": 0.7147, + "step": 9660 + }, + { + "epoch": 1.85, + "learning_rate": 0.0001152985794176255, + "loss": 0.7101, + "step": 9680 + }, + { + "epoch": 1.86, + "learning_rate": 0.00011491290094491225, + "loss": 0.7105, + "step": 9700 + }, + { + "epoch": 1.86, + "learning_rate": 0.000114527222472199, + "loss": 0.7153, + "step": 9720 + }, + { + "epoch": 1.87, + "learning_rate": 0.00011414154399948576, + "loss": 0.7047, + "step": 9740 + }, + { + "epoch": 1.87, + "learning_rate": 0.00011375586552677251, + "loss": 0.6967, + "step": 9760 + }, + { + "epoch": 1.87, + "learning_rate": 0.00011337018705405925, + "loss": 0.7094, + "step": 9780 + }, + { + "epoch": 1.88, + "learning_rate": 0.000112984508581346, + "loss": 0.7195, + "step": 9800 + }, + { + "epoch": 1.88, + "eval_loss": 0.7431700229644775, + "eval_runtime": 25.5752, + "eval_samples_per_second": 78.201, + "eval_steps_per_second": 1.251, + "step": 9800 + }, + { + "epoch": 1.88, + "learning_rate": 0.00011259883010863276, + "loss": 0.7122, + "step": 9820 + }, + { + "epoch": 1.89, + "learning_rate": 0.00011221315163591951, + "loss": 0.7193, + "step": 9840 + }, + { + "epoch": 1.89, + "learning_rate": 0.00011182747316320626, + "loss": 0.7147, + "step": 9860 + }, + { + "epoch": 1.89, + "learning_rate": 0.00011144179469049301, + "loss": 0.7058, + "step": 9880 + }, + { + "epoch": 1.9, + "learning_rate": 0.00011105611621777977, + "loss": 0.7106, + "step": 9900 + }, + { + "epoch": 1.9, + "learning_rate": 0.00011067043774506652, + "loss": 0.71, + "step": 9920 + }, + { + "epoch": 1.9, + "learning_rate": 0.00011028475927235327, + "loss": 0.7182, + "step": 9940 + }, + { + "epoch": 1.91, + "learning_rate": 0.00010989908079964002, + "loss": 0.7048, + "step": 9960 + }, + { + "epoch": 1.91, + "learning_rate": 0.00010951340232692679, + "loss": 0.7165, + "step": 9980 + }, + { + "epoch": 1.92, + "learning_rate": 0.00010912772385421352, + "loss": 0.7153, + "step": 10000 + }, + { + "epoch": 1.92, + "eval_loss": 0.7425808310508728, + "eval_runtime": 25.6297, + "eval_samples_per_second": 78.034, + "eval_steps_per_second": 1.249, + "step": 10000 + }, + { + "epoch": 1.92, + "learning_rate": 0.00010874204538150027, + "loss": 0.7127, + "step": 10020 + }, + { + "epoch": 1.92, + "learning_rate": 0.00010835636690878702, + "loss": 0.7062, + "step": 10040 + }, + { + "epoch": 1.93, + "learning_rate": 0.00010797068843607378, + "loss": 0.7125, + "step": 10060 + }, + { + "epoch": 1.93, + "learning_rate": 0.00010758500996336053, + "loss": 0.7114, + "step": 10080 + }, + { + "epoch": 1.94, + "learning_rate": 0.00010719933149064728, + "loss": 0.7096, + "step": 10100 + }, + { + "epoch": 1.94, + "learning_rate": 0.00010681365301793405, + "loss": 0.7119, + "step": 10120 + }, + { + "epoch": 1.94, + "learning_rate": 0.0001064279745452208, + "loss": 0.7034, + "step": 10140 + }, + { + "epoch": 1.95, + "learning_rate": 0.00010604229607250755, + "loss": 0.7049, + "step": 10160 + }, + { + "epoch": 1.95, + "learning_rate": 0.0001056566175997943, + "loss": 0.7156, + "step": 10180 + }, + { + "epoch": 1.95, + "learning_rate": 0.00010527093912708106, + "loss": 0.718, + "step": 10200 + }, + { + "epoch": 1.95, + "eval_loss": 0.7418650984764099, + "eval_runtime": 25.554, + "eval_samples_per_second": 78.266, + "eval_steps_per_second": 1.252, + "step": 10200 + }, + { + "epoch": 1.96, + "learning_rate": 0.00010488526065436781, + "loss": 0.7141, + "step": 10220 + }, + { + "epoch": 1.96, + "learning_rate": 0.00010449958218165455, + "loss": 0.7073, + "step": 10240 + }, + { + "epoch": 1.97, + "learning_rate": 0.0001041139037089413, + "loss": 0.7129, + "step": 10260 + }, + { + "epoch": 1.97, + "learning_rate": 0.00010372822523622806, + "loss": 0.7174, + "step": 10280 + }, + { + "epoch": 1.97, + "learning_rate": 0.00010334254676351481, + "loss": 0.7112, + "step": 10300 + }, + { + "epoch": 1.98, + "learning_rate": 0.00010295686829080156, + "loss": 0.7073, + "step": 10320 + }, + { + "epoch": 1.98, + "learning_rate": 0.00010257118981808831, + "loss": 0.7164, + "step": 10340 + }, + { + "epoch": 1.99, + "learning_rate": 0.00010218551134537507, + "loss": 0.7057, + "step": 10360 + }, + { + "epoch": 1.99, + "learning_rate": 0.00010179983287266182, + "loss": 0.709, + "step": 10380 + }, + { + "epoch": 1.99, + "learning_rate": 0.00010141415439994857, + "loss": 0.7147, + "step": 10400 + }, + { + "epoch": 1.99, + "eval_loss": 0.7417293787002563, + "eval_runtime": 25.4964, + "eval_samples_per_second": 78.443, + "eval_steps_per_second": 1.255, + "step": 10400 + }, + { + "epoch": 2.0, + "learning_rate": 0.00010102847592723531, + "loss": 0.713, + "step": 10420 + }, + { + "epoch": 2.0, + "learning_rate": 0.00010064279745452208, + "loss": 0.7128, + "step": 10440 + }, + { + "epoch": 2.0, + "learning_rate": 0.00010025711898180882, + "loss": 0.7094, + "step": 10460 + }, + { + "epoch": 2.01, + "learning_rate": 9.987144050909557e-05, + "loss": 0.7008, + "step": 10480 + }, + { + "epoch": 2.01, + "learning_rate": 9.948576203638232e-05, + "loss": 0.7083, + "step": 10500 + }, + { + "epoch": 2.02, + "learning_rate": 9.910008356366908e-05, + "loss": 0.7049, + "step": 10520 + }, + { + "epoch": 2.02, + "learning_rate": 9.871440509095583e-05, + "loss": 0.7041, + "step": 10540 + }, + { + "epoch": 2.02, + "learning_rate": 9.834801054187824e-05, + "loss": 0.7105, + "step": 10560 + }, + { + "epoch": 2.03, + "learning_rate": 9.7962332069165e-05, + "loss": 0.7041, + "step": 10580 + }, + { + "epoch": 2.03, + "learning_rate": 9.757665359645176e-05, + "loss": 0.7103, + "step": 10600 + }, + { + "epoch": 2.03, + "eval_loss": 0.7410894632339478, + "eval_runtime": 25.5424, + "eval_samples_per_second": 78.301, + "eval_steps_per_second": 1.253, + "step": 10600 + }, + { + "epoch": 2.03, + "learning_rate": 9.71909751237385e-05, + "loss": 0.7037, + "step": 10620 + }, + { + "epoch": 2.04, + "learning_rate": 9.680529665102524e-05, + "loss": 0.7078, + "step": 10640 + }, + { + "epoch": 2.04, + "learning_rate": 9.641961817831202e-05, + "loss": 0.7116, + "step": 10660 + }, + { + "epoch": 2.05, + "learning_rate": 9.603393970559876e-05, + "loss": 0.7094, + "step": 10680 + }, + { + "epoch": 2.05, + "learning_rate": 9.56482612328855e-05, + "loss": 0.7217, + "step": 10700 + }, + { + "epoch": 2.05, + "learning_rate": 9.526258276017226e-05, + "loss": 0.7038, + "step": 10720 + }, + { + "epoch": 2.06, + "learning_rate": 9.487690428745902e-05, + "loss": 0.7131, + "step": 10740 + }, + { + "epoch": 2.06, + "learning_rate": 9.449122581474577e-05, + "loss": 0.7051, + "step": 10760 + }, + { + "epoch": 2.07, + "learning_rate": 9.410554734203252e-05, + "loss": 0.7058, + "step": 10780 + }, + { + "epoch": 2.07, + "learning_rate": 9.371986886931927e-05, + "loss": 0.7039, + "step": 10800 + }, + { + "epoch": 2.07, + "eval_loss": 0.7405736446380615, + "eval_runtime": 25.7467, + "eval_samples_per_second": 77.68, + "eval_steps_per_second": 1.243, + "step": 10800 + }, + { + "epoch": 2.07, + "learning_rate": 9.333419039660603e-05, + "loss": 0.7101, + "step": 10820 + }, + { + "epoch": 2.08, + "learning_rate": 9.294851192389278e-05, + "loss": 0.6991, + "step": 10840 + }, + { + "epoch": 2.08, + "learning_rate": 9.256283345117953e-05, + "loss": 0.7069, + "step": 10860 + }, + { + "epoch": 2.08, + "learning_rate": 9.217715497846627e-05, + "loss": 0.7094, + "step": 10880 + }, + { + "epoch": 2.09, + "learning_rate": 9.179147650575303e-05, + "loss": 0.7103, + "step": 10900 + }, + { + "epoch": 2.09, + "learning_rate": 9.140579803303978e-05, + "loss": 0.7015, + "step": 10920 + }, + { + "epoch": 2.1, + "learning_rate": 9.102011956032653e-05, + "loss": 0.712, + "step": 10940 + }, + { + "epoch": 2.1, + "learning_rate": 9.063444108761328e-05, + "loss": 0.707, + "step": 10960 + }, + { + "epoch": 2.1, + "learning_rate": 9.024876261490004e-05, + "loss": 0.7009, + "step": 10980 + }, + { + "epoch": 2.11, + "learning_rate": 8.986308414218679e-05, + "loss": 0.7062, + "step": 11000 + }, + { + "epoch": 2.11, + "eval_loss": 0.7398320436477661, + "eval_runtime": 25.5459, + "eval_samples_per_second": 78.29, + "eval_steps_per_second": 1.253, + "step": 11000 + }, + { + "epoch": 2.11, + "learning_rate": 8.947740566947354e-05, + "loss": 0.7054, + "step": 11020 + }, + { + "epoch": 2.12, + "learning_rate": 8.909172719676029e-05, + "loss": 0.7094, + "step": 11040 + }, + { + "epoch": 2.12, + "learning_rate": 8.870604872404706e-05, + "loss": 0.7059, + "step": 11060 + }, + { + "epoch": 2.12, + "learning_rate": 8.83203702513338e-05, + "loss": 0.7202, + "step": 11080 + }, + { + "epoch": 2.13, + "learning_rate": 8.793469177862054e-05, + "loss": 0.699, + "step": 11100 + }, + { + "epoch": 2.13, + "learning_rate": 8.754901330590729e-05, + "loss": 0.7137, + "step": 11120 + }, + { + "epoch": 2.13, + "learning_rate": 8.716333483319405e-05, + "loss": 0.7048, + "step": 11140 + }, + { + "epoch": 2.14, + "learning_rate": 8.67776563604808e-05, + "loss": 0.7089, + "step": 11160 + }, + { + "epoch": 2.14, + "learning_rate": 8.639197788776755e-05, + "loss": 0.7057, + "step": 11180 + }, + { + "epoch": 2.15, + "learning_rate": 8.60062994150543e-05, + "loss": 0.709, + "step": 11200 + }, + { + "epoch": 2.15, + "eval_loss": 0.7393301725387573, + "eval_runtime": 25.7257, + "eval_samples_per_second": 77.743, + "eval_steps_per_second": 1.244, + "step": 11200 + }, + { + "epoch": 2.15, + "learning_rate": 8.562062094234107e-05, + "loss": 0.7027, + "step": 11220 + }, + { + "epoch": 2.15, + "learning_rate": 8.523494246962782e-05, + "loss": 0.7082, + "step": 11240 + }, + { + "epoch": 2.16, + "learning_rate": 8.484926399691457e-05, + "loss": 0.7007, + "step": 11260 + }, + { + "epoch": 2.16, + "learning_rate": 8.446358552420132e-05, + "loss": 0.7011, + "step": 11280 + }, + { + "epoch": 2.17, + "learning_rate": 8.407790705148808e-05, + "loss": 0.7067, + "step": 11300 + }, + { + "epoch": 2.17, + "learning_rate": 8.369222857877483e-05, + "loss": 0.702, + "step": 11320 + }, + { + "epoch": 2.17, + "learning_rate": 8.330655010606157e-05, + "loss": 0.7126, + "step": 11340 + }, + { + "epoch": 2.18, + "learning_rate": 8.292087163334832e-05, + "loss": 0.6947, + "step": 11360 + }, + { + "epoch": 2.18, + "learning_rate": 8.253519316063508e-05, + "loss": 0.7033, + "step": 11380 + }, + { + "epoch": 2.18, + "learning_rate": 8.214951468792183e-05, + "loss": 0.7075, + "step": 11400 + }, + { + "epoch": 2.18, + "eval_loss": 0.7390503883361816, + "eval_runtime": 25.6097, + "eval_samples_per_second": 78.095, + "eval_steps_per_second": 1.25, + "step": 11400 + }, + { + "epoch": 2.19, + "learning_rate": 8.176383621520858e-05, + "loss": 0.7081, + "step": 11420 + }, + { + "epoch": 2.19, + "learning_rate": 8.137815774249533e-05, + "loss": 0.7114, + "step": 11440 + }, + { + "epoch": 2.2, + "learning_rate": 8.099247926978209e-05, + "loss": 0.7105, + "step": 11460 + }, + { + "epoch": 2.2, + "learning_rate": 8.060680079706884e-05, + "loss": 0.7113, + "step": 11480 + }, + { + "epoch": 2.2, + "learning_rate": 8.022112232435559e-05, + "loss": 0.7109, + "step": 11500 + }, + { + "epoch": 2.21, + "learning_rate": 7.983544385164233e-05, + "loss": 0.7039, + "step": 11520 + }, + { + "epoch": 2.21, + "learning_rate": 7.94497653789291e-05, + "loss": 0.7144, + "step": 11540 + }, + { + "epoch": 2.21, + "learning_rate": 7.906408690621584e-05, + "loss": 0.7003, + "step": 11560 + }, + { + "epoch": 2.22, + "learning_rate": 7.867840843350259e-05, + "loss": 0.7028, + "step": 11580 + }, + { + "epoch": 2.22, + "learning_rate": 7.829272996078934e-05, + "loss": 0.7018, + "step": 11600 + }, + { + "epoch": 2.22, + "eval_loss": 0.7388148307800293, + "eval_runtime": 25.5069, + "eval_samples_per_second": 78.41, + "eval_steps_per_second": 1.255, + "step": 11600 + }, + { + "epoch": 2.23, + "learning_rate": 7.79070514880761e-05, + "loss": 0.7113, + "step": 11620 + }, + { + "epoch": 2.23, + "learning_rate": 7.752137301536285e-05, + "loss": 0.7136, + "step": 11640 + }, + { + "epoch": 2.23, + "learning_rate": 7.71356945426496e-05, + "loss": 0.7097, + "step": 11660 + }, + { + "epoch": 2.24, + "learning_rate": 7.675001606993635e-05, + "loss": 0.7057, + "step": 11680 + }, + { + "epoch": 2.24, + "learning_rate": 7.636433759722312e-05, + "loss": 0.7028, + "step": 11700 + }, + { + "epoch": 2.25, + "learning_rate": 7.597865912450986e-05, + "loss": 0.708, + "step": 11720 + }, + { + "epoch": 2.25, + "learning_rate": 7.559298065179661e-05, + "loss": 0.7088, + "step": 11740 + }, + { + "epoch": 2.25, + "learning_rate": 7.520730217908335e-05, + "loss": 0.7024, + "step": 11760 + }, + { + "epoch": 2.26, + "learning_rate": 7.482162370637011e-05, + "loss": 0.7016, + "step": 11780 + }, + { + "epoch": 2.26, + "learning_rate": 7.443594523365686e-05, + "loss": 0.7132, + "step": 11800 + }, + { + "epoch": 2.26, + "eval_loss": 0.7381731271743774, + "eval_runtime": 25.4976, + "eval_samples_per_second": 78.439, + "eval_steps_per_second": 1.255, + "step": 11800 + }, + { + "epoch": 2.26, + "learning_rate": 7.405026676094361e-05, + "loss": 0.6969, + "step": 11820 + }, + { + "epoch": 2.27, + "learning_rate": 7.366458828823038e-05, + "loss": 0.7042, + "step": 11840 + }, + { + "epoch": 2.27, + "learning_rate": 7.327890981551713e-05, + "loss": 0.7088, + "step": 11860 + }, + { + "epoch": 2.28, + "learning_rate": 7.289323134280388e-05, + "loss": 0.7109, + "step": 11880 + }, + { + "epoch": 2.28, + "learning_rate": 7.250755287009063e-05, + "loss": 0.7046, + "step": 11900 + }, + { + "epoch": 2.28, + "learning_rate": 7.212187439737738e-05, + "loss": 0.706, + "step": 11920 + }, + { + "epoch": 2.29, + "learning_rate": 7.173619592466414e-05, + "loss": 0.7045, + "step": 11940 + }, + { + "epoch": 2.29, + "learning_rate": 7.135051745195089e-05, + "loss": 0.7121, + "step": 11960 + }, + { + "epoch": 2.3, + "learning_rate": 7.096483897923764e-05, + "loss": 0.6946, + "step": 11980 + }, + { + "epoch": 2.3, + "learning_rate": 7.057916050652439e-05, + "loss": 0.7003, + "step": 12000 + }, + { + "epoch": 2.3, + "eval_loss": 0.7378225922584534, + "eval_runtime": 25.5221, + "eval_samples_per_second": 78.363, + "eval_steps_per_second": 1.254, + "step": 12000 + }, + { + "epoch": 2.3, + "learning_rate": 7.019348203381114e-05, + "loss": 0.7147, + "step": 12020 + }, + { + "epoch": 2.31, + "learning_rate": 6.980780356109789e-05, + "loss": 0.7066, + "step": 12040 + }, + { + "epoch": 2.31, + "learning_rate": 6.942212508838465e-05, + "loss": 0.6997, + "step": 12060 + }, + { + "epoch": 2.31, + "learning_rate": 6.90364466156714e-05, + "loss": 0.7083, + "step": 12080 + }, + { + "epoch": 2.32, + "learning_rate": 6.865076814295815e-05, + "loss": 0.6991, + "step": 12100 + }, + { + "epoch": 2.32, + "learning_rate": 6.82650896702449e-05, + "loss": 0.6982, + "step": 12120 + }, + { + "epoch": 2.33, + "learning_rate": 6.787941119753165e-05, + "loss": 0.7028, + "step": 12140 + }, + { + "epoch": 2.33, + "learning_rate": 6.74937327248184e-05, + "loss": 0.704, + "step": 12160 + }, + { + "epoch": 2.33, + "learning_rate": 6.710805425210516e-05, + "loss": 0.7084, + "step": 12180 + }, + { + "epoch": 2.34, + "learning_rate": 6.672237577939191e-05, + "loss": 0.7061, + "step": 12200 + }, + { + "epoch": 2.34, + "eval_loss": 0.7376002669334412, + "eval_runtime": 25.5156, + "eval_samples_per_second": 78.383, + "eval_steps_per_second": 1.254, + "step": 12200 + }, + { + "epoch": 2.34, + "learning_rate": 6.633669730667866e-05, + "loss": 0.7017, + "step": 12220 + }, + { + "epoch": 2.35, + "learning_rate": 6.595101883396541e-05, + "loss": 0.6949, + "step": 12240 + }, + { + "epoch": 2.35, + "learning_rate": 6.556534036125216e-05, + "loss": 0.6985, + "step": 12260 + }, + { + "epoch": 2.35, + "learning_rate": 6.517966188853891e-05, + "loss": 0.7075, + "step": 12280 + }, + { + "epoch": 2.36, + "learning_rate": 6.479398341582568e-05, + "loss": 0.6997, + "step": 12300 + }, + { + "epoch": 2.36, + "learning_rate": 6.440830494311241e-05, + "loss": 0.7045, + "step": 12320 + }, + { + "epoch": 2.36, + "learning_rate": 6.402262647039918e-05, + "loss": 0.7148, + "step": 12340 + }, + { + "epoch": 2.37, + "learning_rate": 6.363694799768592e-05, + "loss": 0.7085, + "step": 12360 + }, + { + "epoch": 2.37, + "learning_rate": 6.325126952497267e-05, + "loss": 0.7062, + "step": 12380 + }, + { + "epoch": 2.38, + "learning_rate": 6.286559105225942e-05, + "loss": 0.7092, + "step": 12400 + }, + { + "epoch": 2.38, + "eval_loss": 0.7370800971984863, + "eval_runtime": 25.5432, + "eval_samples_per_second": 78.299, + "eval_steps_per_second": 1.253, + "step": 12400 + }, + { + "epoch": 2.38, + "learning_rate": 6.247991257954619e-05, + "loss": 0.7069, + "step": 12420 + }, + { + "epoch": 2.38, + "learning_rate": 6.209423410683292e-05, + "loss": 0.7083, + "step": 12440 + }, + { + "epoch": 2.39, + "learning_rate": 6.170855563411969e-05, + "loss": 0.7126, + "step": 12460 + }, + { + "epoch": 2.39, + "learning_rate": 6.132287716140644e-05, + "loss": 0.7062, + "step": 12480 + }, + { + "epoch": 2.4, + "learning_rate": 6.0937198688693187e-05, + "loss": 0.7149, + "step": 12500 + }, + { + "epoch": 2.4, + "learning_rate": 6.0551520215979936e-05, + "loss": 0.7111, + "step": 12520 + }, + { + "epoch": 2.4, + "learning_rate": 6.016584174326669e-05, + "loss": 0.7059, + "step": 12540 + }, + { + "epoch": 2.41, + "learning_rate": 5.978016327055344e-05, + "loss": 0.7169, + "step": 12560 + }, + { + "epoch": 2.41, + "learning_rate": 5.93944847978402e-05, + "loss": 0.7052, + "step": 12580 + }, + { + "epoch": 2.41, + "learning_rate": 5.900880632512694e-05, + "loss": 0.7019, + "step": 12600 + }, + { + "epoch": 2.41, + "eval_loss": 0.7369959354400635, + "eval_runtime": 25.5241, + "eval_samples_per_second": 78.357, + "eval_steps_per_second": 1.254, + "step": 12600 + }, + { + "epoch": 2.42, + "learning_rate": 5.86231278524137e-05, + "loss": 0.7026, + "step": 12620 + }, + { + "epoch": 2.42, + "learning_rate": 5.823744937970045e-05, + "loss": 0.6981, + "step": 12640 + }, + { + "epoch": 2.43, + "learning_rate": 5.7851770906987205e-05, + "loss": 0.7044, + "step": 12660 + }, + { + "epoch": 2.43, + "learning_rate": 5.7466092434273955e-05, + "loss": 0.7087, + "step": 12680 + }, + { + "epoch": 2.43, + "learning_rate": 5.708041396156071e-05, + "loss": 0.7039, + "step": 12700 + }, + { + "epoch": 2.44, + "learning_rate": 5.6694735488847454e-05, + "loss": 0.7015, + "step": 12720 + }, + { + "epoch": 2.44, + "learning_rate": 5.630905701613421e-05, + "loss": 0.7053, + "step": 12740 + }, + { + "epoch": 2.44, + "learning_rate": 5.592337854342096e-05, + "loss": 0.7037, + "step": 12760 + }, + { + "epoch": 2.45, + "learning_rate": 5.553770007070772e-05, + "loss": 0.6938, + "step": 12780 + }, + { + "epoch": 2.45, + "learning_rate": 5.515202159799447e-05, + "loss": 0.7063, + "step": 12800 + }, + { + "epoch": 2.45, + "eval_loss": 0.7364639639854431, + "eval_runtime": 25.4856, + "eval_samples_per_second": 78.476, + "eval_steps_per_second": 1.256, + "step": 12800 + }, + { + "epoch": 2.46, + "learning_rate": 5.476634312528122e-05, + "loss": 0.7013, + "step": 12820 + }, + { + "epoch": 2.46, + "learning_rate": 5.4380664652567966e-05, + "loss": 0.7012, + "step": 12840 + }, + { + "epoch": 2.46, + "learning_rate": 5.399498617985472e-05, + "loss": 0.7, + "step": 12860 + }, + { + "epoch": 2.47, + "learning_rate": 5.360930770714147e-05, + "loss": 0.7017, + "step": 12880 + }, + { + "epoch": 2.47, + "learning_rate": 5.322362923442823e-05, + "loss": 0.7145, + "step": 12900 + }, + { + "epoch": 2.48, + "learning_rate": 5.283795076171498e-05, + "loss": 0.7156, + "step": 12920 + }, + { + "epoch": 2.48, + "learning_rate": 5.247155621263739e-05, + "loss": 0.6965, + "step": 12940 + }, + { + "epoch": 2.48, + "learning_rate": 5.2085877739924146e-05, + "loss": 0.7001, + "step": 12960 + }, + { + "epoch": 2.49, + "learning_rate": 5.1700199267210896e-05, + "loss": 0.7012, + "step": 12980 + }, + { + "epoch": 2.49, + "learning_rate": 5.131452079449765e-05, + "loss": 0.6939, + "step": 13000 + }, + { + "epoch": 2.49, + "eval_loss": 0.7364306449890137, + "eval_runtime": 25.5093, + "eval_samples_per_second": 78.403, + "eval_steps_per_second": 1.254, + "step": 13000 + }, + { + "epoch": 2.49, + "learning_rate": 5.09288423217844e-05, + "loss": 0.7084, + "step": 13020 + }, + { + "epoch": 2.5, + "learning_rate": 5.054316384907115e-05, + "loss": 0.6987, + "step": 13040 + }, + { + "epoch": 2.5, + "learning_rate": 5.01574853763579e-05, + "loss": 0.7087, + "step": 13060 + }, + { + "epoch": 2.51, + "learning_rate": 4.977180690364466e-05, + "loss": 0.7028, + "step": 13080 + }, + { + "epoch": 2.51, + "learning_rate": 4.938612843093141e-05, + "loss": 0.7012, + "step": 13100 + }, + { + "epoch": 2.51, + "learning_rate": 4.9000449958218165e-05, + "loss": 0.6959, + "step": 13120 + }, + { + "epoch": 2.52, + "learning_rate": 4.861477148550491e-05, + "loss": 0.7056, + "step": 13140 + }, + { + "epoch": 2.52, + "learning_rate": 4.8229093012791664e-05, + "loss": 0.716, + "step": 13160 + }, + { + "epoch": 2.53, + "learning_rate": 4.7843414540078414e-05, + "loss": 0.7144, + "step": 13180 + }, + { + "epoch": 2.53, + "learning_rate": 4.745773606736517e-05, + "loss": 0.6969, + "step": 13200 + }, + { + "epoch": 2.53, + "eval_loss": 0.7360122203826904, + "eval_runtime": 25.4878, + "eval_samples_per_second": 78.469, + "eval_steps_per_second": 1.256, + "step": 13200 + }, + { + "epoch": 2.53, + "learning_rate": 4.707205759465192e-05, + "loss": 0.6993, + "step": 13220 + }, + { + "epoch": 2.54, + "learning_rate": 4.668637912193868e-05, + "loss": 0.7013, + "step": 13240 + }, + { + "epoch": 2.54, + "learning_rate": 4.630070064922542e-05, + "loss": 0.7033, + "step": 13260 + }, + { + "epoch": 2.54, + "learning_rate": 4.5915022176512176e-05, + "loss": 0.7067, + "step": 13280 + }, + { + "epoch": 2.55, + "learning_rate": 4.5529343703798926e-05, + "loss": 0.6886, + "step": 13300 + }, + { + "epoch": 2.55, + "learning_rate": 4.514366523108568e-05, + "loss": 0.7061, + "step": 13320 + }, + { + "epoch": 2.56, + "learning_rate": 4.475798675837243e-05, + "loss": 0.7027, + "step": 13340 + }, + { + "epoch": 2.56, + "learning_rate": 4.437230828565919e-05, + "loss": 0.6982, + "step": 13360 + }, + { + "epoch": 2.56, + "learning_rate": 4.398662981294593e-05, + "loss": 0.7042, + "step": 13380 + }, + { + "epoch": 2.57, + "learning_rate": 4.360095134023269e-05, + "loss": 0.6956, + "step": 13400 + }, + { + "epoch": 2.57, + "eval_loss": 0.7356610298156738, + "eval_runtime": 25.5629, + "eval_samples_per_second": 78.238, + "eval_steps_per_second": 1.252, + "step": 13400 + }, + { + "epoch": 2.57, + "learning_rate": 4.321527286751944e-05, + "loss": 0.7046, + "step": 13420 + }, + { + "epoch": 2.58, + "learning_rate": 4.2829594394806195e-05, + "loss": 0.7053, + "step": 13440 + }, + { + "epoch": 2.58, + "learning_rate": 4.2443915922092944e-05, + "loss": 0.707, + "step": 13460 + }, + { + "epoch": 2.58, + "learning_rate": 4.20582374493797e-05, + "loss": 0.7123, + "step": 13480 + }, + { + "epoch": 2.59, + "learning_rate": 4.1672558976666444e-05, + "loss": 0.7032, + "step": 13500 + }, + { + "epoch": 2.59, + "learning_rate": 4.12868805039532e-05, + "loss": 0.6942, + "step": 13520 + }, + { + "epoch": 2.59, + "learning_rate": 4.090120203123995e-05, + "loss": 0.6981, + "step": 13540 + }, + { + "epoch": 2.6, + "learning_rate": 4.051552355852671e-05, + "loss": 0.7052, + "step": 13560 + }, + { + "epoch": 2.6, + "learning_rate": 4.012984508581345e-05, + "loss": 0.7044, + "step": 13580 + }, + { + "epoch": 2.61, + "learning_rate": 3.9744166613100206e-05, + "loss": 0.6978, + "step": 13600 + }, + { + "epoch": 2.61, + "eval_loss": 0.7352051734924316, + "eval_runtime": 25.5016, + "eval_samples_per_second": 78.426, + "eval_steps_per_second": 1.255, + "step": 13600 + }, + { + "epoch": 2.61, + "learning_rate": 3.9358488140386956e-05, + "loss": 0.7001, + "step": 13620 + }, + { + "epoch": 2.61, + "learning_rate": 3.897280966767371e-05, + "loss": 0.7065, + "step": 13640 + }, + { + "epoch": 2.62, + "learning_rate": 3.858713119496046e-05, + "loss": 0.6999, + "step": 13660 + }, + { + "epoch": 2.62, + "learning_rate": 3.820145272224722e-05, + "loss": 0.7104, + "step": 13680 + }, + { + "epoch": 2.63, + "learning_rate": 3.781577424953396e-05, + "loss": 0.7079, + "step": 13700 + }, + { + "epoch": 2.63, + "learning_rate": 3.743009577682072e-05, + "loss": 0.7059, + "step": 13720 + }, + { + "epoch": 2.63, + "learning_rate": 3.7063701227743136e-05, + "loss": 0.7088, + "step": 13740 + }, + { + "epoch": 2.64, + "learning_rate": 3.6678022755029886e-05, + "loss": 0.7051, + "step": 13760 + }, + { + "epoch": 2.64, + "learning_rate": 3.629234428231664e-05, + "loss": 0.7004, + "step": 13780 + }, + { + "epoch": 2.64, + "learning_rate": 3.590666580960339e-05, + "loss": 0.7, + "step": 13800 + }, + { + "epoch": 2.64, + "eval_loss": 0.7350977659225464, + "eval_runtime": 25.4618, + "eval_samples_per_second": 78.549, + "eval_steps_per_second": 1.257, + "step": 13800 + }, + { + "epoch": 2.65, + "learning_rate": 3.552098733689014e-05, + "loss": 0.7044, + "step": 13820 + }, + { + "epoch": 2.65, + "learning_rate": 3.51353088641769e-05, + "loss": 0.6967, + "step": 13840 + }, + { + "epoch": 2.66, + "learning_rate": 3.474963039146365e-05, + "loss": 0.6932, + "step": 13860 + }, + { + "epoch": 2.66, + "learning_rate": 3.43639519187504e-05, + "loss": 0.6982, + "step": 13880 + }, + { + "epoch": 2.66, + "learning_rate": 3.3978273446037154e-05, + "loss": 0.7064, + "step": 13900 + }, + { + "epoch": 2.67, + "learning_rate": 3.3592594973323904e-05, + "loss": 0.7064, + "step": 13920 + }, + { + "epoch": 2.67, + "learning_rate": 3.3206916500610654e-05, + "loss": 0.6975, + "step": 13940 + }, + { + "epoch": 2.67, + "learning_rate": 3.282123802789741e-05, + "loss": 0.7023, + "step": 13960 + }, + { + "epoch": 2.68, + "learning_rate": 3.243555955518416e-05, + "loss": 0.706, + "step": 13980 + }, + { + "epoch": 2.68, + "learning_rate": 3.204988108247091e-05, + "loss": 0.696, + "step": 14000 + }, + { + "epoch": 2.68, + "eval_loss": 0.7347920536994934, + "eval_runtime": 25.5132, + "eval_samples_per_second": 78.391, + "eval_steps_per_second": 1.254, + "step": 14000 + }, + { + "epoch": 2.69, + "learning_rate": 3.1664202609757666e-05, + "loss": 0.6995, + "step": 14020 + }, + { + "epoch": 2.69, + "learning_rate": 3.1278524137044416e-05, + "loss": 0.7022, + "step": 14040 + }, + { + "epoch": 2.69, + "learning_rate": 3.0892845664331166e-05, + "loss": 0.7086, + "step": 14060 + }, + { + "epoch": 2.7, + "learning_rate": 3.050716719161792e-05, + "loss": 0.7135, + "step": 14080 + }, + { + "epoch": 2.7, + "learning_rate": 3.0121488718904672e-05, + "loss": 0.7036, + "step": 14100 + }, + { + "epoch": 2.71, + "learning_rate": 2.9735810246191422e-05, + "loss": 0.6979, + "step": 14120 + }, + { + "epoch": 2.71, + "learning_rate": 2.9350131773478175e-05, + "loss": 0.7082, + "step": 14140 + }, + { + "epoch": 2.71, + "learning_rate": 2.8964453300764928e-05, + "loss": 0.7008, + "step": 14160 + }, + { + "epoch": 2.72, + "learning_rate": 2.8578774828051678e-05, + "loss": 0.7085, + "step": 14180 + }, + { + "epoch": 2.72, + "learning_rate": 2.819309635533843e-05, + "loss": 0.6983, + "step": 14200 + }, + { + "epoch": 2.72, + "eval_loss": 0.73465496301651, + "eval_runtime": 25.4933, + "eval_samples_per_second": 78.452, + "eval_steps_per_second": 1.255, + "step": 14200 + }, + { + "epoch": 2.72, + "learning_rate": 2.7807417882625184e-05, + "loss": 0.7123, + "step": 14220 + }, + { + "epoch": 2.73, + "learning_rate": 2.7421739409911934e-05, + "loss": 0.7027, + "step": 14240 + }, + { + "epoch": 2.73, + "learning_rate": 2.7036060937198687e-05, + "loss": 0.7124, + "step": 14260 + }, + { + "epoch": 2.74, + "learning_rate": 2.6650382464485437e-05, + "loss": 0.7102, + "step": 14280 + }, + { + "epoch": 2.74, + "learning_rate": 2.626470399177219e-05, + "loss": 0.7062, + "step": 14300 + }, + { + "epoch": 2.74, + "learning_rate": 2.5879025519058943e-05, + "loss": 0.7094, + "step": 14320 + }, + { + "epoch": 2.75, + "learning_rate": 2.5493347046345693e-05, + "loss": 0.7017, + "step": 14340 + }, + { + "epoch": 2.75, + "learning_rate": 2.5107668573632446e-05, + "loss": 0.7033, + "step": 14360 + }, + { + "epoch": 2.76, + "learning_rate": 2.47219901009192e-05, + "loss": 0.7036, + "step": 14380 + }, + { + "epoch": 2.76, + "learning_rate": 2.433631162820595e-05, + "loss": 0.7041, + "step": 14400 + }, + { + "epoch": 2.76, + "eval_loss": 0.7345843315124512, + "eval_runtime": 25.4933, + "eval_samples_per_second": 78.452, + "eval_steps_per_second": 1.255, + "step": 14400 + }, + { + "epoch": 2.76, + "learning_rate": 2.3950633155492702e-05, + "loss": 0.6983, + "step": 14420 + }, + { + "epoch": 2.77, + "learning_rate": 2.3564954682779455e-05, + "loss": 0.7006, + "step": 14440 + }, + { + "epoch": 2.77, + "learning_rate": 2.3179276210066205e-05, + "loss": 0.7047, + "step": 14460 + }, + { + "epoch": 2.77, + "learning_rate": 2.2793597737352958e-05, + "loss": 0.7036, + "step": 14480 + }, + { + "epoch": 2.78, + "learning_rate": 2.2407919264639708e-05, + "loss": 0.7025, + "step": 14500 + }, + { + "epoch": 2.78, + "learning_rate": 2.202224079192646e-05, + "loss": 0.699, + "step": 14520 + }, + { + "epoch": 2.79, + "learning_rate": 2.1636562319213214e-05, + "loss": 0.699, + "step": 14540 + }, + { + "epoch": 2.79, + "learning_rate": 2.1250883846499964e-05, + "loss": 0.6968, + "step": 14560 + }, + { + "epoch": 2.79, + "learning_rate": 2.0865205373786717e-05, + "loss": 0.697, + "step": 14580 + }, + { + "epoch": 2.8, + "learning_rate": 2.047952690107347e-05, + "loss": 0.6981, + "step": 14600 + }, + { + "epoch": 2.8, + "eval_loss": 0.7341080904006958, + "eval_runtime": 25.516, + "eval_samples_per_second": 78.382, + "eval_steps_per_second": 1.254, + "step": 14600 + }, + { + "epoch": 2.8, + "learning_rate": 2.009384842836022e-05, + "loss": 0.706, + "step": 14620 + }, + { + "epoch": 2.81, + "learning_rate": 1.9708169955646973e-05, + "loss": 0.6964, + "step": 14640 + }, + { + "epoch": 2.81, + "learning_rate": 1.9322491482933726e-05, + "loss": 0.7043, + "step": 14660 + }, + { + "epoch": 2.81, + "learning_rate": 1.8936813010220476e-05, + "loss": 0.7044, + "step": 14680 + }, + { + "epoch": 2.82, + "learning_rate": 1.855113453750723e-05, + "loss": 0.7079, + "step": 14700 + }, + { + "epoch": 2.82, + "learning_rate": 1.8165456064793982e-05, + "loss": 0.7096, + "step": 14720 + }, + { + "epoch": 2.82, + "learning_rate": 1.7779777592080735e-05, + "loss": 0.6977, + "step": 14740 + }, + { + "epoch": 2.83, + "learning_rate": 1.7394099119367485e-05, + "loss": 0.6997, + "step": 14760 + }, + { + "epoch": 2.83, + "learning_rate": 1.7008420646654238e-05, + "loss": 0.7033, + "step": 14780 + }, + { + "epoch": 2.84, + "learning_rate": 1.662274217394099e-05, + "loss": 0.7016, + "step": 14800 + }, + { + "epoch": 2.84, + "eval_loss": 0.7337221503257751, + "eval_runtime": 25.5058, + "eval_samples_per_second": 78.413, + "eval_steps_per_second": 1.255, + "step": 14800 + }, + { + "epoch": 2.84, + "learning_rate": 1.623706370122774e-05, + "loss": 0.6907, + "step": 14820 + }, + { + "epoch": 2.84, + "learning_rate": 1.5851385228514494e-05, + "loss": 0.7043, + "step": 14840 + }, + { + "epoch": 2.85, + "learning_rate": 1.5465706755801247e-05, + "loss": 0.7058, + "step": 14860 + }, + { + "epoch": 2.85, + "learning_rate": 1.5080028283087997e-05, + "loss": 0.6956, + "step": 14880 + }, + { + "epoch": 2.85, + "learning_rate": 1.469434981037475e-05, + "loss": 0.7109, + "step": 14900 + }, + { + "epoch": 2.86, + "learning_rate": 1.4308671337661502e-05, + "loss": 0.7055, + "step": 14920 + }, + { + "epoch": 2.86, + "learning_rate": 1.3922992864948253e-05, + "loss": 0.7011, + "step": 14940 + }, + { + "epoch": 2.87, + "learning_rate": 1.3537314392235005e-05, + "loss": 0.7009, + "step": 14960 + }, + { + "epoch": 2.87, + "learning_rate": 1.3151635919521758e-05, + "loss": 0.7069, + "step": 14980 + }, + { + "epoch": 2.87, + "learning_rate": 1.276595744680851e-05, + "loss": 0.7038, + "step": 15000 + }, + { + "epoch": 2.87, + "eval_loss": 0.7338148355484009, + "eval_runtime": 25.4764, + "eval_samples_per_second": 78.504, + "eval_steps_per_second": 1.256, + "step": 15000 + }, + { + "epoch": 2.88, + "learning_rate": 1.238027897409526e-05, + "loss": 0.706, + "step": 15020 + }, + { + "epoch": 2.88, + "learning_rate": 1.1994600501382012e-05, + "loss": 0.6918, + "step": 15040 + }, + { + "epoch": 2.89, + "learning_rate": 1.1608922028668765e-05, + "loss": 0.7045, + "step": 15060 + }, + { + "epoch": 2.89, + "learning_rate": 1.1223243555955517e-05, + "loss": 0.6984, + "step": 15080 + }, + { + "epoch": 2.89, + "learning_rate": 1.0837565083242268e-05, + "loss": 0.7126, + "step": 15100 + }, + { + "epoch": 2.9, + "learning_rate": 1.0451886610529021e-05, + "loss": 0.6974, + "step": 15120 + }, + { + "epoch": 2.9, + "learning_rate": 1.0066208137815773e-05, + "loss": 0.7063, + "step": 15140 + }, + { + "epoch": 2.9, + "learning_rate": 9.680529665102524e-06, + "loss": 0.697, + "step": 15160 + }, + { + "epoch": 2.91, + "learning_rate": 9.294851192389277e-06, + "loss": 0.6965, + "step": 15180 + }, + { + "epoch": 2.91, + "learning_rate": 8.909172719676029e-06, + "loss": 0.7001, + "step": 15200 + }, + { + "epoch": 2.91, + "eval_loss": 0.733613908290863, + "eval_runtime": 25.4875, + "eval_samples_per_second": 78.47, + "eval_steps_per_second": 1.256, + "step": 15200 + } + ], + "max_steps": 15657, + "num_train_epochs": 3, + "total_flos": 7.685422038604841e+19, + "trial_name": null, + "trial_params": null +} diff --git a/adapters/saved-alpaca-belle-cot13b/checkpoint-15200/training_args.bin b/adapters/saved-alpaca-belle-cot13b/checkpoint-15200/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..8cadc78dc16a3098f59f602efe3fce82b270b5ad --- /dev/null +++ b/adapters/saved-alpaca-belle-cot13b/checkpoint-15200/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7cbb446b8dfb89a3bab291d29c74c98c3984471bb063f88c9b78e95c95415320 +size 3643 diff --git a/adapters/saved-alpaca-belle-cot13b/checkpoint-15400/optimizer.pt b/adapters/saved-alpaca-belle-cot13b/checkpoint-15400/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..56551902c42fc3dc007ec0706326cd2060b9ae96 --- /dev/null +++ b/adapters/saved-alpaca-belle-cot13b/checkpoint-15400/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bad472a869763277fa06e5f8e1d6dc2ae0961227a8975bd64690cb21ba784769 +size 52523141 diff --git a/adapters/saved-alpaca-belle-cot13b/checkpoint-15400/pytorch_model.bin b/adapters/saved-alpaca-belle-cot13b/checkpoint-15400/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..eff1801b84a8b8e239b9613110c33b73689b94f8 --- /dev/null +++ b/adapters/saved-alpaca-belle-cot13b/checkpoint-15400/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:52e21b08b4c8274ee2c0fd87fd205c86eb8dc0741c58dd9546789541414f5a0e +size 26271757 diff --git a/adapters/saved-alpaca-belle-cot13b/checkpoint-15400/rng_state_0.pth b/adapters/saved-alpaca-belle-cot13b/checkpoint-15400/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..47ac42e8e0fe81a61b8566c101a57c63c6833fa8 --- /dev/null +++ b/adapters/saved-alpaca-belle-cot13b/checkpoint-15400/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:590b152ac39eb4425ffbbf4f459f8a6fab9d2eb1837f635f165c1bc8301cb530 +size 14583 diff --git a/adapters/saved-alpaca-belle-cot13b/checkpoint-15400/rng_state_1.pth b/adapters/saved-alpaca-belle-cot13b/checkpoint-15400/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..2f7e882aeb2cc846106bbc908041a3a84a1fc5c9 --- /dev/null +++ b/adapters/saved-alpaca-belle-cot13b/checkpoint-15400/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:16445192f436a49fa0a5fc0b2559d57413d13893dac8ae3fdf4b06f86f30e78f +size 14583 diff --git a/adapters/saved-alpaca-belle-cot13b/checkpoint-15400/rng_state_2.pth b/adapters/saved-alpaca-belle-cot13b/checkpoint-15400/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..b9af466c64b3c23549ea2e03ce45cdba4418b556 --- /dev/null +++ b/adapters/saved-alpaca-belle-cot13b/checkpoint-15400/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5caf2a1c051d39af8af599faf9a7dcca8712f7e766ffb9d336503f67d236b4aa +size 14583 diff --git a/adapters/saved-alpaca-belle-cot13b/checkpoint-15400/rng_state_3.pth b/adapters/saved-alpaca-belle-cot13b/checkpoint-15400/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..4a589cc9a7eb07d1245b8e55fd1d6522a7cfede1 --- /dev/null +++ b/adapters/saved-alpaca-belle-cot13b/checkpoint-15400/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5fe36e24585049f9c3f5dad074cc8f0ca7bf68525ed6dc90ce06cb5ddabf9c80 +size 14583 diff --git a/adapters/saved-alpaca-belle-cot13b/checkpoint-15400/rng_state_4.pth b/adapters/saved-alpaca-belle-cot13b/checkpoint-15400/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..6899440b241719105fde642aa5da16bc93c85e44 --- /dev/null +++ b/adapters/saved-alpaca-belle-cot13b/checkpoint-15400/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:deb0f0867b7a12781e9ca2b3dcfcd445c34c7356275685d2d6e09a37dd794dba +size 14583 diff --git a/adapters/saved-alpaca-belle-cot13b/checkpoint-15400/rng_state_5.pth b/adapters/saved-alpaca-belle-cot13b/checkpoint-15400/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..6954f314546be5a7a67d9287e255efd3b6608d7f --- /dev/null +++ b/adapters/saved-alpaca-belle-cot13b/checkpoint-15400/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a477b5804ff11301d5e005e3378fe5a3be2064bbfe5563b4d1544eb2206ccce2 +size 14583 diff --git a/adapters/saved-alpaca-belle-cot13b/checkpoint-15400/rng_state_6.pth b/adapters/saved-alpaca-belle-cot13b/checkpoint-15400/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..aea45239fd5dfd385c5f297b389bb015b4ad04e2 --- /dev/null +++ b/adapters/saved-alpaca-belle-cot13b/checkpoint-15400/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b933d7cdc538a80648c9d73a6c436b5a824dba2d03e902852fe709ac9690026f +size 14583 diff --git a/adapters/saved-alpaca-belle-cot13b/checkpoint-15400/rng_state_7.pth b/adapters/saved-alpaca-belle-cot13b/checkpoint-15400/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..d53095cea830f3e294a8054cd956ae7fd98e3627 --- /dev/null +++ b/adapters/saved-alpaca-belle-cot13b/checkpoint-15400/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ad44d4d58b6b861970f2533785462bd8a7916d87b6861b03a0877667c46f2542 +size 14583 diff --git a/adapters/saved-alpaca-belle-cot13b/checkpoint-15400/scaler.pt b/adapters/saved-alpaca-belle-cot13b/checkpoint-15400/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..e7ff2637e36793eb56350a90977ea80e5831d0cd --- /dev/null +++ b/adapters/saved-alpaca-belle-cot13b/checkpoint-15400/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4779ef9677c92ca055ab0ba6a2b67c3d92a9033027e471e37da5853efd565e7c +size 557 diff --git a/adapters/saved-alpaca-belle-cot13b/checkpoint-15400/scheduler.pt b/adapters/saved-alpaca-belle-cot13b/checkpoint-15400/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..a2d84c0450960622027bef0b6ade2214faeccaf1 --- /dev/null +++ b/adapters/saved-alpaca-belle-cot13b/checkpoint-15400/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:90ce2f3b458c2d1f7d2f417b91b14c2552d8943584e19cec21f7737d05cec068 +size 627 diff --git a/adapters/saved-alpaca-belle-cot13b/checkpoint-15400/trainer_state.json b/adapters/saved-alpaca-belle-cot13b/checkpoint-15400/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..f72ab87dff00758ecbdef9b00a0ed7353975071d --- /dev/null +++ b/adapters/saved-alpaca-belle-cot13b/checkpoint-15400/trainer_state.json @@ -0,0 +1,5252 @@ +{ + "best_metric": 0.7335031032562256, + "best_model_checkpoint": "/mnt/bn/qingyi-bn-lq/llama/saved-alpaca-belle-cot13b/checkpoint-15400", + "epoch": 2.950756849971259, + "global_step": 15400, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 5.9999999999999995e-05, + "loss": 1.7259, + "step": 20 + }, + { + "epoch": 0.01, + "learning_rate": 0.00011999999999999999, + "loss": 1.4365, + "step": 40 + }, + { + "epoch": 0.01, + "learning_rate": 0.00017999999999999998, + "loss": 1.0988, + "step": 60 + }, + { + "epoch": 0.02, + "learning_rate": 0.00023999999999999998, + "loss": 1.0373, + "step": 80 + }, + { + "epoch": 0.02, + "learning_rate": 0.0003, + "loss": 0.9935, + "step": 100 + }, + { + "epoch": 0.02, + "learning_rate": 0.00029961432152728675, + "loss": 0.9734, + "step": 120 + }, + { + "epoch": 0.03, + "learning_rate": 0.0002992286430545735, + "loss": 0.9538, + "step": 140 + }, + { + "epoch": 0.03, + "learning_rate": 0.00029884296458186025, + "loss": 0.9304, + "step": 160 + }, + { + "epoch": 0.03, + "learning_rate": 0.00029845728610914697, + "loss": 0.9159, + "step": 180 + }, + { + "epoch": 0.04, + "learning_rate": 0.00029807160763643375, + "loss": 0.9056, + "step": 200 + }, + { + "epoch": 0.04, + "eval_loss": 0.9217711091041565, + "eval_runtime": 25.3617, + "eval_samples_per_second": 78.859, + "eval_steps_per_second": 1.262, + "step": 200 + }, + { + "epoch": 0.04, + "learning_rate": 0.00029768592916372047, + "loss": 0.9028, + "step": 220 + }, + { + "epoch": 0.05, + "learning_rate": 0.00029730025069100725, + "loss": 0.8939, + "step": 240 + }, + { + "epoch": 0.05, + "learning_rate": 0.00029691457221829397, + "loss": 0.8839, + "step": 260 + }, + { + "epoch": 0.05, + "learning_rate": 0.00029652889374558075, + "loss": 0.8929, + "step": 280 + }, + { + "epoch": 0.06, + "learning_rate": 0.00029614321527286747, + "loss": 0.8708, + "step": 300 + }, + { + "epoch": 0.06, + "learning_rate": 0.00029575753680015425, + "loss": 0.8824, + "step": 320 + }, + { + "epoch": 0.07, + "learning_rate": 0.000295371858327441, + "loss": 0.8705, + "step": 340 + }, + { + "epoch": 0.07, + "learning_rate": 0.00029498617985472775, + "loss": 0.8678, + "step": 360 + }, + { + "epoch": 0.07, + "learning_rate": 0.0002946005013820145, + "loss": 0.8687, + "step": 380 + }, + { + "epoch": 0.08, + "learning_rate": 0.00029421482290930125, + "loss": 0.8609, + "step": 400 + }, + { + "epoch": 0.08, + "eval_loss": 0.8822715878486633, + "eval_runtime": 25.3188, + "eval_samples_per_second": 78.993, + "eval_steps_per_second": 1.264, + "step": 400 + }, + { + "epoch": 0.08, + "learning_rate": 0.000293829144436588, + "loss": 0.8603, + "step": 420 + }, + { + "epoch": 0.08, + "learning_rate": 0.00029344346596387475, + "loss": 0.8662, + "step": 440 + }, + { + "epoch": 0.09, + "learning_rate": 0.0002930577874911615, + "loss": 0.8591, + "step": 460 + }, + { + "epoch": 0.09, + "learning_rate": 0.00029267210901844825, + "loss": 0.8442, + "step": 480 + }, + { + "epoch": 0.1, + "learning_rate": 0.000292286430545735, + "loss": 0.8482, + "step": 500 + }, + { + "epoch": 0.1, + "learning_rate": 0.00029190075207302175, + "loss": 0.8458, + "step": 520 + }, + { + "epoch": 0.1, + "learning_rate": 0.0002915150736003085, + "loss": 0.8377, + "step": 540 + }, + { + "epoch": 0.11, + "learning_rate": 0.00029112939512759525, + "loss": 0.8372, + "step": 560 + }, + { + "epoch": 0.11, + "learning_rate": 0.000290743716654882, + "loss": 0.8444, + "step": 580 + }, + { + "epoch": 0.11, + "learning_rate": 0.0002903580381821688, + "loss": 0.84, + "step": 600 + }, + { + "epoch": 0.11, + "eval_loss": 0.8602503538131714, + "eval_runtime": 25.3547, + "eval_samples_per_second": 78.881, + "eval_steps_per_second": 1.262, + "step": 600 + }, + { + "epoch": 0.12, + "learning_rate": 0.0002899723597094555, + "loss": 0.8428, + "step": 620 + }, + { + "epoch": 0.12, + "learning_rate": 0.0002895866812367423, + "loss": 0.8366, + "step": 640 + }, + { + "epoch": 0.13, + "learning_rate": 0.000289201002764029, + "loss": 0.8408, + "step": 660 + }, + { + "epoch": 0.13, + "learning_rate": 0.0002888153242913158, + "loss": 0.8445, + "step": 680 + }, + { + "epoch": 0.13, + "learning_rate": 0.0002884296458186025, + "loss": 0.8335, + "step": 700 + }, + { + "epoch": 0.14, + "learning_rate": 0.0002880439673458893, + "loss": 0.8316, + "step": 720 + }, + { + "epoch": 0.14, + "learning_rate": 0.000287658288873176, + "loss": 0.8449, + "step": 740 + }, + { + "epoch": 0.15, + "learning_rate": 0.0002872726104004628, + "loss": 0.836, + "step": 760 + }, + { + "epoch": 0.15, + "learning_rate": 0.0002868869319277495, + "loss": 0.8257, + "step": 780 + }, + { + "epoch": 0.15, + "learning_rate": 0.0002865012534550363, + "loss": 0.8252, + "step": 800 + }, + { + "epoch": 0.15, + "eval_loss": 0.8450831174850464, + "eval_runtime": 25.4039, + "eval_samples_per_second": 78.728, + "eval_steps_per_second": 1.26, + "step": 800 + }, + { + "epoch": 0.16, + "learning_rate": 0.0002861155749823231, + "loss": 0.8227, + "step": 820 + }, + { + "epoch": 0.16, + "learning_rate": 0.0002857298965096098, + "loss": 0.8274, + "step": 840 + }, + { + "epoch": 0.16, + "learning_rate": 0.00028534421803689657, + "loss": 0.8197, + "step": 860 + }, + { + "epoch": 0.17, + "learning_rate": 0.0002849585395641833, + "loss": 0.823, + "step": 880 + }, + { + "epoch": 0.17, + "learning_rate": 0.00028457286109147007, + "loss": 0.8176, + "step": 900 + }, + { + "epoch": 0.18, + "learning_rate": 0.0002841871826187568, + "loss": 0.8092, + "step": 920 + }, + { + "epoch": 0.18, + "learning_rate": 0.00028380150414604357, + "loss": 0.8171, + "step": 940 + }, + { + "epoch": 0.18, + "learning_rate": 0.0002834158256733303, + "loss": 0.816, + "step": 960 + }, + { + "epoch": 0.19, + "learning_rate": 0.00028303014720061707, + "loss": 0.816, + "step": 980 + }, + { + "epoch": 0.19, + "learning_rate": 0.0002826444687279038, + "loss": 0.8066, + "step": 1000 + }, + { + "epoch": 0.19, + "eval_loss": 0.8338332176208496, + "eval_runtime": 25.3851, + "eval_samples_per_second": 78.786, + "eval_steps_per_second": 1.261, + "step": 1000 + }, + { + "epoch": 0.2, + "learning_rate": 0.00028225879025519057, + "loss": 0.82, + "step": 1020 + }, + { + "epoch": 0.2, + "learning_rate": 0.0002818731117824773, + "loss": 0.8116, + "step": 1040 + }, + { + "epoch": 0.2, + "learning_rate": 0.00028148743330976407, + "loss": 0.8156, + "step": 1060 + }, + { + "epoch": 0.21, + "learning_rate": 0.00028110175483705085, + "loss": 0.8135, + "step": 1080 + }, + { + "epoch": 0.21, + "learning_rate": 0.00028071607636433757, + "loss": 0.8055, + "step": 1100 + }, + { + "epoch": 0.21, + "learning_rate": 0.00028033039789162435, + "loss": 0.8062, + "step": 1120 + }, + { + "epoch": 0.22, + "learning_rate": 0.00027994471941891107, + "loss": 0.8082, + "step": 1140 + }, + { + "epoch": 0.22, + "learning_rate": 0.00027955904094619785, + "loss": 0.8144, + "step": 1160 + }, + { + "epoch": 0.23, + "learning_rate": 0.00027917336247348457, + "loss": 0.8067, + "step": 1180 + }, + { + "epoch": 0.23, + "learning_rate": 0.0002787876840007713, + "loss": 0.8042, + "step": 1200 + }, + { + "epoch": 0.23, + "eval_loss": 0.8253737688064575, + "eval_runtime": 25.4089, + "eval_samples_per_second": 78.713, + "eval_steps_per_second": 1.259, + "step": 1200 + }, + { + "epoch": 0.23, + "learning_rate": 0.00027840200552805807, + "loss": 0.8093, + "step": 1220 + }, + { + "epoch": 0.24, + "learning_rate": 0.00027801632705534485, + "loss": 0.801, + "step": 1240 + }, + { + "epoch": 0.24, + "learning_rate": 0.00027763064858263157, + "loss": 0.8043, + "step": 1260 + }, + { + "epoch": 0.25, + "learning_rate": 0.00027724497010991834, + "loss": 0.8027, + "step": 1280 + }, + { + "epoch": 0.25, + "learning_rate": 0.0002768592916372051, + "loss": 0.7979, + "step": 1300 + }, + { + "epoch": 0.25, + "learning_rate": 0.00027647361316449184, + "loss": 0.7988, + "step": 1320 + }, + { + "epoch": 0.26, + "learning_rate": 0.0002760879346917786, + "loss": 0.8051, + "step": 1340 + }, + { + "epoch": 0.26, + "learning_rate": 0.00027570225621906534, + "loss": 0.7962, + "step": 1360 + }, + { + "epoch": 0.26, + "learning_rate": 0.0002753165777463521, + "loss": 0.8034, + "step": 1380 + }, + { + "epoch": 0.27, + "learning_rate": 0.00027493089927363884, + "loss": 0.7994, + "step": 1400 + }, + { + "epoch": 0.27, + "eval_loss": 0.8166970014572144, + "eval_runtime": 25.3787, + "eval_samples_per_second": 78.806, + "eval_steps_per_second": 1.261, + "step": 1400 + }, + { + "epoch": 0.27, + "learning_rate": 0.00027454522080092557, + "loss": 0.7949, + "step": 1420 + }, + { + "epoch": 0.28, + "learning_rate": 0.00027415954232821234, + "loss": 0.7919, + "step": 1440 + }, + { + "epoch": 0.28, + "learning_rate": 0.0002737738638554991, + "loss": 0.7983, + "step": 1460 + }, + { + "epoch": 0.28, + "learning_rate": 0.00027338818538278584, + "loss": 0.7828, + "step": 1480 + }, + { + "epoch": 0.29, + "learning_rate": 0.0002730025069100726, + "loss": 0.7926, + "step": 1500 + }, + { + "epoch": 0.29, + "learning_rate": 0.0002726168284373594, + "loss": 0.7837, + "step": 1520 + }, + { + "epoch": 0.3, + "learning_rate": 0.0002722311499646461, + "loss": 0.7922, + "step": 1540 + }, + { + "epoch": 0.3, + "learning_rate": 0.0002718454714919329, + "loss": 0.7852, + "step": 1560 + }, + { + "epoch": 0.3, + "learning_rate": 0.0002714597930192196, + "loss": 0.7846, + "step": 1580 + }, + { + "epoch": 0.31, + "learning_rate": 0.0002710741145465064, + "loss": 0.782, + "step": 1600 + }, + { + "epoch": 0.31, + "eval_loss": 0.8094187378883362, + "eval_runtime": 25.4544, + "eval_samples_per_second": 78.572, + "eval_steps_per_second": 1.257, + "step": 1600 + }, + { + "epoch": 0.31, + "learning_rate": 0.0002706884360737931, + "loss": 0.7822, + "step": 1620 + }, + { + "epoch": 0.31, + "learning_rate": 0.00027030275760107984, + "loss": 0.7787, + "step": 1640 + }, + { + "epoch": 0.32, + "learning_rate": 0.0002699170791283666, + "loss": 0.7913, + "step": 1660 + }, + { + "epoch": 0.32, + "learning_rate": 0.0002695314006556534, + "loss": 0.79, + "step": 1680 + }, + { + "epoch": 0.33, + "learning_rate": 0.0002691457221829401, + "loss": 0.7934, + "step": 1700 + }, + { + "epoch": 0.33, + "learning_rate": 0.0002687600437102269, + "loss": 0.7816, + "step": 1720 + }, + { + "epoch": 0.33, + "learning_rate": 0.0002683743652375136, + "loss": 0.7825, + "step": 1740 + }, + { + "epoch": 0.34, + "learning_rate": 0.0002679886867648004, + "loss": 0.7903, + "step": 1760 + }, + { + "epoch": 0.34, + "learning_rate": 0.00026760300829208717, + "loss": 0.7906, + "step": 1780 + }, + { + "epoch": 0.34, + "learning_rate": 0.0002672173298193739, + "loss": 0.7778, + "step": 1800 + }, + { + "epoch": 0.34, + "eval_loss": 0.8045867681503296, + "eval_runtime": 25.4351, + "eval_samples_per_second": 78.632, + "eval_steps_per_second": 1.258, + "step": 1800 + }, + { + "epoch": 0.35, + "learning_rate": 0.00026683165134666067, + "loss": 0.7815, + "step": 1820 + }, + { + "epoch": 0.35, + "learning_rate": 0.0002664459728739474, + "loss": 0.7851, + "step": 1840 + }, + { + "epoch": 0.36, + "learning_rate": 0.00026606029440123417, + "loss": 0.7807, + "step": 1860 + }, + { + "epoch": 0.36, + "learning_rate": 0.0002656746159285209, + "loss": 0.7856, + "step": 1880 + }, + { + "epoch": 0.36, + "learning_rate": 0.0002652889374558076, + "loss": 0.7798, + "step": 1900 + }, + { + "epoch": 0.37, + "learning_rate": 0.0002649032589830944, + "loss": 0.7777, + "step": 1920 + }, + { + "epoch": 0.37, + "learning_rate": 0.00026451758051038117, + "loss": 0.7798, + "step": 1940 + }, + { + "epoch": 0.38, + "learning_rate": 0.0002641319020376679, + "loss": 0.7783, + "step": 1960 + }, + { + "epoch": 0.38, + "learning_rate": 0.00026374622356495467, + "loss": 0.7739, + "step": 1980 + }, + { + "epoch": 0.38, + "learning_rate": 0.00026336054509224144, + "loss": 0.7823, + "step": 2000 + }, + { + "epoch": 0.38, + "eval_loss": 0.7984708547592163, + "eval_runtime": 25.4598, + "eval_samples_per_second": 78.555, + "eval_steps_per_second": 1.257, + "step": 2000 + }, + { + "epoch": 0.39, + "learning_rate": 0.00026297486661952817, + "loss": 0.7774, + "step": 2020 + }, + { + "epoch": 0.39, + "learning_rate": 0.00026258918814681494, + "loss": 0.7701, + "step": 2040 + }, + { + "epoch": 0.39, + "learning_rate": 0.00026220350967410167, + "loss": 0.7777, + "step": 2060 + }, + { + "epoch": 0.4, + "learning_rate": 0.00026181783120138844, + "loss": 0.781, + "step": 2080 + }, + { + "epoch": 0.4, + "learning_rate": 0.00026143215272867517, + "loss": 0.779, + "step": 2100 + }, + { + "epoch": 0.41, + "learning_rate": 0.0002610464742559619, + "loss": 0.7703, + "step": 2120 + }, + { + "epoch": 0.41, + "learning_rate": 0.00026066079578324867, + "loss": 0.7749, + "step": 2140 + }, + { + "epoch": 0.41, + "learning_rate": 0.00026027511731053544, + "loss": 0.772, + "step": 2160 + }, + { + "epoch": 0.42, + "learning_rate": 0.00025988943883782216, + "loss": 0.771, + "step": 2180 + }, + { + "epoch": 0.42, + "learning_rate": 0.00025950376036510894, + "loss": 0.7757, + "step": 2200 + }, + { + "epoch": 0.42, + "eval_loss": 0.7949528694152832, + "eval_runtime": 25.4504, + "eval_samples_per_second": 78.584, + "eval_steps_per_second": 1.257, + "step": 2200 + }, + { + "epoch": 0.43, + "learning_rate": 0.00025911808189239566, + "loss": 0.7776, + "step": 2220 + }, + { + "epoch": 0.43, + "learning_rate": 0.00025873240341968244, + "loss": 0.7689, + "step": 2240 + }, + { + "epoch": 0.43, + "learning_rate": 0.0002583467249469692, + "loss": 0.7646, + "step": 2260 + }, + { + "epoch": 0.44, + "learning_rate": 0.00025796104647425594, + "loss": 0.7805, + "step": 2280 + }, + { + "epoch": 0.44, + "learning_rate": 0.0002575753680015427, + "loss": 0.7717, + "step": 2300 + }, + { + "epoch": 0.44, + "learning_rate": 0.00025718968952882944, + "loss": 0.7672, + "step": 2320 + }, + { + "epoch": 0.45, + "learning_rate": 0.00025680401105611616, + "loss": 0.7716, + "step": 2340 + }, + { + "epoch": 0.45, + "learning_rate": 0.00025641833258340294, + "loss": 0.7661, + "step": 2360 + }, + { + "epoch": 0.46, + "learning_rate": 0.00025603265411068966, + "loss": 0.7659, + "step": 2380 + }, + { + "epoch": 0.46, + "learning_rate": 0.00025564697563797644, + "loss": 0.7697, + "step": 2400 + }, + { + "epoch": 0.46, + "eval_loss": 0.7915205359458923, + "eval_runtime": 25.4326, + "eval_samples_per_second": 78.639, + "eval_steps_per_second": 1.258, + "step": 2400 + }, + { + "epoch": 0.46, + "learning_rate": 0.0002552612971652632, + "loss": 0.7686, + "step": 2420 + }, + { + "epoch": 0.47, + "learning_rate": 0.00025487561869254994, + "loss": 0.7691, + "step": 2440 + }, + { + "epoch": 0.47, + "learning_rate": 0.0002544899402198367, + "loss": 0.768, + "step": 2460 + }, + { + "epoch": 0.48, + "learning_rate": 0.0002541042617471235, + "loss": 0.7663, + "step": 2480 + }, + { + "epoch": 0.48, + "learning_rate": 0.0002537185832744102, + "loss": 0.767, + "step": 2500 + }, + { + "epoch": 0.48, + "learning_rate": 0.000253332904801697, + "loss": 0.769, + "step": 2520 + }, + { + "epoch": 0.49, + "learning_rate": 0.0002529472263289837, + "loss": 0.7686, + "step": 2540 + }, + { + "epoch": 0.49, + "learning_rate": 0.00025256154785627044, + "loss": 0.7722, + "step": 2560 + }, + { + "epoch": 0.49, + "learning_rate": 0.0002521758693835572, + "loss": 0.7691, + "step": 2580 + }, + { + "epoch": 0.5, + "learning_rate": 0.00025179019091084394, + "loss": 0.7742, + "step": 2600 + }, + { + "epoch": 0.5, + "eval_loss": 0.7875179648399353, + "eval_runtime": 25.4595, + "eval_samples_per_second": 78.556, + "eval_steps_per_second": 1.257, + "step": 2600 + }, + { + "epoch": 0.5, + "learning_rate": 0.0002514045124381307, + "loss": 0.7682, + "step": 2620 + }, + { + "epoch": 0.51, + "learning_rate": 0.0002510188339654175, + "loss": 0.7574, + "step": 2640 + }, + { + "epoch": 0.51, + "learning_rate": 0.0002506331554927042, + "loss": 0.77, + "step": 2660 + }, + { + "epoch": 0.51, + "learning_rate": 0.000250247477019991, + "loss": 0.7638, + "step": 2680 + }, + { + "epoch": 0.52, + "learning_rate": 0.00024986179854727777, + "loss": 0.7517, + "step": 2700 + }, + { + "epoch": 0.52, + "learning_rate": 0.0002494761200745645, + "loss": 0.7596, + "step": 2720 + }, + { + "epoch": 0.53, + "learning_rate": 0.00024909044160185127, + "loss": 0.7608, + "step": 2740 + }, + { + "epoch": 0.53, + "learning_rate": 0.000248704763129138, + "loss": 0.7571, + "step": 2760 + }, + { + "epoch": 0.53, + "learning_rate": 0.0002483190846564247, + "loss": 0.7597, + "step": 2780 + }, + { + "epoch": 0.54, + "learning_rate": 0.0002479334061837115, + "loss": 0.7659, + "step": 2800 + }, + { + "epoch": 0.54, + "eval_loss": 0.7841727137565613, + "eval_runtime": 25.4853, + "eval_samples_per_second": 78.477, + "eval_steps_per_second": 1.256, + "step": 2800 + }, + { + "epoch": 0.54, + "learning_rate": 0.0002475477277109982, + "loss": 0.7694, + "step": 2820 + }, + { + "epoch": 0.54, + "learning_rate": 0.000247162049238285, + "loss": 0.7722, + "step": 2840 + }, + { + "epoch": 0.55, + "learning_rate": 0.00024677637076557176, + "loss": 0.7513, + "step": 2860 + }, + { + "epoch": 0.55, + "learning_rate": 0.0002463906922928585, + "loss": 0.7553, + "step": 2880 + }, + { + "epoch": 0.56, + "learning_rate": 0.00024600501382014526, + "loss": 0.7611, + "step": 2900 + }, + { + "epoch": 0.56, + "learning_rate": 0.000245619335347432, + "loss": 0.7614, + "step": 2920 + }, + { + "epoch": 0.56, + "learning_rate": 0.00024523365687471876, + "loss": 0.761, + "step": 2940 + }, + { + "epoch": 0.57, + "learning_rate": 0.00024484797840200554, + "loss": 0.7568, + "step": 2960 + }, + { + "epoch": 0.57, + "learning_rate": 0.00024446229992929226, + "loss": 0.7571, + "step": 2980 + }, + { + "epoch": 0.57, + "learning_rate": 0.000244076621456579, + "loss": 0.7514, + "step": 3000 + }, + { + "epoch": 0.57, + "eval_loss": 0.7828710675239563, + "eval_runtime": 25.4475, + "eval_samples_per_second": 78.593, + "eval_steps_per_second": 1.257, + "step": 3000 + }, + { + "epoch": 0.58, + "learning_rate": 0.0002436909429838658, + "loss": 0.7564, + "step": 3020 + }, + { + "epoch": 0.58, + "learning_rate": 0.0002433052645111525, + "loss": 0.7593, + "step": 3040 + }, + { + "epoch": 0.59, + "learning_rate": 0.00024291958603843926, + "loss": 0.7533, + "step": 3060 + }, + { + "epoch": 0.59, + "learning_rate": 0.000242533907565726, + "loss": 0.7566, + "step": 3080 + }, + { + "epoch": 0.59, + "learning_rate": 0.00024214822909301276, + "loss": 0.7667, + "step": 3100 + }, + { + "epoch": 0.6, + "learning_rate": 0.00024176255062029954, + "loss": 0.7638, + "step": 3120 + }, + { + "epoch": 0.6, + "learning_rate": 0.00024137687214758626, + "loss": 0.7613, + "step": 3140 + }, + { + "epoch": 0.61, + "learning_rate": 0.00024099119367487304, + "loss": 0.755, + "step": 3160 + }, + { + "epoch": 0.61, + "learning_rate": 0.0002406055152021598, + "loss": 0.7547, + "step": 3180 + }, + { + "epoch": 0.61, + "learning_rate": 0.0002402198367294465, + "loss": 0.7611, + "step": 3200 + }, + { + "epoch": 0.61, + "eval_loss": 0.7789185643196106, + "eval_runtime": 25.4744, + "eval_samples_per_second": 78.51, + "eval_steps_per_second": 1.256, + "step": 3200 + }, + { + "epoch": 0.62, + "learning_rate": 0.0002398341582567333, + "loss": 0.7498, + "step": 3220 + }, + { + "epoch": 0.62, + "learning_rate": 0.00023944847978402, + "loss": 0.757, + "step": 3240 + }, + { + "epoch": 0.62, + "learning_rate": 0.0002390628013113068, + "loss": 0.7472, + "step": 3260 + }, + { + "epoch": 0.63, + "learning_rate": 0.00023867712283859354, + "loss": 0.7557, + "step": 3280 + }, + { + "epoch": 0.63, + "learning_rate": 0.0002382914443658803, + "loss": 0.7602, + "step": 3300 + }, + { + "epoch": 0.64, + "learning_rate": 0.00023790576589316704, + "loss": 0.7573, + "step": 3320 + }, + { + "epoch": 0.64, + "learning_rate": 0.0002375200874204538, + "loss": 0.7565, + "step": 3340 + }, + { + "epoch": 0.64, + "learning_rate": 0.00023713440894774054, + "loss": 0.7517, + "step": 3360 + }, + { + "epoch": 0.65, + "learning_rate": 0.0002367487304750273, + "loss": 0.7521, + "step": 3380 + }, + { + "epoch": 0.65, + "learning_rate": 0.00023636305200231404, + "loss": 0.7575, + "step": 3400 + }, + { + "epoch": 0.65, + "eval_loss": 0.7771645784378052, + "eval_runtime": 25.4832, + "eval_samples_per_second": 78.483, + "eval_steps_per_second": 1.256, + "step": 3400 + }, + { + "epoch": 0.66, + "learning_rate": 0.0002359773735296008, + "loss": 0.7605, + "step": 3420 + }, + { + "epoch": 0.66, + "learning_rate": 0.00023559169505688756, + "loss": 0.7547, + "step": 3440 + }, + { + "epoch": 0.66, + "learning_rate": 0.00023520601658417428, + "loss": 0.7522, + "step": 3460 + }, + { + "epoch": 0.67, + "learning_rate": 0.00023482033811146106, + "loss": 0.757, + "step": 3480 + }, + { + "epoch": 0.67, + "learning_rate": 0.0002344346596387478, + "loss": 0.7561, + "step": 3500 + }, + { + "epoch": 0.67, + "learning_rate": 0.00023404898116603456, + "loss": 0.7486, + "step": 3520 + }, + { + "epoch": 0.68, + "learning_rate": 0.0002336633026933213, + "loss": 0.7519, + "step": 3540 + }, + { + "epoch": 0.68, + "learning_rate": 0.00023327762422060806, + "loss": 0.7487, + "step": 3560 + }, + { + "epoch": 0.69, + "learning_rate": 0.0002328919457478948, + "loss": 0.747, + "step": 3580 + }, + { + "epoch": 0.69, + "learning_rate": 0.0002325062672751816, + "loss": 0.7523, + "step": 3600 + }, + { + "epoch": 0.69, + "eval_loss": 0.7746226787567139, + "eval_runtime": 25.4795, + "eval_samples_per_second": 78.494, + "eval_steps_per_second": 1.256, + "step": 3600 + }, + { + "epoch": 0.69, + "learning_rate": 0.0002321205888024683, + "loss": 0.7427, + "step": 3620 + }, + { + "epoch": 0.7, + "learning_rate": 0.0002317349103297551, + "loss": 0.7442, + "step": 3640 + }, + { + "epoch": 0.7, + "learning_rate": 0.00023134923185704184, + "loss": 0.7587, + "step": 3660 + }, + { + "epoch": 0.71, + "learning_rate": 0.00023096355338432856, + "loss": 0.7506, + "step": 3680 + }, + { + "epoch": 0.71, + "learning_rate": 0.00023057787491161534, + "loss": 0.7514, + "step": 3700 + }, + { + "epoch": 0.71, + "learning_rate": 0.00023019219643890206, + "loss": 0.7475, + "step": 3720 + }, + { + "epoch": 0.72, + "learning_rate": 0.00022980651796618884, + "loss": 0.7601, + "step": 3740 + }, + { + "epoch": 0.72, + "learning_rate": 0.00022942083949347559, + "loss": 0.7474, + "step": 3760 + }, + { + "epoch": 0.72, + "learning_rate": 0.00022903516102076233, + "loss": 0.7529, + "step": 3780 + }, + { + "epoch": 0.73, + "learning_rate": 0.00022864948254804908, + "loss": 0.7458, + "step": 3800 + }, + { + "epoch": 0.73, + "eval_loss": 0.7719505429267883, + "eval_runtime": 25.4724, + "eval_samples_per_second": 78.516, + "eval_steps_per_second": 1.256, + "step": 3800 + }, + { + "epoch": 0.73, + "learning_rate": 0.00022826380407533586, + "loss": 0.7584, + "step": 3820 + }, + { + "epoch": 0.74, + "learning_rate": 0.00022787812560262258, + "loss": 0.7416, + "step": 3840 + }, + { + "epoch": 0.74, + "learning_rate": 0.00022749244712990936, + "loss": 0.7444, + "step": 3860 + }, + { + "epoch": 0.74, + "learning_rate": 0.0002271067686571961, + "loss": 0.7459, + "step": 3880 + }, + { + "epoch": 0.75, + "learning_rate": 0.00022672109018448283, + "loss": 0.7476, + "step": 3900 + }, + { + "epoch": 0.75, + "learning_rate": 0.0002263354117117696, + "loss": 0.7473, + "step": 3920 + }, + { + "epoch": 0.75, + "learning_rate": 0.00022594973323905633, + "loss": 0.7434, + "step": 3940 + }, + { + "epoch": 0.76, + "learning_rate": 0.0002255640547663431, + "loss": 0.7463, + "step": 3960 + }, + { + "epoch": 0.76, + "learning_rate": 0.00022517837629362986, + "loss": 0.7435, + "step": 3980 + }, + { + "epoch": 0.77, + "learning_rate": 0.0002247926978209166, + "loss": 0.7445, + "step": 4000 + }, + { + "epoch": 0.77, + "eval_loss": 0.7707083821296692, + "eval_runtime": 25.4747, + "eval_samples_per_second": 78.509, + "eval_steps_per_second": 1.256, + "step": 4000 + }, + { + "epoch": 0.77, + "learning_rate": 0.00022440701934820336, + "loss": 0.7321, + "step": 4020 + }, + { + "epoch": 0.77, + "learning_rate": 0.00022402134087549014, + "loss": 0.7525, + "step": 4040 + }, + { + "epoch": 0.78, + "learning_rate": 0.00022363566240277686, + "loss": 0.7494, + "step": 4060 + }, + { + "epoch": 0.78, + "learning_rate": 0.00022324998393006364, + "loss": 0.7533, + "step": 4080 + }, + { + "epoch": 0.79, + "learning_rate": 0.00022286430545735036, + "loss": 0.7442, + "step": 4100 + }, + { + "epoch": 0.79, + "learning_rate": 0.0002224786269846371, + "loss": 0.7423, + "step": 4120 + }, + { + "epoch": 0.79, + "learning_rate": 0.00022209294851192388, + "loss": 0.7443, + "step": 4140 + }, + { + "epoch": 0.8, + "learning_rate": 0.0002217072700392106, + "loss": 0.7388, + "step": 4160 + }, + { + "epoch": 0.8, + "learning_rate": 0.00022132159156649738, + "loss": 0.7425, + "step": 4180 + }, + { + "epoch": 0.8, + "learning_rate": 0.00022093591309378413, + "loss": 0.7507, + "step": 4200 + }, + { + "epoch": 0.8, + "eval_loss": 0.7691813111305237, + "eval_runtime": 25.5011, + "eval_samples_per_second": 78.428, + "eval_steps_per_second": 1.255, + "step": 4200 + }, + { + "epoch": 0.81, + "learning_rate": 0.00022055023462107088, + "loss": 0.7276, + "step": 4220 + }, + { + "epoch": 0.81, + "learning_rate": 0.00022016455614835763, + "loss": 0.7399, + "step": 4240 + }, + { + "epoch": 0.82, + "learning_rate": 0.00021977887767564438, + "loss": 0.7409, + "step": 4260 + }, + { + "epoch": 0.82, + "learning_rate": 0.00021939319920293113, + "loss": 0.7391, + "step": 4280 + }, + { + "epoch": 0.82, + "learning_rate": 0.0002190075207302179, + "loss": 0.741, + "step": 4300 + }, + { + "epoch": 0.83, + "learning_rate": 0.00021862184225750463, + "loss": 0.7404, + "step": 4320 + }, + { + "epoch": 0.83, + "learning_rate": 0.00021823616378479138, + "loss": 0.7356, + "step": 4340 + }, + { + "epoch": 0.84, + "learning_rate": 0.00021785048531207816, + "loss": 0.7458, + "step": 4360 + }, + { + "epoch": 0.84, + "learning_rate": 0.00021746480683936488, + "loss": 0.7373, + "step": 4380 + }, + { + "epoch": 0.84, + "learning_rate": 0.00021707912836665166, + "loss": 0.7455, + "step": 4400 + }, + { + "epoch": 0.84, + "eval_loss": 0.7680566310882568, + "eval_runtime": 25.4479, + "eval_samples_per_second": 78.592, + "eval_steps_per_second": 1.257, + "step": 4400 + }, + { + "epoch": 0.85, + "learning_rate": 0.00021669344989393838, + "loss": 0.7376, + "step": 4420 + }, + { + "epoch": 0.85, + "learning_rate": 0.00021630777142122516, + "loss": 0.7396, + "step": 4440 + }, + { + "epoch": 0.85, + "learning_rate": 0.0002159220929485119, + "loss": 0.7367, + "step": 4460 + }, + { + "epoch": 0.86, + "learning_rate": 0.00021553641447579866, + "loss": 0.7354, + "step": 4480 + }, + { + "epoch": 0.86, + "learning_rate": 0.0002151507360030854, + "loss": 0.7337, + "step": 4500 + }, + { + "epoch": 0.87, + "learning_rate": 0.00021476505753037218, + "loss": 0.7384, + "step": 4520 + }, + { + "epoch": 0.87, + "learning_rate": 0.0002143793790576589, + "loss": 0.7334, + "step": 4540 + }, + { + "epoch": 0.87, + "learning_rate": 0.00021399370058494568, + "loss": 0.742, + "step": 4560 + }, + { + "epoch": 0.88, + "learning_rate": 0.0002136080221122324, + "loss": 0.7408, + "step": 4580 + }, + { + "epoch": 0.88, + "learning_rate": 0.00021322234363951916, + "loss": 0.7466, + "step": 4600 + }, + { + "epoch": 0.88, + "eval_loss": 0.7663780450820923, + "eval_runtime": 25.53, + "eval_samples_per_second": 78.339, + "eval_steps_per_second": 1.253, + "step": 4600 + }, + { + "epoch": 0.89, + "learning_rate": 0.00021283666516680593, + "loss": 0.7399, + "step": 4620 + }, + { + "epoch": 0.89, + "learning_rate": 0.00021245098669409266, + "loss": 0.746, + "step": 4640 + }, + { + "epoch": 0.89, + "learning_rate": 0.00021206530822137943, + "loss": 0.7397, + "step": 4660 + }, + { + "epoch": 0.9, + "learning_rate": 0.00021167962974866618, + "loss": 0.7349, + "step": 4680 + }, + { + "epoch": 0.9, + "learning_rate": 0.00021129395127595293, + "loss": 0.7334, + "step": 4700 + }, + { + "epoch": 0.9, + "learning_rate": 0.00021090827280323968, + "loss": 0.738, + "step": 4720 + }, + { + "epoch": 0.91, + "learning_rate": 0.0002105225943305264, + "loss": 0.7398, + "step": 4740 + }, + { + "epoch": 0.91, + "learning_rate": 0.00021013691585781318, + "loss": 0.7465, + "step": 4760 + }, + { + "epoch": 0.92, + "learning_rate": 0.00020975123738509996, + "loss": 0.7388, + "step": 4780 + }, + { + "epoch": 0.92, + "learning_rate": 0.00020936555891238668, + "loss": 0.7462, + "step": 4800 + }, + { + "epoch": 0.92, + "eval_loss": 0.7650267481803894, + "eval_runtime": 25.5463, + "eval_samples_per_second": 78.289, + "eval_steps_per_second": 1.253, + "step": 4800 + }, + { + "epoch": 0.92, + "learning_rate": 0.00020897988043967343, + "loss": 0.7463, + "step": 4820 + }, + { + "epoch": 0.93, + "learning_rate": 0.0002085942019669602, + "loss": 0.7389, + "step": 4840 + }, + { + "epoch": 0.93, + "learning_rate": 0.00020820852349424693, + "loss": 0.7316, + "step": 4860 + }, + { + "epoch": 0.94, + "learning_rate": 0.0002078228450215337, + "loss": 0.73, + "step": 4880 + }, + { + "epoch": 0.94, + "learning_rate": 0.00020743716654882043, + "loss": 0.7472, + "step": 4900 + }, + { + "epoch": 0.94, + "learning_rate": 0.0002070514880761072, + "loss": 0.7494, + "step": 4920 + }, + { + "epoch": 0.95, + "learning_rate": 0.00020666580960339396, + "loss": 0.7424, + "step": 4940 + }, + { + "epoch": 0.95, + "learning_rate": 0.0002062801311306807, + "loss": 0.7443, + "step": 4960 + }, + { + "epoch": 0.95, + "learning_rate": 0.00020589445265796746, + "loss": 0.7355, + "step": 4980 + }, + { + "epoch": 0.96, + "learning_rate": 0.00020550877418525423, + "loss": 0.7388, + "step": 5000 + }, + { + "epoch": 0.96, + "eval_loss": 0.7630622386932373, + "eval_runtime": 25.8654, + "eval_samples_per_second": 77.323, + "eval_steps_per_second": 1.237, + "step": 5000 + }, + { + "epoch": 0.96, + "learning_rate": 0.00020512309571254096, + "loss": 0.7317, + "step": 5020 + }, + { + "epoch": 0.97, + "learning_rate": 0.0002047374172398277, + "loss": 0.7385, + "step": 5040 + }, + { + "epoch": 0.97, + "learning_rate": 0.00020435173876711445, + "loss": 0.7369, + "step": 5060 + }, + { + "epoch": 0.97, + "learning_rate": 0.0002039660602944012, + "loss": 0.7243, + "step": 5080 + }, + { + "epoch": 0.98, + "learning_rate": 0.00020358038182168798, + "loss": 0.7334, + "step": 5100 + }, + { + "epoch": 0.98, + "learning_rate": 0.0002031947033489747, + "loss": 0.7433, + "step": 5120 + }, + { + "epoch": 0.98, + "learning_rate": 0.00020280902487626148, + "loss": 0.7202, + "step": 5140 + }, + { + "epoch": 0.99, + "learning_rate": 0.00020242334640354823, + "loss": 0.7336, + "step": 5160 + }, + { + "epoch": 0.99, + "learning_rate": 0.00020203766793083498, + "loss": 0.7324, + "step": 5180 + }, + { + "epoch": 1.0, + "learning_rate": 0.00020165198945812173, + "loss": 0.7363, + "step": 5200 + }, + { + "epoch": 1.0, + "eval_loss": 0.7617191076278687, + "eval_runtime": 25.4884, + "eval_samples_per_second": 78.467, + "eval_steps_per_second": 1.255, + "step": 5200 + }, + { + "epoch": 1.0, + "learning_rate": 0.0002012663109854085, + "loss": 0.7359, + "step": 5220 + }, + { + "epoch": 1.0, + "learning_rate": 0.00020088063251269523, + "loss": 0.7347, + "step": 5240 + }, + { + "epoch": 1.01, + "learning_rate": 0.00020049495403998198, + "loss": 0.732, + "step": 5260 + }, + { + "epoch": 1.01, + "learning_rate": 0.00020010927556726873, + "loss": 0.7385, + "step": 5280 + }, + { + "epoch": 1.02, + "learning_rate": 0.00019972359709455548, + "loss": 0.7313, + "step": 5300 + }, + { + "epoch": 1.02, + "learning_rate": 0.00019933791862184226, + "loss": 0.7337, + "step": 5320 + }, + { + "epoch": 1.02, + "learning_rate": 0.00019895224014912898, + "loss": 0.733, + "step": 5340 + }, + { + "epoch": 1.03, + "learning_rate": 0.00019856656167641576, + "loss": 0.7226, + "step": 5360 + }, + { + "epoch": 1.03, + "learning_rate": 0.0001981808832037025, + "loss": 0.7363, + "step": 5380 + }, + { + "epoch": 1.03, + "learning_rate": 0.00019779520473098925, + "loss": 0.7296, + "step": 5400 + }, + { + "epoch": 1.03, + "eval_loss": 0.7608480453491211, + "eval_runtime": 25.5307, + "eval_samples_per_second": 78.337, + "eval_steps_per_second": 1.253, + "step": 5400 + }, + { + "epoch": 1.04, + "learning_rate": 0.000197409526258276, + "loss": 0.7237, + "step": 5420 + }, + { + "epoch": 1.04, + "learning_rate": 0.00019702384778556273, + "loss": 0.735, + "step": 5440 + }, + { + "epoch": 1.05, + "learning_rate": 0.0001966381693128495, + "loss": 0.7379, + "step": 5460 + }, + { + "epoch": 1.05, + "learning_rate": 0.00019625249084013628, + "loss": 0.7372, + "step": 5480 + }, + { + "epoch": 1.05, + "learning_rate": 0.000195866812367423, + "loss": 0.7332, + "step": 5500 + }, + { + "epoch": 1.06, + "learning_rate": 0.00019548113389470975, + "loss": 0.7375, + "step": 5520 + }, + { + "epoch": 1.06, + "learning_rate": 0.00019509545542199653, + "loss": 0.7352, + "step": 5540 + }, + { + "epoch": 1.07, + "learning_rate": 0.00019470977694928325, + "loss": 0.7336, + "step": 5560 + }, + { + "epoch": 1.07, + "learning_rate": 0.00019432409847657003, + "loss": 0.7266, + "step": 5580 + }, + { + "epoch": 1.07, + "learning_rate": 0.00019393842000385675, + "loss": 0.7325, + "step": 5600 + }, + { + "epoch": 1.07, + "eval_loss": 0.7595871686935425, + "eval_runtime": 25.4845, + "eval_samples_per_second": 78.479, + "eval_steps_per_second": 1.256, + "step": 5600 + }, + { + "epoch": 1.08, + "learning_rate": 0.00019355274153114353, + "loss": 0.7259, + "step": 5620 + }, + { + "epoch": 1.08, + "learning_rate": 0.00019316706305843028, + "loss": 0.7274, + "step": 5640 + }, + { + "epoch": 1.08, + "learning_rate": 0.000192781384585717, + "loss": 0.7254, + "step": 5660 + }, + { + "epoch": 1.09, + "learning_rate": 0.00019239570611300378, + "loss": 0.7332, + "step": 5680 + }, + { + "epoch": 1.09, + "learning_rate": 0.00019201002764029056, + "loss": 0.73, + "step": 5700 + }, + { + "epoch": 1.1, + "learning_rate": 0.00019162434916757728, + "loss": 0.7365, + "step": 5720 + }, + { + "epoch": 1.1, + "learning_rate": 0.00019123867069486403, + "loss": 0.7261, + "step": 5740 + }, + { + "epoch": 1.1, + "learning_rate": 0.00019085299222215078, + "loss": 0.7331, + "step": 5760 + }, + { + "epoch": 1.11, + "learning_rate": 0.00019046731374943753, + "loss": 0.7272, + "step": 5780 + }, + { + "epoch": 1.11, + "learning_rate": 0.0001900816352767243, + "loss": 0.7325, + "step": 5800 + }, + { + "epoch": 1.11, + "eval_loss": 0.7583591341972351, + "eval_runtime": 25.49, + "eval_samples_per_second": 78.462, + "eval_steps_per_second": 1.255, + "step": 5800 + }, + { + "epoch": 1.12, + "learning_rate": 0.00018969595680401103, + "loss": 0.7277, + "step": 5820 + }, + { + "epoch": 1.12, + "learning_rate": 0.0001893102783312978, + "loss": 0.7352, + "step": 5840 + }, + { + "epoch": 1.12, + "learning_rate": 0.00018892459985858455, + "loss": 0.7312, + "step": 5860 + }, + { + "epoch": 1.13, + "learning_rate": 0.00018853892138587128, + "loss": 0.7296, + "step": 5880 + }, + { + "epoch": 1.13, + "learning_rate": 0.00018815324291315805, + "loss": 0.7275, + "step": 5900 + }, + { + "epoch": 1.13, + "learning_rate": 0.00018776756444044478, + "loss": 0.7345, + "step": 5920 + }, + { + "epoch": 1.14, + "learning_rate": 0.00018738188596773155, + "loss": 0.7322, + "step": 5940 + }, + { + "epoch": 1.14, + "learning_rate": 0.0001869962074950183, + "loss": 0.737, + "step": 5960 + }, + { + "epoch": 1.15, + "learning_rate": 0.00018661052902230505, + "loss": 0.7243, + "step": 5980 + }, + { + "epoch": 1.15, + "learning_rate": 0.0001862248505495918, + "loss": 0.7303, + "step": 6000 + }, + { + "epoch": 1.15, + "eval_loss": 0.7572018504142761, + "eval_runtime": 25.7126, + "eval_samples_per_second": 77.783, + "eval_steps_per_second": 1.245, + "step": 6000 + }, + { + "epoch": 1.15, + "learning_rate": 0.00018583917207687858, + "loss": 0.7237, + "step": 6020 + }, + { + "epoch": 1.16, + "learning_rate": 0.0001854534936041653, + "loss": 0.735, + "step": 6040 + }, + { + "epoch": 1.16, + "learning_rate": 0.00018506781513145208, + "loss": 0.727, + "step": 6060 + }, + { + "epoch": 1.16, + "learning_rate": 0.0001846821366587388, + "loss": 0.7226, + "step": 6080 + }, + { + "epoch": 1.17, + "learning_rate": 0.00018429645818602558, + "loss": 0.7213, + "step": 6100 + }, + { + "epoch": 1.17, + "learning_rate": 0.000183930063636948, + "loss": 0.7206, + "step": 6120 + }, + { + "epoch": 1.18, + "learning_rate": 0.0001835443851642347, + "loss": 0.7292, + "step": 6140 + }, + { + "epoch": 1.18, + "learning_rate": 0.0001831587066915215, + "loss": 0.7316, + "step": 6160 + }, + { + "epoch": 1.18, + "learning_rate": 0.00018277302821880824, + "loss": 0.7318, + "step": 6180 + }, + { + "epoch": 1.19, + "learning_rate": 0.000182387349746095, + "loss": 0.7302, + "step": 6200 + }, + { + "epoch": 1.19, + "eval_loss": 0.755982518196106, + "eval_runtime": 25.5055, + "eval_samples_per_second": 78.415, + "eval_steps_per_second": 1.255, + "step": 6200 + }, + { + "epoch": 1.19, + "learning_rate": 0.00018200167127338174, + "loss": 0.728, + "step": 6220 + }, + { + "epoch": 1.2, + "learning_rate": 0.00018161599280066851, + "loss": 0.7334, + "step": 6240 + }, + { + "epoch": 1.2, + "learning_rate": 0.00018123031432795524, + "loss": 0.7303, + "step": 6260 + }, + { + "epoch": 1.2, + "learning_rate": 0.000180844635855242, + "loss": 0.7274, + "step": 6280 + }, + { + "epoch": 1.21, + "learning_rate": 0.00018045895738252874, + "loss": 0.7368, + "step": 6300 + }, + { + "epoch": 1.21, + "learning_rate": 0.00018007327890981549, + "loss": 0.724, + "step": 6320 + }, + { + "epoch": 1.21, + "learning_rate": 0.00017968760043710226, + "loss": 0.7229, + "step": 6340 + }, + { + "epoch": 1.22, + "learning_rate": 0.00017930192196438899, + "loss": 0.7216, + "step": 6360 + }, + { + "epoch": 1.22, + "learning_rate": 0.00017891624349167576, + "loss": 0.7292, + "step": 6380 + }, + { + "epoch": 1.23, + "learning_rate": 0.0001785305650189625, + "loss": 0.7226, + "step": 6400 + }, + { + "epoch": 1.23, + "eval_loss": 0.7554095387458801, + "eval_runtime": 25.5062, + "eval_samples_per_second": 78.412, + "eval_steps_per_second": 1.255, + "step": 6400 + }, + { + "epoch": 1.23, + "learning_rate": 0.00017814488654624926, + "loss": 0.7262, + "step": 6420 + }, + { + "epoch": 1.23, + "learning_rate": 0.000177759208073536, + "loss": 0.7274, + "step": 6440 + }, + { + "epoch": 1.24, + "learning_rate": 0.00017737352960082276, + "loss": 0.7271, + "step": 6460 + }, + { + "epoch": 1.24, + "learning_rate": 0.0001769878511281095, + "loss": 0.7299, + "step": 6480 + }, + { + "epoch": 1.25, + "learning_rate": 0.0001766021726553963, + "loss": 0.7264, + "step": 6500 + }, + { + "epoch": 1.25, + "learning_rate": 0.000176216494182683, + "loss": 0.7285, + "step": 6520 + }, + { + "epoch": 1.25, + "learning_rate": 0.0001758308157099698, + "loss": 0.7216, + "step": 6540 + }, + { + "epoch": 1.26, + "learning_rate": 0.00017544513723725654, + "loss": 0.7215, + "step": 6560 + }, + { + "epoch": 1.26, + "learning_rate": 0.00017505945876454326, + "loss": 0.7253, + "step": 6580 + }, + { + "epoch": 1.26, + "learning_rate": 0.00017467378029183004, + "loss": 0.7246, + "step": 6600 + }, + { + "epoch": 1.26, + "eval_loss": 0.7540405988693237, + "eval_runtime": 25.4725, + "eval_samples_per_second": 78.516, + "eval_steps_per_second": 1.256, + "step": 6600 + }, + { + "epoch": 1.27, + "learning_rate": 0.00017428810181911676, + "loss": 0.7166, + "step": 6620 + }, + { + "epoch": 1.27, + "learning_rate": 0.00017390242334640354, + "loss": 0.7213, + "step": 6640 + }, + { + "epoch": 1.28, + "learning_rate": 0.00017351674487369029, + "loss": 0.7305, + "step": 6660 + }, + { + "epoch": 1.28, + "learning_rate": 0.00017313106640097704, + "loss": 0.7347, + "step": 6680 + }, + { + "epoch": 1.28, + "learning_rate": 0.00017274538792826379, + "loss": 0.7272, + "step": 6700 + }, + { + "epoch": 1.29, + "learning_rate": 0.00017235970945555056, + "loss": 0.7224, + "step": 6720 + }, + { + "epoch": 1.29, + "learning_rate": 0.00017197403098283728, + "loss": 0.7327, + "step": 6740 + }, + { + "epoch": 1.3, + "learning_rate": 0.00017158835251012406, + "loss": 0.7228, + "step": 6760 + }, + { + "epoch": 1.3, + "learning_rate": 0.00017120267403741078, + "loss": 0.7344, + "step": 6780 + }, + { + "epoch": 1.3, + "learning_rate": 0.00017081699556469753, + "loss": 0.7269, + "step": 6800 + }, + { + "epoch": 1.3, + "eval_loss": 0.7531024813652039, + "eval_runtime": 25.6796, + "eval_samples_per_second": 77.883, + "eval_steps_per_second": 1.246, + "step": 6800 + }, + { + "epoch": 1.31, + "learning_rate": 0.0001704313170919843, + "loss": 0.7362, + "step": 6820 + }, + { + "epoch": 1.31, + "learning_rate": 0.00017004563861927103, + "loss": 0.7293, + "step": 6840 + }, + { + "epoch": 1.31, + "learning_rate": 0.0001696599601465578, + "loss": 0.7286, + "step": 6860 + }, + { + "epoch": 1.32, + "learning_rate": 0.00016927428167384456, + "loss": 0.7148, + "step": 6880 + }, + { + "epoch": 1.32, + "learning_rate": 0.0001688886032011313, + "loss": 0.72, + "step": 6900 + }, + { + "epoch": 1.33, + "learning_rate": 0.00016850292472841806, + "loss": 0.7239, + "step": 6920 + }, + { + "epoch": 1.33, + "learning_rate": 0.0001681172462557048, + "loss": 0.726, + "step": 6940 + }, + { + "epoch": 1.33, + "learning_rate": 0.00016773156778299156, + "loss": 0.7286, + "step": 6960 + }, + { + "epoch": 1.34, + "learning_rate": 0.00016734588931027834, + "loss": 0.7276, + "step": 6980 + }, + { + "epoch": 1.34, + "learning_rate": 0.00016696021083756506, + "loss": 0.7258, + "step": 7000 + }, + { + "epoch": 1.34, + "eval_loss": 0.7521764636039734, + "eval_runtime": 25.5237, + "eval_samples_per_second": 78.359, + "eval_steps_per_second": 1.254, + "step": 7000 + }, + { + "epoch": 1.35, + "learning_rate": 0.0001665745323648518, + "loss": 0.7326, + "step": 7020 + }, + { + "epoch": 1.35, + "learning_rate": 0.00016618885389213859, + "loss": 0.7311, + "step": 7040 + }, + { + "epoch": 1.35, + "learning_rate": 0.0001658031754194253, + "loss": 0.7295, + "step": 7060 + }, + { + "epoch": 1.36, + "learning_rate": 0.00016541749694671208, + "loss": 0.7279, + "step": 7080 + }, + { + "epoch": 1.36, + "learning_rate": 0.00016503181847399883, + "loss": 0.7293, + "step": 7100 + }, + { + "epoch": 1.36, + "learning_rate": 0.00016464614000128558, + "loss": 0.7256, + "step": 7120 + }, + { + "epoch": 1.37, + "learning_rate": 0.00016426046152857233, + "loss": 0.7204, + "step": 7140 + }, + { + "epoch": 1.37, + "learning_rate": 0.00016387478305585908, + "loss": 0.718, + "step": 7160 + }, + { + "epoch": 1.38, + "learning_rate": 0.00016348910458314583, + "loss": 0.7206, + "step": 7180 + }, + { + "epoch": 1.38, + "learning_rate": 0.0001631034261104326, + "loss": 0.7282, + "step": 7200 + }, + { + "epoch": 1.38, + "eval_loss": 0.7514960765838623, + "eval_runtime": 25.5249, + "eval_samples_per_second": 78.355, + "eval_steps_per_second": 1.254, + "step": 7200 + }, + { + "epoch": 1.38, + "learning_rate": 0.00016271774763771933, + "loss": 0.7162, + "step": 7220 + }, + { + "epoch": 1.39, + "learning_rate": 0.00016233206916500608, + "loss": 0.7277, + "step": 7240 + }, + { + "epoch": 1.39, + "learning_rate": 0.00016194639069229286, + "loss": 0.7147, + "step": 7260 + }, + { + "epoch": 1.39, + "learning_rate": 0.00016156071221957958, + "loss": 0.7339, + "step": 7280 + }, + { + "epoch": 1.4, + "learning_rate": 0.00016117503374686636, + "loss": 0.7257, + "step": 7300 + }, + { + "epoch": 1.4, + "learning_rate": 0.00016078935527415308, + "loss": 0.728, + "step": 7320 + }, + { + "epoch": 1.41, + "learning_rate": 0.00016040367680143986, + "loss": 0.7139, + "step": 7340 + }, + { + "epoch": 1.41, + "learning_rate": 0.0001600179983287266, + "loss": 0.7202, + "step": 7360 + }, + { + "epoch": 1.41, + "learning_rate": 0.00015963231985601336, + "loss": 0.7323, + "step": 7380 + }, + { + "epoch": 1.42, + "learning_rate": 0.0001592466413833001, + "loss": 0.7198, + "step": 7400 + }, + { + "epoch": 1.42, + "eval_loss": 0.750492513179779, + "eval_runtime": 25.8887, + "eval_samples_per_second": 77.254, + "eval_steps_per_second": 1.236, + "step": 7400 + }, + { + "epoch": 1.42, + "learning_rate": 0.00015886096291058688, + "loss": 0.7138, + "step": 7420 + }, + { + "epoch": 1.43, + "learning_rate": 0.0001584752844378736, + "loss": 0.7205, + "step": 7440 + }, + { + "epoch": 1.43, + "learning_rate": 0.00015808960596516038, + "loss": 0.7178, + "step": 7460 + }, + { + "epoch": 1.43, + "learning_rate": 0.0001577039274924471, + "loss": 0.7251, + "step": 7480 + }, + { + "epoch": 1.44, + "learning_rate": 0.00015731824901973386, + "loss": 0.7187, + "step": 7500 + }, + { + "epoch": 1.44, + "learning_rate": 0.00015693257054702063, + "loss": 0.7238, + "step": 7520 + }, + { + "epoch": 1.44, + "learning_rate": 0.00015654689207430736, + "loss": 0.7283, + "step": 7540 + }, + { + "epoch": 1.45, + "learning_rate": 0.00015616121360159413, + "loss": 0.7189, + "step": 7560 + }, + { + "epoch": 1.45, + "learning_rate": 0.00015577553512888088, + "loss": 0.7216, + "step": 7580 + }, + { + "epoch": 1.46, + "learning_rate": 0.00015538985665616763, + "loss": 0.7219, + "step": 7600 + }, + { + "epoch": 1.46, + "eval_loss": 0.7496184706687927, + "eval_runtime": 25.4957, + "eval_samples_per_second": 78.445, + "eval_steps_per_second": 1.255, + "step": 7600 + }, + { + "epoch": 1.46, + "learning_rate": 0.00015500417818345438, + "loss": 0.7233, + "step": 7620 + }, + { + "epoch": 1.46, + "learning_rate": 0.0001546184997107411, + "loss": 0.7241, + "step": 7640 + }, + { + "epoch": 1.47, + "learning_rate": 0.00015423282123802788, + "loss": 0.7194, + "step": 7660 + }, + { + "epoch": 1.47, + "learning_rate": 0.00015384714276531466, + "loss": 0.7229, + "step": 7680 + }, + { + "epoch": 1.48, + "learning_rate": 0.00015346146429260138, + "loss": 0.7219, + "step": 7700 + }, + { + "epoch": 1.48, + "learning_rate": 0.00015307578581988813, + "loss": 0.7027, + "step": 7720 + }, + { + "epoch": 1.48, + "learning_rate": 0.0001526901073471749, + "loss": 0.7171, + "step": 7740 + }, + { + "epoch": 1.49, + "learning_rate": 0.00015230442887446163, + "loss": 0.7193, + "step": 7760 + }, + { + "epoch": 1.49, + "learning_rate": 0.0001519187504017484, + "loss": 0.7269, + "step": 7780 + }, + { + "epoch": 1.49, + "learning_rate": 0.00015153307192903513, + "loss": 0.7171, + "step": 7800 + }, + { + "epoch": 1.49, + "eval_loss": 0.7494381070137024, + "eval_runtime": 25.5318, + "eval_samples_per_second": 78.334, + "eval_steps_per_second": 1.253, + "step": 7800 + }, + { + "epoch": 1.5, + "learning_rate": 0.0001511473934563219, + "loss": 0.7186, + "step": 7820 + }, + { + "epoch": 1.5, + "learning_rate": 0.00015076171498360866, + "loss": 0.7137, + "step": 7840 + }, + { + "epoch": 1.51, + "learning_rate": 0.00015037603651089538, + "loss": 0.7212, + "step": 7860 + }, + { + "epoch": 1.51, + "learning_rate": 0.00014999035803818216, + "loss": 0.7167, + "step": 7880 + }, + { + "epoch": 1.51, + "learning_rate": 0.0001496046795654689, + "loss": 0.7203, + "step": 7900 + }, + { + "epoch": 1.52, + "learning_rate": 0.00014921900109275566, + "loss": 0.714, + "step": 7920 + }, + { + "epoch": 1.52, + "learning_rate": 0.0001488333226200424, + "loss": 0.7153, + "step": 7940 + }, + { + "epoch": 1.53, + "learning_rate": 0.00014844764414732916, + "loss": 0.7176, + "step": 7960 + }, + { + "epoch": 1.53, + "learning_rate": 0.0001480619656746159, + "loss": 0.7049, + "step": 7980 + }, + { + "epoch": 1.53, + "learning_rate": 0.00014767628720190265, + "loss": 0.7204, + "step": 8000 + }, + { + "epoch": 1.53, + "eval_loss": 0.7486086487770081, + "eval_runtime": 25.5275, + "eval_samples_per_second": 78.347, + "eval_steps_per_second": 1.254, + "step": 8000 + }, + { + "epoch": 1.54, + "learning_rate": 0.00014729060872918943, + "loss": 0.7167, + "step": 8020 + }, + { + "epoch": 1.54, + "learning_rate": 0.00014690493025647618, + "loss": 0.72, + "step": 8040 + }, + { + "epoch": 1.54, + "learning_rate": 0.00014651925178376293, + "loss": 0.7203, + "step": 8060 + }, + { + "epoch": 1.55, + "learning_rate": 0.00014613357331104968, + "loss": 0.7258, + "step": 8080 + }, + { + "epoch": 1.55, + "learning_rate": 0.00014574789483833643, + "loss": 0.715, + "step": 8100 + }, + { + "epoch": 1.56, + "learning_rate": 0.00014536221636562318, + "loss": 0.7245, + "step": 8120 + }, + { + "epoch": 1.56, + "learning_rate": 0.00014497653789290993, + "loss": 0.7258, + "step": 8140 + }, + { + "epoch": 1.56, + "learning_rate": 0.00014459085942019668, + "loss": 0.7234, + "step": 8160 + }, + { + "epoch": 1.57, + "learning_rate": 0.00014420518094748343, + "loss": 0.7128, + "step": 8180 + }, + { + "epoch": 1.57, + "learning_rate": 0.00014381950247477018, + "loss": 0.7181, + "step": 8200 + }, + { + "epoch": 1.57, + "eval_loss": 0.7475513219833374, + "eval_runtime": 25.5412, + "eval_samples_per_second": 78.305, + "eval_steps_per_second": 1.253, + "step": 8200 + }, + { + "epoch": 1.58, + "learning_rate": 0.00014343382400205693, + "loss": 0.7236, + "step": 8220 + }, + { + "epoch": 1.58, + "learning_rate": 0.0001430481455293437, + "loss": 0.7125, + "step": 8240 + }, + { + "epoch": 1.58, + "learning_rate": 0.00014266246705663046, + "loss": 0.7186, + "step": 8260 + }, + { + "epoch": 1.59, + "learning_rate": 0.0001422767885839172, + "loss": 0.7203, + "step": 8280 + }, + { + "epoch": 1.59, + "learning_rate": 0.00014189111011120396, + "loss": 0.7156, + "step": 8300 + }, + { + "epoch": 1.59, + "learning_rate": 0.0001415054316384907, + "loss": 0.714, + "step": 8320 + }, + { + "epoch": 1.6, + "learning_rate": 0.00014111975316577745, + "loss": 0.7129, + "step": 8340 + }, + { + "epoch": 1.6, + "learning_rate": 0.0001407340746930642, + "loss": 0.7179, + "step": 8360 + }, + { + "epoch": 1.61, + "learning_rate": 0.00014036768014398662, + "loss": 0.7197, + "step": 8380 + }, + { + "epoch": 1.61, + "learning_rate": 0.00013998200167127337, + "loss": 0.7287, + "step": 8400 + }, + { + "epoch": 1.61, + "eval_loss": 0.7470650672912598, + "eval_runtime": 25.5238, + "eval_samples_per_second": 78.358, + "eval_steps_per_second": 1.254, + "step": 8400 + }, + { + "epoch": 1.61, + "learning_rate": 0.00013959632319856011, + "loss": 0.718, + "step": 8420 + }, + { + "epoch": 1.62, + "learning_rate": 0.00013921064472584686, + "loss": 0.7166, + "step": 8440 + }, + { + "epoch": 1.62, + "learning_rate": 0.00013882496625313361, + "loss": 0.7218, + "step": 8460 + }, + { + "epoch": 1.62, + "learning_rate": 0.0001384392877804204, + "loss": 0.723, + "step": 8480 + }, + { + "epoch": 1.63, + "learning_rate": 0.00013805360930770714, + "loss": 0.7104, + "step": 8500 + }, + { + "epoch": 1.63, + "learning_rate": 0.0001376679308349939, + "loss": 0.7136, + "step": 8520 + }, + { + "epoch": 1.64, + "learning_rate": 0.0001372822523622806, + "loss": 0.7237, + "step": 8540 + }, + { + "epoch": 1.64, + "learning_rate": 0.0001368965738895674, + "loss": 0.7196, + "step": 8560 + }, + { + "epoch": 1.64, + "learning_rate": 0.00013651089541685414, + "loss": 0.7218, + "step": 8580 + }, + { + "epoch": 1.65, + "learning_rate": 0.0001361252169441409, + "loss": 0.7132, + "step": 8600 + }, + { + "epoch": 1.65, + "eval_loss": 0.7465201020240784, + "eval_runtime": 25.542, + "eval_samples_per_second": 78.302, + "eval_steps_per_second": 1.253, + "step": 8600 + }, + { + "epoch": 1.65, + "learning_rate": 0.00013573953847142764, + "loss": 0.7139, + "step": 8620 + }, + { + "epoch": 1.66, + "learning_rate": 0.0001353538599987144, + "loss": 0.7093, + "step": 8640 + }, + { + "epoch": 1.66, + "learning_rate": 0.00013496818152600114, + "loss": 0.7243, + "step": 8660 + }, + { + "epoch": 1.66, + "learning_rate": 0.0001345825030532879, + "loss": 0.7127, + "step": 8680 + }, + { + "epoch": 1.67, + "learning_rate": 0.00013419682458057464, + "loss": 0.7148, + "step": 8700 + }, + { + "epoch": 1.67, + "learning_rate": 0.00013381114610786142, + "loss": 0.7236, + "step": 8720 + }, + { + "epoch": 1.67, + "learning_rate": 0.00013342546763514817, + "loss": 0.7103, + "step": 8740 + }, + { + "epoch": 1.68, + "learning_rate": 0.0001330397891624349, + "loss": 0.7133, + "step": 8760 + }, + { + "epoch": 1.68, + "learning_rate": 0.00013265411068972164, + "loss": 0.7182, + "step": 8780 + }, + { + "epoch": 1.69, + "learning_rate": 0.00013226843221700841, + "loss": 0.7198, + "step": 8800 + }, + { + "epoch": 1.69, + "eval_loss": 0.7450763583183289, + "eval_runtime": 25.4725, + "eval_samples_per_second": 78.516, + "eval_steps_per_second": 1.256, + "step": 8800 + }, + { + "epoch": 1.69, + "learning_rate": 0.00013188275374429516, + "loss": 0.7073, + "step": 8820 + }, + { + "epoch": 1.69, + "learning_rate": 0.00013149707527158191, + "loss": 0.7208, + "step": 8840 + }, + { + "epoch": 1.7, + "learning_rate": 0.00013111139679886866, + "loss": 0.7067, + "step": 8860 + }, + { + "epoch": 1.7, + "learning_rate": 0.0001307257183261554, + "loss": 0.7149, + "step": 8880 + }, + { + "epoch": 1.71, + "learning_rate": 0.00013034003985344216, + "loss": 0.7133, + "step": 8900 + }, + { + "epoch": 1.71, + "learning_rate": 0.0001299543613807289, + "loss": 0.7137, + "step": 8920 + }, + { + "epoch": 1.71, + "learning_rate": 0.0001295686829080157, + "loss": 0.719, + "step": 8940 + }, + { + "epoch": 1.72, + "learning_rate": 0.00012918300443530244, + "loss": 0.7198, + "step": 8960 + }, + { + "epoch": 1.72, + "learning_rate": 0.0001287973259625892, + "loss": 0.7074, + "step": 8980 + }, + { + "epoch": 1.72, + "learning_rate": 0.0001284116474898759, + "loss": 0.7216, + "step": 9000 + }, + { + "epoch": 1.72, + "eval_loss": 0.7454459071159363, + "eval_runtime": 25.5717, + "eval_samples_per_second": 78.211, + "eval_steps_per_second": 1.251, + "step": 9000 + }, + { + "epoch": 1.73, + "learning_rate": 0.0001280259690171627, + "loss": 0.7203, + "step": 9020 + }, + { + "epoch": 1.73, + "learning_rate": 0.00012764029054444944, + "loss": 0.7133, + "step": 9040 + }, + { + "epoch": 1.74, + "learning_rate": 0.0001272546120717362, + "loss": 0.7081, + "step": 9060 + }, + { + "epoch": 1.74, + "learning_rate": 0.00012686893359902294, + "loss": 0.7153, + "step": 9080 + }, + { + "epoch": 1.74, + "learning_rate": 0.0001264832551263097, + "loss": 0.7108, + "step": 9100 + }, + { + "epoch": 1.75, + "learning_rate": 0.00012609757665359644, + "loss": 0.7106, + "step": 9120 + }, + { + "epoch": 1.75, + "learning_rate": 0.0001257118981808832, + "loss": 0.7117, + "step": 9140 + }, + { + "epoch": 1.76, + "learning_rate": 0.00012532621970816994, + "loss": 0.7171, + "step": 9160 + }, + { + "epoch": 1.76, + "learning_rate": 0.00012494054123545671, + "loss": 0.7148, + "step": 9180 + }, + { + "epoch": 1.76, + "learning_rate": 0.00012455486276274346, + "loss": 0.714, + "step": 9200 + }, + { + "epoch": 1.76, + "eval_loss": 0.7446411848068237, + "eval_runtime": 25.5622, + "eval_samples_per_second": 78.24, + "eval_steps_per_second": 1.252, + "step": 9200 + }, + { + "epoch": 1.77, + "learning_rate": 0.00012416918429003019, + "loss": 0.7133, + "step": 9220 + }, + { + "epoch": 1.77, + "learning_rate": 0.00012378350581731694, + "loss": 0.7108, + "step": 9240 + }, + { + "epoch": 1.77, + "learning_rate": 0.0001233978273446037, + "loss": 0.7147, + "step": 9260 + }, + { + "epoch": 1.78, + "learning_rate": 0.00012301214887189046, + "loss": 0.715, + "step": 9280 + }, + { + "epoch": 1.78, + "learning_rate": 0.0001226264703991772, + "loss": 0.7255, + "step": 9300 + }, + { + "epoch": 1.79, + "learning_rate": 0.00012224079192646396, + "loss": 0.7168, + "step": 9320 + }, + { + "epoch": 1.79, + "learning_rate": 0.00012185511345375073, + "loss": 0.7155, + "step": 9340 + }, + { + "epoch": 1.79, + "learning_rate": 0.00012146943498103746, + "loss": 0.7064, + "step": 9360 + }, + { + "epoch": 1.8, + "learning_rate": 0.00012108375650832421, + "loss": 0.716, + "step": 9380 + }, + { + "epoch": 1.8, + "learning_rate": 0.00012069807803561096, + "loss": 0.7145, + "step": 9400 + }, + { + "epoch": 1.8, + "eval_loss": 0.7441000938415527, + "eval_runtime": 25.7378, + "eval_samples_per_second": 77.707, + "eval_steps_per_second": 1.243, + "step": 9400 + }, + { + "epoch": 1.8, + "learning_rate": 0.00012031239956289772, + "loss": 0.7135, + "step": 9420 + }, + { + "epoch": 1.81, + "learning_rate": 0.00011992672109018447, + "loss": 0.7164, + "step": 9440 + }, + { + "epoch": 1.81, + "learning_rate": 0.00011954104261747122, + "loss": 0.714, + "step": 9460 + }, + { + "epoch": 1.82, + "learning_rate": 0.00011915536414475797, + "loss": 0.7173, + "step": 9480 + }, + { + "epoch": 1.82, + "learning_rate": 0.00011876968567204474, + "loss": 0.7102, + "step": 9500 + }, + { + "epoch": 1.82, + "learning_rate": 0.00011838400719933149, + "loss": 0.7122, + "step": 9520 + }, + { + "epoch": 1.83, + "learning_rate": 0.00011799832872661822, + "loss": 0.7197, + "step": 9540 + }, + { + "epoch": 1.83, + "learning_rate": 0.00011761265025390497, + "loss": 0.7132, + "step": 9560 + }, + { + "epoch": 1.84, + "learning_rate": 0.00011722697178119174, + "loss": 0.7255, + "step": 9580 + }, + { + "epoch": 1.84, + "learning_rate": 0.00011684129330847849, + "loss": 0.7175, + "step": 9600 + }, + { + "epoch": 1.84, + "eval_loss": 0.7432481646537781, + "eval_runtime": 25.5076, + "eval_samples_per_second": 78.408, + "eval_steps_per_second": 1.255, + "step": 9600 + }, + { + "epoch": 1.84, + "learning_rate": 0.00011645561483576524, + "loss": 0.7125, + "step": 9620 + }, + { + "epoch": 1.85, + "learning_rate": 0.00011606993636305199, + "loss": 0.7119, + "step": 9640 + }, + { + "epoch": 1.85, + "learning_rate": 0.00011568425789033875, + "loss": 0.7147, + "step": 9660 + }, + { + "epoch": 1.85, + "learning_rate": 0.0001152985794176255, + "loss": 0.7101, + "step": 9680 + }, + { + "epoch": 1.86, + "learning_rate": 0.00011491290094491225, + "loss": 0.7105, + "step": 9700 + }, + { + "epoch": 1.86, + "learning_rate": 0.000114527222472199, + "loss": 0.7153, + "step": 9720 + }, + { + "epoch": 1.87, + "learning_rate": 0.00011414154399948576, + "loss": 0.7047, + "step": 9740 + }, + { + "epoch": 1.87, + "learning_rate": 0.00011375586552677251, + "loss": 0.6967, + "step": 9760 + }, + { + "epoch": 1.87, + "learning_rate": 0.00011337018705405925, + "loss": 0.7094, + "step": 9780 + }, + { + "epoch": 1.88, + "learning_rate": 0.000112984508581346, + "loss": 0.7195, + "step": 9800 + }, + { + "epoch": 1.88, + "eval_loss": 0.7431700229644775, + "eval_runtime": 25.5752, + "eval_samples_per_second": 78.201, + "eval_steps_per_second": 1.251, + "step": 9800 + }, + { + "epoch": 1.88, + "learning_rate": 0.00011259883010863276, + "loss": 0.7122, + "step": 9820 + }, + { + "epoch": 1.89, + "learning_rate": 0.00011221315163591951, + "loss": 0.7193, + "step": 9840 + }, + { + "epoch": 1.89, + "learning_rate": 0.00011182747316320626, + "loss": 0.7147, + "step": 9860 + }, + { + "epoch": 1.89, + "learning_rate": 0.00011144179469049301, + "loss": 0.7058, + "step": 9880 + }, + { + "epoch": 1.9, + "learning_rate": 0.00011105611621777977, + "loss": 0.7106, + "step": 9900 + }, + { + "epoch": 1.9, + "learning_rate": 0.00011067043774506652, + "loss": 0.71, + "step": 9920 + }, + { + "epoch": 1.9, + "learning_rate": 0.00011028475927235327, + "loss": 0.7182, + "step": 9940 + }, + { + "epoch": 1.91, + "learning_rate": 0.00010989908079964002, + "loss": 0.7048, + "step": 9960 + }, + { + "epoch": 1.91, + "learning_rate": 0.00010951340232692679, + "loss": 0.7165, + "step": 9980 + }, + { + "epoch": 1.92, + "learning_rate": 0.00010912772385421352, + "loss": 0.7153, + "step": 10000 + }, + { + "epoch": 1.92, + "eval_loss": 0.7425808310508728, + "eval_runtime": 25.6297, + "eval_samples_per_second": 78.034, + "eval_steps_per_second": 1.249, + "step": 10000 + }, + { + "epoch": 1.92, + "learning_rate": 0.00010874204538150027, + "loss": 0.7127, + "step": 10020 + }, + { + "epoch": 1.92, + "learning_rate": 0.00010835636690878702, + "loss": 0.7062, + "step": 10040 + }, + { + "epoch": 1.93, + "learning_rate": 0.00010797068843607378, + "loss": 0.7125, + "step": 10060 + }, + { + "epoch": 1.93, + "learning_rate": 0.00010758500996336053, + "loss": 0.7114, + "step": 10080 + }, + { + "epoch": 1.94, + "learning_rate": 0.00010719933149064728, + "loss": 0.7096, + "step": 10100 + }, + { + "epoch": 1.94, + "learning_rate": 0.00010681365301793405, + "loss": 0.7119, + "step": 10120 + }, + { + "epoch": 1.94, + "learning_rate": 0.0001064279745452208, + "loss": 0.7034, + "step": 10140 + }, + { + "epoch": 1.95, + "learning_rate": 0.00010604229607250755, + "loss": 0.7049, + "step": 10160 + }, + { + "epoch": 1.95, + "learning_rate": 0.0001056566175997943, + "loss": 0.7156, + "step": 10180 + }, + { + "epoch": 1.95, + "learning_rate": 0.00010527093912708106, + "loss": 0.718, + "step": 10200 + }, + { + "epoch": 1.95, + "eval_loss": 0.7418650984764099, + "eval_runtime": 25.554, + "eval_samples_per_second": 78.266, + "eval_steps_per_second": 1.252, + "step": 10200 + }, + { + "epoch": 1.96, + "learning_rate": 0.00010488526065436781, + "loss": 0.7141, + "step": 10220 + }, + { + "epoch": 1.96, + "learning_rate": 0.00010449958218165455, + "loss": 0.7073, + "step": 10240 + }, + { + "epoch": 1.97, + "learning_rate": 0.0001041139037089413, + "loss": 0.7129, + "step": 10260 + }, + { + "epoch": 1.97, + "learning_rate": 0.00010372822523622806, + "loss": 0.7174, + "step": 10280 + }, + { + "epoch": 1.97, + "learning_rate": 0.00010334254676351481, + "loss": 0.7112, + "step": 10300 + }, + { + "epoch": 1.98, + "learning_rate": 0.00010295686829080156, + "loss": 0.7073, + "step": 10320 + }, + { + "epoch": 1.98, + "learning_rate": 0.00010257118981808831, + "loss": 0.7164, + "step": 10340 + }, + { + "epoch": 1.99, + "learning_rate": 0.00010218551134537507, + "loss": 0.7057, + "step": 10360 + }, + { + "epoch": 1.99, + "learning_rate": 0.00010179983287266182, + "loss": 0.709, + "step": 10380 + }, + { + "epoch": 1.99, + "learning_rate": 0.00010141415439994857, + "loss": 0.7147, + "step": 10400 + }, + { + "epoch": 1.99, + "eval_loss": 0.7417293787002563, + "eval_runtime": 25.4964, + "eval_samples_per_second": 78.443, + "eval_steps_per_second": 1.255, + "step": 10400 + }, + { + "epoch": 2.0, + "learning_rate": 0.00010102847592723531, + "loss": 0.713, + "step": 10420 + }, + { + "epoch": 2.0, + "learning_rate": 0.00010064279745452208, + "loss": 0.7128, + "step": 10440 + }, + { + "epoch": 2.0, + "learning_rate": 0.00010025711898180882, + "loss": 0.7094, + "step": 10460 + }, + { + "epoch": 2.01, + "learning_rate": 9.987144050909557e-05, + "loss": 0.7008, + "step": 10480 + }, + { + "epoch": 2.01, + "learning_rate": 9.948576203638232e-05, + "loss": 0.7083, + "step": 10500 + }, + { + "epoch": 2.02, + "learning_rate": 9.910008356366908e-05, + "loss": 0.7049, + "step": 10520 + }, + { + "epoch": 2.02, + "learning_rate": 9.871440509095583e-05, + "loss": 0.7041, + "step": 10540 + }, + { + "epoch": 2.02, + "learning_rate": 9.834801054187824e-05, + "loss": 0.7105, + "step": 10560 + }, + { + "epoch": 2.03, + "learning_rate": 9.7962332069165e-05, + "loss": 0.7041, + "step": 10580 + }, + { + "epoch": 2.03, + "learning_rate": 9.757665359645176e-05, + "loss": 0.7103, + "step": 10600 + }, + { + "epoch": 2.03, + "eval_loss": 0.7410894632339478, + "eval_runtime": 25.5424, + "eval_samples_per_second": 78.301, + "eval_steps_per_second": 1.253, + "step": 10600 + }, + { + "epoch": 2.03, + "learning_rate": 9.71909751237385e-05, + "loss": 0.7037, + "step": 10620 + }, + { + "epoch": 2.04, + "learning_rate": 9.680529665102524e-05, + "loss": 0.7078, + "step": 10640 + }, + { + "epoch": 2.04, + "learning_rate": 9.641961817831202e-05, + "loss": 0.7116, + "step": 10660 + }, + { + "epoch": 2.05, + "learning_rate": 9.603393970559876e-05, + "loss": 0.7094, + "step": 10680 + }, + { + "epoch": 2.05, + "learning_rate": 9.56482612328855e-05, + "loss": 0.7217, + "step": 10700 + }, + { + "epoch": 2.05, + "learning_rate": 9.526258276017226e-05, + "loss": 0.7038, + "step": 10720 + }, + { + "epoch": 2.06, + "learning_rate": 9.487690428745902e-05, + "loss": 0.7131, + "step": 10740 + }, + { + "epoch": 2.06, + "learning_rate": 9.449122581474577e-05, + "loss": 0.7051, + "step": 10760 + }, + { + "epoch": 2.07, + "learning_rate": 9.410554734203252e-05, + "loss": 0.7058, + "step": 10780 + }, + { + "epoch": 2.07, + "learning_rate": 9.371986886931927e-05, + "loss": 0.7039, + "step": 10800 + }, + { + "epoch": 2.07, + "eval_loss": 0.7405736446380615, + "eval_runtime": 25.7467, + "eval_samples_per_second": 77.68, + "eval_steps_per_second": 1.243, + "step": 10800 + }, + { + "epoch": 2.07, + "learning_rate": 9.333419039660603e-05, + "loss": 0.7101, + "step": 10820 + }, + { + "epoch": 2.08, + "learning_rate": 9.294851192389278e-05, + "loss": 0.6991, + "step": 10840 + }, + { + "epoch": 2.08, + "learning_rate": 9.256283345117953e-05, + "loss": 0.7069, + "step": 10860 + }, + { + "epoch": 2.08, + "learning_rate": 9.217715497846627e-05, + "loss": 0.7094, + "step": 10880 + }, + { + "epoch": 2.09, + "learning_rate": 9.179147650575303e-05, + "loss": 0.7103, + "step": 10900 + }, + { + "epoch": 2.09, + "learning_rate": 9.140579803303978e-05, + "loss": 0.7015, + "step": 10920 + }, + { + "epoch": 2.1, + "learning_rate": 9.102011956032653e-05, + "loss": 0.712, + "step": 10940 + }, + { + "epoch": 2.1, + "learning_rate": 9.063444108761328e-05, + "loss": 0.707, + "step": 10960 + }, + { + "epoch": 2.1, + "learning_rate": 9.024876261490004e-05, + "loss": 0.7009, + "step": 10980 + }, + { + "epoch": 2.11, + "learning_rate": 8.986308414218679e-05, + "loss": 0.7062, + "step": 11000 + }, + { + "epoch": 2.11, + "eval_loss": 0.7398320436477661, + "eval_runtime": 25.5459, + "eval_samples_per_second": 78.29, + "eval_steps_per_second": 1.253, + "step": 11000 + }, + { + "epoch": 2.11, + "learning_rate": 8.947740566947354e-05, + "loss": 0.7054, + "step": 11020 + }, + { + "epoch": 2.12, + "learning_rate": 8.909172719676029e-05, + "loss": 0.7094, + "step": 11040 + }, + { + "epoch": 2.12, + "learning_rate": 8.870604872404706e-05, + "loss": 0.7059, + "step": 11060 + }, + { + "epoch": 2.12, + "learning_rate": 8.83203702513338e-05, + "loss": 0.7202, + "step": 11080 + }, + { + "epoch": 2.13, + "learning_rate": 8.793469177862054e-05, + "loss": 0.699, + "step": 11100 + }, + { + "epoch": 2.13, + "learning_rate": 8.754901330590729e-05, + "loss": 0.7137, + "step": 11120 + }, + { + "epoch": 2.13, + "learning_rate": 8.716333483319405e-05, + "loss": 0.7048, + "step": 11140 + }, + { + "epoch": 2.14, + "learning_rate": 8.67776563604808e-05, + "loss": 0.7089, + "step": 11160 + }, + { + "epoch": 2.14, + "learning_rate": 8.639197788776755e-05, + "loss": 0.7057, + "step": 11180 + }, + { + "epoch": 2.15, + "learning_rate": 8.60062994150543e-05, + "loss": 0.709, + "step": 11200 + }, + { + "epoch": 2.15, + "eval_loss": 0.7393301725387573, + "eval_runtime": 25.7257, + "eval_samples_per_second": 77.743, + "eval_steps_per_second": 1.244, + "step": 11200 + }, + { + "epoch": 2.15, + "learning_rate": 8.562062094234107e-05, + "loss": 0.7027, + "step": 11220 + }, + { + "epoch": 2.15, + "learning_rate": 8.523494246962782e-05, + "loss": 0.7082, + "step": 11240 + }, + { + "epoch": 2.16, + "learning_rate": 8.484926399691457e-05, + "loss": 0.7007, + "step": 11260 + }, + { + "epoch": 2.16, + "learning_rate": 8.446358552420132e-05, + "loss": 0.7011, + "step": 11280 + }, + { + "epoch": 2.17, + "learning_rate": 8.407790705148808e-05, + "loss": 0.7067, + "step": 11300 + }, + { + "epoch": 2.17, + "learning_rate": 8.369222857877483e-05, + "loss": 0.702, + "step": 11320 + }, + { + "epoch": 2.17, + "learning_rate": 8.330655010606157e-05, + "loss": 0.7126, + "step": 11340 + }, + { + "epoch": 2.18, + "learning_rate": 8.292087163334832e-05, + "loss": 0.6947, + "step": 11360 + }, + { + "epoch": 2.18, + "learning_rate": 8.253519316063508e-05, + "loss": 0.7033, + "step": 11380 + }, + { + "epoch": 2.18, + "learning_rate": 8.214951468792183e-05, + "loss": 0.7075, + "step": 11400 + }, + { + "epoch": 2.18, + "eval_loss": 0.7390503883361816, + "eval_runtime": 25.6097, + "eval_samples_per_second": 78.095, + "eval_steps_per_second": 1.25, + "step": 11400 + }, + { + "epoch": 2.19, + "learning_rate": 8.176383621520858e-05, + "loss": 0.7081, + "step": 11420 + }, + { + "epoch": 2.19, + "learning_rate": 8.137815774249533e-05, + "loss": 0.7114, + "step": 11440 + }, + { + "epoch": 2.2, + "learning_rate": 8.099247926978209e-05, + "loss": 0.7105, + "step": 11460 + }, + { + "epoch": 2.2, + "learning_rate": 8.060680079706884e-05, + "loss": 0.7113, + "step": 11480 + }, + { + "epoch": 2.2, + "learning_rate": 8.022112232435559e-05, + "loss": 0.7109, + "step": 11500 + }, + { + "epoch": 2.21, + "learning_rate": 7.983544385164233e-05, + "loss": 0.7039, + "step": 11520 + }, + { + "epoch": 2.21, + "learning_rate": 7.94497653789291e-05, + "loss": 0.7144, + "step": 11540 + }, + { + "epoch": 2.21, + "learning_rate": 7.906408690621584e-05, + "loss": 0.7003, + "step": 11560 + }, + { + "epoch": 2.22, + "learning_rate": 7.867840843350259e-05, + "loss": 0.7028, + "step": 11580 + }, + { + "epoch": 2.22, + "learning_rate": 7.829272996078934e-05, + "loss": 0.7018, + "step": 11600 + }, + { + "epoch": 2.22, + "eval_loss": 0.7388148307800293, + "eval_runtime": 25.5069, + "eval_samples_per_second": 78.41, + "eval_steps_per_second": 1.255, + "step": 11600 + }, + { + "epoch": 2.23, + "learning_rate": 7.79070514880761e-05, + "loss": 0.7113, + "step": 11620 + }, + { + "epoch": 2.23, + "learning_rate": 7.752137301536285e-05, + "loss": 0.7136, + "step": 11640 + }, + { + "epoch": 2.23, + "learning_rate": 7.71356945426496e-05, + "loss": 0.7097, + "step": 11660 + }, + { + "epoch": 2.24, + "learning_rate": 7.675001606993635e-05, + "loss": 0.7057, + "step": 11680 + }, + { + "epoch": 2.24, + "learning_rate": 7.636433759722312e-05, + "loss": 0.7028, + "step": 11700 + }, + { + "epoch": 2.25, + "learning_rate": 7.597865912450986e-05, + "loss": 0.708, + "step": 11720 + }, + { + "epoch": 2.25, + "learning_rate": 7.559298065179661e-05, + "loss": 0.7088, + "step": 11740 + }, + { + "epoch": 2.25, + "learning_rate": 7.520730217908335e-05, + "loss": 0.7024, + "step": 11760 + }, + { + "epoch": 2.26, + "learning_rate": 7.482162370637011e-05, + "loss": 0.7016, + "step": 11780 + }, + { + "epoch": 2.26, + "learning_rate": 7.443594523365686e-05, + "loss": 0.7132, + "step": 11800 + }, + { + "epoch": 2.26, + "eval_loss": 0.7381731271743774, + "eval_runtime": 25.4976, + "eval_samples_per_second": 78.439, + "eval_steps_per_second": 1.255, + "step": 11800 + }, + { + "epoch": 2.26, + "learning_rate": 7.405026676094361e-05, + "loss": 0.6969, + "step": 11820 + }, + { + "epoch": 2.27, + "learning_rate": 7.366458828823038e-05, + "loss": 0.7042, + "step": 11840 + }, + { + "epoch": 2.27, + "learning_rate": 7.327890981551713e-05, + "loss": 0.7088, + "step": 11860 + }, + { + "epoch": 2.28, + "learning_rate": 7.289323134280388e-05, + "loss": 0.7109, + "step": 11880 + }, + { + "epoch": 2.28, + "learning_rate": 7.250755287009063e-05, + "loss": 0.7046, + "step": 11900 + }, + { + "epoch": 2.28, + "learning_rate": 7.212187439737738e-05, + "loss": 0.706, + "step": 11920 + }, + { + "epoch": 2.29, + "learning_rate": 7.173619592466414e-05, + "loss": 0.7045, + "step": 11940 + }, + { + "epoch": 2.29, + "learning_rate": 7.135051745195089e-05, + "loss": 0.7121, + "step": 11960 + }, + { + "epoch": 2.3, + "learning_rate": 7.096483897923764e-05, + "loss": 0.6946, + "step": 11980 + }, + { + "epoch": 2.3, + "learning_rate": 7.057916050652439e-05, + "loss": 0.7003, + "step": 12000 + }, + { + "epoch": 2.3, + "eval_loss": 0.7378225922584534, + "eval_runtime": 25.5221, + "eval_samples_per_second": 78.363, + "eval_steps_per_second": 1.254, + "step": 12000 + }, + { + "epoch": 2.3, + "learning_rate": 7.019348203381114e-05, + "loss": 0.7147, + "step": 12020 + }, + { + "epoch": 2.31, + "learning_rate": 6.980780356109789e-05, + "loss": 0.7066, + "step": 12040 + }, + { + "epoch": 2.31, + "learning_rate": 6.942212508838465e-05, + "loss": 0.6997, + "step": 12060 + }, + { + "epoch": 2.31, + "learning_rate": 6.90364466156714e-05, + "loss": 0.7083, + "step": 12080 + }, + { + "epoch": 2.32, + "learning_rate": 6.865076814295815e-05, + "loss": 0.6991, + "step": 12100 + }, + { + "epoch": 2.32, + "learning_rate": 6.82650896702449e-05, + "loss": 0.6982, + "step": 12120 + }, + { + "epoch": 2.33, + "learning_rate": 6.787941119753165e-05, + "loss": 0.7028, + "step": 12140 + }, + { + "epoch": 2.33, + "learning_rate": 6.74937327248184e-05, + "loss": 0.704, + "step": 12160 + }, + { + "epoch": 2.33, + "learning_rate": 6.710805425210516e-05, + "loss": 0.7084, + "step": 12180 + }, + { + "epoch": 2.34, + "learning_rate": 6.672237577939191e-05, + "loss": 0.7061, + "step": 12200 + }, + { + "epoch": 2.34, + "eval_loss": 0.7376002669334412, + "eval_runtime": 25.5156, + "eval_samples_per_second": 78.383, + "eval_steps_per_second": 1.254, + "step": 12200 + }, + { + "epoch": 2.34, + "learning_rate": 6.633669730667866e-05, + "loss": 0.7017, + "step": 12220 + }, + { + "epoch": 2.35, + "learning_rate": 6.595101883396541e-05, + "loss": 0.6949, + "step": 12240 + }, + { + "epoch": 2.35, + "learning_rate": 6.556534036125216e-05, + "loss": 0.6985, + "step": 12260 + }, + { + "epoch": 2.35, + "learning_rate": 6.517966188853891e-05, + "loss": 0.7075, + "step": 12280 + }, + { + "epoch": 2.36, + "learning_rate": 6.479398341582568e-05, + "loss": 0.6997, + "step": 12300 + }, + { + "epoch": 2.36, + "learning_rate": 6.440830494311241e-05, + "loss": 0.7045, + "step": 12320 + }, + { + "epoch": 2.36, + "learning_rate": 6.402262647039918e-05, + "loss": 0.7148, + "step": 12340 + }, + { + "epoch": 2.37, + "learning_rate": 6.363694799768592e-05, + "loss": 0.7085, + "step": 12360 + }, + { + "epoch": 2.37, + "learning_rate": 6.325126952497267e-05, + "loss": 0.7062, + "step": 12380 + }, + { + "epoch": 2.38, + "learning_rate": 6.286559105225942e-05, + "loss": 0.7092, + "step": 12400 + }, + { + "epoch": 2.38, + "eval_loss": 0.7370800971984863, + "eval_runtime": 25.5432, + "eval_samples_per_second": 78.299, + "eval_steps_per_second": 1.253, + "step": 12400 + }, + { + "epoch": 2.38, + "learning_rate": 6.247991257954619e-05, + "loss": 0.7069, + "step": 12420 + }, + { + "epoch": 2.38, + "learning_rate": 6.209423410683292e-05, + "loss": 0.7083, + "step": 12440 + }, + { + "epoch": 2.39, + "learning_rate": 6.170855563411969e-05, + "loss": 0.7126, + "step": 12460 + }, + { + "epoch": 2.39, + "learning_rate": 6.132287716140644e-05, + "loss": 0.7062, + "step": 12480 + }, + { + "epoch": 2.4, + "learning_rate": 6.0937198688693187e-05, + "loss": 0.7149, + "step": 12500 + }, + { + "epoch": 2.4, + "learning_rate": 6.0551520215979936e-05, + "loss": 0.7111, + "step": 12520 + }, + { + "epoch": 2.4, + "learning_rate": 6.016584174326669e-05, + "loss": 0.7059, + "step": 12540 + }, + { + "epoch": 2.41, + "learning_rate": 5.978016327055344e-05, + "loss": 0.7169, + "step": 12560 + }, + { + "epoch": 2.41, + "learning_rate": 5.93944847978402e-05, + "loss": 0.7052, + "step": 12580 + }, + { + "epoch": 2.41, + "learning_rate": 5.900880632512694e-05, + "loss": 0.7019, + "step": 12600 + }, + { + "epoch": 2.41, + "eval_loss": 0.7369959354400635, + "eval_runtime": 25.5241, + "eval_samples_per_second": 78.357, + "eval_steps_per_second": 1.254, + "step": 12600 + }, + { + "epoch": 2.42, + "learning_rate": 5.86231278524137e-05, + "loss": 0.7026, + "step": 12620 + }, + { + "epoch": 2.42, + "learning_rate": 5.823744937970045e-05, + "loss": 0.6981, + "step": 12640 + }, + { + "epoch": 2.43, + "learning_rate": 5.7851770906987205e-05, + "loss": 0.7044, + "step": 12660 + }, + { + "epoch": 2.43, + "learning_rate": 5.7466092434273955e-05, + "loss": 0.7087, + "step": 12680 + }, + { + "epoch": 2.43, + "learning_rate": 5.708041396156071e-05, + "loss": 0.7039, + "step": 12700 + }, + { + "epoch": 2.44, + "learning_rate": 5.6694735488847454e-05, + "loss": 0.7015, + "step": 12720 + }, + { + "epoch": 2.44, + "learning_rate": 5.630905701613421e-05, + "loss": 0.7053, + "step": 12740 + }, + { + "epoch": 2.44, + "learning_rate": 5.592337854342096e-05, + "loss": 0.7037, + "step": 12760 + }, + { + "epoch": 2.45, + "learning_rate": 5.553770007070772e-05, + "loss": 0.6938, + "step": 12780 + }, + { + "epoch": 2.45, + "learning_rate": 5.515202159799447e-05, + "loss": 0.7063, + "step": 12800 + }, + { + "epoch": 2.45, + "eval_loss": 0.7364639639854431, + "eval_runtime": 25.4856, + "eval_samples_per_second": 78.476, + "eval_steps_per_second": 1.256, + "step": 12800 + }, + { + "epoch": 2.46, + "learning_rate": 5.476634312528122e-05, + "loss": 0.7013, + "step": 12820 + }, + { + "epoch": 2.46, + "learning_rate": 5.4380664652567966e-05, + "loss": 0.7012, + "step": 12840 + }, + { + "epoch": 2.46, + "learning_rate": 5.399498617985472e-05, + "loss": 0.7, + "step": 12860 + }, + { + "epoch": 2.47, + "learning_rate": 5.360930770714147e-05, + "loss": 0.7017, + "step": 12880 + }, + { + "epoch": 2.47, + "learning_rate": 5.322362923442823e-05, + "loss": 0.7145, + "step": 12900 + }, + { + "epoch": 2.48, + "learning_rate": 5.283795076171498e-05, + "loss": 0.7156, + "step": 12920 + }, + { + "epoch": 2.48, + "learning_rate": 5.247155621263739e-05, + "loss": 0.6965, + "step": 12940 + }, + { + "epoch": 2.48, + "learning_rate": 5.2085877739924146e-05, + "loss": 0.7001, + "step": 12960 + }, + { + "epoch": 2.49, + "learning_rate": 5.1700199267210896e-05, + "loss": 0.7012, + "step": 12980 + }, + { + "epoch": 2.49, + "learning_rate": 5.131452079449765e-05, + "loss": 0.6939, + "step": 13000 + }, + { + "epoch": 2.49, + "eval_loss": 0.7364306449890137, + "eval_runtime": 25.5093, + "eval_samples_per_second": 78.403, + "eval_steps_per_second": 1.254, + "step": 13000 + }, + { + "epoch": 2.49, + "learning_rate": 5.09288423217844e-05, + "loss": 0.7084, + "step": 13020 + }, + { + "epoch": 2.5, + "learning_rate": 5.054316384907115e-05, + "loss": 0.6987, + "step": 13040 + }, + { + "epoch": 2.5, + "learning_rate": 5.01574853763579e-05, + "loss": 0.7087, + "step": 13060 + }, + { + "epoch": 2.51, + "learning_rate": 4.977180690364466e-05, + "loss": 0.7028, + "step": 13080 + }, + { + "epoch": 2.51, + "learning_rate": 4.938612843093141e-05, + "loss": 0.7012, + "step": 13100 + }, + { + "epoch": 2.51, + "learning_rate": 4.9000449958218165e-05, + "loss": 0.6959, + "step": 13120 + }, + { + "epoch": 2.52, + "learning_rate": 4.861477148550491e-05, + "loss": 0.7056, + "step": 13140 + }, + { + "epoch": 2.52, + "learning_rate": 4.8229093012791664e-05, + "loss": 0.716, + "step": 13160 + }, + { + "epoch": 2.53, + "learning_rate": 4.7843414540078414e-05, + "loss": 0.7144, + "step": 13180 + }, + { + "epoch": 2.53, + "learning_rate": 4.745773606736517e-05, + "loss": 0.6969, + "step": 13200 + }, + { + "epoch": 2.53, + "eval_loss": 0.7360122203826904, + "eval_runtime": 25.4878, + "eval_samples_per_second": 78.469, + "eval_steps_per_second": 1.256, + "step": 13200 + }, + { + "epoch": 2.53, + "learning_rate": 4.707205759465192e-05, + "loss": 0.6993, + "step": 13220 + }, + { + "epoch": 2.54, + "learning_rate": 4.668637912193868e-05, + "loss": 0.7013, + "step": 13240 + }, + { + "epoch": 2.54, + "learning_rate": 4.630070064922542e-05, + "loss": 0.7033, + "step": 13260 + }, + { + "epoch": 2.54, + "learning_rate": 4.5915022176512176e-05, + "loss": 0.7067, + "step": 13280 + }, + { + "epoch": 2.55, + "learning_rate": 4.5529343703798926e-05, + "loss": 0.6886, + "step": 13300 + }, + { + "epoch": 2.55, + "learning_rate": 4.514366523108568e-05, + "loss": 0.7061, + "step": 13320 + }, + { + "epoch": 2.56, + "learning_rate": 4.475798675837243e-05, + "loss": 0.7027, + "step": 13340 + }, + { + "epoch": 2.56, + "learning_rate": 4.437230828565919e-05, + "loss": 0.6982, + "step": 13360 + }, + { + "epoch": 2.56, + "learning_rate": 4.398662981294593e-05, + "loss": 0.7042, + "step": 13380 + }, + { + "epoch": 2.57, + "learning_rate": 4.360095134023269e-05, + "loss": 0.6956, + "step": 13400 + }, + { + "epoch": 2.57, + "eval_loss": 0.7356610298156738, + "eval_runtime": 25.5629, + "eval_samples_per_second": 78.238, + "eval_steps_per_second": 1.252, + "step": 13400 + }, + { + "epoch": 2.57, + "learning_rate": 4.321527286751944e-05, + "loss": 0.7046, + "step": 13420 + }, + { + "epoch": 2.58, + "learning_rate": 4.2829594394806195e-05, + "loss": 0.7053, + "step": 13440 + }, + { + "epoch": 2.58, + "learning_rate": 4.2443915922092944e-05, + "loss": 0.707, + "step": 13460 + }, + { + "epoch": 2.58, + "learning_rate": 4.20582374493797e-05, + "loss": 0.7123, + "step": 13480 + }, + { + "epoch": 2.59, + "learning_rate": 4.1672558976666444e-05, + "loss": 0.7032, + "step": 13500 + }, + { + "epoch": 2.59, + "learning_rate": 4.12868805039532e-05, + "loss": 0.6942, + "step": 13520 + }, + { + "epoch": 2.59, + "learning_rate": 4.090120203123995e-05, + "loss": 0.6981, + "step": 13540 + }, + { + "epoch": 2.6, + "learning_rate": 4.051552355852671e-05, + "loss": 0.7052, + "step": 13560 + }, + { + "epoch": 2.6, + "learning_rate": 4.012984508581345e-05, + "loss": 0.7044, + "step": 13580 + }, + { + "epoch": 2.61, + "learning_rate": 3.9744166613100206e-05, + "loss": 0.6978, + "step": 13600 + }, + { + "epoch": 2.61, + "eval_loss": 0.7352051734924316, + "eval_runtime": 25.5016, + "eval_samples_per_second": 78.426, + "eval_steps_per_second": 1.255, + "step": 13600 + }, + { + "epoch": 2.61, + "learning_rate": 3.9358488140386956e-05, + "loss": 0.7001, + "step": 13620 + }, + { + "epoch": 2.61, + "learning_rate": 3.897280966767371e-05, + "loss": 0.7065, + "step": 13640 + }, + { + "epoch": 2.62, + "learning_rate": 3.858713119496046e-05, + "loss": 0.6999, + "step": 13660 + }, + { + "epoch": 2.62, + "learning_rate": 3.820145272224722e-05, + "loss": 0.7104, + "step": 13680 + }, + { + "epoch": 2.63, + "learning_rate": 3.781577424953396e-05, + "loss": 0.7079, + "step": 13700 + }, + { + "epoch": 2.63, + "learning_rate": 3.743009577682072e-05, + "loss": 0.7059, + "step": 13720 + }, + { + "epoch": 2.63, + "learning_rate": 3.7063701227743136e-05, + "loss": 0.7088, + "step": 13740 + }, + { + "epoch": 2.64, + "learning_rate": 3.6678022755029886e-05, + "loss": 0.7051, + "step": 13760 + }, + { + "epoch": 2.64, + "learning_rate": 3.629234428231664e-05, + "loss": 0.7004, + "step": 13780 + }, + { + "epoch": 2.64, + "learning_rate": 3.590666580960339e-05, + "loss": 0.7, + "step": 13800 + }, + { + "epoch": 2.64, + "eval_loss": 0.7350977659225464, + "eval_runtime": 25.4618, + "eval_samples_per_second": 78.549, + "eval_steps_per_second": 1.257, + "step": 13800 + }, + { + "epoch": 2.65, + "learning_rate": 3.552098733689014e-05, + "loss": 0.7044, + "step": 13820 + }, + { + "epoch": 2.65, + "learning_rate": 3.51353088641769e-05, + "loss": 0.6967, + "step": 13840 + }, + { + "epoch": 2.66, + "learning_rate": 3.474963039146365e-05, + "loss": 0.6932, + "step": 13860 + }, + { + "epoch": 2.66, + "learning_rate": 3.43639519187504e-05, + "loss": 0.6982, + "step": 13880 + }, + { + "epoch": 2.66, + "learning_rate": 3.3978273446037154e-05, + "loss": 0.7064, + "step": 13900 + }, + { + "epoch": 2.67, + "learning_rate": 3.3592594973323904e-05, + "loss": 0.7064, + "step": 13920 + }, + { + "epoch": 2.67, + "learning_rate": 3.3206916500610654e-05, + "loss": 0.6975, + "step": 13940 + }, + { + "epoch": 2.67, + "learning_rate": 3.282123802789741e-05, + "loss": 0.7023, + "step": 13960 + }, + { + "epoch": 2.68, + "learning_rate": 3.243555955518416e-05, + "loss": 0.706, + "step": 13980 + }, + { + "epoch": 2.68, + "learning_rate": 3.204988108247091e-05, + "loss": 0.696, + "step": 14000 + }, + { + "epoch": 2.68, + "eval_loss": 0.7347920536994934, + "eval_runtime": 25.5132, + "eval_samples_per_second": 78.391, + "eval_steps_per_second": 1.254, + "step": 14000 + }, + { + "epoch": 2.69, + "learning_rate": 3.1664202609757666e-05, + "loss": 0.6995, + "step": 14020 + }, + { + "epoch": 2.69, + "learning_rate": 3.1278524137044416e-05, + "loss": 0.7022, + "step": 14040 + }, + { + "epoch": 2.69, + "learning_rate": 3.0892845664331166e-05, + "loss": 0.7086, + "step": 14060 + }, + { + "epoch": 2.7, + "learning_rate": 3.050716719161792e-05, + "loss": 0.7135, + "step": 14080 + }, + { + "epoch": 2.7, + "learning_rate": 3.0121488718904672e-05, + "loss": 0.7036, + "step": 14100 + }, + { + "epoch": 2.71, + "learning_rate": 2.9735810246191422e-05, + "loss": 0.6979, + "step": 14120 + }, + { + "epoch": 2.71, + "learning_rate": 2.9350131773478175e-05, + "loss": 0.7082, + "step": 14140 + }, + { + "epoch": 2.71, + "learning_rate": 2.8964453300764928e-05, + "loss": 0.7008, + "step": 14160 + }, + { + "epoch": 2.72, + "learning_rate": 2.8578774828051678e-05, + "loss": 0.7085, + "step": 14180 + }, + { + "epoch": 2.72, + "learning_rate": 2.819309635533843e-05, + "loss": 0.6983, + "step": 14200 + }, + { + "epoch": 2.72, + "eval_loss": 0.73465496301651, + "eval_runtime": 25.4933, + "eval_samples_per_second": 78.452, + "eval_steps_per_second": 1.255, + "step": 14200 + }, + { + "epoch": 2.72, + "learning_rate": 2.7807417882625184e-05, + "loss": 0.7123, + "step": 14220 + }, + { + "epoch": 2.73, + "learning_rate": 2.7421739409911934e-05, + "loss": 0.7027, + "step": 14240 + }, + { + "epoch": 2.73, + "learning_rate": 2.7036060937198687e-05, + "loss": 0.7124, + "step": 14260 + }, + { + "epoch": 2.74, + "learning_rate": 2.6650382464485437e-05, + "loss": 0.7102, + "step": 14280 + }, + { + "epoch": 2.74, + "learning_rate": 2.626470399177219e-05, + "loss": 0.7062, + "step": 14300 + }, + { + "epoch": 2.74, + "learning_rate": 2.5879025519058943e-05, + "loss": 0.7094, + "step": 14320 + }, + { + "epoch": 2.75, + "learning_rate": 2.5493347046345693e-05, + "loss": 0.7017, + "step": 14340 + }, + { + "epoch": 2.75, + "learning_rate": 2.5107668573632446e-05, + "loss": 0.7033, + "step": 14360 + }, + { + "epoch": 2.76, + "learning_rate": 2.47219901009192e-05, + "loss": 0.7036, + "step": 14380 + }, + { + "epoch": 2.76, + "learning_rate": 2.433631162820595e-05, + "loss": 0.7041, + "step": 14400 + }, + { + "epoch": 2.76, + "eval_loss": 0.7345843315124512, + "eval_runtime": 25.4933, + "eval_samples_per_second": 78.452, + "eval_steps_per_second": 1.255, + "step": 14400 + }, + { + "epoch": 2.76, + "learning_rate": 2.3950633155492702e-05, + "loss": 0.6983, + "step": 14420 + }, + { + "epoch": 2.77, + "learning_rate": 2.3564954682779455e-05, + "loss": 0.7006, + "step": 14440 + }, + { + "epoch": 2.77, + "learning_rate": 2.3179276210066205e-05, + "loss": 0.7047, + "step": 14460 + }, + { + "epoch": 2.77, + "learning_rate": 2.2793597737352958e-05, + "loss": 0.7036, + "step": 14480 + }, + { + "epoch": 2.78, + "learning_rate": 2.2407919264639708e-05, + "loss": 0.7025, + "step": 14500 + }, + { + "epoch": 2.78, + "learning_rate": 2.202224079192646e-05, + "loss": 0.699, + "step": 14520 + }, + { + "epoch": 2.79, + "learning_rate": 2.1636562319213214e-05, + "loss": 0.699, + "step": 14540 + }, + { + "epoch": 2.79, + "learning_rate": 2.1250883846499964e-05, + "loss": 0.6968, + "step": 14560 + }, + { + "epoch": 2.79, + "learning_rate": 2.0865205373786717e-05, + "loss": 0.697, + "step": 14580 + }, + { + "epoch": 2.8, + "learning_rate": 2.047952690107347e-05, + "loss": 0.6981, + "step": 14600 + }, + { + "epoch": 2.8, + "eval_loss": 0.7341080904006958, + "eval_runtime": 25.516, + "eval_samples_per_second": 78.382, + "eval_steps_per_second": 1.254, + "step": 14600 + }, + { + "epoch": 2.8, + "learning_rate": 2.009384842836022e-05, + "loss": 0.706, + "step": 14620 + }, + { + "epoch": 2.81, + "learning_rate": 1.9708169955646973e-05, + "loss": 0.6964, + "step": 14640 + }, + { + "epoch": 2.81, + "learning_rate": 1.9322491482933726e-05, + "loss": 0.7043, + "step": 14660 + }, + { + "epoch": 2.81, + "learning_rate": 1.8936813010220476e-05, + "loss": 0.7044, + "step": 14680 + }, + { + "epoch": 2.82, + "learning_rate": 1.855113453750723e-05, + "loss": 0.7079, + "step": 14700 + }, + { + "epoch": 2.82, + "learning_rate": 1.8165456064793982e-05, + "loss": 0.7096, + "step": 14720 + }, + { + "epoch": 2.82, + "learning_rate": 1.7779777592080735e-05, + "loss": 0.6977, + "step": 14740 + }, + { + "epoch": 2.83, + "learning_rate": 1.7394099119367485e-05, + "loss": 0.6997, + "step": 14760 + }, + { + "epoch": 2.83, + "learning_rate": 1.7008420646654238e-05, + "loss": 0.7033, + "step": 14780 + }, + { + "epoch": 2.84, + "learning_rate": 1.662274217394099e-05, + "loss": 0.7016, + "step": 14800 + }, + { + "epoch": 2.84, + "eval_loss": 0.7337221503257751, + "eval_runtime": 25.5058, + "eval_samples_per_second": 78.413, + "eval_steps_per_second": 1.255, + "step": 14800 + }, + { + "epoch": 2.84, + "learning_rate": 1.623706370122774e-05, + "loss": 0.6907, + "step": 14820 + }, + { + "epoch": 2.84, + "learning_rate": 1.5851385228514494e-05, + "loss": 0.7043, + "step": 14840 + }, + { + "epoch": 2.85, + "learning_rate": 1.5465706755801247e-05, + "loss": 0.7058, + "step": 14860 + }, + { + "epoch": 2.85, + "learning_rate": 1.5080028283087997e-05, + "loss": 0.6956, + "step": 14880 + }, + { + "epoch": 2.85, + "learning_rate": 1.469434981037475e-05, + "loss": 0.7109, + "step": 14900 + }, + { + "epoch": 2.86, + "learning_rate": 1.4308671337661502e-05, + "loss": 0.7055, + "step": 14920 + }, + { + "epoch": 2.86, + "learning_rate": 1.3922992864948253e-05, + "loss": 0.7011, + "step": 14940 + }, + { + "epoch": 2.87, + "learning_rate": 1.3537314392235005e-05, + "loss": 0.7009, + "step": 14960 + }, + { + "epoch": 2.87, + "learning_rate": 1.3151635919521758e-05, + "loss": 0.7069, + "step": 14980 + }, + { + "epoch": 2.87, + "learning_rate": 1.276595744680851e-05, + "loss": 0.7038, + "step": 15000 + }, + { + "epoch": 2.87, + "eval_loss": 0.7338148355484009, + "eval_runtime": 25.4764, + "eval_samples_per_second": 78.504, + "eval_steps_per_second": 1.256, + "step": 15000 + }, + { + "epoch": 2.88, + "learning_rate": 1.238027897409526e-05, + "loss": 0.706, + "step": 15020 + }, + { + "epoch": 2.88, + "learning_rate": 1.1994600501382012e-05, + "loss": 0.6918, + "step": 15040 + }, + { + "epoch": 2.89, + "learning_rate": 1.1608922028668765e-05, + "loss": 0.7045, + "step": 15060 + }, + { + "epoch": 2.89, + "learning_rate": 1.1223243555955517e-05, + "loss": 0.6984, + "step": 15080 + }, + { + "epoch": 2.89, + "learning_rate": 1.0837565083242268e-05, + "loss": 0.7126, + "step": 15100 + }, + { + "epoch": 2.9, + "learning_rate": 1.0451886610529021e-05, + "loss": 0.6974, + "step": 15120 + }, + { + "epoch": 2.9, + "learning_rate": 1.0066208137815773e-05, + "loss": 0.7063, + "step": 15140 + }, + { + "epoch": 2.9, + "learning_rate": 9.680529665102524e-06, + "loss": 0.697, + "step": 15160 + }, + { + "epoch": 2.91, + "learning_rate": 9.294851192389277e-06, + "loss": 0.6965, + "step": 15180 + }, + { + "epoch": 2.91, + "learning_rate": 8.909172719676029e-06, + "loss": 0.7001, + "step": 15200 + }, + { + "epoch": 2.91, + "eval_loss": 0.733613908290863, + "eval_runtime": 25.4875, + "eval_samples_per_second": 78.47, + "eval_steps_per_second": 1.256, + "step": 15200 + }, + { + "epoch": 2.92, + "learning_rate": 8.523494246962782e-06, + "loss": 0.6928, + "step": 15220 + }, + { + "epoch": 2.92, + "learning_rate": 8.137815774249533e-06, + "loss": 0.7047, + "step": 15240 + }, + { + "epoch": 2.92, + "learning_rate": 7.752137301536285e-06, + "loss": 0.6888, + "step": 15260 + }, + { + "epoch": 2.93, + "learning_rate": 7.366458828823037e-06, + "loss": 0.7133, + "step": 15280 + }, + { + "epoch": 2.93, + "learning_rate": 6.980780356109789e-06, + "loss": 0.7019, + "step": 15300 + }, + { + "epoch": 2.94, + "learning_rate": 6.595101883396541e-06, + "loss": 0.7017, + "step": 15320 + }, + { + "epoch": 2.94, + "learning_rate": 6.209423410683292e-06, + "loss": 0.7045, + "step": 15340 + }, + { + "epoch": 2.94, + "learning_rate": 5.823744937970045e-06, + "loss": 0.7017, + "step": 15360 + }, + { + "epoch": 2.95, + "learning_rate": 5.438066465256798e-06, + "loss": 0.699, + "step": 15380 + }, + { + "epoch": 2.95, + "learning_rate": 5.052387992543549e-06, + "loss": 0.691, + "step": 15400 + }, + { + "epoch": 2.95, + "eval_loss": 0.7335031032562256, + "eval_runtime": 25.5263, + "eval_samples_per_second": 78.351, + "eval_steps_per_second": 1.254, + "step": 15400 + } + ], + "max_steps": 15657, + "num_train_epochs": 3, + "total_flos": 7.78654601279701e+19, + "trial_name": null, + "trial_params": null +} diff --git a/adapters/saved-alpaca-belle-cot13b/checkpoint-15400/training_args.bin b/adapters/saved-alpaca-belle-cot13b/checkpoint-15400/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..8cadc78dc16a3098f59f602efe3fce82b270b5ad --- /dev/null +++ b/adapters/saved-alpaca-belle-cot13b/checkpoint-15400/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7cbb446b8dfb89a3bab291d29c74c98c3984471bb063f88c9b78e95c95415320 +size 3643 diff --git a/adapters/saved-alpaca-belle-cot13b/checkpoint-15600/optimizer.pt b/adapters/saved-alpaca-belle-cot13b/checkpoint-15600/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..9473ff787471efa00da1872e57c3650f7c808a04 --- /dev/null +++ b/adapters/saved-alpaca-belle-cot13b/checkpoint-15600/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:79bc871bc05941104730fe62e6eb5ca7d7f393622af5b1e14861a50489bfe98b +size 52523141 diff --git a/adapters/saved-alpaca-belle-cot13b/checkpoint-15600/pytorch_model.bin b/adapters/saved-alpaca-belle-cot13b/checkpoint-15600/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..973828c70acded1c528f8ca1c0d904f3b5053aff --- /dev/null +++ b/adapters/saved-alpaca-belle-cot13b/checkpoint-15600/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0122109efb809fa91e3cd9e50431724c28dce219524993b232a58f79a34d71ec +size 26271757 diff --git a/adapters/saved-alpaca-belle-cot13b/checkpoint-15600/rng_state_0.pth b/adapters/saved-alpaca-belle-cot13b/checkpoint-15600/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..6fb4b229510518abaa0e734a27dbf6df166fe169 --- /dev/null +++ b/adapters/saved-alpaca-belle-cot13b/checkpoint-15600/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:33acc3da1f7c7a71b32c8e585ef757fbec55a3dcd13f70fe071ca8cec81d8576 +size 14583 diff --git a/adapters/saved-alpaca-belle-cot13b/checkpoint-15600/rng_state_1.pth b/adapters/saved-alpaca-belle-cot13b/checkpoint-15600/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..ebb04fe5911206e43ba78281b470f7a4895f5ea3 --- /dev/null +++ b/adapters/saved-alpaca-belle-cot13b/checkpoint-15600/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b0e2da27a3d84c283457975cdd171b028c92fbb4cd99f699e244fcef8332703f +size 14583 diff --git a/adapters/saved-alpaca-belle-cot13b/checkpoint-15600/rng_state_2.pth b/adapters/saved-alpaca-belle-cot13b/checkpoint-15600/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..5cbb8410f92beacd0b6421774e9e2ef42889befe --- /dev/null +++ b/adapters/saved-alpaca-belle-cot13b/checkpoint-15600/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4f9e88065347e9c3d8a43483492ad581090ab26e153ae8b69b9839cbce6aa6d1 +size 14583 diff --git a/adapters/saved-alpaca-belle-cot13b/checkpoint-15600/rng_state_3.pth b/adapters/saved-alpaca-belle-cot13b/checkpoint-15600/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..c467e6903bb26540597c8b2b1fb0cf1bfd6c7556 --- /dev/null +++ b/adapters/saved-alpaca-belle-cot13b/checkpoint-15600/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f05300a4f9eaeb3b73dd291e90e1d1da9aab65470a28698da08135483f1609f0 +size 14583 diff --git a/adapters/saved-alpaca-belle-cot13b/checkpoint-15600/rng_state_4.pth b/adapters/saved-alpaca-belle-cot13b/checkpoint-15600/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..460a7c51c035bd7686a1c9fef37bc4102157de95 --- /dev/null +++ b/adapters/saved-alpaca-belle-cot13b/checkpoint-15600/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e392a932c57b39dd20ffbeb7c2cbaa6a8e19187c9c314e56919266019ec7f3ed +size 14583 diff --git a/adapters/saved-alpaca-belle-cot13b/checkpoint-15600/rng_state_5.pth b/adapters/saved-alpaca-belle-cot13b/checkpoint-15600/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..338555b10ea78c61f4234dfbbc451922dc112acd --- /dev/null +++ b/adapters/saved-alpaca-belle-cot13b/checkpoint-15600/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e0aeeab9ae76b857995ceaac21065479615d19bca73913260a13e19a1b5fe526 +size 14583 diff --git a/adapters/saved-alpaca-belle-cot13b/checkpoint-15600/rng_state_6.pth b/adapters/saved-alpaca-belle-cot13b/checkpoint-15600/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..25188858f559b2f06342564c739694e32e8c81b5 --- /dev/null +++ b/adapters/saved-alpaca-belle-cot13b/checkpoint-15600/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d7ac961b79eaa967660665663f92ef2af2f960d9e64ce8334c9bec18042eccdf +size 14583 diff --git a/adapters/saved-alpaca-belle-cot13b/checkpoint-15600/rng_state_7.pth b/adapters/saved-alpaca-belle-cot13b/checkpoint-15600/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..7360f7d17d22ae03a047c632e33f16c3cb35fc92 --- /dev/null +++ b/adapters/saved-alpaca-belle-cot13b/checkpoint-15600/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e3a7899468ea8638d727076ba3d10ac5b12a9452bc99d2b04c292b0217c4f82e +size 14583 diff --git a/adapters/saved-alpaca-belle-cot13b/checkpoint-15600/scaler.pt b/adapters/saved-alpaca-belle-cot13b/checkpoint-15600/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..686f92f51b6b39b269e7a7d5051037ed177e1810 --- /dev/null +++ b/adapters/saved-alpaca-belle-cot13b/checkpoint-15600/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7ef7119f1c71b1e98e4ccccf8bda9b090d09e68a8a3c6de26f11d3903b69d3a2 +size 557 diff --git a/adapters/saved-alpaca-belle-cot13b/checkpoint-15600/scheduler.pt b/adapters/saved-alpaca-belle-cot13b/checkpoint-15600/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..47160cb81a8bce5e4e8cc7d19954bc3f572a738b --- /dev/null +++ b/adapters/saved-alpaca-belle-cot13b/checkpoint-15600/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4f226e70722083a6f120d03c8de3df770a437c3c8dd19c077e5af09c5008aee3 +size 627 diff --git a/adapters/saved-alpaca-belle-cot13b/checkpoint-15600/trainer_state.json b/adapters/saved-alpaca-belle-cot13b/checkpoint-15600/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..758fc2598ee8967d9a92424b4e6c01344e62540e --- /dev/null +++ b/adapters/saved-alpaca-belle-cot13b/checkpoint-15600/trainer_state.json @@ -0,0 +1,5320 @@ +{ + "best_metric": 0.7334907054901123, + "best_model_checkpoint": "/mnt/bn/qingyi-bn-lq/llama/saved-alpaca-belle-cot13b/checkpoint-15600", + "epoch": 2.989078367503353, + "global_step": 15600, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 5.9999999999999995e-05, + "loss": 1.7259, + "step": 20 + }, + { + "epoch": 0.01, + "learning_rate": 0.00011999999999999999, + "loss": 1.4365, + "step": 40 + }, + { + "epoch": 0.01, + "learning_rate": 0.00017999999999999998, + "loss": 1.0988, + "step": 60 + }, + { + "epoch": 0.02, + "learning_rate": 0.00023999999999999998, + "loss": 1.0373, + "step": 80 + }, + { + "epoch": 0.02, + "learning_rate": 0.0003, + "loss": 0.9935, + "step": 100 + }, + { + "epoch": 0.02, + "learning_rate": 0.00029961432152728675, + "loss": 0.9734, + "step": 120 + }, + { + "epoch": 0.03, + "learning_rate": 0.0002992286430545735, + "loss": 0.9538, + "step": 140 + }, + { + "epoch": 0.03, + "learning_rate": 0.00029884296458186025, + "loss": 0.9304, + "step": 160 + }, + { + "epoch": 0.03, + "learning_rate": 0.00029845728610914697, + "loss": 0.9159, + "step": 180 + }, + { + "epoch": 0.04, + "learning_rate": 0.00029807160763643375, + "loss": 0.9056, + "step": 200 + }, + { + "epoch": 0.04, + "eval_loss": 0.9217711091041565, + "eval_runtime": 25.3617, + "eval_samples_per_second": 78.859, + "eval_steps_per_second": 1.262, + "step": 200 + }, + { + "epoch": 0.04, + "learning_rate": 0.00029768592916372047, + "loss": 0.9028, + "step": 220 + }, + { + "epoch": 0.05, + "learning_rate": 0.00029730025069100725, + "loss": 0.8939, + "step": 240 + }, + { + "epoch": 0.05, + "learning_rate": 0.00029691457221829397, + "loss": 0.8839, + "step": 260 + }, + { + "epoch": 0.05, + "learning_rate": 0.00029652889374558075, + "loss": 0.8929, + "step": 280 + }, + { + "epoch": 0.06, + "learning_rate": 0.00029614321527286747, + "loss": 0.8708, + "step": 300 + }, + { + "epoch": 0.06, + "learning_rate": 0.00029575753680015425, + "loss": 0.8824, + "step": 320 + }, + { + "epoch": 0.07, + "learning_rate": 0.000295371858327441, + "loss": 0.8705, + "step": 340 + }, + { + "epoch": 0.07, + "learning_rate": 0.00029498617985472775, + "loss": 0.8678, + "step": 360 + }, + { + "epoch": 0.07, + "learning_rate": 0.0002946005013820145, + "loss": 0.8687, + "step": 380 + }, + { + "epoch": 0.08, + "learning_rate": 0.00029421482290930125, + "loss": 0.8609, + "step": 400 + }, + { + "epoch": 0.08, + "eval_loss": 0.8822715878486633, + "eval_runtime": 25.3188, + "eval_samples_per_second": 78.993, + "eval_steps_per_second": 1.264, + "step": 400 + }, + { + "epoch": 0.08, + "learning_rate": 0.000293829144436588, + "loss": 0.8603, + "step": 420 + }, + { + "epoch": 0.08, + "learning_rate": 0.00029344346596387475, + "loss": 0.8662, + "step": 440 + }, + { + "epoch": 0.09, + "learning_rate": 0.0002930577874911615, + "loss": 0.8591, + "step": 460 + }, + { + "epoch": 0.09, + "learning_rate": 0.00029267210901844825, + "loss": 0.8442, + "step": 480 + }, + { + "epoch": 0.1, + "learning_rate": 0.000292286430545735, + "loss": 0.8482, + "step": 500 + }, + { + "epoch": 0.1, + "learning_rate": 0.00029190075207302175, + "loss": 0.8458, + "step": 520 + }, + { + "epoch": 0.1, + "learning_rate": 0.0002915150736003085, + "loss": 0.8377, + "step": 540 + }, + { + "epoch": 0.11, + "learning_rate": 0.00029112939512759525, + "loss": 0.8372, + "step": 560 + }, + { + "epoch": 0.11, + "learning_rate": 0.000290743716654882, + "loss": 0.8444, + "step": 580 + }, + { + "epoch": 0.11, + "learning_rate": 0.0002903580381821688, + "loss": 0.84, + "step": 600 + }, + { + "epoch": 0.11, + "eval_loss": 0.8602503538131714, + "eval_runtime": 25.3547, + "eval_samples_per_second": 78.881, + "eval_steps_per_second": 1.262, + "step": 600 + }, + { + "epoch": 0.12, + "learning_rate": 0.0002899723597094555, + "loss": 0.8428, + "step": 620 + }, + { + "epoch": 0.12, + "learning_rate": 0.0002895866812367423, + "loss": 0.8366, + "step": 640 + }, + { + "epoch": 0.13, + "learning_rate": 0.000289201002764029, + "loss": 0.8408, + "step": 660 + }, + { + "epoch": 0.13, + "learning_rate": 0.0002888153242913158, + "loss": 0.8445, + "step": 680 + }, + { + "epoch": 0.13, + "learning_rate": 0.0002884296458186025, + "loss": 0.8335, + "step": 700 + }, + { + "epoch": 0.14, + "learning_rate": 0.0002880439673458893, + "loss": 0.8316, + "step": 720 + }, + { + "epoch": 0.14, + "learning_rate": 0.000287658288873176, + "loss": 0.8449, + "step": 740 + }, + { + "epoch": 0.15, + "learning_rate": 0.0002872726104004628, + "loss": 0.836, + "step": 760 + }, + { + "epoch": 0.15, + "learning_rate": 0.0002868869319277495, + "loss": 0.8257, + "step": 780 + }, + { + "epoch": 0.15, + "learning_rate": 0.0002865012534550363, + "loss": 0.8252, + "step": 800 + }, + { + "epoch": 0.15, + "eval_loss": 0.8450831174850464, + "eval_runtime": 25.4039, + "eval_samples_per_second": 78.728, + "eval_steps_per_second": 1.26, + "step": 800 + }, + { + "epoch": 0.16, + "learning_rate": 0.0002861155749823231, + "loss": 0.8227, + "step": 820 + }, + { + "epoch": 0.16, + "learning_rate": 0.0002857298965096098, + "loss": 0.8274, + "step": 840 + }, + { + "epoch": 0.16, + "learning_rate": 0.00028534421803689657, + "loss": 0.8197, + "step": 860 + }, + { + "epoch": 0.17, + "learning_rate": 0.0002849585395641833, + "loss": 0.823, + "step": 880 + }, + { + "epoch": 0.17, + "learning_rate": 0.00028457286109147007, + "loss": 0.8176, + "step": 900 + }, + { + "epoch": 0.18, + "learning_rate": 0.0002841871826187568, + "loss": 0.8092, + "step": 920 + }, + { + "epoch": 0.18, + "learning_rate": 0.00028380150414604357, + "loss": 0.8171, + "step": 940 + }, + { + "epoch": 0.18, + "learning_rate": 0.0002834158256733303, + "loss": 0.816, + "step": 960 + }, + { + "epoch": 0.19, + "learning_rate": 0.00028303014720061707, + "loss": 0.816, + "step": 980 + }, + { + "epoch": 0.19, + "learning_rate": 0.0002826444687279038, + "loss": 0.8066, + "step": 1000 + }, + { + "epoch": 0.19, + "eval_loss": 0.8338332176208496, + "eval_runtime": 25.3851, + "eval_samples_per_second": 78.786, + "eval_steps_per_second": 1.261, + "step": 1000 + }, + { + "epoch": 0.2, + "learning_rate": 0.00028225879025519057, + "loss": 0.82, + "step": 1020 + }, + { + "epoch": 0.2, + "learning_rate": 0.0002818731117824773, + "loss": 0.8116, + "step": 1040 + }, + { + "epoch": 0.2, + "learning_rate": 0.00028148743330976407, + "loss": 0.8156, + "step": 1060 + }, + { + "epoch": 0.21, + "learning_rate": 0.00028110175483705085, + "loss": 0.8135, + "step": 1080 + }, + { + "epoch": 0.21, + "learning_rate": 0.00028071607636433757, + "loss": 0.8055, + "step": 1100 + }, + { + "epoch": 0.21, + "learning_rate": 0.00028033039789162435, + "loss": 0.8062, + "step": 1120 + }, + { + "epoch": 0.22, + "learning_rate": 0.00027994471941891107, + "loss": 0.8082, + "step": 1140 + }, + { + "epoch": 0.22, + "learning_rate": 0.00027955904094619785, + "loss": 0.8144, + "step": 1160 + }, + { + "epoch": 0.23, + "learning_rate": 0.00027917336247348457, + "loss": 0.8067, + "step": 1180 + }, + { + "epoch": 0.23, + "learning_rate": 0.0002787876840007713, + "loss": 0.8042, + "step": 1200 + }, + { + "epoch": 0.23, + "eval_loss": 0.8253737688064575, + "eval_runtime": 25.4089, + "eval_samples_per_second": 78.713, + "eval_steps_per_second": 1.259, + "step": 1200 + }, + { + "epoch": 0.23, + "learning_rate": 0.00027840200552805807, + "loss": 0.8093, + "step": 1220 + }, + { + "epoch": 0.24, + "learning_rate": 0.00027801632705534485, + "loss": 0.801, + "step": 1240 + }, + { + "epoch": 0.24, + "learning_rate": 0.00027763064858263157, + "loss": 0.8043, + "step": 1260 + }, + { + "epoch": 0.25, + "learning_rate": 0.00027724497010991834, + "loss": 0.8027, + "step": 1280 + }, + { + "epoch": 0.25, + "learning_rate": 0.0002768592916372051, + "loss": 0.7979, + "step": 1300 + }, + { + "epoch": 0.25, + "learning_rate": 0.00027647361316449184, + "loss": 0.7988, + "step": 1320 + }, + { + "epoch": 0.26, + "learning_rate": 0.0002760879346917786, + "loss": 0.8051, + "step": 1340 + }, + { + "epoch": 0.26, + "learning_rate": 0.00027570225621906534, + "loss": 0.7962, + "step": 1360 + }, + { + "epoch": 0.26, + "learning_rate": 0.0002753165777463521, + "loss": 0.8034, + "step": 1380 + }, + { + "epoch": 0.27, + "learning_rate": 0.00027493089927363884, + "loss": 0.7994, + "step": 1400 + }, + { + "epoch": 0.27, + "eval_loss": 0.8166970014572144, + "eval_runtime": 25.3787, + "eval_samples_per_second": 78.806, + "eval_steps_per_second": 1.261, + "step": 1400 + }, + { + "epoch": 0.27, + "learning_rate": 0.00027454522080092557, + "loss": 0.7949, + "step": 1420 + }, + { + "epoch": 0.28, + "learning_rate": 0.00027415954232821234, + "loss": 0.7919, + "step": 1440 + }, + { + "epoch": 0.28, + "learning_rate": 0.0002737738638554991, + "loss": 0.7983, + "step": 1460 + }, + { + "epoch": 0.28, + "learning_rate": 0.00027338818538278584, + "loss": 0.7828, + "step": 1480 + }, + { + "epoch": 0.29, + "learning_rate": 0.0002730025069100726, + "loss": 0.7926, + "step": 1500 + }, + { + "epoch": 0.29, + "learning_rate": 0.0002726168284373594, + "loss": 0.7837, + "step": 1520 + }, + { + "epoch": 0.3, + "learning_rate": 0.0002722311499646461, + "loss": 0.7922, + "step": 1540 + }, + { + "epoch": 0.3, + "learning_rate": 0.0002718454714919329, + "loss": 0.7852, + "step": 1560 + }, + { + "epoch": 0.3, + "learning_rate": 0.0002714597930192196, + "loss": 0.7846, + "step": 1580 + }, + { + "epoch": 0.31, + "learning_rate": 0.0002710741145465064, + "loss": 0.782, + "step": 1600 + }, + { + "epoch": 0.31, + "eval_loss": 0.8094187378883362, + "eval_runtime": 25.4544, + "eval_samples_per_second": 78.572, + "eval_steps_per_second": 1.257, + "step": 1600 + }, + { + "epoch": 0.31, + "learning_rate": 0.0002706884360737931, + "loss": 0.7822, + "step": 1620 + }, + { + "epoch": 0.31, + "learning_rate": 0.00027030275760107984, + "loss": 0.7787, + "step": 1640 + }, + { + "epoch": 0.32, + "learning_rate": 0.0002699170791283666, + "loss": 0.7913, + "step": 1660 + }, + { + "epoch": 0.32, + "learning_rate": 0.0002695314006556534, + "loss": 0.79, + "step": 1680 + }, + { + "epoch": 0.33, + "learning_rate": 0.0002691457221829401, + "loss": 0.7934, + "step": 1700 + }, + { + "epoch": 0.33, + "learning_rate": 0.0002687600437102269, + "loss": 0.7816, + "step": 1720 + }, + { + "epoch": 0.33, + "learning_rate": 0.0002683743652375136, + "loss": 0.7825, + "step": 1740 + }, + { + "epoch": 0.34, + "learning_rate": 0.0002679886867648004, + "loss": 0.7903, + "step": 1760 + }, + { + "epoch": 0.34, + "learning_rate": 0.00026760300829208717, + "loss": 0.7906, + "step": 1780 + }, + { + "epoch": 0.34, + "learning_rate": 0.0002672173298193739, + "loss": 0.7778, + "step": 1800 + }, + { + "epoch": 0.34, + "eval_loss": 0.8045867681503296, + "eval_runtime": 25.4351, + "eval_samples_per_second": 78.632, + "eval_steps_per_second": 1.258, + "step": 1800 + }, + { + "epoch": 0.35, + "learning_rate": 0.00026683165134666067, + "loss": 0.7815, + "step": 1820 + }, + { + "epoch": 0.35, + "learning_rate": 0.0002664459728739474, + "loss": 0.7851, + "step": 1840 + }, + { + "epoch": 0.36, + "learning_rate": 0.00026606029440123417, + "loss": 0.7807, + "step": 1860 + }, + { + "epoch": 0.36, + "learning_rate": 0.0002656746159285209, + "loss": 0.7856, + "step": 1880 + }, + { + "epoch": 0.36, + "learning_rate": 0.0002652889374558076, + "loss": 0.7798, + "step": 1900 + }, + { + "epoch": 0.37, + "learning_rate": 0.0002649032589830944, + "loss": 0.7777, + "step": 1920 + }, + { + "epoch": 0.37, + "learning_rate": 0.00026451758051038117, + "loss": 0.7798, + "step": 1940 + }, + { + "epoch": 0.38, + "learning_rate": 0.0002641319020376679, + "loss": 0.7783, + "step": 1960 + }, + { + "epoch": 0.38, + "learning_rate": 0.00026374622356495467, + "loss": 0.7739, + "step": 1980 + }, + { + "epoch": 0.38, + "learning_rate": 0.00026336054509224144, + "loss": 0.7823, + "step": 2000 + }, + { + "epoch": 0.38, + "eval_loss": 0.7984708547592163, + "eval_runtime": 25.4598, + "eval_samples_per_second": 78.555, + "eval_steps_per_second": 1.257, + "step": 2000 + }, + { + "epoch": 0.39, + "learning_rate": 0.00026297486661952817, + "loss": 0.7774, + "step": 2020 + }, + { + "epoch": 0.39, + "learning_rate": 0.00026258918814681494, + "loss": 0.7701, + "step": 2040 + }, + { + "epoch": 0.39, + "learning_rate": 0.00026220350967410167, + "loss": 0.7777, + "step": 2060 + }, + { + "epoch": 0.4, + "learning_rate": 0.00026181783120138844, + "loss": 0.781, + "step": 2080 + }, + { + "epoch": 0.4, + "learning_rate": 0.00026143215272867517, + "loss": 0.779, + "step": 2100 + }, + { + "epoch": 0.41, + "learning_rate": 0.0002610464742559619, + "loss": 0.7703, + "step": 2120 + }, + { + "epoch": 0.41, + "learning_rate": 0.00026066079578324867, + "loss": 0.7749, + "step": 2140 + }, + { + "epoch": 0.41, + "learning_rate": 0.00026027511731053544, + "loss": 0.772, + "step": 2160 + }, + { + "epoch": 0.42, + "learning_rate": 0.00025988943883782216, + "loss": 0.771, + "step": 2180 + }, + { + "epoch": 0.42, + "learning_rate": 0.00025950376036510894, + "loss": 0.7757, + "step": 2200 + }, + { + "epoch": 0.42, + "eval_loss": 0.7949528694152832, + "eval_runtime": 25.4504, + "eval_samples_per_second": 78.584, + "eval_steps_per_second": 1.257, + "step": 2200 + }, + { + "epoch": 0.43, + "learning_rate": 0.00025911808189239566, + "loss": 0.7776, + "step": 2220 + }, + { + "epoch": 0.43, + "learning_rate": 0.00025873240341968244, + "loss": 0.7689, + "step": 2240 + }, + { + "epoch": 0.43, + "learning_rate": 0.0002583467249469692, + "loss": 0.7646, + "step": 2260 + }, + { + "epoch": 0.44, + "learning_rate": 0.00025796104647425594, + "loss": 0.7805, + "step": 2280 + }, + { + "epoch": 0.44, + "learning_rate": 0.0002575753680015427, + "loss": 0.7717, + "step": 2300 + }, + { + "epoch": 0.44, + "learning_rate": 0.00025718968952882944, + "loss": 0.7672, + "step": 2320 + }, + { + "epoch": 0.45, + "learning_rate": 0.00025680401105611616, + "loss": 0.7716, + "step": 2340 + }, + { + "epoch": 0.45, + "learning_rate": 0.00025641833258340294, + "loss": 0.7661, + "step": 2360 + }, + { + "epoch": 0.46, + "learning_rate": 0.00025603265411068966, + "loss": 0.7659, + "step": 2380 + }, + { + "epoch": 0.46, + "learning_rate": 0.00025564697563797644, + "loss": 0.7697, + "step": 2400 + }, + { + "epoch": 0.46, + "eval_loss": 0.7915205359458923, + "eval_runtime": 25.4326, + "eval_samples_per_second": 78.639, + "eval_steps_per_second": 1.258, + "step": 2400 + }, + { + "epoch": 0.46, + "learning_rate": 0.0002552612971652632, + "loss": 0.7686, + "step": 2420 + }, + { + "epoch": 0.47, + "learning_rate": 0.00025487561869254994, + "loss": 0.7691, + "step": 2440 + }, + { + "epoch": 0.47, + "learning_rate": 0.0002544899402198367, + "loss": 0.768, + "step": 2460 + }, + { + "epoch": 0.48, + "learning_rate": 0.0002541042617471235, + "loss": 0.7663, + "step": 2480 + }, + { + "epoch": 0.48, + "learning_rate": 0.0002537185832744102, + "loss": 0.767, + "step": 2500 + }, + { + "epoch": 0.48, + "learning_rate": 0.000253332904801697, + "loss": 0.769, + "step": 2520 + }, + { + "epoch": 0.49, + "learning_rate": 0.0002529472263289837, + "loss": 0.7686, + "step": 2540 + }, + { + "epoch": 0.49, + "learning_rate": 0.00025256154785627044, + "loss": 0.7722, + "step": 2560 + }, + { + "epoch": 0.49, + "learning_rate": 0.0002521758693835572, + "loss": 0.7691, + "step": 2580 + }, + { + "epoch": 0.5, + "learning_rate": 0.00025179019091084394, + "loss": 0.7742, + "step": 2600 + }, + { + "epoch": 0.5, + "eval_loss": 0.7875179648399353, + "eval_runtime": 25.4595, + "eval_samples_per_second": 78.556, + "eval_steps_per_second": 1.257, + "step": 2600 + }, + { + "epoch": 0.5, + "learning_rate": 0.0002514045124381307, + "loss": 0.7682, + "step": 2620 + }, + { + "epoch": 0.51, + "learning_rate": 0.0002510188339654175, + "loss": 0.7574, + "step": 2640 + }, + { + "epoch": 0.51, + "learning_rate": 0.0002506331554927042, + "loss": 0.77, + "step": 2660 + }, + { + "epoch": 0.51, + "learning_rate": 0.000250247477019991, + "loss": 0.7638, + "step": 2680 + }, + { + "epoch": 0.52, + "learning_rate": 0.00024986179854727777, + "loss": 0.7517, + "step": 2700 + }, + { + "epoch": 0.52, + "learning_rate": 0.0002494761200745645, + "loss": 0.7596, + "step": 2720 + }, + { + "epoch": 0.53, + "learning_rate": 0.00024909044160185127, + "loss": 0.7608, + "step": 2740 + }, + { + "epoch": 0.53, + "learning_rate": 0.000248704763129138, + "loss": 0.7571, + "step": 2760 + }, + { + "epoch": 0.53, + "learning_rate": 0.0002483190846564247, + "loss": 0.7597, + "step": 2780 + }, + { + "epoch": 0.54, + "learning_rate": 0.0002479334061837115, + "loss": 0.7659, + "step": 2800 + }, + { + "epoch": 0.54, + "eval_loss": 0.7841727137565613, + "eval_runtime": 25.4853, + "eval_samples_per_second": 78.477, + "eval_steps_per_second": 1.256, + "step": 2800 + }, + { + "epoch": 0.54, + "learning_rate": 0.0002475477277109982, + "loss": 0.7694, + "step": 2820 + }, + { + "epoch": 0.54, + "learning_rate": 0.000247162049238285, + "loss": 0.7722, + "step": 2840 + }, + { + "epoch": 0.55, + "learning_rate": 0.00024677637076557176, + "loss": 0.7513, + "step": 2860 + }, + { + "epoch": 0.55, + "learning_rate": 0.0002463906922928585, + "loss": 0.7553, + "step": 2880 + }, + { + "epoch": 0.56, + "learning_rate": 0.00024600501382014526, + "loss": 0.7611, + "step": 2900 + }, + { + "epoch": 0.56, + "learning_rate": 0.000245619335347432, + "loss": 0.7614, + "step": 2920 + }, + { + "epoch": 0.56, + "learning_rate": 0.00024523365687471876, + "loss": 0.761, + "step": 2940 + }, + { + "epoch": 0.57, + "learning_rate": 0.00024484797840200554, + "loss": 0.7568, + "step": 2960 + }, + { + "epoch": 0.57, + "learning_rate": 0.00024446229992929226, + "loss": 0.7571, + "step": 2980 + }, + { + "epoch": 0.57, + "learning_rate": 0.000244076621456579, + "loss": 0.7514, + "step": 3000 + }, + { + "epoch": 0.57, + "eval_loss": 0.7828710675239563, + "eval_runtime": 25.4475, + "eval_samples_per_second": 78.593, + "eval_steps_per_second": 1.257, + "step": 3000 + }, + { + "epoch": 0.58, + "learning_rate": 0.0002436909429838658, + "loss": 0.7564, + "step": 3020 + }, + { + "epoch": 0.58, + "learning_rate": 0.0002433052645111525, + "loss": 0.7593, + "step": 3040 + }, + { + "epoch": 0.59, + "learning_rate": 0.00024291958603843926, + "loss": 0.7533, + "step": 3060 + }, + { + "epoch": 0.59, + "learning_rate": 0.000242533907565726, + "loss": 0.7566, + "step": 3080 + }, + { + "epoch": 0.59, + "learning_rate": 0.00024214822909301276, + "loss": 0.7667, + "step": 3100 + }, + { + "epoch": 0.6, + "learning_rate": 0.00024176255062029954, + "loss": 0.7638, + "step": 3120 + }, + { + "epoch": 0.6, + "learning_rate": 0.00024137687214758626, + "loss": 0.7613, + "step": 3140 + }, + { + "epoch": 0.61, + "learning_rate": 0.00024099119367487304, + "loss": 0.755, + "step": 3160 + }, + { + "epoch": 0.61, + "learning_rate": 0.0002406055152021598, + "loss": 0.7547, + "step": 3180 + }, + { + "epoch": 0.61, + "learning_rate": 0.0002402198367294465, + "loss": 0.7611, + "step": 3200 + }, + { + "epoch": 0.61, + "eval_loss": 0.7789185643196106, + "eval_runtime": 25.4744, + "eval_samples_per_second": 78.51, + "eval_steps_per_second": 1.256, + "step": 3200 + }, + { + "epoch": 0.62, + "learning_rate": 0.0002398341582567333, + "loss": 0.7498, + "step": 3220 + }, + { + "epoch": 0.62, + "learning_rate": 0.00023944847978402, + "loss": 0.757, + "step": 3240 + }, + { + "epoch": 0.62, + "learning_rate": 0.0002390628013113068, + "loss": 0.7472, + "step": 3260 + }, + { + "epoch": 0.63, + "learning_rate": 0.00023867712283859354, + "loss": 0.7557, + "step": 3280 + }, + { + "epoch": 0.63, + "learning_rate": 0.0002382914443658803, + "loss": 0.7602, + "step": 3300 + }, + { + "epoch": 0.64, + "learning_rate": 0.00023790576589316704, + "loss": 0.7573, + "step": 3320 + }, + { + "epoch": 0.64, + "learning_rate": 0.0002375200874204538, + "loss": 0.7565, + "step": 3340 + }, + { + "epoch": 0.64, + "learning_rate": 0.00023713440894774054, + "loss": 0.7517, + "step": 3360 + }, + { + "epoch": 0.65, + "learning_rate": 0.0002367487304750273, + "loss": 0.7521, + "step": 3380 + }, + { + "epoch": 0.65, + "learning_rate": 0.00023636305200231404, + "loss": 0.7575, + "step": 3400 + }, + { + "epoch": 0.65, + "eval_loss": 0.7771645784378052, + "eval_runtime": 25.4832, + "eval_samples_per_second": 78.483, + "eval_steps_per_second": 1.256, + "step": 3400 + }, + { + "epoch": 0.66, + "learning_rate": 0.0002359773735296008, + "loss": 0.7605, + "step": 3420 + }, + { + "epoch": 0.66, + "learning_rate": 0.00023559169505688756, + "loss": 0.7547, + "step": 3440 + }, + { + "epoch": 0.66, + "learning_rate": 0.00023520601658417428, + "loss": 0.7522, + "step": 3460 + }, + { + "epoch": 0.67, + "learning_rate": 0.00023482033811146106, + "loss": 0.757, + "step": 3480 + }, + { + "epoch": 0.67, + "learning_rate": 0.0002344346596387478, + "loss": 0.7561, + "step": 3500 + }, + { + "epoch": 0.67, + "learning_rate": 0.00023404898116603456, + "loss": 0.7486, + "step": 3520 + }, + { + "epoch": 0.68, + "learning_rate": 0.0002336633026933213, + "loss": 0.7519, + "step": 3540 + }, + { + "epoch": 0.68, + "learning_rate": 0.00023327762422060806, + "loss": 0.7487, + "step": 3560 + }, + { + "epoch": 0.69, + "learning_rate": 0.0002328919457478948, + "loss": 0.747, + "step": 3580 + }, + { + "epoch": 0.69, + "learning_rate": 0.0002325062672751816, + "loss": 0.7523, + "step": 3600 + }, + { + "epoch": 0.69, + "eval_loss": 0.7746226787567139, + "eval_runtime": 25.4795, + "eval_samples_per_second": 78.494, + "eval_steps_per_second": 1.256, + "step": 3600 + }, + { + "epoch": 0.69, + "learning_rate": 0.0002321205888024683, + "loss": 0.7427, + "step": 3620 + }, + { + "epoch": 0.7, + "learning_rate": 0.0002317349103297551, + "loss": 0.7442, + "step": 3640 + }, + { + "epoch": 0.7, + "learning_rate": 0.00023134923185704184, + "loss": 0.7587, + "step": 3660 + }, + { + "epoch": 0.71, + "learning_rate": 0.00023096355338432856, + "loss": 0.7506, + "step": 3680 + }, + { + "epoch": 0.71, + "learning_rate": 0.00023057787491161534, + "loss": 0.7514, + "step": 3700 + }, + { + "epoch": 0.71, + "learning_rate": 0.00023019219643890206, + "loss": 0.7475, + "step": 3720 + }, + { + "epoch": 0.72, + "learning_rate": 0.00022980651796618884, + "loss": 0.7601, + "step": 3740 + }, + { + "epoch": 0.72, + "learning_rate": 0.00022942083949347559, + "loss": 0.7474, + "step": 3760 + }, + { + "epoch": 0.72, + "learning_rate": 0.00022903516102076233, + "loss": 0.7529, + "step": 3780 + }, + { + "epoch": 0.73, + "learning_rate": 0.00022864948254804908, + "loss": 0.7458, + "step": 3800 + }, + { + "epoch": 0.73, + "eval_loss": 0.7719505429267883, + "eval_runtime": 25.4724, + "eval_samples_per_second": 78.516, + "eval_steps_per_second": 1.256, + "step": 3800 + }, + { + "epoch": 0.73, + "learning_rate": 0.00022826380407533586, + "loss": 0.7584, + "step": 3820 + }, + { + "epoch": 0.74, + "learning_rate": 0.00022787812560262258, + "loss": 0.7416, + "step": 3840 + }, + { + "epoch": 0.74, + "learning_rate": 0.00022749244712990936, + "loss": 0.7444, + "step": 3860 + }, + { + "epoch": 0.74, + "learning_rate": 0.0002271067686571961, + "loss": 0.7459, + "step": 3880 + }, + { + "epoch": 0.75, + "learning_rate": 0.00022672109018448283, + "loss": 0.7476, + "step": 3900 + }, + { + "epoch": 0.75, + "learning_rate": 0.0002263354117117696, + "loss": 0.7473, + "step": 3920 + }, + { + "epoch": 0.75, + "learning_rate": 0.00022594973323905633, + "loss": 0.7434, + "step": 3940 + }, + { + "epoch": 0.76, + "learning_rate": 0.0002255640547663431, + "loss": 0.7463, + "step": 3960 + }, + { + "epoch": 0.76, + "learning_rate": 0.00022517837629362986, + "loss": 0.7435, + "step": 3980 + }, + { + "epoch": 0.77, + "learning_rate": 0.0002247926978209166, + "loss": 0.7445, + "step": 4000 + }, + { + "epoch": 0.77, + "eval_loss": 0.7707083821296692, + "eval_runtime": 25.4747, + "eval_samples_per_second": 78.509, + "eval_steps_per_second": 1.256, + "step": 4000 + }, + { + "epoch": 0.77, + "learning_rate": 0.00022440701934820336, + "loss": 0.7321, + "step": 4020 + }, + { + "epoch": 0.77, + "learning_rate": 0.00022402134087549014, + "loss": 0.7525, + "step": 4040 + }, + { + "epoch": 0.78, + "learning_rate": 0.00022363566240277686, + "loss": 0.7494, + "step": 4060 + }, + { + "epoch": 0.78, + "learning_rate": 0.00022324998393006364, + "loss": 0.7533, + "step": 4080 + }, + { + "epoch": 0.79, + "learning_rate": 0.00022286430545735036, + "loss": 0.7442, + "step": 4100 + }, + { + "epoch": 0.79, + "learning_rate": 0.0002224786269846371, + "loss": 0.7423, + "step": 4120 + }, + { + "epoch": 0.79, + "learning_rate": 0.00022209294851192388, + "loss": 0.7443, + "step": 4140 + }, + { + "epoch": 0.8, + "learning_rate": 0.0002217072700392106, + "loss": 0.7388, + "step": 4160 + }, + { + "epoch": 0.8, + "learning_rate": 0.00022132159156649738, + "loss": 0.7425, + "step": 4180 + }, + { + "epoch": 0.8, + "learning_rate": 0.00022093591309378413, + "loss": 0.7507, + "step": 4200 + }, + { + "epoch": 0.8, + "eval_loss": 0.7691813111305237, + "eval_runtime": 25.5011, + "eval_samples_per_second": 78.428, + "eval_steps_per_second": 1.255, + "step": 4200 + }, + { + "epoch": 0.81, + "learning_rate": 0.00022055023462107088, + "loss": 0.7276, + "step": 4220 + }, + { + "epoch": 0.81, + "learning_rate": 0.00022016455614835763, + "loss": 0.7399, + "step": 4240 + }, + { + "epoch": 0.82, + "learning_rate": 0.00021977887767564438, + "loss": 0.7409, + "step": 4260 + }, + { + "epoch": 0.82, + "learning_rate": 0.00021939319920293113, + "loss": 0.7391, + "step": 4280 + }, + { + "epoch": 0.82, + "learning_rate": 0.0002190075207302179, + "loss": 0.741, + "step": 4300 + }, + { + "epoch": 0.83, + "learning_rate": 0.00021862184225750463, + "loss": 0.7404, + "step": 4320 + }, + { + "epoch": 0.83, + "learning_rate": 0.00021823616378479138, + "loss": 0.7356, + "step": 4340 + }, + { + "epoch": 0.84, + "learning_rate": 0.00021785048531207816, + "loss": 0.7458, + "step": 4360 + }, + { + "epoch": 0.84, + "learning_rate": 0.00021746480683936488, + "loss": 0.7373, + "step": 4380 + }, + { + "epoch": 0.84, + "learning_rate": 0.00021707912836665166, + "loss": 0.7455, + "step": 4400 + }, + { + "epoch": 0.84, + "eval_loss": 0.7680566310882568, + "eval_runtime": 25.4479, + "eval_samples_per_second": 78.592, + "eval_steps_per_second": 1.257, + "step": 4400 + }, + { + "epoch": 0.85, + "learning_rate": 0.00021669344989393838, + "loss": 0.7376, + "step": 4420 + }, + { + "epoch": 0.85, + "learning_rate": 0.00021630777142122516, + "loss": 0.7396, + "step": 4440 + }, + { + "epoch": 0.85, + "learning_rate": 0.0002159220929485119, + "loss": 0.7367, + "step": 4460 + }, + { + "epoch": 0.86, + "learning_rate": 0.00021553641447579866, + "loss": 0.7354, + "step": 4480 + }, + { + "epoch": 0.86, + "learning_rate": 0.0002151507360030854, + "loss": 0.7337, + "step": 4500 + }, + { + "epoch": 0.87, + "learning_rate": 0.00021476505753037218, + "loss": 0.7384, + "step": 4520 + }, + { + "epoch": 0.87, + "learning_rate": 0.0002143793790576589, + "loss": 0.7334, + "step": 4540 + }, + { + "epoch": 0.87, + "learning_rate": 0.00021399370058494568, + "loss": 0.742, + "step": 4560 + }, + { + "epoch": 0.88, + "learning_rate": 0.0002136080221122324, + "loss": 0.7408, + "step": 4580 + }, + { + "epoch": 0.88, + "learning_rate": 0.00021322234363951916, + "loss": 0.7466, + "step": 4600 + }, + { + "epoch": 0.88, + "eval_loss": 0.7663780450820923, + "eval_runtime": 25.53, + "eval_samples_per_second": 78.339, + "eval_steps_per_second": 1.253, + "step": 4600 + }, + { + "epoch": 0.89, + "learning_rate": 0.00021283666516680593, + "loss": 0.7399, + "step": 4620 + }, + { + "epoch": 0.89, + "learning_rate": 0.00021245098669409266, + "loss": 0.746, + "step": 4640 + }, + { + "epoch": 0.89, + "learning_rate": 0.00021206530822137943, + "loss": 0.7397, + "step": 4660 + }, + { + "epoch": 0.9, + "learning_rate": 0.00021167962974866618, + "loss": 0.7349, + "step": 4680 + }, + { + "epoch": 0.9, + "learning_rate": 0.00021129395127595293, + "loss": 0.7334, + "step": 4700 + }, + { + "epoch": 0.9, + "learning_rate": 0.00021090827280323968, + "loss": 0.738, + "step": 4720 + }, + { + "epoch": 0.91, + "learning_rate": 0.0002105225943305264, + "loss": 0.7398, + "step": 4740 + }, + { + "epoch": 0.91, + "learning_rate": 0.00021013691585781318, + "loss": 0.7465, + "step": 4760 + }, + { + "epoch": 0.92, + "learning_rate": 0.00020975123738509996, + "loss": 0.7388, + "step": 4780 + }, + { + "epoch": 0.92, + "learning_rate": 0.00020936555891238668, + "loss": 0.7462, + "step": 4800 + }, + { + "epoch": 0.92, + "eval_loss": 0.7650267481803894, + "eval_runtime": 25.5463, + "eval_samples_per_second": 78.289, + "eval_steps_per_second": 1.253, + "step": 4800 + }, + { + "epoch": 0.92, + "learning_rate": 0.00020897988043967343, + "loss": 0.7463, + "step": 4820 + }, + { + "epoch": 0.93, + "learning_rate": 0.0002085942019669602, + "loss": 0.7389, + "step": 4840 + }, + { + "epoch": 0.93, + "learning_rate": 0.00020820852349424693, + "loss": 0.7316, + "step": 4860 + }, + { + "epoch": 0.94, + "learning_rate": 0.0002078228450215337, + "loss": 0.73, + "step": 4880 + }, + { + "epoch": 0.94, + "learning_rate": 0.00020743716654882043, + "loss": 0.7472, + "step": 4900 + }, + { + "epoch": 0.94, + "learning_rate": 0.0002070514880761072, + "loss": 0.7494, + "step": 4920 + }, + { + "epoch": 0.95, + "learning_rate": 0.00020666580960339396, + "loss": 0.7424, + "step": 4940 + }, + { + "epoch": 0.95, + "learning_rate": 0.0002062801311306807, + "loss": 0.7443, + "step": 4960 + }, + { + "epoch": 0.95, + "learning_rate": 0.00020589445265796746, + "loss": 0.7355, + "step": 4980 + }, + { + "epoch": 0.96, + "learning_rate": 0.00020550877418525423, + "loss": 0.7388, + "step": 5000 + }, + { + "epoch": 0.96, + "eval_loss": 0.7630622386932373, + "eval_runtime": 25.8654, + "eval_samples_per_second": 77.323, + "eval_steps_per_second": 1.237, + "step": 5000 + }, + { + "epoch": 0.96, + "learning_rate": 0.00020512309571254096, + "loss": 0.7317, + "step": 5020 + }, + { + "epoch": 0.97, + "learning_rate": 0.0002047374172398277, + "loss": 0.7385, + "step": 5040 + }, + { + "epoch": 0.97, + "learning_rate": 0.00020435173876711445, + "loss": 0.7369, + "step": 5060 + }, + { + "epoch": 0.97, + "learning_rate": 0.0002039660602944012, + "loss": 0.7243, + "step": 5080 + }, + { + "epoch": 0.98, + "learning_rate": 0.00020358038182168798, + "loss": 0.7334, + "step": 5100 + }, + { + "epoch": 0.98, + "learning_rate": 0.0002031947033489747, + "loss": 0.7433, + "step": 5120 + }, + { + "epoch": 0.98, + "learning_rate": 0.00020280902487626148, + "loss": 0.7202, + "step": 5140 + }, + { + "epoch": 0.99, + "learning_rate": 0.00020242334640354823, + "loss": 0.7336, + "step": 5160 + }, + { + "epoch": 0.99, + "learning_rate": 0.00020203766793083498, + "loss": 0.7324, + "step": 5180 + }, + { + "epoch": 1.0, + "learning_rate": 0.00020165198945812173, + "loss": 0.7363, + "step": 5200 + }, + { + "epoch": 1.0, + "eval_loss": 0.7617191076278687, + "eval_runtime": 25.4884, + "eval_samples_per_second": 78.467, + "eval_steps_per_second": 1.255, + "step": 5200 + }, + { + "epoch": 1.0, + "learning_rate": 0.0002012663109854085, + "loss": 0.7359, + "step": 5220 + }, + { + "epoch": 1.0, + "learning_rate": 0.00020088063251269523, + "loss": 0.7347, + "step": 5240 + }, + { + "epoch": 1.01, + "learning_rate": 0.00020049495403998198, + "loss": 0.732, + "step": 5260 + }, + { + "epoch": 1.01, + "learning_rate": 0.00020010927556726873, + "loss": 0.7385, + "step": 5280 + }, + { + "epoch": 1.02, + "learning_rate": 0.00019972359709455548, + "loss": 0.7313, + "step": 5300 + }, + { + "epoch": 1.02, + "learning_rate": 0.00019933791862184226, + "loss": 0.7337, + "step": 5320 + }, + { + "epoch": 1.02, + "learning_rate": 0.00019895224014912898, + "loss": 0.733, + "step": 5340 + }, + { + "epoch": 1.03, + "learning_rate": 0.00019856656167641576, + "loss": 0.7226, + "step": 5360 + }, + { + "epoch": 1.03, + "learning_rate": 0.0001981808832037025, + "loss": 0.7363, + "step": 5380 + }, + { + "epoch": 1.03, + "learning_rate": 0.00019779520473098925, + "loss": 0.7296, + "step": 5400 + }, + { + "epoch": 1.03, + "eval_loss": 0.7608480453491211, + "eval_runtime": 25.5307, + "eval_samples_per_second": 78.337, + "eval_steps_per_second": 1.253, + "step": 5400 + }, + { + "epoch": 1.04, + "learning_rate": 0.000197409526258276, + "loss": 0.7237, + "step": 5420 + }, + { + "epoch": 1.04, + "learning_rate": 0.00019702384778556273, + "loss": 0.735, + "step": 5440 + }, + { + "epoch": 1.05, + "learning_rate": 0.0001966381693128495, + "loss": 0.7379, + "step": 5460 + }, + { + "epoch": 1.05, + "learning_rate": 0.00019625249084013628, + "loss": 0.7372, + "step": 5480 + }, + { + "epoch": 1.05, + "learning_rate": 0.000195866812367423, + "loss": 0.7332, + "step": 5500 + }, + { + "epoch": 1.06, + "learning_rate": 0.00019548113389470975, + "loss": 0.7375, + "step": 5520 + }, + { + "epoch": 1.06, + "learning_rate": 0.00019509545542199653, + "loss": 0.7352, + "step": 5540 + }, + { + "epoch": 1.07, + "learning_rate": 0.00019470977694928325, + "loss": 0.7336, + "step": 5560 + }, + { + "epoch": 1.07, + "learning_rate": 0.00019432409847657003, + "loss": 0.7266, + "step": 5580 + }, + { + "epoch": 1.07, + "learning_rate": 0.00019393842000385675, + "loss": 0.7325, + "step": 5600 + }, + { + "epoch": 1.07, + "eval_loss": 0.7595871686935425, + "eval_runtime": 25.4845, + "eval_samples_per_second": 78.479, + "eval_steps_per_second": 1.256, + "step": 5600 + }, + { + "epoch": 1.08, + "learning_rate": 0.00019355274153114353, + "loss": 0.7259, + "step": 5620 + }, + { + "epoch": 1.08, + "learning_rate": 0.00019316706305843028, + "loss": 0.7274, + "step": 5640 + }, + { + "epoch": 1.08, + "learning_rate": 0.000192781384585717, + "loss": 0.7254, + "step": 5660 + }, + { + "epoch": 1.09, + "learning_rate": 0.00019239570611300378, + "loss": 0.7332, + "step": 5680 + }, + { + "epoch": 1.09, + "learning_rate": 0.00019201002764029056, + "loss": 0.73, + "step": 5700 + }, + { + "epoch": 1.1, + "learning_rate": 0.00019162434916757728, + "loss": 0.7365, + "step": 5720 + }, + { + "epoch": 1.1, + "learning_rate": 0.00019123867069486403, + "loss": 0.7261, + "step": 5740 + }, + { + "epoch": 1.1, + "learning_rate": 0.00019085299222215078, + "loss": 0.7331, + "step": 5760 + }, + { + "epoch": 1.11, + "learning_rate": 0.00019046731374943753, + "loss": 0.7272, + "step": 5780 + }, + { + "epoch": 1.11, + "learning_rate": 0.0001900816352767243, + "loss": 0.7325, + "step": 5800 + }, + { + "epoch": 1.11, + "eval_loss": 0.7583591341972351, + "eval_runtime": 25.49, + "eval_samples_per_second": 78.462, + "eval_steps_per_second": 1.255, + "step": 5800 + }, + { + "epoch": 1.12, + "learning_rate": 0.00018969595680401103, + "loss": 0.7277, + "step": 5820 + }, + { + "epoch": 1.12, + "learning_rate": 0.0001893102783312978, + "loss": 0.7352, + "step": 5840 + }, + { + "epoch": 1.12, + "learning_rate": 0.00018892459985858455, + "loss": 0.7312, + "step": 5860 + }, + { + "epoch": 1.13, + "learning_rate": 0.00018853892138587128, + "loss": 0.7296, + "step": 5880 + }, + { + "epoch": 1.13, + "learning_rate": 0.00018815324291315805, + "loss": 0.7275, + "step": 5900 + }, + { + "epoch": 1.13, + "learning_rate": 0.00018776756444044478, + "loss": 0.7345, + "step": 5920 + }, + { + "epoch": 1.14, + "learning_rate": 0.00018738188596773155, + "loss": 0.7322, + "step": 5940 + }, + { + "epoch": 1.14, + "learning_rate": 0.0001869962074950183, + "loss": 0.737, + "step": 5960 + }, + { + "epoch": 1.15, + "learning_rate": 0.00018661052902230505, + "loss": 0.7243, + "step": 5980 + }, + { + "epoch": 1.15, + "learning_rate": 0.0001862248505495918, + "loss": 0.7303, + "step": 6000 + }, + { + "epoch": 1.15, + "eval_loss": 0.7572018504142761, + "eval_runtime": 25.7126, + "eval_samples_per_second": 77.783, + "eval_steps_per_second": 1.245, + "step": 6000 + }, + { + "epoch": 1.15, + "learning_rate": 0.00018583917207687858, + "loss": 0.7237, + "step": 6020 + }, + { + "epoch": 1.16, + "learning_rate": 0.0001854534936041653, + "loss": 0.735, + "step": 6040 + }, + { + "epoch": 1.16, + "learning_rate": 0.00018506781513145208, + "loss": 0.727, + "step": 6060 + }, + { + "epoch": 1.16, + "learning_rate": 0.0001846821366587388, + "loss": 0.7226, + "step": 6080 + }, + { + "epoch": 1.17, + "learning_rate": 0.00018429645818602558, + "loss": 0.7213, + "step": 6100 + }, + { + "epoch": 1.17, + "learning_rate": 0.000183930063636948, + "loss": 0.7206, + "step": 6120 + }, + { + "epoch": 1.18, + "learning_rate": 0.0001835443851642347, + "loss": 0.7292, + "step": 6140 + }, + { + "epoch": 1.18, + "learning_rate": 0.0001831587066915215, + "loss": 0.7316, + "step": 6160 + }, + { + "epoch": 1.18, + "learning_rate": 0.00018277302821880824, + "loss": 0.7318, + "step": 6180 + }, + { + "epoch": 1.19, + "learning_rate": 0.000182387349746095, + "loss": 0.7302, + "step": 6200 + }, + { + "epoch": 1.19, + "eval_loss": 0.755982518196106, + "eval_runtime": 25.5055, + "eval_samples_per_second": 78.415, + "eval_steps_per_second": 1.255, + "step": 6200 + }, + { + "epoch": 1.19, + "learning_rate": 0.00018200167127338174, + "loss": 0.728, + "step": 6220 + }, + { + "epoch": 1.2, + "learning_rate": 0.00018161599280066851, + "loss": 0.7334, + "step": 6240 + }, + { + "epoch": 1.2, + "learning_rate": 0.00018123031432795524, + "loss": 0.7303, + "step": 6260 + }, + { + "epoch": 1.2, + "learning_rate": 0.000180844635855242, + "loss": 0.7274, + "step": 6280 + }, + { + "epoch": 1.21, + "learning_rate": 0.00018045895738252874, + "loss": 0.7368, + "step": 6300 + }, + { + "epoch": 1.21, + "learning_rate": 0.00018007327890981549, + "loss": 0.724, + "step": 6320 + }, + { + "epoch": 1.21, + "learning_rate": 0.00017968760043710226, + "loss": 0.7229, + "step": 6340 + }, + { + "epoch": 1.22, + "learning_rate": 0.00017930192196438899, + "loss": 0.7216, + "step": 6360 + }, + { + "epoch": 1.22, + "learning_rate": 0.00017891624349167576, + "loss": 0.7292, + "step": 6380 + }, + { + "epoch": 1.23, + "learning_rate": 0.0001785305650189625, + "loss": 0.7226, + "step": 6400 + }, + { + "epoch": 1.23, + "eval_loss": 0.7554095387458801, + "eval_runtime": 25.5062, + "eval_samples_per_second": 78.412, + "eval_steps_per_second": 1.255, + "step": 6400 + }, + { + "epoch": 1.23, + "learning_rate": 0.00017814488654624926, + "loss": 0.7262, + "step": 6420 + }, + { + "epoch": 1.23, + "learning_rate": 0.000177759208073536, + "loss": 0.7274, + "step": 6440 + }, + { + "epoch": 1.24, + "learning_rate": 0.00017737352960082276, + "loss": 0.7271, + "step": 6460 + }, + { + "epoch": 1.24, + "learning_rate": 0.0001769878511281095, + "loss": 0.7299, + "step": 6480 + }, + { + "epoch": 1.25, + "learning_rate": 0.0001766021726553963, + "loss": 0.7264, + "step": 6500 + }, + { + "epoch": 1.25, + "learning_rate": 0.000176216494182683, + "loss": 0.7285, + "step": 6520 + }, + { + "epoch": 1.25, + "learning_rate": 0.0001758308157099698, + "loss": 0.7216, + "step": 6540 + }, + { + "epoch": 1.26, + "learning_rate": 0.00017544513723725654, + "loss": 0.7215, + "step": 6560 + }, + { + "epoch": 1.26, + "learning_rate": 0.00017505945876454326, + "loss": 0.7253, + "step": 6580 + }, + { + "epoch": 1.26, + "learning_rate": 0.00017467378029183004, + "loss": 0.7246, + "step": 6600 + }, + { + "epoch": 1.26, + "eval_loss": 0.7540405988693237, + "eval_runtime": 25.4725, + "eval_samples_per_second": 78.516, + "eval_steps_per_second": 1.256, + "step": 6600 + }, + { + "epoch": 1.27, + "learning_rate": 0.00017428810181911676, + "loss": 0.7166, + "step": 6620 + }, + { + "epoch": 1.27, + "learning_rate": 0.00017390242334640354, + "loss": 0.7213, + "step": 6640 + }, + { + "epoch": 1.28, + "learning_rate": 0.00017351674487369029, + "loss": 0.7305, + "step": 6660 + }, + { + "epoch": 1.28, + "learning_rate": 0.00017313106640097704, + "loss": 0.7347, + "step": 6680 + }, + { + "epoch": 1.28, + "learning_rate": 0.00017274538792826379, + "loss": 0.7272, + "step": 6700 + }, + { + "epoch": 1.29, + "learning_rate": 0.00017235970945555056, + "loss": 0.7224, + "step": 6720 + }, + { + "epoch": 1.29, + "learning_rate": 0.00017197403098283728, + "loss": 0.7327, + "step": 6740 + }, + { + "epoch": 1.3, + "learning_rate": 0.00017158835251012406, + "loss": 0.7228, + "step": 6760 + }, + { + "epoch": 1.3, + "learning_rate": 0.00017120267403741078, + "loss": 0.7344, + "step": 6780 + }, + { + "epoch": 1.3, + "learning_rate": 0.00017081699556469753, + "loss": 0.7269, + "step": 6800 + }, + { + "epoch": 1.3, + "eval_loss": 0.7531024813652039, + "eval_runtime": 25.6796, + "eval_samples_per_second": 77.883, + "eval_steps_per_second": 1.246, + "step": 6800 + }, + { + "epoch": 1.31, + "learning_rate": 0.0001704313170919843, + "loss": 0.7362, + "step": 6820 + }, + { + "epoch": 1.31, + "learning_rate": 0.00017004563861927103, + "loss": 0.7293, + "step": 6840 + }, + { + "epoch": 1.31, + "learning_rate": 0.0001696599601465578, + "loss": 0.7286, + "step": 6860 + }, + { + "epoch": 1.32, + "learning_rate": 0.00016927428167384456, + "loss": 0.7148, + "step": 6880 + }, + { + "epoch": 1.32, + "learning_rate": 0.0001688886032011313, + "loss": 0.72, + "step": 6900 + }, + { + "epoch": 1.33, + "learning_rate": 0.00016850292472841806, + "loss": 0.7239, + "step": 6920 + }, + { + "epoch": 1.33, + "learning_rate": 0.0001681172462557048, + "loss": 0.726, + "step": 6940 + }, + { + "epoch": 1.33, + "learning_rate": 0.00016773156778299156, + "loss": 0.7286, + "step": 6960 + }, + { + "epoch": 1.34, + "learning_rate": 0.00016734588931027834, + "loss": 0.7276, + "step": 6980 + }, + { + "epoch": 1.34, + "learning_rate": 0.00016696021083756506, + "loss": 0.7258, + "step": 7000 + }, + { + "epoch": 1.34, + "eval_loss": 0.7521764636039734, + "eval_runtime": 25.5237, + "eval_samples_per_second": 78.359, + "eval_steps_per_second": 1.254, + "step": 7000 + }, + { + "epoch": 1.35, + "learning_rate": 0.0001665745323648518, + "loss": 0.7326, + "step": 7020 + }, + { + "epoch": 1.35, + "learning_rate": 0.00016618885389213859, + "loss": 0.7311, + "step": 7040 + }, + { + "epoch": 1.35, + "learning_rate": 0.0001658031754194253, + "loss": 0.7295, + "step": 7060 + }, + { + "epoch": 1.36, + "learning_rate": 0.00016541749694671208, + "loss": 0.7279, + "step": 7080 + }, + { + "epoch": 1.36, + "learning_rate": 0.00016503181847399883, + "loss": 0.7293, + "step": 7100 + }, + { + "epoch": 1.36, + "learning_rate": 0.00016464614000128558, + "loss": 0.7256, + "step": 7120 + }, + { + "epoch": 1.37, + "learning_rate": 0.00016426046152857233, + "loss": 0.7204, + "step": 7140 + }, + { + "epoch": 1.37, + "learning_rate": 0.00016387478305585908, + "loss": 0.718, + "step": 7160 + }, + { + "epoch": 1.38, + "learning_rate": 0.00016348910458314583, + "loss": 0.7206, + "step": 7180 + }, + { + "epoch": 1.38, + "learning_rate": 0.0001631034261104326, + "loss": 0.7282, + "step": 7200 + }, + { + "epoch": 1.38, + "eval_loss": 0.7514960765838623, + "eval_runtime": 25.5249, + "eval_samples_per_second": 78.355, + "eval_steps_per_second": 1.254, + "step": 7200 + }, + { + "epoch": 1.38, + "learning_rate": 0.00016271774763771933, + "loss": 0.7162, + "step": 7220 + }, + { + "epoch": 1.39, + "learning_rate": 0.00016233206916500608, + "loss": 0.7277, + "step": 7240 + }, + { + "epoch": 1.39, + "learning_rate": 0.00016194639069229286, + "loss": 0.7147, + "step": 7260 + }, + { + "epoch": 1.39, + "learning_rate": 0.00016156071221957958, + "loss": 0.7339, + "step": 7280 + }, + { + "epoch": 1.4, + "learning_rate": 0.00016117503374686636, + "loss": 0.7257, + "step": 7300 + }, + { + "epoch": 1.4, + "learning_rate": 0.00016078935527415308, + "loss": 0.728, + "step": 7320 + }, + { + "epoch": 1.41, + "learning_rate": 0.00016040367680143986, + "loss": 0.7139, + "step": 7340 + }, + { + "epoch": 1.41, + "learning_rate": 0.0001600179983287266, + "loss": 0.7202, + "step": 7360 + }, + { + "epoch": 1.41, + "learning_rate": 0.00015963231985601336, + "loss": 0.7323, + "step": 7380 + }, + { + "epoch": 1.42, + "learning_rate": 0.0001592466413833001, + "loss": 0.7198, + "step": 7400 + }, + { + "epoch": 1.42, + "eval_loss": 0.750492513179779, + "eval_runtime": 25.8887, + "eval_samples_per_second": 77.254, + "eval_steps_per_second": 1.236, + "step": 7400 + }, + { + "epoch": 1.42, + "learning_rate": 0.00015886096291058688, + "loss": 0.7138, + "step": 7420 + }, + { + "epoch": 1.43, + "learning_rate": 0.0001584752844378736, + "loss": 0.7205, + "step": 7440 + }, + { + "epoch": 1.43, + "learning_rate": 0.00015808960596516038, + "loss": 0.7178, + "step": 7460 + }, + { + "epoch": 1.43, + "learning_rate": 0.0001577039274924471, + "loss": 0.7251, + "step": 7480 + }, + { + "epoch": 1.44, + "learning_rate": 0.00015731824901973386, + "loss": 0.7187, + "step": 7500 + }, + { + "epoch": 1.44, + "learning_rate": 0.00015693257054702063, + "loss": 0.7238, + "step": 7520 + }, + { + "epoch": 1.44, + "learning_rate": 0.00015654689207430736, + "loss": 0.7283, + "step": 7540 + }, + { + "epoch": 1.45, + "learning_rate": 0.00015616121360159413, + "loss": 0.7189, + "step": 7560 + }, + { + "epoch": 1.45, + "learning_rate": 0.00015577553512888088, + "loss": 0.7216, + "step": 7580 + }, + { + "epoch": 1.46, + "learning_rate": 0.00015538985665616763, + "loss": 0.7219, + "step": 7600 + }, + { + "epoch": 1.46, + "eval_loss": 0.7496184706687927, + "eval_runtime": 25.4957, + "eval_samples_per_second": 78.445, + "eval_steps_per_second": 1.255, + "step": 7600 + }, + { + "epoch": 1.46, + "learning_rate": 0.00015500417818345438, + "loss": 0.7233, + "step": 7620 + }, + { + "epoch": 1.46, + "learning_rate": 0.0001546184997107411, + "loss": 0.7241, + "step": 7640 + }, + { + "epoch": 1.47, + "learning_rate": 0.00015423282123802788, + "loss": 0.7194, + "step": 7660 + }, + { + "epoch": 1.47, + "learning_rate": 0.00015384714276531466, + "loss": 0.7229, + "step": 7680 + }, + { + "epoch": 1.48, + "learning_rate": 0.00015346146429260138, + "loss": 0.7219, + "step": 7700 + }, + { + "epoch": 1.48, + "learning_rate": 0.00015307578581988813, + "loss": 0.7027, + "step": 7720 + }, + { + "epoch": 1.48, + "learning_rate": 0.0001526901073471749, + "loss": 0.7171, + "step": 7740 + }, + { + "epoch": 1.49, + "learning_rate": 0.00015230442887446163, + "loss": 0.7193, + "step": 7760 + }, + { + "epoch": 1.49, + "learning_rate": 0.0001519187504017484, + "loss": 0.7269, + "step": 7780 + }, + { + "epoch": 1.49, + "learning_rate": 0.00015153307192903513, + "loss": 0.7171, + "step": 7800 + }, + { + "epoch": 1.49, + "eval_loss": 0.7494381070137024, + "eval_runtime": 25.5318, + "eval_samples_per_second": 78.334, + "eval_steps_per_second": 1.253, + "step": 7800 + }, + { + "epoch": 1.5, + "learning_rate": 0.0001511473934563219, + "loss": 0.7186, + "step": 7820 + }, + { + "epoch": 1.5, + "learning_rate": 0.00015076171498360866, + "loss": 0.7137, + "step": 7840 + }, + { + "epoch": 1.51, + "learning_rate": 0.00015037603651089538, + "loss": 0.7212, + "step": 7860 + }, + { + "epoch": 1.51, + "learning_rate": 0.00014999035803818216, + "loss": 0.7167, + "step": 7880 + }, + { + "epoch": 1.51, + "learning_rate": 0.0001496046795654689, + "loss": 0.7203, + "step": 7900 + }, + { + "epoch": 1.52, + "learning_rate": 0.00014921900109275566, + "loss": 0.714, + "step": 7920 + }, + { + "epoch": 1.52, + "learning_rate": 0.0001488333226200424, + "loss": 0.7153, + "step": 7940 + }, + { + "epoch": 1.53, + "learning_rate": 0.00014844764414732916, + "loss": 0.7176, + "step": 7960 + }, + { + "epoch": 1.53, + "learning_rate": 0.0001480619656746159, + "loss": 0.7049, + "step": 7980 + }, + { + "epoch": 1.53, + "learning_rate": 0.00014767628720190265, + "loss": 0.7204, + "step": 8000 + }, + { + "epoch": 1.53, + "eval_loss": 0.7486086487770081, + "eval_runtime": 25.5275, + "eval_samples_per_second": 78.347, + "eval_steps_per_second": 1.254, + "step": 8000 + }, + { + "epoch": 1.54, + "learning_rate": 0.00014729060872918943, + "loss": 0.7167, + "step": 8020 + }, + { + "epoch": 1.54, + "learning_rate": 0.00014690493025647618, + "loss": 0.72, + "step": 8040 + }, + { + "epoch": 1.54, + "learning_rate": 0.00014651925178376293, + "loss": 0.7203, + "step": 8060 + }, + { + "epoch": 1.55, + "learning_rate": 0.00014613357331104968, + "loss": 0.7258, + "step": 8080 + }, + { + "epoch": 1.55, + "learning_rate": 0.00014574789483833643, + "loss": 0.715, + "step": 8100 + }, + { + "epoch": 1.56, + "learning_rate": 0.00014536221636562318, + "loss": 0.7245, + "step": 8120 + }, + { + "epoch": 1.56, + "learning_rate": 0.00014497653789290993, + "loss": 0.7258, + "step": 8140 + }, + { + "epoch": 1.56, + "learning_rate": 0.00014459085942019668, + "loss": 0.7234, + "step": 8160 + }, + { + "epoch": 1.57, + "learning_rate": 0.00014420518094748343, + "loss": 0.7128, + "step": 8180 + }, + { + "epoch": 1.57, + "learning_rate": 0.00014381950247477018, + "loss": 0.7181, + "step": 8200 + }, + { + "epoch": 1.57, + "eval_loss": 0.7475513219833374, + "eval_runtime": 25.5412, + "eval_samples_per_second": 78.305, + "eval_steps_per_second": 1.253, + "step": 8200 + }, + { + "epoch": 1.58, + "learning_rate": 0.00014343382400205693, + "loss": 0.7236, + "step": 8220 + }, + { + "epoch": 1.58, + "learning_rate": 0.0001430481455293437, + "loss": 0.7125, + "step": 8240 + }, + { + "epoch": 1.58, + "learning_rate": 0.00014266246705663046, + "loss": 0.7186, + "step": 8260 + }, + { + "epoch": 1.59, + "learning_rate": 0.0001422767885839172, + "loss": 0.7203, + "step": 8280 + }, + { + "epoch": 1.59, + "learning_rate": 0.00014189111011120396, + "loss": 0.7156, + "step": 8300 + }, + { + "epoch": 1.59, + "learning_rate": 0.0001415054316384907, + "loss": 0.714, + "step": 8320 + }, + { + "epoch": 1.6, + "learning_rate": 0.00014111975316577745, + "loss": 0.7129, + "step": 8340 + }, + { + "epoch": 1.6, + "learning_rate": 0.0001407340746930642, + "loss": 0.7179, + "step": 8360 + }, + { + "epoch": 1.61, + "learning_rate": 0.00014036768014398662, + "loss": 0.7197, + "step": 8380 + }, + { + "epoch": 1.61, + "learning_rate": 0.00013998200167127337, + "loss": 0.7287, + "step": 8400 + }, + { + "epoch": 1.61, + "eval_loss": 0.7470650672912598, + "eval_runtime": 25.5238, + "eval_samples_per_second": 78.358, + "eval_steps_per_second": 1.254, + "step": 8400 + }, + { + "epoch": 1.61, + "learning_rate": 0.00013959632319856011, + "loss": 0.718, + "step": 8420 + }, + { + "epoch": 1.62, + "learning_rate": 0.00013921064472584686, + "loss": 0.7166, + "step": 8440 + }, + { + "epoch": 1.62, + "learning_rate": 0.00013882496625313361, + "loss": 0.7218, + "step": 8460 + }, + { + "epoch": 1.62, + "learning_rate": 0.0001384392877804204, + "loss": 0.723, + "step": 8480 + }, + { + "epoch": 1.63, + "learning_rate": 0.00013805360930770714, + "loss": 0.7104, + "step": 8500 + }, + { + "epoch": 1.63, + "learning_rate": 0.0001376679308349939, + "loss": 0.7136, + "step": 8520 + }, + { + "epoch": 1.64, + "learning_rate": 0.0001372822523622806, + "loss": 0.7237, + "step": 8540 + }, + { + "epoch": 1.64, + "learning_rate": 0.0001368965738895674, + "loss": 0.7196, + "step": 8560 + }, + { + "epoch": 1.64, + "learning_rate": 0.00013651089541685414, + "loss": 0.7218, + "step": 8580 + }, + { + "epoch": 1.65, + "learning_rate": 0.0001361252169441409, + "loss": 0.7132, + "step": 8600 + }, + { + "epoch": 1.65, + "eval_loss": 0.7465201020240784, + "eval_runtime": 25.542, + "eval_samples_per_second": 78.302, + "eval_steps_per_second": 1.253, + "step": 8600 + }, + { + "epoch": 1.65, + "learning_rate": 0.00013573953847142764, + "loss": 0.7139, + "step": 8620 + }, + { + "epoch": 1.66, + "learning_rate": 0.0001353538599987144, + "loss": 0.7093, + "step": 8640 + }, + { + "epoch": 1.66, + "learning_rate": 0.00013496818152600114, + "loss": 0.7243, + "step": 8660 + }, + { + "epoch": 1.66, + "learning_rate": 0.0001345825030532879, + "loss": 0.7127, + "step": 8680 + }, + { + "epoch": 1.67, + "learning_rate": 0.00013419682458057464, + "loss": 0.7148, + "step": 8700 + }, + { + "epoch": 1.67, + "learning_rate": 0.00013381114610786142, + "loss": 0.7236, + "step": 8720 + }, + { + "epoch": 1.67, + "learning_rate": 0.00013342546763514817, + "loss": 0.7103, + "step": 8740 + }, + { + "epoch": 1.68, + "learning_rate": 0.0001330397891624349, + "loss": 0.7133, + "step": 8760 + }, + { + "epoch": 1.68, + "learning_rate": 0.00013265411068972164, + "loss": 0.7182, + "step": 8780 + }, + { + "epoch": 1.69, + "learning_rate": 0.00013226843221700841, + "loss": 0.7198, + "step": 8800 + }, + { + "epoch": 1.69, + "eval_loss": 0.7450763583183289, + "eval_runtime": 25.4725, + "eval_samples_per_second": 78.516, + "eval_steps_per_second": 1.256, + "step": 8800 + }, + { + "epoch": 1.69, + "learning_rate": 0.00013188275374429516, + "loss": 0.7073, + "step": 8820 + }, + { + "epoch": 1.69, + "learning_rate": 0.00013149707527158191, + "loss": 0.7208, + "step": 8840 + }, + { + "epoch": 1.7, + "learning_rate": 0.00013111139679886866, + "loss": 0.7067, + "step": 8860 + }, + { + "epoch": 1.7, + "learning_rate": 0.0001307257183261554, + "loss": 0.7149, + "step": 8880 + }, + { + "epoch": 1.71, + "learning_rate": 0.00013034003985344216, + "loss": 0.7133, + "step": 8900 + }, + { + "epoch": 1.71, + "learning_rate": 0.0001299543613807289, + "loss": 0.7137, + "step": 8920 + }, + { + "epoch": 1.71, + "learning_rate": 0.0001295686829080157, + "loss": 0.719, + "step": 8940 + }, + { + "epoch": 1.72, + "learning_rate": 0.00012918300443530244, + "loss": 0.7198, + "step": 8960 + }, + { + "epoch": 1.72, + "learning_rate": 0.0001287973259625892, + "loss": 0.7074, + "step": 8980 + }, + { + "epoch": 1.72, + "learning_rate": 0.0001284116474898759, + "loss": 0.7216, + "step": 9000 + }, + { + "epoch": 1.72, + "eval_loss": 0.7454459071159363, + "eval_runtime": 25.5717, + "eval_samples_per_second": 78.211, + "eval_steps_per_second": 1.251, + "step": 9000 + }, + { + "epoch": 1.73, + "learning_rate": 0.0001280259690171627, + "loss": 0.7203, + "step": 9020 + }, + { + "epoch": 1.73, + "learning_rate": 0.00012764029054444944, + "loss": 0.7133, + "step": 9040 + }, + { + "epoch": 1.74, + "learning_rate": 0.0001272546120717362, + "loss": 0.7081, + "step": 9060 + }, + { + "epoch": 1.74, + "learning_rate": 0.00012686893359902294, + "loss": 0.7153, + "step": 9080 + }, + { + "epoch": 1.74, + "learning_rate": 0.0001264832551263097, + "loss": 0.7108, + "step": 9100 + }, + { + "epoch": 1.75, + "learning_rate": 0.00012609757665359644, + "loss": 0.7106, + "step": 9120 + }, + { + "epoch": 1.75, + "learning_rate": 0.0001257118981808832, + "loss": 0.7117, + "step": 9140 + }, + { + "epoch": 1.76, + "learning_rate": 0.00012532621970816994, + "loss": 0.7171, + "step": 9160 + }, + { + "epoch": 1.76, + "learning_rate": 0.00012494054123545671, + "loss": 0.7148, + "step": 9180 + }, + { + "epoch": 1.76, + "learning_rate": 0.00012455486276274346, + "loss": 0.714, + "step": 9200 + }, + { + "epoch": 1.76, + "eval_loss": 0.7446411848068237, + "eval_runtime": 25.5622, + "eval_samples_per_second": 78.24, + "eval_steps_per_second": 1.252, + "step": 9200 + }, + { + "epoch": 1.77, + "learning_rate": 0.00012416918429003019, + "loss": 0.7133, + "step": 9220 + }, + { + "epoch": 1.77, + "learning_rate": 0.00012378350581731694, + "loss": 0.7108, + "step": 9240 + }, + { + "epoch": 1.77, + "learning_rate": 0.0001233978273446037, + "loss": 0.7147, + "step": 9260 + }, + { + "epoch": 1.78, + "learning_rate": 0.00012301214887189046, + "loss": 0.715, + "step": 9280 + }, + { + "epoch": 1.78, + "learning_rate": 0.0001226264703991772, + "loss": 0.7255, + "step": 9300 + }, + { + "epoch": 1.79, + "learning_rate": 0.00012224079192646396, + "loss": 0.7168, + "step": 9320 + }, + { + "epoch": 1.79, + "learning_rate": 0.00012185511345375073, + "loss": 0.7155, + "step": 9340 + }, + { + "epoch": 1.79, + "learning_rate": 0.00012146943498103746, + "loss": 0.7064, + "step": 9360 + }, + { + "epoch": 1.8, + "learning_rate": 0.00012108375650832421, + "loss": 0.716, + "step": 9380 + }, + { + "epoch": 1.8, + "learning_rate": 0.00012069807803561096, + "loss": 0.7145, + "step": 9400 + }, + { + "epoch": 1.8, + "eval_loss": 0.7441000938415527, + "eval_runtime": 25.7378, + "eval_samples_per_second": 77.707, + "eval_steps_per_second": 1.243, + "step": 9400 + }, + { + "epoch": 1.8, + "learning_rate": 0.00012031239956289772, + "loss": 0.7135, + "step": 9420 + }, + { + "epoch": 1.81, + "learning_rate": 0.00011992672109018447, + "loss": 0.7164, + "step": 9440 + }, + { + "epoch": 1.81, + "learning_rate": 0.00011954104261747122, + "loss": 0.714, + "step": 9460 + }, + { + "epoch": 1.82, + "learning_rate": 0.00011915536414475797, + "loss": 0.7173, + "step": 9480 + }, + { + "epoch": 1.82, + "learning_rate": 0.00011876968567204474, + "loss": 0.7102, + "step": 9500 + }, + { + "epoch": 1.82, + "learning_rate": 0.00011838400719933149, + "loss": 0.7122, + "step": 9520 + }, + { + "epoch": 1.83, + "learning_rate": 0.00011799832872661822, + "loss": 0.7197, + "step": 9540 + }, + { + "epoch": 1.83, + "learning_rate": 0.00011761265025390497, + "loss": 0.7132, + "step": 9560 + }, + { + "epoch": 1.84, + "learning_rate": 0.00011722697178119174, + "loss": 0.7255, + "step": 9580 + }, + { + "epoch": 1.84, + "learning_rate": 0.00011684129330847849, + "loss": 0.7175, + "step": 9600 + }, + { + "epoch": 1.84, + "eval_loss": 0.7432481646537781, + "eval_runtime": 25.5076, + "eval_samples_per_second": 78.408, + "eval_steps_per_second": 1.255, + "step": 9600 + }, + { + "epoch": 1.84, + "learning_rate": 0.00011645561483576524, + "loss": 0.7125, + "step": 9620 + }, + { + "epoch": 1.85, + "learning_rate": 0.00011606993636305199, + "loss": 0.7119, + "step": 9640 + }, + { + "epoch": 1.85, + "learning_rate": 0.00011568425789033875, + "loss": 0.7147, + "step": 9660 + }, + { + "epoch": 1.85, + "learning_rate": 0.0001152985794176255, + "loss": 0.7101, + "step": 9680 + }, + { + "epoch": 1.86, + "learning_rate": 0.00011491290094491225, + "loss": 0.7105, + "step": 9700 + }, + { + "epoch": 1.86, + "learning_rate": 0.000114527222472199, + "loss": 0.7153, + "step": 9720 + }, + { + "epoch": 1.87, + "learning_rate": 0.00011414154399948576, + "loss": 0.7047, + "step": 9740 + }, + { + "epoch": 1.87, + "learning_rate": 0.00011375586552677251, + "loss": 0.6967, + "step": 9760 + }, + { + "epoch": 1.87, + "learning_rate": 0.00011337018705405925, + "loss": 0.7094, + "step": 9780 + }, + { + "epoch": 1.88, + "learning_rate": 0.000112984508581346, + "loss": 0.7195, + "step": 9800 + }, + { + "epoch": 1.88, + "eval_loss": 0.7431700229644775, + "eval_runtime": 25.5752, + "eval_samples_per_second": 78.201, + "eval_steps_per_second": 1.251, + "step": 9800 + }, + { + "epoch": 1.88, + "learning_rate": 0.00011259883010863276, + "loss": 0.7122, + "step": 9820 + }, + { + "epoch": 1.89, + "learning_rate": 0.00011221315163591951, + "loss": 0.7193, + "step": 9840 + }, + { + "epoch": 1.89, + "learning_rate": 0.00011182747316320626, + "loss": 0.7147, + "step": 9860 + }, + { + "epoch": 1.89, + "learning_rate": 0.00011144179469049301, + "loss": 0.7058, + "step": 9880 + }, + { + "epoch": 1.9, + "learning_rate": 0.00011105611621777977, + "loss": 0.7106, + "step": 9900 + }, + { + "epoch": 1.9, + "learning_rate": 0.00011067043774506652, + "loss": 0.71, + "step": 9920 + }, + { + "epoch": 1.9, + "learning_rate": 0.00011028475927235327, + "loss": 0.7182, + "step": 9940 + }, + { + "epoch": 1.91, + "learning_rate": 0.00010989908079964002, + "loss": 0.7048, + "step": 9960 + }, + { + "epoch": 1.91, + "learning_rate": 0.00010951340232692679, + "loss": 0.7165, + "step": 9980 + }, + { + "epoch": 1.92, + "learning_rate": 0.00010912772385421352, + "loss": 0.7153, + "step": 10000 + }, + { + "epoch": 1.92, + "eval_loss": 0.7425808310508728, + "eval_runtime": 25.6297, + "eval_samples_per_second": 78.034, + "eval_steps_per_second": 1.249, + "step": 10000 + }, + { + "epoch": 1.92, + "learning_rate": 0.00010874204538150027, + "loss": 0.7127, + "step": 10020 + }, + { + "epoch": 1.92, + "learning_rate": 0.00010835636690878702, + "loss": 0.7062, + "step": 10040 + }, + { + "epoch": 1.93, + "learning_rate": 0.00010797068843607378, + "loss": 0.7125, + "step": 10060 + }, + { + "epoch": 1.93, + "learning_rate": 0.00010758500996336053, + "loss": 0.7114, + "step": 10080 + }, + { + "epoch": 1.94, + "learning_rate": 0.00010719933149064728, + "loss": 0.7096, + "step": 10100 + }, + { + "epoch": 1.94, + "learning_rate": 0.00010681365301793405, + "loss": 0.7119, + "step": 10120 + }, + { + "epoch": 1.94, + "learning_rate": 0.0001064279745452208, + "loss": 0.7034, + "step": 10140 + }, + { + "epoch": 1.95, + "learning_rate": 0.00010604229607250755, + "loss": 0.7049, + "step": 10160 + }, + { + "epoch": 1.95, + "learning_rate": 0.0001056566175997943, + "loss": 0.7156, + "step": 10180 + }, + { + "epoch": 1.95, + "learning_rate": 0.00010527093912708106, + "loss": 0.718, + "step": 10200 + }, + { + "epoch": 1.95, + "eval_loss": 0.7418650984764099, + "eval_runtime": 25.554, + "eval_samples_per_second": 78.266, + "eval_steps_per_second": 1.252, + "step": 10200 + }, + { + "epoch": 1.96, + "learning_rate": 0.00010488526065436781, + "loss": 0.7141, + "step": 10220 + }, + { + "epoch": 1.96, + "learning_rate": 0.00010449958218165455, + "loss": 0.7073, + "step": 10240 + }, + { + "epoch": 1.97, + "learning_rate": 0.0001041139037089413, + "loss": 0.7129, + "step": 10260 + }, + { + "epoch": 1.97, + "learning_rate": 0.00010372822523622806, + "loss": 0.7174, + "step": 10280 + }, + { + "epoch": 1.97, + "learning_rate": 0.00010334254676351481, + "loss": 0.7112, + "step": 10300 + }, + { + "epoch": 1.98, + "learning_rate": 0.00010295686829080156, + "loss": 0.7073, + "step": 10320 + }, + { + "epoch": 1.98, + "learning_rate": 0.00010257118981808831, + "loss": 0.7164, + "step": 10340 + }, + { + "epoch": 1.99, + "learning_rate": 0.00010218551134537507, + "loss": 0.7057, + "step": 10360 + }, + { + "epoch": 1.99, + "learning_rate": 0.00010179983287266182, + "loss": 0.709, + "step": 10380 + }, + { + "epoch": 1.99, + "learning_rate": 0.00010141415439994857, + "loss": 0.7147, + "step": 10400 + }, + { + "epoch": 1.99, + "eval_loss": 0.7417293787002563, + "eval_runtime": 25.4964, + "eval_samples_per_second": 78.443, + "eval_steps_per_second": 1.255, + "step": 10400 + }, + { + "epoch": 2.0, + "learning_rate": 0.00010102847592723531, + "loss": 0.713, + "step": 10420 + }, + { + "epoch": 2.0, + "learning_rate": 0.00010064279745452208, + "loss": 0.7128, + "step": 10440 + }, + { + "epoch": 2.0, + "learning_rate": 0.00010025711898180882, + "loss": 0.7094, + "step": 10460 + }, + { + "epoch": 2.01, + "learning_rate": 9.987144050909557e-05, + "loss": 0.7008, + "step": 10480 + }, + { + "epoch": 2.01, + "learning_rate": 9.948576203638232e-05, + "loss": 0.7083, + "step": 10500 + }, + { + "epoch": 2.02, + "learning_rate": 9.910008356366908e-05, + "loss": 0.7049, + "step": 10520 + }, + { + "epoch": 2.02, + "learning_rate": 9.871440509095583e-05, + "loss": 0.7041, + "step": 10540 + }, + { + "epoch": 2.02, + "learning_rate": 9.834801054187824e-05, + "loss": 0.7105, + "step": 10560 + }, + { + "epoch": 2.03, + "learning_rate": 9.7962332069165e-05, + "loss": 0.7041, + "step": 10580 + }, + { + "epoch": 2.03, + "learning_rate": 9.757665359645176e-05, + "loss": 0.7103, + "step": 10600 + }, + { + "epoch": 2.03, + "eval_loss": 0.7410894632339478, + "eval_runtime": 25.5424, + "eval_samples_per_second": 78.301, + "eval_steps_per_second": 1.253, + "step": 10600 + }, + { + "epoch": 2.03, + "learning_rate": 9.71909751237385e-05, + "loss": 0.7037, + "step": 10620 + }, + { + "epoch": 2.04, + "learning_rate": 9.680529665102524e-05, + "loss": 0.7078, + "step": 10640 + }, + { + "epoch": 2.04, + "learning_rate": 9.641961817831202e-05, + "loss": 0.7116, + "step": 10660 + }, + { + "epoch": 2.05, + "learning_rate": 9.603393970559876e-05, + "loss": 0.7094, + "step": 10680 + }, + { + "epoch": 2.05, + "learning_rate": 9.56482612328855e-05, + "loss": 0.7217, + "step": 10700 + }, + { + "epoch": 2.05, + "learning_rate": 9.526258276017226e-05, + "loss": 0.7038, + "step": 10720 + }, + { + "epoch": 2.06, + "learning_rate": 9.487690428745902e-05, + "loss": 0.7131, + "step": 10740 + }, + { + "epoch": 2.06, + "learning_rate": 9.449122581474577e-05, + "loss": 0.7051, + "step": 10760 + }, + { + "epoch": 2.07, + "learning_rate": 9.410554734203252e-05, + "loss": 0.7058, + "step": 10780 + }, + { + "epoch": 2.07, + "learning_rate": 9.371986886931927e-05, + "loss": 0.7039, + "step": 10800 + }, + { + "epoch": 2.07, + "eval_loss": 0.7405736446380615, + "eval_runtime": 25.7467, + "eval_samples_per_second": 77.68, + "eval_steps_per_second": 1.243, + "step": 10800 + }, + { + "epoch": 2.07, + "learning_rate": 9.333419039660603e-05, + "loss": 0.7101, + "step": 10820 + }, + { + "epoch": 2.08, + "learning_rate": 9.294851192389278e-05, + "loss": 0.6991, + "step": 10840 + }, + { + "epoch": 2.08, + "learning_rate": 9.256283345117953e-05, + "loss": 0.7069, + "step": 10860 + }, + { + "epoch": 2.08, + "learning_rate": 9.217715497846627e-05, + "loss": 0.7094, + "step": 10880 + }, + { + "epoch": 2.09, + "learning_rate": 9.179147650575303e-05, + "loss": 0.7103, + "step": 10900 + }, + { + "epoch": 2.09, + "learning_rate": 9.140579803303978e-05, + "loss": 0.7015, + "step": 10920 + }, + { + "epoch": 2.1, + "learning_rate": 9.102011956032653e-05, + "loss": 0.712, + "step": 10940 + }, + { + "epoch": 2.1, + "learning_rate": 9.063444108761328e-05, + "loss": 0.707, + "step": 10960 + }, + { + "epoch": 2.1, + "learning_rate": 9.024876261490004e-05, + "loss": 0.7009, + "step": 10980 + }, + { + "epoch": 2.11, + "learning_rate": 8.986308414218679e-05, + "loss": 0.7062, + "step": 11000 + }, + { + "epoch": 2.11, + "eval_loss": 0.7398320436477661, + "eval_runtime": 25.5459, + "eval_samples_per_second": 78.29, + "eval_steps_per_second": 1.253, + "step": 11000 + }, + { + "epoch": 2.11, + "learning_rate": 8.947740566947354e-05, + "loss": 0.7054, + "step": 11020 + }, + { + "epoch": 2.12, + "learning_rate": 8.909172719676029e-05, + "loss": 0.7094, + "step": 11040 + }, + { + "epoch": 2.12, + "learning_rate": 8.870604872404706e-05, + "loss": 0.7059, + "step": 11060 + }, + { + "epoch": 2.12, + "learning_rate": 8.83203702513338e-05, + "loss": 0.7202, + "step": 11080 + }, + { + "epoch": 2.13, + "learning_rate": 8.793469177862054e-05, + "loss": 0.699, + "step": 11100 + }, + { + "epoch": 2.13, + "learning_rate": 8.754901330590729e-05, + "loss": 0.7137, + "step": 11120 + }, + { + "epoch": 2.13, + "learning_rate": 8.716333483319405e-05, + "loss": 0.7048, + "step": 11140 + }, + { + "epoch": 2.14, + "learning_rate": 8.67776563604808e-05, + "loss": 0.7089, + "step": 11160 + }, + { + "epoch": 2.14, + "learning_rate": 8.639197788776755e-05, + "loss": 0.7057, + "step": 11180 + }, + { + "epoch": 2.15, + "learning_rate": 8.60062994150543e-05, + "loss": 0.709, + "step": 11200 + }, + { + "epoch": 2.15, + "eval_loss": 0.7393301725387573, + "eval_runtime": 25.7257, + "eval_samples_per_second": 77.743, + "eval_steps_per_second": 1.244, + "step": 11200 + }, + { + "epoch": 2.15, + "learning_rate": 8.562062094234107e-05, + "loss": 0.7027, + "step": 11220 + }, + { + "epoch": 2.15, + "learning_rate": 8.523494246962782e-05, + "loss": 0.7082, + "step": 11240 + }, + { + "epoch": 2.16, + "learning_rate": 8.484926399691457e-05, + "loss": 0.7007, + "step": 11260 + }, + { + "epoch": 2.16, + "learning_rate": 8.446358552420132e-05, + "loss": 0.7011, + "step": 11280 + }, + { + "epoch": 2.17, + "learning_rate": 8.407790705148808e-05, + "loss": 0.7067, + "step": 11300 + }, + { + "epoch": 2.17, + "learning_rate": 8.369222857877483e-05, + "loss": 0.702, + "step": 11320 + }, + { + "epoch": 2.17, + "learning_rate": 8.330655010606157e-05, + "loss": 0.7126, + "step": 11340 + }, + { + "epoch": 2.18, + "learning_rate": 8.292087163334832e-05, + "loss": 0.6947, + "step": 11360 + }, + { + "epoch": 2.18, + "learning_rate": 8.253519316063508e-05, + "loss": 0.7033, + "step": 11380 + }, + { + "epoch": 2.18, + "learning_rate": 8.214951468792183e-05, + "loss": 0.7075, + "step": 11400 + }, + { + "epoch": 2.18, + "eval_loss": 0.7390503883361816, + "eval_runtime": 25.6097, + "eval_samples_per_second": 78.095, + "eval_steps_per_second": 1.25, + "step": 11400 + }, + { + "epoch": 2.19, + "learning_rate": 8.176383621520858e-05, + "loss": 0.7081, + "step": 11420 + }, + { + "epoch": 2.19, + "learning_rate": 8.137815774249533e-05, + "loss": 0.7114, + "step": 11440 + }, + { + "epoch": 2.2, + "learning_rate": 8.099247926978209e-05, + "loss": 0.7105, + "step": 11460 + }, + { + "epoch": 2.2, + "learning_rate": 8.060680079706884e-05, + "loss": 0.7113, + "step": 11480 + }, + { + "epoch": 2.2, + "learning_rate": 8.022112232435559e-05, + "loss": 0.7109, + "step": 11500 + }, + { + "epoch": 2.21, + "learning_rate": 7.983544385164233e-05, + "loss": 0.7039, + "step": 11520 + }, + { + "epoch": 2.21, + "learning_rate": 7.94497653789291e-05, + "loss": 0.7144, + "step": 11540 + }, + { + "epoch": 2.21, + "learning_rate": 7.906408690621584e-05, + "loss": 0.7003, + "step": 11560 + }, + { + "epoch": 2.22, + "learning_rate": 7.867840843350259e-05, + "loss": 0.7028, + "step": 11580 + }, + { + "epoch": 2.22, + "learning_rate": 7.829272996078934e-05, + "loss": 0.7018, + "step": 11600 + }, + { + "epoch": 2.22, + "eval_loss": 0.7388148307800293, + "eval_runtime": 25.5069, + "eval_samples_per_second": 78.41, + "eval_steps_per_second": 1.255, + "step": 11600 + }, + { + "epoch": 2.23, + "learning_rate": 7.79070514880761e-05, + "loss": 0.7113, + "step": 11620 + }, + { + "epoch": 2.23, + "learning_rate": 7.752137301536285e-05, + "loss": 0.7136, + "step": 11640 + }, + { + "epoch": 2.23, + "learning_rate": 7.71356945426496e-05, + "loss": 0.7097, + "step": 11660 + }, + { + "epoch": 2.24, + "learning_rate": 7.675001606993635e-05, + "loss": 0.7057, + "step": 11680 + }, + { + "epoch": 2.24, + "learning_rate": 7.636433759722312e-05, + "loss": 0.7028, + "step": 11700 + }, + { + "epoch": 2.25, + "learning_rate": 7.597865912450986e-05, + "loss": 0.708, + "step": 11720 + }, + { + "epoch": 2.25, + "learning_rate": 7.559298065179661e-05, + "loss": 0.7088, + "step": 11740 + }, + { + "epoch": 2.25, + "learning_rate": 7.520730217908335e-05, + "loss": 0.7024, + "step": 11760 + }, + { + "epoch": 2.26, + "learning_rate": 7.482162370637011e-05, + "loss": 0.7016, + "step": 11780 + }, + { + "epoch": 2.26, + "learning_rate": 7.443594523365686e-05, + "loss": 0.7132, + "step": 11800 + }, + { + "epoch": 2.26, + "eval_loss": 0.7381731271743774, + "eval_runtime": 25.4976, + "eval_samples_per_second": 78.439, + "eval_steps_per_second": 1.255, + "step": 11800 + }, + { + "epoch": 2.26, + "learning_rate": 7.405026676094361e-05, + "loss": 0.6969, + "step": 11820 + }, + { + "epoch": 2.27, + "learning_rate": 7.366458828823038e-05, + "loss": 0.7042, + "step": 11840 + }, + { + "epoch": 2.27, + "learning_rate": 7.327890981551713e-05, + "loss": 0.7088, + "step": 11860 + }, + { + "epoch": 2.28, + "learning_rate": 7.289323134280388e-05, + "loss": 0.7109, + "step": 11880 + }, + { + "epoch": 2.28, + "learning_rate": 7.250755287009063e-05, + "loss": 0.7046, + "step": 11900 + }, + { + "epoch": 2.28, + "learning_rate": 7.212187439737738e-05, + "loss": 0.706, + "step": 11920 + }, + { + "epoch": 2.29, + "learning_rate": 7.173619592466414e-05, + "loss": 0.7045, + "step": 11940 + }, + { + "epoch": 2.29, + "learning_rate": 7.135051745195089e-05, + "loss": 0.7121, + "step": 11960 + }, + { + "epoch": 2.3, + "learning_rate": 7.096483897923764e-05, + "loss": 0.6946, + "step": 11980 + }, + { + "epoch": 2.3, + "learning_rate": 7.057916050652439e-05, + "loss": 0.7003, + "step": 12000 + }, + { + "epoch": 2.3, + "eval_loss": 0.7378225922584534, + "eval_runtime": 25.5221, + "eval_samples_per_second": 78.363, + "eval_steps_per_second": 1.254, + "step": 12000 + }, + { + "epoch": 2.3, + "learning_rate": 7.019348203381114e-05, + "loss": 0.7147, + "step": 12020 + }, + { + "epoch": 2.31, + "learning_rate": 6.980780356109789e-05, + "loss": 0.7066, + "step": 12040 + }, + { + "epoch": 2.31, + "learning_rate": 6.942212508838465e-05, + "loss": 0.6997, + "step": 12060 + }, + { + "epoch": 2.31, + "learning_rate": 6.90364466156714e-05, + "loss": 0.7083, + "step": 12080 + }, + { + "epoch": 2.32, + "learning_rate": 6.865076814295815e-05, + "loss": 0.6991, + "step": 12100 + }, + { + "epoch": 2.32, + "learning_rate": 6.82650896702449e-05, + "loss": 0.6982, + "step": 12120 + }, + { + "epoch": 2.33, + "learning_rate": 6.787941119753165e-05, + "loss": 0.7028, + "step": 12140 + }, + { + "epoch": 2.33, + "learning_rate": 6.74937327248184e-05, + "loss": 0.704, + "step": 12160 + }, + { + "epoch": 2.33, + "learning_rate": 6.710805425210516e-05, + "loss": 0.7084, + "step": 12180 + }, + { + "epoch": 2.34, + "learning_rate": 6.672237577939191e-05, + "loss": 0.7061, + "step": 12200 + }, + { + "epoch": 2.34, + "eval_loss": 0.7376002669334412, + "eval_runtime": 25.5156, + "eval_samples_per_second": 78.383, + "eval_steps_per_second": 1.254, + "step": 12200 + }, + { + "epoch": 2.34, + "learning_rate": 6.633669730667866e-05, + "loss": 0.7017, + "step": 12220 + }, + { + "epoch": 2.35, + "learning_rate": 6.595101883396541e-05, + "loss": 0.6949, + "step": 12240 + }, + { + "epoch": 2.35, + "learning_rate": 6.556534036125216e-05, + "loss": 0.6985, + "step": 12260 + }, + { + "epoch": 2.35, + "learning_rate": 6.517966188853891e-05, + "loss": 0.7075, + "step": 12280 + }, + { + "epoch": 2.36, + "learning_rate": 6.479398341582568e-05, + "loss": 0.6997, + "step": 12300 + }, + { + "epoch": 2.36, + "learning_rate": 6.440830494311241e-05, + "loss": 0.7045, + "step": 12320 + }, + { + "epoch": 2.36, + "learning_rate": 6.402262647039918e-05, + "loss": 0.7148, + "step": 12340 + }, + { + "epoch": 2.37, + "learning_rate": 6.363694799768592e-05, + "loss": 0.7085, + "step": 12360 + }, + { + "epoch": 2.37, + "learning_rate": 6.325126952497267e-05, + "loss": 0.7062, + "step": 12380 + }, + { + "epoch": 2.38, + "learning_rate": 6.286559105225942e-05, + "loss": 0.7092, + "step": 12400 + }, + { + "epoch": 2.38, + "eval_loss": 0.7370800971984863, + "eval_runtime": 25.5432, + "eval_samples_per_second": 78.299, + "eval_steps_per_second": 1.253, + "step": 12400 + }, + { + "epoch": 2.38, + "learning_rate": 6.247991257954619e-05, + "loss": 0.7069, + "step": 12420 + }, + { + "epoch": 2.38, + "learning_rate": 6.209423410683292e-05, + "loss": 0.7083, + "step": 12440 + }, + { + "epoch": 2.39, + "learning_rate": 6.170855563411969e-05, + "loss": 0.7126, + "step": 12460 + }, + { + "epoch": 2.39, + "learning_rate": 6.132287716140644e-05, + "loss": 0.7062, + "step": 12480 + }, + { + "epoch": 2.4, + "learning_rate": 6.0937198688693187e-05, + "loss": 0.7149, + "step": 12500 + }, + { + "epoch": 2.4, + "learning_rate": 6.0551520215979936e-05, + "loss": 0.7111, + "step": 12520 + }, + { + "epoch": 2.4, + "learning_rate": 6.016584174326669e-05, + "loss": 0.7059, + "step": 12540 + }, + { + "epoch": 2.41, + "learning_rate": 5.978016327055344e-05, + "loss": 0.7169, + "step": 12560 + }, + { + "epoch": 2.41, + "learning_rate": 5.93944847978402e-05, + "loss": 0.7052, + "step": 12580 + }, + { + "epoch": 2.41, + "learning_rate": 5.900880632512694e-05, + "loss": 0.7019, + "step": 12600 + }, + { + "epoch": 2.41, + "eval_loss": 0.7369959354400635, + "eval_runtime": 25.5241, + "eval_samples_per_second": 78.357, + "eval_steps_per_second": 1.254, + "step": 12600 + }, + { + "epoch": 2.42, + "learning_rate": 5.86231278524137e-05, + "loss": 0.7026, + "step": 12620 + }, + { + "epoch": 2.42, + "learning_rate": 5.823744937970045e-05, + "loss": 0.6981, + "step": 12640 + }, + { + "epoch": 2.43, + "learning_rate": 5.7851770906987205e-05, + "loss": 0.7044, + "step": 12660 + }, + { + "epoch": 2.43, + "learning_rate": 5.7466092434273955e-05, + "loss": 0.7087, + "step": 12680 + }, + { + "epoch": 2.43, + "learning_rate": 5.708041396156071e-05, + "loss": 0.7039, + "step": 12700 + }, + { + "epoch": 2.44, + "learning_rate": 5.6694735488847454e-05, + "loss": 0.7015, + "step": 12720 + }, + { + "epoch": 2.44, + "learning_rate": 5.630905701613421e-05, + "loss": 0.7053, + "step": 12740 + }, + { + "epoch": 2.44, + "learning_rate": 5.592337854342096e-05, + "loss": 0.7037, + "step": 12760 + }, + { + "epoch": 2.45, + "learning_rate": 5.553770007070772e-05, + "loss": 0.6938, + "step": 12780 + }, + { + "epoch": 2.45, + "learning_rate": 5.515202159799447e-05, + "loss": 0.7063, + "step": 12800 + }, + { + "epoch": 2.45, + "eval_loss": 0.7364639639854431, + "eval_runtime": 25.4856, + "eval_samples_per_second": 78.476, + "eval_steps_per_second": 1.256, + "step": 12800 + }, + { + "epoch": 2.46, + "learning_rate": 5.476634312528122e-05, + "loss": 0.7013, + "step": 12820 + }, + { + "epoch": 2.46, + "learning_rate": 5.4380664652567966e-05, + "loss": 0.7012, + "step": 12840 + }, + { + "epoch": 2.46, + "learning_rate": 5.399498617985472e-05, + "loss": 0.7, + "step": 12860 + }, + { + "epoch": 2.47, + "learning_rate": 5.360930770714147e-05, + "loss": 0.7017, + "step": 12880 + }, + { + "epoch": 2.47, + "learning_rate": 5.322362923442823e-05, + "loss": 0.7145, + "step": 12900 + }, + { + "epoch": 2.48, + "learning_rate": 5.283795076171498e-05, + "loss": 0.7156, + "step": 12920 + }, + { + "epoch": 2.48, + "learning_rate": 5.247155621263739e-05, + "loss": 0.6965, + "step": 12940 + }, + { + "epoch": 2.48, + "learning_rate": 5.2085877739924146e-05, + "loss": 0.7001, + "step": 12960 + }, + { + "epoch": 2.49, + "learning_rate": 5.1700199267210896e-05, + "loss": 0.7012, + "step": 12980 + }, + { + "epoch": 2.49, + "learning_rate": 5.131452079449765e-05, + "loss": 0.6939, + "step": 13000 + }, + { + "epoch": 2.49, + "eval_loss": 0.7364306449890137, + "eval_runtime": 25.5093, + "eval_samples_per_second": 78.403, + "eval_steps_per_second": 1.254, + "step": 13000 + }, + { + "epoch": 2.49, + "learning_rate": 5.09288423217844e-05, + "loss": 0.7084, + "step": 13020 + }, + { + "epoch": 2.5, + "learning_rate": 5.054316384907115e-05, + "loss": 0.6987, + "step": 13040 + }, + { + "epoch": 2.5, + "learning_rate": 5.01574853763579e-05, + "loss": 0.7087, + "step": 13060 + }, + { + "epoch": 2.51, + "learning_rate": 4.977180690364466e-05, + "loss": 0.7028, + "step": 13080 + }, + { + "epoch": 2.51, + "learning_rate": 4.938612843093141e-05, + "loss": 0.7012, + "step": 13100 + }, + { + "epoch": 2.51, + "learning_rate": 4.9000449958218165e-05, + "loss": 0.6959, + "step": 13120 + }, + { + "epoch": 2.52, + "learning_rate": 4.861477148550491e-05, + "loss": 0.7056, + "step": 13140 + }, + { + "epoch": 2.52, + "learning_rate": 4.8229093012791664e-05, + "loss": 0.716, + "step": 13160 + }, + { + "epoch": 2.53, + "learning_rate": 4.7843414540078414e-05, + "loss": 0.7144, + "step": 13180 + }, + { + "epoch": 2.53, + "learning_rate": 4.745773606736517e-05, + "loss": 0.6969, + "step": 13200 + }, + { + "epoch": 2.53, + "eval_loss": 0.7360122203826904, + "eval_runtime": 25.4878, + "eval_samples_per_second": 78.469, + "eval_steps_per_second": 1.256, + "step": 13200 + }, + { + "epoch": 2.53, + "learning_rate": 4.707205759465192e-05, + "loss": 0.6993, + "step": 13220 + }, + { + "epoch": 2.54, + "learning_rate": 4.668637912193868e-05, + "loss": 0.7013, + "step": 13240 + }, + { + "epoch": 2.54, + "learning_rate": 4.630070064922542e-05, + "loss": 0.7033, + "step": 13260 + }, + { + "epoch": 2.54, + "learning_rate": 4.5915022176512176e-05, + "loss": 0.7067, + "step": 13280 + }, + { + "epoch": 2.55, + "learning_rate": 4.5529343703798926e-05, + "loss": 0.6886, + "step": 13300 + }, + { + "epoch": 2.55, + "learning_rate": 4.514366523108568e-05, + "loss": 0.7061, + "step": 13320 + }, + { + "epoch": 2.56, + "learning_rate": 4.475798675837243e-05, + "loss": 0.7027, + "step": 13340 + }, + { + "epoch": 2.56, + "learning_rate": 4.437230828565919e-05, + "loss": 0.6982, + "step": 13360 + }, + { + "epoch": 2.56, + "learning_rate": 4.398662981294593e-05, + "loss": 0.7042, + "step": 13380 + }, + { + "epoch": 2.57, + "learning_rate": 4.360095134023269e-05, + "loss": 0.6956, + "step": 13400 + }, + { + "epoch": 2.57, + "eval_loss": 0.7356610298156738, + "eval_runtime": 25.5629, + "eval_samples_per_second": 78.238, + "eval_steps_per_second": 1.252, + "step": 13400 + }, + { + "epoch": 2.57, + "learning_rate": 4.321527286751944e-05, + "loss": 0.7046, + "step": 13420 + }, + { + "epoch": 2.58, + "learning_rate": 4.2829594394806195e-05, + "loss": 0.7053, + "step": 13440 + }, + { + "epoch": 2.58, + "learning_rate": 4.2443915922092944e-05, + "loss": 0.707, + "step": 13460 + }, + { + "epoch": 2.58, + "learning_rate": 4.20582374493797e-05, + "loss": 0.7123, + "step": 13480 + }, + { + "epoch": 2.59, + "learning_rate": 4.1672558976666444e-05, + "loss": 0.7032, + "step": 13500 + }, + { + "epoch": 2.59, + "learning_rate": 4.12868805039532e-05, + "loss": 0.6942, + "step": 13520 + }, + { + "epoch": 2.59, + "learning_rate": 4.090120203123995e-05, + "loss": 0.6981, + "step": 13540 + }, + { + "epoch": 2.6, + "learning_rate": 4.051552355852671e-05, + "loss": 0.7052, + "step": 13560 + }, + { + "epoch": 2.6, + "learning_rate": 4.012984508581345e-05, + "loss": 0.7044, + "step": 13580 + }, + { + "epoch": 2.61, + "learning_rate": 3.9744166613100206e-05, + "loss": 0.6978, + "step": 13600 + }, + { + "epoch": 2.61, + "eval_loss": 0.7352051734924316, + "eval_runtime": 25.5016, + "eval_samples_per_second": 78.426, + "eval_steps_per_second": 1.255, + "step": 13600 + }, + { + "epoch": 2.61, + "learning_rate": 3.9358488140386956e-05, + "loss": 0.7001, + "step": 13620 + }, + { + "epoch": 2.61, + "learning_rate": 3.897280966767371e-05, + "loss": 0.7065, + "step": 13640 + }, + { + "epoch": 2.62, + "learning_rate": 3.858713119496046e-05, + "loss": 0.6999, + "step": 13660 + }, + { + "epoch": 2.62, + "learning_rate": 3.820145272224722e-05, + "loss": 0.7104, + "step": 13680 + }, + { + "epoch": 2.63, + "learning_rate": 3.781577424953396e-05, + "loss": 0.7079, + "step": 13700 + }, + { + "epoch": 2.63, + "learning_rate": 3.743009577682072e-05, + "loss": 0.7059, + "step": 13720 + }, + { + "epoch": 2.63, + "learning_rate": 3.7063701227743136e-05, + "loss": 0.7088, + "step": 13740 + }, + { + "epoch": 2.64, + "learning_rate": 3.6678022755029886e-05, + "loss": 0.7051, + "step": 13760 + }, + { + "epoch": 2.64, + "learning_rate": 3.629234428231664e-05, + "loss": 0.7004, + "step": 13780 + }, + { + "epoch": 2.64, + "learning_rate": 3.590666580960339e-05, + "loss": 0.7, + "step": 13800 + }, + { + "epoch": 2.64, + "eval_loss": 0.7350977659225464, + "eval_runtime": 25.4618, + "eval_samples_per_second": 78.549, + "eval_steps_per_second": 1.257, + "step": 13800 + }, + { + "epoch": 2.65, + "learning_rate": 3.552098733689014e-05, + "loss": 0.7044, + "step": 13820 + }, + { + "epoch": 2.65, + "learning_rate": 3.51353088641769e-05, + "loss": 0.6967, + "step": 13840 + }, + { + "epoch": 2.66, + "learning_rate": 3.474963039146365e-05, + "loss": 0.6932, + "step": 13860 + }, + { + "epoch": 2.66, + "learning_rate": 3.43639519187504e-05, + "loss": 0.6982, + "step": 13880 + }, + { + "epoch": 2.66, + "learning_rate": 3.3978273446037154e-05, + "loss": 0.7064, + "step": 13900 + }, + { + "epoch": 2.67, + "learning_rate": 3.3592594973323904e-05, + "loss": 0.7064, + "step": 13920 + }, + { + "epoch": 2.67, + "learning_rate": 3.3206916500610654e-05, + "loss": 0.6975, + "step": 13940 + }, + { + "epoch": 2.67, + "learning_rate": 3.282123802789741e-05, + "loss": 0.7023, + "step": 13960 + }, + { + "epoch": 2.68, + "learning_rate": 3.243555955518416e-05, + "loss": 0.706, + "step": 13980 + }, + { + "epoch": 2.68, + "learning_rate": 3.204988108247091e-05, + "loss": 0.696, + "step": 14000 + }, + { + "epoch": 2.68, + "eval_loss": 0.7347920536994934, + "eval_runtime": 25.5132, + "eval_samples_per_second": 78.391, + "eval_steps_per_second": 1.254, + "step": 14000 + }, + { + "epoch": 2.69, + "learning_rate": 3.1664202609757666e-05, + "loss": 0.6995, + "step": 14020 + }, + { + "epoch": 2.69, + "learning_rate": 3.1278524137044416e-05, + "loss": 0.7022, + "step": 14040 + }, + { + "epoch": 2.69, + "learning_rate": 3.0892845664331166e-05, + "loss": 0.7086, + "step": 14060 + }, + { + "epoch": 2.7, + "learning_rate": 3.050716719161792e-05, + "loss": 0.7135, + "step": 14080 + }, + { + "epoch": 2.7, + "learning_rate": 3.0121488718904672e-05, + "loss": 0.7036, + "step": 14100 + }, + { + "epoch": 2.71, + "learning_rate": 2.9735810246191422e-05, + "loss": 0.6979, + "step": 14120 + }, + { + "epoch": 2.71, + "learning_rate": 2.9350131773478175e-05, + "loss": 0.7082, + "step": 14140 + }, + { + "epoch": 2.71, + "learning_rate": 2.8964453300764928e-05, + "loss": 0.7008, + "step": 14160 + }, + { + "epoch": 2.72, + "learning_rate": 2.8578774828051678e-05, + "loss": 0.7085, + "step": 14180 + }, + { + "epoch": 2.72, + "learning_rate": 2.819309635533843e-05, + "loss": 0.6983, + "step": 14200 + }, + { + "epoch": 2.72, + "eval_loss": 0.73465496301651, + "eval_runtime": 25.4933, + "eval_samples_per_second": 78.452, + "eval_steps_per_second": 1.255, + "step": 14200 + }, + { + "epoch": 2.72, + "learning_rate": 2.7807417882625184e-05, + "loss": 0.7123, + "step": 14220 + }, + { + "epoch": 2.73, + "learning_rate": 2.7421739409911934e-05, + "loss": 0.7027, + "step": 14240 + }, + { + "epoch": 2.73, + "learning_rate": 2.7036060937198687e-05, + "loss": 0.7124, + "step": 14260 + }, + { + "epoch": 2.74, + "learning_rate": 2.6650382464485437e-05, + "loss": 0.7102, + "step": 14280 + }, + { + "epoch": 2.74, + "learning_rate": 2.626470399177219e-05, + "loss": 0.7062, + "step": 14300 + }, + { + "epoch": 2.74, + "learning_rate": 2.5879025519058943e-05, + "loss": 0.7094, + "step": 14320 + }, + { + "epoch": 2.75, + "learning_rate": 2.5493347046345693e-05, + "loss": 0.7017, + "step": 14340 + }, + { + "epoch": 2.75, + "learning_rate": 2.5107668573632446e-05, + "loss": 0.7033, + "step": 14360 + }, + { + "epoch": 2.76, + "learning_rate": 2.47219901009192e-05, + "loss": 0.7036, + "step": 14380 + }, + { + "epoch": 2.76, + "learning_rate": 2.433631162820595e-05, + "loss": 0.7041, + "step": 14400 + }, + { + "epoch": 2.76, + "eval_loss": 0.7345843315124512, + "eval_runtime": 25.4933, + "eval_samples_per_second": 78.452, + "eval_steps_per_second": 1.255, + "step": 14400 + }, + { + "epoch": 2.76, + "learning_rate": 2.3950633155492702e-05, + "loss": 0.6983, + "step": 14420 + }, + { + "epoch": 2.77, + "learning_rate": 2.3564954682779455e-05, + "loss": 0.7006, + "step": 14440 + }, + { + "epoch": 2.77, + "learning_rate": 2.3179276210066205e-05, + "loss": 0.7047, + "step": 14460 + }, + { + "epoch": 2.77, + "learning_rate": 2.2793597737352958e-05, + "loss": 0.7036, + "step": 14480 + }, + { + "epoch": 2.78, + "learning_rate": 2.2407919264639708e-05, + "loss": 0.7025, + "step": 14500 + }, + { + "epoch": 2.78, + "learning_rate": 2.202224079192646e-05, + "loss": 0.699, + "step": 14520 + }, + { + "epoch": 2.79, + "learning_rate": 2.1636562319213214e-05, + "loss": 0.699, + "step": 14540 + }, + { + "epoch": 2.79, + "learning_rate": 2.1250883846499964e-05, + "loss": 0.6968, + "step": 14560 + }, + { + "epoch": 2.79, + "learning_rate": 2.0865205373786717e-05, + "loss": 0.697, + "step": 14580 + }, + { + "epoch": 2.8, + "learning_rate": 2.047952690107347e-05, + "loss": 0.6981, + "step": 14600 + }, + { + "epoch": 2.8, + "eval_loss": 0.7341080904006958, + "eval_runtime": 25.516, + "eval_samples_per_second": 78.382, + "eval_steps_per_second": 1.254, + "step": 14600 + }, + { + "epoch": 2.8, + "learning_rate": 2.009384842836022e-05, + "loss": 0.706, + "step": 14620 + }, + { + "epoch": 2.81, + "learning_rate": 1.9708169955646973e-05, + "loss": 0.6964, + "step": 14640 + }, + { + "epoch": 2.81, + "learning_rate": 1.9322491482933726e-05, + "loss": 0.7043, + "step": 14660 + }, + { + "epoch": 2.81, + "learning_rate": 1.8936813010220476e-05, + "loss": 0.7044, + "step": 14680 + }, + { + "epoch": 2.82, + "learning_rate": 1.855113453750723e-05, + "loss": 0.7079, + "step": 14700 + }, + { + "epoch": 2.82, + "learning_rate": 1.8165456064793982e-05, + "loss": 0.7096, + "step": 14720 + }, + { + "epoch": 2.82, + "learning_rate": 1.7779777592080735e-05, + "loss": 0.6977, + "step": 14740 + }, + { + "epoch": 2.83, + "learning_rate": 1.7394099119367485e-05, + "loss": 0.6997, + "step": 14760 + }, + { + "epoch": 2.83, + "learning_rate": 1.7008420646654238e-05, + "loss": 0.7033, + "step": 14780 + }, + { + "epoch": 2.84, + "learning_rate": 1.662274217394099e-05, + "loss": 0.7016, + "step": 14800 + }, + { + "epoch": 2.84, + "eval_loss": 0.7337221503257751, + "eval_runtime": 25.5058, + "eval_samples_per_second": 78.413, + "eval_steps_per_second": 1.255, + "step": 14800 + }, + { + "epoch": 2.84, + "learning_rate": 1.623706370122774e-05, + "loss": 0.6907, + "step": 14820 + }, + { + "epoch": 2.84, + "learning_rate": 1.5851385228514494e-05, + "loss": 0.7043, + "step": 14840 + }, + { + "epoch": 2.85, + "learning_rate": 1.5465706755801247e-05, + "loss": 0.7058, + "step": 14860 + }, + { + "epoch": 2.85, + "learning_rate": 1.5080028283087997e-05, + "loss": 0.6956, + "step": 14880 + }, + { + "epoch": 2.85, + "learning_rate": 1.469434981037475e-05, + "loss": 0.7109, + "step": 14900 + }, + { + "epoch": 2.86, + "learning_rate": 1.4308671337661502e-05, + "loss": 0.7055, + "step": 14920 + }, + { + "epoch": 2.86, + "learning_rate": 1.3922992864948253e-05, + "loss": 0.7011, + "step": 14940 + }, + { + "epoch": 2.87, + "learning_rate": 1.3537314392235005e-05, + "loss": 0.7009, + "step": 14960 + }, + { + "epoch": 2.87, + "learning_rate": 1.3151635919521758e-05, + "loss": 0.7069, + "step": 14980 + }, + { + "epoch": 2.87, + "learning_rate": 1.276595744680851e-05, + "loss": 0.7038, + "step": 15000 + }, + { + "epoch": 2.87, + "eval_loss": 0.7338148355484009, + "eval_runtime": 25.4764, + "eval_samples_per_second": 78.504, + "eval_steps_per_second": 1.256, + "step": 15000 + }, + { + "epoch": 2.88, + "learning_rate": 1.238027897409526e-05, + "loss": 0.706, + "step": 15020 + }, + { + "epoch": 2.88, + "learning_rate": 1.1994600501382012e-05, + "loss": 0.6918, + "step": 15040 + }, + { + "epoch": 2.89, + "learning_rate": 1.1608922028668765e-05, + "loss": 0.7045, + "step": 15060 + }, + { + "epoch": 2.89, + "learning_rate": 1.1223243555955517e-05, + "loss": 0.6984, + "step": 15080 + }, + { + "epoch": 2.89, + "learning_rate": 1.0837565083242268e-05, + "loss": 0.7126, + "step": 15100 + }, + { + "epoch": 2.9, + "learning_rate": 1.0451886610529021e-05, + "loss": 0.6974, + "step": 15120 + }, + { + "epoch": 2.9, + "learning_rate": 1.0066208137815773e-05, + "loss": 0.7063, + "step": 15140 + }, + { + "epoch": 2.9, + "learning_rate": 9.680529665102524e-06, + "loss": 0.697, + "step": 15160 + }, + { + "epoch": 2.91, + "learning_rate": 9.294851192389277e-06, + "loss": 0.6965, + "step": 15180 + }, + { + "epoch": 2.91, + "learning_rate": 8.909172719676029e-06, + "loss": 0.7001, + "step": 15200 + }, + { + "epoch": 2.91, + "eval_loss": 0.733613908290863, + "eval_runtime": 25.4875, + "eval_samples_per_second": 78.47, + "eval_steps_per_second": 1.256, + "step": 15200 + }, + { + "epoch": 2.92, + "learning_rate": 8.523494246962782e-06, + "loss": 0.6928, + "step": 15220 + }, + { + "epoch": 2.92, + "learning_rate": 8.137815774249533e-06, + "loss": 0.7047, + "step": 15240 + }, + { + "epoch": 2.92, + "learning_rate": 7.752137301536285e-06, + "loss": 0.6888, + "step": 15260 + }, + { + "epoch": 2.93, + "learning_rate": 7.366458828823037e-06, + "loss": 0.7133, + "step": 15280 + }, + { + "epoch": 2.93, + "learning_rate": 6.980780356109789e-06, + "loss": 0.7019, + "step": 15300 + }, + { + "epoch": 2.94, + "learning_rate": 6.595101883396541e-06, + "loss": 0.7017, + "step": 15320 + }, + { + "epoch": 2.94, + "learning_rate": 6.209423410683292e-06, + "loss": 0.7045, + "step": 15340 + }, + { + "epoch": 2.94, + "learning_rate": 5.823744937970045e-06, + "loss": 0.7017, + "step": 15360 + }, + { + "epoch": 2.95, + "learning_rate": 5.438066465256798e-06, + "loss": 0.699, + "step": 15380 + }, + { + "epoch": 2.95, + "learning_rate": 5.052387992543549e-06, + "loss": 0.691, + "step": 15400 + }, + { + "epoch": 2.95, + "eval_loss": 0.7335031032562256, + "eval_runtime": 25.5263, + "eval_samples_per_second": 78.351, + "eval_steps_per_second": 1.254, + "step": 15400 + }, + { + "epoch": 2.95, + "learning_rate": 4.666709519830301e-06, + "loss": 0.699, + "step": 15420 + }, + { + "epoch": 2.96, + "learning_rate": 4.281031047117053e-06, + "loss": 0.7009, + "step": 15440 + }, + { + "epoch": 2.96, + "learning_rate": 3.895352574403805e-06, + "loss": 0.6946, + "step": 15460 + }, + { + "epoch": 2.97, + "learning_rate": 3.509674101690557e-06, + "loss": 0.7027, + "step": 15480 + }, + { + "epoch": 2.97, + "learning_rate": 3.123995628977309e-06, + "loss": 0.6914, + "step": 15500 + }, + { + "epoch": 2.97, + "learning_rate": 2.738317156264061e-06, + "loss": 0.6969, + "step": 15520 + }, + { + "epoch": 2.98, + "learning_rate": 2.3526386835508128e-06, + "loss": 0.6994, + "step": 15540 + }, + { + "epoch": 2.98, + "learning_rate": 1.9669602108375647e-06, + "loss": 0.6961, + "step": 15560 + }, + { + "epoch": 2.99, + "learning_rate": 1.581281738124317e-06, + "loss": 0.6913, + "step": 15580 + }, + { + "epoch": 2.99, + "learning_rate": 1.1956032654110688e-06, + "loss": 0.6994, + "step": 15600 + }, + { + "epoch": 2.99, + "eval_loss": 0.7334907054901123, + "eval_runtime": 25.5112, + "eval_samples_per_second": 78.397, + "eval_steps_per_second": 1.254, + "step": 15600 + } + ], + "max_steps": 15657, + "num_train_epochs": 3, + "total_flos": 7.887669986989179e+19, + "trial_name": null, + "trial_params": null +} diff --git a/adapters/saved-alpaca-belle-cot13b/checkpoint-15600/training_args.bin b/adapters/saved-alpaca-belle-cot13b/checkpoint-15600/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..8cadc78dc16a3098f59f602efe3fce82b270b5ad --- /dev/null +++ b/adapters/saved-alpaca-belle-cot13b/checkpoint-15600/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7cbb446b8dfb89a3bab291d29c74c98c3984471bb063f88c9b78e95c95415320 +size 3643 diff --git a/adapters/saved-alpaca-belle-cot7b/adapter_config.json b/adapters/saved-alpaca-belle-cot7b/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..6e56f2ae8f10fadfeec6c730ac6b119025824443 --- /dev/null +++ b/adapters/saved-alpaca-belle-cot7b/adapter_config.json @@ -0,0 +1,18 @@ +{ + "base_model_name_or_path": "/mnt/bn/qingyi-bn-lq/llama/llama-7b-hf", + "bias": "none", + "enable_lora": null, + "fan_in_fan_out": false, + "inference_mode": true, + "lora_alpha": 16, + "lora_dropout": 0.05, + "merge_weights": false, + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/adapters/saved-alpaca-belle-cot7b/adapter_model.bin b/adapters/saved-alpaca-belle-cot7b/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..a3349efefcbdbd8a6f95010862eb84db79f5c2c6 --- /dev/null +++ b/adapters/saved-alpaca-belle-cot7b/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:330494735335477a234baf27e22b489c89e7e7a34d26a212ccec73a8434164e3 +size 16822989 diff --git a/adapters/saved-alpaca-belle-cot7b/checkpoint-15200/optimizer.pt b/adapters/saved-alpaca-belle-cot7b/checkpoint-15200/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..846453c1535adda133b0edbbf8455f69ba37dd10 --- /dev/null +++ b/adapters/saved-alpaca-belle-cot7b/checkpoint-15200/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d5e8256e776a25716fe32ef3b3eda9560c4bd8a171d1e5fad38387183eda7895 +size 33629893 diff --git a/adapters/saved-alpaca-belle-cot7b/checkpoint-15200/pytorch_model.bin b/adapters/saved-alpaca-belle-cot7b/checkpoint-15200/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..65829944a72d52a650adb2480d8e7f590c846b8e --- /dev/null +++ b/adapters/saved-alpaca-belle-cot7b/checkpoint-15200/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:48305eecc9b93894c7b59b93426d9ab2394cc5151beeb07774b9d3a0e91f7e7a +size 16822989 diff --git a/adapters/saved-alpaca-belle-cot7b/checkpoint-15200/rng_state_0.pth b/adapters/saved-alpaca-belle-cot7b/checkpoint-15200/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..d05a7d0f559508e46ea6bfc4ad9fc07f0eb41af2 --- /dev/null +++ b/adapters/saved-alpaca-belle-cot7b/checkpoint-15200/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7c9f3a0bb1007c28a2b8fa530f0d1a424fbddb23bf0dece63473ec7702e409e0 +size 14583 diff --git a/adapters/saved-alpaca-belle-cot7b/checkpoint-15200/rng_state_1.pth b/adapters/saved-alpaca-belle-cot7b/checkpoint-15200/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..f85797499bde42c5cc3c5c251facdbfed570cf2b --- /dev/null +++ b/adapters/saved-alpaca-belle-cot7b/checkpoint-15200/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e8a4915466be3b8e540f331e9fd0e05404871f1807af6e239f2c8ed085caccb2 +size 14583 diff --git a/adapters/saved-alpaca-belle-cot7b/checkpoint-15200/rng_state_2.pth b/adapters/saved-alpaca-belle-cot7b/checkpoint-15200/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..214d032b3d38598aa7ebfc7ad8fa4e1e22e0df13 --- /dev/null +++ b/adapters/saved-alpaca-belle-cot7b/checkpoint-15200/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9cfa8686978e36f8af094277d9666e5456eacc34c4a1959cbff637aab8fa552e +size 14583 diff --git a/adapters/saved-alpaca-belle-cot7b/checkpoint-15200/rng_state_3.pth b/adapters/saved-alpaca-belle-cot7b/checkpoint-15200/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..e824b0f98fb10c14723e8a086c56df35579f088a --- /dev/null +++ b/adapters/saved-alpaca-belle-cot7b/checkpoint-15200/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1851c92017060b4abff22c0cc5f803a7689b1d6e39b1e8a6a5fb31170c078450 +size 14583 diff --git a/adapters/saved-alpaca-belle-cot7b/checkpoint-15200/rng_state_4.pth b/adapters/saved-alpaca-belle-cot7b/checkpoint-15200/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..9602da00eb6c310861a87cc0faedda30a8002454 --- /dev/null +++ b/adapters/saved-alpaca-belle-cot7b/checkpoint-15200/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c712b8fdb5115d30161468162cb123d7b42376bef8676295fbc1a5fa5f108e13 +size 14583 diff --git a/adapters/saved-alpaca-belle-cot7b/checkpoint-15200/rng_state_5.pth b/adapters/saved-alpaca-belle-cot7b/checkpoint-15200/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..2084607ff7965c24a9f737a4059321230f65e969 --- /dev/null +++ b/adapters/saved-alpaca-belle-cot7b/checkpoint-15200/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2900c030e2ad08e954c9bd5845f1b1470862eee11adfeefea6193970e4126278 +size 14583 diff --git a/adapters/saved-alpaca-belle-cot7b/checkpoint-15200/rng_state_6.pth b/adapters/saved-alpaca-belle-cot7b/checkpoint-15200/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..8a24fd4d293608214749e5a2f072a72758967349 --- /dev/null +++ b/adapters/saved-alpaca-belle-cot7b/checkpoint-15200/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:51056a4d3cd4ff72adc80dbd16e88d4fff2ac4ddb10e73a2c63665acf120bf71 +size 14583 diff --git a/adapters/saved-alpaca-belle-cot7b/checkpoint-15200/rng_state_7.pth b/adapters/saved-alpaca-belle-cot7b/checkpoint-15200/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..5ddd5b77cb4d611c513770a486b45c8992ace524 --- /dev/null +++ b/adapters/saved-alpaca-belle-cot7b/checkpoint-15200/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de30c1977eb1bc19684d687e751b3e884f6513f9edd461e7e64dfbce2f8f81cb +size 14583 diff --git a/adapters/saved-alpaca-belle-cot7b/checkpoint-15200/scaler.pt b/adapters/saved-alpaca-belle-cot7b/checkpoint-15200/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..d30ed14693eab572f47c1e2beba2bcd521cba4dd --- /dev/null +++ b/adapters/saved-alpaca-belle-cot7b/checkpoint-15200/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:716a64eaf76d74e95fa448d0408ed57f1b6662ecbc1d8d1b3466a01e085e464b +size 557 diff --git a/adapters/saved-alpaca-belle-cot7b/checkpoint-15200/scheduler.pt b/adapters/saved-alpaca-belle-cot7b/checkpoint-15200/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..dc42a7b073141bdad7a73b36a944d4285a8269d2 --- /dev/null +++ b/adapters/saved-alpaca-belle-cot7b/checkpoint-15200/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:547444d0b1563c08f71c58494e0eeb4c8d495498b015407edc5402412a53a467 +size 627 diff --git a/adapters/saved-alpaca-belle-cot7b/checkpoint-15200/trainer_state.json b/adapters/saved-alpaca-belle-cot7b/checkpoint-15200/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..aee4bfe1d0ddd922021cec31fe14c78ac7257e81 --- /dev/null +++ b/adapters/saved-alpaca-belle-cot7b/checkpoint-15200/trainer_state.json @@ -0,0 +1,5184 @@ +{ + "best_metric": 0.7872186303138733, + "best_model_checkpoint": "/mnt/bn/qingyi-bn-lq/llama/saved-alpaca-belle-cot7b/checkpoint-15200", + "epoch": 2.9124353324391645, + "global_step": 15200, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 5.9999999999999995e-05, + "loss": 1.8213, + "step": 20 + }, + { + "epoch": 0.01, + "learning_rate": 0.00011999999999999999, + "loss": 1.5494, + "step": 40 + }, + { + "epoch": 0.01, + "learning_rate": 0.00017999999999999998, + "loss": 1.179, + "step": 60 + }, + { + "epoch": 0.02, + "learning_rate": 0.00023999999999999998, + "loss": 1.1022, + "step": 80 + }, + { + "epoch": 0.02, + "learning_rate": 0.0003, + "loss": 1.078, + "step": 100 + }, + { + "epoch": 0.02, + "learning_rate": 0.00029961432152728675, + "loss": 1.0347, + "step": 120 + }, + { + "epoch": 0.03, + "learning_rate": 0.0002992286430545735, + "loss": 1.0169, + "step": 140 + }, + { + "epoch": 0.03, + "learning_rate": 0.00029884296458186025, + "loss": 1.0088, + "step": 160 + }, + { + "epoch": 0.03, + "learning_rate": 0.00029845728610914697, + "loss": 0.9896, + "step": 180 + }, + { + "epoch": 0.04, + "learning_rate": 0.00029807160763643375, + "loss": 0.99, + "step": 200 + }, + { + "epoch": 0.04, + "eval_loss": 1.0032634735107422, + "eval_runtime": 16.3952, + "eval_samples_per_second": 121.987, + "eval_steps_per_second": 1.952, + "step": 200 + }, + { + "epoch": 0.04, + "learning_rate": 0.00029768592916372047, + "loss": 0.9724, + "step": 220 + }, + { + "epoch": 0.05, + "learning_rate": 0.00029730025069100725, + "loss": 0.9719, + "step": 240 + }, + { + "epoch": 0.05, + "learning_rate": 0.00029691457221829397, + "loss": 0.9652, + "step": 260 + }, + { + "epoch": 0.05, + "learning_rate": 0.00029652889374558075, + "loss": 0.9579, + "step": 280 + }, + { + "epoch": 0.06, + "learning_rate": 0.00029614321527286747, + "loss": 0.9532, + "step": 300 + }, + { + "epoch": 0.06, + "learning_rate": 0.00029575753680015425, + "loss": 0.9613, + "step": 320 + }, + { + "epoch": 0.07, + "learning_rate": 0.000295371858327441, + "loss": 0.9473, + "step": 340 + }, + { + "epoch": 0.07, + "learning_rate": 0.00029498617985472775, + "loss": 0.9416, + "step": 360 + }, + { + "epoch": 0.07, + "learning_rate": 0.0002946005013820145, + "loss": 0.9386, + "step": 380 + }, + { + "epoch": 0.08, + "learning_rate": 0.00029421482290930125, + "loss": 0.9338, + "step": 400 + }, + { + "epoch": 0.08, + "eval_loss": 0.957970380783081, + "eval_runtime": 16.3897, + "eval_samples_per_second": 122.028, + "eval_steps_per_second": 1.952, + "step": 400 + }, + { + "epoch": 0.08, + "learning_rate": 0.000293829144436588, + "loss": 0.937, + "step": 420 + }, + { + "epoch": 0.08, + "learning_rate": 0.00029344346596387475, + "loss": 0.9304, + "step": 440 + }, + { + "epoch": 0.09, + "learning_rate": 0.0002930577874911615, + "loss": 0.9323, + "step": 460 + }, + { + "epoch": 0.09, + "learning_rate": 0.00029267210901844825, + "loss": 0.9185, + "step": 480 + }, + { + "epoch": 0.1, + "learning_rate": 0.000292286430545735, + "loss": 0.9273, + "step": 500 + }, + { + "epoch": 0.1, + "learning_rate": 0.00029190075207302175, + "loss": 0.922, + "step": 520 + }, + { + "epoch": 0.1, + "learning_rate": 0.0002915150736003085, + "loss": 0.9146, + "step": 540 + }, + { + "epoch": 0.11, + "learning_rate": 0.00029112939512759525, + "loss": 0.9129, + "step": 560 + }, + { + "epoch": 0.11, + "learning_rate": 0.000290743716654882, + "loss": 0.9146, + "step": 580 + }, + { + "epoch": 0.11, + "learning_rate": 0.0002903580381821688, + "loss": 0.9078, + "step": 600 + }, + { + "epoch": 0.11, + "eval_loss": 0.9345074892044067, + "eval_runtime": 16.4049, + "eval_samples_per_second": 121.914, + "eval_steps_per_second": 1.951, + "step": 600 + }, + { + "epoch": 0.12, + "learning_rate": 0.0002899723597094555, + "loss": 0.9004, + "step": 620 + }, + { + "epoch": 0.12, + "learning_rate": 0.0002895866812367423, + "loss": 0.9042, + "step": 640 + }, + { + "epoch": 0.13, + "learning_rate": 0.000289201002764029, + "loss": 0.9028, + "step": 660 + }, + { + "epoch": 0.13, + "learning_rate": 0.0002888153242913158, + "loss": 0.8889, + "step": 680 + }, + { + "epoch": 0.13, + "learning_rate": 0.0002884296458186025, + "loss": 0.8935, + "step": 700 + }, + { + "epoch": 0.14, + "learning_rate": 0.0002880439673458893, + "loss": 0.9024, + "step": 720 + }, + { + "epoch": 0.14, + "learning_rate": 0.000287658288873176, + "loss": 0.8922, + "step": 740 + }, + { + "epoch": 0.15, + "learning_rate": 0.0002872726104004628, + "loss": 0.8896, + "step": 760 + }, + { + "epoch": 0.15, + "learning_rate": 0.0002868869319277495, + "loss": 0.8907, + "step": 780 + }, + { + "epoch": 0.15, + "learning_rate": 0.0002865012534550363, + "loss": 0.8922, + "step": 800 + }, + { + "epoch": 0.15, + "eval_loss": 0.9149895310401917, + "eval_runtime": 16.4499, + "eval_samples_per_second": 121.581, + "eval_steps_per_second": 1.945, + "step": 800 + }, + { + "epoch": 0.16, + "learning_rate": 0.0002861155749823231, + "loss": 0.8867, + "step": 820 + }, + { + "epoch": 0.16, + "learning_rate": 0.0002857298965096098, + "loss": 0.891, + "step": 840 + }, + { + "epoch": 0.16, + "learning_rate": 0.00028534421803689657, + "loss": 0.8882, + "step": 860 + }, + { + "epoch": 0.17, + "learning_rate": 0.0002849585395641833, + "loss": 0.8835, + "step": 880 + }, + { + "epoch": 0.17, + "learning_rate": 0.00028457286109147007, + "loss": 0.8798, + "step": 900 + }, + { + "epoch": 0.18, + "learning_rate": 0.0002841871826187568, + "loss": 0.8784, + "step": 920 + }, + { + "epoch": 0.18, + "learning_rate": 0.00028380150414604357, + "loss": 0.8841, + "step": 940 + }, + { + "epoch": 0.18, + "learning_rate": 0.0002834158256733303, + "loss": 0.8787, + "step": 960 + }, + { + "epoch": 0.19, + "learning_rate": 0.00028303014720061707, + "loss": 0.8693, + "step": 980 + }, + { + "epoch": 0.19, + "learning_rate": 0.0002826444687279038, + "loss": 0.8711, + "step": 1000 + }, + { + "epoch": 0.19, + "eval_loss": 0.9027432799339294, + "eval_runtime": 16.447, + "eval_samples_per_second": 121.603, + "eval_steps_per_second": 1.946, + "step": 1000 + }, + { + "epoch": 0.2, + "learning_rate": 0.00028225879025519057, + "loss": 0.876, + "step": 1020 + }, + { + "epoch": 0.2, + "learning_rate": 0.0002818731117824773, + "loss": 0.8749, + "step": 1040 + }, + { + "epoch": 0.2, + "learning_rate": 0.00028148743330976407, + "loss": 0.877, + "step": 1060 + }, + { + "epoch": 0.21, + "learning_rate": 0.00028110175483705085, + "loss": 0.8754, + "step": 1080 + }, + { + "epoch": 0.21, + "learning_rate": 0.00028071607636433757, + "loss": 0.8792, + "step": 1100 + }, + { + "epoch": 0.21, + "learning_rate": 0.00028033039789162435, + "loss": 0.8701, + "step": 1120 + }, + { + "epoch": 0.22, + "learning_rate": 0.00027994471941891107, + "loss": 0.8667, + "step": 1140 + }, + { + "epoch": 0.22, + "learning_rate": 0.00027955904094619785, + "loss": 0.8769, + "step": 1160 + }, + { + "epoch": 0.23, + "learning_rate": 0.00027917336247348457, + "loss": 0.8734, + "step": 1180 + }, + { + "epoch": 0.23, + "learning_rate": 0.0002787876840007713, + "loss": 0.8708, + "step": 1200 + }, + { + "epoch": 0.23, + "eval_loss": 0.8911536335945129, + "eval_runtime": 16.423, + "eval_samples_per_second": 121.78, + "eval_steps_per_second": 1.948, + "step": 1200 + }, + { + "epoch": 0.23, + "learning_rate": 0.00027840200552805807, + "loss": 0.8673, + "step": 1220 + }, + { + "epoch": 0.24, + "learning_rate": 0.00027801632705534485, + "loss": 0.8618, + "step": 1240 + }, + { + "epoch": 0.24, + "learning_rate": 0.00027763064858263157, + "loss": 0.8739, + "step": 1260 + }, + { + "epoch": 0.25, + "learning_rate": 0.00027724497010991834, + "loss": 0.8608, + "step": 1280 + }, + { + "epoch": 0.25, + "learning_rate": 0.0002768592916372051, + "loss": 0.8631, + "step": 1300 + }, + { + "epoch": 0.25, + "learning_rate": 0.00027647361316449184, + "loss": 0.8547, + "step": 1320 + }, + { + "epoch": 0.26, + "learning_rate": 0.0002760879346917786, + "loss": 0.8589, + "step": 1340 + }, + { + "epoch": 0.26, + "learning_rate": 0.00027570225621906534, + "loss": 0.8615, + "step": 1360 + }, + { + "epoch": 0.26, + "learning_rate": 0.0002753165777463521, + "loss": 0.8644, + "step": 1380 + }, + { + "epoch": 0.27, + "learning_rate": 0.00027493089927363884, + "loss": 0.8524, + "step": 1400 + }, + { + "epoch": 0.27, + "eval_loss": 0.8813066482543945, + "eval_runtime": 16.4628, + "eval_samples_per_second": 121.486, + "eval_steps_per_second": 1.944, + "step": 1400 + }, + { + "epoch": 0.27, + "learning_rate": 0.00027454522080092557, + "loss": 0.8562, + "step": 1420 + }, + { + "epoch": 0.28, + "learning_rate": 0.00027415954232821234, + "loss": 0.8547, + "step": 1440 + }, + { + "epoch": 0.28, + "learning_rate": 0.0002737738638554991, + "loss": 0.8599, + "step": 1460 + }, + { + "epoch": 0.28, + "learning_rate": 0.00027338818538278584, + "loss": 0.8491, + "step": 1480 + }, + { + "epoch": 0.29, + "learning_rate": 0.0002730025069100726, + "loss": 0.8496, + "step": 1500 + }, + { + "epoch": 0.29, + "learning_rate": 0.0002726168284373594, + "loss": 0.8594, + "step": 1520 + }, + { + "epoch": 0.3, + "learning_rate": 0.0002722311499646461, + "loss": 0.8512, + "step": 1540 + }, + { + "epoch": 0.3, + "learning_rate": 0.0002718454714919329, + "loss": 0.8441, + "step": 1560 + }, + { + "epoch": 0.3, + "learning_rate": 0.0002714597930192196, + "loss": 0.8621, + "step": 1580 + }, + { + "epoch": 0.31, + "learning_rate": 0.0002710741145465064, + "loss": 0.8525, + "step": 1600 + }, + { + "epoch": 0.31, + "eval_loss": 0.8728711009025574, + "eval_runtime": 16.4389, + "eval_samples_per_second": 121.663, + "eval_steps_per_second": 1.947, + "step": 1600 + }, + { + "epoch": 0.31, + "learning_rate": 0.0002706884360737931, + "loss": 0.852, + "step": 1620 + }, + { + "epoch": 0.31, + "learning_rate": 0.00027030275760107984, + "loss": 0.8553, + "step": 1640 + }, + { + "epoch": 0.32, + "learning_rate": 0.0002699170791283666, + "loss": 0.8445, + "step": 1660 + }, + { + "epoch": 0.32, + "learning_rate": 0.0002695314006556534, + "loss": 0.8518, + "step": 1680 + }, + { + "epoch": 0.33, + "learning_rate": 0.0002691457221829401, + "loss": 0.8318, + "step": 1700 + }, + { + "epoch": 0.33, + "learning_rate": 0.0002687600437102269, + "loss": 0.8492, + "step": 1720 + }, + { + "epoch": 0.33, + "learning_rate": 0.0002683743652375136, + "loss": 0.8475, + "step": 1740 + }, + { + "epoch": 0.34, + "learning_rate": 0.0002679886867648004, + "loss": 0.8437, + "step": 1760 + }, + { + "epoch": 0.34, + "learning_rate": 0.00026760300829208717, + "loss": 0.8355, + "step": 1780 + }, + { + "epoch": 0.34, + "learning_rate": 0.0002672173298193739, + "loss": 0.8486, + "step": 1800 + }, + { + "epoch": 0.34, + "eval_loss": 0.8663893938064575, + "eval_runtime": 16.4511, + "eval_samples_per_second": 121.572, + "eval_steps_per_second": 1.945, + "step": 1800 + }, + { + "epoch": 0.35, + "learning_rate": 0.00026683165134666067, + "loss": 0.8449, + "step": 1820 + }, + { + "epoch": 0.35, + "learning_rate": 0.0002664459728739474, + "loss": 0.853, + "step": 1840 + }, + { + "epoch": 0.36, + "learning_rate": 0.00026606029440123417, + "loss": 0.8472, + "step": 1860 + }, + { + "epoch": 0.36, + "learning_rate": 0.0002656746159285209, + "loss": 0.83, + "step": 1880 + }, + { + "epoch": 0.36, + "learning_rate": 0.0002652889374558076, + "loss": 0.8398, + "step": 1900 + }, + { + "epoch": 0.37, + "learning_rate": 0.0002649032589830944, + "loss": 0.8337, + "step": 1920 + }, + { + "epoch": 0.37, + "learning_rate": 0.00026451758051038117, + "loss": 0.8314, + "step": 1940 + }, + { + "epoch": 0.38, + "learning_rate": 0.0002641319020376679, + "loss": 0.8314, + "step": 1960 + }, + { + "epoch": 0.38, + "learning_rate": 0.00026374622356495467, + "loss": 0.845, + "step": 1980 + }, + { + "epoch": 0.38, + "learning_rate": 0.00026336054509224144, + "loss": 0.8294, + "step": 2000 + }, + { + "epoch": 0.38, + "eval_loss": 0.8619188666343689, + "eval_runtime": 16.4444, + "eval_samples_per_second": 121.622, + "eval_steps_per_second": 1.946, + "step": 2000 + }, + { + "epoch": 0.39, + "learning_rate": 0.00026297486661952817, + "loss": 0.8404, + "step": 2020 + }, + { + "epoch": 0.39, + "learning_rate": 0.00026258918814681494, + "loss": 0.839, + "step": 2040 + }, + { + "epoch": 0.39, + "learning_rate": 0.00026220350967410167, + "loss": 0.84, + "step": 2060 + }, + { + "epoch": 0.4, + "learning_rate": 0.00026181783120138844, + "loss": 0.8442, + "step": 2080 + }, + { + "epoch": 0.4, + "learning_rate": 0.00026143215272867517, + "loss": 0.8443, + "step": 2100 + }, + { + "epoch": 0.41, + "learning_rate": 0.0002610464742559619, + "loss": 0.8301, + "step": 2120 + }, + { + "epoch": 0.41, + "learning_rate": 0.00026066079578324867, + "loss": 0.8302, + "step": 2140 + }, + { + "epoch": 0.41, + "learning_rate": 0.00026027511731053544, + "loss": 0.836, + "step": 2160 + }, + { + "epoch": 0.42, + "learning_rate": 0.00025988943883782216, + "loss": 0.8277, + "step": 2180 + }, + { + "epoch": 0.42, + "learning_rate": 0.00025950376036510894, + "loss": 0.8335, + "step": 2200 + }, + { + "epoch": 0.42, + "eval_loss": 0.8562669157981873, + "eval_runtime": 16.4486, + "eval_samples_per_second": 121.591, + "eval_steps_per_second": 1.945, + "step": 2200 + }, + { + "epoch": 0.43, + "learning_rate": 0.00025911808189239566, + "loss": 0.8267, + "step": 2220 + }, + { + "epoch": 0.43, + "learning_rate": 0.00025873240341968244, + "loss": 0.8267, + "step": 2240 + }, + { + "epoch": 0.43, + "learning_rate": 0.0002583467249469692, + "loss": 0.8293, + "step": 2260 + }, + { + "epoch": 0.44, + "learning_rate": 0.00025796104647425594, + "loss": 0.836, + "step": 2280 + }, + { + "epoch": 0.44, + "learning_rate": 0.0002575753680015427, + "loss": 0.8255, + "step": 2300 + }, + { + "epoch": 0.44, + "learning_rate": 0.00025718968952882944, + "loss": 0.8177, + "step": 2320 + }, + { + "epoch": 0.45, + "learning_rate": 0.00025680401105611616, + "loss": 0.8272, + "step": 2340 + }, + { + "epoch": 0.45, + "learning_rate": 0.00025641833258340294, + "loss": 0.831, + "step": 2360 + }, + { + "epoch": 0.46, + "learning_rate": 0.00025603265411068966, + "loss": 0.819, + "step": 2380 + }, + { + "epoch": 0.46, + "learning_rate": 0.00025564697563797644, + "loss": 0.8216, + "step": 2400 + }, + { + "epoch": 0.46, + "eval_loss": 0.8516544103622437, + "eval_runtime": 16.476, + "eval_samples_per_second": 121.389, + "eval_steps_per_second": 1.942, + "step": 2400 + }, + { + "epoch": 0.46, + "learning_rate": 0.0002552612971652632, + "loss": 0.8305, + "step": 2420 + }, + { + "epoch": 0.47, + "learning_rate": 0.00025487561869254994, + "loss": 0.8305, + "step": 2440 + }, + { + "epoch": 0.47, + "learning_rate": 0.0002544899402198367, + "loss": 0.8302, + "step": 2460 + }, + { + "epoch": 0.48, + "learning_rate": 0.0002541042617471235, + "loss": 0.824, + "step": 2480 + }, + { + "epoch": 0.48, + "learning_rate": 0.0002537185832744102, + "loss": 0.8315, + "step": 2500 + }, + { + "epoch": 0.48, + "learning_rate": 0.000253332904801697, + "loss": 0.8224, + "step": 2520 + }, + { + "epoch": 0.49, + "learning_rate": 0.0002529472263289837, + "loss": 0.8229, + "step": 2540 + }, + { + "epoch": 0.49, + "learning_rate": 0.00025256154785627044, + "loss": 0.8156, + "step": 2560 + }, + { + "epoch": 0.49, + "learning_rate": 0.0002521758693835572, + "loss": 0.8319, + "step": 2580 + }, + { + "epoch": 0.5, + "learning_rate": 0.00025179019091084394, + "loss": 0.8222, + "step": 2600 + }, + { + "epoch": 0.5, + "eval_loss": 0.8481459021568298, + "eval_runtime": 16.453, + "eval_samples_per_second": 121.558, + "eval_steps_per_second": 1.945, + "step": 2600 + }, + { + "epoch": 0.5, + "learning_rate": 0.0002514045124381307, + "loss": 0.8205, + "step": 2620 + }, + { + "epoch": 0.51, + "learning_rate": 0.0002510188339654175, + "loss": 0.8267, + "step": 2640 + }, + { + "epoch": 0.51, + "learning_rate": 0.0002506331554927042, + "loss": 0.8116, + "step": 2660 + }, + { + "epoch": 0.51, + "learning_rate": 0.000250247477019991, + "loss": 0.8239, + "step": 2680 + }, + { + "epoch": 0.52, + "learning_rate": 0.00024986179854727777, + "loss": 0.8126, + "step": 2700 + }, + { + "epoch": 0.52, + "learning_rate": 0.0002494761200745645, + "loss": 0.8226, + "step": 2720 + }, + { + "epoch": 0.53, + "learning_rate": 0.00024909044160185127, + "loss": 0.8173, + "step": 2740 + }, + { + "epoch": 0.53, + "learning_rate": 0.000248704763129138, + "loss": 0.8227, + "step": 2760 + }, + { + "epoch": 0.53, + "learning_rate": 0.0002483190846564247, + "loss": 0.8129, + "step": 2780 + }, + { + "epoch": 0.54, + "learning_rate": 0.0002479334061837115, + "loss": 0.8164, + "step": 2800 + }, + { + "epoch": 0.54, + "eval_loss": 0.8439643979072571, + "eval_runtime": 16.4767, + "eval_samples_per_second": 121.384, + "eval_steps_per_second": 1.942, + "step": 2800 + }, + { + "epoch": 0.54, + "learning_rate": 0.0002475477277109982, + "loss": 0.807, + "step": 2820 + }, + { + "epoch": 0.54, + "learning_rate": 0.000247162049238285, + "loss": 0.8126, + "step": 2840 + }, + { + "epoch": 0.55, + "learning_rate": 0.00024677637076557176, + "loss": 0.8193, + "step": 2860 + }, + { + "epoch": 0.55, + "learning_rate": 0.0002463906922928585, + "loss": 0.8091, + "step": 2880 + }, + { + "epoch": 0.56, + "learning_rate": 0.00024600501382014526, + "loss": 0.8147, + "step": 2900 + }, + { + "epoch": 0.56, + "learning_rate": 0.000245619335347432, + "loss": 0.8207, + "step": 2920 + }, + { + "epoch": 0.56, + "learning_rate": 0.00024523365687471876, + "loss": 0.8087, + "step": 2940 + }, + { + "epoch": 0.57, + "learning_rate": 0.00024484797840200554, + "loss": 0.8198, + "step": 2960 + }, + { + "epoch": 0.57, + "learning_rate": 0.00024446229992929226, + "loss": 0.8087, + "step": 2980 + }, + { + "epoch": 0.57, + "learning_rate": 0.000244076621456579, + "loss": 0.8182, + "step": 3000 + }, + { + "epoch": 0.57, + "eval_loss": 0.8408891558647156, + "eval_runtime": 16.4801, + "eval_samples_per_second": 121.358, + "eval_steps_per_second": 1.942, + "step": 3000 + }, + { + "epoch": 0.58, + "learning_rate": 0.0002436909429838658, + "loss": 0.8188, + "step": 3020 + }, + { + "epoch": 0.58, + "learning_rate": 0.0002433052645111525, + "loss": 0.8082, + "step": 3040 + }, + { + "epoch": 0.59, + "learning_rate": 0.00024291958603843926, + "loss": 0.8171, + "step": 3060 + }, + { + "epoch": 0.59, + "learning_rate": 0.000242533907565726, + "loss": 0.8088, + "step": 3080 + }, + { + "epoch": 0.59, + "learning_rate": 0.00024214822909301276, + "loss": 0.8148, + "step": 3100 + }, + { + "epoch": 0.6, + "learning_rate": 0.00024176255062029954, + "loss": 0.8122, + "step": 3120 + }, + { + "epoch": 0.6, + "learning_rate": 0.00024137687214758626, + "loss": 0.811, + "step": 3140 + }, + { + "epoch": 0.61, + "learning_rate": 0.00024099119367487304, + "loss": 0.8179, + "step": 3160 + }, + { + "epoch": 0.61, + "learning_rate": 0.0002406055152021598, + "loss": 0.8029, + "step": 3180 + }, + { + "epoch": 0.61, + "learning_rate": 0.0002402198367294465, + "loss": 0.8143, + "step": 3200 + }, + { + "epoch": 0.61, + "eval_loss": 0.837196946144104, + "eval_runtime": 16.4913, + "eval_samples_per_second": 121.276, + "eval_steps_per_second": 1.94, + "step": 3200 + }, + { + "epoch": 0.62, + "learning_rate": 0.0002398341582567333, + "loss": 0.7969, + "step": 3220 + }, + { + "epoch": 0.62, + "learning_rate": 0.00023944847978402, + "loss": 0.8158, + "step": 3240 + }, + { + "epoch": 0.62, + "learning_rate": 0.0002390628013113068, + "loss": 0.8019, + "step": 3260 + }, + { + "epoch": 0.63, + "learning_rate": 0.00023867712283859354, + "loss": 0.8042, + "step": 3280 + }, + { + "epoch": 0.63, + "learning_rate": 0.0002382914443658803, + "loss": 0.8022, + "step": 3300 + }, + { + "epoch": 0.64, + "learning_rate": 0.00023790576589316704, + "loss": 0.8043, + "step": 3320 + }, + { + "epoch": 0.64, + "learning_rate": 0.0002375200874204538, + "loss": 0.8106, + "step": 3340 + }, + { + "epoch": 0.64, + "learning_rate": 0.00023713440894774054, + "loss": 0.8146, + "step": 3360 + }, + { + "epoch": 0.65, + "learning_rate": 0.0002367487304750273, + "loss": 0.8004, + "step": 3380 + }, + { + "epoch": 0.65, + "learning_rate": 0.00023636305200231404, + "loss": 0.8096, + "step": 3400 + }, + { + "epoch": 0.65, + "eval_loss": 0.8347571492195129, + "eval_runtime": 16.4822, + "eval_samples_per_second": 121.343, + "eval_steps_per_second": 1.941, + "step": 3400 + }, + { + "epoch": 0.66, + "learning_rate": 0.0002359773735296008, + "loss": 0.8226, + "step": 3420 + }, + { + "epoch": 0.66, + "learning_rate": 0.00023559169505688756, + "loss": 0.8083, + "step": 3440 + }, + { + "epoch": 0.66, + "learning_rate": 0.00023520601658417428, + "loss": 0.8168, + "step": 3460 + }, + { + "epoch": 0.67, + "learning_rate": 0.00023482033811146106, + "loss": 0.8112, + "step": 3480 + }, + { + "epoch": 0.67, + "learning_rate": 0.0002344346596387478, + "loss": 0.8131, + "step": 3500 + }, + { + "epoch": 0.67, + "learning_rate": 0.00023404898116603456, + "loss": 0.8097, + "step": 3520 + }, + { + "epoch": 0.68, + "learning_rate": 0.0002336633026933213, + "loss": 0.804, + "step": 3540 + }, + { + "epoch": 0.68, + "learning_rate": 0.00023327762422060806, + "loss": 0.8085, + "step": 3560 + }, + { + "epoch": 0.69, + "learning_rate": 0.0002328919457478948, + "loss": 0.7992, + "step": 3580 + }, + { + "epoch": 0.69, + "learning_rate": 0.0002325062672751816, + "loss": 0.8124, + "step": 3600 + }, + { + "epoch": 0.69, + "eval_loss": 0.8324670791625977, + "eval_runtime": 16.4936, + "eval_samples_per_second": 121.259, + "eval_steps_per_second": 1.94, + "step": 3600 + }, + { + "epoch": 0.69, + "learning_rate": 0.0002321205888024683, + "loss": 0.8024, + "step": 3620 + }, + { + "epoch": 0.7, + "learning_rate": 0.0002317349103297551, + "loss": 0.8032, + "step": 3640 + }, + { + "epoch": 0.7, + "learning_rate": 0.00023134923185704184, + "loss": 0.8065, + "step": 3660 + }, + { + "epoch": 0.71, + "learning_rate": 0.00023096355338432856, + "loss": 0.8106, + "step": 3680 + }, + { + "epoch": 0.71, + "learning_rate": 0.00023057787491161534, + "loss": 0.8009, + "step": 3700 + }, + { + "epoch": 0.71, + "learning_rate": 0.00023019219643890206, + "loss": 0.816, + "step": 3720 + }, + { + "epoch": 0.72, + "learning_rate": 0.00022980651796618884, + "loss": 0.8103, + "step": 3740 + }, + { + "epoch": 0.72, + "learning_rate": 0.00022942083949347559, + "loss": 0.8099, + "step": 3760 + }, + { + "epoch": 0.72, + "learning_rate": 0.00022903516102076233, + "loss": 0.8085, + "step": 3780 + }, + { + "epoch": 0.73, + "learning_rate": 0.00022864948254804908, + "loss": 0.8044, + "step": 3800 + }, + { + "epoch": 0.73, + "eval_loss": 0.830141544342041, + "eval_runtime": 16.4845, + "eval_samples_per_second": 121.326, + "eval_steps_per_second": 1.941, + "step": 3800 + }, + { + "epoch": 0.73, + "learning_rate": 0.00022826380407533586, + "loss": 0.7969, + "step": 3820 + }, + { + "epoch": 0.74, + "learning_rate": 0.00022787812560262258, + "loss": 0.8029, + "step": 3840 + }, + { + "epoch": 0.74, + "learning_rate": 0.00022749244712990936, + "loss": 0.7921, + "step": 3860 + }, + { + "epoch": 0.74, + "learning_rate": 0.0002271067686571961, + "loss": 0.8051, + "step": 3880 + }, + { + "epoch": 0.75, + "learning_rate": 0.00022672109018448283, + "loss": 0.807, + "step": 3900 + }, + { + "epoch": 0.75, + "learning_rate": 0.0002263354117117696, + "loss": 0.8042, + "step": 3920 + }, + { + "epoch": 0.75, + "learning_rate": 0.00022594973323905633, + "loss": 0.7947, + "step": 3940 + }, + { + "epoch": 0.76, + "learning_rate": 0.0002255640547663431, + "loss": 0.7972, + "step": 3960 + }, + { + "epoch": 0.76, + "learning_rate": 0.00022517837629362986, + "loss": 0.8038, + "step": 3980 + }, + { + "epoch": 0.77, + "learning_rate": 0.0002247926978209166, + "loss": 0.8064, + "step": 4000 + }, + { + "epoch": 0.77, + "eval_loss": 0.828279435634613, + "eval_runtime": 16.4904, + "eval_samples_per_second": 121.283, + "eval_steps_per_second": 1.941, + "step": 4000 + }, + { + "epoch": 0.77, + "learning_rate": 0.00022440701934820336, + "loss": 0.8032, + "step": 4020 + }, + { + "epoch": 0.77, + "learning_rate": 0.00022402134087549014, + "loss": 0.7934, + "step": 4040 + }, + { + "epoch": 0.78, + "learning_rate": 0.00022363566240277686, + "loss": 0.7919, + "step": 4060 + }, + { + "epoch": 0.78, + "learning_rate": 0.00022324998393006364, + "loss": 0.8011, + "step": 4080 + }, + { + "epoch": 0.79, + "learning_rate": 0.00022286430545735036, + "loss": 0.8026, + "step": 4100 + }, + { + "epoch": 0.79, + "learning_rate": 0.0002224786269846371, + "loss": 0.804, + "step": 4120 + }, + { + "epoch": 0.79, + "learning_rate": 0.00022209294851192388, + "loss": 0.8122, + "step": 4140 + }, + { + "epoch": 0.8, + "learning_rate": 0.0002217072700392106, + "loss": 0.7932, + "step": 4160 + }, + { + "epoch": 0.8, + "learning_rate": 0.00022132159156649738, + "loss": 0.7911, + "step": 4180 + }, + { + "epoch": 0.8, + "learning_rate": 0.00022093591309378413, + "loss": 0.8012, + "step": 4200 + }, + { + "epoch": 0.8, + "eval_loss": 0.8261794447898865, + "eval_runtime": 16.4921, + "eval_samples_per_second": 121.27, + "eval_steps_per_second": 1.94, + "step": 4200 + }, + { + "epoch": 0.81, + "learning_rate": 0.00022055023462107088, + "loss": 0.7989, + "step": 4220 + }, + { + "epoch": 0.81, + "learning_rate": 0.00022016455614835763, + "loss": 0.8031, + "step": 4240 + }, + { + "epoch": 0.82, + "learning_rate": 0.00021977887767564438, + "loss": 0.8066, + "step": 4260 + }, + { + "epoch": 0.82, + "learning_rate": 0.00021939319920293113, + "loss": 0.7964, + "step": 4280 + }, + { + "epoch": 0.82, + "learning_rate": 0.0002190075207302179, + "loss": 0.7947, + "step": 4300 + }, + { + "epoch": 0.83, + "learning_rate": 0.00021862184225750463, + "loss": 0.8035, + "step": 4320 + }, + { + "epoch": 0.83, + "learning_rate": 0.00021823616378479138, + "loss": 0.8029, + "step": 4340 + }, + { + "epoch": 0.84, + "learning_rate": 0.00021785048531207816, + "loss": 0.7941, + "step": 4360 + }, + { + "epoch": 0.84, + "learning_rate": 0.00021746480683936488, + "loss": 0.7934, + "step": 4380 + }, + { + "epoch": 0.84, + "learning_rate": 0.00021707912836665166, + "loss": 0.7946, + "step": 4400 + }, + { + "epoch": 0.84, + "eval_loss": 0.823946475982666, + "eval_runtime": 16.4887, + "eval_samples_per_second": 121.295, + "eval_steps_per_second": 1.941, + "step": 4400 + }, + { + "epoch": 0.85, + "learning_rate": 0.00021669344989393838, + "loss": 0.7974, + "step": 4420 + }, + { + "epoch": 0.85, + "learning_rate": 0.00021630777142122516, + "loss": 0.7962, + "step": 4440 + }, + { + "epoch": 0.85, + "learning_rate": 0.0002159220929485119, + "loss": 0.7946, + "step": 4460 + }, + { + "epoch": 0.86, + "learning_rate": 0.00021553641447579866, + "loss": 0.7818, + "step": 4480 + }, + { + "epoch": 0.86, + "learning_rate": 0.0002151507360030854, + "loss": 0.803, + "step": 4500 + }, + { + "epoch": 0.87, + "learning_rate": 0.00021476505753037218, + "loss": 0.7851, + "step": 4520 + }, + { + "epoch": 0.87, + "learning_rate": 0.0002143793790576589, + "loss": 0.7984, + "step": 4540 + }, + { + "epoch": 0.87, + "learning_rate": 0.00021399370058494568, + "loss": 0.7973, + "step": 4560 + }, + { + "epoch": 0.88, + "learning_rate": 0.0002136080221122324, + "loss": 0.782, + "step": 4580 + }, + { + "epoch": 0.88, + "learning_rate": 0.00021322234363951916, + "loss": 0.7951, + "step": 4600 + }, + { + "epoch": 0.88, + "eval_loss": 0.8220962285995483, + "eval_runtime": 16.5191, + "eval_samples_per_second": 121.072, + "eval_steps_per_second": 1.937, + "step": 4600 + }, + { + "epoch": 0.89, + "learning_rate": 0.00021283666516680593, + "loss": 0.7947, + "step": 4620 + }, + { + "epoch": 0.89, + "learning_rate": 0.00021245098669409266, + "loss": 0.7957, + "step": 4640 + }, + { + "epoch": 0.89, + "learning_rate": 0.00021206530822137943, + "loss": 0.797, + "step": 4660 + }, + { + "epoch": 0.9, + "learning_rate": 0.00021167962974866618, + "loss": 0.8097, + "step": 4680 + }, + { + "epoch": 0.9, + "learning_rate": 0.00021129395127595293, + "loss": 0.7894, + "step": 4700 + }, + { + "epoch": 0.9, + "learning_rate": 0.00021090827280323968, + "loss": 0.7789, + "step": 4720 + }, + { + "epoch": 0.91, + "learning_rate": 0.0002105225943305264, + "loss": 0.7949, + "step": 4740 + }, + { + "epoch": 0.91, + "learning_rate": 0.00021013691585781318, + "loss": 0.7895, + "step": 4760 + }, + { + "epoch": 0.92, + "learning_rate": 0.00020975123738509996, + "loss": 0.8036, + "step": 4780 + }, + { + "epoch": 0.92, + "learning_rate": 0.00020936555891238668, + "loss": 0.7966, + "step": 4800 + }, + { + "epoch": 0.92, + "eval_loss": 0.8209095597267151, + "eval_runtime": 16.5035, + "eval_samples_per_second": 121.187, + "eval_steps_per_second": 1.939, + "step": 4800 + }, + { + "epoch": 0.92, + "learning_rate": 0.00020897988043967343, + "loss": 0.7892, + "step": 4820 + }, + { + "epoch": 0.93, + "learning_rate": 0.0002085942019669602, + "loss": 0.7825, + "step": 4840 + }, + { + "epoch": 0.93, + "learning_rate": 0.00020820852349424693, + "loss": 0.7937, + "step": 4860 + }, + { + "epoch": 0.94, + "learning_rate": 0.0002078228450215337, + "loss": 0.7893, + "step": 4880 + }, + { + "epoch": 0.94, + "learning_rate": 0.00020743716654882043, + "loss": 0.7944, + "step": 4900 + }, + { + "epoch": 0.94, + "learning_rate": 0.0002070514880761072, + "loss": 0.7973, + "step": 4920 + }, + { + "epoch": 0.95, + "learning_rate": 0.00020666580960339396, + "loss": 0.7919, + "step": 4940 + }, + { + "epoch": 0.95, + "learning_rate": 0.0002062801311306807, + "loss": 0.7918, + "step": 4960 + }, + { + "epoch": 0.95, + "learning_rate": 0.00020589445265796746, + "loss": 0.7901, + "step": 4980 + }, + { + "epoch": 0.96, + "learning_rate": 0.00020550877418525423, + "loss": 0.7891, + "step": 5000 + }, + { + "epoch": 0.96, + "eval_loss": 0.8192855715751648, + "eval_runtime": 16.5248, + "eval_samples_per_second": 121.03, + "eval_steps_per_second": 1.936, + "step": 5000 + }, + { + "epoch": 0.96, + "learning_rate": 0.00020512309571254096, + "loss": 0.7813, + "step": 5020 + }, + { + "epoch": 0.97, + "learning_rate": 0.0002047374172398277, + "loss": 0.7831, + "step": 5040 + }, + { + "epoch": 0.97, + "learning_rate": 0.00020435173876711445, + "loss": 0.7911, + "step": 5060 + }, + { + "epoch": 0.97, + "learning_rate": 0.0002039660602944012, + "loss": 0.7816, + "step": 5080 + }, + { + "epoch": 0.98, + "learning_rate": 0.00020358038182168798, + "loss": 0.7915, + "step": 5100 + }, + { + "epoch": 0.98, + "learning_rate": 0.0002031947033489747, + "loss": 0.791, + "step": 5120 + }, + { + "epoch": 0.98, + "learning_rate": 0.00020280902487626148, + "loss": 0.7851, + "step": 5140 + }, + { + "epoch": 0.99, + "learning_rate": 0.00020242334640354823, + "loss": 0.7859, + "step": 5160 + }, + { + "epoch": 0.99, + "learning_rate": 0.00020203766793083498, + "loss": 0.7888, + "step": 5180 + }, + { + "epoch": 1.0, + "learning_rate": 0.00020165198945812173, + "loss": 0.7854, + "step": 5200 + }, + { + "epoch": 1.0, + "eval_loss": 0.8173321485519409, + "eval_runtime": 16.5042, + "eval_samples_per_second": 121.182, + "eval_steps_per_second": 1.939, + "step": 5200 + }, + { + "epoch": 1.0, + "learning_rate": 0.0002012663109854085, + "loss": 0.7888, + "step": 5220 + }, + { + "epoch": 1.0, + "learning_rate": 0.00020088063251269523, + "loss": 0.7893, + "step": 5240 + }, + { + "epoch": 1.01, + "learning_rate": 0.00020049495403998198, + "loss": 0.7817, + "step": 5260 + }, + { + "epoch": 1.01, + "learning_rate": 0.00020010927556726873, + "loss": 0.7755, + "step": 5280 + }, + { + "epoch": 1.02, + "learning_rate": 0.00019972359709455548, + "loss": 0.7839, + "step": 5300 + }, + { + "epoch": 1.02, + "learning_rate": 0.00019933791862184226, + "loss": 0.7911, + "step": 5320 + }, + { + "epoch": 1.02, + "learning_rate": 0.00019895224014912898, + "loss": 0.7819, + "step": 5340 + }, + { + "epoch": 1.03, + "learning_rate": 0.00019856656167641576, + "loss": 0.7802, + "step": 5360 + }, + { + "epoch": 1.03, + "learning_rate": 0.0001981808832037025, + "loss": 0.7847, + "step": 5380 + }, + { + "epoch": 1.03, + "learning_rate": 0.00019779520473098925, + "loss": 0.7824, + "step": 5400 + }, + { + "epoch": 1.03, + "eval_loss": 0.8163856267929077, + "eval_runtime": 16.5306, + "eval_samples_per_second": 120.988, + "eval_steps_per_second": 1.936, + "step": 5400 + }, + { + "epoch": 1.04, + "learning_rate": 0.00019742881018191167, + "loss": 0.7757, + "step": 5420 + }, + { + "epoch": 1.04, + "learning_rate": 0.0001970431317091984, + "loss": 0.786, + "step": 5440 + }, + { + "epoch": 1.05, + "learning_rate": 0.00019665745323648517, + "loss": 0.7923, + "step": 5460 + }, + { + "epoch": 1.05, + "learning_rate": 0.00019627177476377191, + "loss": 0.791, + "step": 5480 + }, + { + "epoch": 1.05, + "learning_rate": 0.00019588609629105866, + "loss": 0.7863, + "step": 5500 + }, + { + "epoch": 1.06, + "learning_rate": 0.00019550041781834541, + "loss": 0.7879, + "step": 5520 + }, + { + "epoch": 1.06, + "learning_rate": 0.0001951147393456322, + "loss": 0.7924, + "step": 5540 + }, + { + "epoch": 1.07, + "learning_rate": 0.00019472906087291891, + "loss": 0.7918, + "step": 5560 + }, + { + "epoch": 1.07, + "learning_rate": 0.0001943433824002057, + "loss": 0.792, + "step": 5580 + }, + { + "epoch": 1.07, + "learning_rate": 0.0001939577039274924, + "loss": 0.7784, + "step": 5600 + }, + { + "epoch": 1.07, + "eval_loss": 0.8148436546325684, + "eval_runtime": 16.5424, + "eval_samples_per_second": 120.901, + "eval_steps_per_second": 1.934, + "step": 5600 + }, + { + "epoch": 1.08, + "learning_rate": 0.0001935720254547792, + "loss": 0.7903, + "step": 5620 + }, + { + "epoch": 1.08, + "learning_rate": 0.00019318634698206594, + "loss": 0.785, + "step": 5640 + }, + { + "epoch": 1.08, + "learning_rate": 0.00019280066850935266, + "loss": 0.7916, + "step": 5660 + }, + { + "epoch": 1.09, + "learning_rate": 0.00019241499003663944, + "loss": 0.779, + "step": 5680 + }, + { + "epoch": 1.09, + "learning_rate": 0.0001920293115639262, + "loss": 0.7909, + "step": 5700 + }, + { + "epoch": 1.1, + "learning_rate": 0.00019164363309121294, + "loss": 0.7798, + "step": 5720 + }, + { + "epoch": 1.1, + "learning_rate": 0.0001912579546184997, + "loss": 0.7846, + "step": 5740 + }, + { + "epoch": 1.1, + "learning_rate": 0.00019087227614578647, + "loss": 0.7887, + "step": 5760 + }, + { + "epoch": 1.11, + "learning_rate": 0.0001904865976730732, + "loss": 0.7802, + "step": 5780 + }, + { + "epoch": 1.11, + "learning_rate": 0.00019010091920035997, + "loss": 0.7891, + "step": 5800 + }, + { + "epoch": 1.11, + "eval_loss": 0.8130878806114197, + "eval_runtime": 16.5056, + "eval_samples_per_second": 121.171, + "eval_steps_per_second": 1.939, + "step": 5800 + }, + { + "epoch": 1.12, + "learning_rate": 0.0001897152407276467, + "loss": 0.7819, + "step": 5820 + }, + { + "epoch": 1.12, + "learning_rate": 0.00018932956225493346, + "loss": 0.7945, + "step": 5840 + }, + { + "epoch": 1.12, + "learning_rate": 0.00018894388378222021, + "loss": 0.784, + "step": 5860 + }, + { + "epoch": 1.13, + "learning_rate": 0.00018855820530950694, + "loss": 0.7838, + "step": 5880 + }, + { + "epoch": 1.13, + "learning_rate": 0.00018817252683679371, + "loss": 0.7841, + "step": 5900 + }, + { + "epoch": 1.13, + "learning_rate": 0.0001877868483640805, + "loss": 0.7909, + "step": 5920 + }, + { + "epoch": 1.14, + "learning_rate": 0.0001874011698913672, + "loss": 0.7775, + "step": 5940 + }, + { + "epoch": 1.14, + "learning_rate": 0.00018701549141865396, + "loss": 0.7827, + "step": 5960 + }, + { + "epoch": 1.15, + "learning_rate": 0.0001866298129459407, + "loss": 0.7866, + "step": 5980 + }, + { + "epoch": 1.15, + "learning_rate": 0.00018624413447322746, + "loss": 0.7696, + "step": 6000 + }, + { + "epoch": 1.15, + "eval_loss": 0.8125277757644653, + "eval_runtime": 16.506, + "eval_samples_per_second": 121.168, + "eval_steps_per_second": 1.939, + "step": 6000 + }, + { + "epoch": 1.15, + "learning_rate": 0.00018585845600051424, + "loss": 0.783, + "step": 6020 + }, + { + "epoch": 1.16, + "learning_rate": 0.00018547277752780096, + "loss": 0.7792, + "step": 6040 + }, + { + "epoch": 1.16, + "learning_rate": 0.00018508709905508774, + "loss": 0.7775, + "step": 6060 + }, + { + "epoch": 1.16, + "learning_rate": 0.0001847014205823745, + "loss": 0.7806, + "step": 6080 + }, + { + "epoch": 1.17, + "learning_rate": 0.0001843157421096612, + "loss": 0.7801, + "step": 6100 + }, + { + "epoch": 1.17, + "learning_rate": 0.000183930063636948, + "loss": 0.7853, + "step": 6120 + }, + { + "epoch": 1.18, + "learning_rate": 0.0001835443851642347, + "loss": 0.7937, + "step": 6140 + }, + { + "epoch": 1.18, + "learning_rate": 0.0001831587066915215, + "loss": 0.7873, + "step": 6160 + }, + { + "epoch": 1.18, + "learning_rate": 0.00018277302821880824, + "loss": 0.778, + "step": 6180 + }, + { + "epoch": 1.19, + "learning_rate": 0.000182387349746095, + "loss": 0.781, + "step": 6200 + }, + { + "epoch": 1.19, + "eval_loss": 0.8113830089569092, + "eval_runtime": 16.5217, + "eval_samples_per_second": 121.053, + "eval_steps_per_second": 1.937, + "step": 6200 + }, + { + "epoch": 1.19, + "learning_rate": 0.00018200167127338174, + "loss": 0.7746, + "step": 6220 + }, + { + "epoch": 1.2, + "learning_rate": 0.00018161599280066851, + "loss": 0.7752, + "step": 6240 + }, + { + "epoch": 1.2, + "learning_rate": 0.00018123031432795524, + "loss": 0.7838, + "step": 6260 + }, + { + "epoch": 1.2, + "learning_rate": 0.000180844635855242, + "loss": 0.789, + "step": 6280 + }, + { + "epoch": 1.21, + "learning_rate": 0.00018045895738252874, + "loss": 0.7882, + "step": 6300 + }, + { + "epoch": 1.21, + "learning_rate": 0.00018007327890981549, + "loss": 0.7822, + "step": 6320 + }, + { + "epoch": 1.21, + "learning_rate": 0.00017968760043710226, + "loss": 0.7889, + "step": 6340 + }, + { + "epoch": 1.22, + "learning_rate": 0.00017930192196438899, + "loss": 0.7891, + "step": 6360 + }, + { + "epoch": 1.22, + "learning_rate": 0.00017891624349167576, + "loss": 0.7884, + "step": 6380 + }, + { + "epoch": 1.23, + "learning_rate": 0.0001785305650189625, + "loss": 0.7733, + "step": 6400 + }, + { + "epoch": 1.23, + "eval_loss": 0.810148298740387, + "eval_runtime": 16.5113, + "eval_samples_per_second": 121.129, + "eval_steps_per_second": 1.938, + "step": 6400 + }, + { + "epoch": 1.23, + "learning_rate": 0.00017814488654624926, + "loss": 0.7794, + "step": 6420 + }, + { + "epoch": 1.23, + "learning_rate": 0.000177759208073536, + "loss": 0.775, + "step": 6440 + }, + { + "epoch": 1.24, + "learning_rate": 0.00017737352960082276, + "loss": 0.7706, + "step": 6460 + }, + { + "epoch": 1.24, + "learning_rate": 0.0001769878511281095, + "loss": 0.7808, + "step": 6480 + }, + { + "epoch": 1.25, + "learning_rate": 0.0001766021726553963, + "loss": 0.7805, + "step": 6500 + }, + { + "epoch": 1.25, + "learning_rate": 0.000176216494182683, + "loss": 0.7813, + "step": 6520 + }, + { + "epoch": 1.25, + "learning_rate": 0.0001758308157099698, + "loss": 0.7789, + "step": 6540 + }, + { + "epoch": 1.26, + "learning_rate": 0.00017544513723725654, + "loss": 0.7827, + "step": 6560 + }, + { + "epoch": 1.26, + "learning_rate": 0.00017505945876454326, + "loss": 0.7763, + "step": 6580 + }, + { + "epoch": 1.26, + "learning_rate": 0.00017467378029183004, + "loss": 0.7779, + "step": 6600 + }, + { + "epoch": 1.26, + "eval_loss": 0.8090565800666809, + "eval_runtime": 16.4954, + "eval_samples_per_second": 121.246, + "eval_steps_per_second": 1.94, + "step": 6600 + }, + { + "epoch": 1.27, + "learning_rate": 0.00017428810181911676, + "loss": 0.7793, + "step": 6620 + }, + { + "epoch": 1.27, + "learning_rate": 0.00017390242334640354, + "loss": 0.7778, + "step": 6640 + }, + { + "epoch": 1.28, + "learning_rate": 0.00017351674487369029, + "loss": 0.7802, + "step": 6660 + }, + { + "epoch": 1.28, + "learning_rate": 0.00017313106640097704, + "loss": 0.7823, + "step": 6680 + }, + { + "epoch": 1.28, + "learning_rate": 0.00017274538792826379, + "loss": 0.7868, + "step": 6700 + }, + { + "epoch": 1.29, + "learning_rate": 0.00017235970945555056, + "loss": 0.7824, + "step": 6720 + }, + { + "epoch": 1.29, + "learning_rate": 0.00017197403098283728, + "loss": 0.7777, + "step": 6740 + }, + { + "epoch": 1.3, + "learning_rate": 0.00017158835251012406, + "loss": 0.7822, + "step": 6760 + }, + { + "epoch": 1.3, + "learning_rate": 0.00017120267403741078, + "loss": 0.7798, + "step": 6780 + }, + { + "epoch": 1.3, + "learning_rate": 0.00017081699556469753, + "loss": 0.7712, + "step": 6800 + }, + { + "epoch": 1.3, + "eval_loss": 0.8080956935882568, + "eval_runtime": 16.5234, + "eval_samples_per_second": 121.041, + "eval_steps_per_second": 1.937, + "step": 6800 + }, + { + "epoch": 1.31, + "learning_rate": 0.0001704313170919843, + "loss": 0.7888, + "step": 6820 + }, + { + "epoch": 1.31, + "learning_rate": 0.00017004563861927103, + "loss": 0.7769, + "step": 6840 + }, + { + "epoch": 1.31, + "learning_rate": 0.0001696599601465578, + "loss": 0.7686, + "step": 6860 + }, + { + "epoch": 1.32, + "learning_rate": 0.00016927428167384456, + "loss": 0.7762, + "step": 6880 + }, + { + "epoch": 1.32, + "learning_rate": 0.0001688886032011313, + "loss": 0.7807, + "step": 6900 + }, + { + "epoch": 1.33, + "learning_rate": 0.00016850292472841806, + "loss": 0.7831, + "step": 6920 + }, + { + "epoch": 1.33, + "learning_rate": 0.0001681172462557048, + "loss": 0.7856, + "step": 6940 + }, + { + "epoch": 1.33, + "learning_rate": 0.00016773156778299156, + "loss": 0.775, + "step": 6960 + }, + { + "epoch": 1.34, + "learning_rate": 0.00016734588931027834, + "loss": 0.7835, + "step": 6980 + }, + { + "epoch": 1.34, + "learning_rate": 0.00016696021083756506, + "loss": 0.7756, + "step": 7000 + }, + { + "epoch": 1.34, + "eval_loss": 0.8070209622383118, + "eval_runtime": 16.4997, + "eval_samples_per_second": 121.214, + "eval_steps_per_second": 1.939, + "step": 7000 + }, + { + "epoch": 1.35, + "learning_rate": 0.0001665745323648518, + "loss": 0.7756, + "step": 7020 + }, + { + "epoch": 1.35, + "learning_rate": 0.00016618885389213859, + "loss": 0.7783, + "step": 7040 + }, + { + "epoch": 1.35, + "learning_rate": 0.0001658031754194253, + "loss": 0.7697, + "step": 7060 + }, + { + "epoch": 1.36, + "learning_rate": 0.00016541749694671208, + "loss": 0.7889, + "step": 7080 + }, + { + "epoch": 1.36, + "learning_rate": 0.00016503181847399883, + "loss": 0.7725, + "step": 7100 + }, + { + "epoch": 1.36, + "learning_rate": 0.00016464614000128558, + "loss": 0.7726, + "step": 7120 + }, + { + "epoch": 1.37, + "learning_rate": 0.00016426046152857233, + "loss": 0.7787, + "step": 7140 + }, + { + "epoch": 1.37, + "learning_rate": 0.00016387478305585908, + "loss": 0.782, + "step": 7160 + }, + { + "epoch": 1.38, + "learning_rate": 0.00016348910458314583, + "loss": 0.7736, + "step": 7180 + }, + { + "epoch": 1.38, + "learning_rate": 0.0001631034261104326, + "loss": 0.7748, + "step": 7200 + }, + { + "epoch": 1.38, + "eval_loss": 0.8063712120056152, + "eval_runtime": 16.5096, + "eval_samples_per_second": 121.142, + "eval_steps_per_second": 1.938, + "step": 7200 + }, + { + "epoch": 1.38, + "learning_rate": 0.00016271774763771933, + "loss": 0.7717, + "step": 7220 + }, + { + "epoch": 1.39, + "learning_rate": 0.00016233206916500608, + "loss": 0.7676, + "step": 7240 + }, + { + "epoch": 1.39, + "learning_rate": 0.00016194639069229286, + "loss": 0.7662, + "step": 7260 + }, + { + "epoch": 1.39, + "learning_rate": 0.00016156071221957958, + "loss": 0.7809, + "step": 7280 + }, + { + "epoch": 1.4, + "learning_rate": 0.00016117503374686636, + "loss": 0.7731, + "step": 7300 + }, + { + "epoch": 1.4, + "learning_rate": 0.00016078935527415308, + "loss": 0.7795, + "step": 7320 + }, + { + "epoch": 1.41, + "learning_rate": 0.00016040367680143986, + "loss": 0.78, + "step": 7340 + }, + { + "epoch": 1.41, + "learning_rate": 0.0001600179983287266, + "loss": 0.7785, + "step": 7360 + }, + { + "epoch": 1.41, + "learning_rate": 0.00015963231985601336, + "loss": 0.7694, + "step": 7380 + }, + { + "epoch": 1.42, + "learning_rate": 0.0001592466413833001, + "loss": 0.781, + "step": 7400 + }, + { + "epoch": 1.42, + "eval_loss": 0.8048364520072937, + "eval_runtime": 16.5235, + "eval_samples_per_second": 121.04, + "eval_steps_per_second": 1.937, + "step": 7400 + }, + { + "epoch": 1.42, + "learning_rate": 0.00015886096291058688, + "loss": 0.7681, + "step": 7420 + }, + { + "epoch": 1.43, + "learning_rate": 0.0001584752844378736, + "loss": 0.7835, + "step": 7440 + }, + { + "epoch": 1.43, + "learning_rate": 0.00015808960596516038, + "loss": 0.7778, + "step": 7460 + }, + { + "epoch": 1.43, + "learning_rate": 0.0001577039274924471, + "loss": 0.775, + "step": 7480 + }, + { + "epoch": 1.44, + "learning_rate": 0.00015731824901973386, + "loss": 0.7758, + "step": 7500 + }, + { + "epoch": 1.44, + "learning_rate": 0.00015693257054702063, + "loss": 0.7846, + "step": 7520 + }, + { + "epoch": 1.44, + "learning_rate": 0.00015654689207430736, + "loss": 0.7756, + "step": 7540 + }, + { + "epoch": 1.45, + "learning_rate": 0.00015616121360159413, + "loss": 0.7764, + "step": 7560 + }, + { + "epoch": 1.45, + "learning_rate": 0.00015577553512888088, + "loss": 0.7684, + "step": 7580 + }, + { + "epoch": 1.46, + "learning_rate": 0.00015538985665616763, + "loss": 0.7837, + "step": 7600 + }, + { + "epoch": 1.46, + "eval_loss": 0.8041849136352539, + "eval_runtime": 16.4633, + "eval_samples_per_second": 121.482, + "eval_steps_per_second": 1.944, + "step": 7600 + }, + { + "epoch": 1.46, + "learning_rate": 0.00015500417818345438, + "loss": 0.772, + "step": 7620 + }, + { + "epoch": 1.46, + "learning_rate": 0.0001546184997107411, + "loss": 0.7759, + "step": 7640 + }, + { + "epoch": 1.47, + "learning_rate": 0.00015423282123802788, + "loss": 0.7778, + "step": 7660 + }, + { + "epoch": 1.47, + "learning_rate": 0.00015384714276531466, + "loss": 0.78, + "step": 7680 + }, + { + "epoch": 1.48, + "learning_rate": 0.00015346146429260138, + "loss": 0.7681, + "step": 7700 + }, + { + "epoch": 1.48, + "learning_rate": 0.00015307578581988813, + "loss": 0.7731, + "step": 7720 + }, + { + "epoch": 1.48, + "learning_rate": 0.0001526901073471749, + "loss": 0.78, + "step": 7740 + }, + { + "epoch": 1.49, + "learning_rate": 0.00015230442887446163, + "loss": 0.7719, + "step": 7760 + }, + { + "epoch": 1.49, + "learning_rate": 0.0001519187504017484, + "loss": 0.7667, + "step": 7780 + }, + { + "epoch": 1.49, + "learning_rate": 0.00015153307192903513, + "loss": 0.7804, + "step": 7800 + }, + { + "epoch": 1.49, + "eval_loss": 0.8034607768058777, + "eval_runtime": 16.4833, + "eval_samples_per_second": 121.335, + "eval_steps_per_second": 1.941, + "step": 7800 + }, + { + "epoch": 1.5, + "learning_rate": 0.0001511473934563219, + "loss": 0.7813, + "step": 7820 + }, + { + "epoch": 1.5, + "learning_rate": 0.00015076171498360866, + "loss": 0.7751, + "step": 7840 + }, + { + "epoch": 1.51, + "learning_rate": 0.00015037603651089538, + "loss": 0.7681, + "step": 7860 + }, + { + "epoch": 1.51, + "learning_rate": 0.00014999035803818216, + "loss": 0.7679, + "step": 7880 + }, + { + "epoch": 1.51, + "learning_rate": 0.0001496046795654689, + "loss": 0.7723, + "step": 7900 + }, + { + "epoch": 1.52, + "learning_rate": 0.00014921900109275566, + "loss": 0.7732, + "step": 7920 + }, + { + "epoch": 1.52, + "learning_rate": 0.0001488333226200424, + "loss": 0.7805, + "step": 7940 + }, + { + "epoch": 1.53, + "learning_rate": 0.00014844764414732916, + "loss": 0.7666, + "step": 7960 + }, + { + "epoch": 1.53, + "learning_rate": 0.0001480619656746159, + "loss": 0.7801, + "step": 7980 + }, + { + "epoch": 1.53, + "learning_rate": 0.00014767628720190265, + "loss": 0.7736, + "step": 8000 + }, + { + "epoch": 1.53, + "eval_loss": 0.8029702305793762, + "eval_runtime": 16.5088, + "eval_samples_per_second": 121.147, + "eval_steps_per_second": 1.938, + "step": 8000 + }, + { + "epoch": 1.54, + "learning_rate": 0.00014729060872918943, + "loss": 0.7716, + "step": 8020 + }, + { + "epoch": 1.54, + "learning_rate": 0.00014690493025647618, + "loss": 0.7771, + "step": 8040 + }, + { + "epoch": 1.54, + "learning_rate": 0.00014651925178376293, + "loss": 0.7715, + "step": 8060 + }, + { + "epoch": 1.55, + "learning_rate": 0.00014613357331104968, + "loss": 0.7731, + "step": 8080 + }, + { + "epoch": 1.55, + "learning_rate": 0.00014574789483833643, + "loss": 0.7763, + "step": 8100 + }, + { + "epoch": 1.56, + "learning_rate": 0.00014536221636562318, + "loss": 0.7705, + "step": 8120 + }, + { + "epoch": 1.56, + "learning_rate": 0.00014497653789290993, + "loss": 0.7702, + "step": 8140 + }, + { + "epoch": 1.56, + "learning_rate": 0.00014459085942019668, + "loss": 0.7752, + "step": 8160 + }, + { + "epoch": 1.57, + "learning_rate": 0.00014420518094748343, + "loss": 0.7662, + "step": 8180 + }, + { + "epoch": 1.57, + "learning_rate": 0.00014381950247477018, + "loss": 0.7757, + "step": 8200 + }, + { + "epoch": 1.57, + "eval_loss": 0.8025923371315002, + "eval_runtime": 16.5398, + "eval_samples_per_second": 120.921, + "eval_steps_per_second": 1.935, + "step": 8200 + }, + { + "epoch": 1.58, + "learning_rate": 0.00014343382400205693, + "loss": 0.7638, + "step": 8220 + }, + { + "epoch": 1.58, + "learning_rate": 0.0001430481455293437, + "loss": 0.7836, + "step": 8240 + }, + { + "epoch": 1.58, + "learning_rate": 0.00014266246705663046, + "loss": 0.7685, + "step": 8260 + }, + { + "epoch": 1.59, + "learning_rate": 0.0001422767885839172, + "loss": 0.7901, + "step": 8280 + }, + { + "epoch": 1.59, + "learning_rate": 0.00014189111011120396, + "loss": 0.7729, + "step": 8300 + }, + { + "epoch": 1.59, + "learning_rate": 0.0001415054316384907, + "loss": 0.7614, + "step": 8320 + }, + { + "epoch": 1.6, + "learning_rate": 0.00014111975316577745, + "loss": 0.7789, + "step": 8340 + }, + { + "epoch": 1.6, + "learning_rate": 0.0001407340746930642, + "loss": 0.7713, + "step": 8360 + }, + { + "epoch": 1.61, + "learning_rate": 0.00014034839622035095, + "loss": 0.7831, + "step": 8380 + }, + { + "epoch": 1.61, + "learning_rate": 0.0001399627177476377, + "loss": 0.7674, + "step": 8400 + }, + { + "epoch": 1.61, + "eval_loss": 0.801445722579956, + "eval_runtime": 16.5305, + "eval_samples_per_second": 120.989, + "eval_steps_per_second": 1.936, + "step": 8400 + }, + { + "epoch": 1.61, + "learning_rate": 0.00013957703927492445, + "loss": 0.7698, + "step": 8420 + }, + { + "epoch": 1.62, + "learning_rate": 0.0001391913608022112, + "loss": 0.7725, + "step": 8440 + }, + { + "epoch": 1.62, + "learning_rate": 0.00013880568232949795, + "loss": 0.771, + "step": 8460 + }, + { + "epoch": 1.62, + "learning_rate": 0.00013842000385678473, + "loss": 0.7679, + "step": 8480 + }, + { + "epoch": 1.63, + "learning_rate": 0.00013803432538407148, + "loss": 0.7788, + "step": 8500 + }, + { + "epoch": 1.63, + "learning_rate": 0.00013764864691135823, + "loss": 0.7705, + "step": 8520 + }, + { + "epoch": 1.64, + "learning_rate": 0.00013726296843864495, + "loss": 0.7625, + "step": 8540 + }, + { + "epoch": 1.64, + "learning_rate": 0.00013687728996593173, + "loss": 0.7626, + "step": 8560 + }, + { + "epoch": 1.64, + "learning_rate": 0.00013649161149321848, + "loss": 0.7731, + "step": 8580 + }, + { + "epoch": 1.65, + "learning_rate": 0.00013610593302050523, + "loss": 0.7788, + "step": 8600 + }, + { + "epoch": 1.65, + "eval_loss": 0.8010225296020508, + "eval_runtime": 16.5075, + "eval_samples_per_second": 121.157, + "eval_steps_per_second": 1.939, + "step": 8600 + }, + { + "epoch": 1.65, + "learning_rate": 0.00013572025454779198, + "loss": 0.7758, + "step": 8620 + }, + { + "epoch": 1.66, + "learning_rate": 0.00013533457607507873, + "loss": 0.7738, + "step": 8640 + }, + { + "epoch": 1.66, + "learning_rate": 0.00013494889760236548, + "loss": 0.7827, + "step": 8660 + }, + { + "epoch": 1.66, + "learning_rate": 0.00013456321912965223, + "loss": 0.779, + "step": 8680 + }, + { + "epoch": 1.67, + "learning_rate": 0.00013417754065693898, + "loss": 0.771, + "step": 8700 + }, + { + "epoch": 1.67, + "learning_rate": 0.00013379186218422575, + "loss": 0.7683, + "step": 8720 + }, + { + "epoch": 1.67, + "learning_rate": 0.0001334061837115125, + "loss": 0.7728, + "step": 8740 + }, + { + "epoch": 1.68, + "learning_rate": 0.00013302050523879925, + "loss": 0.7761, + "step": 8760 + }, + { + "epoch": 1.68, + "learning_rate": 0.00013263482676608598, + "loss": 0.7705, + "step": 8780 + }, + { + "epoch": 1.69, + "learning_rate": 0.00013224914829337275, + "loss": 0.7624, + "step": 8800 + }, + { + "epoch": 1.69, + "eval_loss": 0.8003928065299988, + "eval_runtime": 16.5035, + "eval_samples_per_second": 121.186, + "eval_steps_per_second": 1.939, + "step": 8800 + }, + { + "epoch": 1.69, + "learning_rate": 0.0001318634698206595, + "loss": 0.7669, + "step": 8820 + }, + { + "epoch": 1.69, + "learning_rate": 0.00013147779134794625, + "loss": 0.7675, + "step": 8840 + }, + { + "epoch": 1.7, + "learning_rate": 0.000131092112875233, + "loss": 0.7629, + "step": 8860 + }, + { + "epoch": 1.7, + "learning_rate": 0.00013070643440251975, + "loss": 0.7663, + "step": 8880 + }, + { + "epoch": 1.71, + "learning_rate": 0.0001303207559298065, + "loss": 0.7708, + "step": 8900 + }, + { + "epoch": 1.71, + "learning_rate": 0.00012993507745709325, + "loss": 0.7734, + "step": 8920 + }, + { + "epoch": 1.71, + "learning_rate": 0.00012954939898438, + "loss": 0.7711, + "step": 8940 + }, + { + "epoch": 1.72, + "learning_rate": 0.00012916372051166678, + "loss": 0.769, + "step": 8960 + }, + { + "epoch": 1.72, + "learning_rate": 0.00012877804203895353, + "loss": 0.7706, + "step": 8980 + }, + { + "epoch": 1.72, + "learning_rate": 0.00012839236356624025, + "loss": 0.7752, + "step": 9000 + }, + { + "epoch": 1.72, + "eval_loss": 0.799389660358429, + "eval_runtime": 16.516, + "eval_samples_per_second": 121.094, + "eval_steps_per_second": 1.938, + "step": 9000 + }, + { + "epoch": 1.73, + "learning_rate": 0.000128006685093527, + "loss": 0.7678, + "step": 9020 + }, + { + "epoch": 1.73, + "learning_rate": 0.00012762100662081378, + "loss": 0.7764, + "step": 9040 + }, + { + "epoch": 1.74, + "learning_rate": 0.00012723532814810053, + "loss": 0.7672, + "step": 9060 + }, + { + "epoch": 1.74, + "learning_rate": 0.00012684964967538728, + "loss": 0.7705, + "step": 9080 + }, + { + "epoch": 1.74, + "learning_rate": 0.00012646397120267403, + "loss": 0.7657, + "step": 9100 + }, + { + "epoch": 1.75, + "learning_rate": 0.00012607829272996078, + "loss": 0.7648, + "step": 9120 + }, + { + "epoch": 1.75, + "learning_rate": 0.00012569261425724753, + "loss": 0.7737, + "step": 9140 + }, + { + "epoch": 1.76, + "learning_rate": 0.00012530693578453428, + "loss": 0.7628, + "step": 9160 + }, + { + "epoch": 1.76, + "learning_rate": 0.00012492125731182103, + "loss": 0.767, + "step": 9180 + }, + { + "epoch": 1.76, + "learning_rate": 0.0001245355788391078, + "loss": 0.764, + "step": 9200 + }, + { + "epoch": 1.76, + "eval_loss": 0.7991757988929749, + "eval_runtime": 16.5416, + "eval_samples_per_second": 120.907, + "eval_steps_per_second": 1.935, + "step": 9200 + }, + { + "epoch": 1.77, + "learning_rate": 0.00012414990036639455, + "loss": 0.7658, + "step": 9220 + }, + { + "epoch": 1.77, + "learning_rate": 0.00012376422189368128, + "loss": 0.7642, + "step": 9240 + }, + { + "epoch": 1.77, + "learning_rate": 0.00012337854342096802, + "loss": 0.7611, + "step": 9260 + }, + { + "epoch": 1.78, + "learning_rate": 0.0001229928649482548, + "loss": 0.7665, + "step": 9280 + }, + { + "epoch": 1.78, + "learning_rate": 0.00012260718647554155, + "loss": 0.7785, + "step": 9300 + }, + { + "epoch": 1.79, + "learning_rate": 0.0001222215080028283, + "loss": 0.7673, + "step": 9320 + }, + { + "epoch": 1.79, + "learning_rate": 0.00012183582953011504, + "loss": 0.777, + "step": 9340 + }, + { + "epoch": 1.79, + "learning_rate": 0.0001214501510574018, + "loss": 0.7684, + "step": 9360 + }, + { + "epoch": 1.8, + "learning_rate": 0.00012106447258468855, + "loss": 0.7694, + "step": 9380 + }, + { + "epoch": 1.8, + "learning_rate": 0.0001206787941119753, + "loss": 0.7634, + "step": 9400 + }, + { + "epoch": 1.8, + "eval_loss": 0.7980849742889404, + "eval_runtime": 16.5086, + "eval_samples_per_second": 121.149, + "eval_steps_per_second": 1.938, + "step": 9400 + }, + { + "epoch": 1.8, + "learning_rate": 0.00012031239956289772, + "loss": 0.7636, + "step": 9420 + }, + { + "epoch": 1.81, + "learning_rate": 0.00011992672109018447, + "loss": 0.7629, + "step": 9440 + }, + { + "epoch": 1.81, + "learning_rate": 0.00011954104261747122, + "loss": 0.7724, + "step": 9460 + }, + { + "epoch": 1.82, + "learning_rate": 0.00011915536414475797, + "loss": 0.7697, + "step": 9480 + }, + { + "epoch": 1.82, + "learning_rate": 0.00011876968567204474, + "loss": 0.7574, + "step": 9500 + }, + { + "epoch": 1.82, + "learning_rate": 0.00011838400719933149, + "loss": 0.7719, + "step": 9520 + }, + { + "epoch": 1.83, + "learning_rate": 0.00011799832872661822, + "loss": 0.7761, + "step": 9540 + }, + { + "epoch": 1.83, + "learning_rate": 0.00011761265025390497, + "loss": 0.7693, + "step": 9560 + }, + { + "epoch": 1.84, + "learning_rate": 0.00011722697178119174, + "loss": 0.7687, + "step": 9580 + }, + { + "epoch": 1.84, + "learning_rate": 0.00011684129330847849, + "loss": 0.7758, + "step": 9600 + }, + { + "epoch": 1.84, + "eval_loss": 0.7981218099594116, + "eval_runtime": 16.5407, + "eval_samples_per_second": 120.914, + "eval_steps_per_second": 1.935, + "step": 9600 + }, + { + "epoch": 1.84, + "learning_rate": 0.00011645561483576524, + "loss": 0.7603, + "step": 9620 + }, + { + "epoch": 1.85, + "learning_rate": 0.00011606993636305199, + "loss": 0.7579, + "step": 9640 + }, + { + "epoch": 1.85, + "learning_rate": 0.00011568425789033875, + "loss": 0.7673, + "step": 9660 + }, + { + "epoch": 1.85, + "learning_rate": 0.0001152985794176255, + "loss": 0.7745, + "step": 9680 + }, + { + "epoch": 1.86, + "learning_rate": 0.00011491290094491225, + "loss": 0.758, + "step": 9700 + }, + { + "epoch": 1.86, + "learning_rate": 0.000114527222472199, + "loss": 0.7686, + "step": 9720 + }, + { + "epoch": 1.87, + "learning_rate": 0.00011414154399948576, + "loss": 0.7741, + "step": 9740 + }, + { + "epoch": 1.87, + "learning_rate": 0.00011375586552677251, + "loss": 0.7646, + "step": 9760 + }, + { + "epoch": 1.87, + "learning_rate": 0.00011337018705405925, + "loss": 0.7675, + "step": 9780 + }, + { + "epoch": 1.88, + "learning_rate": 0.000112984508581346, + "loss": 0.7637, + "step": 9800 + }, + { + "epoch": 1.88, + "eval_loss": 0.7970672845840454, + "eval_runtime": 16.5386, + "eval_samples_per_second": 120.929, + "eval_steps_per_second": 1.935, + "step": 9800 + }, + { + "epoch": 1.88, + "learning_rate": 0.00011259883010863276, + "loss": 0.7678, + "step": 9820 + }, + { + "epoch": 1.89, + "learning_rate": 0.00011221315163591951, + "loss": 0.762, + "step": 9840 + }, + { + "epoch": 1.89, + "learning_rate": 0.00011182747316320626, + "loss": 0.7653, + "step": 9860 + }, + { + "epoch": 1.89, + "learning_rate": 0.00011144179469049301, + "loss": 0.7666, + "step": 9880 + }, + { + "epoch": 1.9, + "learning_rate": 0.00011105611621777977, + "loss": 0.7621, + "step": 9900 + }, + { + "epoch": 1.9, + "learning_rate": 0.00011067043774506652, + "loss": 0.7715, + "step": 9920 + }, + { + "epoch": 1.9, + "learning_rate": 0.00011028475927235327, + "loss": 0.7605, + "step": 9940 + }, + { + "epoch": 1.91, + "learning_rate": 0.00010989908079964002, + "loss": 0.7618, + "step": 9960 + }, + { + "epoch": 1.91, + "learning_rate": 0.00010951340232692679, + "loss": 0.7726, + "step": 9980 + }, + { + "epoch": 1.92, + "learning_rate": 0.00010912772385421352, + "loss": 0.7684, + "step": 10000 + }, + { + "epoch": 1.92, + "eval_loss": 0.7967627048492432, + "eval_runtime": 16.5033, + "eval_samples_per_second": 121.188, + "eval_steps_per_second": 1.939, + "step": 10000 + }, + { + "epoch": 1.92, + "learning_rate": 0.00010874204538150027, + "loss": 0.7666, + "step": 10020 + }, + { + "epoch": 1.92, + "learning_rate": 0.00010835636690878702, + "loss": 0.7661, + "step": 10040 + }, + { + "epoch": 1.93, + "learning_rate": 0.00010797068843607378, + "loss": 0.7621, + "step": 10060 + }, + { + "epoch": 1.93, + "learning_rate": 0.00010758500996336053, + "loss": 0.7736, + "step": 10080 + }, + { + "epoch": 1.94, + "learning_rate": 0.00010719933149064728, + "loss": 0.76, + "step": 10100 + }, + { + "epoch": 1.94, + "learning_rate": 0.00010681365301793405, + "loss": 0.764, + "step": 10120 + }, + { + "epoch": 1.94, + "learning_rate": 0.0001064279745452208, + "loss": 0.7697, + "step": 10140 + }, + { + "epoch": 1.95, + "learning_rate": 0.00010604229607250755, + "loss": 0.7602, + "step": 10160 + }, + { + "epoch": 1.95, + "learning_rate": 0.0001056566175997943, + "loss": 0.766, + "step": 10180 + }, + { + "epoch": 1.95, + "learning_rate": 0.00010527093912708106, + "loss": 0.7719, + "step": 10200 + }, + { + "epoch": 1.95, + "eval_loss": 0.7964752912521362, + "eval_runtime": 16.4947, + "eval_samples_per_second": 121.251, + "eval_steps_per_second": 1.94, + "step": 10200 + }, + { + "epoch": 1.96, + "learning_rate": 0.00010488526065436781, + "loss": 0.7653, + "step": 10220 + }, + { + "epoch": 1.96, + "learning_rate": 0.00010449958218165455, + "loss": 0.7653, + "step": 10240 + }, + { + "epoch": 1.97, + "learning_rate": 0.0001041139037089413, + "loss": 0.7711, + "step": 10260 + }, + { + "epoch": 1.97, + "learning_rate": 0.00010372822523622806, + "loss": 0.7729, + "step": 10280 + }, + { + "epoch": 1.97, + "learning_rate": 0.00010334254676351481, + "loss": 0.7709, + "step": 10300 + }, + { + "epoch": 1.98, + "learning_rate": 0.00010295686829080156, + "loss": 0.7611, + "step": 10320 + }, + { + "epoch": 1.98, + "learning_rate": 0.00010257118981808831, + "loss": 0.7607, + "step": 10340 + }, + { + "epoch": 1.99, + "learning_rate": 0.00010218551134537507, + "loss": 0.761, + "step": 10360 + }, + { + "epoch": 1.99, + "learning_rate": 0.00010179983287266182, + "loss": 0.7645, + "step": 10380 + }, + { + "epoch": 1.99, + "learning_rate": 0.00010141415439994857, + "loss": 0.7682, + "step": 10400 + }, + { + "epoch": 1.99, + "eval_loss": 0.7955361008644104, + "eval_runtime": 16.5066, + "eval_samples_per_second": 121.164, + "eval_steps_per_second": 1.939, + "step": 10400 + }, + { + "epoch": 2.0, + "learning_rate": 0.00010102847592723531, + "loss": 0.76, + "step": 10420 + }, + { + "epoch": 2.0, + "learning_rate": 0.00010064279745452208, + "loss": 0.7653, + "step": 10440 + }, + { + "epoch": 2.0, + "learning_rate": 0.00010025711898180882, + "loss": 0.7625, + "step": 10460 + }, + { + "epoch": 2.01, + "learning_rate": 9.987144050909557e-05, + "loss": 0.764, + "step": 10480 + }, + { + "epoch": 2.01, + "learning_rate": 9.948576203638232e-05, + "loss": 0.766, + "step": 10500 + }, + { + "epoch": 2.02, + "learning_rate": 9.910008356366908e-05, + "loss": 0.7656, + "step": 10520 + }, + { + "epoch": 2.02, + "learning_rate": 9.871440509095583e-05, + "loss": 0.7698, + "step": 10540 + }, + { + "epoch": 2.02, + "learning_rate": 9.832872661824258e-05, + "loss": 0.7635, + "step": 10560 + }, + { + "epoch": 2.03, + "learning_rate": 9.794304814552933e-05, + "loss": 0.77, + "step": 10580 + }, + { + "epoch": 2.03, + "learning_rate": 9.75573696728161e-05, + "loss": 0.7651, + "step": 10600 + }, + { + "epoch": 2.03, + "eval_loss": 0.7953855395317078, + "eval_runtime": 16.5084, + "eval_samples_per_second": 121.15, + "eval_steps_per_second": 1.938, + "step": 10600 + }, + { + "epoch": 2.03, + "learning_rate": 9.717169120010285e-05, + "loss": 0.7628, + "step": 10620 + }, + { + "epoch": 2.04, + "learning_rate": 9.67860127273896e-05, + "loss": 0.7662, + "step": 10640 + }, + { + "epoch": 2.04, + "learning_rate": 9.640033425467633e-05, + "loss": 0.7635, + "step": 10660 + }, + { + "epoch": 2.05, + "learning_rate": 9.60146557819631e-05, + "loss": 0.7601, + "step": 10680 + }, + { + "epoch": 2.05, + "learning_rate": 9.562897730924984e-05, + "loss": 0.7649, + "step": 10700 + }, + { + "epoch": 2.05, + "learning_rate": 9.52432988365366e-05, + "loss": 0.758, + "step": 10720 + }, + { + "epoch": 2.06, + "learning_rate": 9.485762036382334e-05, + "loss": 0.767, + "step": 10740 + }, + { + "epoch": 2.06, + "learning_rate": 9.447194189111011e-05, + "loss": 0.7559, + "step": 10760 + }, + { + "epoch": 2.07, + "learning_rate": 9.408626341839686e-05, + "loss": 0.765, + "step": 10780 + }, + { + "epoch": 2.07, + "learning_rate": 9.37005849456836e-05, + "loss": 0.7641, + "step": 10800 + }, + { + "epoch": 2.07, + "eval_loss": 0.794941782951355, + "eval_runtime": 16.5101, + "eval_samples_per_second": 121.138, + "eval_steps_per_second": 1.938, + "step": 10800 + }, + { + "epoch": 2.07, + "learning_rate": 9.331490647297036e-05, + "loss": 0.7691, + "step": 10820 + }, + { + "epoch": 2.08, + "learning_rate": 9.292922800025712e-05, + "loss": 0.7611, + "step": 10840 + }, + { + "epoch": 2.08, + "learning_rate": 9.254354952754387e-05, + "loss": 0.7609, + "step": 10860 + }, + { + "epoch": 2.08, + "learning_rate": 9.21578710548306e-05, + "loss": 0.758, + "step": 10880 + }, + { + "epoch": 2.09, + "learning_rate": 9.177219258211736e-05, + "loss": 0.7637, + "step": 10900 + }, + { + "epoch": 2.09, + "learning_rate": 9.138651410940412e-05, + "loss": 0.7645, + "step": 10920 + }, + { + "epoch": 2.1, + "learning_rate": 9.100083563669087e-05, + "loss": 0.7507, + "step": 10940 + }, + { + "epoch": 2.1, + "learning_rate": 9.061515716397762e-05, + "loss": 0.7673, + "step": 10960 + }, + { + "epoch": 2.1, + "learning_rate": 9.022947869126437e-05, + "loss": 0.7552, + "step": 10980 + }, + { + "epoch": 2.11, + "learning_rate": 8.984380021855113e-05, + "loss": 0.7639, + "step": 11000 + }, + { + "epoch": 2.11, + "eval_loss": 0.7940524220466614, + "eval_runtime": 16.5029, + "eval_samples_per_second": 121.19, + "eval_steps_per_second": 1.939, + "step": 11000 + }, + { + "epoch": 2.11, + "learning_rate": 8.945812174583788e-05, + "loss": 0.7719, + "step": 11020 + }, + { + "epoch": 2.12, + "learning_rate": 8.907244327312463e-05, + "loss": 0.7641, + "step": 11040 + }, + { + "epoch": 2.12, + "learning_rate": 8.868676480041138e-05, + "loss": 0.7614, + "step": 11060 + }, + { + "epoch": 2.12, + "learning_rate": 8.830108632769814e-05, + "loss": 0.7785, + "step": 11080 + }, + { + "epoch": 2.13, + "learning_rate": 8.79154078549849e-05, + "loss": 0.7756, + "step": 11100 + }, + { + "epoch": 2.13, + "learning_rate": 8.752972938227163e-05, + "loss": 0.7645, + "step": 11120 + }, + { + "epoch": 2.13, + "learning_rate": 8.714405090955838e-05, + "loss": 0.7621, + "step": 11140 + }, + { + "epoch": 2.14, + "learning_rate": 8.675837243684514e-05, + "loss": 0.7662, + "step": 11160 + }, + { + "epoch": 2.14, + "learning_rate": 8.637269396413189e-05, + "loss": 0.7617, + "step": 11180 + }, + { + "epoch": 2.15, + "learning_rate": 8.598701549141864e-05, + "loss": 0.7683, + "step": 11200 + }, + { + "epoch": 2.15, + "eval_loss": 0.7937352061271667, + "eval_runtime": 16.5052, + "eval_samples_per_second": 121.174, + "eval_steps_per_second": 1.939, + "step": 11200 + }, + { + "epoch": 2.15, + "learning_rate": 8.560133701870539e-05, + "loss": 0.7635, + "step": 11220 + }, + { + "epoch": 2.15, + "learning_rate": 8.521565854599216e-05, + "loss": 0.7622, + "step": 11240 + }, + { + "epoch": 2.16, + "learning_rate": 8.48299800732789e-05, + "loss": 0.7616, + "step": 11260 + }, + { + "epoch": 2.16, + "learning_rate": 8.444430160056565e-05, + "loss": 0.7558, + "step": 11280 + }, + { + "epoch": 2.17, + "learning_rate": 8.40586231278524e-05, + "loss": 0.7714, + "step": 11300 + }, + { + "epoch": 2.17, + "learning_rate": 8.367294465513917e-05, + "loss": 0.7676, + "step": 11320 + }, + { + "epoch": 2.17, + "learning_rate": 8.32872661824259e-05, + "loss": 0.7623, + "step": 11340 + }, + { + "epoch": 2.18, + "learning_rate": 8.290158770971265e-05, + "loss": 0.7608, + "step": 11360 + }, + { + "epoch": 2.18, + "learning_rate": 8.251590923699942e-05, + "loss": 0.7746, + "step": 11380 + }, + { + "epoch": 2.18, + "learning_rate": 8.213023076428617e-05, + "loss": 0.7684, + "step": 11400 + }, + { + "epoch": 2.18, + "eval_loss": 0.7929428219795227, + "eval_runtime": 16.7561, + "eval_samples_per_second": 119.359, + "eval_steps_per_second": 1.91, + "step": 11400 + }, + { + "epoch": 2.19, + "learning_rate": 8.174455229157292e-05, + "loss": 0.7628, + "step": 11420 + }, + { + "epoch": 2.19, + "learning_rate": 8.135887381885967e-05, + "loss": 0.7614, + "step": 11440 + }, + { + "epoch": 2.2, + "learning_rate": 8.099247926978209e-05, + "loss": 0.7616, + "step": 11460 + }, + { + "epoch": 2.2, + "learning_rate": 8.060680079706884e-05, + "loss": 0.7614, + "step": 11480 + }, + { + "epoch": 2.2, + "learning_rate": 8.022112232435559e-05, + "loss": 0.7684, + "step": 11500 + }, + { + "epoch": 2.21, + "learning_rate": 7.983544385164233e-05, + "loss": 0.7663, + "step": 11520 + }, + { + "epoch": 2.21, + "learning_rate": 7.94497653789291e-05, + "loss": 0.7621, + "step": 11540 + }, + { + "epoch": 2.21, + "learning_rate": 7.906408690621584e-05, + "loss": 0.77, + "step": 11560 + }, + { + "epoch": 2.22, + "learning_rate": 7.867840843350259e-05, + "loss": 0.7629, + "step": 11580 + }, + { + "epoch": 2.22, + "learning_rate": 7.829272996078934e-05, + "loss": 0.7592, + "step": 11600 + }, + { + "epoch": 2.22, + "eval_loss": 0.7931132316589355, + "eval_runtime": 16.4886, + "eval_samples_per_second": 121.296, + "eval_steps_per_second": 1.941, + "step": 11600 + }, + { + "epoch": 2.23, + "learning_rate": 7.79070514880761e-05, + "loss": 0.7593, + "step": 11620 + }, + { + "epoch": 2.23, + "learning_rate": 7.752137301536285e-05, + "loss": 0.7579, + "step": 11640 + }, + { + "epoch": 2.23, + "learning_rate": 7.71356945426496e-05, + "loss": 0.7666, + "step": 11660 + }, + { + "epoch": 2.24, + "learning_rate": 7.675001606993635e-05, + "loss": 0.7573, + "step": 11680 + }, + { + "epoch": 2.24, + "learning_rate": 7.636433759722312e-05, + "loss": 0.7654, + "step": 11700 + }, + { + "epoch": 2.25, + "learning_rate": 7.597865912450986e-05, + "loss": 0.7637, + "step": 11720 + }, + { + "epoch": 2.25, + "learning_rate": 7.559298065179661e-05, + "loss": 0.7638, + "step": 11740 + }, + { + "epoch": 2.25, + "learning_rate": 7.520730217908335e-05, + "loss": 0.7538, + "step": 11760 + }, + { + "epoch": 2.26, + "learning_rate": 7.482162370637011e-05, + "loss": 0.7598, + "step": 11780 + }, + { + "epoch": 2.26, + "learning_rate": 7.443594523365686e-05, + "loss": 0.7577, + "step": 11800 + }, + { + "epoch": 2.26, + "eval_loss": 0.7928204536437988, + "eval_runtime": 16.5434, + "eval_samples_per_second": 120.894, + "eval_steps_per_second": 1.934, + "step": 11800 + }, + { + "epoch": 2.26, + "learning_rate": 7.405026676094361e-05, + "loss": 0.7561, + "step": 11820 + }, + { + "epoch": 2.27, + "learning_rate": 7.366458828823038e-05, + "loss": 0.7557, + "step": 11840 + }, + { + "epoch": 2.27, + "learning_rate": 7.327890981551713e-05, + "loss": 0.7606, + "step": 11860 + }, + { + "epoch": 2.28, + "learning_rate": 7.289323134280388e-05, + "loss": 0.7575, + "step": 11880 + }, + { + "epoch": 2.28, + "learning_rate": 7.250755287009063e-05, + "loss": 0.7557, + "step": 11900 + }, + { + "epoch": 2.28, + "learning_rate": 7.212187439737738e-05, + "loss": 0.7687, + "step": 11920 + }, + { + "epoch": 2.29, + "learning_rate": 7.173619592466414e-05, + "loss": 0.7647, + "step": 11940 + }, + { + "epoch": 2.29, + "learning_rate": 7.135051745195089e-05, + "loss": 0.7608, + "step": 11960 + }, + { + "epoch": 2.3, + "learning_rate": 7.096483897923764e-05, + "loss": 0.7624, + "step": 11980 + }, + { + "epoch": 2.3, + "learning_rate": 7.057916050652439e-05, + "loss": 0.7651, + "step": 12000 + }, + { + "epoch": 2.3, + "eval_loss": 0.7917994856834412, + "eval_runtime": 16.5312, + "eval_samples_per_second": 120.983, + "eval_steps_per_second": 1.936, + "step": 12000 + }, + { + "epoch": 2.3, + "learning_rate": 7.019348203381114e-05, + "loss": 0.7678, + "step": 12020 + }, + { + "epoch": 2.31, + "learning_rate": 6.980780356109789e-05, + "loss": 0.7606, + "step": 12040 + }, + { + "epoch": 2.31, + "learning_rate": 6.942212508838465e-05, + "loss": 0.7607, + "step": 12060 + }, + { + "epoch": 2.31, + "learning_rate": 6.90364466156714e-05, + "loss": 0.763, + "step": 12080 + }, + { + "epoch": 2.32, + "learning_rate": 6.865076814295815e-05, + "loss": 0.7669, + "step": 12100 + }, + { + "epoch": 2.32, + "learning_rate": 6.82650896702449e-05, + "loss": 0.755, + "step": 12120 + }, + { + "epoch": 2.33, + "learning_rate": 6.787941119753165e-05, + "loss": 0.7611, + "step": 12140 + }, + { + "epoch": 2.33, + "learning_rate": 6.74937327248184e-05, + "loss": 0.7576, + "step": 12160 + }, + { + "epoch": 2.33, + "learning_rate": 6.710805425210516e-05, + "loss": 0.7581, + "step": 12180 + }, + { + "epoch": 2.34, + "learning_rate": 6.672237577939191e-05, + "loss": 0.7647, + "step": 12200 + }, + { + "epoch": 2.34, + "eval_loss": 0.7914180755615234, + "eval_runtime": 16.5021, + "eval_samples_per_second": 121.196, + "eval_steps_per_second": 1.939, + "step": 12200 + }, + { + "epoch": 2.34, + "learning_rate": 6.633669730667866e-05, + "loss": 0.7582, + "step": 12220 + }, + { + "epoch": 2.35, + "learning_rate": 6.595101883396541e-05, + "loss": 0.7531, + "step": 12240 + }, + { + "epoch": 2.35, + "learning_rate": 6.556534036125216e-05, + "loss": 0.7526, + "step": 12260 + }, + { + "epoch": 2.35, + "learning_rate": 6.517966188853891e-05, + "loss": 0.7701, + "step": 12280 + }, + { + "epoch": 2.36, + "learning_rate": 6.479398341582568e-05, + "loss": 0.7662, + "step": 12300 + }, + { + "epoch": 2.36, + "learning_rate": 6.440830494311241e-05, + "loss": 0.7541, + "step": 12320 + }, + { + "epoch": 2.36, + "learning_rate": 6.402262647039918e-05, + "loss": 0.7578, + "step": 12340 + }, + { + "epoch": 2.37, + "learning_rate": 6.363694799768592e-05, + "loss": 0.7569, + "step": 12360 + }, + { + "epoch": 2.37, + "learning_rate": 6.325126952497267e-05, + "loss": 0.7635, + "step": 12380 + }, + { + "epoch": 2.38, + "learning_rate": 6.286559105225942e-05, + "loss": 0.7618, + "step": 12400 + }, + { + "epoch": 2.38, + "eval_loss": 0.7912635207176208, + "eval_runtime": 16.4984, + "eval_samples_per_second": 121.224, + "eval_steps_per_second": 1.94, + "step": 12400 + }, + { + "epoch": 2.38, + "learning_rate": 6.247991257954619e-05, + "loss": 0.7536, + "step": 12420 + }, + { + "epoch": 2.38, + "learning_rate": 6.209423410683292e-05, + "loss": 0.7478, + "step": 12440 + }, + { + "epoch": 2.39, + "learning_rate": 6.170855563411969e-05, + "loss": 0.745, + "step": 12460 + }, + { + "epoch": 2.39, + "learning_rate": 6.132287716140644e-05, + "loss": 0.7611, + "step": 12480 + }, + { + "epoch": 2.4, + "learning_rate": 6.0937198688693187e-05, + "loss": 0.763, + "step": 12500 + }, + { + "epoch": 2.4, + "learning_rate": 6.0551520215979936e-05, + "loss": 0.7647, + "step": 12520 + }, + { + "epoch": 2.4, + "learning_rate": 6.016584174326669e-05, + "loss": 0.7621, + "step": 12540 + }, + { + "epoch": 2.41, + "learning_rate": 5.978016327055344e-05, + "loss": 0.7568, + "step": 12560 + }, + { + "epoch": 2.41, + "learning_rate": 5.93944847978402e-05, + "loss": 0.7613, + "step": 12580 + }, + { + "epoch": 2.41, + "learning_rate": 5.900880632512694e-05, + "loss": 0.7568, + "step": 12600 + }, + { + "epoch": 2.41, + "eval_loss": 0.7910023331642151, + "eval_runtime": 16.5022, + "eval_samples_per_second": 121.196, + "eval_steps_per_second": 1.939, + "step": 12600 + }, + { + "epoch": 2.42, + "learning_rate": 5.86231278524137e-05, + "loss": 0.7636, + "step": 12620 + }, + { + "epoch": 2.42, + "learning_rate": 5.823744937970045e-05, + "loss": 0.7657, + "step": 12640 + }, + { + "epoch": 2.43, + "learning_rate": 5.7851770906987205e-05, + "loss": 0.7703, + "step": 12660 + }, + { + "epoch": 2.43, + "learning_rate": 5.7466092434273955e-05, + "loss": 0.7557, + "step": 12680 + }, + { + "epoch": 2.43, + "learning_rate": 5.708041396156071e-05, + "loss": 0.7667, + "step": 12700 + }, + { + "epoch": 2.44, + "learning_rate": 5.6694735488847454e-05, + "loss": 0.7605, + "step": 12720 + }, + { + "epoch": 2.44, + "learning_rate": 5.630905701613421e-05, + "loss": 0.7549, + "step": 12740 + }, + { + "epoch": 2.44, + "learning_rate": 5.592337854342096e-05, + "loss": 0.7592, + "step": 12760 + }, + { + "epoch": 2.45, + "learning_rate": 5.553770007070772e-05, + "loss": 0.7654, + "step": 12780 + }, + { + "epoch": 2.45, + "learning_rate": 5.5171305521630135e-05, + "loss": 0.7636, + "step": 12800 + }, + { + "epoch": 2.45, + "eval_loss": 0.7906058430671692, + "eval_runtime": 16.5012, + "eval_samples_per_second": 121.203, + "eval_steps_per_second": 1.939, + "step": 12800 + }, + { + "epoch": 2.46, + "learning_rate": 5.478562704891688e-05, + "loss": 0.7629, + "step": 12820 + }, + { + "epoch": 2.46, + "learning_rate": 5.4399948576203634e-05, + "loss": 0.7688, + "step": 12840 + }, + { + "epoch": 2.46, + "learning_rate": 5.4014270103490384e-05, + "loss": 0.7534, + "step": 12860 + }, + { + "epoch": 2.47, + "learning_rate": 5.362859163077714e-05, + "loss": 0.76, + "step": 12880 + }, + { + "epoch": 2.47, + "learning_rate": 5.324291315806389e-05, + "loss": 0.7561, + "step": 12900 + }, + { + "epoch": 2.48, + "learning_rate": 5.285723468535065e-05, + "loss": 0.7562, + "step": 12920 + }, + { + "epoch": 2.48, + "learning_rate": 5.247155621263739e-05, + "loss": 0.7607, + "step": 12940 + }, + { + "epoch": 2.48, + "learning_rate": 5.2085877739924146e-05, + "loss": 0.7612, + "step": 12960 + }, + { + "epoch": 2.49, + "learning_rate": 5.1700199267210896e-05, + "loss": 0.7643, + "step": 12980 + }, + { + "epoch": 2.49, + "learning_rate": 5.131452079449765e-05, + "loss": 0.7656, + "step": 13000 + }, + { + "epoch": 2.49, + "eval_loss": 0.790121853351593, + "eval_runtime": 16.5158, + "eval_samples_per_second": 121.096, + "eval_steps_per_second": 1.938, + "step": 13000 + }, + { + "epoch": 2.49, + "learning_rate": 5.09288423217844e-05, + "loss": 0.756, + "step": 13020 + }, + { + "epoch": 2.5, + "learning_rate": 5.054316384907115e-05, + "loss": 0.7597, + "step": 13040 + }, + { + "epoch": 2.5, + "learning_rate": 5.01574853763579e-05, + "loss": 0.7525, + "step": 13060 + }, + { + "epoch": 2.51, + "learning_rate": 4.977180690364466e-05, + "loss": 0.7565, + "step": 13080 + }, + { + "epoch": 2.51, + "learning_rate": 4.938612843093141e-05, + "loss": 0.7631, + "step": 13100 + }, + { + "epoch": 2.51, + "learning_rate": 4.9000449958218165e-05, + "loss": 0.7514, + "step": 13120 + }, + { + "epoch": 2.52, + "learning_rate": 4.861477148550491e-05, + "loss": 0.7576, + "step": 13140 + }, + { + "epoch": 2.52, + "learning_rate": 4.8229093012791664e-05, + "loss": 0.7539, + "step": 13160 + }, + { + "epoch": 2.53, + "learning_rate": 4.7843414540078414e-05, + "loss": 0.7586, + "step": 13180 + }, + { + "epoch": 2.53, + "learning_rate": 4.745773606736517e-05, + "loss": 0.7573, + "step": 13200 + }, + { + "epoch": 2.53, + "eval_loss": 0.7899668216705322, + "eval_runtime": 16.509, + "eval_samples_per_second": 121.146, + "eval_steps_per_second": 1.938, + "step": 13200 + }, + { + "epoch": 2.53, + "learning_rate": 4.707205759465192e-05, + "loss": 0.7671, + "step": 13220 + }, + { + "epoch": 2.54, + "learning_rate": 4.668637912193868e-05, + "loss": 0.758, + "step": 13240 + }, + { + "epoch": 2.54, + "learning_rate": 4.630070064922542e-05, + "loss": 0.7444, + "step": 13260 + }, + { + "epoch": 2.54, + "learning_rate": 4.5915022176512176e-05, + "loss": 0.7548, + "step": 13280 + }, + { + "epoch": 2.55, + "learning_rate": 4.5529343703798926e-05, + "loss": 0.7681, + "step": 13300 + }, + { + "epoch": 2.55, + "learning_rate": 4.514366523108568e-05, + "loss": 0.7599, + "step": 13320 + }, + { + "epoch": 2.56, + "learning_rate": 4.475798675837243e-05, + "loss": 0.7631, + "step": 13340 + }, + { + "epoch": 2.56, + "learning_rate": 4.437230828565919e-05, + "loss": 0.7565, + "step": 13360 + }, + { + "epoch": 2.56, + "learning_rate": 4.398662981294593e-05, + "loss": 0.7586, + "step": 13380 + }, + { + "epoch": 2.57, + "learning_rate": 4.360095134023269e-05, + "loss": 0.7526, + "step": 13400 + }, + { + "epoch": 2.57, + "eval_loss": 0.7896500825881958, + "eval_runtime": 16.5086, + "eval_samples_per_second": 121.149, + "eval_steps_per_second": 1.938, + "step": 13400 + }, + { + "epoch": 2.57, + "learning_rate": 4.321527286751944e-05, + "loss": 0.7591, + "step": 13420 + }, + { + "epoch": 2.58, + "learning_rate": 4.2829594394806195e-05, + "loss": 0.7645, + "step": 13440 + }, + { + "epoch": 2.58, + "learning_rate": 4.2443915922092944e-05, + "loss": 0.7532, + "step": 13460 + }, + { + "epoch": 2.58, + "learning_rate": 4.2077521373015355e-05, + "loss": 0.746, + "step": 13480 + }, + { + "epoch": 2.59, + "learning_rate": 4.169184290030211e-05, + "loss": 0.7534, + "step": 13500 + }, + { + "epoch": 2.59, + "learning_rate": 4.130616442758886e-05, + "loss": 0.7506, + "step": 13520 + }, + { + "epoch": 2.59, + "learning_rate": 4.092048595487562e-05, + "loss": 0.7535, + "step": 13540 + }, + { + "epoch": 2.6, + "learning_rate": 4.053480748216237e-05, + "loss": 0.7596, + "step": 13560 + }, + { + "epoch": 2.6, + "learning_rate": 4.0149129009449124e-05, + "loss": 0.7686, + "step": 13580 + }, + { + "epoch": 2.61, + "learning_rate": 3.976345053673587e-05, + "loss": 0.7537, + "step": 13600 + }, + { + "epoch": 2.61, + "eval_loss": 0.7891342639923096, + "eval_runtime": 16.5163, + "eval_samples_per_second": 121.092, + "eval_steps_per_second": 1.937, + "step": 13600 + }, + { + "epoch": 2.61, + "learning_rate": 3.9377772064022624e-05, + "loss": 0.7656, + "step": 13620 + }, + { + "epoch": 2.61, + "learning_rate": 3.8992093591309374e-05, + "loss": 0.7515, + "step": 13640 + }, + { + "epoch": 2.62, + "learning_rate": 3.860641511859613e-05, + "loss": 0.761, + "step": 13660 + }, + { + "epoch": 2.62, + "learning_rate": 3.822073664588288e-05, + "loss": 0.7648, + "step": 13680 + }, + { + "epoch": 2.63, + "learning_rate": 3.783505817316963e-05, + "loss": 0.7671, + "step": 13700 + }, + { + "epoch": 2.63, + "learning_rate": 3.7449379700456386e-05, + "loss": 0.7653, + "step": 13720 + }, + { + "epoch": 2.63, + "learning_rate": 3.7063701227743136e-05, + "loss": 0.7583, + "step": 13740 + }, + { + "epoch": 2.64, + "learning_rate": 3.6678022755029886e-05, + "loss": 0.7602, + "step": 13760 + }, + { + "epoch": 2.64, + "learning_rate": 3.629234428231664e-05, + "loss": 0.7626, + "step": 13780 + }, + { + "epoch": 2.64, + "learning_rate": 3.590666580960339e-05, + "loss": 0.7485, + "step": 13800 + }, + { + "epoch": 2.64, + "eval_loss": 0.7891269326210022, + "eval_runtime": 16.5041, + "eval_samples_per_second": 121.182, + "eval_steps_per_second": 1.939, + "step": 13800 + }, + { + "epoch": 2.65, + "learning_rate": 3.552098733689014e-05, + "loss": 0.7564, + "step": 13820 + }, + { + "epoch": 2.65, + "learning_rate": 3.51353088641769e-05, + "loss": 0.7603, + "step": 13840 + }, + { + "epoch": 2.66, + "learning_rate": 3.474963039146365e-05, + "loss": 0.7584, + "step": 13860 + }, + { + "epoch": 2.66, + "learning_rate": 3.43639519187504e-05, + "loss": 0.7608, + "step": 13880 + }, + { + "epoch": 2.66, + "learning_rate": 3.3978273446037154e-05, + "loss": 0.7535, + "step": 13900 + }, + { + "epoch": 2.67, + "learning_rate": 3.3592594973323904e-05, + "loss": 0.7614, + "step": 13920 + }, + { + "epoch": 2.67, + "learning_rate": 3.3206916500610654e-05, + "loss": 0.7654, + "step": 13940 + }, + { + "epoch": 2.67, + "learning_rate": 3.282123802789741e-05, + "loss": 0.7656, + "step": 13960 + }, + { + "epoch": 2.68, + "learning_rate": 3.243555955518416e-05, + "loss": 0.756, + "step": 13980 + }, + { + "epoch": 2.68, + "learning_rate": 3.204988108247091e-05, + "loss": 0.7653, + "step": 14000 + }, + { + "epoch": 2.68, + "eval_loss": 0.7888805866241455, + "eval_runtime": 16.5275, + "eval_samples_per_second": 121.01, + "eval_steps_per_second": 1.936, + "step": 14000 + }, + { + "epoch": 2.69, + "learning_rate": 3.1664202609757666e-05, + "loss": 0.7618, + "step": 14020 + }, + { + "epoch": 2.69, + "learning_rate": 3.1278524137044416e-05, + "loss": 0.7588, + "step": 14040 + }, + { + "epoch": 2.69, + "learning_rate": 3.0892845664331166e-05, + "loss": 0.7625, + "step": 14060 + }, + { + "epoch": 2.7, + "learning_rate": 3.050716719161792e-05, + "loss": 0.7532, + "step": 14080 + }, + { + "epoch": 2.7, + "learning_rate": 3.0121488718904672e-05, + "loss": 0.7538, + "step": 14100 + }, + { + "epoch": 2.71, + "learning_rate": 2.9735810246191422e-05, + "loss": 0.7531, + "step": 14120 + }, + { + "epoch": 2.71, + "learning_rate": 2.9350131773478175e-05, + "loss": 0.7551, + "step": 14140 + }, + { + "epoch": 2.71, + "learning_rate": 2.8964453300764928e-05, + "loss": 0.7479, + "step": 14160 + }, + { + "epoch": 2.72, + "learning_rate": 2.8578774828051678e-05, + "loss": 0.7629, + "step": 14180 + }, + { + "epoch": 2.72, + "learning_rate": 2.819309635533843e-05, + "loss": 0.7572, + "step": 14200 + }, + { + "epoch": 2.72, + "eval_loss": 0.7884878516197205, + "eval_runtime": 16.7595, + "eval_samples_per_second": 119.335, + "eval_steps_per_second": 1.909, + "step": 14200 + }, + { + "epoch": 2.72, + "learning_rate": 2.7807417882625184e-05, + "loss": 0.758, + "step": 14220 + }, + { + "epoch": 2.73, + "learning_rate": 2.7421739409911934e-05, + "loss": 0.7608, + "step": 14240 + }, + { + "epoch": 2.73, + "learning_rate": 2.7036060937198687e-05, + "loss": 0.7555, + "step": 14260 + }, + { + "epoch": 2.74, + "learning_rate": 2.6650382464485437e-05, + "loss": 0.7512, + "step": 14280 + }, + { + "epoch": 2.74, + "learning_rate": 2.626470399177219e-05, + "loss": 0.7488, + "step": 14300 + }, + { + "epoch": 2.74, + "learning_rate": 2.5879025519058943e-05, + "loss": 0.7532, + "step": 14320 + }, + { + "epoch": 2.75, + "learning_rate": 2.5493347046345693e-05, + "loss": 0.7525, + "step": 14340 + }, + { + "epoch": 2.75, + "learning_rate": 2.5107668573632446e-05, + "loss": 0.7662, + "step": 14360 + }, + { + "epoch": 2.76, + "learning_rate": 2.47219901009192e-05, + "loss": 0.7583, + "step": 14380 + }, + { + "epoch": 2.76, + "learning_rate": 2.433631162820595e-05, + "loss": 0.7442, + "step": 14400 + }, + { + "epoch": 2.76, + "eval_loss": 0.7883238196372986, + "eval_runtime": 16.474, + "eval_samples_per_second": 121.403, + "eval_steps_per_second": 1.942, + "step": 14400 + }, + { + "epoch": 2.76, + "learning_rate": 2.3950633155492702e-05, + "loss": 0.7612, + "step": 14420 + }, + { + "epoch": 2.77, + "learning_rate": 2.3564954682779455e-05, + "loss": 0.7571, + "step": 14440 + }, + { + "epoch": 2.77, + "learning_rate": 2.3179276210066205e-05, + "loss": 0.7511, + "step": 14460 + }, + { + "epoch": 2.77, + "learning_rate": 2.2793597737352958e-05, + "loss": 0.7567, + "step": 14480 + }, + { + "epoch": 2.78, + "learning_rate": 2.2407919264639708e-05, + "loss": 0.7555, + "step": 14500 + }, + { + "epoch": 2.78, + "learning_rate": 2.202224079192646e-05, + "loss": 0.7555, + "step": 14520 + }, + { + "epoch": 2.79, + "learning_rate": 2.1636562319213214e-05, + "loss": 0.7509, + "step": 14540 + }, + { + "epoch": 2.79, + "learning_rate": 2.1250883846499964e-05, + "loss": 0.7585, + "step": 14560 + }, + { + "epoch": 2.79, + "learning_rate": 2.0865205373786717e-05, + "loss": 0.7621, + "step": 14580 + }, + { + "epoch": 2.8, + "learning_rate": 2.047952690107347e-05, + "loss": 0.7601, + "step": 14600 + }, + { + "epoch": 2.8, + "eval_loss": 0.7880419492721558, + "eval_runtime": 16.6163, + "eval_samples_per_second": 120.364, + "eval_steps_per_second": 1.926, + "step": 14600 + }, + { + "epoch": 2.8, + "learning_rate": 2.009384842836022e-05, + "loss": 0.7574, + "step": 14620 + }, + { + "epoch": 2.81, + "learning_rate": 1.9708169955646973e-05, + "loss": 0.7538, + "step": 14640 + }, + { + "epoch": 2.81, + "learning_rate": 1.9322491482933726e-05, + "loss": 0.7611, + "step": 14660 + }, + { + "epoch": 2.81, + "learning_rate": 1.8936813010220476e-05, + "loss": 0.7519, + "step": 14680 + }, + { + "epoch": 2.82, + "learning_rate": 1.855113453750723e-05, + "loss": 0.7559, + "step": 14700 + }, + { + "epoch": 2.82, + "learning_rate": 1.8165456064793982e-05, + "loss": 0.7596, + "step": 14720 + }, + { + "epoch": 2.82, + "learning_rate": 1.7779777592080735e-05, + "loss": 0.7564, + "step": 14740 + }, + { + "epoch": 2.83, + "learning_rate": 1.7394099119367485e-05, + "loss": 0.7526, + "step": 14760 + }, + { + "epoch": 2.83, + "learning_rate": 1.7008420646654238e-05, + "loss": 0.7624, + "step": 14780 + }, + { + "epoch": 2.84, + "learning_rate": 1.662274217394099e-05, + "loss": 0.7569, + "step": 14800 + }, + { + "epoch": 2.84, + "eval_loss": 0.7879504561424255, + "eval_runtime": 16.5411, + "eval_samples_per_second": 120.911, + "eval_steps_per_second": 1.935, + "step": 14800 + }, + { + "epoch": 2.84, + "learning_rate": 1.623706370122774e-05, + "loss": 0.7543, + "step": 14820 + }, + { + "epoch": 2.84, + "learning_rate": 1.5851385228514494e-05, + "loss": 0.7533, + "step": 14840 + }, + { + "epoch": 2.85, + "learning_rate": 1.5465706755801247e-05, + "loss": 0.7579, + "step": 14860 + }, + { + "epoch": 2.85, + "learning_rate": 1.5080028283087997e-05, + "loss": 0.7638, + "step": 14880 + }, + { + "epoch": 2.85, + "learning_rate": 1.469434981037475e-05, + "loss": 0.7456, + "step": 14900 + }, + { + "epoch": 2.86, + "learning_rate": 1.4308671337661502e-05, + "loss": 0.7561, + "step": 14920 + }, + { + "epoch": 2.86, + "learning_rate": 1.3922992864948253e-05, + "loss": 0.7626, + "step": 14940 + }, + { + "epoch": 2.87, + "learning_rate": 1.3537314392235005e-05, + "loss": 0.7686, + "step": 14960 + }, + { + "epoch": 2.87, + "learning_rate": 1.3151635919521758e-05, + "loss": 0.7512, + "step": 14980 + }, + { + "epoch": 2.87, + "learning_rate": 1.276595744680851e-05, + "loss": 0.7526, + "step": 15000 + }, + { + "epoch": 2.87, + "eval_loss": 0.7875809073448181, + "eval_runtime": 16.5086, + "eval_samples_per_second": 121.149, + "eval_steps_per_second": 1.938, + "step": 15000 + }, + { + "epoch": 2.88, + "learning_rate": 1.238027897409526e-05, + "loss": 0.7582, + "step": 15020 + }, + { + "epoch": 2.88, + "learning_rate": 1.1994600501382012e-05, + "loss": 0.7584, + "step": 15040 + }, + { + "epoch": 2.89, + "learning_rate": 1.1608922028668765e-05, + "loss": 0.7643, + "step": 15060 + }, + { + "epoch": 2.89, + "learning_rate": 1.1223243555955517e-05, + "loss": 0.7693, + "step": 15080 + }, + { + "epoch": 2.89, + "learning_rate": 1.0837565083242268e-05, + "loss": 0.7591, + "step": 15100 + }, + { + "epoch": 2.9, + "learning_rate": 1.0451886610529021e-05, + "loss": 0.7482, + "step": 15120 + }, + { + "epoch": 2.9, + "learning_rate": 1.0066208137815773e-05, + "loss": 0.7553, + "step": 15140 + }, + { + "epoch": 2.9, + "learning_rate": 9.680529665102524e-06, + "loss": 0.7563, + "step": 15160 + }, + { + "epoch": 2.91, + "learning_rate": 9.294851192389277e-06, + "loss": 0.7639, + "step": 15180 + }, + { + "epoch": 2.91, + "learning_rate": 8.909172719676029e-06, + "loss": 0.7577, + "step": 15200 + }, + { + "epoch": 2.91, + "eval_loss": 0.7872186303138733, + "eval_runtime": 16.5027, + "eval_samples_per_second": 121.192, + "eval_steps_per_second": 1.939, + "step": 15200 + } + ], + "max_steps": 15657, + "num_train_epochs": 3, + "total_flos": 3.951639155229852e+19, + "trial_name": null, + "trial_params": null +} diff --git a/adapters/saved-alpaca-belle-cot7b/checkpoint-15200/training_args.bin b/adapters/saved-alpaca-belle-cot7b/checkpoint-15200/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..cd8a096e4fd3ba848cec18e7c5691ebcb18ad76b --- /dev/null +++ b/adapters/saved-alpaca-belle-cot7b/checkpoint-15200/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5a580d27270395c94c3ef0dba9604e87b9d9eebe09ad2bc995408d9ab207ebfd +size 3643 diff --git a/adapters/saved-alpaca-belle-cot7b/checkpoint-15400/optimizer.pt b/adapters/saved-alpaca-belle-cot7b/checkpoint-15400/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..74ae81efd70f793bf0591bfc8d42937cc8a95892 --- /dev/null +++ b/adapters/saved-alpaca-belle-cot7b/checkpoint-15400/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d518ecd138065bfe01e66e1de4e597ef4c4c5267e970fa02fc1a2ccef062294c +size 33629893 diff --git a/adapters/saved-alpaca-belle-cot7b/checkpoint-15400/pytorch_model.bin b/adapters/saved-alpaca-belle-cot7b/checkpoint-15400/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..962a4e9f2fde9d6147430aa196d6049b2d3ca094 --- /dev/null +++ b/adapters/saved-alpaca-belle-cot7b/checkpoint-15400/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:36a7c5217ea387e6a6486b2dd75e5411a017c05bce0a9e27dc48f97a0a411553 +size 16822989 diff --git a/adapters/saved-alpaca-belle-cot7b/checkpoint-15400/rng_state_0.pth b/adapters/saved-alpaca-belle-cot7b/checkpoint-15400/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..cd0ed6f5423844b24960647419f31d2ef84ab728 --- /dev/null +++ b/adapters/saved-alpaca-belle-cot7b/checkpoint-15400/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4b9049304ebee5fcc978bdbfdcad76d4ab1f664cdcb2108962bf1fa084f72551 +size 14583 diff --git a/adapters/saved-alpaca-belle-cot7b/checkpoint-15400/rng_state_1.pth b/adapters/saved-alpaca-belle-cot7b/checkpoint-15400/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..7a5320b7bc4f308b0f60cce3d61d5900e6e5c50b --- /dev/null +++ b/adapters/saved-alpaca-belle-cot7b/checkpoint-15400/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0136d727de42049931107ddb8106013547ce13ef091f2962616a1c5c0900e5cd +size 14583 diff --git a/adapters/saved-alpaca-belle-cot7b/checkpoint-15400/rng_state_2.pth b/adapters/saved-alpaca-belle-cot7b/checkpoint-15400/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..d8c09d8cec29914fde5751b64c05f112fe3bfe08 --- /dev/null +++ b/adapters/saved-alpaca-belle-cot7b/checkpoint-15400/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d72975d746f72a95f0d9d92a5f147de3cac64881032f82f4d5bf6f0d8dfae88b +size 14583 diff --git a/adapters/saved-alpaca-belle-cot7b/checkpoint-15400/rng_state_3.pth b/adapters/saved-alpaca-belle-cot7b/checkpoint-15400/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..f0e6fe6696d32d15ccd24d3238c9d78325f93985 --- /dev/null +++ b/adapters/saved-alpaca-belle-cot7b/checkpoint-15400/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:92f6f840a2fa3574937879224a6955c903713cb58b58a027ccd1e5a57abc2ce1 +size 14583 diff --git a/adapters/saved-alpaca-belle-cot7b/checkpoint-15400/rng_state_4.pth b/adapters/saved-alpaca-belle-cot7b/checkpoint-15400/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..8562bb28d104601eec98f265ae80f3b0053f2787 --- /dev/null +++ b/adapters/saved-alpaca-belle-cot7b/checkpoint-15400/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:662cb2bada5878781b452bf720fbfeb8e31ebdb814ce6566c90a5e86272fff8e +size 14583 diff --git a/adapters/saved-alpaca-belle-cot7b/checkpoint-15400/rng_state_5.pth b/adapters/saved-alpaca-belle-cot7b/checkpoint-15400/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..e1dc27604a3e51a64f5e5b8b9180fe9d1cff6f36 --- /dev/null +++ b/adapters/saved-alpaca-belle-cot7b/checkpoint-15400/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2628680e1f8e3ec4599d6dc81b5ca42da7dbd6411bad421d7296e1e438520570 +size 14583 diff --git a/adapters/saved-alpaca-belle-cot7b/checkpoint-15400/rng_state_6.pth b/adapters/saved-alpaca-belle-cot7b/checkpoint-15400/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..ee480bd4945d4625c2009c44b7989de63c12e624 --- /dev/null +++ b/adapters/saved-alpaca-belle-cot7b/checkpoint-15400/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0e052a1b584ce8d59130e15cd2b84ffe30e6577de49e26a92222fd215de11cdf +size 14583 diff --git a/adapters/saved-alpaca-belle-cot7b/checkpoint-15400/rng_state_7.pth b/adapters/saved-alpaca-belle-cot7b/checkpoint-15400/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..8c9e0f1b7271182a3ac68c68cc8e1448d0b70e8d --- /dev/null +++ b/adapters/saved-alpaca-belle-cot7b/checkpoint-15400/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:940839e9e855eebbf800019cc3793de8d286a103a71a29869942853d6ebd4872 +size 14583 diff --git a/adapters/saved-alpaca-belle-cot7b/checkpoint-15400/scaler.pt b/adapters/saved-alpaca-belle-cot7b/checkpoint-15400/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..b143803db29742fb39194beea6b7f82157cd7358 --- /dev/null +++ b/adapters/saved-alpaca-belle-cot7b/checkpoint-15400/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b2f07c498624a59240958b09486c1e343abe160118142124ffa7e205a4e8f0da +size 557 diff --git a/adapters/saved-alpaca-belle-cot7b/checkpoint-15400/scheduler.pt b/adapters/saved-alpaca-belle-cot7b/checkpoint-15400/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..a2d84c0450960622027bef0b6ade2214faeccaf1 --- /dev/null +++ b/adapters/saved-alpaca-belle-cot7b/checkpoint-15400/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:90ce2f3b458c2d1f7d2f417b91b14c2552d8943584e19cec21f7737d05cec068 +size 627 diff --git a/adapters/saved-alpaca-belle-cot7b/checkpoint-15400/trainer_state.json b/adapters/saved-alpaca-belle-cot7b/checkpoint-15400/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..a9993c38941974e88ba1c0f88d97711a701049b9 --- /dev/null +++ b/adapters/saved-alpaca-belle-cot7b/checkpoint-15400/trainer_state.json @@ -0,0 +1,5252 @@ +{ + "best_metric": 0.7872186303138733, + "best_model_checkpoint": "/mnt/bn/qingyi-bn-lq/llama/saved-alpaca-belle-cot7b/checkpoint-15200", + "epoch": 2.950756849971259, + "global_step": 15400, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 5.9999999999999995e-05, + "loss": 1.8213, + "step": 20 + }, + { + "epoch": 0.01, + "learning_rate": 0.00011999999999999999, + "loss": 1.5494, + "step": 40 + }, + { + "epoch": 0.01, + "learning_rate": 0.00017999999999999998, + "loss": 1.179, + "step": 60 + }, + { + "epoch": 0.02, + "learning_rate": 0.00023999999999999998, + "loss": 1.1022, + "step": 80 + }, + { + "epoch": 0.02, + "learning_rate": 0.0003, + "loss": 1.078, + "step": 100 + }, + { + "epoch": 0.02, + "learning_rate": 0.00029961432152728675, + "loss": 1.0347, + "step": 120 + }, + { + "epoch": 0.03, + "learning_rate": 0.0002992286430545735, + "loss": 1.0169, + "step": 140 + }, + { + "epoch": 0.03, + "learning_rate": 0.00029884296458186025, + "loss": 1.0088, + "step": 160 + }, + { + "epoch": 0.03, + "learning_rate": 0.00029845728610914697, + "loss": 0.9896, + "step": 180 + }, + { + "epoch": 0.04, + "learning_rate": 0.00029807160763643375, + "loss": 0.99, + "step": 200 + }, + { + "epoch": 0.04, + "eval_loss": 1.0032634735107422, + "eval_runtime": 16.3952, + "eval_samples_per_second": 121.987, + "eval_steps_per_second": 1.952, + "step": 200 + }, + { + "epoch": 0.04, + "learning_rate": 0.00029768592916372047, + "loss": 0.9724, + "step": 220 + }, + { + "epoch": 0.05, + "learning_rate": 0.00029730025069100725, + "loss": 0.9719, + "step": 240 + }, + { + "epoch": 0.05, + "learning_rate": 0.00029691457221829397, + "loss": 0.9652, + "step": 260 + }, + { + "epoch": 0.05, + "learning_rate": 0.00029652889374558075, + "loss": 0.9579, + "step": 280 + }, + { + "epoch": 0.06, + "learning_rate": 0.00029614321527286747, + "loss": 0.9532, + "step": 300 + }, + { + "epoch": 0.06, + "learning_rate": 0.00029575753680015425, + "loss": 0.9613, + "step": 320 + }, + { + "epoch": 0.07, + "learning_rate": 0.000295371858327441, + "loss": 0.9473, + "step": 340 + }, + { + "epoch": 0.07, + "learning_rate": 0.00029498617985472775, + "loss": 0.9416, + "step": 360 + }, + { + "epoch": 0.07, + "learning_rate": 0.0002946005013820145, + "loss": 0.9386, + "step": 380 + }, + { + "epoch": 0.08, + "learning_rate": 0.00029421482290930125, + "loss": 0.9338, + "step": 400 + }, + { + "epoch": 0.08, + "eval_loss": 0.957970380783081, + "eval_runtime": 16.3897, + "eval_samples_per_second": 122.028, + "eval_steps_per_second": 1.952, + "step": 400 + }, + { + "epoch": 0.08, + "learning_rate": 0.000293829144436588, + "loss": 0.937, + "step": 420 + }, + { + "epoch": 0.08, + "learning_rate": 0.00029344346596387475, + "loss": 0.9304, + "step": 440 + }, + { + "epoch": 0.09, + "learning_rate": 0.0002930577874911615, + "loss": 0.9323, + "step": 460 + }, + { + "epoch": 0.09, + "learning_rate": 0.00029267210901844825, + "loss": 0.9185, + "step": 480 + }, + { + "epoch": 0.1, + "learning_rate": 0.000292286430545735, + "loss": 0.9273, + "step": 500 + }, + { + "epoch": 0.1, + "learning_rate": 0.00029190075207302175, + "loss": 0.922, + "step": 520 + }, + { + "epoch": 0.1, + "learning_rate": 0.0002915150736003085, + "loss": 0.9146, + "step": 540 + }, + { + "epoch": 0.11, + "learning_rate": 0.00029112939512759525, + "loss": 0.9129, + "step": 560 + }, + { + "epoch": 0.11, + "learning_rate": 0.000290743716654882, + "loss": 0.9146, + "step": 580 + }, + { + "epoch": 0.11, + "learning_rate": 0.0002903580381821688, + "loss": 0.9078, + "step": 600 + }, + { + "epoch": 0.11, + "eval_loss": 0.9345074892044067, + "eval_runtime": 16.4049, + "eval_samples_per_second": 121.914, + "eval_steps_per_second": 1.951, + "step": 600 + }, + { + "epoch": 0.12, + "learning_rate": 0.0002899723597094555, + "loss": 0.9004, + "step": 620 + }, + { + "epoch": 0.12, + "learning_rate": 0.0002895866812367423, + "loss": 0.9042, + "step": 640 + }, + { + "epoch": 0.13, + "learning_rate": 0.000289201002764029, + "loss": 0.9028, + "step": 660 + }, + { + "epoch": 0.13, + "learning_rate": 0.0002888153242913158, + "loss": 0.8889, + "step": 680 + }, + { + "epoch": 0.13, + "learning_rate": 0.0002884296458186025, + "loss": 0.8935, + "step": 700 + }, + { + "epoch": 0.14, + "learning_rate": 0.0002880439673458893, + "loss": 0.9024, + "step": 720 + }, + { + "epoch": 0.14, + "learning_rate": 0.000287658288873176, + "loss": 0.8922, + "step": 740 + }, + { + "epoch": 0.15, + "learning_rate": 0.0002872726104004628, + "loss": 0.8896, + "step": 760 + }, + { + "epoch": 0.15, + "learning_rate": 0.0002868869319277495, + "loss": 0.8907, + "step": 780 + }, + { + "epoch": 0.15, + "learning_rate": 0.0002865012534550363, + "loss": 0.8922, + "step": 800 + }, + { + "epoch": 0.15, + "eval_loss": 0.9149895310401917, + "eval_runtime": 16.4499, + "eval_samples_per_second": 121.581, + "eval_steps_per_second": 1.945, + "step": 800 + }, + { + "epoch": 0.16, + "learning_rate": 0.0002861155749823231, + "loss": 0.8867, + "step": 820 + }, + { + "epoch": 0.16, + "learning_rate": 0.0002857298965096098, + "loss": 0.891, + "step": 840 + }, + { + "epoch": 0.16, + "learning_rate": 0.00028534421803689657, + "loss": 0.8882, + "step": 860 + }, + { + "epoch": 0.17, + "learning_rate": 0.0002849585395641833, + "loss": 0.8835, + "step": 880 + }, + { + "epoch": 0.17, + "learning_rate": 0.00028457286109147007, + "loss": 0.8798, + "step": 900 + }, + { + "epoch": 0.18, + "learning_rate": 0.0002841871826187568, + "loss": 0.8784, + "step": 920 + }, + { + "epoch": 0.18, + "learning_rate": 0.00028380150414604357, + "loss": 0.8841, + "step": 940 + }, + { + "epoch": 0.18, + "learning_rate": 0.0002834158256733303, + "loss": 0.8787, + "step": 960 + }, + { + "epoch": 0.19, + "learning_rate": 0.00028303014720061707, + "loss": 0.8693, + "step": 980 + }, + { + "epoch": 0.19, + "learning_rate": 0.0002826444687279038, + "loss": 0.8711, + "step": 1000 + }, + { + "epoch": 0.19, + "eval_loss": 0.9027432799339294, + "eval_runtime": 16.447, + "eval_samples_per_second": 121.603, + "eval_steps_per_second": 1.946, + "step": 1000 + }, + { + "epoch": 0.2, + "learning_rate": 0.00028225879025519057, + "loss": 0.876, + "step": 1020 + }, + { + "epoch": 0.2, + "learning_rate": 0.0002818731117824773, + "loss": 0.8749, + "step": 1040 + }, + { + "epoch": 0.2, + "learning_rate": 0.00028148743330976407, + "loss": 0.877, + "step": 1060 + }, + { + "epoch": 0.21, + "learning_rate": 0.00028110175483705085, + "loss": 0.8754, + "step": 1080 + }, + { + "epoch": 0.21, + "learning_rate": 0.00028071607636433757, + "loss": 0.8792, + "step": 1100 + }, + { + "epoch": 0.21, + "learning_rate": 0.00028033039789162435, + "loss": 0.8701, + "step": 1120 + }, + { + "epoch": 0.22, + "learning_rate": 0.00027994471941891107, + "loss": 0.8667, + "step": 1140 + }, + { + "epoch": 0.22, + "learning_rate": 0.00027955904094619785, + "loss": 0.8769, + "step": 1160 + }, + { + "epoch": 0.23, + "learning_rate": 0.00027917336247348457, + "loss": 0.8734, + "step": 1180 + }, + { + "epoch": 0.23, + "learning_rate": 0.0002787876840007713, + "loss": 0.8708, + "step": 1200 + }, + { + "epoch": 0.23, + "eval_loss": 0.8911536335945129, + "eval_runtime": 16.423, + "eval_samples_per_second": 121.78, + "eval_steps_per_second": 1.948, + "step": 1200 + }, + { + "epoch": 0.23, + "learning_rate": 0.00027840200552805807, + "loss": 0.8673, + "step": 1220 + }, + { + "epoch": 0.24, + "learning_rate": 0.00027801632705534485, + "loss": 0.8618, + "step": 1240 + }, + { + "epoch": 0.24, + "learning_rate": 0.00027763064858263157, + "loss": 0.8739, + "step": 1260 + }, + { + "epoch": 0.25, + "learning_rate": 0.00027724497010991834, + "loss": 0.8608, + "step": 1280 + }, + { + "epoch": 0.25, + "learning_rate": 0.0002768592916372051, + "loss": 0.8631, + "step": 1300 + }, + { + "epoch": 0.25, + "learning_rate": 0.00027647361316449184, + "loss": 0.8547, + "step": 1320 + }, + { + "epoch": 0.26, + "learning_rate": 0.0002760879346917786, + "loss": 0.8589, + "step": 1340 + }, + { + "epoch": 0.26, + "learning_rate": 0.00027570225621906534, + "loss": 0.8615, + "step": 1360 + }, + { + "epoch": 0.26, + "learning_rate": 0.0002753165777463521, + "loss": 0.8644, + "step": 1380 + }, + { + "epoch": 0.27, + "learning_rate": 0.00027493089927363884, + "loss": 0.8524, + "step": 1400 + }, + { + "epoch": 0.27, + "eval_loss": 0.8813066482543945, + "eval_runtime": 16.4628, + "eval_samples_per_second": 121.486, + "eval_steps_per_second": 1.944, + "step": 1400 + }, + { + "epoch": 0.27, + "learning_rate": 0.00027454522080092557, + "loss": 0.8562, + "step": 1420 + }, + { + "epoch": 0.28, + "learning_rate": 0.00027415954232821234, + "loss": 0.8547, + "step": 1440 + }, + { + "epoch": 0.28, + "learning_rate": 0.0002737738638554991, + "loss": 0.8599, + "step": 1460 + }, + { + "epoch": 0.28, + "learning_rate": 0.00027338818538278584, + "loss": 0.8491, + "step": 1480 + }, + { + "epoch": 0.29, + "learning_rate": 0.0002730025069100726, + "loss": 0.8496, + "step": 1500 + }, + { + "epoch": 0.29, + "learning_rate": 0.0002726168284373594, + "loss": 0.8594, + "step": 1520 + }, + { + "epoch": 0.3, + "learning_rate": 0.0002722311499646461, + "loss": 0.8512, + "step": 1540 + }, + { + "epoch": 0.3, + "learning_rate": 0.0002718454714919329, + "loss": 0.8441, + "step": 1560 + }, + { + "epoch": 0.3, + "learning_rate": 0.0002714597930192196, + "loss": 0.8621, + "step": 1580 + }, + { + "epoch": 0.31, + "learning_rate": 0.0002710741145465064, + "loss": 0.8525, + "step": 1600 + }, + { + "epoch": 0.31, + "eval_loss": 0.8728711009025574, + "eval_runtime": 16.4389, + "eval_samples_per_second": 121.663, + "eval_steps_per_second": 1.947, + "step": 1600 + }, + { + "epoch": 0.31, + "learning_rate": 0.0002706884360737931, + "loss": 0.852, + "step": 1620 + }, + { + "epoch": 0.31, + "learning_rate": 0.00027030275760107984, + "loss": 0.8553, + "step": 1640 + }, + { + "epoch": 0.32, + "learning_rate": 0.0002699170791283666, + "loss": 0.8445, + "step": 1660 + }, + { + "epoch": 0.32, + "learning_rate": 0.0002695314006556534, + "loss": 0.8518, + "step": 1680 + }, + { + "epoch": 0.33, + "learning_rate": 0.0002691457221829401, + "loss": 0.8318, + "step": 1700 + }, + { + "epoch": 0.33, + "learning_rate": 0.0002687600437102269, + "loss": 0.8492, + "step": 1720 + }, + { + "epoch": 0.33, + "learning_rate": 0.0002683743652375136, + "loss": 0.8475, + "step": 1740 + }, + { + "epoch": 0.34, + "learning_rate": 0.0002679886867648004, + "loss": 0.8437, + "step": 1760 + }, + { + "epoch": 0.34, + "learning_rate": 0.00026760300829208717, + "loss": 0.8355, + "step": 1780 + }, + { + "epoch": 0.34, + "learning_rate": 0.0002672173298193739, + "loss": 0.8486, + "step": 1800 + }, + { + "epoch": 0.34, + "eval_loss": 0.8663893938064575, + "eval_runtime": 16.4511, + "eval_samples_per_second": 121.572, + "eval_steps_per_second": 1.945, + "step": 1800 + }, + { + "epoch": 0.35, + "learning_rate": 0.00026683165134666067, + "loss": 0.8449, + "step": 1820 + }, + { + "epoch": 0.35, + "learning_rate": 0.0002664459728739474, + "loss": 0.853, + "step": 1840 + }, + { + "epoch": 0.36, + "learning_rate": 0.00026606029440123417, + "loss": 0.8472, + "step": 1860 + }, + { + "epoch": 0.36, + "learning_rate": 0.0002656746159285209, + "loss": 0.83, + "step": 1880 + }, + { + "epoch": 0.36, + "learning_rate": 0.0002652889374558076, + "loss": 0.8398, + "step": 1900 + }, + { + "epoch": 0.37, + "learning_rate": 0.0002649032589830944, + "loss": 0.8337, + "step": 1920 + }, + { + "epoch": 0.37, + "learning_rate": 0.00026451758051038117, + "loss": 0.8314, + "step": 1940 + }, + { + "epoch": 0.38, + "learning_rate": 0.0002641319020376679, + "loss": 0.8314, + "step": 1960 + }, + { + "epoch": 0.38, + "learning_rate": 0.00026374622356495467, + "loss": 0.845, + "step": 1980 + }, + { + "epoch": 0.38, + "learning_rate": 0.00026336054509224144, + "loss": 0.8294, + "step": 2000 + }, + { + "epoch": 0.38, + "eval_loss": 0.8619188666343689, + "eval_runtime": 16.4444, + "eval_samples_per_second": 121.622, + "eval_steps_per_second": 1.946, + "step": 2000 + }, + { + "epoch": 0.39, + "learning_rate": 0.00026297486661952817, + "loss": 0.8404, + "step": 2020 + }, + { + "epoch": 0.39, + "learning_rate": 0.00026258918814681494, + "loss": 0.839, + "step": 2040 + }, + { + "epoch": 0.39, + "learning_rate": 0.00026220350967410167, + "loss": 0.84, + "step": 2060 + }, + { + "epoch": 0.4, + "learning_rate": 0.00026181783120138844, + "loss": 0.8442, + "step": 2080 + }, + { + "epoch": 0.4, + "learning_rate": 0.00026143215272867517, + "loss": 0.8443, + "step": 2100 + }, + { + "epoch": 0.41, + "learning_rate": 0.0002610464742559619, + "loss": 0.8301, + "step": 2120 + }, + { + "epoch": 0.41, + "learning_rate": 0.00026066079578324867, + "loss": 0.8302, + "step": 2140 + }, + { + "epoch": 0.41, + "learning_rate": 0.00026027511731053544, + "loss": 0.836, + "step": 2160 + }, + { + "epoch": 0.42, + "learning_rate": 0.00025988943883782216, + "loss": 0.8277, + "step": 2180 + }, + { + "epoch": 0.42, + "learning_rate": 0.00025950376036510894, + "loss": 0.8335, + "step": 2200 + }, + { + "epoch": 0.42, + "eval_loss": 0.8562669157981873, + "eval_runtime": 16.4486, + "eval_samples_per_second": 121.591, + "eval_steps_per_second": 1.945, + "step": 2200 + }, + { + "epoch": 0.43, + "learning_rate": 0.00025911808189239566, + "loss": 0.8267, + "step": 2220 + }, + { + "epoch": 0.43, + "learning_rate": 0.00025873240341968244, + "loss": 0.8267, + "step": 2240 + }, + { + "epoch": 0.43, + "learning_rate": 0.0002583467249469692, + "loss": 0.8293, + "step": 2260 + }, + { + "epoch": 0.44, + "learning_rate": 0.00025796104647425594, + "loss": 0.836, + "step": 2280 + }, + { + "epoch": 0.44, + "learning_rate": 0.0002575753680015427, + "loss": 0.8255, + "step": 2300 + }, + { + "epoch": 0.44, + "learning_rate": 0.00025718968952882944, + "loss": 0.8177, + "step": 2320 + }, + { + "epoch": 0.45, + "learning_rate": 0.00025680401105611616, + "loss": 0.8272, + "step": 2340 + }, + { + "epoch": 0.45, + "learning_rate": 0.00025641833258340294, + "loss": 0.831, + "step": 2360 + }, + { + "epoch": 0.46, + "learning_rate": 0.00025603265411068966, + "loss": 0.819, + "step": 2380 + }, + { + "epoch": 0.46, + "learning_rate": 0.00025564697563797644, + "loss": 0.8216, + "step": 2400 + }, + { + "epoch": 0.46, + "eval_loss": 0.8516544103622437, + "eval_runtime": 16.476, + "eval_samples_per_second": 121.389, + "eval_steps_per_second": 1.942, + "step": 2400 + }, + { + "epoch": 0.46, + "learning_rate": 0.0002552612971652632, + "loss": 0.8305, + "step": 2420 + }, + { + "epoch": 0.47, + "learning_rate": 0.00025487561869254994, + "loss": 0.8305, + "step": 2440 + }, + { + "epoch": 0.47, + "learning_rate": 0.0002544899402198367, + "loss": 0.8302, + "step": 2460 + }, + { + "epoch": 0.48, + "learning_rate": 0.0002541042617471235, + "loss": 0.824, + "step": 2480 + }, + { + "epoch": 0.48, + "learning_rate": 0.0002537185832744102, + "loss": 0.8315, + "step": 2500 + }, + { + "epoch": 0.48, + "learning_rate": 0.000253332904801697, + "loss": 0.8224, + "step": 2520 + }, + { + "epoch": 0.49, + "learning_rate": 0.0002529472263289837, + "loss": 0.8229, + "step": 2540 + }, + { + "epoch": 0.49, + "learning_rate": 0.00025256154785627044, + "loss": 0.8156, + "step": 2560 + }, + { + "epoch": 0.49, + "learning_rate": 0.0002521758693835572, + "loss": 0.8319, + "step": 2580 + }, + { + "epoch": 0.5, + "learning_rate": 0.00025179019091084394, + "loss": 0.8222, + "step": 2600 + }, + { + "epoch": 0.5, + "eval_loss": 0.8481459021568298, + "eval_runtime": 16.453, + "eval_samples_per_second": 121.558, + "eval_steps_per_second": 1.945, + "step": 2600 + }, + { + "epoch": 0.5, + "learning_rate": 0.0002514045124381307, + "loss": 0.8205, + "step": 2620 + }, + { + "epoch": 0.51, + "learning_rate": 0.0002510188339654175, + "loss": 0.8267, + "step": 2640 + }, + { + "epoch": 0.51, + "learning_rate": 0.0002506331554927042, + "loss": 0.8116, + "step": 2660 + }, + { + "epoch": 0.51, + "learning_rate": 0.000250247477019991, + "loss": 0.8239, + "step": 2680 + }, + { + "epoch": 0.52, + "learning_rate": 0.00024986179854727777, + "loss": 0.8126, + "step": 2700 + }, + { + "epoch": 0.52, + "learning_rate": 0.0002494761200745645, + "loss": 0.8226, + "step": 2720 + }, + { + "epoch": 0.53, + "learning_rate": 0.00024909044160185127, + "loss": 0.8173, + "step": 2740 + }, + { + "epoch": 0.53, + "learning_rate": 0.000248704763129138, + "loss": 0.8227, + "step": 2760 + }, + { + "epoch": 0.53, + "learning_rate": 0.0002483190846564247, + "loss": 0.8129, + "step": 2780 + }, + { + "epoch": 0.54, + "learning_rate": 0.0002479334061837115, + "loss": 0.8164, + "step": 2800 + }, + { + "epoch": 0.54, + "eval_loss": 0.8439643979072571, + "eval_runtime": 16.4767, + "eval_samples_per_second": 121.384, + "eval_steps_per_second": 1.942, + "step": 2800 + }, + { + "epoch": 0.54, + "learning_rate": 0.0002475477277109982, + "loss": 0.807, + "step": 2820 + }, + { + "epoch": 0.54, + "learning_rate": 0.000247162049238285, + "loss": 0.8126, + "step": 2840 + }, + { + "epoch": 0.55, + "learning_rate": 0.00024677637076557176, + "loss": 0.8193, + "step": 2860 + }, + { + "epoch": 0.55, + "learning_rate": 0.0002463906922928585, + "loss": 0.8091, + "step": 2880 + }, + { + "epoch": 0.56, + "learning_rate": 0.00024600501382014526, + "loss": 0.8147, + "step": 2900 + }, + { + "epoch": 0.56, + "learning_rate": 0.000245619335347432, + "loss": 0.8207, + "step": 2920 + }, + { + "epoch": 0.56, + "learning_rate": 0.00024523365687471876, + "loss": 0.8087, + "step": 2940 + }, + { + "epoch": 0.57, + "learning_rate": 0.00024484797840200554, + "loss": 0.8198, + "step": 2960 + }, + { + "epoch": 0.57, + "learning_rate": 0.00024446229992929226, + "loss": 0.8087, + "step": 2980 + }, + { + "epoch": 0.57, + "learning_rate": 0.000244076621456579, + "loss": 0.8182, + "step": 3000 + }, + { + "epoch": 0.57, + "eval_loss": 0.8408891558647156, + "eval_runtime": 16.4801, + "eval_samples_per_second": 121.358, + "eval_steps_per_second": 1.942, + "step": 3000 + }, + { + "epoch": 0.58, + "learning_rate": 0.0002436909429838658, + "loss": 0.8188, + "step": 3020 + }, + { + "epoch": 0.58, + "learning_rate": 0.0002433052645111525, + "loss": 0.8082, + "step": 3040 + }, + { + "epoch": 0.59, + "learning_rate": 0.00024291958603843926, + "loss": 0.8171, + "step": 3060 + }, + { + "epoch": 0.59, + "learning_rate": 0.000242533907565726, + "loss": 0.8088, + "step": 3080 + }, + { + "epoch": 0.59, + "learning_rate": 0.00024214822909301276, + "loss": 0.8148, + "step": 3100 + }, + { + "epoch": 0.6, + "learning_rate": 0.00024176255062029954, + "loss": 0.8122, + "step": 3120 + }, + { + "epoch": 0.6, + "learning_rate": 0.00024137687214758626, + "loss": 0.811, + "step": 3140 + }, + { + "epoch": 0.61, + "learning_rate": 0.00024099119367487304, + "loss": 0.8179, + "step": 3160 + }, + { + "epoch": 0.61, + "learning_rate": 0.0002406055152021598, + "loss": 0.8029, + "step": 3180 + }, + { + "epoch": 0.61, + "learning_rate": 0.0002402198367294465, + "loss": 0.8143, + "step": 3200 + }, + { + "epoch": 0.61, + "eval_loss": 0.837196946144104, + "eval_runtime": 16.4913, + "eval_samples_per_second": 121.276, + "eval_steps_per_second": 1.94, + "step": 3200 + }, + { + "epoch": 0.62, + "learning_rate": 0.0002398341582567333, + "loss": 0.7969, + "step": 3220 + }, + { + "epoch": 0.62, + "learning_rate": 0.00023944847978402, + "loss": 0.8158, + "step": 3240 + }, + { + "epoch": 0.62, + "learning_rate": 0.0002390628013113068, + "loss": 0.8019, + "step": 3260 + }, + { + "epoch": 0.63, + "learning_rate": 0.00023867712283859354, + "loss": 0.8042, + "step": 3280 + }, + { + "epoch": 0.63, + "learning_rate": 0.0002382914443658803, + "loss": 0.8022, + "step": 3300 + }, + { + "epoch": 0.64, + "learning_rate": 0.00023790576589316704, + "loss": 0.8043, + "step": 3320 + }, + { + "epoch": 0.64, + "learning_rate": 0.0002375200874204538, + "loss": 0.8106, + "step": 3340 + }, + { + "epoch": 0.64, + "learning_rate": 0.00023713440894774054, + "loss": 0.8146, + "step": 3360 + }, + { + "epoch": 0.65, + "learning_rate": 0.0002367487304750273, + "loss": 0.8004, + "step": 3380 + }, + { + "epoch": 0.65, + "learning_rate": 0.00023636305200231404, + "loss": 0.8096, + "step": 3400 + }, + { + "epoch": 0.65, + "eval_loss": 0.8347571492195129, + "eval_runtime": 16.4822, + "eval_samples_per_second": 121.343, + "eval_steps_per_second": 1.941, + "step": 3400 + }, + { + "epoch": 0.66, + "learning_rate": 0.0002359773735296008, + "loss": 0.8226, + "step": 3420 + }, + { + "epoch": 0.66, + "learning_rate": 0.00023559169505688756, + "loss": 0.8083, + "step": 3440 + }, + { + "epoch": 0.66, + "learning_rate": 0.00023520601658417428, + "loss": 0.8168, + "step": 3460 + }, + { + "epoch": 0.67, + "learning_rate": 0.00023482033811146106, + "loss": 0.8112, + "step": 3480 + }, + { + "epoch": 0.67, + "learning_rate": 0.0002344346596387478, + "loss": 0.8131, + "step": 3500 + }, + { + "epoch": 0.67, + "learning_rate": 0.00023404898116603456, + "loss": 0.8097, + "step": 3520 + }, + { + "epoch": 0.68, + "learning_rate": 0.0002336633026933213, + "loss": 0.804, + "step": 3540 + }, + { + "epoch": 0.68, + "learning_rate": 0.00023327762422060806, + "loss": 0.8085, + "step": 3560 + }, + { + "epoch": 0.69, + "learning_rate": 0.0002328919457478948, + "loss": 0.7992, + "step": 3580 + }, + { + "epoch": 0.69, + "learning_rate": 0.0002325062672751816, + "loss": 0.8124, + "step": 3600 + }, + { + "epoch": 0.69, + "eval_loss": 0.8324670791625977, + "eval_runtime": 16.4936, + "eval_samples_per_second": 121.259, + "eval_steps_per_second": 1.94, + "step": 3600 + }, + { + "epoch": 0.69, + "learning_rate": 0.0002321205888024683, + "loss": 0.8024, + "step": 3620 + }, + { + "epoch": 0.7, + "learning_rate": 0.0002317349103297551, + "loss": 0.8032, + "step": 3640 + }, + { + "epoch": 0.7, + "learning_rate": 0.00023134923185704184, + "loss": 0.8065, + "step": 3660 + }, + { + "epoch": 0.71, + "learning_rate": 0.00023096355338432856, + "loss": 0.8106, + "step": 3680 + }, + { + "epoch": 0.71, + "learning_rate": 0.00023057787491161534, + "loss": 0.8009, + "step": 3700 + }, + { + "epoch": 0.71, + "learning_rate": 0.00023019219643890206, + "loss": 0.816, + "step": 3720 + }, + { + "epoch": 0.72, + "learning_rate": 0.00022980651796618884, + "loss": 0.8103, + "step": 3740 + }, + { + "epoch": 0.72, + "learning_rate": 0.00022942083949347559, + "loss": 0.8099, + "step": 3760 + }, + { + "epoch": 0.72, + "learning_rate": 0.00022903516102076233, + "loss": 0.8085, + "step": 3780 + }, + { + "epoch": 0.73, + "learning_rate": 0.00022864948254804908, + "loss": 0.8044, + "step": 3800 + }, + { + "epoch": 0.73, + "eval_loss": 0.830141544342041, + "eval_runtime": 16.4845, + "eval_samples_per_second": 121.326, + "eval_steps_per_second": 1.941, + "step": 3800 + }, + { + "epoch": 0.73, + "learning_rate": 0.00022826380407533586, + "loss": 0.7969, + "step": 3820 + }, + { + "epoch": 0.74, + "learning_rate": 0.00022787812560262258, + "loss": 0.8029, + "step": 3840 + }, + { + "epoch": 0.74, + "learning_rate": 0.00022749244712990936, + "loss": 0.7921, + "step": 3860 + }, + { + "epoch": 0.74, + "learning_rate": 0.0002271067686571961, + "loss": 0.8051, + "step": 3880 + }, + { + "epoch": 0.75, + "learning_rate": 0.00022672109018448283, + "loss": 0.807, + "step": 3900 + }, + { + "epoch": 0.75, + "learning_rate": 0.0002263354117117696, + "loss": 0.8042, + "step": 3920 + }, + { + "epoch": 0.75, + "learning_rate": 0.00022594973323905633, + "loss": 0.7947, + "step": 3940 + }, + { + "epoch": 0.76, + "learning_rate": 0.0002255640547663431, + "loss": 0.7972, + "step": 3960 + }, + { + "epoch": 0.76, + "learning_rate": 0.00022517837629362986, + "loss": 0.8038, + "step": 3980 + }, + { + "epoch": 0.77, + "learning_rate": 0.0002247926978209166, + "loss": 0.8064, + "step": 4000 + }, + { + "epoch": 0.77, + "eval_loss": 0.828279435634613, + "eval_runtime": 16.4904, + "eval_samples_per_second": 121.283, + "eval_steps_per_second": 1.941, + "step": 4000 + }, + { + "epoch": 0.77, + "learning_rate": 0.00022440701934820336, + "loss": 0.8032, + "step": 4020 + }, + { + "epoch": 0.77, + "learning_rate": 0.00022402134087549014, + "loss": 0.7934, + "step": 4040 + }, + { + "epoch": 0.78, + "learning_rate": 0.00022363566240277686, + "loss": 0.7919, + "step": 4060 + }, + { + "epoch": 0.78, + "learning_rate": 0.00022324998393006364, + "loss": 0.8011, + "step": 4080 + }, + { + "epoch": 0.79, + "learning_rate": 0.00022286430545735036, + "loss": 0.8026, + "step": 4100 + }, + { + "epoch": 0.79, + "learning_rate": 0.0002224786269846371, + "loss": 0.804, + "step": 4120 + }, + { + "epoch": 0.79, + "learning_rate": 0.00022209294851192388, + "loss": 0.8122, + "step": 4140 + }, + { + "epoch": 0.8, + "learning_rate": 0.0002217072700392106, + "loss": 0.7932, + "step": 4160 + }, + { + "epoch": 0.8, + "learning_rate": 0.00022132159156649738, + "loss": 0.7911, + "step": 4180 + }, + { + "epoch": 0.8, + "learning_rate": 0.00022093591309378413, + "loss": 0.8012, + "step": 4200 + }, + { + "epoch": 0.8, + "eval_loss": 0.8261794447898865, + "eval_runtime": 16.4921, + "eval_samples_per_second": 121.27, + "eval_steps_per_second": 1.94, + "step": 4200 + }, + { + "epoch": 0.81, + "learning_rate": 0.00022055023462107088, + "loss": 0.7989, + "step": 4220 + }, + { + "epoch": 0.81, + "learning_rate": 0.00022016455614835763, + "loss": 0.8031, + "step": 4240 + }, + { + "epoch": 0.82, + "learning_rate": 0.00021977887767564438, + "loss": 0.8066, + "step": 4260 + }, + { + "epoch": 0.82, + "learning_rate": 0.00021939319920293113, + "loss": 0.7964, + "step": 4280 + }, + { + "epoch": 0.82, + "learning_rate": 0.0002190075207302179, + "loss": 0.7947, + "step": 4300 + }, + { + "epoch": 0.83, + "learning_rate": 0.00021862184225750463, + "loss": 0.8035, + "step": 4320 + }, + { + "epoch": 0.83, + "learning_rate": 0.00021823616378479138, + "loss": 0.8029, + "step": 4340 + }, + { + "epoch": 0.84, + "learning_rate": 0.00021785048531207816, + "loss": 0.7941, + "step": 4360 + }, + { + "epoch": 0.84, + "learning_rate": 0.00021746480683936488, + "loss": 0.7934, + "step": 4380 + }, + { + "epoch": 0.84, + "learning_rate": 0.00021707912836665166, + "loss": 0.7946, + "step": 4400 + }, + { + "epoch": 0.84, + "eval_loss": 0.823946475982666, + "eval_runtime": 16.4887, + "eval_samples_per_second": 121.295, + "eval_steps_per_second": 1.941, + "step": 4400 + }, + { + "epoch": 0.85, + "learning_rate": 0.00021669344989393838, + "loss": 0.7974, + "step": 4420 + }, + { + "epoch": 0.85, + "learning_rate": 0.00021630777142122516, + "loss": 0.7962, + "step": 4440 + }, + { + "epoch": 0.85, + "learning_rate": 0.0002159220929485119, + "loss": 0.7946, + "step": 4460 + }, + { + "epoch": 0.86, + "learning_rate": 0.00021553641447579866, + "loss": 0.7818, + "step": 4480 + }, + { + "epoch": 0.86, + "learning_rate": 0.0002151507360030854, + "loss": 0.803, + "step": 4500 + }, + { + "epoch": 0.87, + "learning_rate": 0.00021476505753037218, + "loss": 0.7851, + "step": 4520 + }, + { + "epoch": 0.87, + "learning_rate": 0.0002143793790576589, + "loss": 0.7984, + "step": 4540 + }, + { + "epoch": 0.87, + "learning_rate": 0.00021399370058494568, + "loss": 0.7973, + "step": 4560 + }, + { + "epoch": 0.88, + "learning_rate": 0.0002136080221122324, + "loss": 0.782, + "step": 4580 + }, + { + "epoch": 0.88, + "learning_rate": 0.00021322234363951916, + "loss": 0.7951, + "step": 4600 + }, + { + "epoch": 0.88, + "eval_loss": 0.8220962285995483, + "eval_runtime": 16.5191, + "eval_samples_per_second": 121.072, + "eval_steps_per_second": 1.937, + "step": 4600 + }, + { + "epoch": 0.89, + "learning_rate": 0.00021283666516680593, + "loss": 0.7947, + "step": 4620 + }, + { + "epoch": 0.89, + "learning_rate": 0.00021245098669409266, + "loss": 0.7957, + "step": 4640 + }, + { + "epoch": 0.89, + "learning_rate": 0.00021206530822137943, + "loss": 0.797, + "step": 4660 + }, + { + "epoch": 0.9, + "learning_rate": 0.00021167962974866618, + "loss": 0.8097, + "step": 4680 + }, + { + "epoch": 0.9, + "learning_rate": 0.00021129395127595293, + "loss": 0.7894, + "step": 4700 + }, + { + "epoch": 0.9, + "learning_rate": 0.00021090827280323968, + "loss": 0.7789, + "step": 4720 + }, + { + "epoch": 0.91, + "learning_rate": 0.0002105225943305264, + "loss": 0.7949, + "step": 4740 + }, + { + "epoch": 0.91, + "learning_rate": 0.00021013691585781318, + "loss": 0.7895, + "step": 4760 + }, + { + "epoch": 0.92, + "learning_rate": 0.00020975123738509996, + "loss": 0.8036, + "step": 4780 + }, + { + "epoch": 0.92, + "learning_rate": 0.00020936555891238668, + "loss": 0.7966, + "step": 4800 + }, + { + "epoch": 0.92, + "eval_loss": 0.8209095597267151, + "eval_runtime": 16.5035, + "eval_samples_per_second": 121.187, + "eval_steps_per_second": 1.939, + "step": 4800 + }, + { + "epoch": 0.92, + "learning_rate": 0.00020897988043967343, + "loss": 0.7892, + "step": 4820 + }, + { + "epoch": 0.93, + "learning_rate": 0.0002085942019669602, + "loss": 0.7825, + "step": 4840 + }, + { + "epoch": 0.93, + "learning_rate": 0.00020820852349424693, + "loss": 0.7937, + "step": 4860 + }, + { + "epoch": 0.94, + "learning_rate": 0.0002078228450215337, + "loss": 0.7893, + "step": 4880 + }, + { + "epoch": 0.94, + "learning_rate": 0.00020743716654882043, + "loss": 0.7944, + "step": 4900 + }, + { + "epoch": 0.94, + "learning_rate": 0.0002070514880761072, + "loss": 0.7973, + "step": 4920 + }, + { + "epoch": 0.95, + "learning_rate": 0.00020666580960339396, + "loss": 0.7919, + "step": 4940 + }, + { + "epoch": 0.95, + "learning_rate": 0.0002062801311306807, + "loss": 0.7918, + "step": 4960 + }, + { + "epoch": 0.95, + "learning_rate": 0.00020589445265796746, + "loss": 0.7901, + "step": 4980 + }, + { + "epoch": 0.96, + "learning_rate": 0.00020550877418525423, + "loss": 0.7891, + "step": 5000 + }, + { + "epoch": 0.96, + "eval_loss": 0.8192855715751648, + "eval_runtime": 16.5248, + "eval_samples_per_second": 121.03, + "eval_steps_per_second": 1.936, + "step": 5000 + }, + { + "epoch": 0.96, + "learning_rate": 0.00020512309571254096, + "loss": 0.7813, + "step": 5020 + }, + { + "epoch": 0.97, + "learning_rate": 0.0002047374172398277, + "loss": 0.7831, + "step": 5040 + }, + { + "epoch": 0.97, + "learning_rate": 0.00020435173876711445, + "loss": 0.7911, + "step": 5060 + }, + { + "epoch": 0.97, + "learning_rate": 0.0002039660602944012, + "loss": 0.7816, + "step": 5080 + }, + { + "epoch": 0.98, + "learning_rate": 0.00020358038182168798, + "loss": 0.7915, + "step": 5100 + }, + { + "epoch": 0.98, + "learning_rate": 0.0002031947033489747, + "loss": 0.791, + "step": 5120 + }, + { + "epoch": 0.98, + "learning_rate": 0.00020280902487626148, + "loss": 0.7851, + "step": 5140 + }, + { + "epoch": 0.99, + "learning_rate": 0.00020242334640354823, + "loss": 0.7859, + "step": 5160 + }, + { + "epoch": 0.99, + "learning_rate": 0.00020203766793083498, + "loss": 0.7888, + "step": 5180 + }, + { + "epoch": 1.0, + "learning_rate": 0.00020165198945812173, + "loss": 0.7854, + "step": 5200 + }, + { + "epoch": 1.0, + "eval_loss": 0.8173321485519409, + "eval_runtime": 16.5042, + "eval_samples_per_second": 121.182, + "eval_steps_per_second": 1.939, + "step": 5200 + }, + { + "epoch": 1.0, + "learning_rate": 0.0002012663109854085, + "loss": 0.7888, + "step": 5220 + }, + { + "epoch": 1.0, + "learning_rate": 0.00020088063251269523, + "loss": 0.7893, + "step": 5240 + }, + { + "epoch": 1.01, + "learning_rate": 0.00020049495403998198, + "loss": 0.7817, + "step": 5260 + }, + { + "epoch": 1.01, + "learning_rate": 0.00020010927556726873, + "loss": 0.7755, + "step": 5280 + }, + { + "epoch": 1.02, + "learning_rate": 0.00019972359709455548, + "loss": 0.7839, + "step": 5300 + }, + { + "epoch": 1.02, + "learning_rate": 0.00019933791862184226, + "loss": 0.7911, + "step": 5320 + }, + { + "epoch": 1.02, + "learning_rate": 0.00019895224014912898, + "loss": 0.7819, + "step": 5340 + }, + { + "epoch": 1.03, + "learning_rate": 0.00019856656167641576, + "loss": 0.7802, + "step": 5360 + }, + { + "epoch": 1.03, + "learning_rate": 0.0001981808832037025, + "loss": 0.7847, + "step": 5380 + }, + { + "epoch": 1.03, + "learning_rate": 0.00019779520473098925, + "loss": 0.7824, + "step": 5400 + }, + { + "epoch": 1.03, + "eval_loss": 0.8163856267929077, + "eval_runtime": 16.5306, + "eval_samples_per_second": 120.988, + "eval_steps_per_second": 1.936, + "step": 5400 + }, + { + "epoch": 1.04, + "learning_rate": 0.00019742881018191167, + "loss": 0.7757, + "step": 5420 + }, + { + "epoch": 1.04, + "learning_rate": 0.0001970431317091984, + "loss": 0.786, + "step": 5440 + }, + { + "epoch": 1.05, + "learning_rate": 0.00019665745323648517, + "loss": 0.7923, + "step": 5460 + }, + { + "epoch": 1.05, + "learning_rate": 0.00019627177476377191, + "loss": 0.791, + "step": 5480 + }, + { + "epoch": 1.05, + "learning_rate": 0.00019588609629105866, + "loss": 0.7863, + "step": 5500 + }, + { + "epoch": 1.06, + "learning_rate": 0.00019550041781834541, + "loss": 0.7879, + "step": 5520 + }, + { + "epoch": 1.06, + "learning_rate": 0.0001951147393456322, + "loss": 0.7924, + "step": 5540 + }, + { + "epoch": 1.07, + "learning_rate": 0.00019472906087291891, + "loss": 0.7918, + "step": 5560 + }, + { + "epoch": 1.07, + "learning_rate": 0.0001943433824002057, + "loss": 0.792, + "step": 5580 + }, + { + "epoch": 1.07, + "learning_rate": 0.0001939577039274924, + "loss": 0.7784, + "step": 5600 + }, + { + "epoch": 1.07, + "eval_loss": 0.8148436546325684, + "eval_runtime": 16.5424, + "eval_samples_per_second": 120.901, + "eval_steps_per_second": 1.934, + "step": 5600 + }, + { + "epoch": 1.08, + "learning_rate": 0.0001935720254547792, + "loss": 0.7903, + "step": 5620 + }, + { + "epoch": 1.08, + "learning_rate": 0.00019318634698206594, + "loss": 0.785, + "step": 5640 + }, + { + "epoch": 1.08, + "learning_rate": 0.00019280066850935266, + "loss": 0.7916, + "step": 5660 + }, + { + "epoch": 1.09, + "learning_rate": 0.00019241499003663944, + "loss": 0.779, + "step": 5680 + }, + { + "epoch": 1.09, + "learning_rate": 0.0001920293115639262, + "loss": 0.7909, + "step": 5700 + }, + { + "epoch": 1.1, + "learning_rate": 0.00019164363309121294, + "loss": 0.7798, + "step": 5720 + }, + { + "epoch": 1.1, + "learning_rate": 0.0001912579546184997, + "loss": 0.7846, + "step": 5740 + }, + { + "epoch": 1.1, + "learning_rate": 0.00019087227614578647, + "loss": 0.7887, + "step": 5760 + }, + { + "epoch": 1.11, + "learning_rate": 0.0001904865976730732, + "loss": 0.7802, + "step": 5780 + }, + { + "epoch": 1.11, + "learning_rate": 0.00019010091920035997, + "loss": 0.7891, + "step": 5800 + }, + { + "epoch": 1.11, + "eval_loss": 0.8130878806114197, + "eval_runtime": 16.5056, + "eval_samples_per_second": 121.171, + "eval_steps_per_second": 1.939, + "step": 5800 + }, + { + "epoch": 1.12, + "learning_rate": 0.0001897152407276467, + "loss": 0.7819, + "step": 5820 + }, + { + "epoch": 1.12, + "learning_rate": 0.00018932956225493346, + "loss": 0.7945, + "step": 5840 + }, + { + "epoch": 1.12, + "learning_rate": 0.00018894388378222021, + "loss": 0.784, + "step": 5860 + }, + { + "epoch": 1.13, + "learning_rate": 0.00018855820530950694, + "loss": 0.7838, + "step": 5880 + }, + { + "epoch": 1.13, + "learning_rate": 0.00018817252683679371, + "loss": 0.7841, + "step": 5900 + }, + { + "epoch": 1.13, + "learning_rate": 0.0001877868483640805, + "loss": 0.7909, + "step": 5920 + }, + { + "epoch": 1.14, + "learning_rate": 0.0001874011698913672, + "loss": 0.7775, + "step": 5940 + }, + { + "epoch": 1.14, + "learning_rate": 0.00018701549141865396, + "loss": 0.7827, + "step": 5960 + }, + { + "epoch": 1.15, + "learning_rate": 0.0001866298129459407, + "loss": 0.7866, + "step": 5980 + }, + { + "epoch": 1.15, + "learning_rate": 0.00018624413447322746, + "loss": 0.7696, + "step": 6000 + }, + { + "epoch": 1.15, + "eval_loss": 0.8125277757644653, + "eval_runtime": 16.506, + "eval_samples_per_second": 121.168, + "eval_steps_per_second": 1.939, + "step": 6000 + }, + { + "epoch": 1.15, + "learning_rate": 0.00018585845600051424, + "loss": 0.783, + "step": 6020 + }, + { + "epoch": 1.16, + "learning_rate": 0.00018547277752780096, + "loss": 0.7792, + "step": 6040 + }, + { + "epoch": 1.16, + "learning_rate": 0.00018508709905508774, + "loss": 0.7775, + "step": 6060 + }, + { + "epoch": 1.16, + "learning_rate": 0.0001847014205823745, + "loss": 0.7806, + "step": 6080 + }, + { + "epoch": 1.17, + "learning_rate": 0.0001843157421096612, + "loss": 0.7801, + "step": 6100 + }, + { + "epoch": 1.17, + "learning_rate": 0.000183930063636948, + "loss": 0.7853, + "step": 6120 + }, + { + "epoch": 1.18, + "learning_rate": 0.0001835443851642347, + "loss": 0.7937, + "step": 6140 + }, + { + "epoch": 1.18, + "learning_rate": 0.0001831587066915215, + "loss": 0.7873, + "step": 6160 + }, + { + "epoch": 1.18, + "learning_rate": 0.00018277302821880824, + "loss": 0.778, + "step": 6180 + }, + { + "epoch": 1.19, + "learning_rate": 0.000182387349746095, + "loss": 0.781, + "step": 6200 + }, + { + "epoch": 1.19, + "eval_loss": 0.8113830089569092, + "eval_runtime": 16.5217, + "eval_samples_per_second": 121.053, + "eval_steps_per_second": 1.937, + "step": 6200 + }, + { + "epoch": 1.19, + "learning_rate": 0.00018200167127338174, + "loss": 0.7746, + "step": 6220 + }, + { + "epoch": 1.2, + "learning_rate": 0.00018161599280066851, + "loss": 0.7752, + "step": 6240 + }, + { + "epoch": 1.2, + "learning_rate": 0.00018123031432795524, + "loss": 0.7838, + "step": 6260 + }, + { + "epoch": 1.2, + "learning_rate": 0.000180844635855242, + "loss": 0.789, + "step": 6280 + }, + { + "epoch": 1.21, + "learning_rate": 0.00018045895738252874, + "loss": 0.7882, + "step": 6300 + }, + { + "epoch": 1.21, + "learning_rate": 0.00018007327890981549, + "loss": 0.7822, + "step": 6320 + }, + { + "epoch": 1.21, + "learning_rate": 0.00017968760043710226, + "loss": 0.7889, + "step": 6340 + }, + { + "epoch": 1.22, + "learning_rate": 0.00017930192196438899, + "loss": 0.7891, + "step": 6360 + }, + { + "epoch": 1.22, + "learning_rate": 0.00017891624349167576, + "loss": 0.7884, + "step": 6380 + }, + { + "epoch": 1.23, + "learning_rate": 0.0001785305650189625, + "loss": 0.7733, + "step": 6400 + }, + { + "epoch": 1.23, + "eval_loss": 0.810148298740387, + "eval_runtime": 16.5113, + "eval_samples_per_second": 121.129, + "eval_steps_per_second": 1.938, + "step": 6400 + }, + { + "epoch": 1.23, + "learning_rate": 0.00017814488654624926, + "loss": 0.7794, + "step": 6420 + }, + { + "epoch": 1.23, + "learning_rate": 0.000177759208073536, + "loss": 0.775, + "step": 6440 + }, + { + "epoch": 1.24, + "learning_rate": 0.00017737352960082276, + "loss": 0.7706, + "step": 6460 + }, + { + "epoch": 1.24, + "learning_rate": 0.0001769878511281095, + "loss": 0.7808, + "step": 6480 + }, + { + "epoch": 1.25, + "learning_rate": 0.0001766021726553963, + "loss": 0.7805, + "step": 6500 + }, + { + "epoch": 1.25, + "learning_rate": 0.000176216494182683, + "loss": 0.7813, + "step": 6520 + }, + { + "epoch": 1.25, + "learning_rate": 0.0001758308157099698, + "loss": 0.7789, + "step": 6540 + }, + { + "epoch": 1.26, + "learning_rate": 0.00017544513723725654, + "loss": 0.7827, + "step": 6560 + }, + { + "epoch": 1.26, + "learning_rate": 0.00017505945876454326, + "loss": 0.7763, + "step": 6580 + }, + { + "epoch": 1.26, + "learning_rate": 0.00017467378029183004, + "loss": 0.7779, + "step": 6600 + }, + { + "epoch": 1.26, + "eval_loss": 0.8090565800666809, + "eval_runtime": 16.4954, + "eval_samples_per_second": 121.246, + "eval_steps_per_second": 1.94, + "step": 6600 + }, + { + "epoch": 1.27, + "learning_rate": 0.00017428810181911676, + "loss": 0.7793, + "step": 6620 + }, + { + "epoch": 1.27, + "learning_rate": 0.00017390242334640354, + "loss": 0.7778, + "step": 6640 + }, + { + "epoch": 1.28, + "learning_rate": 0.00017351674487369029, + "loss": 0.7802, + "step": 6660 + }, + { + "epoch": 1.28, + "learning_rate": 0.00017313106640097704, + "loss": 0.7823, + "step": 6680 + }, + { + "epoch": 1.28, + "learning_rate": 0.00017274538792826379, + "loss": 0.7868, + "step": 6700 + }, + { + "epoch": 1.29, + "learning_rate": 0.00017235970945555056, + "loss": 0.7824, + "step": 6720 + }, + { + "epoch": 1.29, + "learning_rate": 0.00017197403098283728, + "loss": 0.7777, + "step": 6740 + }, + { + "epoch": 1.3, + "learning_rate": 0.00017158835251012406, + "loss": 0.7822, + "step": 6760 + }, + { + "epoch": 1.3, + "learning_rate": 0.00017120267403741078, + "loss": 0.7798, + "step": 6780 + }, + { + "epoch": 1.3, + "learning_rate": 0.00017081699556469753, + "loss": 0.7712, + "step": 6800 + }, + { + "epoch": 1.3, + "eval_loss": 0.8080956935882568, + "eval_runtime": 16.5234, + "eval_samples_per_second": 121.041, + "eval_steps_per_second": 1.937, + "step": 6800 + }, + { + "epoch": 1.31, + "learning_rate": 0.0001704313170919843, + "loss": 0.7888, + "step": 6820 + }, + { + "epoch": 1.31, + "learning_rate": 0.00017004563861927103, + "loss": 0.7769, + "step": 6840 + }, + { + "epoch": 1.31, + "learning_rate": 0.0001696599601465578, + "loss": 0.7686, + "step": 6860 + }, + { + "epoch": 1.32, + "learning_rate": 0.00016927428167384456, + "loss": 0.7762, + "step": 6880 + }, + { + "epoch": 1.32, + "learning_rate": 0.0001688886032011313, + "loss": 0.7807, + "step": 6900 + }, + { + "epoch": 1.33, + "learning_rate": 0.00016850292472841806, + "loss": 0.7831, + "step": 6920 + }, + { + "epoch": 1.33, + "learning_rate": 0.0001681172462557048, + "loss": 0.7856, + "step": 6940 + }, + { + "epoch": 1.33, + "learning_rate": 0.00016773156778299156, + "loss": 0.775, + "step": 6960 + }, + { + "epoch": 1.34, + "learning_rate": 0.00016734588931027834, + "loss": 0.7835, + "step": 6980 + }, + { + "epoch": 1.34, + "learning_rate": 0.00016696021083756506, + "loss": 0.7756, + "step": 7000 + }, + { + "epoch": 1.34, + "eval_loss": 0.8070209622383118, + "eval_runtime": 16.4997, + "eval_samples_per_second": 121.214, + "eval_steps_per_second": 1.939, + "step": 7000 + }, + { + "epoch": 1.35, + "learning_rate": 0.0001665745323648518, + "loss": 0.7756, + "step": 7020 + }, + { + "epoch": 1.35, + "learning_rate": 0.00016618885389213859, + "loss": 0.7783, + "step": 7040 + }, + { + "epoch": 1.35, + "learning_rate": 0.0001658031754194253, + "loss": 0.7697, + "step": 7060 + }, + { + "epoch": 1.36, + "learning_rate": 0.00016541749694671208, + "loss": 0.7889, + "step": 7080 + }, + { + "epoch": 1.36, + "learning_rate": 0.00016503181847399883, + "loss": 0.7725, + "step": 7100 + }, + { + "epoch": 1.36, + "learning_rate": 0.00016464614000128558, + "loss": 0.7726, + "step": 7120 + }, + { + "epoch": 1.37, + "learning_rate": 0.00016426046152857233, + "loss": 0.7787, + "step": 7140 + }, + { + "epoch": 1.37, + "learning_rate": 0.00016387478305585908, + "loss": 0.782, + "step": 7160 + }, + { + "epoch": 1.38, + "learning_rate": 0.00016348910458314583, + "loss": 0.7736, + "step": 7180 + }, + { + "epoch": 1.38, + "learning_rate": 0.0001631034261104326, + "loss": 0.7748, + "step": 7200 + }, + { + "epoch": 1.38, + "eval_loss": 0.8063712120056152, + "eval_runtime": 16.5096, + "eval_samples_per_second": 121.142, + "eval_steps_per_second": 1.938, + "step": 7200 + }, + { + "epoch": 1.38, + "learning_rate": 0.00016271774763771933, + "loss": 0.7717, + "step": 7220 + }, + { + "epoch": 1.39, + "learning_rate": 0.00016233206916500608, + "loss": 0.7676, + "step": 7240 + }, + { + "epoch": 1.39, + "learning_rate": 0.00016194639069229286, + "loss": 0.7662, + "step": 7260 + }, + { + "epoch": 1.39, + "learning_rate": 0.00016156071221957958, + "loss": 0.7809, + "step": 7280 + }, + { + "epoch": 1.4, + "learning_rate": 0.00016117503374686636, + "loss": 0.7731, + "step": 7300 + }, + { + "epoch": 1.4, + "learning_rate": 0.00016078935527415308, + "loss": 0.7795, + "step": 7320 + }, + { + "epoch": 1.41, + "learning_rate": 0.00016040367680143986, + "loss": 0.78, + "step": 7340 + }, + { + "epoch": 1.41, + "learning_rate": 0.0001600179983287266, + "loss": 0.7785, + "step": 7360 + }, + { + "epoch": 1.41, + "learning_rate": 0.00015963231985601336, + "loss": 0.7694, + "step": 7380 + }, + { + "epoch": 1.42, + "learning_rate": 0.0001592466413833001, + "loss": 0.781, + "step": 7400 + }, + { + "epoch": 1.42, + "eval_loss": 0.8048364520072937, + "eval_runtime": 16.5235, + "eval_samples_per_second": 121.04, + "eval_steps_per_second": 1.937, + "step": 7400 + }, + { + "epoch": 1.42, + "learning_rate": 0.00015886096291058688, + "loss": 0.7681, + "step": 7420 + }, + { + "epoch": 1.43, + "learning_rate": 0.0001584752844378736, + "loss": 0.7835, + "step": 7440 + }, + { + "epoch": 1.43, + "learning_rate": 0.00015808960596516038, + "loss": 0.7778, + "step": 7460 + }, + { + "epoch": 1.43, + "learning_rate": 0.0001577039274924471, + "loss": 0.775, + "step": 7480 + }, + { + "epoch": 1.44, + "learning_rate": 0.00015731824901973386, + "loss": 0.7758, + "step": 7500 + }, + { + "epoch": 1.44, + "learning_rate": 0.00015693257054702063, + "loss": 0.7846, + "step": 7520 + }, + { + "epoch": 1.44, + "learning_rate": 0.00015654689207430736, + "loss": 0.7756, + "step": 7540 + }, + { + "epoch": 1.45, + "learning_rate": 0.00015616121360159413, + "loss": 0.7764, + "step": 7560 + }, + { + "epoch": 1.45, + "learning_rate": 0.00015577553512888088, + "loss": 0.7684, + "step": 7580 + }, + { + "epoch": 1.46, + "learning_rate": 0.00015538985665616763, + "loss": 0.7837, + "step": 7600 + }, + { + "epoch": 1.46, + "eval_loss": 0.8041849136352539, + "eval_runtime": 16.4633, + "eval_samples_per_second": 121.482, + "eval_steps_per_second": 1.944, + "step": 7600 + }, + { + "epoch": 1.46, + "learning_rate": 0.00015500417818345438, + "loss": 0.772, + "step": 7620 + }, + { + "epoch": 1.46, + "learning_rate": 0.0001546184997107411, + "loss": 0.7759, + "step": 7640 + }, + { + "epoch": 1.47, + "learning_rate": 0.00015423282123802788, + "loss": 0.7778, + "step": 7660 + }, + { + "epoch": 1.47, + "learning_rate": 0.00015384714276531466, + "loss": 0.78, + "step": 7680 + }, + { + "epoch": 1.48, + "learning_rate": 0.00015346146429260138, + "loss": 0.7681, + "step": 7700 + }, + { + "epoch": 1.48, + "learning_rate": 0.00015307578581988813, + "loss": 0.7731, + "step": 7720 + }, + { + "epoch": 1.48, + "learning_rate": 0.0001526901073471749, + "loss": 0.78, + "step": 7740 + }, + { + "epoch": 1.49, + "learning_rate": 0.00015230442887446163, + "loss": 0.7719, + "step": 7760 + }, + { + "epoch": 1.49, + "learning_rate": 0.0001519187504017484, + "loss": 0.7667, + "step": 7780 + }, + { + "epoch": 1.49, + "learning_rate": 0.00015153307192903513, + "loss": 0.7804, + "step": 7800 + }, + { + "epoch": 1.49, + "eval_loss": 0.8034607768058777, + "eval_runtime": 16.4833, + "eval_samples_per_second": 121.335, + "eval_steps_per_second": 1.941, + "step": 7800 + }, + { + "epoch": 1.5, + "learning_rate": 0.0001511473934563219, + "loss": 0.7813, + "step": 7820 + }, + { + "epoch": 1.5, + "learning_rate": 0.00015076171498360866, + "loss": 0.7751, + "step": 7840 + }, + { + "epoch": 1.51, + "learning_rate": 0.00015037603651089538, + "loss": 0.7681, + "step": 7860 + }, + { + "epoch": 1.51, + "learning_rate": 0.00014999035803818216, + "loss": 0.7679, + "step": 7880 + }, + { + "epoch": 1.51, + "learning_rate": 0.0001496046795654689, + "loss": 0.7723, + "step": 7900 + }, + { + "epoch": 1.52, + "learning_rate": 0.00014921900109275566, + "loss": 0.7732, + "step": 7920 + }, + { + "epoch": 1.52, + "learning_rate": 0.0001488333226200424, + "loss": 0.7805, + "step": 7940 + }, + { + "epoch": 1.53, + "learning_rate": 0.00014844764414732916, + "loss": 0.7666, + "step": 7960 + }, + { + "epoch": 1.53, + "learning_rate": 0.0001480619656746159, + "loss": 0.7801, + "step": 7980 + }, + { + "epoch": 1.53, + "learning_rate": 0.00014767628720190265, + "loss": 0.7736, + "step": 8000 + }, + { + "epoch": 1.53, + "eval_loss": 0.8029702305793762, + "eval_runtime": 16.5088, + "eval_samples_per_second": 121.147, + "eval_steps_per_second": 1.938, + "step": 8000 + }, + { + "epoch": 1.54, + "learning_rate": 0.00014729060872918943, + "loss": 0.7716, + "step": 8020 + }, + { + "epoch": 1.54, + "learning_rate": 0.00014690493025647618, + "loss": 0.7771, + "step": 8040 + }, + { + "epoch": 1.54, + "learning_rate": 0.00014651925178376293, + "loss": 0.7715, + "step": 8060 + }, + { + "epoch": 1.55, + "learning_rate": 0.00014613357331104968, + "loss": 0.7731, + "step": 8080 + }, + { + "epoch": 1.55, + "learning_rate": 0.00014574789483833643, + "loss": 0.7763, + "step": 8100 + }, + { + "epoch": 1.56, + "learning_rate": 0.00014536221636562318, + "loss": 0.7705, + "step": 8120 + }, + { + "epoch": 1.56, + "learning_rate": 0.00014497653789290993, + "loss": 0.7702, + "step": 8140 + }, + { + "epoch": 1.56, + "learning_rate": 0.00014459085942019668, + "loss": 0.7752, + "step": 8160 + }, + { + "epoch": 1.57, + "learning_rate": 0.00014420518094748343, + "loss": 0.7662, + "step": 8180 + }, + { + "epoch": 1.57, + "learning_rate": 0.00014381950247477018, + "loss": 0.7757, + "step": 8200 + }, + { + "epoch": 1.57, + "eval_loss": 0.8025923371315002, + "eval_runtime": 16.5398, + "eval_samples_per_second": 120.921, + "eval_steps_per_second": 1.935, + "step": 8200 + }, + { + "epoch": 1.58, + "learning_rate": 0.00014343382400205693, + "loss": 0.7638, + "step": 8220 + }, + { + "epoch": 1.58, + "learning_rate": 0.0001430481455293437, + "loss": 0.7836, + "step": 8240 + }, + { + "epoch": 1.58, + "learning_rate": 0.00014266246705663046, + "loss": 0.7685, + "step": 8260 + }, + { + "epoch": 1.59, + "learning_rate": 0.0001422767885839172, + "loss": 0.7901, + "step": 8280 + }, + { + "epoch": 1.59, + "learning_rate": 0.00014189111011120396, + "loss": 0.7729, + "step": 8300 + }, + { + "epoch": 1.59, + "learning_rate": 0.0001415054316384907, + "loss": 0.7614, + "step": 8320 + }, + { + "epoch": 1.6, + "learning_rate": 0.00014111975316577745, + "loss": 0.7789, + "step": 8340 + }, + { + "epoch": 1.6, + "learning_rate": 0.0001407340746930642, + "loss": 0.7713, + "step": 8360 + }, + { + "epoch": 1.61, + "learning_rate": 0.00014034839622035095, + "loss": 0.7831, + "step": 8380 + }, + { + "epoch": 1.61, + "learning_rate": 0.0001399627177476377, + "loss": 0.7674, + "step": 8400 + }, + { + "epoch": 1.61, + "eval_loss": 0.801445722579956, + "eval_runtime": 16.5305, + "eval_samples_per_second": 120.989, + "eval_steps_per_second": 1.936, + "step": 8400 + }, + { + "epoch": 1.61, + "learning_rate": 0.00013957703927492445, + "loss": 0.7698, + "step": 8420 + }, + { + "epoch": 1.62, + "learning_rate": 0.0001391913608022112, + "loss": 0.7725, + "step": 8440 + }, + { + "epoch": 1.62, + "learning_rate": 0.00013880568232949795, + "loss": 0.771, + "step": 8460 + }, + { + "epoch": 1.62, + "learning_rate": 0.00013842000385678473, + "loss": 0.7679, + "step": 8480 + }, + { + "epoch": 1.63, + "learning_rate": 0.00013803432538407148, + "loss": 0.7788, + "step": 8500 + }, + { + "epoch": 1.63, + "learning_rate": 0.00013764864691135823, + "loss": 0.7705, + "step": 8520 + }, + { + "epoch": 1.64, + "learning_rate": 0.00013726296843864495, + "loss": 0.7625, + "step": 8540 + }, + { + "epoch": 1.64, + "learning_rate": 0.00013687728996593173, + "loss": 0.7626, + "step": 8560 + }, + { + "epoch": 1.64, + "learning_rate": 0.00013649161149321848, + "loss": 0.7731, + "step": 8580 + }, + { + "epoch": 1.65, + "learning_rate": 0.00013610593302050523, + "loss": 0.7788, + "step": 8600 + }, + { + "epoch": 1.65, + "eval_loss": 0.8010225296020508, + "eval_runtime": 16.5075, + "eval_samples_per_second": 121.157, + "eval_steps_per_second": 1.939, + "step": 8600 + }, + { + "epoch": 1.65, + "learning_rate": 0.00013572025454779198, + "loss": 0.7758, + "step": 8620 + }, + { + "epoch": 1.66, + "learning_rate": 0.00013533457607507873, + "loss": 0.7738, + "step": 8640 + }, + { + "epoch": 1.66, + "learning_rate": 0.00013494889760236548, + "loss": 0.7827, + "step": 8660 + }, + { + "epoch": 1.66, + "learning_rate": 0.00013456321912965223, + "loss": 0.779, + "step": 8680 + }, + { + "epoch": 1.67, + "learning_rate": 0.00013417754065693898, + "loss": 0.771, + "step": 8700 + }, + { + "epoch": 1.67, + "learning_rate": 0.00013379186218422575, + "loss": 0.7683, + "step": 8720 + }, + { + "epoch": 1.67, + "learning_rate": 0.0001334061837115125, + "loss": 0.7728, + "step": 8740 + }, + { + "epoch": 1.68, + "learning_rate": 0.00013302050523879925, + "loss": 0.7761, + "step": 8760 + }, + { + "epoch": 1.68, + "learning_rate": 0.00013263482676608598, + "loss": 0.7705, + "step": 8780 + }, + { + "epoch": 1.69, + "learning_rate": 0.00013224914829337275, + "loss": 0.7624, + "step": 8800 + }, + { + "epoch": 1.69, + "eval_loss": 0.8003928065299988, + "eval_runtime": 16.5035, + "eval_samples_per_second": 121.186, + "eval_steps_per_second": 1.939, + "step": 8800 + }, + { + "epoch": 1.69, + "learning_rate": 0.0001318634698206595, + "loss": 0.7669, + "step": 8820 + }, + { + "epoch": 1.69, + "learning_rate": 0.00013147779134794625, + "loss": 0.7675, + "step": 8840 + }, + { + "epoch": 1.7, + "learning_rate": 0.000131092112875233, + "loss": 0.7629, + "step": 8860 + }, + { + "epoch": 1.7, + "learning_rate": 0.00013070643440251975, + "loss": 0.7663, + "step": 8880 + }, + { + "epoch": 1.71, + "learning_rate": 0.0001303207559298065, + "loss": 0.7708, + "step": 8900 + }, + { + "epoch": 1.71, + "learning_rate": 0.00012993507745709325, + "loss": 0.7734, + "step": 8920 + }, + { + "epoch": 1.71, + "learning_rate": 0.00012954939898438, + "loss": 0.7711, + "step": 8940 + }, + { + "epoch": 1.72, + "learning_rate": 0.00012916372051166678, + "loss": 0.769, + "step": 8960 + }, + { + "epoch": 1.72, + "learning_rate": 0.00012877804203895353, + "loss": 0.7706, + "step": 8980 + }, + { + "epoch": 1.72, + "learning_rate": 0.00012839236356624025, + "loss": 0.7752, + "step": 9000 + }, + { + "epoch": 1.72, + "eval_loss": 0.799389660358429, + "eval_runtime": 16.516, + "eval_samples_per_second": 121.094, + "eval_steps_per_second": 1.938, + "step": 9000 + }, + { + "epoch": 1.73, + "learning_rate": 0.000128006685093527, + "loss": 0.7678, + "step": 9020 + }, + { + "epoch": 1.73, + "learning_rate": 0.00012762100662081378, + "loss": 0.7764, + "step": 9040 + }, + { + "epoch": 1.74, + "learning_rate": 0.00012723532814810053, + "loss": 0.7672, + "step": 9060 + }, + { + "epoch": 1.74, + "learning_rate": 0.00012684964967538728, + "loss": 0.7705, + "step": 9080 + }, + { + "epoch": 1.74, + "learning_rate": 0.00012646397120267403, + "loss": 0.7657, + "step": 9100 + }, + { + "epoch": 1.75, + "learning_rate": 0.00012607829272996078, + "loss": 0.7648, + "step": 9120 + }, + { + "epoch": 1.75, + "learning_rate": 0.00012569261425724753, + "loss": 0.7737, + "step": 9140 + }, + { + "epoch": 1.76, + "learning_rate": 0.00012530693578453428, + "loss": 0.7628, + "step": 9160 + }, + { + "epoch": 1.76, + "learning_rate": 0.00012492125731182103, + "loss": 0.767, + "step": 9180 + }, + { + "epoch": 1.76, + "learning_rate": 0.0001245355788391078, + "loss": 0.764, + "step": 9200 + }, + { + "epoch": 1.76, + "eval_loss": 0.7991757988929749, + "eval_runtime": 16.5416, + "eval_samples_per_second": 120.907, + "eval_steps_per_second": 1.935, + "step": 9200 + }, + { + "epoch": 1.77, + "learning_rate": 0.00012414990036639455, + "loss": 0.7658, + "step": 9220 + }, + { + "epoch": 1.77, + "learning_rate": 0.00012376422189368128, + "loss": 0.7642, + "step": 9240 + }, + { + "epoch": 1.77, + "learning_rate": 0.00012337854342096802, + "loss": 0.7611, + "step": 9260 + }, + { + "epoch": 1.78, + "learning_rate": 0.0001229928649482548, + "loss": 0.7665, + "step": 9280 + }, + { + "epoch": 1.78, + "learning_rate": 0.00012260718647554155, + "loss": 0.7785, + "step": 9300 + }, + { + "epoch": 1.79, + "learning_rate": 0.0001222215080028283, + "loss": 0.7673, + "step": 9320 + }, + { + "epoch": 1.79, + "learning_rate": 0.00012183582953011504, + "loss": 0.777, + "step": 9340 + }, + { + "epoch": 1.79, + "learning_rate": 0.0001214501510574018, + "loss": 0.7684, + "step": 9360 + }, + { + "epoch": 1.8, + "learning_rate": 0.00012106447258468855, + "loss": 0.7694, + "step": 9380 + }, + { + "epoch": 1.8, + "learning_rate": 0.0001206787941119753, + "loss": 0.7634, + "step": 9400 + }, + { + "epoch": 1.8, + "eval_loss": 0.7980849742889404, + "eval_runtime": 16.5086, + "eval_samples_per_second": 121.149, + "eval_steps_per_second": 1.938, + "step": 9400 + }, + { + "epoch": 1.8, + "learning_rate": 0.00012031239956289772, + "loss": 0.7636, + "step": 9420 + }, + { + "epoch": 1.81, + "learning_rate": 0.00011992672109018447, + "loss": 0.7629, + "step": 9440 + }, + { + "epoch": 1.81, + "learning_rate": 0.00011954104261747122, + "loss": 0.7724, + "step": 9460 + }, + { + "epoch": 1.82, + "learning_rate": 0.00011915536414475797, + "loss": 0.7697, + "step": 9480 + }, + { + "epoch": 1.82, + "learning_rate": 0.00011876968567204474, + "loss": 0.7574, + "step": 9500 + }, + { + "epoch": 1.82, + "learning_rate": 0.00011838400719933149, + "loss": 0.7719, + "step": 9520 + }, + { + "epoch": 1.83, + "learning_rate": 0.00011799832872661822, + "loss": 0.7761, + "step": 9540 + }, + { + "epoch": 1.83, + "learning_rate": 0.00011761265025390497, + "loss": 0.7693, + "step": 9560 + }, + { + "epoch": 1.84, + "learning_rate": 0.00011722697178119174, + "loss": 0.7687, + "step": 9580 + }, + { + "epoch": 1.84, + "learning_rate": 0.00011684129330847849, + "loss": 0.7758, + "step": 9600 + }, + { + "epoch": 1.84, + "eval_loss": 0.7981218099594116, + "eval_runtime": 16.5407, + "eval_samples_per_second": 120.914, + "eval_steps_per_second": 1.935, + "step": 9600 + }, + { + "epoch": 1.84, + "learning_rate": 0.00011645561483576524, + "loss": 0.7603, + "step": 9620 + }, + { + "epoch": 1.85, + "learning_rate": 0.00011606993636305199, + "loss": 0.7579, + "step": 9640 + }, + { + "epoch": 1.85, + "learning_rate": 0.00011568425789033875, + "loss": 0.7673, + "step": 9660 + }, + { + "epoch": 1.85, + "learning_rate": 0.0001152985794176255, + "loss": 0.7745, + "step": 9680 + }, + { + "epoch": 1.86, + "learning_rate": 0.00011491290094491225, + "loss": 0.758, + "step": 9700 + }, + { + "epoch": 1.86, + "learning_rate": 0.000114527222472199, + "loss": 0.7686, + "step": 9720 + }, + { + "epoch": 1.87, + "learning_rate": 0.00011414154399948576, + "loss": 0.7741, + "step": 9740 + }, + { + "epoch": 1.87, + "learning_rate": 0.00011375586552677251, + "loss": 0.7646, + "step": 9760 + }, + { + "epoch": 1.87, + "learning_rate": 0.00011337018705405925, + "loss": 0.7675, + "step": 9780 + }, + { + "epoch": 1.88, + "learning_rate": 0.000112984508581346, + "loss": 0.7637, + "step": 9800 + }, + { + "epoch": 1.88, + "eval_loss": 0.7970672845840454, + "eval_runtime": 16.5386, + "eval_samples_per_second": 120.929, + "eval_steps_per_second": 1.935, + "step": 9800 + }, + { + "epoch": 1.88, + "learning_rate": 0.00011259883010863276, + "loss": 0.7678, + "step": 9820 + }, + { + "epoch": 1.89, + "learning_rate": 0.00011221315163591951, + "loss": 0.762, + "step": 9840 + }, + { + "epoch": 1.89, + "learning_rate": 0.00011182747316320626, + "loss": 0.7653, + "step": 9860 + }, + { + "epoch": 1.89, + "learning_rate": 0.00011144179469049301, + "loss": 0.7666, + "step": 9880 + }, + { + "epoch": 1.9, + "learning_rate": 0.00011105611621777977, + "loss": 0.7621, + "step": 9900 + }, + { + "epoch": 1.9, + "learning_rate": 0.00011067043774506652, + "loss": 0.7715, + "step": 9920 + }, + { + "epoch": 1.9, + "learning_rate": 0.00011028475927235327, + "loss": 0.7605, + "step": 9940 + }, + { + "epoch": 1.91, + "learning_rate": 0.00010989908079964002, + "loss": 0.7618, + "step": 9960 + }, + { + "epoch": 1.91, + "learning_rate": 0.00010951340232692679, + "loss": 0.7726, + "step": 9980 + }, + { + "epoch": 1.92, + "learning_rate": 0.00010912772385421352, + "loss": 0.7684, + "step": 10000 + }, + { + "epoch": 1.92, + "eval_loss": 0.7967627048492432, + "eval_runtime": 16.5033, + "eval_samples_per_second": 121.188, + "eval_steps_per_second": 1.939, + "step": 10000 + }, + { + "epoch": 1.92, + "learning_rate": 0.00010874204538150027, + "loss": 0.7666, + "step": 10020 + }, + { + "epoch": 1.92, + "learning_rate": 0.00010835636690878702, + "loss": 0.7661, + "step": 10040 + }, + { + "epoch": 1.93, + "learning_rate": 0.00010797068843607378, + "loss": 0.7621, + "step": 10060 + }, + { + "epoch": 1.93, + "learning_rate": 0.00010758500996336053, + "loss": 0.7736, + "step": 10080 + }, + { + "epoch": 1.94, + "learning_rate": 0.00010719933149064728, + "loss": 0.76, + "step": 10100 + }, + { + "epoch": 1.94, + "learning_rate": 0.00010681365301793405, + "loss": 0.764, + "step": 10120 + }, + { + "epoch": 1.94, + "learning_rate": 0.0001064279745452208, + "loss": 0.7697, + "step": 10140 + }, + { + "epoch": 1.95, + "learning_rate": 0.00010604229607250755, + "loss": 0.7602, + "step": 10160 + }, + { + "epoch": 1.95, + "learning_rate": 0.0001056566175997943, + "loss": 0.766, + "step": 10180 + }, + { + "epoch": 1.95, + "learning_rate": 0.00010527093912708106, + "loss": 0.7719, + "step": 10200 + }, + { + "epoch": 1.95, + "eval_loss": 0.7964752912521362, + "eval_runtime": 16.4947, + "eval_samples_per_second": 121.251, + "eval_steps_per_second": 1.94, + "step": 10200 + }, + { + "epoch": 1.96, + "learning_rate": 0.00010488526065436781, + "loss": 0.7653, + "step": 10220 + }, + { + "epoch": 1.96, + "learning_rate": 0.00010449958218165455, + "loss": 0.7653, + "step": 10240 + }, + { + "epoch": 1.97, + "learning_rate": 0.0001041139037089413, + "loss": 0.7711, + "step": 10260 + }, + { + "epoch": 1.97, + "learning_rate": 0.00010372822523622806, + "loss": 0.7729, + "step": 10280 + }, + { + "epoch": 1.97, + "learning_rate": 0.00010334254676351481, + "loss": 0.7709, + "step": 10300 + }, + { + "epoch": 1.98, + "learning_rate": 0.00010295686829080156, + "loss": 0.7611, + "step": 10320 + }, + { + "epoch": 1.98, + "learning_rate": 0.00010257118981808831, + "loss": 0.7607, + "step": 10340 + }, + { + "epoch": 1.99, + "learning_rate": 0.00010218551134537507, + "loss": 0.761, + "step": 10360 + }, + { + "epoch": 1.99, + "learning_rate": 0.00010179983287266182, + "loss": 0.7645, + "step": 10380 + }, + { + "epoch": 1.99, + "learning_rate": 0.00010141415439994857, + "loss": 0.7682, + "step": 10400 + }, + { + "epoch": 1.99, + "eval_loss": 0.7955361008644104, + "eval_runtime": 16.5066, + "eval_samples_per_second": 121.164, + "eval_steps_per_second": 1.939, + "step": 10400 + }, + { + "epoch": 2.0, + "learning_rate": 0.00010102847592723531, + "loss": 0.76, + "step": 10420 + }, + { + "epoch": 2.0, + "learning_rate": 0.00010064279745452208, + "loss": 0.7653, + "step": 10440 + }, + { + "epoch": 2.0, + "learning_rate": 0.00010025711898180882, + "loss": 0.7625, + "step": 10460 + }, + { + "epoch": 2.01, + "learning_rate": 9.987144050909557e-05, + "loss": 0.764, + "step": 10480 + }, + { + "epoch": 2.01, + "learning_rate": 9.948576203638232e-05, + "loss": 0.766, + "step": 10500 + }, + { + "epoch": 2.02, + "learning_rate": 9.910008356366908e-05, + "loss": 0.7656, + "step": 10520 + }, + { + "epoch": 2.02, + "learning_rate": 9.871440509095583e-05, + "loss": 0.7698, + "step": 10540 + }, + { + "epoch": 2.02, + "learning_rate": 9.832872661824258e-05, + "loss": 0.7635, + "step": 10560 + }, + { + "epoch": 2.03, + "learning_rate": 9.794304814552933e-05, + "loss": 0.77, + "step": 10580 + }, + { + "epoch": 2.03, + "learning_rate": 9.75573696728161e-05, + "loss": 0.7651, + "step": 10600 + }, + { + "epoch": 2.03, + "eval_loss": 0.7953855395317078, + "eval_runtime": 16.5084, + "eval_samples_per_second": 121.15, + "eval_steps_per_second": 1.938, + "step": 10600 + }, + { + "epoch": 2.03, + "learning_rate": 9.717169120010285e-05, + "loss": 0.7628, + "step": 10620 + }, + { + "epoch": 2.04, + "learning_rate": 9.67860127273896e-05, + "loss": 0.7662, + "step": 10640 + }, + { + "epoch": 2.04, + "learning_rate": 9.640033425467633e-05, + "loss": 0.7635, + "step": 10660 + }, + { + "epoch": 2.05, + "learning_rate": 9.60146557819631e-05, + "loss": 0.7601, + "step": 10680 + }, + { + "epoch": 2.05, + "learning_rate": 9.562897730924984e-05, + "loss": 0.7649, + "step": 10700 + }, + { + "epoch": 2.05, + "learning_rate": 9.52432988365366e-05, + "loss": 0.758, + "step": 10720 + }, + { + "epoch": 2.06, + "learning_rate": 9.485762036382334e-05, + "loss": 0.767, + "step": 10740 + }, + { + "epoch": 2.06, + "learning_rate": 9.447194189111011e-05, + "loss": 0.7559, + "step": 10760 + }, + { + "epoch": 2.07, + "learning_rate": 9.408626341839686e-05, + "loss": 0.765, + "step": 10780 + }, + { + "epoch": 2.07, + "learning_rate": 9.37005849456836e-05, + "loss": 0.7641, + "step": 10800 + }, + { + "epoch": 2.07, + "eval_loss": 0.794941782951355, + "eval_runtime": 16.5101, + "eval_samples_per_second": 121.138, + "eval_steps_per_second": 1.938, + "step": 10800 + }, + { + "epoch": 2.07, + "learning_rate": 9.331490647297036e-05, + "loss": 0.7691, + "step": 10820 + }, + { + "epoch": 2.08, + "learning_rate": 9.292922800025712e-05, + "loss": 0.7611, + "step": 10840 + }, + { + "epoch": 2.08, + "learning_rate": 9.254354952754387e-05, + "loss": 0.7609, + "step": 10860 + }, + { + "epoch": 2.08, + "learning_rate": 9.21578710548306e-05, + "loss": 0.758, + "step": 10880 + }, + { + "epoch": 2.09, + "learning_rate": 9.177219258211736e-05, + "loss": 0.7637, + "step": 10900 + }, + { + "epoch": 2.09, + "learning_rate": 9.138651410940412e-05, + "loss": 0.7645, + "step": 10920 + }, + { + "epoch": 2.1, + "learning_rate": 9.100083563669087e-05, + "loss": 0.7507, + "step": 10940 + }, + { + "epoch": 2.1, + "learning_rate": 9.061515716397762e-05, + "loss": 0.7673, + "step": 10960 + }, + { + "epoch": 2.1, + "learning_rate": 9.022947869126437e-05, + "loss": 0.7552, + "step": 10980 + }, + { + "epoch": 2.11, + "learning_rate": 8.984380021855113e-05, + "loss": 0.7639, + "step": 11000 + }, + { + "epoch": 2.11, + "eval_loss": 0.7940524220466614, + "eval_runtime": 16.5029, + "eval_samples_per_second": 121.19, + "eval_steps_per_second": 1.939, + "step": 11000 + }, + { + "epoch": 2.11, + "learning_rate": 8.945812174583788e-05, + "loss": 0.7719, + "step": 11020 + }, + { + "epoch": 2.12, + "learning_rate": 8.907244327312463e-05, + "loss": 0.7641, + "step": 11040 + }, + { + "epoch": 2.12, + "learning_rate": 8.868676480041138e-05, + "loss": 0.7614, + "step": 11060 + }, + { + "epoch": 2.12, + "learning_rate": 8.830108632769814e-05, + "loss": 0.7785, + "step": 11080 + }, + { + "epoch": 2.13, + "learning_rate": 8.79154078549849e-05, + "loss": 0.7756, + "step": 11100 + }, + { + "epoch": 2.13, + "learning_rate": 8.752972938227163e-05, + "loss": 0.7645, + "step": 11120 + }, + { + "epoch": 2.13, + "learning_rate": 8.714405090955838e-05, + "loss": 0.7621, + "step": 11140 + }, + { + "epoch": 2.14, + "learning_rate": 8.675837243684514e-05, + "loss": 0.7662, + "step": 11160 + }, + { + "epoch": 2.14, + "learning_rate": 8.637269396413189e-05, + "loss": 0.7617, + "step": 11180 + }, + { + "epoch": 2.15, + "learning_rate": 8.598701549141864e-05, + "loss": 0.7683, + "step": 11200 + }, + { + "epoch": 2.15, + "eval_loss": 0.7937352061271667, + "eval_runtime": 16.5052, + "eval_samples_per_second": 121.174, + "eval_steps_per_second": 1.939, + "step": 11200 + }, + { + "epoch": 2.15, + "learning_rate": 8.560133701870539e-05, + "loss": 0.7635, + "step": 11220 + }, + { + "epoch": 2.15, + "learning_rate": 8.521565854599216e-05, + "loss": 0.7622, + "step": 11240 + }, + { + "epoch": 2.16, + "learning_rate": 8.48299800732789e-05, + "loss": 0.7616, + "step": 11260 + }, + { + "epoch": 2.16, + "learning_rate": 8.444430160056565e-05, + "loss": 0.7558, + "step": 11280 + }, + { + "epoch": 2.17, + "learning_rate": 8.40586231278524e-05, + "loss": 0.7714, + "step": 11300 + }, + { + "epoch": 2.17, + "learning_rate": 8.367294465513917e-05, + "loss": 0.7676, + "step": 11320 + }, + { + "epoch": 2.17, + "learning_rate": 8.32872661824259e-05, + "loss": 0.7623, + "step": 11340 + }, + { + "epoch": 2.18, + "learning_rate": 8.290158770971265e-05, + "loss": 0.7608, + "step": 11360 + }, + { + "epoch": 2.18, + "learning_rate": 8.251590923699942e-05, + "loss": 0.7746, + "step": 11380 + }, + { + "epoch": 2.18, + "learning_rate": 8.213023076428617e-05, + "loss": 0.7684, + "step": 11400 + }, + { + "epoch": 2.18, + "eval_loss": 0.7929428219795227, + "eval_runtime": 16.7561, + "eval_samples_per_second": 119.359, + "eval_steps_per_second": 1.91, + "step": 11400 + }, + { + "epoch": 2.19, + "learning_rate": 8.174455229157292e-05, + "loss": 0.7628, + "step": 11420 + }, + { + "epoch": 2.19, + "learning_rate": 8.135887381885967e-05, + "loss": 0.7614, + "step": 11440 + }, + { + "epoch": 2.2, + "learning_rate": 8.099247926978209e-05, + "loss": 0.7616, + "step": 11460 + }, + { + "epoch": 2.2, + "learning_rate": 8.060680079706884e-05, + "loss": 0.7614, + "step": 11480 + }, + { + "epoch": 2.2, + "learning_rate": 8.022112232435559e-05, + "loss": 0.7684, + "step": 11500 + }, + { + "epoch": 2.21, + "learning_rate": 7.983544385164233e-05, + "loss": 0.7663, + "step": 11520 + }, + { + "epoch": 2.21, + "learning_rate": 7.94497653789291e-05, + "loss": 0.7621, + "step": 11540 + }, + { + "epoch": 2.21, + "learning_rate": 7.906408690621584e-05, + "loss": 0.77, + "step": 11560 + }, + { + "epoch": 2.22, + "learning_rate": 7.867840843350259e-05, + "loss": 0.7629, + "step": 11580 + }, + { + "epoch": 2.22, + "learning_rate": 7.829272996078934e-05, + "loss": 0.7592, + "step": 11600 + }, + { + "epoch": 2.22, + "eval_loss": 0.7931132316589355, + "eval_runtime": 16.4886, + "eval_samples_per_second": 121.296, + "eval_steps_per_second": 1.941, + "step": 11600 + }, + { + "epoch": 2.23, + "learning_rate": 7.79070514880761e-05, + "loss": 0.7593, + "step": 11620 + }, + { + "epoch": 2.23, + "learning_rate": 7.752137301536285e-05, + "loss": 0.7579, + "step": 11640 + }, + { + "epoch": 2.23, + "learning_rate": 7.71356945426496e-05, + "loss": 0.7666, + "step": 11660 + }, + { + "epoch": 2.24, + "learning_rate": 7.675001606993635e-05, + "loss": 0.7573, + "step": 11680 + }, + { + "epoch": 2.24, + "learning_rate": 7.636433759722312e-05, + "loss": 0.7654, + "step": 11700 + }, + { + "epoch": 2.25, + "learning_rate": 7.597865912450986e-05, + "loss": 0.7637, + "step": 11720 + }, + { + "epoch": 2.25, + "learning_rate": 7.559298065179661e-05, + "loss": 0.7638, + "step": 11740 + }, + { + "epoch": 2.25, + "learning_rate": 7.520730217908335e-05, + "loss": 0.7538, + "step": 11760 + }, + { + "epoch": 2.26, + "learning_rate": 7.482162370637011e-05, + "loss": 0.7598, + "step": 11780 + }, + { + "epoch": 2.26, + "learning_rate": 7.443594523365686e-05, + "loss": 0.7577, + "step": 11800 + }, + { + "epoch": 2.26, + "eval_loss": 0.7928204536437988, + "eval_runtime": 16.5434, + "eval_samples_per_second": 120.894, + "eval_steps_per_second": 1.934, + "step": 11800 + }, + { + "epoch": 2.26, + "learning_rate": 7.405026676094361e-05, + "loss": 0.7561, + "step": 11820 + }, + { + "epoch": 2.27, + "learning_rate": 7.366458828823038e-05, + "loss": 0.7557, + "step": 11840 + }, + { + "epoch": 2.27, + "learning_rate": 7.327890981551713e-05, + "loss": 0.7606, + "step": 11860 + }, + { + "epoch": 2.28, + "learning_rate": 7.289323134280388e-05, + "loss": 0.7575, + "step": 11880 + }, + { + "epoch": 2.28, + "learning_rate": 7.250755287009063e-05, + "loss": 0.7557, + "step": 11900 + }, + { + "epoch": 2.28, + "learning_rate": 7.212187439737738e-05, + "loss": 0.7687, + "step": 11920 + }, + { + "epoch": 2.29, + "learning_rate": 7.173619592466414e-05, + "loss": 0.7647, + "step": 11940 + }, + { + "epoch": 2.29, + "learning_rate": 7.135051745195089e-05, + "loss": 0.7608, + "step": 11960 + }, + { + "epoch": 2.3, + "learning_rate": 7.096483897923764e-05, + "loss": 0.7624, + "step": 11980 + }, + { + "epoch": 2.3, + "learning_rate": 7.057916050652439e-05, + "loss": 0.7651, + "step": 12000 + }, + { + "epoch": 2.3, + "eval_loss": 0.7917994856834412, + "eval_runtime": 16.5312, + "eval_samples_per_second": 120.983, + "eval_steps_per_second": 1.936, + "step": 12000 + }, + { + "epoch": 2.3, + "learning_rate": 7.019348203381114e-05, + "loss": 0.7678, + "step": 12020 + }, + { + "epoch": 2.31, + "learning_rate": 6.980780356109789e-05, + "loss": 0.7606, + "step": 12040 + }, + { + "epoch": 2.31, + "learning_rate": 6.942212508838465e-05, + "loss": 0.7607, + "step": 12060 + }, + { + "epoch": 2.31, + "learning_rate": 6.90364466156714e-05, + "loss": 0.763, + "step": 12080 + }, + { + "epoch": 2.32, + "learning_rate": 6.865076814295815e-05, + "loss": 0.7669, + "step": 12100 + }, + { + "epoch": 2.32, + "learning_rate": 6.82650896702449e-05, + "loss": 0.755, + "step": 12120 + }, + { + "epoch": 2.33, + "learning_rate": 6.787941119753165e-05, + "loss": 0.7611, + "step": 12140 + }, + { + "epoch": 2.33, + "learning_rate": 6.74937327248184e-05, + "loss": 0.7576, + "step": 12160 + }, + { + "epoch": 2.33, + "learning_rate": 6.710805425210516e-05, + "loss": 0.7581, + "step": 12180 + }, + { + "epoch": 2.34, + "learning_rate": 6.672237577939191e-05, + "loss": 0.7647, + "step": 12200 + }, + { + "epoch": 2.34, + "eval_loss": 0.7914180755615234, + "eval_runtime": 16.5021, + "eval_samples_per_second": 121.196, + "eval_steps_per_second": 1.939, + "step": 12200 + }, + { + "epoch": 2.34, + "learning_rate": 6.633669730667866e-05, + "loss": 0.7582, + "step": 12220 + }, + { + "epoch": 2.35, + "learning_rate": 6.595101883396541e-05, + "loss": 0.7531, + "step": 12240 + }, + { + "epoch": 2.35, + "learning_rate": 6.556534036125216e-05, + "loss": 0.7526, + "step": 12260 + }, + { + "epoch": 2.35, + "learning_rate": 6.517966188853891e-05, + "loss": 0.7701, + "step": 12280 + }, + { + "epoch": 2.36, + "learning_rate": 6.479398341582568e-05, + "loss": 0.7662, + "step": 12300 + }, + { + "epoch": 2.36, + "learning_rate": 6.440830494311241e-05, + "loss": 0.7541, + "step": 12320 + }, + { + "epoch": 2.36, + "learning_rate": 6.402262647039918e-05, + "loss": 0.7578, + "step": 12340 + }, + { + "epoch": 2.37, + "learning_rate": 6.363694799768592e-05, + "loss": 0.7569, + "step": 12360 + }, + { + "epoch": 2.37, + "learning_rate": 6.325126952497267e-05, + "loss": 0.7635, + "step": 12380 + }, + { + "epoch": 2.38, + "learning_rate": 6.286559105225942e-05, + "loss": 0.7618, + "step": 12400 + }, + { + "epoch": 2.38, + "eval_loss": 0.7912635207176208, + "eval_runtime": 16.4984, + "eval_samples_per_second": 121.224, + "eval_steps_per_second": 1.94, + "step": 12400 + }, + { + "epoch": 2.38, + "learning_rate": 6.247991257954619e-05, + "loss": 0.7536, + "step": 12420 + }, + { + "epoch": 2.38, + "learning_rate": 6.209423410683292e-05, + "loss": 0.7478, + "step": 12440 + }, + { + "epoch": 2.39, + "learning_rate": 6.170855563411969e-05, + "loss": 0.745, + "step": 12460 + }, + { + "epoch": 2.39, + "learning_rate": 6.132287716140644e-05, + "loss": 0.7611, + "step": 12480 + }, + { + "epoch": 2.4, + "learning_rate": 6.0937198688693187e-05, + "loss": 0.763, + "step": 12500 + }, + { + "epoch": 2.4, + "learning_rate": 6.0551520215979936e-05, + "loss": 0.7647, + "step": 12520 + }, + { + "epoch": 2.4, + "learning_rate": 6.016584174326669e-05, + "loss": 0.7621, + "step": 12540 + }, + { + "epoch": 2.41, + "learning_rate": 5.978016327055344e-05, + "loss": 0.7568, + "step": 12560 + }, + { + "epoch": 2.41, + "learning_rate": 5.93944847978402e-05, + "loss": 0.7613, + "step": 12580 + }, + { + "epoch": 2.41, + "learning_rate": 5.900880632512694e-05, + "loss": 0.7568, + "step": 12600 + }, + { + "epoch": 2.41, + "eval_loss": 0.7910023331642151, + "eval_runtime": 16.5022, + "eval_samples_per_second": 121.196, + "eval_steps_per_second": 1.939, + "step": 12600 + }, + { + "epoch": 2.42, + "learning_rate": 5.86231278524137e-05, + "loss": 0.7636, + "step": 12620 + }, + { + "epoch": 2.42, + "learning_rate": 5.823744937970045e-05, + "loss": 0.7657, + "step": 12640 + }, + { + "epoch": 2.43, + "learning_rate": 5.7851770906987205e-05, + "loss": 0.7703, + "step": 12660 + }, + { + "epoch": 2.43, + "learning_rate": 5.7466092434273955e-05, + "loss": 0.7557, + "step": 12680 + }, + { + "epoch": 2.43, + "learning_rate": 5.708041396156071e-05, + "loss": 0.7667, + "step": 12700 + }, + { + "epoch": 2.44, + "learning_rate": 5.6694735488847454e-05, + "loss": 0.7605, + "step": 12720 + }, + { + "epoch": 2.44, + "learning_rate": 5.630905701613421e-05, + "loss": 0.7549, + "step": 12740 + }, + { + "epoch": 2.44, + "learning_rate": 5.592337854342096e-05, + "loss": 0.7592, + "step": 12760 + }, + { + "epoch": 2.45, + "learning_rate": 5.553770007070772e-05, + "loss": 0.7654, + "step": 12780 + }, + { + "epoch": 2.45, + "learning_rate": 5.5171305521630135e-05, + "loss": 0.7636, + "step": 12800 + }, + { + "epoch": 2.45, + "eval_loss": 0.7906058430671692, + "eval_runtime": 16.5012, + "eval_samples_per_second": 121.203, + "eval_steps_per_second": 1.939, + "step": 12800 + }, + { + "epoch": 2.46, + "learning_rate": 5.478562704891688e-05, + "loss": 0.7629, + "step": 12820 + }, + { + "epoch": 2.46, + "learning_rate": 5.4399948576203634e-05, + "loss": 0.7688, + "step": 12840 + }, + { + "epoch": 2.46, + "learning_rate": 5.4014270103490384e-05, + "loss": 0.7534, + "step": 12860 + }, + { + "epoch": 2.47, + "learning_rate": 5.362859163077714e-05, + "loss": 0.76, + "step": 12880 + }, + { + "epoch": 2.47, + "learning_rate": 5.324291315806389e-05, + "loss": 0.7561, + "step": 12900 + }, + { + "epoch": 2.48, + "learning_rate": 5.285723468535065e-05, + "loss": 0.7562, + "step": 12920 + }, + { + "epoch": 2.48, + "learning_rate": 5.247155621263739e-05, + "loss": 0.7607, + "step": 12940 + }, + { + "epoch": 2.48, + "learning_rate": 5.2085877739924146e-05, + "loss": 0.7612, + "step": 12960 + }, + { + "epoch": 2.49, + "learning_rate": 5.1700199267210896e-05, + "loss": 0.7643, + "step": 12980 + }, + { + "epoch": 2.49, + "learning_rate": 5.131452079449765e-05, + "loss": 0.7656, + "step": 13000 + }, + { + "epoch": 2.49, + "eval_loss": 0.790121853351593, + "eval_runtime": 16.5158, + "eval_samples_per_second": 121.096, + "eval_steps_per_second": 1.938, + "step": 13000 + }, + { + "epoch": 2.49, + "learning_rate": 5.09288423217844e-05, + "loss": 0.756, + "step": 13020 + }, + { + "epoch": 2.5, + "learning_rate": 5.054316384907115e-05, + "loss": 0.7597, + "step": 13040 + }, + { + "epoch": 2.5, + "learning_rate": 5.01574853763579e-05, + "loss": 0.7525, + "step": 13060 + }, + { + "epoch": 2.51, + "learning_rate": 4.977180690364466e-05, + "loss": 0.7565, + "step": 13080 + }, + { + "epoch": 2.51, + "learning_rate": 4.938612843093141e-05, + "loss": 0.7631, + "step": 13100 + }, + { + "epoch": 2.51, + "learning_rate": 4.9000449958218165e-05, + "loss": 0.7514, + "step": 13120 + }, + { + "epoch": 2.52, + "learning_rate": 4.861477148550491e-05, + "loss": 0.7576, + "step": 13140 + }, + { + "epoch": 2.52, + "learning_rate": 4.8229093012791664e-05, + "loss": 0.7539, + "step": 13160 + }, + { + "epoch": 2.53, + "learning_rate": 4.7843414540078414e-05, + "loss": 0.7586, + "step": 13180 + }, + { + "epoch": 2.53, + "learning_rate": 4.745773606736517e-05, + "loss": 0.7573, + "step": 13200 + }, + { + "epoch": 2.53, + "eval_loss": 0.7899668216705322, + "eval_runtime": 16.509, + "eval_samples_per_second": 121.146, + "eval_steps_per_second": 1.938, + "step": 13200 + }, + { + "epoch": 2.53, + "learning_rate": 4.707205759465192e-05, + "loss": 0.7671, + "step": 13220 + }, + { + "epoch": 2.54, + "learning_rate": 4.668637912193868e-05, + "loss": 0.758, + "step": 13240 + }, + { + "epoch": 2.54, + "learning_rate": 4.630070064922542e-05, + "loss": 0.7444, + "step": 13260 + }, + { + "epoch": 2.54, + "learning_rate": 4.5915022176512176e-05, + "loss": 0.7548, + "step": 13280 + }, + { + "epoch": 2.55, + "learning_rate": 4.5529343703798926e-05, + "loss": 0.7681, + "step": 13300 + }, + { + "epoch": 2.55, + "learning_rate": 4.514366523108568e-05, + "loss": 0.7599, + "step": 13320 + }, + { + "epoch": 2.56, + "learning_rate": 4.475798675837243e-05, + "loss": 0.7631, + "step": 13340 + }, + { + "epoch": 2.56, + "learning_rate": 4.437230828565919e-05, + "loss": 0.7565, + "step": 13360 + }, + { + "epoch": 2.56, + "learning_rate": 4.398662981294593e-05, + "loss": 0.7586, + "step": 13380 + }, + { + "epoch": 2.57, + "learning_rate": 4.360095134023269e-05, + "loss": 0.7526, + "step": 13400 + }, + { + "epoch": 2.57, + "eval_loss": 0.7896500825881958, + "eval_runtime": 16.5086, + "eval_samples_per_second": 121.149, + "eval_steps_per_second": 1.938, + "step": 13400 + }, + { + "epoch": 2.57, + "learning_rate": 4.321527286751944e-05, + "loss": 0.7591, + "step": 13420 + }, + { + "epoch": 2.58, + "learning_rate": 4.2829594394806195e-05, + "loss": 0.7645, + "step": 13440 + }, + { + "epoch": 2.58, + "learning_rate": 4.2443915922092944e-05, + "loss": 0.7532, + "step": 13460 + }, + { + "epoch": 2.58, + "learning_rate": 4.2077521373015355e-05, + "loss": 0.746, + "step": 13480 + }, + { + "epoch": 2.59, + "learning_rate": 4.169184290030211e-05, + "loss": 0.7534, + "step": 13500 + }, + { + "epoch": 2.59, + "learning_rate": 4.130616442758886e-05, + "loss": 0.7506, + "step": 13520 + }, + { + "epoch": 2.59, + "learning_rate": 4.092048595487562e-05, + "loss": 0.7535, + "step": 13540 + }, + { + "epoch": 2.6, + "learning_rate": 4.053480748216237e-05, + "loss": 0.7596, + "step": 13560 + }, + { + "epoch": 2.6, + "learning_rate": 4.0149129009449124e-05, + "loss": 0.7686, + "step": 13580 + }, + { + "epoch": 2.61, + "learning_rate": 3.976345053673587e-05, + "loss": 0.7537, + "step": 13600 + }, + { + "epoch": 2.61, + "eval_loss": 0.7891342639923096, + "eval_runtime": 16.5163, + "eval_samples_per_second": 121.092, + "eval_steps_per_second": 1.937, + "step": 13600 + }, + { + "epoch": 2.61, + "learning_rate": 3.9377772064022624e-05, + "loss": 0.7656, + "step": 13620 + }, + { + "epoch": 2.61, + "learning_rate": 3.8992093591309374e-05, + "loss": 0.7515, + "step": 13640 + }, + { + "epoch": 2.62, + "learning_rate": 3.860641511859613e-05, + "loss": 0.761, + "step": 13660 + }, + { + "epoch": 2.62, + "learning_rate": 3.822073664588288e-05, + "loss": 0.7648, + "step": 13680 + }, + { + "epoch": 2.63, + "learning_rate": 3.783505817316963e-05, + "loss": 0.7671, + "step": 13700 + }, + { + "epoch": 2.63, + "learning_rate": 3.7449379700456386e-05, + "loss": 0.7653, + "step": 13720 + }, + { + "epoch": 2.63, + "learning_rate": 3.7063701227743136e-05, + "loss": 0.7583, + "step": 13740 + }, + { + "epoch": 2.64, + "learning_rate": 3.6678022755029886e-05, + "loss": 0.7602, + "step": 13760 + }, + { + "epoch": 2.64, + "learning_rate": 3.629234428231664e-05, + "loss": 0.7626, + "step": 13780 + }, + { + "epoch": 2.64, + "learning_rate": 3.590666580960339e-05, + "loss": 0.7485, + "step": 13800 + }, + { + "epoch": 2.64, + "eval_loss": 0.7891269326210022, + "eval_runtime": 16.5041, + "eval_samples_per_second": 121.182, + "eval_steps_per_second": 1.939, + "step": 13800 + }, + { + "epoch": 2.65, + "learning_rate": 3.552098733689014e-05, + "loss": 0.7564, + "step": 13820 + }, + { + "epoch": 2.65, + "learning_rate": 3.51353088641769e-05, + "loss": 0.7603, + "step": 13840 + }, + { + "epoch": 2.66, + "learning_rate": 3.474963039146365e-05, + "loss": 0.7584, + "step": 13860 + }, + { + "epoch": 2.66, + "learning_rate": 3.43639519187504e-05, + "loss": 0.7608, + "step": 13880 + }, + { + "epoch": 2.66, + "learning_rate": 3.3978273446037154e-05, + "loss": 0.7535, + "step": 13900 + }, + { + "epoch": 2.67, + "learning_rate": 3.3592594973323904e-05, + "loss": 0.7614, + "step": 13920 + }, + { + "epoch": 2.67, + "learning_rate": 3.3206916500610654e-05, + "loss": 0.7654, + "step": 13940 + }, + { + "epoch": 2.67, + "learning_rate": 3.282123802789741e-05, + "loss": 0.7656, + "step": 13960 + }, + { + "epoch": 2.68, + "learning_rate": 3.243555955518416e-05, + "loss": 0.756, + "step": 13980 + }, + { + "epoch": 2.68, + "learning_rate": 3.204988108247091e-05, + "loss": 0.7653, + "step": 14000 + }, + { + "epoch": 2.68, + "eval_loss": 0.7888805866241455, + "eval_runtime": 16.5275, + "eval_samples_per_second": 121.01, + "eval_steps_per_second": 1.936, + "step": 14000 + }, + { + "epoch": 2.69, + "learning_rate": 3.1664202609757666e-05, + "loss": 0.7618, + "step": 14020 + }, + { + "epoch": 2.69, + "learning_rate": 3.1278524137044416e-05, + "loss": 0.7588, + "step": 14040 + }, + { + "epoch": 2.69, + "learning_rate": 3.0892845664331166e-05, + "loss": 0.7625, + "step": 14060 + }, + { + "epoch": 2.7, + "learning_rate": 3.050716719161792e-05, + "loss": 0.7532, + "step": 14080 + }, + { + "epoch": 2.7, + "learning_rate": 3.0121488718904672e-05, + "loss": 0.7538, + "step": 14100 + }, + { + "epoch": 2.71, + "learning_rate": 2.9735810246191422e-05, + "loss": 0.7531, + "step": 14120 + }, + { + "epoch": 2.71, + "learning_rate": 2.9350131773478175e-05, + "loss": 0.7551, + "step": 14140 + }, + { + "epoch": 2.71, + "learning_rate": 2.8964453300764928e-05, + "loss": 0.7479, + "step": 14160 + }, + { + "epoch": 2.72, + "learning_rate": 2.8578774828051678e-05, + "loss": 0.7629, + "step": 14180 + }, + { + "epoch": 2.72, + "learning_rate": 2.819309635533843e-05, + "loss": 0.7572, + "step": 14200 + }, + { + "epoch": 2.72, + "eval_loss": 0.7884878516197205, + "eval_runtime": 16.7595, + "eval_samples_per_second": 119.335, + "eval_steps_per_second": 1.909, + "step": 14200 + }, + { + "epoch": 2.72, + "learning_rate": 2.7807417882625184e-05, + "loss": 0.758, + "step": 14220 + }, + { + "epoch": 2.73, + "learning_rate": 2.7421739409911934e-05, + "loss": 0.7608, + "step": 14240 + }, + { + "epoch": 2.73, + "learning_rate": 2.7036060937198687e-05, + "loss": 0.7555, + "step": 14260 + }, + { + "epoch": 2.74, + "learning_rate": 2.6650382464485437e-05, + "loss": 0.7512, + "step": 14280 + }, + { + "epoch": 2.74, + "learning_rate": 2.626470399177219e-05, + "loss": 0.7488, + "step": 14300 + }, + { + "epoch": 2.74, + "learning_rate": 2.5879025519058943e-05, + "loss": 0.7532, + "step": 14320 + }, + { + "epoch": 2.75, + "learning_rate": 2.5493347046345693e-05, + "loss": 0.7525, + "step": 14340 + }, + { + "epoch": 2.75, + "learning_rate": 2.5107668573632446e-05, + "loss": 0.7662, + "step": 14360 + }, + { + "epoch": 2.76, + "learning_rate": 2.47219901009192e-05, + "loss": 0.7583, + "step": 14380 + }, + { + "epoch": 2.76, + "learning_rate": 2.433631162820595e-05, + "loss": 0.7442, + "step": 14400 + }, + { + "epoch": 2.76, + "eval_loss": 0.7883238196372986, + "eval_runtime": 16.474, + "eval_samples_per_second": 121.403, + "eval_steps_per_second": 1.942, + "step": 14400 + }, + { + "epoch": 2.76, + "learning_rate": 2.3950633155492702e-05, + "loss": 0.7612, + "step": 14420 + }, + { + "epoch": 2.77, + "learning_rate": 2.3564954682779455e-05, + "loss": 0.7571, + "step": 14440 + }, + { + "epoch": 2.77, + "learning_rate": 2.3179276210066205e-05, + "loss": 0.7511, + "step": 14460 + }, + { + "epoch": 2.77, + "learning_rate": 2.2793597737352958e-05, + "loss": 0.7567, + "step": 14480 + }, + { + "epoch": 2.78, + "learning_rate": 2.2407919264639708e-05, + "loss": 0.7555, + "step": 14500 + }, + { + "epoch": 2.78, + "learning_rate": 2.202224079192646e-05, + "loss": 0.7555, + "step": 14520 + }, + { + "epoch": 2.79, + "learning_rate": 2.1636562319213214e-05, + "loss": 0.7509, + "step": 14540 + }, + { + "epoch": 2.79, + "learning_rate": 2.1250883846499964e-05, + "loss": 0.7585, + "step": 14560 + }, + { + "epoch": 2.79, + "learning_rate": 2.0865205373786717e-05, + "loss": 0.7621, + "step": 14580 + }, + { + "epoch": 2.8, + "learning_rate": 2.047952690107347e-05, + "loss": 0.7601, + "step": 14600 + }, + { + "epoch": 2.8, + "eval_loss": 0.7880419492721558, + "eval_runtime": 16.6163, + "eval_samples_per_second": 120.364, + "eval_steps_per_second": 1.926, + "step": 14600 + }, + { + "epoch": 2.8, + "learning_rate": 2.009384842836022e-05, + "loss": 0.7574, + "step": 14620 + }, + { + "epoch": 2.81, + "learning_rate": 1.9708169955646973e-05, + "loss": 0.7538, + "step": 14640 + }, + { + "epoch": 2.81, + "learning_rate": 1.9322491482933726e-05, + "loss": 0.7611, + "step": 14660 + }, + { + "epoch": 2.81, + "learning_rate": 1.8936813010220476e-05, + "loss": 0.7519, + "step": 14680 + }, + { + "epoch": 2.82, + "learning_rate": 1.855113453750723e-05, + "loss": 0.7559, + "step": 14700 + }, + { + "epoch": 2.82, + "learning_rate": 1.8165456064793982e-05, + "loss": 0.7596, + "step": 14720 + }, + { + "epoch": 2.82, + "learning_rate": 1.7779777592080735e-05, + "loss": 0.7564, + "step": 14740 + }, + { + "epoch": 2.83, + "learning_rate": 1.7394099119367485e-05, + "loss": 0.7526, + "step": 14760 + }, + { + "epoch": 2.83, + "learning_rate": 1.7008420646654238e-05, + "loss": 0.7624, + "step": 14780 + }, + { + "epoch": 2.84, + "learning_rate": 1.662274217394099e-05, + "loss": 0.7569, + "step": 14800 + }, + { + "epoch": 2.84, + "eval_loss": 0.7879504561424255, + "eval_runtime": 16.5411, + "eval_samples_per_second": 120.911, + "eval_steps_per_second": 1.935, + "step": 14800 + }, + { + "epoch": 2.84, + "learning_rate": 1.623706370122774e-05, + "loss": 0.7543, + "step": 14820 + }, + { + "epoch": 2.84, + "learning_rate": 1.5851385228514494e-05, + "loss": 0.7533, + "step": 14840 + }, + { + "epoch": 2.85, + "learning_rate": 1.5465706755801247e-05, + "loss": 0.7579, + "step": 14860 + }, + { + "epoch": 2.85, + "learning_rate": 1.5080028283087997e-05, + "loss": 0.7638, + "step": 14880 + }, + { + "epoch": 2.85, + "learning_rate": 1.469434981037475e-05, + "loss": 0.7456, + "step": 14900 + }, + { + "epoch": 2.86, + "learning_rate": 1.4308671337661502e-05, + "loss": 0.7561, + "step": 14920 + }, + { + "epoch": 2.86, + "learning_rate": 1.3922992864948253e-05, + "loss": 0.7626, + "step": 14940 + }, + { + "epoch": 2.87, + "learning_rate": 1.3537314392235005e-05, + "loss": 0.7686, + "step": 14960 + }, + { + "epoch": 2.87, + "learning_rate": 1.3151635919521758e-05, + "loss": 0.7512, + "step": 14980 + }, + { + "epoch": 2.87, + "learning_rate": 1.276595744680851e-05, + "loss": 0.7526, + "step": 15000 + }, + { + "epoch": 2.87, + "eval_loss": 0.7875809073448181, + "eval_runtime": 16.5086, + "eval_samples_per_second": 121.149, + "eval_steps_per_second": 1.938, + "step": 15000 + }, + { + "epoch": 2.88, + "learning_rate": 1.238027897409526e-05, + "loss": 0.7582, + "step": 15020 + }, + { + "epoch": 2.88, + "learning_rate": 1.1994600501382012e-05, + "loss": 0.7584, + "step": 15040 + }, + { + "epoch": 2.89, + "learning_rate": 1.1608922028668765e-05, + "loss": 0.7643, + "step": 15060 + }, + { + "epoch": 2.89, + "learning_rate": 1.1223243555955517e-05, + "loss": 0.7693, + "step": 15080 + }, + { + "epoch": 2.89, + "learning_rate": 1.0837565083242268e-05, + "loss": 0.7591, + "step": 15100 + }, + { + "epoch": 2.9, + "learning_rate": 1.0451886610529021e-05, + "loss": 0.7482, + "step": 15120 + }, + { + "epoch": 2.9, + "learning_rate": 1.0066208137815773e-05, + "loss": 0.7553, + "step": 15140 + }, + { + "epoch": 2.9, + "learning_rate": 9.680529665102524e-06, + "loss": 0.7563, + "step": 15160 + }, + { + "epoch": 2.91, + "learning_rate": 9.294851192389277e-06, + "loss": 0.7639, + "step": 15180 + }, + { + "epoch": 2.91, + "learning_rate": 8.909172719676029e-06, + "loss": 0.7577, + "step": 15200 + }, + { + "epoch": 2.91, + "eval_loss": 0.7872186303138733, + "eval_runtime": 16.5027, + "eval_samples_per_second": 121.192, + "eval_steps_per_second": 1.939, + "step": 15200 + }, + { + "epoch": 2.92, + "learning_rate": 8.523494246962782e-06, + "loss": 0.7566, + "step": 15220 + }, + { + "epoch": 2.92, + "learning_rate": 8.137815774249533e-06, + "loss": 0.7594, + "step": 15240 + }, + { + "epoch": 2.92, + "learning_rate": 7.752137301536285e-06, + "loss": 0.758, + "step": 15260 + }, + { + "epoch": 2.93, + "learning_rate": 7.366458828823037e-06, + "loss": 0.766, + "step": 15280 + }, + { + "epoch": 2.93, + "learning_rate": 6.980780356109789e-06, + "loss": 0.7542, + "step": 15300 + }, + { + "epoch": 2.94, + "learning_rate": 6.595101883396541e-06, + "loss": 0.7643, + "step": 15320 + }, + { + "epoch": 2.94, + "learning_rate": 6.209423410683292e-06, + "loss": 0.7638, + "step": 15340 + }, + { + "epoch": 2.94, + "learning_rate": 5.823744937970045e-06, + "loss": 0.7629, + "step": 15360 + }, + { + "epoch": 2.95, + "learning_rate": 5.438066465256798e-06, + "loss": 0.752, + "step": 15380 + }, + { + "epoch": 2.95, + "learning_rate": 5.052387992543549e-06, + "loss": 0.7565, + "step": 15400 + }, + { + "epoch": 2.95, + "eval_loss": 0.787341296672821, + "eval_runtime": 16.5038, + "eval_samples_per_second": 121.184, + "eval_steps_per_second": 1.939, + "step": 15400 + } + ], + "max_steps": 15657, + "num_train_epochs": 3, + "total_flos": 4.00363440727235e+19, + "trial_name": null, + "trial_params": null +} diff --git a/adapters/saved-alpaca-belle-cot7b/checkpoint-15400/training_args.bin b/adapters/saved-alpaca-belle-cot7b/checkpoint-15400/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..cd8a096e4fd3ba848cec18e7c5691ebcb18ad76b --- /dev/null +++ b/adapters/saved-alpaca-belle-cot7b/checkpoint-15400/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5a580d27270395c94c3ef0dba9604e87b9d9eebe09ad2bc995408d9ab207ebfd +size 3643 diff --git a/adapters/saved-alpaca-belle-cot7b/checkpoint-15600/optimizer.pt b/adapters/saved-alpaca-belle-cot7b/checkpoint-15600/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..12b678699b2441e2be60d42ca719ecd909403658 --- /dev/null +++ b/adapters/saved-alpaca-belle-cot7b/checkpoint-15600/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:62500b73fcc1bf0a11456627f11ad5d82af25c7e1cddd6c7806b4ba8923ac778 +size 33629893 diff --git a/adapters/saved-alpaca-belle-cot7b/checkpoint-15600/pytorch_model.bin b/adapters/saved-alpaca-belle-cot7b/checkpoint-15600/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..ba5eb4d3b4065b534cded6b51ab3ae3eb84b0354 --- /dev/null +++ b/adapters/saved-alpaca-belle-cot7b/checkpoint-15600/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:48329985a16aa1fd8c59a345b3b34b80fc56bde56bf7f2cee81b8ff8305fe294 +size 16822989 diff --git a/adapters/saved-alpaca-belle-cot7b/checkpoint-15600/rng_state_0.pth b/adapters/saved-alpaca-belle-cot7b/checkpoint-15600/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..63fbd65fff603be18a242e438b4a1fa5986118bf --- /dev/null +++ b/adapters/saved-alpaca-belle-cot7b/checkpoint-15600/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ca169d2828cb6173fed1ef165fcb002274ef4682c336d5550dfc04127e1c2d93 +size 14583 diff --git a/adapters/saved-alpaca-belle-cot7b/checkpoint-15600/rng_state_1.pth b/adapters/saved-alpaca-belle-cot7b/checkpoint-15600/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..f4d3c6862eb3404b1c68c558d615daf40d502b43 --- /dev/null +++ b/adapters/saved-alpaca-belle-cot7b/checkpoint-15600/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:443e30212f68bc2281c60ebbc6f8cef22efe33ff9b9d2c502c63b911e4801b5a +size 14583 diff --git a/adapters/saved-alpaca-belle-cot7b/checkpoint-15600/rng_state_2.pth b/adapters/saved-alpaca-belle-cot7b/checkpoint-15600/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..3979bddccde9dae271fd6a14a112bfc93e2fe9b9 --- /dev/null +++ b/adapters/saved-alpaca-belle-cot7b/checkpoint-15600/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9c0afdda7fc89ff3316c50694c98806c17e5d5d205377e1a16201837d045133e +size 14583 diff --git a/adapters/saved-alpaca-belle-cot7b/checkpoint-15600/rng_state_3.pth b/adapters/saved-alpaca-belle-cot7b/checkpoint-15600/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..57137e1d41f7b4672c9d41e5e64d50f6fcad9ab6 --- /dev/null +++ b/adapters/saved-alpaca-belle-cot7b/checkpoint-15600/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4b4d5fe2eed52c9346c8b9e5ac4eb790414eba13428a1115e5ec439b422bf4ee +size 14583 diff --git a/adapters/saved-alpaca-belle-cot7b/checkpoint-15600/rng_state_4.pth b/adapters/saved-alpaca-belle-cot7b/checkpoint-15600/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..c25b2d4b1e7aa8d32c4f37050f00784edf7a14fc --- /dev/null +++ b/adapters/saved-alpaca-belle-cot7b/checkpoint-15600/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a38c9be70c62f943cb9aa7f628cf962b3b046c6baa8dfac261200f58c4c689d6 +size 14583 diff --git a/adapters/saved-alpaca-belle-cot7b/checkpoint-15600/rng_state_5.pth b/adapters/saved-alpaca-belle-cot7b/checkpoint-15600/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..18de711c445e2e401afe99d842449e3d72b8b60c --- /dev/null +++ b/adapters/saved-alpaca-belle-cot7b/checkpoint-15600/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:15dbb837856817f22997fa723ed226703e4eeda1cfeb7a58f627771920ec3bbf +size 14583 diff --git a/adapters/saved-alpaca-belle-cot7b/checkpoint-15600/rng_state_6.pth b/adapters/saved-alpaca-belle-cot7b/checkpoint-15600/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..94a0d83518470bb00b7b6de6c819c853461b6d25 --- /dev/null +++ b/adapters/saved-alpaca-belle-cot7b/checkpoint-15600/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c31e696841f434096a22cef6f54870f135b85c94c9c1ade44387a6097c036267 +size 14583 diff --git a/adapters/saved-alpaca-belle-cot7b/checkpoint-15600/rng_state_7.pth b/adapters/saved-alpaca-belle-cot7b/checkpoint-15600/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..89cb3d373d7733ecb55d584bd537fb2fb54d43be --- /dev/null +++ b/adapters/saved-alpaca-belle-cot7b/checkpoint-15600/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ae732a06022e376843182d541c37ce5221fd130b56f726ad56f3fc9f3be37683 +size 14583 diff --git a/adapters/saved-alpaca-belle-cot7b/checkpoint-15600/scaler.pt b/adapters/saved-alpaca-belle-cot7b/checkpoint-15600/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..3a22e3f5c15aeb7f4f9296181632d02537d5b778 --- /dev/null +++ b/adapters/saved-alpaca-belle-cot7b/checkpoint-15600/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:44ab08260e199bcd3afa09da1f61f970b4e5d1d1bf1aaf0e120a0f25a9818ff0 +size 557 diff --git a/adapters/saved-alpaca-belle-cot7b/checkpoint-15600/scheduler.pt b/adapters/saved-alpaca-belle-cot7b/checkpoint-15600/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..47160cb81a8bce5e4e8cc7d19954bc3f572a738b --- /dev/null +++ b/adapters/saved-alpaca-belle-cot7b/checkpoint-15600/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4f226e70722083a6f120d03c8de3df770a437c3c8dd19c077e5af09c5008aee3 +size 627 diff --git a/adapters/saved-alpaca-belle-cot7b/checkpoint-15600/trainer_state.json b/adapters/saved-alpaca-belle-cot7b/checkpoint-15600/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..bf5b83e9a48beca95f485ac0efa275946815f01d --- /dev/null +++ b/adapters/saved-alpaca-belle-cot7b/checkpoint-15600/trainer_state.json @@ -0,0 +1,5320 @@ +{ + "best_metric": 0.7870123386383057, + "best_model_checkpoint": "/mnt/bn/qingyi-bn-lq/llama/saved-alpaca-belle-cot7b/checkpoint-15600", + "epoch": 2.989078367503353, + "global_step": 15600, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 5.9999999999999995e-05, + "loss": 1.8213, + "step": 20 + }, + { + "epoch": 0.01, + "learning_rate": 0.00011999999999999999, + "loss": 1.5494, + "step": 40 + }, + { + "epoch": 0.01, + "learning_rate": 0.00017999999999999998, + "loss": 1.179, + "step": 60 + }, + { + "epoch": 0.02, + "learning_rate": 0.00023999999999999998, + "loss": 1.1022, + "step": 80 + }, + { + "epoch": 0.02, + "learning_rate": 0.0003, + "loss": 1.078, + "step": 100 + }, + { + "epoch": 0.02, + "learning_rate": 0.00029961432152728675, + "loss": 1.0347, + "step": 120 + }, + { + "epoch": 0.03, + "learning_rate": 0.0002992286430545735, + "loss": 1.0169, + "step": 140 + }, + { + "epoch": 0.03, + "learning_rate": 0.00029884296458186025, + "loss": 1.0088, + "step": 160 + }, + { + "epoch": 0.03, + "learning_rate": 0.00029845728610914697, + "loss": 0.9896, + "step": 180 + }, + { + "epoch": 0.04, + "learning_rate": 0.00029807160763643375, + "loss": 0.99, + "step": 200 + }, + { + "epoch": 0.04, + "eval_loss": 1.0032634735107422, + "eval_runtime": 16.3952, + "eval_samples_per_second": 121.987, + "eval_steps_per_second": 1.952, + "step": 200 + }, + { + "epoch": 0.04, + "learning_rate": 0.00029768592916372047, + "loss": 0.9724, + "step": 220 + }, + { + "epoch": 0.05, + "learning_rate": 0.00029730025069100725, + "loss": 0.9719, + "step": 240 + }, + { + "epoch": 0.05, + "learning_rate": 0.00029691457221829397, + "loss": 0.9652, + "step": 260 + }, + { + "epoch": 0.05, + "learning_rate": 0.00029652889374558075, + "loss": 0.9579, + "step": 280 + }, + { + "epoch": 0.06, + "learning_rate": 0.00029614321527286747, + "loss": 0.9532, + "step": 300 + }, + { + "epoch": 0.06, + "learning_rate": 0.00029575753680015425, + "loss": 0.9613, + "step": 320 + }, + { + "epoch": 0.07, + "learning_rate": 0.000295371858327441, + "loss": 0.9473, + "step": 340 + }, + { + "epoch": 0.07, + "learning_rate": 0.00029498617985472775, + "loss": 0.9416, + "step": 360 + }, + { + "epoch": 0.07, + "learning_rate": 0.0002946005013820145, + "loss": 0.9386, + "step": 380 + }, + { + "epoch": 0.08, + "learning_rate": 0.00029421482290930125, + "loss": 0.9338, + "step": 400 + }, + { + "epoch": 0.08, + "eval_loss": 0.957970380783081, + "eval_runtime": 16.3897, + "eval_samples_per_second": 122.028, + "eval_steps_per_second": 1.952, + "step": 400 + }, + { + "epoch": 0.08, + "learning_rate": 0.000293829144436588, + "loss": 0.937, + "step": 420 + }, + { + "epoch": 0.08, + "learning_rate": 0.00029344346596387475, + "loss": 0.9304, + "step": 440 + }, + { + "epoch": 0.09, + "learning_rate": 0.0002930577874911615, + "loss": 0.9323, + "step": 460 + }, + { + "epoch": 0.09, + "learning_rate": 0.00029267210901844825, + "loss": 0.9185, + "step": 480 + }, + { + "epoch": 0.1, + "learning_rate": 0.000292286430545735, + "loss": 0.9273, + "step": 500 + }, + { + "epoch": 0.1, + "learning_rate": 0.00029190075207302175, + "loss": 0.922, + "step": 520 + }, + { + "epoch": 0.1, + "learning_rate": 0.0002915150736003085, + "loss": 0.9146, + "step": 540 + }, + { + "epoch": 0.11, + "learning_rate": 0.00029112939512759525, + "loss": 0.9129, + "step": 560 + }, + { + "epoch": 0.11, + "learning_rate": 0.000290743716654882, + "loss": 0.9146, + "step": 580 + }, + { + "epoch": 0.11, + "learning_rate": 0.0002903580381821688, + "loss": 0.9078, + "step": 600 + }, + { + "epoch": 0.11, + "eval_loss": 0.9345074892044067, + "eval_runtime": 16.4049, + "eval_samples_per_second": 121.914, + "eval_steps_per_second": 1.951, + "step": 600 + }, + { + "epoch": 0.12, + "learning_rate": 0.0002899723597094555, + "loss": 0.9004, + "step": 620 + }, + { + "epoch": 0.12, + "learning_rate": 0.0002895866812367423, + "loss": 0.9042, + "step": 640 + }, + { + "epoch": 0.13, + "learning_rate": 0.000289201002764029, + "loss": 0.9028, + "step": 660 + }, + { + "epoch": 0.13, + "learning_rate": 0.0002888153242913158, + "loss": 0.8889, + "step": 680 + }, + { + "epoch": 0.13, + "learning_rate": 0.0002884296458186025, + "loss": 0.8935, + "step": 700 + }, + { + "epoch": 0.14, + "learning_rate": 0.0002880439673458893, + "loss": 0.9024, + "step": 720 + }, + { + "epoch": 0.14, + "learning_rate": 0.000287658288873176, + "loss": 0.8922, + "step": 740 + }, + { + "epoch": 0.15, + "learning_rate": 0.0002872726104004628, + "loss": 0.8896, + "step": 760 + }, + { + "epoch": 0.15, + "learning_rate": 0.0002868869319277495, + "loss": 0.8907, + "step": 780 + }, + { + "epoch": 0.15, + "learning_rate": 0.0002865012534550363, + "loss": 0.8922, + "step": 800 + }, + { + "epoch": 0.15, + "eval_loss": 0.9149895310401917, + "eval_runtime": 16.4499, + "eval_samples_per_second": 121.581, + "eval_steps_per_second": 1.945, + "step": 800 + }, + { + "epoch": 0.16, + "learning_rate": 0.0002861155749823231, + "loss": 0.8867, + "step": 820 + }, + { + "epoch": 0.16, + "learning_rate": 0.0002857298965096098, + "loss": 0.891, + "step": 840 + }, + { + "epoch": 0.16, + "learning_rate": 0.00028534421803689657, + "loss": 0.8882, + "step": 860 + }, + { + "epoch": 0.17, + "learning_rate": 0.0002849585395641833, + "loss": 0.8835, + "step": 880 + }, + { + "epoch": 0.17, + "learning_rate": 0.00028457286109147007, + "loss": 0.8798, + "step": 900 + }, + { + "epoch": 0.18, + "learning_rate": 0.0002841871826187568, + "loss": 0.8784, + "step": 920 + }, + { + "epoch": 0.18, + "learning_rate": 0.00028380150414604357, + "loss": 0.8841, + "step": 940 + }, + { + "epoch": 0.18, + "learning_rate": 0.0002834158256733303, + "loss": 0.8787, + "step": 960 + }, + { + "epoch": 0.19, + "learning_rate": 0.00028303014720061707, + "loss": 0.8693, + "step": 980 + }, + { + "epoch": 0.19, + "learning_rate": 0.0002826444687279038, + "loss": 0.8711, + "step": 1000 + }, + { + "epoch": 0.19, + "eval_loss": 0.9027432799339294, + "eval_runtime": 16.447, + "eval_samples_per_second": 121.603, + "eval_steps_per_second": 1.946, + "step": 1000 + }, + { + "epoch": 0.2, + "learning_rate": 0.00028225879025519057, + "loss": 0.876, + "step": 1020 + }, + { + "epoch": 0.2, + "learning_rate": 0.0002818731117824773, + "loss": 0.8749, + "step": 1040 + }, + { + "epoch": 0.2, + "learning_rate": 0.00028148743330976407, + "loss": 0.877, + "step": 1060 + }, + { + "epoch": 0.21, + "learning_rate": 0.00028110175483705085, + "loss": 0.8754, + "step": 1080 + }, + { + "epoch": 0.21, + "learning_rate": 0.00028071607636433757, + "loss": 0.8792, + "step": 1100 + }, + { + "epoch": 0.21, + "learning_rate": 0.00028033039789162435, + "loss": 0.8701, + "step": 1120 + }, + { + "epoch": 0.22, + "learning_rate": 0.00027994471941891107, + "loss": 0.8667, + "step": 1140 + }, + { + "epoch": 0.22, + "learning_rate": 0.00027955904094619785, + "loss": 0.8769, + "step": 1160 + }, + { + "epoch": 0.23, + "learning_rate": 0.00027917336247348457, + "loss": 0.8734, + "step": 1180 + }, + { + "epoch": 0.23, + "learning_rate": 0.0002787876840007713, + "loss": 0.8708, + "step": 1200 + }, + { + "epoch": 0.23, + "eval_loss": 0.8911536335945129, + "eval_runtime": 16.423, + "eval_samples_per_second": 121.78, + "eval_steps_per_second": 1.948, + "step": 1200 + }, + { + "epoch": 0.23, + "learning_rate": 0.00027840200552805807, + "loss": 0.8673, + "step": 1220 + }, + { + "epoch": 0.24, + "learning_rate": 0.00027801632705534485, + "loss": 0.8618, + "step": 1240 + }, + { + "epoch": 0.24, + "learning_rate": 0.00027763064858263157, + "loss": 0.8739, + "step": 1260 + }, + { + "epoch": 0.25, + "learning_rate": 0.00027724497010991834, + "loss": 0.8608, + "step": 1280 + }, + { + "epoch": 0.25, + "learning_rate": 0.0002768592916372051, + "loss": 0.8631, + "step": 1300 + }, + { + "epoch": 0.25, + "learning_rate": 0.00027647361316449184, + "loss": 0.8547, + "step": 1320 + }, + { + "epoch": 0.26, + "learning_rate": 0.0002760879346917786, + "loss": 0.8589, + "step": 1340 + }, + { + "epoch": 0.26, + "learning_rate": 0.00027570225621906534, + "loss": 0.8615, + "step": 1360 + }, + { + "epoch": 0.26, + "learning_rate": 0.0002753165777463521, + "loss": 0.8644, + "step": 1380 + }, + { + "epoch": 0.27, + "learning_rate": 0.00027493089927363884, + "loss": 0.8524, + "step": 1400 + }, + { + "epoch": 0.27, + "eval_loss": 0.8813066482543945, + "eval_runtime": 16.4628, + "eval_samples_per_second": 121.486, + "eval_steps_per_second": 1.944, + "step": 1400 + }, + { + "epoch": 0.27, + "learning_rate": 0.00027454522080092557, + "loss": 0.8562, + "step": 1420 + }, + { + "epoch": 0.28, + "learning_rate": 0.00027415954232821234, + "loss": 0.8547, + "step": 1440 + }, + { + "epoch": 0.28, + "learning_rate": 0.0002737738638554991, + "loss": 0.8599, + "step": 1460 + }, + { + "epoch": 0.28, + "learning_rate": 0.00027338818538278584, + "loss": 0.8491, + "step": 1480 + }, + { + "epoch": 0.29, + "learning_rate": 0.0002730025069100726, + "loss": 0.8496, + "step": 1500 + }, + { + "epoch": 0.29, + "learning_rate": 0.0002726168284373594, + "loss": 0.8594, + "step": 1520 + }, + { + "epoch": 0.3, + "learning_rate": 0.0002722311499646461, + "loss": 0.8512, + "step": 1540 + }, + { + "epoch": 0.3, + "learning_rate": 0.0002718454714919329, + "loss": 0.8441, + "step": 1560 + }, + { + "epoch": 0.3, + "learning_rate": 0.0002714597930192196, + "loss": 0.8621, + "step": 1580 + }, + { + "epoch": 0.31, + "learning_rate": 0.0002710741145465064, + "loss": 0.8525, + "step": 1600 + }, + { + "epoch": 0.31, + "eval_loss": 0.8728711009025574, + "eval_runtime": 16.4389, + "eval_samples_per_second": 121.663, + "eval_steps_per_second": 1.947, + "step": 1600 + }, + { + "epoch": 0.31, + "learning_rate": 0.0002706884360737931, + "loss": 0.852, + "step": 1620 + }, + { + "epoch": 0.31, + "learning_rate": 0.00027030275760107984, + "loss": 0.8553, + "step": 1640 + }, + { + "epoch": 0.32, + "learning_rate": 0.0002699170791283666, + "loss": 0.8445, + "step": 1660 + }, + { + "epoch": 0.32, + "learning_rate": 0.0002695314006556534, + "loss": 0.8518, + "step": 1680 + }, + { + "epoch": 0.33, + "learning_rate": 0.0002691457221829401, + "loss": 0.8318, + "step": 1700 + }, + { + "epoch": 0.33, + "learning_rate": 0.0002687600437102269, + "loss": 0.8492, + "step": 1720 + }, + { + "epoch": 0.33, + "learning_rate": 0.0002683743652375136, + "loss": 0.8475, + "step": 1740 + }, + { + "epoch": 0.34, + "learning_rate": 0.0002679886867648004, + "loss": 0.8437, + "step": 1760 + }, + { + "epoch": 0.34, + "learning_rate": 0.00026760300829208717, + "loss": 0.8355, + "step": 1780 + }, + { + "epoch": 0.34, + "learning_rate": 0.0002672173298193739, + "loss": 0.8486, + "step": 1800 + }, + { + "epoch": 0.34, + "eval_loss": 0.8663893938064575, + "eval_runtime": 16.4511, + "eval_samples_per_second": 121.572, + "eval_steps_per_second": 1.945, + "step": 1800 + }, + { + "epoch": 0.35, + "learning_rate": 0.00026683165134666067, + "loss": 0.8449, + "step": 1820 + }, + { + "epoch": 0.35, + "learning_rate": 0.0002664459728739474, + "loss": 0.853, + "step": 1840 + }, + { + "epoch": 0.36, + "learning_rate": 0.00026606029440123417, + "loss": 0.8472, + "step": 1860 + }, + { + "epoch": 0.36, + "learning_rate": 0.0002656746159285209, + "loss": 0.83, + "step": 1880 + }, + { + "epoch": 0.36, + "learning_rate": 0.0002652889374558076, + "loss": 0.8398, + "step": 1900 + }, + { + "epoch": 0.37, + "learning_rate": 0.0002649032589830944, + "loss": 0.8337, + "step": 1920 + }, + { + "epoch": 0.37, + "learning_rate": 0.00026451758051038117, + "loss": 0.8314, + "step": 1940 + }, + { + "epoch": 0.38, + "learning_rate": 0.0002641319020376679, + "loss": 0.8314, + "step": 1960 + }, + { + "epoch": 0.38, + "learning_rate": 0.00026374622356495467, + "loss": 0.845, + "step": 1980 + }, + { + "epoch": 0.38, + "learning_rate": 0.00026336054509224144, + "loss": 0.8294, + "step": 2000 + }, + { + "epoch": 0.38, + "eval_loss": 0.8619188666343689, + "eval_runtime": 16.4444, + "eval_samples_per_second": 121.622, + "eval_steps_per_second": 1.946, + "step": 2000 + }, + { + "epoch": 0.39, + "learning_rate": 0.00026297486661952817, + "loss": 0.8404, + "step": 2020 + }, + { + "epoch": 0.39, + "learning_rate": 0.00026258918814681494, + "loss": 0.839, + "step": 2040 + }, + { + "epoch": 0.39, + "learning_rate": 0.00026220350967410167, + "loss": 0.84, + "step": 2060 + }, + { + "epoch": 0.4, + "learning_rate": 0.00026181783120138844, + "loss": 0.8442, + "step": 2080 + }, + { + "epoch": 0.4, + "learning_rate": 0.00026143215272867517, + "loss": 0.8443, + "step": 2100 + }, + { + "epoch": 0.41, + "learning_rate": 0.0002610464742559619, + "loss": 0.8301, + "step": 2120 + }, + { + "epoch": 0.41, + "learning_rate": 0.00026066079578324867, + "loss": 0.8302, + "step": 2140 + }, + { + "epoch": 0.41, + "learning_rate": 0.00026027511731053544, + "loss": 0.836, + "step": 2160 + }, + { + "epoch": 0.42, + "learning_rate": 0.00025988943883782216, + "loss": 0.8277, + "step": 2180 + }, + { + "epoch": 0.42, + "learning_rate": 0.00025950376036510894, + "loss": 0.8335, + "step": 2200 + }, + { + "epoch": 0.42, + "eval_loss": 0.8562669157981873, + "eval_runtime": 16.4486, + "eval_samples_per_second": 121.591, + "eval_steps_per_second": 1.945, + "step": 2200 + }, + { + "epoch": 0.43, + "learning_rate": 0.00025911808189239566, + "loss": 0.8267, + "step": 2220 + }, + { + "epoch": 0.43, + "learning_rate": 0.00025873240341968244, + "loss": 0.8267, + "step": 2240 + }, + { + "epoch": 0.43, + "learning_rate": 0.0002583467249469692, + "loss": 0.8293, + "step": 2260 + }, + { + "epoch": 0.44, + "learning_rate": 0.00025796104647425594, + "loss": 0.836, + "step": 2280 + }, + { + "epoch": 0.44, + "learning_rate": 0.0002575753680015427, + "loss": 0.8255, + "step": 2300 + }, + { + "epoch": 0.44, + "learning_rate": 0.00025718968952882944, + "loss": 0.8177, + "step": 2320 + }, + { + "epoch": 0.45, + "learning_rate": 0.00025680401105611616, + "loss": 0.8272, + "step": 2340 + }, + { + "epoch": 0.45, + "learning_rate": 0.00025641833258340294, + "loss": 0.831, + "step": 2360 + }, + { + "epoch": 0.46, + "learning_rate": 0.00025603265411068966, + "loss": 0.819, + "step": 2380 + }, + { + "epoch": 0.46, + "learning_rate": 0.00025564697563797644, + "loss": 0.8216, + "step": 2400 + }, + { + "epoch": 0.46, + "eval_loss": 0.8516544103622437, + "eval_runtime": 16.476, + "eval_samples_per_second": 121.389, + "eval_steps_per_second": 1.942, + "step": 2400 + }, + { + "epoch": 0.46, + "learning_rate": 0.0002552612971652632, + "loss": 0.8305, + "step": 2420 + }, + { + "epoch": 0.47, + "learning_rate": 0.00025487561869254994, + "loss": 0.8305, + "step": 2440 + }, + { + "epoch": 0.47, + "learning_rate": 0.0002544899402198367, + "loss": 0.8302, + "step": 2460 + }, + { + "epoch": 0.48, + "learning_rate": 0.0002541042617471235, + "loss": 0.824, + "step": 2480 + }, + { + "epoch": 0.48, + "learning_rate": 0.0002537185832744102, + "loss": 0.8315, + "step": 2500 + }, + { + "epoch": 0.48, + "learning_rate": 0.000253332904801697, + "loss": 0.8224, + "step": 2520 + }, + { + "epoch": 0.49, + "learning_rate": 0.0002529472263289837, + "loss": 0.8229, + "step": 2540 + }, + { + "epoch": 0.49, + "learning_rate": 0.00025256154785627044, + "loss": 0.8156, + "step": 2560 + }, + { + "epoch": 0.49, + "learning_rate": 0.0002521758693835572, + "loss": 0.8319, + "step": 2580 + }, + { + "epoch": 0.5, + "learning_rate": 0.00025179019091084394, + "loss": 0.8222, + "step": 2600 + }, + { + "epoch": 0.5, + "eval_loss": 0.8481459021568298, + "eval_runtime": 16.453, + "eval_samples_per_second": 121.558, + "eval_steps_per_second": 1.945, + "step": 2600 + }, + { + "epoch": 0.5, + "learning_rate": 0.0002514045124381307, + "loss": 0.8205, + "step": 2620 + }, + { + "epoch": 0.51, + "learning_rate": 0.0002510188339654175, + "loss": 0.8267, + "step": 2640 + }, + { + "epoch": 0.51, + "learning_rate": 0.0002506331554927042, + "loss": 0.8116, + "step": 2660 + }, + { + "epoch": 0.51, + "learning_rate": 0.000250247477019991, + "loss": 0.8239, + "step": 2680 + }, + { + "epoch": 0.52, + "learning_rate": 0.00024986179854727777, + "loss": 0.8126, + "step": 2700 + }, + { + "epoch": 0.52, + "learning_rate": 0.0002494761200745645, + "loss": 0.8226, + "step": 2720 + }, + { + "epoch": 0.53, + "learning_rate": 0.00024909044160185127, + "loss": 0.8173, + "step": 2740 + }, + { + "epoch": 0.53, + "learning_rate": 0.000248704763129138, + "loss": 0.8227, + "step": 2760 + }, + { + "epoch": 0.53, + "learning_rate": 0.0002483190846564247, + "loss": 0.8129, + "step": 2780 + }, + { + "epoch": 0.54, + "learning_rate": 0.0002479334061837115, + "loss": 0.8164, + "step": 2800 + }, + { + "epoch": 0.54, + "eval_loss": 0.8439643979072571, + "eval_runtime": 16.4767, + "eval_samples_per_second": 121.384, + "eval_steps_per_second": 1.942, + "step": 2800 + }, + { + "epoch": 0.54, + "learning_rate": 0.0002475477277109982, + "loss": 0.807, + "step": 2820 + }, + { + "epoch": 0.54, + "learning_rate": 0.000247162049238285, + "loss": 0.8126, + "step": 2840 + }, + { + "epoch": 0.55, + "learning_rate": 0.00024677637076557176, + "loss": 0.8193, + "step": 2860 + }, + { + "epoch": 0.55, + "learning_rate": 0.0002463906922928585, + "loss": 0.8091, + "step": 2880 + }, + { + "epoch": 0.56, + "learning_rate": 0.00024600501382014526, + "loss": 0.8147, + "step": 2900 + }, + { + "epoch": 0.56, + "learning_rate": 0.000245619335347432, + "loss": 0.8207, + "step": 2920 + }, + { + "epoch": 0.56, + "learning_rate": 0.00024523365687471876, + "loss": 0.8087, + "step": 2940 + }, + { + "epoch": 0.57, + "learning_rate": 0.00024484797840200554, + "loss": 0.8198, + "step": 2960 + }, + { + "epoch": 0.57, + "learning_rate": 0.00024446229992929226, + "loss": 0.8087, + "step": 2980 + }, + { + "epoch": 0.57, + "learning_rate": 0.000244076621456579, + "loss": 0.8182, + "step": 3000 + }, + { + "epoch": 0.57, + "eval_loss": 0.8408891558647156, + "eval_runtime": 16.4801, + "eval_samples_per_second": 121.358, + "eval_steps_per_second": 1.942, + "step": 3000 + }, + { + "epoch": 0.58, + "learning_rate": 0.0002436909429838658, + "loss": 0.8188, + "step": 3020 + }, + { + "epoch": 0.58, + "learning_rate": 0.0002433052645111525, + "loss": 0.8082, + "step": 3040 + }, + { + "epoch": 0.59, + "learning_rate": 0.00024291958603843926, + "loss": 0.8171, + "step": 3060 + }, + { + "epoch": 0.59, + "learning_rate": 0.000242533907565726, + "loss": 0.8088, + "step": 3080 + }, + { + "epoch": 0.59, + "learning_rate": 0.00024214822909301276, + "loss": 0.8148, + "step": 3100 + }, + { + "epoch": 0.6, + "learning_rate": 0.00024176255062029954, + "loss": 0.8122, + "step": 3120 + }, + { + "epoch": 0.6, + "learning_rate": 0.00024137687214758626, + "loss": 0.811, + "step": 3140 + }, + { + "epoch": 0.61, + "learning_rate": 0.00024099119367487304, + "loss": 0.8179, + "step": 3160 + }, + { + "epoch": 0.61, + "learning_rate": 0.0002406055152021598, + "loss": 0.8029, + "step": 3180 + }, + { + "epoch": 0.61, + "learning_rate": 0.0002402198367294465, + "loss": 0.8143, + "step": 3200 + }, + { + "epoch": 0.61, + "eval_loss": 0.837196946144104, + "eval_runtime": 16.4913, + "eval_samples_per_second": 121.276, + "eval_steps_per_second": 1.94, + "step": 3200 + }, + { + "epoch": 0.62, + "learning_rate": 0.0002398341582567333, + "loss": 0.7969, + "step": 3220 + }, + { + "epoch": 0.62, + "learning_rate": 0.00023944847978402, + "loss": 0.8158, + "step": 3240 + }, + { + "epoch": 0.62, + "learning_rate": 0.0002390628013113068, + "loss": 0.8019, + "step": 3260 + }, + { + "epoch": 0.63, + "learning_rate": 0.00023867712283859354, + "loss": 0.8042, + "step": 3280 + }, + { + "epoch": 0.63, + "learning_rate": 0.0002382914443658803, + "loss": 0.8022, + "step": 3300 + }, + { + "epoch": 0.64, + "learning_rate": 0.00023790576589316704, + "loss": 0.8043, + "step": 3320 + }, + { + "epoch": 0.64, + "learning_rate": 0.0002375200874204538, + "loss": 0.8106, + "step": 3340 + }, + { + "epoch": 0.64, + "learning_rate": 0.00023713440894774054, + "loss": 0.8146, + "step": 3360 + }, + { + "epoch": 0.65, + "learning_rate": 0.0002367487304750273, + "loss": 0.8004, + "step": 3380 + }, + { + "epoch": 0.65, + "learning_rate": 0.00023636305200231404, + "loss": 0.8096, + "step": 3400 + }, + { + "epoch": 0.65, + "eval_loss": 0.8347571492195129, + "eval_runtime": 16.4822, + "eval_samples_per_second": 121.343, + "eval_steps_per_second": 1.941, + "step": 3400 + }, + { + "epoch": 0.66, + "learning_rate": 0.0002359773735296008, + "loss": 0.8226, + "step": 3420 + }, + { + "epoch": 0.66, + "learning_rate": 0.00023559169505688756, + "loss": 0.8083, + "step": 3440 + }, + { + "epoch": 0.66, + "learning_rate": 0.00023520601658417428, + "loss": 0.8168, + "step": 3460 + }, + { + "epoch": 0.67, + "learning_rate": 0.00023482033811146106, + "loss": 0.8112, + "step": 3480 + }, + { + "epoch": 0.67, + "learning_rate": 0.0002344346596387478, + "loss": 0.8131, + "step": 3500 + }, + { + "epoch": 0.67, + "learning_rate": 0.00023404898116603456, + "loss": 0.8097, + "step": 3520 + }, + { + "epoch": 0.68, + "learning_rate": 0.0002336633026933213, + "loss": 0.804, + "step": 3540 + }, + { + "epoch": 0.68, + "learning_rate": 0.00023327762422060806, + "loss": 0.8085, + "step": 3560 + }, + { + "epoch": 0.69, + "learning_rate": 0.0002328919457478948, + "loss": 0.7992, + "step": 3580 + }, + { + "epoch": 0.69, + "learning_rate": 0.0002325062672751816, + "loss": 0.8124, + "step": 3600 + }, + { + "epoch": 0.69, + "eval_loss": 0.8324670791625977, + "eval_runtime": 16.4936, + "eval_samples_per_second": 121.259, + "eval_steps_per_second": 1.94, + "step": 3600 + }, + { + "epoch": 0.69, + "learning_rate": 0.0002321205888024683, + "loss": 0.8024, + "step": 3620 + }, + { + "epoch": 0.7, + "learning_rate": 0.0002317349103297551, + "loss": 0.8032, + "step": 3640 + }, + { + "epoch": 0.7, + "learning_rate": 0.00023134923185704184, + "loss": 0.8065, + "step": 3660 + }, + { + "epoch": 0.71, + "learning_rate": 0.00023096355338432856, + "loss": 0.8106, + "step": 3680 + }, + { + "epoch": 0.71, + "learning_rate": 0.00023057787491161534, + "loss": 0.8009, + "step": 3700 + }, + { + "epoch": 0.71, + "learning_rate": 0.00023019219643890206, + "loss": 0.816, + "step": 3720 + }, + { + "epoch": 0.72, + "learning_rate": 0.00022980651796618884, + "loss": 0.8103, + "step": 3740 + }, + { + "epoch": 0.72, + "learning_rate": 0.00022942083949347559, + "loss": 0.8099, + "step": 3760 + }, + { + "epoch": 0.72, + "learning_rate": 0.00022903516102076233, + "loss": 0.8085, + "step": 3780 + }, + { + "epoch": 0.73, + "learning_rate": 0.00022864948254804908, + "loss": 0.8044, + "step": 3800 + }, + { + "epoch": 0.73, + "eval_loss": 0.830141544342041, + "eval_runtime": 16.4845, + "eval_samples_per_second": 121.326, + "eval_steps_per_second": 1.941, + "step": 3800 + }, + { + "epoch": 0.73, + "learning_rate": 0.00022826380407533586, + "loss": 0.7969, + "step": 3820 + }, + { + "epoch": 0.74, + "learning_rate": 0.00022787812560262258, + "loss": 0.8029, + "step": 3840 + }, + { + "epoch": 0.74, + "learning_rate": 0.00022749244712990936, + "loss": 0.7921, + "step": 3860 + }, + { + "epoch": 0.74, + "learning_rate": 0.0002271067686571961, + "loss": 0.8051, + "step": 3880 + }, + { + "epoch": 0.75, + "learning_rate": 0.00022672109018448283, + "loss": 0.807, + "step": 3900 + }, + { + "epoch": 0.75, + "learning_rate": 0.0002263354117117696, + "loss": 0.8042, + "step": 3920 + }, + { + "epoch": 0.75, + "learning_rate": 0.00022594973323905633, + "loss": 0.7947, + "step": 3940 + }, + { + "epoch": 0.76, + "learning_rate": 0.0002255640547663431, + "loss": 0.7972, + "step": 3960 + }, + { + "epoch": 0.76, + "learning_rate": 0.00022517837629362986, + "loss": 0.8038, + "step": 3980 + }, + { + "epoch": 0.77, + "learning_rate": 0.0002247926978209166, + "loss": 0.8064, + "step": 4000 + }, + { + "epoch": 0.77, + "eval_loss": 0.828279435634613, + "eval_runtime": 16.4904, + "eval_samples_per_second": 121.283, + "eval_steps_per_second": 1.941, + "step": 4000 + }, + { + "epoch": 0.77, + "learning_rate": 0.00022440701934820336, + "loss": 0.8032, + "step": 4020 + }, + { + "epoch": 0.77, + "learning_rate": 0.00022402134087549014, + "loss": 0.7934, + "step": 4040 + }, + { + "epoch": 0.78, + "learning_rate": 0.00022363566240277686, + "loss": 0.7919, + "step": 4060 + }, + { + "epoch": 0.78, + "learning_rate": 0.00022324998393006364, + "loss": 0.8011, + "step": 4080 + }, + { + "epoch": 0.79, + "learning_rate": 0.00022286430545735036, + "loss": 0.8026, + "step": 4100 + }, + { + "epoch": 0.79, + "learning_rate": 0.0002224786269846371, + "loss": 0.804, + "step": 4120 + }, + { + "epoch": 0.79, + "learning_rate": 0.00022209294851192388, + "loss": 0.8122, + "step": 4140 + }, + { + "epoch": 0.8, + "learning_rate": 0.0002217072700392106, + "loss": 0.7932, + "step": 4160 + }, + { + "epoch": 0.8, + "learning_rate": 0.00022132159156649738, + "loss": 0.7911, + "step": 4180 + }, + { + "epoch": 0.8, + "learning_rate": 0.00022093591309378413, + "loss": 0.8012, + "step": 4200 + }, + { + "epoch": 0.8, + "eval_loss": 0.8261794447898865, + "eval_runtime": 16.4921, + "eval_samples_per_second": 121.27, + "eval_steps_per_second": 1.94, + "step": 4200 + }, + { + "epoch": 0.81, + "learning_rate": 0.00022055023462107088, + "loss": 0.7989, + "step": 4220 + }, + { + "epoch": 0.81, + "learning_rate": 0.00022016455614835763, + "loss": 0.8031, + "step": 4240 + }, + { + "epoch": 0.82, + "learning_rate": 0.00021977887767564438, + "loss": 0.8066, + "step": 4260 + }, + { + "epoch": 0.82, + "learning_rate": 0.00021939319920293113, + "loss": 0.7964, + "step": 4280 + }, + { + "epoch": 0.82, + "learning_rate": 0.0002190075207302179, + "loss": 0.7947, + "step": 4300 + }, + { + "epoch": 0.83, + "learning_rate": 0.00021862184225750463, + "loss": 0.8035, + "step": 4320 + }, + { + "epoch": 0.83, + "learning_rate": 0.00021823616378479138, + "loss": 0.8029, + "step": 4340 + }, + { + "epoch": 0.84, + "learning_rate": 0.00021785048531207816, + "loss": 0.7941, + "step": 4360 + }, + { + "epoch": 0.84, + "learning_rate": 0.00021746480683936488, + "loss": 0.7934, + "step": 4380 + }, + { + "epoch": 0.84, + "learning_rate": 0.00021707912836665166, + "loss": 0.7946, + "step": 4400 + }, + { + "epoch": 0.84, + "eval_loss": 0.823946475982666, + "eval_runtime": 16.4887, + "eval_samples_per_second": 121.295, + "eval_steps_per_second": 1.941, + "step": 4400 + }, + { + "epoch": 0.85, + "learning_rate": 0.00021669344989393838, + "loss": 0.7974, + "step": 4420 + }, + { + "epoch": 0.85, + "learning_rate": 0.00021630777142122516, + "loss": 0.7962, + "step": 4440 + }, + { + "epoch": 0.85, + "learning_rate": 0.0002159220929485119, + "loss": 0.7946, + "step": 4460 + }, + { + "epoch": 0.86, + "learning_rate": 0.00021553641447579866, + "loss": 0.7818, + "step": 4480 + }, + { + "epoch": 0.86, + "learning_rate": 0.0002151507360030854, + "loss": 0.803, + "step": 4500 + }, + { + "epoch": 0.87, + "learning_rate": 0.00021476505753037218, + "loss": 0.7851, + "step": 4520 + }, + { + "epoch": 0.87, + "learning_rate": 0.0002143793790576589, + "loss": 0.7984, + "step": 4540 + }, + { + "epoch": 0.87, + "learning_rate": 0.00021399370058494568, + "loss": 0.7973, + "step": 4560 + }, + { + "epoch": 0.88, + "learning_rate": 0.0002136080221122324, + "loss": 0.782, + "step": 4580 + }, + { + "epoch": 0.88, + "learning_rate": 0.00021322234363951916, + "loss": 0.7951, + "step": 4600 + }, + { + "epoch": 0.88, + "eval_loss": 0.8220962285995483, + "eval_runtime": 16.5191, + "eval_samples_per_second": 121.072, + "eval_steps_per_second": 1.937, + "step": 4600 + }, + { + "epoch": 0.89, + "learning_rate": 0.00021283666516680593, + "loss": 0.7947, + "step": 4620 + }, + { + "epoch": 0.89, + "learning_rate": 0.00021245098669409266, + "loss": 0.7957, + "step": 4640 + }, + { + "epoch": 0.89, + "learning_rate": 0.00021206530822137943, + "loss": 0.797, + "step": 4660 + }, + { + "epoch": 0.9, + "learning_rate": 0.00021167962974866618, + "loss": 0.8097, + "step": 4680 + }, + { + "epoch": 0.9, + "learning_rate": 0.00021129395127595293, + "loss": 0.7894, + "step": 4700 + }, + { + "epoch": 0.9, + "learning_rate": 0.00021090827280323968, + "loss": 0.7789, + "step": 4720 + }, + { + "epoch": 0.91, + "learning_rate": 0.0002105225943305264, + "loss": 0.7949, + "step": 4740 + }, + { + "epoch": 0.91, + "learning_rate": 0.00021013691585781318, + "loss": 0.7895, + "step": 4760 + }, + { + "epoch": 0.92, + "learning_rate": 0.00020975123738509996, + "loss": 0.8036, + "step": 4780 + }, + { + "epoch": 0.92, + "learning_rate": 0.00020936555891238668, + "loss": 0.7966, + "step": 4800 + }, + { + "epoch": 0.92, + "eval_loss": 0.8209095597267151, + "eval_runtime": 16.5035, + "eval_samples_per_second": 121.187, + "eval_steps_per_second": 1.939, + "step": 4800 + }, + { + "epoch": 0.92, + "learning_rate": 0.00020897988043967343, + "loss": 0.7892, + "step": 4820 + }, + { + "epoch": 0.93, + "learning_rate": 0.0002085942019669602, + "loss": 0.7825, + "step": 4840 + }, + { + "epoch": 0.93, + "learning_rate": 0.00020820852349424693, + "loss": 0.7937, + "step": 4860 + }, + { + "epoch": 0.94, + "learning_rate": 0.0002078228450215337, + "loss": 0.7893, + "step": 4880 + }, + { + "epoch": 0.94, + "learning_rate": 0.00020743716654882043, + "loss": 0.7944, + "step": 4900 + }, + { + "epoch": 0.94, + "learning_rate": 0.0002070514880761072, + "loss": 0.7973, + "step": 4920 + }, + { + "epoch": 0.95, + "learning_rate": 0.00020666580960339396, + "loss": 0.7919, + "step": 4940 + }, + { + "epoch": 0.95, + "learning_rate": 0.0002062801311306807, + "loss": 0.7918, + "step": 4960 + }, + { + "epoch": 0.95, + "learning_rate": 0.00020589445265796746, + "loss": 0.7901, + "step": 4980 + }, + { + "epoch": 0.96, + "learning_rate": 0.00020550877418525423, + "loss": 0.7891, + "step": 5000 + }, + { + "epoch": 0.96, + "eval_loss": 0.8192855715751648, + "eval_runtime": 16.5248, + "eval_samples_per_second": 121.03, + "eval_steps_per_second": 1.936, + "step": 5000 + }, + { + "epoch": 0.96, + "learning_rate": 0.00020512309571254096, + "loss": 0.7813, + "step": 5020 + }, + { + "epoch": 0.97, + "learning_rate": 0.0002047374172398277, + "loss": 0.7831, + "step": 5040 + }, + { + "epoch": 0.97, + "learning_rate": 0.00020435173876711445, + "loss": 0.7911, + "step": 5060 + }, + { + "epoch": 0.97, + "learning_rate": 0.0002039660602944012, + "loss": 0.7816, + "step": 5080 + }, + { + "epoch": 0.98, + "learning_rate": 0.00020358038182168798, + "loss": 0.7915, + "step": 5100 + }, + { + "epoch": 0.98, + "learning_rate": 0.0002031947033489747, + "loss": 0.791, + "step": 5120 + }, + { + "epoch": 0.98, + "learning_rate": 0.00020280902487626148, + "loss": 0.7851, + "step": 5140 + }, + { + "epoch": 0.99, + "learning_rate": 0.00020242334640354823, + "loss": 0.7859, + "step": 5160 + }, + { + "epoch": 0.99, + "learning_rate": 0.00020203766793083498, + "loss": 0.7888, + "step": 5180 + }, + { + "epoch": 1.0, + "learning_rate": 0.00020165198945812173, + "loss": 0.7854, + "step": 5200 + }, + { + "epoch": 1.0, + "eval_loss": 0.8173321485519409, + "eval_runtime": 16.5042, + "eval_samples_per_second": 121.182, + "eval_steps_per_second": 1.939, + "step": 5200 + }, + { + "epoch": 1.0, + "learning_rate": 0.0002012663109854085, + "loss": 0.7888, + "step": 5220 + }, + { + "epoch": 1.0, + "learning_rate": 0.00020088063251269523, + "loss": 0.7893, + "step": 5240 + }, + { + "epoch": 1.01, + "learning_rate": 0.00020049495403998198, + "loss": 0.7817, + "step": 5260 + }, + { + "epoch": 1.01, + "learning_rate": 0.00020010927556726873, + "loss": 0.7755, + "step": 5280 + }, + { + "epoch": 1.02, + "learning_rate": 0.00019972359709455548, + "loss": 0.7839, + "step": 5300 + }, + { + "epoch": 1.02, + "learning_rate": 0.00019933791862184226, + "loss": 0.7911, + "step": 5320 + }, + { + "epoch": 1.02, + "learning_rate": 0.00019895224014912898, + "loss": 0.7819, + "step": 5340 + }, + { + "epoch": 1.03, + "learning_rate": 0.00019856656167641576, + "loss": 0.7802, + "step": 5360 + }, + { + "epoch": 1.03, + "learning_rate": 0.0001981808832037025, + "loss": 0.7847, + "step": 5380 + }, + { + "epoch": 1.03, + "learning_rate": 0.00019779520473098925, + "loss": 0.7824, + "step": 5400 + }, + { + "epoch": 1.03, + "eval_loss": 0.8163856267929077, + "eval_runtime": 16.5306, + "eval_samples_per_second": 120.988, + "eval_steps_per_second": 1.936, + "step": 5400 + }, + { + "epoch": 1.04, + "learning_rate": 0.00019742881018191167, + "loss": 0.7757, + "step": 5420 + }, + { + "epoch": 1.04, + "learning_rate": 0.0001970431317091984, + "loss": 0.786, + "step": 5440 + }, + { + "epoch": 1.05, + "learning_rate": 0.00019665745323648517, + "loss": 0.7923, + "step": 5460 + }, + { + "epoch": 1.05, + "learning_rate": 0.00019627177476377191, + "loss": 0.791, + "step": 5480 + }, + { + "epoch": 1.05, + "learning_rate": 0.00019588609629105866, + "loss": 0.7863, + "step": 5500 + }, + { + "epoch": 1.06, + "learning_rate": 0.00019550041781834541, + "loss": 0.7879, + "step": 5520 + }, + { + "epoch": 1.06, + "learning_rate": 0.0001951147393456322, + "loss": 0.7924, + "step": 5540 + }, + { + "epoch": 1.07, + "learning_rate": 0.00019472906087291891, + "loss": 0.7918, + "step": 5560 + }, + { + "epoch": 1.07, + "learning_rate": 0.0001943433824002057, + "loss": 0.792, + "step": 5580 + }, + { + "epoch": 1.07, + "learning_rate": 0.0001939577039274924, + "loss": 0.7784, + "step": 5600 + }, + { + "epoch": 1.07, + "eval_loss": 0.8148436546325684, + "eval_runtime": 16.5424, + "eval_samples_per_second": 120.901, + "eval_steps_per_second": 1.934, + "step": 5600 + }, + { + "epoch": 1.08, + "learning_rate": 0.0001935720254547792, + "loss": 0.7903, + "step": 5620 + }, + { + "epoch": 1.08, + "learning_rate": 0.00019318634698206594, + "loss": 0.785, + "step": 5640 + }, + { + "epoch": 1.08, + "learning_rate": 0.00019280066850935266, + "loss": 0.7916, + "step": 5660 + }, + { + "epoch": 1.09, + "learning_rate": 0.00019241499003663944, + "loss": 0.779, + "step": 5680 + }, + { + "epoch": 1.09, + "learning_rate": 0.0001920293115639262, + "loss": 0.7909, + "step": 5700 + }, + { + "epoch": 1.1, + "learning_rate": 0.00019164363309121294, + "loss": 0.7798, + "step": 5720 + }, + { + "epoch": 1.1, + "learning_rate": 0.0001912579546184997, + "loss": 0.7846, + "step": 5740 + }, + { + "epoch": 1.1, + "learning_rate": 0.00019087227614578647, + "loss": 0.7887, + "step": 5760 + }, + { + "epoch": 1.11, + "learning_rate": 0.0001904865976730732, + "loss": 0.7802, + "step": 5780 + }, + { + "epoch": 1.11, + "learning_rate": 0.00019010091920035997, + "loss": 0.7891, + "step": 5800 + }, + { + "epoch": 1.11, + "eval_loss": 0.8130878806114197, + "eval_runtime": 16.5056, + "eval_samples_per_second": 121.171, + "eval_steps_per_second": 1.939, + "step": 5800 + }, + { + "epoch": 1.12, + "learning_rate": 0.0001897152407276467, + "loss": 0.7819, + "step": 5820 + }, + { + "epoch": 1.12, + "learning_rate": 0.00018932956225493346, + "loss": 0.7945, + "step": 5840 + }, + { + "epoch": 1.12, + "learning_rate": 0.00018894388378222021, + "loss": 0.784, + "step": 5860 + }, + { + "epoch": 1.13, + "learning_rate": 0.00018855820530950694, + "loss": 0.7838, + "step": 5880 + }, + { + "epoch": 1.13, + "learning_rate": 0.00018817252683679371, + "loss": 0.7841, + "step": 5900 + }, + { + "epoch": 1.13, + "learning_rate": 0.0001877868483640805, + "loss": 0.7909, + "step": 5920 + }, + { + "epoch": 1.14, + "learning_rate": 0.0001874011698913672, + "loss": 0.7775, + "step": 5940 + }, + { + "epoch": 1.14, + "learning_rate": 0.00018701549141865396, + "loss": 0.7827, + "step": 5960 + }, + { + "epoch": 1.15, + "learning_rate": 0.0001866298129459407, + "loss": 0.7866, + "step": 5980 + }, + { + "epoch": 1.15, + "learning_rate": 0.00018624413447322746, + "loss": 0.7696, + "step": 6000 + }, + { + "epoch": 1.15, + "eval_loss": 0.8125277757644653, + "eval_runtime": 16.506, + "eval_samples_per_second": 121.168, + "eval_steps_per_second": 1.939, + "step": 6000 + }, + { + "epoch": 1.15, + "learning_rate": 0.00018585845600051424, + "loss": 0.783, + "step": 6020 + }, + { + "epoch": 1.16, + "learning_rate": 0.00018547277752780096, + "loss": 0.7792, + "step": 6040 + }, + { + "epoch": 1.16, + "learning_rate": 0.00018508709905508774, + "loss": 0.7775, + "step": 6060 + }, + { + "epoch": 1.16, + "learning_rate": 0.0001847014205823745, + "loss": 0.7806, + "step": 6080 + }, + { + "epoch": 1.17, + "learning_rate": 0.0001843157421096612, + "loss": 0.7801, + "step": 6100 + }, + { + "epoch": 1.17, + "learning_rate": 0.000183930063636948, + "loss": 0.7853, + "step": 6120 + }, + { + "epoch": 1.18, + "learning_rate": 0.0001835443851642347, + "loss": 0.7937, + "step": 6140 + }, + { + "epoch": 1.18, + "learning_rate": 0.0001831587066915215, + "loss": 0.7873, + "step": 6160 + }, + { + "epoch": 1.18, + "learning_rate": 0.00018277302821880824, + "loss": 0.778, + "step": 6180 + }, + { + "epoch": 1.19, + "learning_rate": 0.000182387349746095, + "loss": 0.781, + "step": 6200 + }, + { + "epoch": 1.19, + "eval_loss": 0.8113830089569092, + "eval_runtime": 16.5217, + "eval_samples_per_second": 121.053, + "eval_steps_per_second": 1.937, + "step": 6200 + }, + { + "epoch": 1.19, + "learning_rate": 0.00018200167127338174, + "loss": 0.7746, + "step": 6220 + }, + { + "epoch": 1.2, + "learning_rate": 0.00018161599280066851, + "loss": 0.7752, + "step": 6240 + }, + { + "epoch": 1.2, + "learning_rate": 0.00018123031432795524, + "loss": 0.7838, + "step": 6260 + }, + { + "epoch": 1.2, + "learning_rate": 0.000180844635855242, + "loss": 0.789, + "step": 6280 + }, + { + "epoch": 1.21, + "learning_rate": 0.00018045895738252874, + "loss": 0.7882, + "step": 6300 + }, + { + "epoch": 1.21, + "learning_rate": 0.00018007327890981549, + "loss": 0.7822, + "step": 6320 + }, + { + "epoch": 1.21, + "learning_rate": 0.00017968760043710226, + "loss": 0.7889, + "step": 6340 + }, + { + "epoch": 1.22, + "learning_rate": 0.00017930192196438899, + "loss": 0.7891, + "step": 6360 + }, + { + "epoch": 1.22, + "learning_rate": 0.00017891624349167576, + "loss": 0.7884, + "step": 6380 + }, + { + "epoch": 1.23, + "learning_rate": 0.0001785305650189625, + "loss": 0.7733, + "step": 6400 + }, + { + "epoch": 1.23, + "eval_loss": 0.810148298740387, + "eval_runtime": 16.5113, + "eval_samples_per_second": 121.129, + "eval_steps_per_second": 1.938, + "step": 6400 + }, + { + "epoch": 1.23, + "learning_rate": 0.00017814488654624926, + "loss": 0.7794, + "step": 6420 + }, + { + "epoch": 1.23, + "learning_rate": 0.000177759208073536, + "loss": 0.775, + "step": 6440 + }, + { + "epoch": 1.24, + "learning_rate": 0.00017737352960082276, + "loss": 0.7706, + "step": 6460 + }, + { + "epoch": 1.24, + "learning_rate": 0.0001769878511281095, + "loss": 0.7808, + "step": 6480 + }, + { + "epoch": 1.25, + "learning_rate": 0.0001766021726553963, + "loss": 0.7805, + "step": 6500 + }, + { + "epoch": 1.25, + "learning_rate": 0.000176216494182683, + "loss": 0.7813, + "step": 6520 + }, + { + "epoch": 1.25, + "learning_rate": 0.0001758308157099698, + "loss": 0.7789, + "step": 6540 + }, + { + "epoch": 1.26, + "learning_rate": 0.00017544513723725654, + "loss": 0.7827, + "step": 6560 + }, + { + "epoch": 1.26, + "learning_rate": 0.00017505945876454326, + "loss": 0.7763, + "step": 6580 + }, + { + "epoch": 1.26, + "learning_rate": 0.00017467378029183004, + "loss": 0.7779, + "step": 6600 + }, + { + "epoch": 1.26, + "eval_loss": 0.8090565800666809, + "eval_runtime": 16.4954, + "eval_samples_per_second": 121.246, + "eval_steps_per_second": 1.94, + "step": 6600 + }, + { + "epoch": 1.27, + "learning_rate": 0.00017428810181911676, + "loss": 0.7793, + "step": 6620 + }, + { + "epoch": 1.27, + "learning_rate": 0.00017390242334640354, + "loss": 0.7778, + "step": 6640 + }, + { + "epoch": 1.28, + "learning_rate": 0.00017351674487369029, + "loss": 0.7802, + "step": 6660 + }, + { + "epoch": 1.28, + "learning_rate": 0.00017313106640097704, + "loss": 0.7823, + "step": 6680 + }, + { + "epoch": 1.28, + "learning_rate": 0.00017274538792826379, + "loss": 0.7868, + "step": 6700 + }, + { + "epoch": 1.29, + "learning_rate": 0.00017235970945555056, + "loss": 0.7824, + "step": 6720 + }, + { + "epoch": 1.29, + "learning_rate": 0.00017197403098283728, + "loss": 0.7777, + "step": 6740 + }, + { + "epoch": 1.3, + "learning_rate": 0.00017158835251012406, + "loss": 0.7822, + "step": 6760 + }, + { + "epoch": 1.3, + "learning_rate": 0.00017120267403741078, + "loss": 0.7798, + "step": 6780 + }, + { + "epoch": 1.3, + "learning_rate": 0.00017081699556469753, + "loss": 0.7712, + "step": 6800 + }, + { + "epoch": 1.3, + "eval_loss": 0.8080956935882568, + "eval_runtime": 16.5234, + "eval_samples_per_second": 121.041, + "eval_steps_per_second": 1.937, + "step": 6800 + }, + { + "epoch": 1.31, + "learning_rate": 0.0001704313170919843, + "loss": 0.7888, + "step": 6820 + }, + { + "epoch": 1.31, + "learning_rate": 0.00017004563861927103, + "loss": 0.7769, + "step": 6840 + }, + { + "epoch": 1.31, + "learning_rate": 0.0001696599601465578, + "loss": 0.7686, + "step": 6860 + }, + { + "epoch": 1.32, + "learning_rate": 0.00016927428167384456, + "loss": 0.7762, + "step": 6880 + }, + { + "epoch": 1.32, + "learning_rate": 0.0001688886032011313, + "loss": 0.7807, + "step": 6900 + }, + { + "epoch": 1.33, + "learning_rate": 0.00016850292472841806, + "loss": 0.7831, + "step": 6920 + }, + { + "epoch": 1.33, + "learning_rate": 0.0001681172462557048, + "loss": 0.7856, + "step": 6940 + }, + { + "epoch": 1.33, + "learning_rate": 0.00016773156778299156, + "loss": 0.775, + "step": 6960 + }, + { + "epoch": 1.34, + "learning_rate": 0.00016734588931027834, + "loss": 0.7835, + "step": 6980 + }, + { + "epoch": 1.34, + "learning_rate": 0.00016696021083756506, + "loss": 0.7756, + "step": 7000 + }, + { + "epoch": 1.34, + "eval_loss": 0.8070209622383118, + "eval_runtime": 16.4997, + "eval_samples_per_second": 121.214, + "eval_steps_per_second": 1.939, + "step": 7000 + }, + { + "epoch": 1.35, + "learning_rate": 0.0001665745323648518, + "loss": 0.7756, + "step": 7020 + }, + { + "epoch": 1.35, + "learning_rate": 0.00016618885389213859, + "loss": 0.7783, + "step": 7040 + }, + { + "epoch": 1.35, + "learning_rate": 0.0001658031754194253, + "loss": 0.7697, + "step": 7060 + }, + { + "epoch": 1.36, + "learning_rate": 0.00016541749694671208, + "loss": 0.7889, + "step": 7080 + }, + { + "epoch": 1.36, + "learning_rate": 0.00016503181847399883, + "loss": 0.7725, + "step": 7100 + }, + { + "epoch": 1.36, + "learning_rate": 0.00016464614000128558, + "loss": 0.7726, + "step": 7120 + }, + { + "epoch": 1.37, + "learning_rate": 0.00016426046152857233, + "loss": 0.7787, + "step": 7140 + }, + { + "epoch": 1.37, + "learning_rate": 0.00016387478305585908, + "loss": 0.782, + "step": 7160 + }, + { + "epoch": 1.38, + "learning_rate": 0.00016348910458314583, + "loss": 0.7736, + "step": 7180 + }, + { + "epoch": 1.38, + "learning_rate": 0.0001631034261104326, + "loss": 0.7748, + "step": 7200 + }, + { + "epoch": 1.38, + "eval_loss": 0.8063712120056152, + "eval_runtime": 16.5096, + "eval_samples_per_second": 121.142, + "eval_steps_per_second": 1.938, + "step": 7200 + }, + { + "epoch": 1.38, + "learning_rate": 0.00016271774763771933, + "loss": 0.7717, + "step": 7220 + }, + { + "epoch": 1.39, + "learning_rate": 0.00016233206916500608, + "loss": 0.7676, + "step": 7240 + }, + { + "epoch": 1.39, + "learning_rate": 0.00016194639069229286, + "loss": 0.7662, + "step": 7260 + }, + { + "epoch": 1.39, + "learning_rate": 0.00016156071221957958, + "loss": 0.7809, + "step": 7280 + }, + { + "epoch": 1.4, + "learning_rate": 0.00016117503374686636, + "loss": 0.7731, + "step": 7300 + }, + { + "epoch": 1.4, + "learning_rate": 0.00016078935527415308, + "loss": 0.7795, + "step": 7320 + }, + { + "epoch": 1.41, + "learning_rate": 0.00016040367680143986, + "loss": 0.78, + "step": 7340 + }, + { + "epoch": 1.41, + "learning_rate": 0.0001600179983287266, + "loss": 0.7785, + "step": 7360 + }, + { + "epoch": 1.41, + "learning_rate": 0.00015963231985601336, + "loss": 0.7694, + "step": 7380 + }, + { + "epoch": 1.42, + "learning_rate": 0.0001592466413833001, + "loss": 0.781, + "step": 7400 + }, + { + "epoch": 1.42, + "eval_loss": 0.8048364520072937, + "eval_runtime": 16.5235, + "eval_samples_per_second": 121.04, + "eval_steps_per_second": 1.937, + "step": 7400 + }, + { + "epoch": 1.42, + "learning_rate": 0.00015886096291058688, + "loss": 0.7681, + "step": 7420 + }, + { + "epoch": 1.43, + "learning_rate": 0.0001584752844378736, + "loss": 0.7835, + "step": 7440 + }, + { + "epoch": 1.43, + "learning_rate": 0.00015808960596516038, + "loss": 0.7778, + "step": 7460 + }, + { + "epoch": 1.43, + "learning_rate": 0.0001577039274924471, + "loss": 0.775, + "step": 7480 + }, + { + "epoch": 1.44, + "learning_rate": 0.00015731824901973386, + "loss": 0.7758, + "step": 7500 + }, + { + "epoch": 1.44, + "learning_rate": 0.00015693257054702063, + "loss": 0.7846, + "step": 7520 + }, + { + "epoch": 1.44, + "learning_rate": 0.00015654689207430736, + "loss": 0.7756, + "step": 7540 + }, + { + "epoch": 1.45, + "learning_rate": 0.00015616121360159413, + "loss": 0.7764, + "step": 7560 + }, + { + "epoch": 1.45, + "learning_rate": 0.00015577553512888088, + "loss": 0.7684, + "step": 7580 + }, + { + "epoch": 1.46, + "learning_rate": 0.00015538985665616763, + "loss": 0.7837, + "step": 7600 + }, + { + "epoch": 1.46, + "eval_loss": 0.8041849136352539, + "eval_runtime": 16.4633, + "eval_samples_per_second": 121.482, + "eval_steps_per_second": 1.944, + "step": 7600 + }, + { + "epoch": 1.46, + "learning_rate": 0.00015500417818345438, + "loss": 0.772, + "step": 7620 + }, + { + "epoch": 1.46, + "learning_rate": 0.0001546184997107411, + "loss": 0.7759, + "step": 7640 + }, + { + "epoch": 1.47, + "learning_rate": 0.00015423282123802788, + "loss": 0.7778, + "step": 7660 + }, + { + "epoch": 1.47, + "learning_rate": 0.00015384714276531466, + "loss": 0.78, + "step": 7680 + }, + { + "epoch": 1.48, + "learning_rate": 0.00015346146429260138, + "loss": 0.7681, + "step": 7700 + }, + { + "epoch": 1.48, + "learning_rate": 0.00015307578581988813, + "loss": 0.7731, + "step": 7720 + }, + { + "epoch": 1.48, + "learning_rate": 0.0001526901073471749, + "loss": 0.78, + "step": 7740 + }, + { + "epoch": 1.49, + "learning_rate": 0.00015230442887446163, + "loss": 0.7719, + "step": 7760 + }, + { + "epoch": 1.49, + "learning_rate": 0.0001519187504017484, + "loss": 0.7667, + "step": 7780 + }, + { + "epoch": 1.49, + "learning_rate": 0.00015153307192903513, + "loss": 0.7804, + "step": 7800 + }, + { + "epoch": 1.49, + "eval_loss": 0.8034607768058777, + "eval_runtime": 16.4833, + "eval_samples_per_second": 121.335, + "eval_steps_per_second": 1.941, + "step": 7800 + }, + { + "epoch": 1.5, + "learning_rate": 0.0001511473934563219, + "loss": 0.7813, + "step": 7820 + }, + { + "epoch": 1.5, + "learning_rate": 0.00015076171498360866, + "loss": 0.7751, + "step": 7840 + }, + { + "epoch": 1.51, + "learning_rate": 0.00015037603651089538, + "loss": 0.7681, + "step": 7860 + }, + { + "epoch": 1.51, + "learning_rate": 0.00014999035803818216, + "loss": 0.7679, + "step": 7880 + }, + { + "epoch": 1.51, + "learning_rate": 0.0001496046795654689, + "loss": 0.7723, + "step": 7900 + }, + { + "epoch": 1.52, + "learning_rate": 0.00014921900109275566, + "loss": 0.7732, + "step": 7920 + }, + { + "epoch": 1.52, + "learning_rate": 0.0001488333226200424, + "loss": 0.7805, + "step": 7940 + }, + { + "epoch": 1.53, + "learning_rate": 0.00014844764414732916, + "loss": 0.7666, + "step": 7960 + }, + { + "epoch": 1.53, + "learning_rate": 0.0001480619656746159, + "loss": 0.7801, + "step": 7980 + }, + { + "epoch": 1.53, + "learning_rate": 0.00014767628720190265, + "loss": 0.7736, + "step": 8000 + }, + { + "epoch": 1.53, + "eval_loss": 0.8029702305793762, + "eval_runtime": 16.5088, + "eval_samples_per_second": 121.147, + "eval_steps_per_second": 1.938, + "step": 8000 + }, + { + "epoch": 1.54, + "learning_rate": 0.00014729060872918943, + "loss": 0.7716, + "step": 8020 + }, + { + "epoch": 1.54, + "learning_rate": 0.00014690493025647618, + "loss": 0.7771, + "step": 8040 + }, + { + "epoch": 1.54, + "learning_rate": 0.00014651925178376293, + "loss": 0.7715, + "step": 8060 + }, + { + "epoch": 1.55, + "learning_rate": 0.00014613357331104968, + "loss": 0.7731, + "step": 8080 + }, + { + "epoch": 1.55, + "learning_rate": 0.00014574789483833643, + "loss": 0.7763, + "step": 8100 + }, + { + "epoch": 1.56, + "learning_rate": 0.00014536221636562318, + "loss": 0.7705, + "step": 8120 + }, + { + "epoch": 1.56, + "learning_rate": 0.00014497653789290993, + "loss": 0.7702, + "step": 8140 + }, + { + "epoch": 1.56, + "learning_rate": 0.00014459085942019668, + "loss": 0.7752, + "step": 8160 + }, + { + "epoch": 1.57, + "learning_rate": 0.00014420518094748343, + "loss": 0.7662, + "step": 8180 + }, + { + "epoch": 1.57, + "learning_rate": 0.00014381950247477018, + "loss": 0.7757, + "step": 8200 + }, + { + "epoch": 1.57, + "eval_loss": 0.8025923371315002, + "eval_runtime": 16.5398, + "eval_samples_per_second": 120.921, + "eval_steps_per_second": 1.935, + "step": 8200 + }, + { + "epoch": 1.58, + "learning_rate": 0.00014343382400205693, + "loss": 0.7638, + "step": 8220 + }, + { + "epoch": 1.58, + "learning_rate": 0.0001430481455293437, + "loss": 0.7836, + "step": 8240 + }, + { + "epoch": 1.58, + "learning_rate": 0.00014266246705663046, + "loss": 0.7685, + "step": 8260 + }, + { + "epoch": 1.59, + "learning_rate": 0.0001422767885839172, + "loss": 0.7901, + "step": 8280 + }, + { + "epoch": 1.59, + "learning_rate": 0.00014189111011120396, + "loss": 0.7729, + "step": 8300 + }, + { + "epoch": 1.59, + "learning_rate": 0.0001415054316384907, + "loss": 0.7614, + "step": 8320 + }, + { + "epoch": 1.6, + "learning_rate": 0.00014111975316577745, + "loss": 0.7789, + "step": 8340 + }, + { + "epoch": 1.6, + "learning_rate": 0.0001407340746930642, + "loss": 0.7713, + "step": 8360 + }, + { + "epoch": 1.61, + "learning_rate": 0.00014034839622035095, + "loss": 0.7831, + "step": 8380 + }, + { + "epoch": 1.61, + "learning_rate": 0.0001399627177476377, + "loss": 0.7674, + "step": 8400 + }, + { + "epoch": 1.61, + "eval_loss": 0.801445722579956, + "eval_runtime": 16.5305, + "eval_samples_per_second": 120.989, + "eval_steps_per_second": 1.936, + "step": 8400 + }, + { + "epoch": 1.61, + "learning_rate": 0.00013957703927492445, + "loss": 0.7698, + "step": 8420 + }, + { + "epoch": 1.62, + "learning_rate": 0.0001391913608022112, + "loss": 0.7725, + "step": 8440 + }, + { + "epoch": 1.62, + "learning_rate": 0.00013880568232949795, + "loss": 0.771, + "step": 8460 + }, + { + "epoch": 1.62, + "learning_rate": 0.00013842000385678473, + "loss": 0.7679, + "step": 8480 + }, + { + "epoch": 1.63, + "learning_rate": 0.00013803432538407148, + "loss": 0.7788, + "step": 8500 + }, + { + "epoch": 1.63, + "learning_rate": 0.00013764864691135823, + "loss": 0.7705, + "step": 8520 + }, + { + "epoch": 1.64, + "learning_rate": 0.00013726296843864495, + "loss": 0.7625, + "step": 8540 + }, + { + "epoch": 1.64, + "learning_rate": 0.00013687728996593173, + "loss": 0.7626, + "step": 8560 + }, + { + "epoch": 1.64, + "learning_rate": 0.00013649161149321848, + "loss": 0.7731, + "step": 8580 + }, + { + "epoch": 1.65, + "learning_rate": 0.00013610593302050523, + "loss": 0.7788, + "step": 8600 + }, + { + "epoch": 1.65, + "eval_loss": 0.8010225296020508, + "eval_runtime": 16.5075, + "eval_samples_per_second": 121.157, + "eval_steps_per_second": 1.939, + "step": 8600 + }, + { + "epoch": 1.65, + "learning_rate": 0.00013572025454779198, + "loss": 0.7758, + "step": 8620 + }, + { + "epoch": 1.66, + "learning_rate": 0.00013533457607507873, + "loss": 0.7738, + "step": 8640 + }, + { + "epoch": 1.66, + "learning_rate": 0.00013494889760236548, + "loss": 0.7827, + "step": 8660 + }, + { + "epoch": 1.66, + "learning_rate": 0.00013456321912965223, + "loss": 0.779, + "step": 8680 + }, + { + "epoch": 1.67, + "learning_rate": 0.00013417754065693898, + "loss": 0.771, + "step": 8700 + }, + { + "epoch": 1.67, + "learning_rate": 0.00013379186218422575, + "loss": 0.7683, + "step": 8720 + }, + { + "epoch": 1.67, + "learning_rate": 0.0001334061837115125, + "loss": 0.7728, + "step": 8740 + }, + { + "epoch": 1.68, + "learning_rate": 0.00013302050523879925, + "loss": 0.7761, + "step": 8760 + }, + { + "epoch": 1.68, + "learning_rate": 0.00013263482676608598, + "loss": 0.7705, + "step": 8780 + }, + { + "epoch": 1.69, + "learning_rate": 0.00013224914829337275, + "loss": 0.7624, + "step": 8800 + }, + { + "epoch": 1.69, + "eval_loss": 0.8003928065299988, + "eval_runtime": 16.5035, + "eval_samples_per_second": 121.186, + "eval_steps_per_second": 1.939, + "step": 8800 + }, + { + "epoch": 1.69, + "learning_rate": 0.0001318634698206595, + "loss": 0.7669, + "step": 8820 + }, + { + "epoch": 1.69, + "learning_rate": 0.00013147779134794625, + "loss": 0.7675, + "step": 8840 + }, + { + "epoch": 1.7, + "learning_rate": 0.000131092112875233, + "loss": 0.7629, + "step": 8860 + }, + { + "epoch": 1.7, + "learning_rate": 0.00013070643440251975, + "loss": 0.7663, + "step": 8880 + }, + { + "epoch": 1.71, + "learning_rate": 0.0001303207559298065, + "loss": 0.7708, + "step": 8900 + }, + { + "epoch": 1.71, + "learning_rate": 0.00012993507745709325, + "loss": 0.7734, + "step": 8920 + }, + { + "epoch": 1.71, + "learning_rate": 0.00012954939898438, + "loss": 0.7711, + "step": 8940 + }, + { + "epoch": 1.72, + "learning_rate": 0.00012916372051166678, + "loss": 0.769, + "step": 8960 + }, + { + "epoch": 1.72, + "learning_rate": 0.00012877804203895353, + "loss": 0.7706, + "step": 8980 + }, + { + "epoch": 1.72, + "learning_rate": 0.00012839236356624025, + "loss": 0.7752, + "step": 9000 + }, + { + "epoch": 1.72, + "eval_loss": 0.799389660358429, + "eval_runtime": 16.516, + "eval_samples_per_second": 121.094, + "eval_steps_per_second": 1.938, + "step": 9000 + }, + { + "epoch": 1.73, + "learning_rate": 0.000128006685093527, + "loss": 0.7678, + "step": 9020 + }, + { + "epoch": 1.73, + "learning_rate": 0.00012762100662081378, + "loss": 0.7764, + "step": 9040 + }, + { + "epoch": 1.74, + "learning_rate": 0.00012723532814810053, + "loss": 0.7672, + "step": 9060 + }, + { + "epoch": 1.74, + "learning_rate": 0.00012684964967538728, + "loss": 0.7705, + "step": 9080 + }, + { + "epoch": 1.74, + "learning_rate": 0.00012646397120267403, + "loss": 0.7657, + "step": 9100 + }, + { + "epoch": 1.75, + "learning_rate": 0.00012607829272996078, + "loss": 0.7648, + "step": 9120 + }, + { + "epoch": 1.75, + "learning_rate": 0.00012569261425724753, + "loss": 0.7737, + "step": 9140 + }, + { + "epoch": 1.76, + "learning_rate": 0.00012530693578453428, + "loss": 0.7628, + "step": 9160 + }, + { + "epoch": 1.76, + "learning_rate": 0.00012492125731182103, + "loss": 0.767, + "step": 9180 + }, + { + "epoch": 1.76, + "learning_rate": 0.0001245355788391078, + "loss": 0.764, + "step": 9200 + }, + { + "epoch": 1.76, + "eval_loss": 0.7991757988929749, + "eval_runtime": 16.5416, + "eval_samples_per_second": 120.907, + "eval_steps_per_second": 1.935, + "step": 9200 + }, + { + "epoch": 1.77, + "learning_rate": 0.00012414990036639455, + "loss": 0.7658, + "step": 9220 + }, + { + "epoch": 1.77, + "learning_rate": 0.00012376422189368128, + "loss": 0.7642, + "step": 9240 + }, + { + "epoch": 1.77, + "learning_rate": 0.00012337854342096802, + "loss": 0.7611, + "step": 9260 + }, + { + "epoch": 1.78, + "learning_rate": 0.0001229928649482548, + "loss": 0.7665, + "step": 9280 + }, + { + "epoch": 1.78, + "learning_rate": 0.00012260718647554155, + "loss": 0.7785, + "step": 9300 + }, + { + "epoch": 1.79, + "learning_rate": 0.0001222215080028283, + "loss": 0.7673, + "step": 9320 + }, + { + "epoch": 1.79, + "learning_rate": 0.00012183582953011504, + "loss": 0.777, + "step": 9340 + }, + { + "epoch": 1.79, + "learning_rate": 0.0001214501510574018, + "loss": 0.7684, + "step": 9360 + }, + { + "epoch": 1.8, + "learning_rate": 0.00012106447258468855, + "loss": 0.7694, + "step": 9380 + }, + { + "epoch": 1.8, + "learning_rate": 0.0001206787941119753, + "loss": 0.7634, + "step": 9400 + }, + { + "epoch": 1.8, + "eval_loss": 0.7980849742889404, + "eval_runtime": 16.5086, + "eval_samples_per_second": 121.149, + "eval_steps_per_second": 1.938, + "step": 9400 + }, + { + "epoch": 1.8, + "learning_rate": 0.00012031239956289772, + "loss": 0.7636, + "step": 9420 + }, + { + "epoch": 1.81, + "learning_rate": 0.00011992672109018447, + "loss": 0.7629, + "step": 9440 + }, + { + "epoch": 1.81, + "learning_rate": 0.00011954104261747122, + "loss": 0.7724, + "step": 9460 + }, + { + "epoch": 1.82, + "learning_rate": 0.00011915536414475797, + "loss": 0.7697, + "step": 9480 + }, + { + "epoch": 1.82, + "learning_rate": 0.00011876968567204474, + "loss": 0.7574, + "step": 9500 + }, + { + "epoch": 1.82, + "learning_rate": 0.00011838400719933149, + "loss": 0.7719, + "step": 9520 + }, + { + "epoch": 1.83, + "learning_rate": 0.00011799832872661822, + "loss": 0.7761, + "step": 9540 + }, + { + "epoch": 1.83, + "learning_rate": 0.00011761265025390497, + "loss": 0.7693, + "step": 9560 + }, + { + "epoch": 1.84, + "learning_rate": 0.00011722697178119174, + "loss": 0.7687, + "step": 9580 + }, + { + "epoch": 1.84, + "learning_rate": 0.00011684129330847849, + "loss": 0.7758, + "step": 9600 + }, + { + "epoch": 1.84, + "eval_loss": 0.7981218099594116, + "eval_runtime": 16.5407, + "eval_samples_per_second": 120.914, + "eval_steps_per_second": 1.935, + "step": 9600 + }, + { + "epoch": 1.84, + "learning_rate": 0.00011645561483576524, + "loss": 0.7603, + "step": 9620 + }, + { + "epoch": 1.85, + "learning_rate": 0.00011606993636305199, + "loss": 0.7579, + "step": 9640 + }, + { + "epoch": 1.85, + "learning_rate": 0.00011568425789033875, + "loss": 0.7673, + "step": 9660 + }, + { + "epoch": 1.85, + "learning_rate": 0.0001152985794176255, + "loss": 0.7745, + "step": 9680 + }, + { + "epoch": 1.86, + "learning_rate": 0.00011491290094491225, + "loss": 0.758, + "step": 9700 + }, + { + "epoch": 1.86, + "learning_rate": 0.000114527222472199, + "loss": 0.7686, + "step": 9720 + }, + { + "epoch": 1.87, + "learning_rate": 0.00011414154399948576, + "loss": 0.7741, + "step": 9740 + }, + { + "epoch": 1.87, + "learning_rate": 0.00011375586552677251, + "loss": 0.7646, + "step": 9760 + }, + { + "epoch": 1.87, + "learning_rate": 0.00011337018705405925, + "loss": 0.7675, + "step": 9780 + }, + { + "epoch": 1.88, + "learning_rate": 0.000112984508581346, + "loss": 0.7637, + "step": 9800 + }, + { + "epoch": 1.88, + "eval_loss": 0.7970672845840454, + "eval_runtime": 16.5386, + "eval_samples_per_second": 120.929, + "eval_steps_per_second": 1.935, + "step": 9800 + }, + { + "epoch": 1.88, + "learning_rate": 0.00011259883010863276, + "loss": 0.7678, + "step": 9820 + }, + { + "epoch": 1.89, + "learning_rate": 0.00011221315163591951, + "loss": 0.762, + "step": 9840 + }, + { + "epoch": 1.89, + "learning_rate": 0.00011182747316320626, + "loss": 0.7653, + "step": 9860 + }, + { + "epoch": 1.89, + "learning_rate": 0.00011144179469049301, + "loss": 0.7666, + "step": 9880 + }, + { + "epoch": 1.9, + "learning_rate": 0.00011105611621777977, + "loss": 0.7621, + "step": 9900 + }, + { + "epoch": 1.9, + "learning_rate": 0.00011067043774506652, + "loss": 0.7715, + "step": 9920 + }, + { + "epoch": 1.9, + "learning_rate": 0.00011028475927235327, + "loss": 0.7605, + "step": 9940 + }, + { + "epoch": 1.91, + "learning_rate": 0.00010989908079964002, + "loss": 0.7618, + "step": 9960 + }, + { + "epoch": 1.91, + "learning_rate": 0.00010951340232692679, + "loss": 0.7726, + "step": 9980 + }, + { + "epoch": 1.92, + "learning_rate": 0.00010912772385421352, + "loss": 0.7684, + "step": 10000 + }, + { + "epoch": 1.92, + "eval_loss": 0.7967627048492432, + "eval_runtime": 16.5033, + "eval_samples_per_second": 121.188, + "eval_steps_per_second": 1.939, + "step": 10000 + }, + { + "epoch": 1.92, + "learning_rate": 0.00010874204538150027, + "loss": 0.7666, + "step": 10020 + }, + { + "epoch": 1.92, + "learning_rate": 0.00010835636690878702, + "loss": 0.7661, + "step": 10040 + }, + { + "epoch": 1.93, + "learning_rate": 0.00010797068843607378, + "loss": 0.7621, + "step": 10060 + }, + { + "epoch": 1.93, + "learning_rate": 0.00010758500996336053, + "loss": 0.7736, + "step": 10080 + }, + { + "epoch": 1.94, + "learning_rate": 0.00010719933149064728, + "loss": 0.76, + "step": 10100 + }, + { + "epoch": 1.94, + "learning_rate": 0.00010681365301793405, + "loss": 0.764, + "step": 10120 + }, + { + "epoch": 1.94, + "learning_rate": 0.0001064279745452208, + "loss": 0.7697, + "step": 10140 + }, + { + "epoch": 1.95, + "learning_rate": 0.00010604229607250755, + "loss": 0.7602, + "step": 10160 + }, + { + "epoch": 1.95, + "learning_rate": 0.0001056566175997943, + "loss": 0.766, + "step": 10180 + }, + { + "epoch": 1.95, + "learning_rate": 0.00010527093912708106, + "loss": 0.7719, + "step": 10200 + }, + { + "epoch": 1.95, + "eval_loss": 0.7964752912521362, + "eval_runtime": 16.4947, + "eval_samples_per_second": 121.251, + "eval_steps_per_second": 1.94, + "step": 10200 + }, + { + "epoch": 1.96, + "learning_rate": 0.00010488526065436781, + "loss": 0.7653, + "step": 10220 + }, + { + "epoch": 1.96, + "learning_rate": 0.00010449958218165455, + "loss": 0.7653, + "step": 10240 + }, + { + "epoch": 1.97, + "learning_rate": 0.0001041139037089413, + "loss": 0.7711, + "step": 10260 + }, + { + "epoch": 1.97, + "learning_rate": 0.00010372822523622806, + "loss": 0.7729, + "step": 10280 + }, + { + "epoch": 1.97, + "learning_rate": 0.00010334254676351481, + "loss": 0.7709, + "step": 10300 + }, + { + "epoch": 1.98, + "learning_rate": 0.00010295686829080156, + "loss": 0.7611, + "step": 10320 + }, + { + "epoch": 1.98, + "learning_rate": 0.00010257118981808831, + "loss": 0.7607, + "step": 10340 + }, + { + "epoch": 1.99, + "learning_rate": 0.00010218551134537507, + "loss": 0.761, + "step": 10360 + }, + { + "epoch": 1.99, + "learning_rate": 0.00010179983287266182, + "loss": 0.7645, + "step": 10380 + }, + { + "epoch": 1.99, + "learning_rate": 0.00010141415439994857, + "loss": 0.7682, + "step": 10400 + }, + { + "epoch": 1.99, + "eval_loss": 0.7955361008644104, + "eval_runtime": 16.5066, + "eval_samples_per_second": 121.164, + "eval_steps_per_second": 1.939, + "step": 10400 + }, + { + "epoch": 2.0, + "learning_rate": 0.00010102847592723531, + "loss": 0.76, + "step": 10420 + }, + { + "epoch": 2.0, + "learning_rate": 0.00010064279745452208, + "loss": 0.7653, + "step": 10440 + }, + { + "epoch": 2.0, + "learning_rate": 0.00010025711898180882, + "loss": 0.7625, + "step": 10460 + }, + { + "epoch": 2.01, + "learning_rate": 9.987144050909557e-05, + "loss": 0.764, + "step": 10480 + }, + { + "epoch": 2.01, + "learning_rate": 9.948576203638232e-05, + "loss": 0.766, + "step": 10500 + }, + { + "epoch": 2.02, + "learning_rate": 9.910008356366908e-05, + "loss": 0.7656, + "step": 10520 + }, + { + "epoch": 2.02, + "learning_rate": 9.871440509095583e-05, + "loss": 0.7698, + "step": 10540 + }, + { + "epoch": 2.02, + "learning_rate": 9.832872661824258e-05, + "loss": 0.7635, + "step": 10560 + }, + { + "epoch": 2.03, + "learning_rate": 9.794304814552933e-05, + "loss": 0.77, + "step": 10580 + }, + { + "epoch": 2.03, + "learning_rate": 9.75573696728161e-05, + "loss": 0.7651, + "step": 10600 + }, + { + "epoch": 2.03, + "eval_loss": 0.7953855395317078, + "eval_runtime": 16.5084, + "eval_samples_per_second": 121.15, + "eval_steps_per_second": 1.938, + "step": 10600 + }, + { + "epoch": 2.03, + "learning_rate": 9.717169120010285e-05, + "loss": 0.7628, + "step": 10620 + }, + { + "epoch": 2.04, + "learning_rate": 9.67860127273896e-05, + "loss": 0.7662, + "step": 10640 + }, + { + "epoch": 2.04, + "learning_rate": 9.640033425467633e-05, + "loss": 0.7635, + "step": 10660 + }, + { + "epoch": 2.05, + "learning_rate": 9.60146557819631e-05, + "loss": 0.7601, + "step": 10680 + }, + { + "epoch": 2.05, + "learning_rate": 9.562897730924984e-05, + "loss": 0.7649, + "step": 10700 + }, + { + "epoch": 2.05, + "learning_rate": 9.52432988365366e-05, + "loss": 0.758, + "step": 10720 + }, + { + "epoch": 2.06, + "learning_rate": 9.485762036382334e-05, + "loss": 0.767, + "step": 10740 + }, + { + "epoch": 2.06, + "learning_rate": 9.447194189111011e-05, + "loss": 0.7559, + "step": 10760 + }, + { + "epoch": 2.07, + "learning_rate": 9.408626341839686e-05, + "loss": 0.765, + "step": 10780 + }, + { + "epoch": 2.07, + "learning_rate": 9.37005849456836e-05, + "loss": 0.7641, + "step": 10800 + }, + { + "epoch": 2.07, + "eval_loss": 0.794941782951355, + "eval_runtime": 16.5101, + "eval_samples_per_second": 121.138, + "eval_steps_per_second": 1.938, + "step": 10800 + }, + { + "epoch": 2.07, + "learning_rate": 9.331490647297036e-05, + "loss": 0.7691, + "step": 10820 + }, + { + "epoch": 2.08, + "learning_rate": 9.292922800025712e-05, + "loss": 0.7611, + "step": 10840 + }, + { + "epoch": 2.08, + "learning_rate": 9.254354952754387e-05, + "loss": 0.7609, + "step": 10860 + }, + { + "epoch": 2.08, + "learning_rate": 9.21578710548306e-05, + "loss": 0.758, + "step": 10880 + }, + { + "epoch": 2.09, + "learning_rate": 9.177219258211736e-05, + "loss": 0.7637, + "step": 10900 + }, + { + "epoch": 2.09, + "learning_rate": 9.138651410940412e-05, + "loss": 0.7645, + "step": 10920 + }, + { + "epoch": 2.1, + "learning_rate": 9.100083563669087e-05, + "loss": 0.7507, + "step": 10940 + }, + { + "epoch": 2.1, + "learning_rate": 9.061515716397762e-05, + "loss": 0.7673, + "step": 10960 + }, + { + "epoch": 2.1, + "learning_rate": 9.022947869126437e-05, + "loss": 0.7552, + "step": 10980 + }, + { + "epoch": 2.11, + "learning_rate": 8.984380021855113e-05, + "loss": 0.7639, + "step": 11000 + }, + { + "epoch": 2.11, + "eval_loss": 0.7940524220466614, + "eval_runtime": 16.5029, + "eval_samples_per_second": 121.19, + "eval_steps_per_second": 1.939, + "step": 11000 + }, + { + "epoch": 2.11, + "learning_rate": 8.945812174583788e-05, + "loss": 0.7719, + "step": 11020 + }, + { + "epoch": 2.12, + "learning_rate": 8.907244327312463e-05, + "loss": 0.7641, + "step": 11040 + }, + { + "epoch": 2.12, + "learning_rate": 8.868676480041138e-05, + "loss": 0.7614, + "step": 11060 + }, + { + "epoch": 2.12, + "learning_rate": 8.830108632769814e-05, + "loss": 0.7785, + "step": 11080 + }, + { + "epoch": 2.13, + "learning_rate": 8.79154078549849e-05, + "loss": 0.7756, + "step": 11100 + }, + { + "epoch": 2.13, + "learning_rate": 8.752972938227163e-05, + "loss": 0.7645, + "step": 11120 + }, + { + "epoch": 2.13, + "learning_rate": 8.714405090955838e-05, + "loss": 0.7621, + "step": 11140 + }, + { + "epoch": 2.14, + "learning_rate": 8.675837243684514e-05, + "loss": 0.7662, + "step": 11160 + }, + { + "epoch": 2.14, + "learning_rate": 8.637269396413189e-05, + "loss": 0.7617, + "step": 11180 + }, + { + "epoch": 2.15, + "learning_rate": 8.598701549141864e-05, + "loss": 0.7683, + "step": 11200 + }, + { + "epoch": 2.15, + "eval_loss": 0.7937352061271667, + "eval_runtime": 16.5052, + "eval_samples_per_second": 121.174, + "eval_steps_per_second": 1.939, + "step": 11200 + }, + { + "epoch": 2.15, + "learning_rate": 8.560133701870539e-05, + "loss": 0.7635, + "step": 11220 + }, + { + "epoch": 2.15, + "learning_rate": 8.521565854599216e-05, + "loss": 0.7622, + "step": 11240 + }, + { + "epoch": 2.16, + "learning_rate": 8.48299800732789e-05, + "loss": 0.7616, + "step": 11260 + }, + { + "epoch": 2.16, + "learning_rate": 8.444430160056565e-05, + "loss": 0.7558, + "step": 11280 + }, + { + "epoch": 2.17, + "learning_rate": 8.40586231278524e-05, + "loss": 0.7714, + "step": 11300 + }, + { + "epoch": 2.17, + "learning_rate": 8.367294465513917e-05, + "loss": 0.7676, + "step": 11320 + }, + { + "epoch": 2.17, + "learning_rate": 8.32872661824259e-05, + "loss": 0.7623, + "step": 11340 + }, + { + "epoch": 2.18, + "learning_rate": 8.290158770971265e-05, + "loss": 0.7608, + "step": 11360 + }, + { + "epoch": 2.18, + "learning_rate": 8.251590923699942e-05, + "loss": 0.7746, + "step": 11380 + }, + { + "epoch": 2.18, + "learning_rate": 8.213023076428617e-05, + "loss": 0.7684, + "step": 11400 + }, + { + "epoch": 2.18, + "eval_loss": 0.7929428219795227, + "eval_runtime": 16.7561, + "eval_samples_per_second": 119.359, + "eval_steps_per_second": 1.91, + "step": 11400 + }, + { + "epoch": 2.19, + "learning_rate": 8.174455229157292e-05, + "loss": 0.7628, + "step": 11420 + }, + { + "epoch": 2.19, + "learning_rate": 8.135887381885967e-05, + "loss": 0.7614, + "step": 11440 + }, + { + "epoch": 2.2, + "learning_rate": 8.099247926978209e-05, + "loss": 0.7616, + "step": 11460 + }, + { + "epoch": 2.2, + "learning_rate": 8.060680079706884e-05, + "loss": 0.7614, + "step": 11480 + }, + { + "epoch": 2.2, + "learning_rate": 8.022112232435559e-05, + "loss": 0.7684, + "step": 11500 + }, + { + "epoch": 2.21, + "learning_rate": 7.983544385164233e-05, + "loss": 0.7663, + "step": 11520 + }, + { + "epoch": 2.21, + "learning_rate": 7.94497653789291e-05, + "loss": 0.7621, + "step": 11540 + }, + { + "epoch": 2.21, + "learning_rate": 7.906408690621584e-05, + "loss": 0.77, + "step": 11560 + }, + { + "epoch": 2.22, + "learning_rate": 7.867840843350259e-05, + "loss": 0.7629, + "step": 11580 + }, + { + "epoch": 2.22, + "learning_rate": 7.829272996078934e-05, + "loss": 0.7592, + "step": 11600 + }, + { + "epoch": 2.22, + "eval_loss": 0.7931132316589355, + "eval_runtime": 16.4886, + "eval_samples_per_second": 121.296, + "eval_steps_per_second": 1.941, + "step": 11600 + }, + { + "epoch": 2.23, + "learning_rate": 7.79070514880761e-05, + "loss": 0.7593, + "step": 11620 + }, + { + "epoch": 2.23, + "learning_rate": 7.752137301536285e-05, + "loss": 0.7579, + "step": 11640 + }, + { + "epoch": 2.23, + "learning_rate": 7.71356945426496e-05, + "loss": 0.7666, + "step": 11660 + }, + { + "epoch": 2.24, + "learning_rate": 7.675001606993635e-05, + "loss": 0.7573, + "step": 11680 + }, + { + "epoch": 2.24, + "learning_rate": 7.636433759722312e-05, + "loss": 0.7654, + "step": 11700 + }, + { + "epoch": 2.25, + "learning_rate": 7.597865912450986e-05, + "loss": 0.7637, + "step": 11720 + }, + { + "epoch": 2.25, + "learning_rate": 7.559298065179661e-05, + "loss": 0.7638, + "step": 11740 + }, + { + "epoch": 2.25, + "learning_rate": 7.520730217908335e-05, + "loss": 0.7538, + "step": 11760 + }, + { + "epoch": 2.26, + "learning_rate": 7.482162370637011e-05, + "loss": 0.7598, + "step": 11780 + }, + { + "epoch": 2.26, + "learning_rate": 7.443594523365686e-05, + "loss": 0.7577, + "step": 11800 + }, + { + "epoch": 2.26, + "eval_loss": 0.7928204536437988, + "eval_runtime": 16.5434, + "eval_samples_per_second": 120.894, + "eval_steps_per_second": 1.934, + "step": 11800 + }, + { + "epoch": 2.26, + "learning_rate": 7.405026676094361e-05, + "loss": 0.7561, + "step": 11820 + }, + { + "epoch": 2.27, + "learning_rate": 7.366458828823038e-05, + "loss": 0.7557, + "step": 11840 + }, + { + "epoch": 2.27, + "learning_rate": 7.327890981551713e-05, + "loss": 0.7606, + "step": 11860 + }, + { + "epoch": 2.28, + "learning_rate": 7.289323134280388e-05, + "loss": 0.7575, + "step": 11880 + }, + { + "epoch": 2.28, + "learning_rate": 7.250755287009063e-05, + "loss": 0.7557, + "step": 11900 + }, + { + "epoch": 2.28, + "learning_rate": 7.212187439737738e-05, + "loss": 0.7687, + "step": 11920 + }, + { + "epoch": 2.29, + "learning_rate": 7.173619592466414e-05, + "loss": 0.7647, + "step": 11940 + }, + { + "epoch": 2.29, + "learning_rate": 7.135051745195089e-05, + "loss": 0.7608, + "step": 11960 + }, + { + "epoch": 2.3, + "learning_rate": 7.096483897923764e-05, + "loss": 0.7624, + "step": 11980 + }, + { + "epoch": 2.3, + "learning_rate": 7.057916050652439e-05, + "loss": 0.7651, + "step": 12000 + }, + { + "epoch": 2.3, + "eval_loss": 0.7917994856834412, + "eval_runtime": 16.5312, + "eval_samples_per_second": 120.983, + "eval_steps_per_second": 1.936, + "step": 12000 + }, + { + "epoch": 2.3, + "learning_rate": 7.019348203381114e-05, + "loss": 0.7678, + "step": 12020 + }, + { + "epoch": 2.31, + "learning_rate": 6.980780356109789e-05, + "loss": 0.7606, + "step": 12040 + }, + { + "epoch": 2.31, + "learning_rate": 6.942212508838465e-05, + "loss": 0.7607, + "step": 12060 + }, + { + "epoch": 2.31, + "learning_rate": 6.90364466156714e-05, + "loss": 0.763, + "step": 12080 + }, + { + "epoch": 2.32, + "learning_rate": 6.865076814295815e-05, + "loss": 0.7669, + "step": 12100 + }, + { + "epoch": 2.32, + "learning_rate": 6.82650896702449e-05, + "loss": 0.755, + "step": 12120 + }, + { + "epoch": 2.33, + "learning_rate": 6.787941119753165e-05, + "loss": 0.7611, + "step": 12140 + }, + { + "epoch": 2.33, + "learning_rate": 6.74937327248184e-05, + "loss": 0.7576, + "step": 12160 + }, + { + "epoch": 2.33, + "learning_rate": 6.710805425210516e-05, + "loss": 0.7581, + "step": 12180 + }, + { + "epoch": 2.34, + "learning_rate": 6.672237577939191e-05, + "loss": 0.7647, + "step": 12200 + }, + { + "epoch": 2.34, + "eval_loss": 0.7914180755615234, + "eval_runtime": 16.5021, + "eval_samples_per_second": 121.196, + "eval_steps_per_second": 1.939, + "step": 12200 + }, + { + "epoch": 2.34, + "learning_rate": 6.633669730667866e-05, + "loss": 0.7582, + "step": 12220 + }, + { + "epoch": 2.35, + "learning_rate": 6.595101883396541e-05, + "loss": 0.7531, + "step": 12240 + }, + { + "epoch": 2.35, + "learning_rate": 6.556534036125216e-05, + "loss": 0.7526, + "step": 12260 + }, + { + "epoch": 2.35, + "learning_rate": 6.517966188853891e-05, + "loss": 0.7701, + "step": 12280 + }, + { + "epoch": 2.36, + "learning_rate": 6.479398341582568e-05, + "loss": 0.7662, + "step": 12300 + }, + { + "epoch": 2.36, + "learning_rate": 6.440830494311241e-05, + "loss": 0.7541, + "step": 12320 + }, + { + "epoch": 2.36, + "learning_rate": 6.402262647039918e-05, + "loss": 0.7578, + "step": 12340 + }, + { + "epoch": 2.37, + "learning_rate": 6.363694799768592e-05, + "loss": 0.7569, + "step": 12360 + }, + { + "epoch": 2.37, + "learning_rate": 6.325126952497267e-05, + "loss": 0.7635, + "step": 12380 + }, + { + "epoch": 2.38, + "learning_rate": 6.286559105225942e-05, + "loss": 0.7618, + "step": 12400 + }, + { + "epoch": 2.38, + "eval_loss": 0.7912635207176208, + "eval_runtime": 16.4984, + "eval_samples_per_second": 121.224, + "eval_steps_per_second": 1.94, + "step": 12400 + }, + { + "epoch": 2.38, + "learning_rate": 6.247991257954619e-05, + "loss": 0.7536, + "step": 12420 + }, + { + "epoch": 2.38, + "learning_rate": 6.209423410683292e-05, + "loss": 0.7478, + "step": 12440 + }, + { + "epoch": 2.39, + "learning_rate": 6.170855563411969e-05, + "loss": 0.745, + "step": 12460 + }, + { + "epoch": 2.39, + "learning_rate": 6.132287716140644e-05, + "loss": 0.7611, + "step": 12480 + }, + { + "epoch": 2.4, + "learning_rate": 6.0937198688693187e-05, + "loss": 0.763, + "step": 12500 + }, + { + "epoch": 2.4, + "learning_rate": 6.0551520215979936e-05, + "loss": 0.7647, + "step": 12520 + }, + { + "epoch": 2.4, + "learning_rate": 6.016584174326669e-05, + "loss": 0.7621, + "step": 12540 + }, + { + "epoch": 2.41, + "learning_rate": 5.978016327055344e-05, + "loss": 0.7568, + "step": 12560 + }, + { + "epoch": 2.41, + "learning_rate": 5.93944847978402e-05, + "loss": 0.7613, + "step": 12580 + }, + { + "epoch": 2.41, + "learning_rate": 5.900880632512694e-05, + "loss": 0.7568, + "step": 12600 + }, + { + "epoch": 2.41, + "eval_loss": 0.7910023331642151, + "eval_runtime": 16.5022, + "eval_samples_per_second": 121.196, + "eval_steps_per_second": 1.939, + "step": 12600 + }, + { + "epoch": 2.42, + "learning_rate": 5.86231278524137e-05, + "loss": 0.7636, + "step": 12620 + }, + { + "epoch": 2.42, + "learning_rate": 5.823744937970045e-05, + "loss": 0.7657, + "step": 12640 + }, + { + "epoch": 2.43, + "learning_rate": 5.7851770906987205e-05, + "loss": 0.7703, + "step": 12660 + }, + { + "epoch": 2.43, + "learning_rate": 5.7466092434273955e-05, + "loss": 0.7557, + "step": 12680 + }, + { + "epoch": 2.43, + "learning_rate": 5.708041396156071e-05, + "loss": 0.7667, + "step": 12700 + }, + { + "epoch": 2.44, + "learning_rate": 5.6694735488847454e-05, + "loss": 0.7605, + "step": 12720 + }, + { + "epoch": 2.44, + "learning_rate": 5.630905701613421e-05, + "loss": 0.7549, + "step": 12740 + }, + { + "epoch": 2.44, + "learning_rate": 5.592337854342096e-05, + "loss": 0.7592, + "step": 12760 + }, + { + "epoch": 2.45, + "learning_rate": 5.553770007070772e-05, + "loss": 0.7654, + "step": 12780 + }, + { + "epoch": 2.45, + "learning_rate": 5.5171305521630135e-05, + "loss": 0.7636, + "step": 12800 + }, + { + "epoch": 2.45, + "eval_loss": 0.7906058430671692, + "eval_runtime": 16.5012, + "eval_samples_per_second": 121.203, + "eval_steps_per_second": 1.939, + "step": 12800 + }, + { + "epoch": 2.46, + "learning_rate": 5.478562704891688e-05, + "loss": 0.7629, + "step": 12820 + }, + { + "epoch": 2.46, + "learning_rate": 5.4399948576203634e-05, + "loss": 0.7688, + "step": 12840 + }, + { + "epoch": 2.46, + "learning_rate": 5.4014270103490384e-05, + "loss": 0.7534, + "step": 12860 + }, + { + "epoch": 2.47, + "learning_rate": 5.362859163077714e-05, + "loss": 0.76, + "step": 12880 + }, + { + "epoch": 2.47, + "learning_rate": 5.324291315806389e-05, + "loss": 0.7561, + "step": 12900 + }, + { + "epoch": 2.48, + "learning_rate": 5.285723468535065e-05, + "loss": 0.7562, + "step": 12920 + }, + { + "epoch": 2.48, + "learning_rate": 5.247155621263739e-05, + "loss": 0.7607, + "step": 12940 + }, + { + "epoch": 2.48, + "learning_rate": 5.2085877739924146e-05, + "loss": 0.7612, + "step": 12960 + }, + { + "epoch": 2.49, + "learning_rate": 5.1700199267210896e-05, + "loss": 0.7643, + "step": 12980 + }, + { + "epoch": 2.49, + "learning_rate": 5.131452079449765e-05, + "loss": 0.7656, + "step": 13000 + }, + { + "epoch": 2.49, + "eval_loss": 0.790121853351593, + "eval_runtime": 16.5158, + "eval_samples_per_second": 121.096, + "eval_steps_per_second": 1.938, + "step": 13000 + }, + { + "epoch": 2.49, + "learning_rate": 5.09288423217844e-05, + "loss": 0.756, + "step": 13020 + }, + { + "epoch": 2.5, + "learning_rate": 5.054316384907115e-05, + "loss": 0.7597, + "step": 13040 + }, + { + "epoch": 2.5, + "learning_rate": 5.01574853763579e-05, + "loss": 0.7525, + "step": 13060 + }, + { + "epoch": 2.51, + "learning_rate": 4.977180690364466e-05, + "loss": 0.7565, + "step": 13080 + }, + { + "epoch": 2.51, + "learning_rate": 4.938612843093141e-05, + "loss": 0.7631, + "step": 13100 + }, + { + "epoch": 2.51, + "learning_rate": 4.9000449958218165e-05, + "loss": 0.7514, + "step": 13120 + }, + { + "epoch": 2.52, + "learning_rate": 4.861477148550491e-05, + "loss": 0.7576, + "step": 13140 + }, + { + "epoch": 2.52, + "learning_rate": 4.8229093012791664e-05, + "loss": 0.7539, + "step": 13160 + }, + { + "epoch": 2.53, + "learning_rate": 4.7843414540078414e-05, + "loss": 0.7586, + "step": 13180 + }, + { + "epoch": 2.53, + "learning_rate": 4.745773606736517e-05, + "loss": 0.7573, + "step": 13200 + }, + { + "epoch": 2.53, + "eval_loss": 0.7899668216705322, + "eval_runtime": 16.509, + "eval_samples_per_second": 121.146, + "eval_steps_per_second": 1.938, + "step": 13200 + }, + { + "epoch": 2.53, + "learning_rate": 4.707205759465192e-05, + "loss": 0.7671, + "step": 13220 + }, + { + "epoch": 2.54, + "learning_rate": 4.668637912193868e-05, + "loss": 0.758, + "step": 13240 + }, + { + "epoch": 2.54, + "learning_rate": 4.630070064922542e-05, + "loss": 0.7444, + "step": 13260 + }, + { + "epoch": 2.54, + "learning_rate": 4.5915022176512176e-05, + "loss": 0.7548, + "step": 13280 + }, + { + "epoch": 2.55, + "learning_rate": 4.5529343703798926e-05, + "loss": 0.7681, + "step": 13300 + }, + { + "epoch": 2.55, + "learning_rate": 4.514366523108568e-05, + "loss": 0.7599, + "step": 13320 + }, + { + "epoch": 2.56, + "learning_rate": 4.475798675837243e-05, + "loss": 0.7631, + "step": 13340 + }, + { + "epoch": 2.56, + "learning_rate": 4.437230828565919e-05, + "loss": 0.7565, + "step": 13360 + }, + { + "epoch": 2.56, + "learning_rate": 4.398662981294593e-05, + "loss": 0.7586, + "step": 13380 + }, + { + "epoch": 2.57, + "learning_rate": 4.360095134023269e-05, + "loss": 0.7526, + "step": 13400 + }, + { + "epoch": 2.57, + "eval_loss": 0.7896500825881958, + "eval_runtime": 16.5086, + "eval_samples_per_second": 121.149, + "eval_steps_per_second": 1.938, + "step": 13400 + }, + { + "epoch": 2.57, + "learning_rate": 4.321527286751944e-05, + "loss": 0.7591, + "step": 13420 + }, + { + "epoch": 2.58, + "learning_rate": 4.2829594394806195e-05, + "loss": 0.7645, + "step": 13440 + }, + { + "epoch": 2.58, + "learning_rate": 4.2443915922092944e-05, + "loss": 0.7532, + "step": 13460 + }, + { + "epoch": 2.58, + "learning_rate": 4.2077521373015355e-05, + "loss": 0.746, + "step": 13480 + }, + { + "epoch": 2.59, + "learning_rate": 4.169184290030211e-05, + "loss": 0.7534, + "step": 13500 + }, + { + "epoch": 2.59, + "learning_rate": 4.130616442758886e-05, + "loss": 0.7506, + "step": 13520 + }, + { + "epoch": 2.59, + "learning_rate": 4.092048595487562e-05, + "loss": 0.7535, + "step": 13540 + }, + { + "epoch": 2.6, + "learning_rate": 4.053480748216237e-05, + "loss": 0.7596, + "step": 13560 + }, + { + "epoch": 2.6, + "learning_rate": 4.0149129009449124e-05, + "loss": 0.7686, + "step": 13580 + }, + { + "epoch": 2.61, + "learning_rate": 3.976345053673587e-05, + "loss": 0.7537, + "step": 13600 + }, + { + "epoch": 2.61, + "eval_loss": 0.7891342639923096, + "eval_runtime": 16.5163, + "eval_samples_per_second": 121.092, + "eval_steps_per_second": 1.937, + "step": 13600 + }, + { + "epoch": 2.61, + "learning_rate": 3.9377772064022624e-05, + "loss": 0.7656, + "step": 13620 + }, + { + "epoch": 2.61, + "learning_rate": 3.8992093591309374e-05, + "loss": 0.7515, + "step": 13640 + }, + { + "epoch": 2.62, + "learning_rate": 3.860641511859613e-05, + "loss": 0.761, + "step": 13660 + }, + { + "epoch": 2.62, + "learning_rate": 3.822073664588288e-05, + "loss": 0.7648, + "step": 13680 + }, + { + "epoch": 2.63, + "learning_rate": 3.783505817316963e-05, + "loss": 0.7671, + "step": 13700 + }, + { + "epoch": 2.63, + "learning_rate": 3.7449379700456386e-05, + "loss": 0.7653, + "step": 13720 + }, + { + "epoch": 2.63, + "learning_rate": 3.7063701227743136e-05, + "loss": 0.7583, + "step": 13740 + }, + { + "epoch": 2.64, + "learning_rate": 3.6678022755029886e-05, + "loss": 0.7602, + "step": 13760 + }, + { + "epoch": 2.64, + "learning_rate": 3.629234428231664e-05, + "loss": 0.7626, + "step": 13780 + }, + { + "epoch": 2.64, + "learning_rate": 3.590666580960339e-05, + "loss": 0.7485, + "step": 13800 + }, + { + "epoch": 2.64, + "eval_loss": 0.7891269326210022, + "eval_runtime": 16.5041, + "eval_samples_per_second": 121.182, + "eval_steps_per_second": 1.939, + "step": 13800 + }, + { + "epoch": 2.65, + "learning_rate": 3.552098733689014e-05, + "loss": 0.7564, + "step": 13820 + }, + { + "epoch": 2.65, + "learning_rate": 3.51353088641769e-05, + "loss": 0.7603, + "step": 13840 + }, + { + "epoch": 2.66, + "learning_rate": 3.474963039146365e-05, + "loss": 0.7584, + "step": 13860 + }, + { + "epoch": 2.66, + "learning_rate": 3.43639519187504e-05, + "loss": 0.7608, + "step": 13880 + }, + { + "epoch": 2.66, + "learning_rate": 3.3978273446037154e-05, + "loss": 0.7535, + "step": 13900 + }, + { + "epoch": 2.67, + "learning_rate": 3.3592594973323904e-05, + "loss": 0.7614, + "step": 13920 + }, + { + "epoch": 2.67, + "learning_rate": 3.3206916500610654e-05, + "loss": 0.7654, + "step": 13940 + }, + { + "epoch": 2.67, + "learning_rate": 3.282123802789741e-05, + "loss": 0.7656, + "step": 13960 + }, + { + "epoch": 2.68, + "learning_rate": 3.243555955518416e-05, + "loss": 0.756, + "step": 13980 + }, + { + "epoch": 2.68, + "learning_rate": 3.204988108247091e-05, + "loss": 0.7653, + "step": 14000 + }, + { + "epoch": 2.68, + "eval_loss": 0.7888805866241455, + "eval_runtime": 16.5275, + "eval_samples_per_second": 121.01, + "eval_steps_per_second": 1.936, + "step": 14000 + }, + { + "epoch": 2.69, + "learning_rate": 3.1664202609757666e-05, + "loss": 0.7618, + "step": 14020 + }, + { + "epoch": 2.69, + "learning_rate": 3.1278524137044416e-05, + "loss": 0.7588, + "step": 14040 + }, + { + "epoch": 2.69, + "learning_rate": 3.0892845664331166e-05, + "loss": 0.7625, + "step": 14060 + }, + { + "epoch": 2.7, + "learning_rate": 3.050716719161792e-05, + "loss": 0.7532, + "step": 14080 + }, + { + "epoch": 2.7, + "learning_rate": 3.0121488718904672e-05, + "loss": 0.7538, + "step": 14100 + }, + { + "epoch": 2.71, + "learning_rate": 2.9735810246191422e-05, + "loss": 0.7531, + "step": 14120 + }, + { + "epoch": 2.71, + "learning_rate": 2.9350131773478175e-05, + "loss": 0.7551, + "step": 14140 + }, + { + "epoch": 2.71, + "learning_rate": 2.8964453300764928e-05, + "loss": 0.7479, + "step": 14160 + }, + { + "epoch": 2.72, + "learning_rate": 2.8578774828051678e-05, + "loss": 0.7629, + "step": 14180 + }, + { + "epoch": 2.72, + "learning_rate": 2.819309635533843e-05, + "loss": 0.7572, + "step": 14200 + }, + { + "epoch": 2.72, + "eval_loss": 0.7884878516197205, + "eval_runtime": 16.7595, + "eval_samples_per_second": 119.335, + "eval_steps_per_second": 1.909, + "step": 14200 + }, + { + "epoch": 2.72, + "learning_rate": 2.7807417882625184e-05, + "loss": 0.758, + "step": 14220 + }, + { + "epoch": 2.73, + "learning_rate": 2.7421739409911934e-05, + "loss": 0.7608, + "step": 14240 + }, + { + "epoch": 2.73, + "learning_rate": 2.7036060937198687e-05, + "loss": 0.7555, + "step": 14260 + }, + { + "epoch": 2.74, + "learning_rate": 2.6650382464485437e-05, + "loss": 0.7512, + "step": 14280 + }, + { + "epoch": 2.74, + "learning_rate": 2.626470399177219e-05, + "loss": 0.7488, + "step": 14300 + }, + { + "epoch": 2.74, + "learning_rate": 2.5879025519058943e-05, + "loss": 0.7532, + "step": 14320 + }, + { + "epoch": 2.75, + "learning_rate": 2.5493347046345693e-05, + "loss": 0.7525, + "step": 14340 + }, + { + "epoch": 2.75, + "learning_rate": 2.5107668573632446e-05, + "loss": 0.7662, + "step": 14360 + }, + { + "epoch": 2.76, + "learning_rate": 2.47219901009192e-05, + "loss": 0.7583, + "step": 14380 + }, + { + "epoch": 2.76, + "learning_rate": 2.433631162820595e-05, + "loss": 0.7442, + "step": 14400 + }, + { + "epoch": 2.76, + "eval_loss": 0.7883238196372986, + "eval_runtime": 16.474, + "eval_samples_per_second": 121.403, + "eval_steps_per_second": 1.942, + "step": 14400 + }, + { + "epoch": 2.76, + "learning_rate": 2.3950633155492702e-05, + "loss": 0.7612, + "step": 14420 + }, + { + "epoch": 2.77, + "learning_rate": 2.3564954682779455e-05, + "loss": 0.7571, + "step": 14440 + }, + { + "epoch": 2.77, + "learning_rate": 2.3179276210066205e-05, + "loss": 0.7511, + "step": 14460 + }, + { + "epoch": 2.77, + "learning_rate": 2.2793597737352958e-05, + "loss": 0.7567, + "step": 14480 + }, + { + "epoch": 2.78, + "learning_rate": 2.2407919264639708e-05, + "loss": 0.7555, + "step": 14500 + }, + { + "epoch": 2.78, + "learning_rate": 2.202224079192646e-05, + "loss": 0.7555, + "step": 14520 + }, + { + "epoch": 2.79, + "learning_rate": 2.1636562319213214e-05, + "loss": 0.7509, + "step": 14540 + }, + { + "epoch": 2.79, + "learning_rate": 2.1250883846499964e-05, + "loss": 0.7585, + "step": 14560 + }, + { + "epoch": 2.79, + "learning_rate": 2.0865205373786717e-05, + "loss": 0.7621, + "step": 14580 + }, + { + "epoch": 2.8, + "learning_rate": 2.047952690107347e-05, + "loss": 0.7601, + "step": 14600 + }, + { + "epoch": 2.8, + "eval_loss": 0.7880419492721558, + "eval_runtime": 16.6163, + "eval_samples_per_second": 120.364, + "eval_steps_per_second": 1.926, + "step": 14600 + }, + { + "epoch": 2.8, + "learning_rate": 2.009384842836022e-05, + "loss": 0.7574, + "step": 14620 + }, + { + "epoch": 2.81, + "learning_rate": 1.9708169955646973e-05, + "loss": 0.7538, + "step": 14640 + }, + { + "epoch": 2.81, + "learning_rate": 1.9322491482933726e-05, + "loss": 0.7611, + "step": 14660 + }, + { + "epoch": 2.81, + "learning_rate": 1.8936813010220476e-05, + "loss": 0.7519, + "step": 14680 + }, + { + "epoch": 2.82, + "learning_rate": 1.855113453750723e-05, + "loss": 0.7559, + "step": 14700 + }, + { + "epoch": 2.82, + "learning_rate": 1.8165456064793982e-05, + "loss": 0.7596, + "step": 14720 + }, + { + "epoch": 2.82, + "learning_rate": 1.7779777592080735e-05, + "loss": 0.7564, + "step": 14740 + }, + { + "epoch": 2.83, + "learning_rate": 1.7394099119367485e-05, + "loss": 0.7526, + "step": 14760 + }, + { + "epoch": 2.83, + "learning_rate": 1.7008420646654238e-05, + "loss": 0.7624, + "step": 14780 + }, + { + "epoch": 2.84, + "learning_rate": 1.662274217394099e-05, + "loss": 0.7569, + "step": 14800 + }, + { + "epoch": 2.84, + "eval_loss": 0.7879504561424255, + "eval_runtime": 16.5411, + "eval_samples_per_second": 120.911, + "eval_steps_per_second": 1.935, + "step": 14800 + }, + { + "epoch": 2.84, + "learning_rate": 1.623706370122774e-05, + "loss": 0.7543, + "step": 14820 + }, + { + "epoch": 2.84, + "learning_rate": 1.5851385228514494e-05, + "loss": 0.7533, + "step": 14840 + }, + { + "epoch": 2.85, + "learning_rate": 1.5465706755801247e-05, + "loss": 0.7579, + "step": 14860 + }, + { + "epoch": 2.85, + "learning_rate": 1.5080028283087997e-05, + "loss": 0.7638, + "step": 14880 + }, + { + "epoch": 2.85, + "learning_rate": 1.469434981037475e-05, + "loss": 0.7456, + "step": 14900 + }, + { + "epoch": 2.86, + "learning_rate": 1.4308671337661502e-05, + "loss": 0.7561, + "step": 14920 + }, + { + "epoch": 2.86, + "learning_rate": 1.3922992864948253e-05, + "loss": 0.7626, + "step": 14940 + }, + { + "epoch": 2.87, + "learning_rate": 1.3537314392235005e-05, + "loss": 0.7686, + "step": 14960 + }, + { + "epoch": 2.87, + "learning_rate": 1.3151635919521758e-05, + "loss": 0.7512, + "step": 14980 + }, + { + "epoch": 2.87, + "learning_rate": 1.276595744680851e-05, + "loss": 0.7526, + "step": 15000 + }, + { + "epoch": 2.87, + "eval_loss": 0.7875809073448181, + "eval_runtime": 16.5086, + "eval_samples_per_second": 121.149, + "eval_steps_per_second": 1.938, + "step": 15000 + }, + { + "epoch": 2.88, + "learning_rate": 1.238027897409526e-05, + "loss": 0.7582, + "step": 15020 + }, + { + "epoch": 2.88, + "learning_rate": 1.1994600501382012e-05, + "loss": 0.7584, + "step": 15040 + }, + { + "epoch": 2.89, + "learning_rate": 1.1608922028668765e-05, + "loss": 0.7643, + "step": 15060 + }, + { + "epoch": 2.89, + "learning_rate": 1.1223243555955517e-05, + "loss": 0.7693, + "step": 15080 + }, + { + "epoch": 2.89, + "learning_rate": 1.0837565083242268e-05, + "loss": 0.7591, + "step": 15100 + }, + { + "epoch": 2.9, + "learning_rate": 1.0451886610529021e-05, + "loss": 0.7482, + "step": 15120 + }, + { + "epoch": 2.9, + "learning_rate": 1.0066208137815773e-05, + "loss": 0.7553, + "step": 15140 + }, + { + "epoch": 2.9, + "learning_rate": 9.680529665102524e-06, + "loss": 0.7563, + "step": 15160 + }, + { + "epoch": 2.91, + "learning_rate": 9.294851192389277e-06, + "loss": 0.7639, + "step": 15180 + }, + { + "epoch": 2.91, + "learning_rate": 8.909172719676029e-06, + "loss": 0.7577, + "step": 15200 + }, + { + "epoch": 2.91, + "eval_loss": 0.7872186303138733, + "eval_runtime": 16.5027, + "eval_samples_per_second": 121.192, + "eval_steps_per_second": 1.939, + "step": 15200 + }, + { + "epoch": 2.92, + "learning_rate": 8.523494246962782e-06, + "loss": 0.7566, + "step": 15220 + }, + { + "epoch": 2.92, + "learning_rate": 8.137815774249533e-06, + "loss": 0.7594, + "step": 15240 + }, + { + "epoch": 2.92, + "learning_rate": 7.752137301536285e-06, + "loss": 0.758, + "step": 15260 + }, + { + "epoch": 2.93, + "learning_rate": 7.366458828823037e-06, + "loss": 0.766, + "step": 15280 + }, + { + "epoch": 2.93, + "learning_rate": 6.980780356109789e-06, + "loss": 0.7542, + "step": 15300 + }, + { + "epoch": 2.94, + "learning_rate": 6.595101883396541e-06, + "loss": 0.7643, + "step": 15320 + }, + { + "epoch": 2.94, + "learning_rate": 6.209423410683292e-06, + "loss": 0.7638, + "step": 15340 + }, + { + "epoch": 2.94, + "learning_rate": 5.823744937970045e-06, + "loss": 0.7629, + "step": 15360 + }, + { + "epoch": 2.95, + "learning_rate": 5.438066465256798e-06, + "loss": 0.752, + "step": 15380 + }, + { + "epoch": 2.95, + "learning_rate": 5.052387992543549e-06, + "loss": 0.7565, + "step": 15400 + }, + { + "epoch": 2.95, + "eval_loss": 0.787341296672821, + "eval_runtime": 16.5038, + "eval_samples_per_second": 121.184, + "eval_steps_per_second": 1.939, + "step": 15400 + }, + { + "epoch": 2.95, + "learning_rate": 4.666709519830301e-06, + "loss": 0.7558, + "step": 15420 + }, + { + "epoch": 2.96, + "learning_rate": 4.281031047117053e-06, + "loss": 0.7683, + "step": 15440 + }, + { + "epoch": 2.96, + "learning_rate": 3.895352574403805e-06, + "loss": 0.7454, + "step": 15460 + }, + { + "epoch": 2.97, + "learning_rate": 3.509674101690557e-06, + "loss": 0.7566, + "step": 15480 + }, + { + "epoch": 2.97, + "learning_rate": 3.123995628977309e-06, + "loss": 0.7536, + "step": 15500 + }, + { + "epoch": 2.97, + "learning_rate": 2.738317156264061e-06, + "loss": 0.7491, + "step": 15520 + }, + { + "epoch": 2.98, + "learning_rate": 2.3526386835508128e-06, + "loss": 0.7571, + "step": 15540 + }, + { + "epoch": 2.98, + "learning_rate": 1.9669602108375647e-06, + "loss": 0.7604, + "step": 15560 + }, + { + "epoch": 2.99, + "learning_rate": 1.581281738124317e-06, + "loss": 0.7586, + "step": 15580 + }, + { + "epoch": 2.99, + "learning_rate": 1.1956032654110688e-06, + "loss": 0.7599, + "step": 15600 + }, + { + "epoch": 2.99, + "eval_loss": 0.7870123386383057, + "eval_runtime": 16.493, + "eval_samples_per_second": 121.263, + "eval_steps_per_second": 1.94, + "step": 15600 + } + ], + "max_steps": 15657, + "num_train_epochs": 3, + "total_flos": 4.055629659314848e+19, + "trial_name": null, + "trial_params": null +} diff --git a/adapters/saved-alpaca-belle-cot7b/checkpoint-15600/training_args.bin b/adapters/saved-alpaca-belle-cot7b/checkpoint-15600/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..cd8a096e4fd3ba848cec18e7c5691ebcb18ad76b --- /dev/null +++ b/adapters/saved-alpaca-belle-cot7b/checkpoint-15600/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5a580d27270395c94c3ef0dba9604e87b9d9eebe09ad2bc995408d9ab207ebfd +size 3643 diff --git a/adapters/saved-alpaca-belle13b/adapter_config.json b/adapters/saved-alpaca-belle13b/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..4ba49948a8f8232ee95452e47fcf9bd523635048 --- /dev/null +++ b/adapters/saved-alpaca-belle13b/adapter_config.json @@ -0,0 +1,18 @@ +{ + "base_model_name_or_path": "/mnt/bn/qingyi-bn-lq/llama/llama-13b-hf", + "bias": "none", + "enable_lora": null, + "fan_in_fan_out": false, + "inference_mode": true, + "lora_alpha": 16, + "lora_dropout": 0.05, + "merge_weights": false, + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/adapters/saved-alpaca-belle13b/adapter_model.bin b/adapters/saved-alpaca-belle13b/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..51379094771b9d98a4ecf58e827e2fdcb7eab1f9 --- /dev/null +++ b/adapters/saved-alpaca-belle13b/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c0e1b5e5c52ec1ff7918f91cf22fb36d14a3b15b0ea533af4ac89548035df0e4 +size 26271757 diff --git a/adapters/saved-alpaca-belle13b/checkpoint-13400/optimizer.pt b/adapters/saved-alpaca-belle13b/checkpoint-13400/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..e33ffa11451133a940876943ea2675d5b6e48167 --- /dev/null +++ b/adapters/saved-alpaca-belle13b/checkpoint-13400/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:31ee8f99aebe0d36e10a08ff9b876315d3595eb5fd03f4145b4d59e0ea5f1f60 +size 52523141 diff --git a/adapters/saved-alpaca-belle13b/checkpoint-13400/pytorch_model.bin b/adapters/saved-alpaca-belle13b/checkpoint-13400/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..3d168ec93930eed67f4aff287a093433c9e78bf6 --- /dev/null +++ b/adapters/saved-alpaca-belle13b/checkpoint-13400/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:36f51181fc1d79a19d61f61f3077e95455a5fbca5737bc1661b5b9436a225b15 +size 26271757 diff --git a/adapters/saved-alpaca-belle13b/checkpoint-13400/rng_state_0.pth b/adapters/saved-alpaca-belle13b/checkpoint-13400/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..868eebdaecfb56cb3fce690ff3751fdf019bbda2 --- /dev/null +++ b/adapters/saved-alpaca-belle13b/checkpoint-13400/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:335384f928604f1d3705f85855ff339aaa01b3d51b2baa578047e923595a5afd +size 14583 diff --git a/adapters/saved-alpaca-belle13b/checkpoint-13400/rng_state_1.pth b/adapters/saved-alpaca-belle13b/checkpoint-13400/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..6fd5c6031a46383c36abb686b5934ada6dad6028 --- /dev/null +++ b/adapters/saved-alpaca-belle13b/checkpoint-13400/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e93d2a17532c01c3ace8174e693b43407deb3b8ce424dc43badfc84abd21d18e +size 14583 diff --git a/adapters/saved-alpaca-belle13b/checkpoint-13400/rng_state_2.pth b/adapters/saved-alpaca-belle13b/checkpoint-13400/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..3de4c082c0c238235dd72a24c894a97b8d312f9d --- /dev/null +++ b/adapters/saved-alpaca-belle13b/checkpoint-13400/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fc1bbb9d41a589c0f63389e8ebc335d78e64a8f8b3ced74a3d381589a7e14bc5 +size 14583 diff --git a/adapters/saved-alpaca-belle13b/checkpoint-13400/rng_state_3.pth b/adapters/saved-alpaca-belle13b/checkpoint-13400/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..72e6bd1544cdb1ca7eff8b687c40fb29656a0847 --- /dev/null +++ b/adapters/saved-alpaca-belle13b/checkpoint-13400/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a016a54d076cbcaf9b85787cea3317890d3b8aaee4719352a03d38c0a4f272ae +size 14583 diff --git a/adapters/saved-alpaca-belle13b/checkpoint-13400/rng_state_4.pth b/adapters/saved-alpaca-belle13b/checkpoint-13400/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..fc54bac75576090dd7d6323eb4ed8863852541a8 --- /dev/null +++ b/adapters/saved-alpaca-belle13b/checkpoint-13400/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c45ad1c724e0241e3fc1c828739468cd895792e7209aae90295cf3c5050ce40e +size 14583 diff --git a/adapters/saved-alpaca-belle13b/checkpoint-13400/rng_state_5.pth b/adapters/saved-alpaca-belle13b/checkpoint-13400/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..8fc6d2571348e8e0a728a2c406975260d6d47af2 --- /dev/null +++ b/adapters/saved-alpaca-belle13b/checkpoint-13400/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d29a82d8f052efaec3a290541ac8977914af1ee9d63ae567ab442371bb3c75e1 +size 14583 diff --git a/adapters/saved-alpaca-belle13b/checkpoint-13400/rng_state_6.pth b/adapters/saved-alpaca-belle13b/checkpoint-13400/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..a1f61e2b961f37e2defab5deb4f3f2730e6f82b1 --- /dev/null +++ b/adapters/saved-alpaca-belle13b/checkpoint-13400/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6ccc4591e6aa18932332eb06f02e8cf9ef469d960ab5e3d27e5f1e5c26a82ae0 +size 14583 diff --git a/adapters/saved-alpaca-belle13b/checkpoint-13400/rng_state_7.pth b/adapters/saved-alpaca-belle13b/checkpoint-13400/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..82710ea9b9900c47aec7dcd2e88720ac2c761564 --- /dev/null +++ b/adapters/saved-alpaca-belle13b/checkpoint-13400/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7122aace04d9d0808ffe8ef7112a2c7222d070b417aa97fb0ab05dd6c08f9c84 +size 14583 diff --git a/adapters/saved-alpaca-belle13b/checkpoint-13400/scaler.pt b/adapters/saved-alpaca-belle13b/checkpoint-13400/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..311a380a3b819e9ef883a8707669fe05d1afe8d0 --- /dev/null +++ b/adapters/saved-alpaca-belle13b/checkpoint-13400/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:821bcb3e6604a4b8f1e629e9c3e0629375eb9f30b26f637670c11d3b63c9d35e +size 557 diff --git a/adapters/saved-alpaca-belle13b/checkpoint-13400/scheduler.pt b/adapters/saved-alpaca-belle13b/checkpoint-13400/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..2d12795ae2f07ce1dbe785fac1496aa496ffa754 --- /dev/null +++ b/adapters/saved-alpaca-belle13b/checkpoint-13400/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1d30f00ef23455dd93f9b3f26acdf4d06aa02ffafaf9b66b2c33a90de1443c3c +size 627 diff --git a/adapters/saved-alpaca-belle13b/checkpoint-13400/trainer_state.json b/adapters/saved-alpaca-belle13b/checkpoint-13400/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..aeb8c6813de5ac078fe3dfcb381785f633afea8e --- /dev/null +++ b/adapters/saved-alpaca-belle13b/checkpoint-13400/trainer_state.json @@ -0,0 +1,4572 @@ +{ + "best_metric": 0.7123447060585022, + "best_model_checkpoint": "/mnt/bn/qingyi-bn-lq/llama/saved-alpaca-belle13b/checkpoint-13400", + "epoch": 2.8910463861920173, + "global_step": 13400, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 5.9999999999999995e-05, + "loss": 1.6589, + "step": 20 + }, + { + "epoch": 0.01, + "learning_rate": 0.00011999999999999999, + "loss": 1.4071, + "step": 40 + }, + { + "epoch": 0.01, + "learning_rate": 0.00017999999999999998, + "loss": 1.044, + "step": 60 + }, + { + "epoch": 0.02, + "learning_rate": 0.00023999999999999998, + "loss": 0.9883, + "step": 80 + }, + { + "epoch": 0.02, + "learning_rate": 0.0003, + "loss": 0.9659, + "step": 100 + }, + { + "epoch": 0.03, + "learning_rate": 0.00029956537486417964, + "loss": 0.9505, + "step": 120 + }, + { + "epoch": 0.03, + "learning_rate": 0.00029913074972835925, + "loss": 0.9205, + "step": 140 + }, + { + "epoch": 0.03, + "learning_rate": 0.0002986961245925389, + "loss": 0.9168, + "step": 160 + }, + { + "epoch": 0.04, + "learning_rate": 0.0002982614994567186, + "loss": 0.9117, + "step": 180 + }, + { + "epoch": 0.04, + "learning_rate": 0.0002978268743208982, + "loss": 0.9064, + "step": 200 + }, + { + "epoch": 0.04, + "eval_loss": 0.9033477306365967, + "eval_runtime": 25.3136, + "eval_samples_per_second": 79.009, + "eval_steps_per_second": 1.264, + "step": 200 + }, + { + "epoch": 0.05, + "learning_rate": 0.00029739224918507785, + "loss": 0.8981, + "step": 220 + }, + { + "epoch": 0.05, + "learning_rate": 0.0002969576240492575, + "loss": 0.8912, + "step": 240 + }, + { + "epoch": 0.06, + "learning_rate": 0.0002965229989134371, + "loss": 0.8875, + "step": 260 + }, + { + "epoch": 0.06, + "learning_rate": 0.0002960883737776168, + "loss": 0.8907, + "step": 280 + }, + { + "epoch": 0.06, + "learning_rate": 0.00029565374864179645, + "loss": 0.8753, + "step": 300 + }, + { + "epoch": 0.07, + "learning_rate": 0.00029521912350597606, + "loss": 0.8782, + "step": 320 + }, + { + "epoch": 0.07, + "learning_rate": 0.0002947844983701557, + "loss": 0.8697, + "step": 340 + }, + { + "epoch": 0.08, + "learning_rate": 0.0002943498732343354, + "loss": 0.8745, + "step": 360 + }, + { + "epoch": 0.08, + "learning_rate": 0.000293915248098515, + "loss": 0.8725, + "step": 380 + }, + { + "epoch": 0.09, + "learning_rate": 0.00029348062296269466, + "loss": 0.8658, + "step": 400 + }, + { + "epoch": 0.09, + "eval_loss": 0.8655584454536438, + "eval_runtime": 25.3343, + "eval_samples_per_second": 78.944, + "eval_steps_per_second": 1.263, + "step": 400 + }, + { + "epoch": 0.09, + "learning_rate": 0.0002930459978268743, + "loss": 0.8641, + "step": 420 + }, + { + "epoch": 0.09, + "learning_rate": 0.00029261137269105393, + "loss": 0.8509, + "step": 440 + }, + { + "epoch": 0.1, + "learning_rate": 0.0002921767475552336, + "loss": 0.8541, + "step": 460 + }, + { + "epoch": 0.1, + "learning_rate": 0.00029174212241941326, + "loss": 0.8575, + "step": 480 + }, + { + "epoch": 0.11, + "learning_rate": 0.00029130749728359287, + "loss": 0.8482, + "step": 500 + }, + { + "epoch": 0.11, + "learning_rate": 0.00029087287214777253, + "loss": 0.8572, + "step": 520 + }, + { + "epoch": 0.12, + "learning_rate": 0.0002904382470119522, + "loss": 0.8489, + "step": 540 + }, + { + "epoch": 0.12, + "learning_rate": 0.0002900036218761318, + "loss": 0.8585, + "step": 560 + }, + { + "epoch": 0.13, + "learning_rate": 0.00028956899674031147, + "loss": 0.8387, + "step": 580 + }, + { + "epoch": 0.13, + "learning_rate": 0.00028913437160449113, + "loss": 0.8306, + "step": 600 + }, + { + "epoch": 0.13, + "eval_loss": 0.8434031009674072, + "eval_runtime": 25.3211, + "eval_samples_per_second": 78.986, + "eval_steps_per_second": 1.264, + "step": 600 + }, + { + "epoch": 0.13, + "learning_rate": 0.00028869974646867074, + "loss": 0.8331, + "step": 620 + }, + { + "epoch": 0.14, + "learning_rate": 0.0002882651213328504, + "loss": 0.8447, + "step": 640 + }, + { + "epoch": 0.14, + "learning_rate": 0.00028783049619703007, + "loss": 0.836, + "step": 660 + }, + { + "epoch": 0.15, + "learning_rate": 0.0002873958710612097, + "loss": 0.8436, + "step": 680 + }, + { + "epoch": 0.15, + "learning_rate": 0.00028696124592538934, + "loss": 0.8281, + "step": 700 + }, + { + "epoch": 0.16, + "learning_rate": 0.000286526620789569, + "loss": 0.8378, + "step": 720 + }, + { + "epoch": 0.16, + "learning_rate": 0.0002860919956537486, + "loss": 0.8338, + "step": 740 + }, + { + "epoch": 0.16, + "learning_rate": 0.0002856573705179283, + "loss": 0.8323, + "step": 760 + }, + { + "epoch": 0.17, + "learning_rate": 0.00028522274538210794, + "loss": 0.8153, + "step": 780 + }, + { + "epoch": 0.17, + "learning_rate": 0.00028478812024628755, + "loss": 0.8349, + "step": 800 + }, + { + "epoch": 0.17, + "eval_loss": 0.8282934427261353, + "eval_runtime": 25.4025, + "eval_samples_per_second": 78.733, + "eval_steps_per_second": 1.26, + "step": 800 + }, + { + "epoch": 0.18, + "learning_rate": 0.0002843534951104672, + "loss": 0.8198, + "step": 820 + }, + { + "epoch": 0.18, + "learning_rate": 0.0002839188699746469, + "loss": 0.8254, + "step": 840 + }, + { + "epoch": 0.19, + "learning_rate": 0.0002834842448388265, + "loss": 0.8165, + "step": 860 + }, + { + "epoch": 0.19, + "learning_rate": 0.00028304961970300615, + "loss": 0.8241, + "step": 880 + }, + { + "epoch": 0.19, + "learning_rate": 0.0002826149945671858, + "loss": 0.814, + "step": 900 + }, + { + "epoch": 0.2, + "learning_rate": 0.0002821803694313654, + "loss": 0.8222, + "step": 920 + }, + { + "epoch": 0.2, + "learning_rate": 0.0002817457442955451, + "loss": 0.825, + "step": 940 + }, + { + "epoch": 0.21, + "learning_rate": 0.00028131111915972475, + "loss": 0.8153, + "step": 960 + }, + { + "epoch": 0.21, + "learning_rate": 0.00028087649402390436, + "loss": 0.8229, + "step": 980 + }, + { + "epoch": 0.22, + "learning_rate": 0.00028044186888808397, + "loss": 0.8129, + "step": 1000 + }, + { + "epoch": 0.22, + "eval_loss": 0.816320538520813, + "eval_runtime": 25.4153, + "eval_samples_per_second": 78.693, + "eval_steps_per_second": 1.259, + "step": 1000 + }, + { + "epoch": 0.22, + "learning_rate": 0.00028000724375226363, + "loss": 0.8121, + "step": 1020 + }, + { + "epoch": 0.22, + "learning_rate": 0.0002795726186164433, + "loss": 0.8063, + "step": 1040 + }, + { + "epoch": 0.23, + "learning_rate": 0.0002791379934806229, + "loss": 0.8097, + "step": 1060 + }, + { + "epoch": 0.23, + "learning_rate": 0.00027870336834480257, + "loss": 0.8142, + "step": 1080 + }, + { + "epoch": 0.24, + "learning_rate": 0.00027826874320898223, + "loss": 0.8021, + "step": 1100 + }, + { + "epoch": 0.24, + "learning_rate": 0.00027783411807316184, + "loss": 0.8014, + "step": 1120 + }, + { + "epoch": 0.25, + "learning_rate": 0.0002773994929373415, + "loss": 0.8031, + "step": 1140 + }, + { + "epoch": 0.25, + "learning_rate": 0.00027696486780152117, + "loss": 0.8011, + "step": 1160 + }, + { + "epoch": 0.25, + "learning_rate": 0.0002765302426657008, + "loss": 0.7944, + "step": 1180 + }, + { + "epoch": 0.26, + "learning_rate": 0.00027609561752988044, + "loss": 0.8071, + "step": 1200 + }, + { + "epoch": 0.26, + "eval_loss": 0.8064733147621155, + "eval_runtime": 25.3901, + "eval_samples_per_second": 78.771, + "eval_steps_per_second": 1.26, + "step": 1200 + }, + { + "epoch": 0.26, + "learning_rate": 0.0002756609923940601, + "loss": 0.8025, + "step": 1220 + }, + { + "epoch": 0.27, + "learning_rate": 0.0002752263672582397, + "loss": 0.7954, + "step": 1240 + }, + { + "epoch": 0.27, + "learning_rate": 0.0002747917421224194, + "loss": 0.8013, + "step": 1260 + }, + { + "epoch": 0.28, + "learning_rate": 0.00027435711698659904, + "loss": 0.7967, + "step": 1280 + }, + { + "epoch": 0.28, + "learning_rate": 0.00027392249185077865, + "loss": 0.8132, + "step": 1300 + }, + { + "epoch": 0.28, + "learning_rate": 0.0002734878667149583, + "loss": 0.8017, + "step": 1320 + }, + { + "epoch": 0.29, + "learning_rate": 0.000273053241579138, + "loss": 0.7964, + "step": 1340 + }, + { + "epoch": 0.29, + "learning_rate": 0.0002726186164433176, + "loss": 0.8012, + "step": 1360 + }, + { + "epoch": 0.3, + "learning_rate": 0.00027218399130749725, + "loss": 0.7982, + "step": 1380 + }, + { + "epoch": 0.3, + "learning_rate": 0.0002717493661716769, + "loss": 0.8031, + "step": 1400 + }, + { + "epoch": 0.3, + "eval_loss": 0.798474133014679, + "eval_runtime": 25.432, + "eval_samples_per_second": 78.641, + "eval_steps_per_second": 1.258, + "step": 1400 + }, + { + "epoch": 0.31, + "learning_rate": 0.0002713147410358565, + "loss": 0.7925, + "step": 1420 + }, + { + "epoch": 0.31, + "learning_rate": 0.0002708801159000362, + "loss": 0.794, + "step": 1440 + }, + { + "epoch": 0.31, + "learning_rate": 0.00027044549076421585, + "loss": 0.804, + "step": 1460 + }, + { + "epoch": 0.32, + "learning_rate": 0.00027001086562839546, + "loss": 0.7942, + "step": 1480 + }, + { + "epoch": 0.32, + "learning_rate": 0.0002695762404925751, + "loss": 0.7872, + "step": 1500 + }, + { + "epoch": 0.33, + "learning_rate": 0.0002691416153567548, + "loss": 0.7962, + "step": 1520 + }, + { + "epoch": 0.33, + "learning_rate": 0.0002687069902209344, + "loss": 0.7898, + "step": 1540 + }, + { + "epoch": 0.34, + "learning_rate": 0.00026827236508511406, + "loss": 0.7886, + "step": 1560 + }, + { + "epoch": 0.34, + "learning_rate": 0.0002678377399492937, + "loss": 0.7904, + "step": 1580 + }, + { + "epoch": 0.35, + "learning_rate": 0.00026740311481347333, + "loss": 0.7892, + "step": 1600 + }, + { + "epoch": 0.35, + "eval_loss": 0.7912269234657288, + "eval_runtime": 25.444, + "eval_samples_per_second": 78.604, + "eval_steps_per_second": 1.258, + "step": 1600 + }, + { + "epoch": 0.35, + "learning_rate": 0.000266968489677653, + "loss": 0.7897, + "step": 1620 + }, + { + "epoch": 0.35, + "learning_rate": 0.00026653386454183266, + "loss": 0.7927, + "step": 1640 + }, + { + "epoch": 0.36, + "learning_rate": 0.00026609923940601227, + "loss": 0.7829, + "step": 1660 + }, + { + "epoch": 0.36, + "learning_rate": 0.00026566461427019193, + "loss": 0.7788, + "step": 1680 + }, + { + "epoch": 0.37, + "learning_rate": 0.0002652299891343716, + "loss": 0.786, + "step": 1700 + }, + { + "epoch": 0.37, + "learning_rate": 0.0002647953639985512, + "loss": 0.7828, + "step": 1720 + }, + { + "epoch": 0.38, + "learning_rate": 0.00026436073886273087, + "loss": 0.7788, + "step": 1740 + }, + { + "epoch": 0.38, + "learning_rate": 0.00026392611372691053, + "loss": 0.7851, + "step": 1760 + }, + { + "epoch": 0.38, + "learning_rate": 0.00026349148859109014, + "loss": 0.7936, + "step": 1780 + }, + { + "epoch": 0.39, + "learning_rate": 0.0002630568634552698, + "loss": 0.7758, + "step": 1800 + }, + { + "epoch": 0.39, + "eval_loss": 0.7854430675506592, + "eval_runtime": 25.4734, + "eval_samples_per_second": 78.513, + "eval_steps_per_second": 1.256, + "step": 1800 + }, + { + "epoch": 0.39, + "learning_rate": 0.00026262223831944947, + "loss": 0.787, + "step": 1820 + }, + { + "epoch": 0.4, + "learning_rate": 0.0002621876131836291, + "loss": 0.7779, + "step": 1840 + }, + { + "epoch": 0.4, + "learning_rate": 0.00026175298804780874, + "loss": 0.7792, + "step": 1860 + }, + { + "epoch": 0.41, + "learning_rate": 0.0002613183629119884, + "loss": 0.7728, + "step": 1880 + }, + { + "epoch": 0.41, + "learning_rate": 0.000260883737776168, + "loss": 0.7844, + "step": 1900 + }, + { + "epoch": 0.41, + "learning_rate": 0.0002604491126403477, + "loss": 0.7726, + "step": 1920 + }, + { + "epoch": 0.42, + "learning_rate": 0.00026001448750452734, + "loss": 0.7706, + "step": 1940 + }, + { + "epoch": 0.42, + "learning_rate": 0.00025957986236870695, + "loss": 0.7659, + "step": 1960 + }, + { + "epoch": 0.43, + "learning_rate": 0.0002591452372328866, + "loss": 0.7808, + "step": 1980 + }, + { + "epoch": 0.43, + "learning_rate": 0.0002587106120970663, + "loss": 0.7692, + "step": 2000 + }, + { + "epoch": 0.43, + "eval_loss": 0.7800412774085999, + "eval_runtime": 25.5146, + "eval_samples_per_second": 78.387, + "eval_steps_per_second": 1.254, + "step": 2000 + }, + { + "epoch": 0.44, + "learning_rate": 0.0002582759869612459, + "loss": 0.7665, + "step": 2020 + }, + { + "epoch": 0.44, + "learning_rate": 0.00025784136182542555, + "loss": 0.7795, + "step": 2040 + }, + { + "epoch": 0.44, + "learning_rate": 0.0002574067366896052, + "loss": 0.7846, + "step": 2060 + }, + { + "epoch": 0.45, + "learning_rate": 0.0002569721115537848, + "loss": 0.7639, + "step": 2080 + }, + { + "epoch": 0.45, + "learning_rate": 0.0002565374864179645, + "loss": 0.7827, + "step": 2100 + }, + { + "epoch": 0.46, + "learning_rate": 0.00025610286128214415, + "loss": 0.7751, + "step": 2120 + }, + { + "epoch": 0.46, + "learning_rate": 0.00025566823614632376, + "loss": 0.776, + "step": 2140 + }, + { + "epoch": 0.47, + "learning_rate": 0.0002552336110105034, + "loss": 0.7773, + "step": 2160 + }, + { + "epoch": 0.47, + "learning_rate": 0.0002547989858746831, + "loss": 0.7757, + "step": 2180 + }, + { + "epoch": 0.47, + "learning_rate": 0.0002543643607388627, + "loss": 0.7769, + "step": 2200 + }, + { + "epoch": 0.47, + "eval_loss": 0.7759379744529724, + "eval_runtime": 25.4789, + "eval_samples_per_second": 78.496, + "eval_steps_per_second": 1.256, + "step": 2200 + }, + { + "epoch": 0.48, + "learning_rate": 0.00025392973560304236, + "loss": 0.7657, + "step": 2220 + }, + { + "epoch": 0.48, + "learning_rate": 0.000253495110467222, + "loss": 0.7664, + "step": 2240 + }, + { + "epoch": 0.49, + "learning_rate": 0.00025306048533140163, + "loss": 0.7774, + "step": 2260 + }, + { + "epoch": 0.49, + "learning_rate": 0.0002526258601955813, + "loss": 0.7591, + "step": 2280 + }, + { + "epoch": 0.5, + "learning_rate": 0.00025219123505976096, + "loss": 0.7605, + "step": 2300 + }, + { + "epoch": 0.5, + "learning_rate": 0.00025175660992394057, + "loss": 0.7693, + "step": 2320 + }, + { + "epoch": 0.5, + "learning_rate": 0.00025132198478812023, + "loss": 0.7702, + "step": 2340 + }, + { + "epoch": 0.51, + "learning_rate": 0.0002508873596522999, + "loss": 0.7706, + "step": 2360 + }, + { + "epoch": 0.51, + "learning_rate": 0.0002504527345164795, + "loss": 0.7664, + "step": 2380 + }, + { + "epoch": 0.52, + "learning_rate": 0.00025001810938065917, + "loss": 0.76, + "step": 2400 + }, + { + "epoch": 0.52, + "eval_loss": 0.7723669409751892, + "eval_runtime": 25.4827, + "eval_samples_per_second": 78.485, + "eval_steps_per_second": 1.256, + "step": 2400 + }, + { + "epoch": 0.52, + "learning_rate": 0.00024958348424483883, + "loss": 0.7702, + "step": 2420 + }, + { + "epoch": 0.53, + "learning_rate": 0.00024914885910901844, + "loss": 0.7686, + "step": 2440 + }, + { + "epoch": 0.53, + "learning_rate": 0.0002487142339731981, + "loss": 0.762, + "step": 2460 + }, + { + "epoch": 0.54, + "learning_rate": 0.00024827960883737777, + "loss": 0.7719, + "step": 2480 + }, + { + "epoch": 0.54, + "learning_rate": 0.0002478449837015574, + "loss": 0.7612, + "step": 2500 + }, + { + "epoch": 0.54, + "learning_rate": 0.00024741035856573704, + "loss": 0.7565, + "step": 2520 + }, + { + "epoch": 0.55, + "learning_rate": 0.0002469757334299167, + "loss": 0.7719, + "step": 2540 + }, + { + "epoch": 0.55, + "learning_rate": 0.0002465411082940963, + "loss": 0.7619, + "step": 2560 + }, + { + "epoch": 0.56, + "learning_rate": 0.000246106483158276, + "loss": 0.7607, + "step": 2580 + }, + { + "epoch": 0.56, + "learning_rate": 0.00024567185802245564, + "loss": 0.7564, + "step": 2600 + }, + { + "epoch": 0.56, + "eval_loss": 0.7678729295730591, + "eval_runtime": 25.4455, + "eval_samples_per_second": 78.599, + "eval_steps_per_second": 1.258, + "step": 2600 + }, + { + "epoch": 0.57, + "learning_rate": 0.00024523723288663525, + "loss": 0.7613, + "step": 2620 + }, + { + "epoch": 0.57, + "learning_rate": 0.0002448026077508149, + "loss": 0.7525, + "step": 2640 + }, + { + "epoch": 0.57, + "learning_rate": 0.0002443679826149946, + "loss": 0.7563, + "step": 2660 + }, + { + "epoch": 0.58, + "learning_rate": 0.00024393335747917422, + "loss": 0.7601, + "step": 2680 + }, + { + "epoch": 0.58, + "learning_rate": 0.00024349873234335383, + "loss": 0.7633, + "step": 2700 + }, + { + "epoch": 0.59, + "learning_rate": 0.00024306410720753346, + "loss": 0.75, + "step": 2720 + }, + { + "epoch": 0.59, + "learning_rate": 0.0002426294820717131, + "loss": 0.7602, + "step": 2740 + }, + { + "epoch": 0.6, + "learning_rate": 0.00024219485693589276, + "loss": 0.7546, + "step": 2760 + }, + { + "epoch": 0.6, + "learning_rate": 0.0002417602318000724, + "loss": 0.7532, + "step": 2780 + }, + { + "epoch": 0.6, + "learning_rate": 0.00024132560666425203, + "loss": 0.7661, + "step": 2800 + }, + { + "epoch": 0.6, + "eval_loss": 0.7649803757667542, + "eval_runtime": 25.4783, + "eval_samples_per_second": 78.498, + "eval_steps_per_second": 1.256, + "step": 2800 + }, + { + "epoch": 0.61, + "learning_rate": 0.0002408909815284317, + "loss": 0.7587, + "step": 2820 + }, + { + "epoch": 0.61, + "learning_rate": 0.00024045635639261133, + "loss": 0.7543, + "step": 2840 + }, + { + "epoch": 0.62, + "learning_rate": 0.00024002173125679097, + "loss": 0.7672, + "step": 2860 + }, + { + "epoch": 0.62, + "learning_rate": 0.00023958710612097063, + "loss": 0.7623, + "step": 2880 + }, + { + "epoch": 0.63, + "learning_rate": 0.00023915248098515027, + "loss": 0.7487, + "step": 2900 + }, + { + "epoch": 0.63, + "learning_rate": 0.0002387178558493299, + "loss": 0.75, + "step": 2920 + }, + { + "epoch": 0.63, + "learning_rate": 0.00023828323071350957, + "loss": 0.7567, + "step": 2940 + }, + { + "epoch": 0.64, + "learning_rate": 0.0002378486055776892, + "loss": 0.7592, + "step": 2960 + }, + { + "epoch": 0.64, + "learning_rate": 0.00023741398044186884, + "loss": 0.7569, + "step": 2980 + }, + { + "epoch": 0.65, + "learning_rate": 0.0002369793553060485, + "loss": 0.7524, + "step": 3000 + }, + { + "epoch": 0.65, + "eval_loss": 0.7613279819488525, + "eval_runtime": 25.4837, + "eval_samples_per_second": 78.482, + "eval_steps_per_second": 1.256, + "step": 3000 + }, + { + "epoch": 0.65, + "learning_rate": 0.00023654473017022814, + "loss": 0.7593, + "step": 3020 + }, + { + "epoch": 0.66, + "learning_rate": 0.00023611010503440778, + "loss": 0.7516, + "step": 3040 + }, + { + "epoch": 0.66, + "learning_rate": 0.00023567547989858744, + "loss": 0.7525, + "step": 3060 + }, + { + "epoch": 0.66, + "learning_rate": 0.00023524085476276708, + "loss": 0.7583, + "step": 3080 + }, + { + "epoch": 0.67, + "learning_rate": 0.00023480622962694672, + "loss": 0.7535, + "step": 3100 + }, + { + "epoch": 0.67, + "learning_rate": 0.00023437160449112638, + "loss": 0.7528, + "step": 3120 + }, + { + "epoch": 0.68, + "learning_rate": 0.00023393697935530602, + "loss": 0.7418, + "step": 3140 + }, + { + "epoch": 0.68, + "learning_rate": 0.00023350235421948565, + "loss": 0.7496, + "step": 3160 + }, + { + "epoch": 0.69, + "learning_rate": 0.00023306772908366532, + "loss": 0.7537, + "step": 3180 + }, + { + "epoch": 0.69, + "learning_rate": 0.00023263310394784495, + "loss": 0.7569, + "step": 3200 + }, + { + "epoch": 0.69, + "eval_loss": 0.7581906914710999, + "eval_runtime": 25.4588, + "eval_samples_per_second": 78.558, + "eval_steps_per_second": 1.257, + "step": 3200 + }, + { + "epoch": 0.69, + "learning_rate": 0.0002321984788120246, + "loss": 0.7465, + "step": 3220 + }, + { + "epoch": 0.7, + "learning_rate": 0.00023176385367620425, + "loss": 0.7367, + "step": 3240 + }, + { + "epoch": 0.7, + "learning_rate": 0.0002313292285403839, + "loss": 0.7425, + "step": 3260 + }, + { + "epoch": 0.71, + "learning_rate": 0.00023089460340456353, + "loss": 0.7637, + "step": 3280 + }, + { + "epoch": 0.71, + "learning_rate": 0.0002304599782687432, + "loss": 0.7574, + "step": 3300 + }, + { + "epoch": 0.72, + "learning_rate": 0.00023002535313292283, + "loss": 0.7448, + "step": 3320 + }, + { + "epoch": 0.72, + "learning_rate": 0.00022959072799710246, + "loss": 0.7595, + "step": 3340 + }, + { + "epoch": 0.72, + "learning_rate": 0.00022915610286128213, + "loss": 0.7465, + "step": 3360 + }, + { + "epoch": 0.73, + "learning_rate": 0.00022872147772546176, + "loss": 0.7532, + "step": 3380 + }, + { + "epoch": 0.73, + "learning_rate": 0.0002282868525896414, + "loss": 0.7466, + "step": 3400 + }, + { + "epoch": 0.73, + "eval_loss": 0.7559078931808472, + "eval_runtime": 25.464, + "eval_samples_per_second": 78.542, + "eval_steps_per_second": 1.257, + "step": 3400 + }, + { + "epoch": 0.74, + "learning_rate": 0.00022785222745382106, + "loss": 0.753, + "step": 3420 + }, + { + "epoch": 0.74, + "learning_rate": 0.0002274176023180007, + "loss": 0.7459, + "step": 3440 + }, + { + "epoch": 0.75, + "learning_rate": 0.00022698297718218034, + "loss": 0.7519, + "step": 3460 + }, + { + "epoch": 0.75, + "learning_rate": 0.00022654835204636, + "loss": 0.7451, + "step": 3480 + }, + { + "epoch": 0.76, + "learning_rate": 0.00022611372691053964, + "loss": 0.7468, + "step": 3500 + }, + { + "epoch": 0.76, + "learning_rate": 0.00022567910177471927, + "loss": 0.7491, + "step": 3520 + }, + { + "epoch": 0.76, + "learning_rate": 0.00022524447663889894, + "loss": 0.7524, + "step": 3540 + }, + { + "epoch": 0.77, + "learning_rate": 0.00022480985150307857, + "loss": 0.7484, + "step": 3560 + }, + { + "epoch": 0.77, + "learning_rate": 0.0002243752263672582, + "loss": 0.7484, + "step": 3580 + }, + { + "epoch": 0.78, + "learning_rate": 0.00022394060123143787, + "loss": 0.7529, + "step": 3600 + }, + { + "epoch": 0.78, + "eval_loss": 0.7531791925430298, + "eval_runtime": 25.4572, + "eval_samples_per_second": 78.563, + "eval_steps_per_second": 1.257, + "step": 3600 + }, + { + "epoch": 0.78, + "learning_rate": 0.0002235059760956175, + "loss": 0.7475, + "step": 3620 + }, + { + "epoch": 0.79, + "learning_rate": 0.00022307135095979715, + "loss": 0.7518, + "step": 3640 + }, + { + "epoch": 0.79, + "learning_rate": 0.0002226367258239768, + "loss": 0.751, + "step": 3660 + }, + { + "epoch": 0.79, + "learning_rate": 0.00022220210068815645, + "loss": 0.7402, + "step": 3680 + }, + { + "epoch": 0.8, + "learning_rate": 0.00022176747555233608, + "loss": 0.755, + "step": 3700 + }, + { + "epoch": 0.8, + "learning_rate": 0.00022133285041651575, + "loss": 0.7441, + "step": 3720 + }, + { + "epoch": 0.81, + "learning_rate": 0.00022089822528069538, + "loss": 0.746, + "step": 3740 + }, + { + "epoch": 0.81, + "learning_rate": 0.00022046360014487502, + "loss": 0.7441, + "step": 3760 + }, + { + "epoch": 0.82, + "learning_rate": 0.00022002897500905468, + "loss": 0.7475, + "step": 3780 + }, + { + "epoch": 0.82, + "learning_rate": 0.00021959434987323432, + "loss": 0.7458, + "step": 3800 + }, + { + "epoch": 0.82, + "eval_loss": 0.7513870596885681, + "eval_runtime": 25.4906, + "eval_samples_per_second": 78.46, + "eval_steps_per_second": 1.255, + "step": 3800 + }, + { + "epoch": 0.82, + "learning_rate": 0.00021915972473741396, + "loss": 0.7436, + "step": 3820 + }, + { + "epoch": 0.83, + "learning_rate": 0.00021872509960159362, + "loss": 0.7451, + "step": 3840 + }, + { + "epoch": 0.83, + "learning_rate": 0.00021829047446577326, + "loss": 0.7475, + "step": 3860 + }, + { + "epoch": 0.84, + "learning_rate": 0.0002178558493299529, + "loss": 0.7424, + "step": 3880 + }, + { + "epoch": 0.84, + "learning_rate": 0.00021742122419413256, + "loss": 0.7503, + "step": 3900 + }, + { + "epoch": 0.85, + "learning_rate": 0.0002169865990583122, + "loss": 0.7334, + "step": 3920 + }, + { + "epoch": 0.85, + "learning_rate": 0.00021655197392249183, + "loss": 0.7436, + "step": 3940 + }, + { + "epoch": 0.85, + "learning_rate": 0.0002161173487866715, + "loss": 0.7453, + "step": 3960 + }, + { + "epoch": 0.86, + "learning_rate": 0.00021568272365085113, + "loss": 0.7424, + "step": 3980 + }, + { + "epoch": 0.86, + "learning_rate": 0.00021524809851503076, + "loss": 0.7509, + "step": 4000 + }, + { + "epoch": 0.86, + "eval_loss": 0.7488968968391418, + "eval_runtime": 25.492, + "eval_samples_per_second": 78.456, + "eval_steps_per_second": 1.255, + "step": 4000 + }, + { + "epoch": 0.87, + "learning_rate": 0.00021481347337921043, + "loss": 0.7445, + "step": 4020 + }, + { + "epoch": 0.87, + "learning_rate": 0.00021437884824339006, + "loss": 0.74, + "step": 4040 + }, + { + "epoch": 0.88, + "learning_rate": 0.0002139442231075697, + "loss": 0.7362, + "step": 4060 + }, + { + "epoch": 0.88, + "learning_rate": 0.00021350959797174936, + "loss": 0.7409, + "step": 4080 + }, + { + "epoch": 0.88, + "learning_rate": 0.000213074972835929, + "loss": 0.7315, + "step": 4100 + }, + { + "epoch": 0.89, + "learning_rate": 0.00021264034770010864, + "loss": 0.7488, + "step": 4120 + }, + { + "epoch": 0.89, + "learning_rate": 0.0002122057225642883, + "loss": 0.7375, + "step": 4140 + }, + { + "epoch": 0.9, + "learning_rate": 0.00021177109742846794, + "loss": 0.7481, + "step": 4160 + }, + { + "epoch": 0.9, + "learning_rate": 0.00021133647229264757, + "loss": 0.7524, + "step": 4180 + }, + { + "epoch": 0.91, + "learning_rate": 0.00021092357841361823, + "loss": 0.7403, + "step": 4200 + }, + { + "epoch": 0.91, + "eval_loss": 0.7469983100891113, + "eval_runtime": 25.4847, + "eval_samples_per_second": 78.479, + "eval_steps_per_second": 1.256, + "step": 4200 + }, + { + "epoch": 0.91, + "learning_rate": 0.00021048895327779787, + "loss": 0.7394, + "step": 4220 + }, + { + "epoch": 0.91, + "learning_rate": 0.0002100543281419775, + "loss": 0.7405, + "step": 4240 + }, + { + "epoch": 0.92, + "learning_rate": 0.00020961970300615717, + "loss": 0.7534, + "step": 4260 + }, + { + "epoch": 0.92, + "learning_rate": 0.0002091850778703368, + "loss": 0.7412, + "step": 4280 + }, + { + "epoch": 0.93, + "learning_rate": 0.00020875045273451644, + "loss": 0.7393, + "step": 4300 + }, + { + "epoch": 0.93, + "learning_rate": 0.0002083158275986961, + "loss": 0.7289, + "step": 4320 + }, + { + "epoch": 0.94, + "learning_rate": 0.00020788120246287574, + "loss": 0.7342, + "step": 4340 + }, + { + "epoch": 0.94, + "learning_rate": 0.00020744657732705538, + "loss": 0.7427, + "step": 4360 + }, + { + "epoch": 0.94, + "learning_rate": 0.00020701195219123504, + "loss": 0.7386, + "step": 4380 + }, + { + "epoch": 0.95, + "learning_rate": 0.00020657732705541468, + "loss": 0.7374, + "step": 4400 + }, + { + "epoch": 0.95, + "eval_loss": 0.7451291680335999, + "eval_runtime": 25.461, + "eval_samples_per_second": 78.552, + "eval_steps_per_second": 1.257, + "step": 4400 + }, + { + "epoch": 0.95, + "learning_rate": 0.0002061427019195943, + "loss": 0.7364, + "step": 4420 + }, + { + "epoch": 0.96, + "learning_rate": 0.00020570807678377398, + "loss": 0.7377, + "step": 4440 + }, + { + "epoch": 0.96, + "learning_rate": 0.0002052734516479536, + "loss": 0.7391, + "step": 4460 + }, + { + "epoch": 0.97, + "learning_rate": 0.00020483882651213325, + "loss": 0.731, + "step": 4480 + }, + { + "epoch": 0.97, + "learning_rate": 0.0002044042013763129, + "loss": 0.735, + "step": 4500 + }, + { + "epoch": 0.98, + "learning_rate": 0.00020396957624049255, + "loss": 0.7344, + "step": 4520 + }, + { + "epoch": 0.98, + "learning_rate": 0.00020353495110467219, + "loss": 0.7355, + "step": 4540 + }, + { + "epoch": 0.98, + "learning_rate": 0.00020310032596885185, + "loss": 0.7357, + "step": 4560 + }, + { + "epoch": 0.99, + "learning_rate": 0.00020266570083303149, + "loss": 0.7377, + "step": 4580 + }, + { + "epoch": 0.99, + "learning_rate": 0.00020223107569721112, + "loss": 0.7438, + "step": 4600 + }, + { + "epoch": 0.99, + "eval_loss": 0.7437875270843506, + "eval_runtime": 25.5255, + "eval_samples_per_second": 78.353, + "eval_steps_per_second": 1.254, + "step": 4600 + }, + { + "epoch": 1.0, + "learning_rate": 0.00020179645056139079, + "loss": 0.7343, + "step": 4620 + }, + { + "epoch": 1.0, + "learning_rate": 0.00020136182542557042, + "loss": 0.7473, + "step": 4640 + }, + { + "epoch": 1.01, + "learning_rate": 0.00020092720028975006, + "loss": 0.7305, + "step": 4660 + }, + { + "epoch": 1.01, + "learning_rate": 0.00020049257515392972, + "loss": 0.7284, + "step": 4680 + }, + { + "epoch": 1.01, + "learning_rate": 0.00020005795001810936, + "loss": 0.7335, + "step": 4700 + }, + { + "epoch": 1.02, + "learning_rate": 0.000199623324882289, + "loss": 0.7282, + "step": 4720 + }, + { + "epoch": 1.02, + "learning_rate": 0.00019918869974646866, + "loss": 0.7337, + "step": 4740 + }, + { + "epoch": 1.03, + "learning_rate": 0.0001987540746106483, + "loss": 0.7195, + "step": 4760 + }, + { + "epoch": 1.03, + "learning_rate": 0.00019831944947482793, + "loss": 0.7327, + "step": 4780 + }, + { + "epoch": 1.04, + "learning_rate": 0.0001978848243390076, + "loss": 0.7259, + "step": 4800 + }, + { + "epoch": 1.04, + "eval_loss": 0.7413464188575745, + "eval_runtime": 25.4959, + "eval_samples_per_second": 78.444, + "eval_steps_per_second": 1.255, + "step": 4800 + }, + { + "epoch": 1.04, + "learning_rate": 0.00019745019920318723, + "loss": 0.7263, + "step": 4820 + }, + { + "epoch": 1.04, + "learning_rate": 0.00019701557406736687, + "loss": 0.7341, + "step": 4840 + }, + { + "epoch": 1.05, + "learning_rate": 0.00019658094893154653, + "loss": 0.7406, + "step": 4860 + }, + { + "epoch": 1.05, + "learning_rate": 0.00019614632379572617, + "loss": 0.7309, + "step": 4880 + }, + { + "epoch": 1.06, + "learning_rate": 0.0001957116986599058, + "loss": 0.7274, + "step": 4900 + }, + { + "epoch": 1.06, + "learning_rate": 0.00019527707352408547, + "loss": 0.7241, + "step": 4920 + }, + { + "epoch": 1.07, + "learning_rate": 0.0001948424483882651, + "loss": 0.7368, + "step": 4940 + }, + { + "epoch": 1.07, + "learning_rate": 0.00019440782325244474, + "loss": 0.7445, + "step": 4960 + }, + { + "epoch": 1.07, + "learning_rate": 0.0001939731981166244, + "loss": 0.7347, + "step": 4980 + }, + { + "epoch": 1.08, + "learning_rate": 0.00019353857298080404, + "loss": 0.7436, + "step": 5000 + }, + { + "epoch": 1.08, + "eval_loss": 0.7399871945381165, + "eval_runtime": 25.5032, + "eval_samples_per_second": 78.422, + "eval_steps_per_second": 1.255, + "step": 5000 + }, + { + "epoch": 1.08, + "learning_rate": 0.00019310394784498368, + "loss": 0.7248, + "step": 5020 + }, + { + "epoch": 1.09, + "learning_rate": 0.00019266932270916334, + "loss": 0.7374, + "step": 5040 + }, + { + "epoch": 1.09, + "learning_rate": 0.00019223469757334298, + "loss": 0.7187, + "step": 5060 + }, + { + "epoch": 1.1, + "learning_rate": 0.00019180007243752261, + "loss": 0.7381, + "step": 5080 + }, + { + "epoch": 1.1, + "learning_rate": 0.00019136544730170228, + "loss": 0.7389, + "step": 5100 + }, + { + "epoch": 1.1, + "learning_rate": 0.00019093082216588191, + "loss": 0.7343, + "step": 5120 + }, + { + "epoch": 1.11, + "learning_rate": 0.00019049619703006155, + "loss": 0.7323, + "step": 5140 + }, + { + "epoch": 1.11, + "learning_rate": 0.00019006157189424121, + "loss": 0.723, + "step": 5160 + }, + { + "epoch": 1.12, + "learning_rate": 0.00018962694675842085, + "loss": 0.7236, + "step": 5180 + }, + { + "epoch": 1.12, + "learning_rate": 0.0001891923216226005, + "loss": 0.7399, + "step": 5200 + }, + { + "epoch": 1.12, + "eval_loss": 0.7393975257873535, + "eval_runtime": 25.6137, + "eval_samples_per_second": 78.083, + "eval_steps_per_second": 1.249, + "step": 5200 + }, + { + "epoch": 1.13, + "learning_rate": 0.00018875769648678015, + "loss": 0.7373, + "step": 5220 + }, + { + "epoch": 1.13, + "learning_rate": 0.0001883230713509598, + "loss": 0.7257, + "step": 5240 + }, + { + "epoch": 1.13, + "learning_rate": 0.00018788844621513942, + "loss": 0.7261, + "step": 5260 + }, + { + "epoch": 1.14, + "learning_rate": 0.0001874538210793191, + "loss": 0.7302, + "step": 5280 + }, + { + "epoch": 1.14, + "learning_rate": 0.00018701919594349872, + "loss": 0.7337, + "step": 5300 + }, + { + "epoch": 1.15, + "learning_rate": 0.00018658457080767836, + "loss": 0.7237, + "step": 5320 + }, + { + "epoch": 1.15, + "learning_rate": 0.00018614994567185802, + "loss": 0.7238, + "step": 5340 + }, + { + "epoch": 1.16, + "learning_rate": 0.00018571532053603766, + "loss": 0.7287, + "step": 5360 + }, + { + "epoch": 1.16, + "learning_rate": 0.0001852806954002173, + "loss": 0.7237, + "step": 5380 + }, + { + "epoch": 1.17, + "learning_rate": 0.00018484607026439696, + "loss": 0.7256, + "step": 5400 + }, + { + "epoch": 1.17, + "eval_loss": 0.7377527952194214, + "eval_runtime": 25.4964, + "eval_samples_per_second": 78.442, + "eval_steps_per_second": 1.255, + "step": 5400 + }, + { + "epoch": 1.17, + "learning_rate": 0.0001844114451285766, + "loss": 0.7279, + "step": 5420 + }, + { + "epoch": 1.17, + "learning_rate": 0.00018397681999275623, + "loss": 0.7226, + "step": 5440 + }, + { + "epoch": 1.18, + "learning_rate": 0.0001835421948569359, + "loss": 0.7167, + "step": 5460 + }, + { + "epoch": 1.18, + "learning_rate": 0.00018310756972111553, + "loss": 0.7268, + "step": 5480 + }, + { + "epoch": 1.19, + "learning_rate": 0.00018267294458529517, + "loss": 0.7398, + "step": 5500 + }, + { + "epoch": 1.19, + "learning_rate": 0.00018223831944947483, + "loss": 0.7331, + "step": 5520 + }, + { + "epoch": 1.2, + "learning_rate": 0.00018180369431365447, + "loss": 0.7372, + "step": 5540 + }, + { + "epoch": 1.2, + "learning_rate": 0.0001813690691778341, + "loss": 0.7321, + "step": 5560 + }, + { + "epoch": 1.2, + "learning_rate": 0.00018093444404201377, + "loss": 0.7346, + "step": 5580 + }, + { + "epoch": 1.21, + "learning_rate": 0.0001804998189061934, + "loss": 0.722, + "step": 5600 + }, + { + "epoch": 1.21, + "eval_loss": 0.7368175983428955, + "eval_runtime": 25.5045, + "eval_samples_per_second": 78.417, + "eval_steps_per_second": 1.255, + "step": 5600 + }, + { + "epoch": 1.21, + "learning_rate": 0.00018006519377037304, + "loss": 0.7279, + "step": 5620 + }, + { + "epoch": 1.22, + "learning_rate": 0.0001796305686345527, + "loss": 0.72, + "step": 5640 + }, + { + "epoch": 1.22, + "learning_rate": 0.00017919594349873234, + "loss": 0.7295, + "step": 5660 + }, + { + "epoch": 1.23, + "learning_rate": 0.00017876131836291198, + "loss": 0.7245, + "step": 5680 + }, + { + "epoch": 1.23, + "learning_rate": 0.00017832669322709164, + "loss": 0.7418, + "step": 5700 + }, + { + "epoch": 1.23, + "learning_rate": 0.00017789206809127128, + "loss": 0.7317, + "step": 5720 + }, + { + "epoch": 1.24, + "learning_rate": 0.00017745744295545092, + "loss": 0.7303, + "step": 5740 + }, + { + "epoch": 1.24, + "learning_rate": 0.00017702281781963058, + "loss": 0.7332, + "step": 5760 + }, + { + "epoch": 1.25, + "learning_rate": 0.00017658819268381022, + "loss": 0.7202, + "step": 5780 + }, + { + "epoch": 1.25, + "learning_rate": 0.00017615356754798983, + "loss": 0.7238, + "step": 5800 + }, + { + "epoch": 1.25, + "eval_loss": 0.7348505854606628, + "eval_runtime": 25.509, + "eval_samples_per_second": 78.404, + "eval_steps_per_second": 1.254, + "step": 5800 + }, + { + "epoch": 1.26, + "learning_rate": 0.00017571894241216946, + "loss": 0.724, + "step": 5820 + }, + { + "epoch": 1.26, + "learning_rate": 0.00017528431727634913, + "loss": 0.7258, + "step": 5840 + }, + { + "epoch": 1.26, + "learning_rate": 0.00017484969214052876, + "loss": 0.7217, + "step": 5860 + }, + { + "epoch": 1.27, + "learning_rate": 0.0001744150670047084, + "loss": 0.7209, + "step": 5880 + }, + { + "epoch": 1.27, + "learning_rate": 0.00017398044186888806, + "loss": 0.7276, + "step": 5900 + }, + { + "epoch": 1.28, + "learning_rate": 0.0001735458167330677, + "loss": 0.7287, + "step": 5920 + }, + { + "epoch": 1.28, + "learning_rate": 0.00017311119159724733, + "loss": 0.7244, + "step": 5940 + }, + { + "epoch": 1.29, + "learning_rate": 0.000172676566461427, + "loss": 0.7247, + "step": 5960 + }, + { + "epoch": 1.29, + "learning_rate": 0.00017224194132560663, + "loss": 0.7191, + "step": 5980 + }, + { + "epoch": 1.29, + "learning_rate": 0.00017180731618978627, + "loss": 0.7208, + "step": 6000 + }, + { + "epoch": 1.29, + "eval_loss": 0.7340711951255798, + "eval_runtime": 25.4669, + "eval_samples_per_second": 78.533, + "eval_steps_per_second": 1.257, + "step": 6000 + }, + { + "epoch": 1.3, + "learning_rate": 0.00017137269105396593, + "loss": 0.7285, + "step": 6020 + }, + { + "epoch": 1.3, + "learning_rate": 0.00017093806591814557, + "loss": 0.7294, + "step": 6040 + }, + { + "epoch": 1.31, + "learning_rate": 0.0001705034407823252, + "loss": 0.7365, + "step": 6060 + }, + { + "epoch": 1.31, + "learning_rate": 0.00017006881564650487, + "loss": 0.7149, + "step": 6080 + }, + { + "epoch": 1.32, + "learning_rate": 0.0001696341905106845, + "loss": 0.7229, + "step": 6100 + }, + { + "epoch": 1.32, + "learning_rate": 0.00016919956537486414, + "loss": 0.7253, + "step": 6120 + }, + { + "epoch": 1.32, + "learning_rate": 0.0001687649402390438, + "loss": 0.7188, + "step": 6140 + }, + { + "epoch": 1.33, + "learning_rate": 0.00016833031510322344, + "loss": 0.7308, + "step": 6160 + }, + { + "epoch": 1.33, + "learning_rate": 0.00016789568996740308, + "loss": 0.7186, + "step": 6180 + }, + { + "epoch": 1.34, + "learning_rate": 0.00016746106483158274, + "loss": 0.7121, + "step": 6200 + }, + { + "epoch": 1.34, + "eval_loss": 0.7324739694595337, + "eval_runtime": 25.5, + "eval_samples_per_second": 78.431, + "eval_steps_per_second": 1.255, + "step": 6200 + }, + { + "epoch": 1.34, + "learning_rate": 0.00016702643969576238, + "loss": 0.7286, + "step": 6220 + }, + { + "epoch": 1.35, + "learning_rate": 0.00016659181455994202, + "loss": 0.7246, + "step": 6240 + }, + { + "epoch": 1.35, + "learning_rate": 0.00016615718942412168, + "loss": 0.7234, + "step": 6260 + }, + { + "epoch": 1.35, + "learning_rate": 0.00016572256428830132, + "loss": 0.7245, + "step": 6280 + }, + { + "epoch": 1.36, + "learning_rate": 0.00016528793915248095, + "loss": 0.7252, + "step": 6300 + }, + { + "epoch": 1.36, + "learning_rate": 0.00016485331401666062, + "loss": 0.7259, + "step": 6320 + }, + { + "epoch": 1.37, + "learning_rate": 0.00016441868888084025, + "loss": 0.7173, + "step": 6340 + }, + { + "epoch": 1.37, + "learning_rate": 0.0001639840637450199, + "loss": 0.7222, + "step": 6360 + }, + { + "epoch": 1.38, + "learning_rate": 0.00016354943860919955, + "loss": 0.7113, + "step": 6380 + }, + { + "epoch": 1.38, + "learning_rate": 0.0001631148134733792, + "loss": 0.72, + "step": 6400 + }, + { + "epoch": 1.38, + "eval_loss": 0.7319995164871216, + "eval_runtime": 25.5112, + "eval_samples_per_second": 78.397, + "eval_steps_per_second": 1.254, + "step": 6400 + }, + { + "epoch": 1.39, + "learning_rate": 0.00016268018833755883, + "loss": 0.7333, + "step": 6420 + }, + { + "epoch": 1.39, + "learning_rate": 0.0001622455632017385, + "loss": 0.7208, + "step": 6440 + }, + { + "epoch": 1.39, + "learning_rate": 0.00016181093806591813, + "loss": 0.7161, + "step": 6460 + }, + { + "epoch": 1.4, + "learning_rate": 0.00016137631293009776, + "loss": 0.7171, + "step": 6480 + }, + { + "epoch": 1.4, + "learning_rate": 0.00016094168779427743, + "loss": 0.7297, + "step": 6500 + }, + { + "epoch": 1.41, + "learning_rate": 0.00016050706265845706, + "loss": 0.7156, + "step": 6520 + }, + { + "epoch": 1.41, + "learning_rate": 0.0001600724375226367, + "loss": 0.7175, + "step": 6540 + }, + { + "epoch": 1.42, + "learning_rate": 0.00015963781238681636, + "loss": 0.7152, + "step": 6560 + }, + { + "epoch": 1.42, + "learning_rate": 0.000159203187250996, + "loss": 0.7282, + "step": 6580 + }, + { + "epoch": 1.42, + "learning_rate": 0.00015876856211517564, + "loss": 0.722, + "step": 6600 + }, + { + "epoch": 1.42, + "eval_loss": 0.7307416796684265, + "eval_runtime": 25.4967, + "eval_samples_per_second": 78.442, + "eval_steps_per_second": 1.255, + "step": 6600 + }, + { + "epoch": 1.43, + "learning_rate": 0.0001583339369793553, + "loss": 0.7274, + "step": 6620 + }, + { + "epoch": 1.43, + "learning_rate": 0.00015789931184353494, + "loss": 0.7313, + "step": 6640 + }, + { + "epoch": 1.44, + "learning_rate": 0.00015746468670771457, + "loss": 0.7209, + "step": 6660 + }, + { + "epoch": 1.44, + "learning_rate": 0.00015703006157189424, + "loss": 0.7202, + "step": 6680 + }, + { + "epoch": 1.45, + "learning_rate": 0.00015659543643607387, + "loss": 0.7264, + "step": 6700 + }, + { + "epoch": 1.45, + "learning_rate": 0.0001561608113002535, + "loss": 0.7226, + "step": 6720 + }, + { + "epoch": 1.45, + "learning_rate": 0.00015572618616443317, + "loss": 0.711, + "step": 6740 + }, + { + "epoch": 1.46, + "learning_rate": 0.0001552915610286128, + "loss": 0.7216, + "step": 6760 + }, + { + "epoch": 1.46, + "learning_rate": 0.00015485693589279245, + "loss": 0.7184, + "step": 6780 + }, + { + "epoch": 1.47, + "learning_rate": 0.0001544223107569721, + "loss": 0.7216, + "step": 6800 + }, + { + "epoch": 1.47, + "eval_loss": 0.7297094464302063, + "eval_runtime": 25.4826, + "eval_samples_per_second": 78.485, + "eval_steps_per_second": 1.256, + "step": 6800 + }, + { + "epoch": 1.47, + "learning_rate": 0.00015398768562115175, + "loss": 0.7203, + "step": 6820 + }, + { + "epoch": 1.48, + "learning_rate": 0.00015355306048533138, + "loss": 0.7184, + "step": 6840 + }, + { + "epoch": 1.48, + "learning_rate": 0.00015311843534951105, + "loss": 0.7183, + "step": 6860 + }, + { + "epoch": 1.48, + "learning_rate": 0.00015268381021369068, + "loss": 0.7267, + "step": 6880 + }, + { + "epoch": 1.49, + "learning_rate": 0.00015224918507787032, + "loss": 0.7299, + "step": 6900 + }, + { + "epoch": 1.49, + "learning_rate": 0.00015181455994204998, + "loss": 0.719, + "step": 6920 + }, + { + "epoch": 1.5, + "learning_rate": 0.00015137993480622962, + "loss": 0.7229, + "step": 6940 + }, + { + "epoch": 1.5, + "learning_rate": 0.00015094530967040926, + "loss": 0.7231, + "step": 6960 + }, + { + "epoch": 1.51, + "learning_rate": 0.00015051068453458892, + "loss": 0.7279, + "step": 6980 + }, + { + "epoch": 1.51, + "learning_rate": 0.00015007605939876856, + "loss": 0.7252, + "step": 7000 + }, + { + "epoch": 1.51, + "eval_loss": 0.7288112640380859, + "eval_runtime": 25.4887, + "eval_samples_per_second": 78.466, + "eval_steps_per_second": 1.255, + "step": 7000 + }, + { + "epoch": 1.51, + "learning_rate": 0.0001496414342629482, + "loss": 0.7148, + "step": 7020 + }, + { + "epoch": 1.52, + "learning_rate": 0.00014920680912712786, + "loss": 0.7147, + "step": 7040 + }, + { + "epoch": 1.52, + "learning_rate": 0.0001487721839913075, + "loss": 0.7209, + "step": 7060 + }, + { + "epoch": 1.53, + "learning_rate": 0.00014833755885548713, + "loss": 0.724, + "step": 7080 + }, + { + "epoch": 1.53, + "learning_rate": 0.00014790293371966676, + "loss": 0.7256, + "step": 7100 + }, + { + "epoch": 1.54, + "learning_rate": 0.0001474683085838464, + "loss": 0.7246, + "step": 7120 + }, + { + "epoch": 1.54, + "learning_rate": 0.00014703368344802606, + "loss": 0.7103, + "step": 7140 + }, + { + "epoch": 1.54, + "learning_rate": 0.0001465990583122057, + "loss": 0.7223, + "step": 7160 + }, + { + "epoch": 1.55, + "learning_rate": 0.00014616443317638534, + "loss": 0.7149, + "step": 7180 + }, + { + "epoch": 1.55, + "learning_rate": 0.000145729808040565, + "loss": 0.7214, + "step": 7200 + }, + { + "epoch": 1.55, + "eval_loss": 0.7280930876731873, + "eval_runtime": 25.4883, + "eval_samples_per_second": 78.467, + "eval_steps_per_second": 1.255, + "step": 7200 + }, + { + "epoch": 1.56, + "learning_rate": 0.00014529518290474464, + "loss": 0.7118, + "step": 7220 + }, + { + "epoch": 1.56, + "learning_rate": 0.00014486055776892427, + "loss": 0.7171, + "step": 7240 + }, + { + "epoch": 1.57, + "learning_rate": 0.00014442593263310394, + "loss": 0.7191, + "step": 7260 + }, + { + "epoch": 1.57, + "learning_rate": 0.00014399130749728357, + "loss": 0.7155, + "step": 7280 + }, + { + "epoch": 1.57, + "learning_rate": 0.0001435566823614632, + "loss": 0.7198, + "step": 7300 + }, + { + "epoch": 1.58, + "learning_rate": 0.00014312205722564287, + "loss": 0.7188, + "step": 7320 + }, + { + "epoch": 1.58, + "learning_rate": 0.0001426874320898225, + "loss": 0.7236, + "step": 7340 + }, + { + "epoch": 1.59, + "learning_rate": 0.00014225280695400215, + "loss": 0.712, + "step": 7360 + }, + { + "epoch": 1.59, + "learning_rate": 0.0001418181818181818, + "loss": 0.7181, + "step": 7380 + }, + { + "epoch": 1.6, + "learning_rate": 0.00014138355668236145, + "loss": 0.7198, + "step": 7400 + }, + { + "epoch": 1.6, + "eval_loss": 0.7276077270507812, + "eval_runtime": 25.4843, + "eval_samples_per_second": 78.48, + "eval_steps_per_second": 1.256, + "step": 7400 + }, + { + "epoch": 1.6, + "learning_rate": 0.00014094893154654108, + "loss": 0.7187, + "step": 7420 + }, + { + "epoch": 1.61, + "learning_rate": 0.00014051430641072075, + "loss": 0.7153, + "step": 7440 + }, + { + "epoch": 1.61, + "learning_rate": 0.00014007968127490038, + "loss": 0.7208, + "step": 7460 + }, + { + "epoch": 1.61, + "learning_rate": 0.00013964505613908002, + "loss": 0.7153, + "step": 7480 + }, + { + "epoch": 1.62, + "learning_rate": 0.00013921043100325968, + "loss": 0.7207, + "step": 7500 + }, + { + "epoch": 1.62, + "learning_rate": 0.00013877580586743932, + "loss": 0.7167, + "step": 7520 + }, + { + "epoch": 1.63, + "learning_rate": 0.00013834118073161896, + "loss": 0.7183, + "step": 7540 + }, + { + "epoch": 1.63, + "learning_rate": 0.00013792828685258964, + "loss": 0.7196, + "step": 7560 + }, + { + "epoch": 1.64, + "learning_rate": 0.00013749366171676928, + "loss": 0.7233, + "step": 7580 + }, + { + "epoch": 1.64, + "learning_rate": 0.00013705903658094894, + "loss": 0.7237, + "step": 7600 + }, + { + "epoch": 1.64, + "eval_loss": 0.7260885238647461, + "eval_runtime": 25.503, + "eval_samples_per_second": 78.422, + "eval_steps_per_second": 1.255, + "step": 7600 + }, + { + "epoch": 1.64, + "learning_rate": 0.00013662441144512855, + "loss": 0.72, + "step": 7620 + }, + { + "epoch": 1.65, + "learning_rate": 0.0001361897863093082, + "loss": 0.7094, + "step": 7640 + }, + { + "epoch": 1.65, + "learning_rate": 0.00013575516117348785, + "loss": 0.7111, + "step": 7660 + }, + { + "epoch": 1.66, + "learning_rate": 0.00013532053603766749, + "loss": 0.7182, + "step": 7680 + }, + { + "epoch": 1.66, + "learning_rate": 0.00013488591090184715, + "loss": 0.7182, + "step": 7700 + }, + { + "epoch": 1.67, + "learning_rate": 0.00013445128576602679, + "loss": 0.7183, + "step": 7720 + }, + { + "epoch": 1.67, + "learning_rate": 0.00013401666063020642, + "loss": 0.7112, + "step": 7740 + }, + { + "epoch": 1.67, + "learning_rate": 0.00013358203549438609, + "loss": 0.7183, + "step": 7760 + }, + { + "epoch": 1.68, + "learning_rate": 0.00013314741035856572, + "loss": 0.7152, + "step": 7780 + }, + { + "epoch": 1.68, + "learning_rate": 0.00013271278522274536, + "loss": 0.7233, + "step": 7800 + }, + { + "epoch": 1.68, + "eval_loss": 0.7252987027168274, + "eval_runtime": 25.5066, + "eval_samples_per_second": 78.411, + "eval_steps_per_second": 1.255, + "step": 7800 + }, + { + "epoch": 1.69, + "learning_rate": 0.00013227816008692502, + "loss": 0.7124, + "step": 7820 + }, + { + "epoch": 1.69, + "learning_rate": 0.00013184353495110466, + "loss": 0.7109, + "step": 7840 + }, + { + "epoch": 1.7, + "learning_rate": 0.0001314089098152843, + "loss": 0.7132, + "step": 7860 + }, + { + "epoch": 1.7, + "learning_rate": 0.00013097428467946396, + "loss": 0.7157, + "step": 7880 + }, + { + "epoch": 1.7, + "learning_rate": 0.0001305396595436436, + "loss": 0.7237, + "step": 7900 + }, + { + "epoch": 1.71, + "learning_rate": 0.00013010503440782323, + "loss": 0.7176, + "step": 7920 + }, + { + "epoch": 1.71, + "learning_rate": 0.0001296704092720029, + "loss": 0.7199, + "step": 7940 + }, + { + "epoch": 1.72, + "learning_rate": 0.00012923578413618253, + "loss": 0.7119, + "step": 7960 + }, + { + "epoch": 1.72, + "learning_rate": 0.00012880115900036217, + "loss": 0.717, + "step": 7980 + }, + { + "epoch": 1.73, + "learning_rate": 0.00012836653386454183, + "loss": 0.7155, + "step": 8000 + }, + { + "epoch": 1.73, + "eval_loss": 0.7248360514640808, + "eval_runtime": 25.5301, + "eval_samples_per_second": 78.339, + "eval_steps_per_second": 1.253, + "step": 8000 + }, + { + "epoch": 1.73, + "learning_rate": 0.00012793190872872147, + "loss": 0.7085, + "step": 8020 + }, + { + "epoch": 1.73, + "learning_rate": 0.0001274972835929011, + "loss": 0.7174, + "step": 8040 + }, + { + "epoch": 1.74, + "learning_rate": 0.00012706265845708077, + "loss": 0.7224, + "step": 8060 + }, + { + "epoch": 1.74, + "learning_rate": 0.0001266280333212604, + "loss": 0.7169, + "step": 8080 + }, + { + "epoch": 1.75, + "learning_rate": 0.00012619340818544004, + "loss": 0.7191, + "step": 8100 + }, + { + "epoch": 1.75, + "learning_rate": 0.0001257587830496197, + "loss": 0.7179, + "step": 8120 + }, + { + "epoch": 1.76, + "learning_rate": 0.00012532415791379934, + "loss": 0.7208, + "step": 8140 + }, + { + "epoch": 1.76, + "learning_rate": 0.00012488953277797898, + "loss": 0.7168, + "step": 8160 + }, + { + "epoch": 1.76, + "learning_rate": 0.00012445490764215864, + "loss": 0.7101, + "step": 8180 + }, + { + "epoch": 1.77, + "learning_rate": 0.00012402028250633828, + "loss": 0.7167, + "step": 8200 + }, + { + "epoch": 1.77, + "eval_loss": 0.7242170572280884, + "eval_runtime": 25.4873, + "eval_samples_per_second": 78.47, + "eval_steps_per_second": 1.256, + "step": 8200 + }, + { + "epoch": 1.77, + "learning_rate": 0.00012358565737051791, + "loss": 0.7062, + "step": 8220 + }, + { + "epoch": 1.78, + "learning_rate": 0.00012315103223469758, + "loss": 0.7177, + "step": 8240 + }, + { + "epoch": 1.78, + "learning_rate": 0.00012271640709887721, + "loss": 0.7035, + "step": 8260 + }, + { + "epoch": 1.79, + "learning_rate": 0.00012228178196305685, + "loss": 0.7157, + "step": 8280 + }, + { + "epoch": 1.79, + "learning_rate": 0.0001218471568272365, + "loss": 0.7196, + "step": 8300 + }, + { + "epoch": 1.8, + "learning_rate": 0.00012141253169141615, + "loss": 0.7105, + "step": 8320 + }, + { + "epoch": 1.8, + "learning_rate": 0.00012097790655559579, + "loss": 0.7105, + "step": 8340 + }, + { + "epoch": 1.8, + "learning_rate": 0.00012054328141977544, + "loss": 0.7139, + "step": 8360 + }, + { + "epoch": 1.81, + "learning_rate": 0.00012010865628395509, + "loss": 0.7215, + "step": 8380 + }, + { + "epoch": 1.81, + "learning_rate": 0.00011967403114813472, + "loss": 0.725, + "step": 8400 + }, + { + "epoch": 1.81, + "eval_loss": 0.7237139344215393, + "eval_runtime": 25.506, + "eval_samples_per_second": 78.413, + "eval_steps_per_second": 1.255, + "step": 8400 + }, + { + "epoch": 1.82, + "learning_rate": 0.00011923940601231437, + "loss": 0.7107, + "step": 8420 + }, + { + "epoch": 1.82, + "learning_rate": 0.00011880478087649402, + "loss": 0.7095, + "step": 8440 + }, + { + "epoch": 1.83, + "learning_rate": 0.00011837015574067366, + "loss": 0.7061, + "step": 8460 + }, + { + "epoch": 1.83, + "learning_rate": 0.0001179355306048533, + "loss": 0.716, + "step": 8480 + }, + { + "epoch": 1.83, + "learning_rate": 0.00011750090546903295, + "loss": 0.7203, + "step": 8500 + }, + { + "epoch": 1.84, + "learning_rate": 0.00011706628033321258, + "loss": 0.7098, + "step": 8520 + }, + { + "epoch": 1.84, + "learning_rate": 0.00011663165519739223, + "loss": 0.7104, + "step": 8540 + }, + { + "epoch": 1.85, + "learning_rate": 0.00011619703006157188, + "loss": 0.7051, + "step": 8560 + }, + { + "epoch": 1.85, + "learning_rate": 0.00011576240492575152, + "loss": 0.7198, + "step": 8580 + }, + { + "epoch": 1.86, + "learning_rate": 0.00011532777978993117, + "loss": 0.7175, + "step": 8600 + }, + { + "epoch": 1.86, + "eval_loss": 0.7230754494667053, + "eval_runtime": 25.5133, + "eval_samples_per_second": 78.39, + "eval_steps_per_second": 1.254, + "step": 8600 + }, + { + "epoch": 1.86, + "learning_rate": 0.00011489315465411082, + "loss": 0.7046, + "step": 8620 + }, + { + "epoch": 1.86, + "learning_rate": 0.00011445852951829046, + "loss": 0.7176, + "step": 8640 + }, + { + "epoch": 1.87, + "learning_rate": 0.0001140239043824701, + "loss": 0.7193, + "step": 8660 + }, + { + "epoch": 1.87, + "learning_rate": 0.00011358927924664976, + "loss": 0.7046, + "step": 8680 + }, + { + "epoch": 1.88, + "learning_rate": 0.00011315465411082939, + "loss": 0.7116, + "step": 8700 + }, + { + "epoch": 1.88, + "learning_rate": 0.00011274176023180006, + "loss": 0.7152, + "step": 8720 + }, + { + "epoch": 1.89, + "learning_rate": 0.00011230713509597971, + "loss": 0.7164, + "step": 8740 + }, + { + "epoch": 1.89, + "learning_rate": 0.00011187250996015936, + "loss": 0.7192, + "step": 8760 + }, + { + "epoch": 1.89, + "learning_rate": 0.000111437884824339, + "loss": 0.7124, + "step": 8780 + }, + { + "epoch": 1.9, + "learning_rate": 0.00011100325968851865, + "loss": 0.7032, + "step": 8800 + }, + { + "epoch": 1.9, + "eval_loss": 0.7217770218849182, + "eval_runtime": 25.4723, + "eval_samples_per_second": 78.517, + "eval_steps_per_second": 1.256, + "step": 8800 + }, + { + "epoch": 1.9, + "learning_rate": 0.0001105686345526983, + "loss": 0.7157, + "step": 8820 + }, + { + "epoch": 1.91, + "learning_rate": 0.00011013400941687794, + "loss": 0.7115, + "step": 8840 + }, + { + "epoch": 1.91, + "learning_rate": 0.00010969938428105759, + "loss": 0.7137, + "step": 8860 + }, + { + "epoch": 1.92, + "learning_rate": 0.00010926475914523724, + "loss": 0.7176, + "step": 8880 + }, + { + "epoch": 1.92, + "learning_rate": 0.00010883013400941687, + "loss": 0.7081, + "step": 8900 + }, + { + "epoch": 1.92, + "learning_rate": 0.00010839550887359652, + "loss": 0.7233, + "step": 8920 + }, + { + "epoch": 1.93, + "learning_rate": 0.00010796088373777617, + "loss": 0.7058, + "step": 8940 + }, + { + "epoch": 1.93, + "learning_rate": 0.00010752625860195581, + "loss": 0.7154, + "step": 8960 + }, + { + "epoch": 1.94, + "learning_rate": 0.00010709163346613546, + "loss": 0.7135, + "step": 8980 + }, + { + "epoch": 1.94, + "learning_rate": 0.00010665700833031508, + "loss": 0.7078, + "step": 9000 + }, + { + "epoch": 1.94, + "eval_loss": 0.7215875387191772, + "eval_runtime": 25.484, + "eval_samples_per_second": 78.481, + "eval_steps_per_second": 1.256, + "step": 9000 + }, + { + "epoch": 1.95, + "learning_rate": 0.00010622238319449473, + "loss": 0.7061, + "step": 9020 + }, + { + "epoch": 1.95, + "learning_rate": 0.00010578775805867438, + "loss": 0.7174, + "step": 9040 + }, + { + "epoch": 1.95, + "learning_rate": 0.00010535313292285402, + "loss": 0.7132, + "step": 9060 + }, + { + "epoch": 1.96, + "learning_rate": 0.00010491850778703367, + "loss": 0.7247, + "step": 9080 + }, + { + "epoch": 1.96, + "learning_rate": 0.00010448388265121332, + "loss": 0.7064, + "step": 9100 + }, + { + "epoch": 1.97, + "learning_rate": 0.00010404925751539295, + "loss": 0.7098, + "step": 9120 + }, + { + "epoch": 1.97, + "learning_rate": 0.0001036146323795726, + "loss": 0.708, + "step": 9140 + }, + { + "epoch": 1.98, + "learning_rate": 0.00010318000724375225, + "loss": 0.7144, + "step": 9160 + }, + { + "epoch": 1.98, + "learning_rate": 0.00010274538210793189, + "loss": 0.7151, + "step": 9180 + }, + { + "epoch": 1.98, + "learning_rate": 0.00010231075697211154, + "loss": 0.718, + "step": 9200 + }, + { + "epoch": 1.98, + "eval_loss": 0.7208251357078552, + "eval_runtime": 25.5022, + "eval_samples_per_second": 78.425, + "eval_steps_per_second": 1.255, + "step": 9200 + }, + { + "epoch": 1.99, + "learning_rate": 0.00010187613183629119, + "loss": 0.7108, + "step": 9220 + }, + { + "epoch": 1.99, + "learning_rate": 0.00010144150670047083, + "loss": 0.6952, + "step": 9240 + }, + { + "epoch": 2.0, + "learning_rate": 0.00010100688156465048, + "loss": 0.7013, + "step": 9260 + }, + { + "epoch": 2.0, + "learning_rate": 0.00010057225642883013, + "loss": 0.7013, + "step": 9280 + }, + { + "epoch": 2.01, + "learning_rate": 0.00010013763129300976, + "loss": 0.7049, + "step": 9300 + }, + { + "epoch": 2.01, + "learning_rate": 9.970300615718941e-05, + "loss": 0.7093, + "step": 9320 + }, + { + "epoch": 2.02, + "learning_rate": 9.926838102136906e-05, + "loss": 0.713, + "step": 9340 + }, + { + "epoch": 2.02, + "learning_rate": 9.88337558855487e-05, + "loss": 0.7108, + "step": 9360 + }, + { + "epoch": 2.02, + "learning_rate": 9.839913074972835e-05, + "loss": 0.7115, + "step": 9380 + }, + { + "epoch": 2.03, + "learning_rate": 9.7964505613908e-05, + "loss": 0.7119, + "step": 9400 + }, + { + "epoch": 2.03, + "eval_loss": 0.7202969789505005, + "eval_runtime": 25.504, + "eval_samples_per_second": 78.419, + "eval_steps_per_second": 1.255, + "step": 9400 + }, + { + "epoch": 2.03, + "learning_rate": 9.752988047808764e-05, + "loss": 0.7107, + "step": 9420 + }, + { + "epoch": 2.04, + "learning_rate": 9.709525534226729e-05, + "loss": 0.7065, + "step": 9440 + }, + { + "epoch": 2.04, + "learning_rate": 9.666063020644694e-05, + "loss": 0.7121, + "step": 9460 + }, + { + "epoch": 2.05, + "learning_rate": 9.622600507062657e-05, + "loss": 0.7163, + "step": 9480 + }, + { + "epoch": 2.05, + "learning_rate": 9.579137993480622e-05, + "loss": 0.7026, + "step": 9500 + }, + { + "epoch": 2.05, + "learning_rate": 9.535675479898587e-05, + "loss": 0.7158, + "step": 9520 + }, + { + "epoch": 2.06, + "learning_rate": 9.492212966316551e-05, + "loss": 0.7016, + "step": 9540 + }, + { + "epoch": 2.06, + "learning_rate": 9.448750452734516e-05, + "loss": 0.7149, + "step": 9560 + }, + { + "epoch": 2.07, + "learning_rate": 9.405287939152481e-05, + "loss": 0.7079, + "step": 9580 + }, + { + "epoch": 2.07, + "learning_rate": 9.361825425570445e-05, + "loss": 0.709, + "step": 9600 + }, + { + "epoch": 2.07, + "eval_loss": 0.7194134593009949, + "eval_runtime": 25.5286, + "eval_samples_per_second": 78.343, + "eval_steps_per_second": 1.253, + "step": 9600 + }, + { + "epoch": 2.08, + "learning_rate": 9.31836291198841e-05, + "loss": 0.7127, + "step": 9620 + }, + { + "epoch": 2.08, + "learning_rate": 9.274900398406375e-05, + "loss": 0.7037, + "step": 9640 + }, + { + "epoch": 2.08, + "learning_rate": 9.231437884824338e-05, + "loss": 0.7114, + "step": 9660 + }, + { + "epoch": 2.09, + "learning_rate": 9.187975371242303e-05, + "loss": 0.706, + "step": 9680 + }, + { + "epoch": 2.09, + "learning_rate": 9.144512857660268e-05, + "loss": 0.7026, + "step": 9700 + }, + { + "epoch": 2.1, + "learning_rate": 9.101050344078232e-05, + "loss": 0.7079, + "step": 9720 + }, + { + "epoch": 2.1, + "learning_rate": 9.057587830496197e-05, + "loss": 0.7053, + "step": 9740 + }, + { + "epoch": 2.11, + "learning_rate": 9.014125316914162e-05, + "loss": 0.7125, + "step": 9760 + }, + { + "epoch": 2.11, + "learning_rate": 8.970662803332126e-05, + "loss": 0.7045, + "step": 9780 + }, + { + "epoch": 2.11, + "learning_rate": 8.92720028975009e-05, + "loss": 0.7109, + "step": 9800 + }, + { + "epoch": 2.11, + "eval_loss": 0.7186465859413147, + "eval_runtime": 25.5049, + "eval_samples_per_second": 78.416, + "eval_steps_per_second": 1.255, + "step": 9800 + }, + { + "epoch": 2.12, + "learning_rate": 8.883737776168056e-05, + "loss": 0.7035, + "step": 9820 + }, + { + "epoch": 2.12, + "learning_rate": 8.840275262586019e-05, + "loss": 0.7073, + "step": 9840 + }, + { + "epoch": 2.13, + "learning_rate": 8.796812749003983e-05, + "loss": 0.7114, + "step": 9860 + }, + { + "epoch": 2.13, + "learning_rate": 8.753350235421946e-05, + "loss": 0.7066, + "step": 9880 + }, + { + "epoch": 2.14, + "learning_rate": 8.709887721839911e-05, + "loss": 0.7055, + "step": 9900 + }, + { + "epoch": 2.14, + "learning_rate": 8.666425208257877e-05, + "loss": 0.7064, + "step": 9920 + }, + { + "epoch": 2.14, + "learning_rate": 8.62296269467584e-05, + "loss": 0.7154, + "step": 9940 + }, + { + "epoch": 2.15, + "learning_rate": 8.579500181093805e-05, + "loss": 0.7099, + "step": 9960 + }, + { + "epoch": 2.15, + "learning_rate": 8.53603766751177e-05, + "loss": 0.7112, + "step": 9980 + }, + { + "epoch": 2.16, + "learning_rate": 8.492575153929734e-05, + "loss": 0.7086, + "step": 10000 + }, + { + "epoch": 2.16, + "eval_loss": 0.7181739211082458, + "eval_runtime": 25.5087, + "eval_samples_per_second": 78.405, + "eval_steps_per_second": 1.254, + "step": 10000 + }, + { + "epoch": 2.16, + "learning_rate": 8.449112640347699e-05, + "loss": 0.7155, + "step": 10020 + }, + { + "epoch": 2.17, + "learning_rate": 8.405650126765664e-05, + "loss": 0.7097, + "step": 10040 + }, + { + "epoch": 2.17, + "learning_rate": 8.362187613183627e-05, + "loss": 0.7025, + "step": 10060 + }, + { + "epoch": 2.17, + "learning_rate": 8.318725099601592e-05, + "loss": 0.7065, + "step": 10080 + }, + { + "epoch": 2.18, + "learning_rate": 8.275262586019557e-05, + "loss": 0.6982, + "step": 10100 + }, + { + "epoch": 2.18, + "learning_rate": 8.231800072437521e-05, + "loss": 0.7039, + "step": 10120 + }, + { + "epoch": 2.19, + "learning_rate": 8.188337558855486e-05, + "loss": 0.7097, + "step": 10140 + }, + { + "epoch": 2.19, + "learning_rate": 8.144875045273451e-05, + "loss": 0.7089, + "step": 10160 + }, + { + "epoch": 2.2, + "learning_rate": 8.101412531691415e-05, + "loss": 0.7018, + "step": 10180 + }, + { + "epoch": 2.2, + "learning_rate": 8.05795001810938e-05, + "loss": 0.7025, + "step": 10200 + }, + { + "epoch": 2.2, + "eval_loss": 0.7179592251777649, + "eval_runtime": 25.4993, + "eval_samples_per_second": 78.433, + "eval_steps_per_second": 1.255, + "step": 10200 + }, + { + "epoch": 2.2, + "learning_rate": 8.014487504527345e-05, + "loss": 0.7067, + "step": 10220 + }, + { + "epoch": 2.21, + "learning_rate": 7.971024990945308e-05, + "loss": 0.71, + "step": 10240 + }, + { + "epoch": 2.21, + "learning_rate": 7.927562477363273e-05, + "loss": 0.7255, + "step": 10260 + }, + { + "epoch": 2.22, + "learning_rate": 7.884099963781238e-05, + "loss": 0.7065, + "step": 10280 + }, + { + "epoch": 2.22, + "learning_rate": 7.840637450199202e-05, + "loss": 0.712, + "step": 10300 + }, + { + "epoch": 2.23, + "learning_rate": 7.797174936617167e-05, + "loss": 0.7132, + "step": 10320 + }, + { + "epoch": 2.23, + "learning_rate": 7.753712423035132e-05, + "loss": 0.7106, + "step": 10340 + }, + { + "epoch": 2.24, + "learning_rate": 7.710249909453096e-05, + "loss": 0.708, + "step": 10360 + }, + { + "epoch": 2.24, + "learning_rate": 7.666787395871061e-05, + "loss": 0.7054, + "step": 10380 + }, + { + "epoch": 2.24, + "learning_rate": 7.623324882289026e-05, + "loss": 0.7087, + "step": 10400 + }, + { + "epoch": 2.24, + "eval_loss": 0.717901349067688, + "eval_runtime": 25.4862, + "eval_samples_per_second": 78.474, + "eval_steps_per_second": 1.256, + "step": 10400 + }, + { + "epoch": 2.25, + "learning_rate": 7.57986236870699e-05, + "loss": 0.7014, + "step": 10420 + }, + { + "epoch": 2.25, + "learning_rate": 7.536399855124954e-05, + "loss": 0.7103, + "step": 10440 + }, + { + "epoch": 2.26, + "learning_rate": 7.49293734154292e-05, + "loss": 0.7089, + "step": 10460 + }, + { + "epoch": 2.26, + "learning_rate": 7.449474827960883e-05, + "loss": 0.704, + "step": 10480 + }, + { + "epoch": 2.27, + "learning_rate": 7.406012314378847e-05, + "loss": 0.7074, + "step": 10500 + }, + { + "epoch": 2.27, + "learning_rate": 7.362549800796812e-05, + "loss": 0.7094, + "step": 10520 + }, + { + "epoch": 2.27, + "learning_rate": 7.319087287214777e-05, + "loss": 0.7069, + "step": 10540 + }, + { + "epoch": 2.28, + "learning_rate": 7.27562477363274e-05, + "loss": 0.7081, + "step": 10560 + }, + { + "epoch": 2.28, + "learning_rate": 7.232162260050705e-05, + "loss": 0.7036, + "step": 10580 + }, + { + "epoch": 2.29, + "learning_rate": 7.18869974646867e-05, + "loss": 0.6984, + "step": 10600 + }, + { + "epoch": 2.29, + "eval_loss": 0.7175166010856628, + "eval_runtime": 25.5016, + "eval_samples_per_second": 78.426, + "eval_steps_per_second": 1.255, + "step": 10600 + }, + { + "epoch": 2.29, + "learning_rate": 7.145237232886634e-05, + "loss": 0.7097, + "step": 10620 + }, + { + "epoch": 2.3, + "learning_rate": 7.101774719304599e-05, + "loss": 0.7143, + "step": 10640 + }, + { + "epoch": 2.3, + "learning_rate": 7.058312205722564e-05, + "loss": 0.7099, + "step": 10660 + }, + { + "epoch": 2.3, + "learning_rate": 7.014849692140528e-05, + "loss": 0.6994, + "step": 10680 + }, + { + "epoch": 2.31, + "learning_rate": 6.971387178558493e-05, + "loss": 0.7129, + "step": 10700 + }, + { + "epoch": 2.31, + "learning_rate": 6.927924664976458e-05, + "loss": 0.7067, + "step": 10720 + }, + { + "epoch": 2.32, + "learning_rate": 6.884462151394421e-05, + "loss": 0.7044, + "step": 10740 + }, + { + "epoch": 2.32, + "learning_rate": 6.840999637812386e-05, + "loss": 0.7092, + "step": 10760 + }, + { + "epoch": 2.33, + "learning_rate": 6.797537124230351e-05, + "loss": 0.7075, + "step": 10780 + }, + { + "epoch": 2.33, + "learning_rate": 6.754074610648315e-05, + "loss": 0.7073, + "step": 10800 + }, + { + "epoch": 2.33, + "eval_loss": 0.7168901562690735, + "eval_runtime": 25.5153, + "eval_samples_per_second": 78.384, + "eval_steps_per_second": 1.254, + "step": 10800 + }, + { + "epoch": 2.33, + "learning_rate": 6.71061209706628e-05, + "loss": 0.7088, + "step": 10820 + }, + { + "epoch": 2.34, + "learning_rate": 6.667149583484245e-05, + "loss": 0.7046, + "step": 10840 + }, + { + "epoch": 2.34, + "learning_rate": 6.623687069902209e-05, + "loss": 0.7029, + "step": 10860 + }, + { + "epoch": 2.35, + "learning_rate": 6.580224556320174e-05, + "loss": 0.7055, + "step": 10880 + }, + { + "epoch": 2.35, + "learning_rate": 6.536762042738139e-05, + "loss": 0.7095, + "step": 10900 + }, + { + "epoch": 2.36, + "learning_rate": 6.493299529156102e-05, + "loss": 0.7057, + "step": 10920 + }, + { + "epoch": 2.36, + "learning_rate": 6.449837015574066e-05, + "loss": 0.7064, + "step": 10940 + }, + { + "epoch": 2.36, + "learning_rate": 6.406374501992031e-05, + "loss": 0.7039, + "step": 10960 + }, + { + "epoch": 2.37, + "learning_rate": 6.362911988409996e-05, + "loss": 0.7109, + "step": 10980 + }, + { + "epoch": 2.37, + "learning_rate": 6.31944947482796e-05, + "loss": 0.7051, + "step": 11000 + }, + { + "epoch": 2.37, + "eval_loss": 0.7164381146430969, + "eval_runtime": 25.4817, + "eval_samples_per_second": 78.488, + "eval_steps_per_second": 1.256, + "step": 11000 + }, + { + "epoch": 2.38, + "learning_rate": 6.275986961245924e-05, + "loss": 0.7117, + "step": 11020 + }, + { + "epoch": 2.38, + "learning_rate": 6.23252444766389e-05, + "loss": 0.6972, + "step": 11040 + }, + { + "epoch": 2.39, + "learning_rate": 6.189061934081853e-05, + "loss": 0.7087, + "step": 11060 + }, + { + "epoch": 2.39, + "learning_rate": 6.145599420499818e-05, + "loss": 0.703, + "step": 11080 + }, + { + "epoch": 2.39, + "learning_rate": 6.1021369069177825e-05, + "loss": 0.7062, + "step": 11100 + }, + { + "epoch": 2.4, + "learning_rate": 6.0586743933357475e-05, + "loss": 0.7018, + "step": 11120 + }, + { + "epoch": 2.4, + "learning_rate": 6.015211879753712e-05, + "loss": 0.7003, + "step": 11140 + }, + { + "epoch": 2.41, + "learning_rate": 5.971749366171676e-05, + "loss": 0.7005, + "step": 11160 + }, + { + "epoch": 2.41, + "learning_rate": 5.928286852589641e-05, + "loss": 0.7099, + "step": 11180 + }, + { + "epoch": 2.42, + "learning_rate": 5.8848243390076054e-05, + "loss": 0.7002, + "step": 11200 + }, + { + "epoch": 2.42, + "eval_loss": 0.7161288857460022, + "eval_runtime": 25.5084, + "eval_samples_per_second": 78.406, + "eval_steps_per_second": 1.254, + "step": 11200 + }, + { + "epoch": 2.42, + "learning_rate": 5.84136182542557e-05, + "loss": 0.7071, + "step": 11220 + }, + { + "epoch": 2.43, + "learning_rate": 5.797899311843535e-05, + "loss": 0.7028, + "step": 11240 + }, + { + "epoch": 2.43, + "learning_rate": 5.754436798261499e-05, + "loss": 0.7199, + "step": 11260 + }, + { + "epoch": 2.43, + "learning_rate": 5.7109742846794634e-05, + "loss": 0.6974, + "step": 11280 + }, + { + "epoch": 2.44, + "learning_rate": 5.6675117710974284e-05, + "loss": 0.7003, + "step": 11300 + }, + { + "epoch": 2.44, + "learning_rate": 5.624049257515393e-05, + "loss": 0.7079, + "step": 11320 + }, + { + "epoch": 2.45, + "learning_rate": 5.580586743933357e-05, + "loss": 0.6988, + "step": 11340 + }, + { + "epoch": 2.45, + "learning_rate": 5.537124230351322e-05, + "loss": 0.7047, + "step": 11360 + }, + { + "epoch": 2.46, + "learning_rate": 5.493661716769286e-05, + "loss": 0.6946, + "step": 11380 + }, + { + "epoch": 2.46, + "learning_rate": 5.45019920318725e-05, + "loss": 0.7096, + "step": 11400 + }, + { + "epoch": 2.46, + "eval_loss": 0.7155815958976746, + "eval_runtime": 25.525, + "eval_samples_per_second": 78.355, + "eval_steps_per_second": 1.254, + "step": 11400 + }, + { + "epoch": 2.46, + "learning_rate": 5.406736689605215e-05, + "loss": 0.709, + "step": 11420 + }, + { + "epoch": 2.47, + "learning_rate": 5.3632741760231794e-05, + "loss": 0.7112, + "step": 11440 + }, + { + "epoch": 2.47, + "learning_rate": 5.319811662441144e-05, + "loss": 0.6983, + "step": 11460 + }, + { + "epoch": 2.48, + "learning_rate": 5.276349148859109e-05, + "loss": 0.7, + "step": 11480 + }, + { + "epoch": 2.48, + "learning_rate": 5.232886635277073e-05, + "loss": 0.7006, + "step": 11500 + }, + { + "epoch": 2.49, + "learning_rate": 5.189424121695037e-05, + "loss": 0.7068, + "step": 11520 + }, + { + "epoch": 2.49, + "learning_rate": 5.1459616081130023e-05, + "loss": 0.7012, + "step": 11540 + }, + { + "epoch": 2.49, + "learning_rate": 5.102499094530967e-05, + "loss": 0.7079, + "step": 11560 + }, + { + "epoch": 2.5, + "learning_rate": 5.059036580948931e-05, + "loss": 0.7031, + "step": 11580 + }, + { + "epoch": 2.5, + "learning_rate": 5.015574067366896e-05, + "loss": 0.7038, + "step": 11600 + }, + { + "epoch": 2.5, + "eval_loss": 0.7149330973625183, + "eval_runtime": 25.4843, + "eval_samples_per_second": 78.48, + "eval_steps_per_second": 1.256, + "step": 11600 + }, + { + "epoch": 2.51, + "learning_rate": 4.97211155378486e-05, + "loss": 0.6972, + "step": 11620 + }, + { + "epoch": 2.51, + "learning_rate": 4.9286490402028246e-05, + "loss": 0.7039, + "step": 11640 + }, + { + "epoch": 2.52, + "learning_rate": 4.885186526620789e-05, + "loss": 0.7052, + "step": 11660 + }, + { + "epoch": 2.52, + "learning_rate": 4.841724013038754e-05, + "loss": 0.7045, + "step": 11680 + }, + { + "epoch": 2.52, + "learning_rate": 4.798261499456718e-05, + "loss": 0.701, + "step": 11700 + }, + { + "epoch": 2.53, + "learning_rate": 4.7547989858746826e-05, + "loss": 0.7084, + "step": 11720 + }, + { + "epoch": 2.53, + "learning_rate": 4.7113364722926476e-05, + "loss": 0.6988, + "step": 11740 + }, + { + "epoch": 2.54, + "learning_rate": 4.667873958710612e-05, + "loss": 0.7155, + "step": 11760 + }, + { + "epoch": 2.54, + "learning_rate": 4.624411445128576e-05, + "loss": 0.7044, + "step": 11780 + }, + { + "epoch": 2.55, + "learning_rate": 4.5809489315465406e-05, + "loss": 0.7014, + "step": 11800 + }, + { + "epoch": 2.55, + "eval_loss": 0.714367151260376, + "eval_runtime": 25.4959, + "eval_samples_per_second": 78.444, + "eval_steps_per_second": 1.255, + "step": 11800 + }, + { + "epoch": 2.55, + "learning_rate": 4.537486417964505e-05, + "loss": 0.708, + "step": 11820 + }, + { + "epoch": 2.55, + "learning_rate": 4.494023904382469e-05, + "loss": 0.6976, + "step": 11840 + }, + { + "epoch": 2.56, + "learning_rate": 4.450561390800434e-05, + "loss": 0.7057, + "step": 11860 + }, + { + "epoch": 2.56, + "learning_rate": 4.4070988772183986e-05, + "loss": 0.7039, + "step": 11880 + }, + { + "epoch": 2.57, + "learning_rate": 4.363636363636363e-05, + "loss": 0.7089, + "step": 11900 + }, + { + "epoch": 2.57, + "learning_rate": 4.320173850054328e-05, + "loss": 0.7026, + "step": 11920 + }, + { + "epoch": 2.58, + "learning_rate": 4.276711336472292e-05, + "loss": 0.7023, + "step": 11940 + }, + { + "epoch": 2.58, + "learning_rate": 4.2332488228902565e-05, + "loss": 0.7006, + "step": 11960 + }, + { + "epoch": 2.58, + "learning_rate": 4.1897863093082215e-05, + "loss": 0.7008, + "step": 11980 + }, + { + "epoch": 2.59, + "learning_rate": 4.146323795726186e-05, + "loss": 0.7057, + "step": 12000 + }, + { + "epoch": 2.59, + "eval_loss": 0.7141902446746826, + "eval_runtime": 25.5019, + "eval_samples_per_second": 78.426, + "eval_steps_per_second": 1.255, + "step": 12000 + }, + { + "epoch": 2.59, + "learning_rate": 4.10286128214415e-05, + "loss": 0.7083, + "step": 12020 + }, + { + "epoch": 2.6, + "learning_rate": 4.059398768562115e-05, + "loss": 0.6986, + "step": 12040 + }, + { + "epoch": 2.6, + "learning_rate": 4.0159362549800795e-05, + "loss": 0.7076, + "step": 12060 + }, + { + "epoch": 2.61, + "learning_rate": 3.972473741398044e-05, + "loss": 0.7071, + "step": 12080 + }, + { + "epoch": 2.61, + "learning_rate": 3.929011227816009e-05, + "loss": 0.6984, + "step": 12100 + }, + { + "epoch": 2.61, + "learning_rate": 3.885548714233973e-05, + "loss": 0.7096, + "step": 12120 + }, + { + "epoch": 2.62, + "learning_rate": 3.8420862006519375e-05, + "loss": 0.7027, + "step": 12140 + }, + { + "epoch": 2.62, + "learning_rate": 3.7986236870699025e-05, + "loss": 0.7062, + "step": 12160 + }, + { + "epoch": 2.63, + "learning_rate": 3.755161173487867e-05, + "loss": 0.7049, + "step": 12180 + }, + { + "epoch": 2.63, + "learning_rate": 3.711698659905831e-05, + "loss": 0.7052, + "step": 12200 + }, + { + "epoch": 2.63, + "eval_loss": 0.7140177488327026, + "eval_runtime": 25.4673, + "eval_samples_per_second": 78.532, + "eval_steps_per_second": 1.257, + "step": 12200 + }, + { + "epoch": 2.64, + "learning_rate": 3.6682361463237955e-05, + "loss": 0.7011, + "step": 12220 + }, + { + "epoch": 2.64, + "learning_rate": 3.62477363274176e-05, + "loss": 0.7025, + "step": 12240 + }, + { + "epoch": 2.65, + "learning_rate": 3.581311119159725e-05, + "loss": 0.7006, + "step": 12260 + }, + { + "epoch": 2.65, + "learning_rate": 3.537848605577689e-05, + "loss": 0.7073, + "step": 12280 + }, + { + "epoch": 2.65, + "learning_rate": 3.4943860919956534e-05, + "loss": 0.7033, + "step": 12300 + }, + { + "epoch": 2.66, + "learning_rate": 3.4509235784136184e-05, + "loss": 0.6992, + "step": 12320 + }, + { + "epoch": 2.66, + "learning_rate": 3.407461064831582e-05, + "loss": 0.7043, + "step": 12340 + }, + { + "epoch": 2.67, + "learning_rate": 3.363998551249547e-05, + "loss": 0.7083, + "step": 12360 + }, + { + "epoch": 2.67, + "learning_rate": 3.3205360376675114e-05, + "loss": 0.7086, + "step": 12380 + }, + { + "epoch": 2.68, + "learning_rate": 3.277073524085476e-05, + "loss": 0.7168, + "step": 12400 + }, + { + "epoch": 2.68, + "eval_loss": 0.7138265371322632, + "eval_runtime": 25.5077, + "eval_samples_per_second": 78.408, + "eval_steps_per_second": 1.255, + "step": 12400 + }, + { + "epoch": 2.68, + "learning_rate": 3.233611010503441e-05, + "loss": 0.7026, + "step": 12420 + }, + { + "epoch": 2.68, + "learning_rate": 3.190148496921405e-05, + "loss": 0.7097, + "step": 12440 + }, + { + "epoch": 2.69, + "learning_rate": 3.1466859833393694e-05, + "loss": 0.7094, + "step": 12460 + }, + { + "epoch": 2.69, + "learning_rate": 3.1032234697573344e-05, + "loss": 0.6971, + "step": 12480 + }, + { + "epoch": 2.7, + "learning_rate": 3.059760956175299e-05, + "loss": 0.6977, + "step": 12500 + }, + { + "epoch": 2.7, + "learning_rate": 3.016298442593263e-05, + "loss": 0.6945, + "step": 12520 + }, + { + "epoch": 2.71, + "learning_rate": 2.9728359290112277e-05, + "loss": 0.6998, + "step": 12540 + }, + { + "epoch": 2.71, + "learning_rate": 2.929373415429192e-05, + "loss": 0.7067, + "step": 12560 + }, + { + "epoch": 2.71, + "learning_rate": 2.8859109018471563e-05, + "loss": 0.6935, + "step": 12580 + }, + { + "epoch": 2.72, + "learning_rate": 2.842448388265121e-05, + "loss": 0.6927, + "step": 12600 + }, + { + "epoch": 2.72, + "eval_loss": 0.7132371664047241, + "eval_runtime": 25.516, + "eval_samples_per_second": 78.382, + "eval_steps_per_second": 1.254, + "step": 12600 + }, + { + "epoch": 2.72, + "learning_rate": 2.7989858746830857e-05, + "loss": 0.7025, + "step": 12620 + }, + { + "epoch": 2.73, + "learning_rate": 2.75552336110105e-05, + "loss": 0.7098, + "step": 12640 + }, + { + "epoch": 2.73, + "learning_rate": 2.7120608475190147e-05, + "loss": 0.6939, + "step": 12660 + }, + { + "epoch": 2.74, + "learning_rate": 2.6685983339369793e-05, + "loss": 0.7038, + "step": 12680 + }, + { + "epoch": 2.74, + "learning_rate": 2.6251358203549436e-05, + "loss": 0.7039, + "step": 12700 + }, + { + "epoch": 2.74, + "learning_rate": 2.5816733067729083e-05, + "loss": 0.7018, + "step": 12720 + }, + { + "epoch": 2.75, + "learning_rate": 2.538210793190873e-05, + "loss": 0.6943, + "step": 12740 + }, + { + "epoch": 2.75, + "learning_rate": 2.4947482796088373e-05, + "loss": 0.7007, + "step": 12760 + }, + { + "epoch": 2.76, + "learning_rate": 2.4512857660268016e-05, + "loss": 0.7019, + "step": 12780 + }, + { + "epoch": 2.76, + "learning_rate": 2.407823252444766e-05, + "loss": 0.6957, + "step": 12800 + }, + { + "epoch": 2.76, + "eval_loss": 0.7126932144165039, + "eval_runtime": 25.4915, + "eval_samples_per_second": 78.458, + "eval_steps_per_second": 1.255, + "step": 12800 + }, + { + "epoch": 2.77, + "learning_rate": 2.3643607388627306e-05, + "loss": 0.6993, + "step": 12820 + }, + { + "epoch": 2.77, + "learning_rate": 2.3208982252806953e-05, + "loss": 0.6951, + "step": 12840 + }, + { + "epoch": 2.77, + "learning_rate": 2.2774357116986596e-05, + "loss": 0.7056, + "step": 12860 + }, + { + "epoch": 2.78, + "learning_rate": 2.2339731981166243e-05, + "loss": 0.7153, + "step": 12880 + }, + { + "epoch": 2.78, + "learning_rate": 2.190510684534589e-05, + "loss": 0.7022, + "step": 12900 + }, + { + "epoch": 2.79, + "learning_rate": 2.1470481709525532e-05, + "loss": 0.7078, + "step": 12920 + }, + { + "epoch": 2.79, + "learning_rate": 2.103585657370518e-05, + "loss": 0.6969, + "step": 12940 + }, + { + "epoch": 2.8, + "learning_rate": 2.0601231437884826e-05, + "loss": 0.7056, + "step": 12960 + }, + { + "epoch": 2.8, + "learning_rate": 2.016660630206447e-05, + "loss": 0.6975, + "step": 12980 + }, + { + "epoch": 2.8, + "learning_rate": 1.9731981166244112e-05, + "loss": 0.7065, + "step": 13000 + }, + { + "epoch": 2.8, + "eval_loss": 0.7130131721496582, + "eval_runtime": 25.4905, + "eval_samples_per_second": 78.461, + "eval_steps_per_second": 1.255, + "step": 13000 + }, + { + "epoch": 2.81, + "learning_rate": 1.9297356030423755e-05, + "loss": 0.7, + "step": 13020 + }, + { + "epoch": 2.81, + "learning_rate": 1.8862730894603402e-05, + "loss": 0.7144, + "step": 13040 + }, + { + "epoch": 2.82, + "learning_rate": 1.842810575878305e-05, + "loss": 0.6964, + "step": 13060 + }, + { + "epoch": 2.82, + "learning_rate": 1.7993480622962692e-05, + "loss": 0.6981, + "step": 13080 + }, + { + "epoch": 2.83, + "learning_rate": 1.755885548714234e-05, + "loss": 0.7102, + "step": 13100 + }, + { + "epoch": 2.83, + "learning_rate": 1.7124230351321985e-05, + "loss": 0.6975, + "step": 13120 + }, + { + "epoch": 2.83, + "learning_rate": 1.668960521550163e-05, + "loss": 0.7062, + "step": 13140 + }, + { + "epoch": 2.84, + "learning_rate": 1.625498007968127e-05, + "loss": 0.6956, + "step": 13160 + }, + { + "epoch": 2.84, + "learning_rate": 1.5820354943860918e-05, + "loss": 0.71, + "step": 13180 + }, + { + "epoch": 2.85, + "learning_rate": 1.5385729808040565e-05, + "loss": 0.7081, + "step": 13200 + }, + { + "epoch": 2.85, + "eval_loss": 0.7126001119613647, + "eval_runtime": 25.5102, + "eval_samples_per_second": 78.4, + "eval_steps_per_second": 1.254, + "step": 13200 + }, + { + "epoch": 2.85, + "learning_rate": 1.495110467222021e-05, + "loss": 0.6977, + "step": 13220 + }, + { + "epoch": 2.86, + "learning_rate": 1.4516479536399855e-05, + "loss": 0.705, + "step": 13240 + }, + { + "epoch": 2.86, + "learning_rate": 1.4081854400579498e-05, + "loss": 0.7016, + "step": 13260 + }, + { + "epoch": 2.87, + "learning_rate": 1.3647229264759143e-05, + "loss": 0.6922, + "step": 13280 + }, + { + "epoch": 2.87, + "learning_rate": 1.321260412893879e-05, + "loss": 0.6987, + "step": 13300 + }, + { + "epoch": 2.87, + "learning_rate": 1.2777978993118434e-05, + "loss": 0.7041, + "step": 13320 + }, + { + "epoch": 2.88, + "learning_rate": 1.234335385729808e-05, + "loss": 0.7101, + "step": 13340 + }, + { + "epoch": 2.88, + "learning_rate": 1.1908728721477723e-05, + "loss": 0.6976, + "step": 13360 + }, + { + "epoch": 2.89, + "learning_rate": 1.147410358565737e-05, + "loss": 0.7011, + "step": 13380 + }, + { + "epoch": 2.89, + "learning_rate": 1.1039478449837014e-05, + "loss": 0.6973, + "step": 13400 + }, + { + "epoch": 2.89, + "eval_loss": 0.7123447060585022, + "eval_runtime": 25.5029, + "eval_samples_per_second": 78.422, + "eval_steps_per_second": 1.255, + "step": 13400 + } + ], + "max_steps": 13905, + "num_train_epochs": 3, + "total_flos": 6.775116663531084e+19, + "trial_name": null, + "trial_params": null +} diff --git a/adapters/saved-alpaca-belle13b/checkpoint-13400/training_args.bin b/adapters/saved-alpaca-belle13b/checkpoint-13400/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..d9fc651b09d1fbcbbf76356c2181acb1def32585 --- /dev/null +++ b/adapters/saved-alpaca-belle13b/checkpoint-13400/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6d94e5a4fbc2ed544893c730b1ef244fb2123fe494b95f38bd148f9dd38f68e0 +size 3643 diff --git a/adapters/saved-alpaca-belle13b/checkpoint-13600/optimizer.pt b/adapters/saved-alpaca-belle13b/checkpoint-13600/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..0c49f34eaf1a83dea92a65c9f8e0a48ba76758cc --- /dev/null +++ b/adapters/saved-alpaca-belle13b/checkpoint-13600/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cefdaa5ce23c172c7299b760e2993297f31afbb92682efc9a6670dd385168e7b +size 52523141 diff --git a/adapters/saved-alpaca-belle13b/checkpoint-13600/pytorch_model.bin b/adapters/saved-alpaca-belle13b/checkpoint-13600/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..e53eb5e534494330d211a5a761e84d11068fe3a8 --- /dev/null +++ b/adapters/saved-alpaca-belle13b/checkpoint-13600/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1158118239f7edc0b9bcc1c6c610818fce5cd2ba0aa5c77e1973bf8cbffbd8e2 +size 26271757 diff --git a/adapters/saved-alpaca-belle13b/checkpoint-13600/rng_state_0.pth b/adapters/saved-alpaca-belle13b/checkpoint-13600/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..eb0d964d2564eb3e72ed8f5b888c82fa3424bdac --- /dev/null +++ b/adapters/saved-alpaca-belle13b/checkpoint-13600/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5a4c0421b04bdf31a128658543bb70ddcc357d54fb4b700d712ccd56a762f554 +size 14583 diff --git a/adapters/saved-alpaca-belle13b/checkpoint-13600/rng_state_1.pth b/adapters/saved-alpaca-belle13b/checkpoint-13600/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..f2add8ddf130334e7e41735d9421206415b917de --- /dev/null +++ b/adapters/saved-alpaca-belle13b/checkpoint-13600/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6c9691519c8a07201aad39784b2f70eb1379ae4e708d630a6d4b94917c44da53 +size 14583 diff --git a/adapters/saved-alpaca-belle13b/checkpoint-13600/rng_state_2.pth b/adapters/saved-alpaca-belle13b/checkpoint-13600/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..fb7357e5a35745610f582e51237d8274b928fdae --- /dev/null +++ b/adapters/saved-alpaca-belle13b/checkpoint-13600/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61d8bc77a2cdb230eab1cfd28c6f82ff6d29eef294ad92ab86252bb3338ec07f +size 14583 diff --git a/adapters/saved-alpaca-belle13b/checkpoint-13600/rng_state_3.pth b/adapters/saved-alpaca-belle13b/checkpoint-13600/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..ba5b8c8b0bdcc4b0662434f7ff640e009158f503 --- /dev/null +++ b/adapters/saved-alpaca-belle13b/checkpoint-13600/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:42f14cc15d369082d88422be2b7270c7e642db36450feafb1dedc3afb18b8c4a +size 14583 diff --git a/adapters/saved-alpaca-belle13b/checkpoint-13600/rng_state_4.pth b/adapters/saved-alpaca-belle13b/checkpoint-13600/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..42cb632afc8f3ac07d82e8b0aeaf192b0bc7778a --- /dev/null +++ b/adapters/saved-alpaca-belle13b/checkpoint-13600/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1746e26c28548fc876c116f1455d9c9f785df75c624a816cda18e3b58b2452de +size 14583 diff --git a/adapters/saved-alpaca-belle13b/checkpoint-13600/rng_state_5.pth b/adapters/saved-alpaca-belle13b/checkpoint-13600/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..6d6bcdd9385743c8a95e8840ae8e0e5890e01ae2 --- /dev/null +++ b/adapters/saved-alpaca-belle13b/checkpoint-13600/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0e7a5162870013e955314e54ce59aa8f6b6b06a2012fe80e19c13c2c12e4318f +size 14583 diff --git a/adapters/saved-alpaca-belle13b/checkpoint-13600/rng_state_6.pth b/adapters/saved-alpaca-belle13b/checkpoint-13600/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..a1b6c9eaeeb02c5f21e91cc62ea3bbf018a01861 --- /dev/null +++ b/adapters/saved-alpaca-belle13b/checkpoint-13600/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:01008732ea1c8334d8248d30c3002da778c84adf4be332d963af490b9d119e77 +size 14583 diff --git a/adapters/saved-alpaca-belle13b/checkpoint-13600/rng_state_7.pth b/adapters/saved-alpaca-belle13b/checkpoint-13600/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..9ed7dd960edf1e92abebb25d7a090b93be35b86a --- /dev/null +++ b/adapters/saved-alpaca-belle13b/checkpoint-13600/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8253631518ab250ebc59140c91edf1a46a798d937d45c33040b36f5572c55292 +size 14583 diff --git a/adapters/saved-alpaca-belle13b/checkpoint-13600/scaler.pt b/adapters/saved-alpaca-belle13b/checkpoint-13600/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..6ebca15e2650dc4a6ff4c8280e8f1b5f335a6bd6 --- /dev/null +++ b/adapters/saved-alpaca-belle13b/checkpoint-13600/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:13b64bf69394f86c77a886eb6aa8dfdc88adad6ec20ac18c0381490441dad2ff +size 557 diff --git a/adapters/saved-alpaca-belle13b/checkpoint-13600/scheduler.pt b/adapters/saved-alpaca-belle13b/checkpoint-13600/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..b393a0e9efd2034798302fa9ffa47cc81deec730 --- /dev/null +++ b/adapters/saved-alpaca-belle13b/checkpoint-13600/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:14033b842dbf1351d9c61af9e263fe448634d23ac8cab45a8e45e80d57ba0f2f +size 627 diff --git a/adapters/saved-alpaca-belle13b/checkpoint-13600/trainer_state.json b/adapters/saved-alpaca-belle13b/checkpoint-13600/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..bae16bd18228dfdfbcff9986c7dd15d9503e3148 --- /dev/null +++ b/adapters/saved-alpaca-belle13b/checkpoint-13600/trainer_state.json @@ -0,0 +1,4640 @@ +{ + "best_metric": 0.712183952331543, + "best_model_checkpoint": "/mnt/bn/qingyi-bn-lq/llama/saved-alpaca-belle13b/checkpoint-13600", + "epoch": 2.9341963322545848, + "global_step": 13600, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 5.9999999999999995e-05, + "loss": 1.6589, + "step": 20 + }, + { + "epoch": 0.01, + "learning_rate": 0.00011999999999999999, + "loss": 1.4071, + "step": 40 + }, + { + "epoch": 0.01, + "learning_rate": 0.00017999999999999998, + "loss": 1.044, + "step": 60 + }, + { + "epoch": 0.02, + "learning_rate": 0.00023999999999999998, + "loss": 0.9883, + "step": 80 + }, + { + "epoch": 0.02, + "learning_rate": 0.0003, + "loss": 0.9659, + "step": 100 + }, + { + "epoch": 0.03, + "learning_rate": 0.00029956537486417964, + "loss": 0.9505, + "step": 120 + }, + { + "epoch": 0.03, + "learning_rate": 0.00029913074972835925, + "loss": 0.9205, + "step": 140 + }, + { + "epoch": 0.03, + "learning_rate": 0.0002986961245925389, + "loss": 0.9168, + "step": 160 + }, + { + "epoch": 0.04, + "learning_rate": 0.0002982614994567186, + "loss": 0.9117, + "step": 180 + }, + { + "epoch": 0.04, + "learning_rate": 0.0002978268743208982, + "loss": 0.9064, + "step": 200 + }, + { + "epoch": 0.04, + "eval_loss": 0.9033477306365967, + "eval_runtime": 25.3136, + "eval_samples_per_second": 79.009, + "eval_steps_per_second": 1.264, + "step": 200 + }, + { + "epoch": 0.05, + "learning_rate": 0.00029739224918507785, + "loss": 0.8981, + "step": 220 + }, + { + "epoch": 0.05, + "learning_rate": 0.0002969576240492575, + "loss": 0.8912, + "step": 240 + }, + { + "epoch": 0.06, + "learning_rate": 0.0002965229989134371, + "loss": 0.8875, + "step": 260 + }, + { + "epoch": 0.06, + "learning_rate": 0.0002960883737776168, + "loss": 0.8907, + "step": 280 + }, + { + "epoch": 0.06, + "learning_rate": 0.00029565374864179645, + "loss": 0.8753, + "step": 300 + }, + { + "epoch": 0.07, + "learning_rate": 0.00029521912350597606, + "loss": 0.8782, + "step": 320 + }, + { + "epoch": 0.07, + "learning_rate": 0.0002947844983701557, + "loss": 0.8697, + "step": 340 + }, + { + "epoch": 0.08, + "learning_rate": 0.0002943498732343354, + "loss": 0.8745, + "step": 360 + }, + { + "epoch": 0.08, + "learning_rate": 0.000293915248098515, + "loss": 0.8725, + "step": 380 + }, + { + "epoch": 0.09, + "learning_rate": 0.00029348062296269466, + "loss": 0.8658, + "step": 400 + }, + { + "epoch": 0.09, + "eval_loss": 0.8655584454536438, + "eval_runtime": 25.3343, + "eval_samples_per_second": 78.944, + "eval_steps_per_second": 1.263, + "step": 400 + }, + { + "epoch": 0.09, + "learning_rate": 0.0002930459978268743, + "loss": 0.8641, + "step": 420 + }, + { + "epoch": 0.09, + "learning_rate": 0.00029261137269105393, + "loss": 0.8509, + "step": 440 + }, + { + "epoch": 0.1, + "learning_rate": 0.0002921767475552336, + "loss": 0.8541, + "step": 460 + }, + { + "epoch": 0.1, + "learning_rate": 0.00029174212241941326, + "loss": 0.8575, + "step": 480 + }, + { + "epoch": 0.11, + "learning_rate": 0.00029130749728359287, + "loss": 0.8482, + "step": 500 + }, + { + "epoch": 0.11, + "learning_rate": 0.00029087287214777253, + "loss": 0.8572, + "step": 520 + }, + { + "epoch": 0.12, + "learning_rate": 0.0002904382470119522, + "loss": 0.8489, + "step": 540 + }, + { + "epoch": 0.12, + "learning_rate": 0.0002900036218761318, + "loss": 0.8585, + "step": 560 + }, + { + "epoch": 0.13, + "learning_rate": 0.00028956899674031147, + "loss": 0.8387, + "step": 580 + }, + { + "epoch": 0.13, + "learning_rate": 0.00028913437160449113, + "loss": 0.8306, + "step": 600 + }, + { + "epoch": 0.13, + "eval_loss": 0.8434031009674072, + "eval_runtime": 25.3211, + "eval_samples_per_second": 78.986, + "eval_steps_per_second": 1.264, + "step": 600 + }, + { + "epoch": 0.13, + "learning_rate": 0.00028869974646867074, + "loss": 0.8331, + "step": 620 + }, + { + "epoch": 0.14, + "learning_rate": 0.0002882651213328504, + "loss": 0.8447, + "step": 640 + }, + { + "epoch": 0.14, + "learning_rate": 0.00028783049619703007, + "loss": 0.836, + "step": 660 + }, + { + "epoch": 0.15, + "learning_rate": 0.0002873958710612097, + "loss": 0.8436, + "step": 680 + }, + { + "epoch": 0.15, + "learning_rate": 0.00028696124592538934, + "loss": 0.8281, + "step": 700 + }, + { + "epoch": 0.16, + "learning_rate": 0.000286526620789569, + "loss": 0.8378, + "step": 720 + }, + { + "epoch": 0.16, + "learning_rate": 0.0002860919956537486, + "loss": 0.8338, + "step": 740 + }, + { + "epoch": 0.16, + "learning_rate": 0.0002856573705179283, + "loss": 0.8323, + "step": 760 + }, + { + "epoch": 0.17, + "learning_rate": 0.00028522274538210794, + "loss": 0.8153, + "step": 780 + }, + { + "epoch": 0.17, + "learning_rate": 0.00028478812024628755, + "loss": 0.8349, + "step": 800 + }, + { + "epoch": 0.17, + "eval_loss": 0.8282934427261353, + "eval_runtime": 25.4025, + "eval_samples_per_second": 78.733, + "eval_steps_per_second": 1.26, + "step": 800 + }, + { + "epoch": 0.18, + "learning_rate": 0.0002843534951104672, + "loss": 0.8198, + "step": 820 + }, + { + "epoch": 0.18, + "learning_rate": 0.0002839188699746469, + "loss": 0.8254, + "step": 840 + }, + { + "epoch": 0.19, + "learning_rate": 0.0002834842448388265, + "loss": 0.8165, + "step": 860 + }, + { + "epoch": 0.19, + "learning_rate": 0.00028304961970300615, + "loss": 0.8241, + "step": 880 + }, + { + "epoch": 0.19, + "learning_rate": 0.0002826149945671858, + "loss": 0.814, + "step": 900 + }, + { + "epoch": 0.2, + "learning_rate": 0.0002821803694313654, + "loss": 0.8222, + "step": 920 + }, + { + "epoch": 0.2, + "learning_rate": 0.0002817457442955451, + "loss": 0.825, + "step": 940 + }, + { + "epoch": 0.21, + "learning_rate": 0.00028131111915972475, + "loss": 0.8153, + "step": 960 + }, + { + "epoch": 0.21, + "learning_rate": 0.00028087649402390436, + "loss": 0.8229, + "step": 980 + }, + { + "epoch": 0.22, + "learning_rate": 0.00028044186888808397, + "loss": 0.8129, + "step": 1000 + }, + { + "epoch": 0.22, + "eval_loss": 0.816320538520813, + "eval_runtime": 25.4153, + "eval_samples_per_second": 78.693, + "eval_steps_per_second": 1.259, + "step": 1000 + }, + { + "epoch": 0.22, + "learning_rate": 0.00028000724375226363, + "loss": 0.8121, + "step": 1020 + }, + { + "epoch": 0.22, + "learning_rate": 0.0002795726186164433, + "loss": 0.8063, + "step": 1040 + }, + { + "epoch": 0.23, + "learning_rate": 0.0002791379934806229, + "loss": 0.8097, + "step": 1060 + }, + { + "epoch": 0.23, + "learning_rate": 0.00027870336834480257, + "loss": 0.8142, + "step": 1080 + }, + { + "epoch": 0.24, + "learning_rate": 0.00027826874320898223, + "loss": 0.8021, + "step": 1100 + }, + { + "epoch": 0.24, + "learning_rate": 0.00027783411807316184, + "loss": 0.8014, + "step": 1120 + }, + { + "epoch": 0.25, + "learning_rate": 0.0002773994929373415, + "loss": 0.8031, + "step": 1140 + }, + { + "epoch": 0.25, + "learning_rate": 0.00027696486780152117, + "loss": 0.8011, + "step": 1160 + }, + { + "epoch": 0.25, + "learning_rate": 0.0002765302426657008, + "loss": 0.7944, + "step": 1180 + }, + { + "epoch": 0.26, + "learning_rate": 0.00027609561752988044, + "loss": 0.8071, + "step": 1200 + }, + { + "epoch": 0.26, + "eval_loss": 0.8064733147621155, + "eval_runtime": 25.3901, + "eval_samples_per_second": 78.771, + "eval_steps_per_second": 1.26, + "step": 1200 + }, + { + "epoch": 0.26, + "learning_rate": 0.0002756609923940601, + "loss": 0.8025, + "step": 1220 + }, + { + "epoch": 0.27, + "learning_rate": 0.0002752263672582397, + "loss": 0.7954, + "step": 1240 + }, + { + "epoch": 0.27, + "learning_rate": 0.0002747917421224194, + "loss": 0.8013, + "step": 1260 + }, + { + "epoch": 0.28, + "learning_rate": 0.00027435711698659904, + "loss": 0.7967, + "step": 1280 + }, + { + "epoch": 0.28, + "learning_rate": 0.00027392249185077865, + "loss": 0.8132, + "step": 1300 + }, + { + "epoch": 0.28, + "learning_rate": 0.0002734878667149583, + "loss": 0.8017, + "step": 1320 + }, + { + "epoch": 0.29, + "learning_rate": 0.000273053241579138, + "loss": 0.7964, + "step": 1340 + }, + { + "epoch": 0.29, + "learning_rate": 0.0002726186164433176, + "loss": 0.8012, + "step": 1360 + }, + { + "epoch": 0.3, + "learning_rate": 0.00027218399130749725, + "loss": 0.7982, + "step": 1380 + }, + { + "epoch": 0.3, + "learning_rate": 0.0002717493661716769, + "loss": 0.8031, + "step": 1400 + }, + { + "epoch": 0.3, + "eval_loss": 0.798474133014679, + "eval_runtime": 25.432, + "eval_samples_per_second": 78.641, + "eval_steps_per_second": 1.258, + "step": 1400 + }, + { + "epoch": 0.31, + "learning_rate": 0.0002713147410358565, + "loss": 0.7925, + "step": 1420 + }, + { + "epoch": 0.31, + "learning_rate": 0.0002708801159000362, + "loss": 0.794, + "step": 1440 + }, + { + "epoch": 0.31, + "learning_rate": 0.00027044549076421585, + "loss": 0.804, + "step": 1460 + }, + { + "epoch": 0.32, + "learning_rate": 0.00027001086562839546, + "loss": 0.7942, + "step": 1480 + }, + { + "epoch": 0.32, + "learning_rate": 0.0002695762404925751, + "loss": 0.7872, + "step": 1500 + }, + { + "epoch": 0.33, + "learning_rate": 0.0002691416153567548, + "loss": 0.7962, + "step": 1520 + }, + { + "epoch": 0.33, + "learning_rate": 0.0002687069902209344, + "loss": 0.7898, + "step": 1540 + }, + { + "epoch": 0.34, + "learning_rate": 0.00026827236508511406, + "loss": 0.7886, + "step": 1560 + }, + { + "epoch": 0.34, + "learning_rate": 0.0002678377399492937, + "loss": 0.7904, + "step": 1580 + }, + { + "epoch": 0.35, + "learning_rate": 0.00026740311481347333, + "loss": 0.7892, + "step": 1600 + }, + { + "epoch": 0.35, + "eval_loss": 0.7912269234657288, + "eval_runtime": 25.444, + "eval_samples_per_second": 78.604, + "eval_steps_per_second": 1.258, + "step": 1600 + }, + { + "epoch": 0.35, + "learning_rate": 0.000266968489677653, + "loss": 0.7897, + "step": 1620 + }, + { + "epoch": 0.35, + "learning_rate": 0.00026653386454183266, + "loss": 0.7927, + "step": 1640 + }, + { + "epoch": 0.36, + "learning_rate": 0.00026609923940601227, + "loss": 0.7829, + "step": 1660 + }, + { + "epoch": 0.36, + "learning_rate": 0.00026566461427019193, + "loss": 0.7788, + "step": 1680 + }, + { + "epoch": 0.37, + "learning_rate": 0.0002652299891343716, + "loss": 0.786, + "step": 1700 + }, + { + "epoch": 0.37, + "learning_rate": 0.0002647953639985512, + "loss": 0.7828, + "step": 1720 + }, + { + "epoch": 0.38, + "learning_rate": 0.00026436073886273087, + "loss": 0.7788, + "step": 1740 + }, + { + "epoch": 0.38, + "learning_rate": 0.00026392611372691053, + "loss": 0.7851, + "step": 1760 + }, + { + "epoch": 0.38, + "learning_rate": 0.00026349148859109014, + "loss": 0.7936, + "step": 1780 + }, + { + "epoch": 0.39, + "learning_rate": 0.0002630568634552698, + "loss": 0.7758, + "step": 1800 + }, + { + "epoch": 0.39, + "eval_loss": 0.7854430675506592, + "eval_runtime": 25.4734, + "eval_samples_per_second": 78.513, + "eval_steps_per_second": 1.256, + "step": 1800 + }, + { + "epoch": 0.39, + "learning_rate": 0.00026262223831944947, + "loss": 0.787, + "step": 1820 + }, + { + "epoch": 0.4, + "learning_rate": 0.0002621876131836291, + "loss": 0.7779, + "step": 1840 + }, + { + "epoch": 0.4, + "learning_rate": 0.00026175298804780874, + "loss": 0.7792, + "step": 1860 + }, + { + "epoch": 0.41, + "learning_rate": 0.0002613183629119884, + "loss": 0.7728, + "step": 1880 + }, + { + "epoch": 0.41, + "learning_rate": 0.000260883737776168, + "loss": 0.7844, + "step": 1900 + }, + { + "epoch": 0.41, + "learning_rate": 0.0002604491126403477, + "loss": 0.7726, + "step": 1920 + }, + { + "epoch": 0.42, + "learning_rate": 0.00026001448750452734, + "loss": 0.7706, + "step": 1940 + }, + { + "epoch": 0.42, + "learning_rate": 0.00025957986236870695, + "loss": 0.7659, + "step": 1960 + }, + { + "epoch": 0.43, + "learning_rate": 0.0002591452372328866, + "loss": 0.7808, + "step": 1980 + }, + { + "epoch": 0.43, + "learning_rate": 0.0002587106120970663, + "loss": 0.7692, + "step": 2000 + }, + { + "epoch": 0.43, + "eval_loss": 0.7800412774085999, + "eval_runtime": 25.5146, + "eval_samples_per_second": 78.387, + "eval_steps_per_second": 1.254, + "step": 2000 + }, + { + "epoch": 0.44, + "learning_rate": 0.0002582759869612459, + "loss": 0.7665, + "step": 2020 + }, + { + "epoch": 0.44, + "learning_rate": 0.00025784136182542555, + "loss": 0.7795, + "step": 2040 + }, + { + "epoch": 0.44, + "learning_rate": 0.0002574067366896052, + "loss": 0.7846, + "step": 2060 + }, + { + "epoch": 0.45, + "learning_rate": 0.0002569721115537848, + "loss": 0.7639, + "step": 2080 + }, + { + "epoch": 0.45, + "learning_rate": 0.0002565374864179645, + "loss": 0.7827, + "step": 2100 + }, + { + "epoch": 0.46, + "learning_rate": 0.00025610286128214415, + "loss": 0.7751, + "step": 2120 + }, + { + "epoch": 0.46, + "learning_rate": 0.00025566823614632376, + "loss": 0.776, + "step": 2140 + }, + { + "epoch": 0.47, + "learning_rate": 0.0002552336110105034, + "loss": 0.7773, + "step": 2160 + }, + { + "epoch": 0.47, + "learning_rate": 0.0002547989858746831, + "loss": 0.7757, + "step": 2180 + }, + { + "epoch": 0.47, + "learning_rate": 0.0002543643607388627, + "loss": 0.7769, + "step": 2200 + }, + { + "epoch": 0.47, + "eval_loss": 0.7759379744529724, + "eval_runtime": 25.4789, + "eval_samples_per_second": 78.496, + "eval_steps_per_second": 1.256, + "step": 2200 + }, + { + "epoch": 0.48, + "learning_rate": 0.00025392973560304236, + "loss": 0.7657, + "step": 2220 + }, + { + "epoch": 0.48, + "learning_rate": 0.000253495110467222, + "loss": 0.7664, + "step": 2240 + }, + { + "epoch": 0.49, + "learning_rate": 0.00025306048533140163, + "loss": 0.7774, + "step": 2260 + }, + { + "epoch": 0.49, + "learning_rate": 0.0002526258601955813, + "loss": 0.7591, + "step": 2280 + }, + { + "epoch": 0.5, + "learning_rate": 0.00025219123505976096, + "loss": 0.7605, + "step": 2300 + }, + { + "epoch": 0.5, + "learning_rate": 0.00025175660992394057, + "loss": 0.7693, + "step": 2320 + }, + { + "epoch": 0.5, + "learning_rate": 0.00025132198478812023, + "loss": 0.7702, + "step": 2340 + }, + { + "epoch": 0.51, + "learning_rate": 0.0002508873596522999, + "loss": 0.7706, + "step": 2360 + }, + { + "epoch": 0.51, + "learning_rate": 0.0002504527345164795, + "loss": 0.7664, + "step": 2380 + }, + { + "epoch": 0.52, + "learning_rate": 0.00025001810938065917, + "loss": 0.76, + "step": 2400 + }, + { + "epoch": 0.52, + "eval_loss": 0.7723669409751892, + "eval_runtime": 25.4827, + "eval_samples_per_second": 78.485, + "eval_steps_per_second": 1.256, + "step": 2400 + }, + { + "epoch": 0.52, + "learning_rate": 0.00024958348424483883, + "loss": 0.7702, + "step": 2420 + }, + { + "epoch": 0.53, + "learning_rate": 0.00024914885910901844, + "loss": 0.7686, + "step": 2440 + }, + { + "epoch": 0.53, + "learning_rate": 0.0002487142339731981, + "loss": 0.762, + "step": 2460 + }, + { + "epoch": 0.54, + "learning_rate": 0.00024827960883737777, + "loss": 0.7719, + "step": 2480 + }, + { + "epoch": 0.54, + "learning_rate": 0.0002478449837015574, + "loss": 0.7612, + "step": 2500 + }, + { + "epoch": 0.54, + "learning_rate": 0.00024741035856573704, + "loss": 0.7565, + "step": 2520 + }, + { + "epoch": 0.55, + "learning_rate": 0.0002469757334299167, + "loss": 0.7719, + "step": 2540 + }, + { + "epoch": 0.55, + "learning_rate": 0.0002465411082940963, + "loss": 0.7619, + "step": 2560 + }, + { + "epoch": 0.56, + "learning_rate": 0.000246106483158276, + "loss": 0.7607, + "step": 2580 + }, + { + "epoch": 0.56, + "learning_rate": 0.00024567185802245564, + "loss": 0.7564, + "step": 2600 + }, + { + "epoch": 0.56, + "eval_loss": 0.7678729295730591, + "eval_runtime": 25.4455, + "eval_samples_per_second": 78.599, + "eval_steps_per_second": 1.258, + "step": 2600 + }, + { + "epoch": 0.57, + "learning_rate": 0.00024523723288663525, + "loss": 0.7613, + "step": 2620 + }, + { + "epoch": 0.57, + "learning_rate": 0.0002448026077508149, + "loss": 0.7525, + "step": 2640 + }, + { + "epoch": 0.57, + "learning_rate": 0.0002443679826149946, + "loss": 0.7563, + "step": 2660 + }, + { + "epoch": 0.58, + "learning_rate": 0.00024393335747917422, + "loss": 0.7601, + "step": 2680 + }, + { + "epoch": 0.58, + "learning_rate": 0.00024349873234335383, + "loss": 0.7633, + "step": 2700 + }, + { + "epoch": 0.59, + "learning_rate": 0.00024306410720753346, + "loss": 0.75, + "step": 2720 + }, + { + "epoch": 0.59, + "learning_rate": 0.0002426294820717131, + "loss": 0.7602, + "step": 2740 + }, + { + "epoch": 0.6, + "learning_rate": 0.00024219485693589276, + "loss": 0.7546, + "step": 2760 + }, + { + "epoch": 0.6, + "learning_rate": 0.0002417602318000724, + "loss": 0.7532, + "step": 2780 + }, + { + "epoch": 0.6, + "learning_rate": 0.00024132560666425203, + "loss": 0.7661, + "step": 2800 + }, + { + "epoch": 0.6, + "eval_loss": 0.7649803757667542, + "eval_runtime": 25.4783, + "eval_samples_per_second": 78.498, + "eval_steps_per_second": 1.256, + "step": 2800 + }, + { + "epoch": 0.61, + "learning_rate": 0.0002408909815284317, + "loss": 0.7587, + "step": 2820 + }, + { + "epoch": 0.61, + "learning_rate": 0.00024045635639261133, + "loss": 0.7543, + "step": 2840 + }, + { + "epoch": 0.62, + "learning_rate": 0.00024002173125679097, + "loss": 0.7672, + "step": 2860 + }, + { + "epoch": 0.62, + "learning_rate": 0.00023958710612097063, + "loss": 0.7623, + "step": 2880 + }, + { + "epoch": 0.63, + "learning_rate": 0.00023915248098515027, + "loss": 0.7487, + "step": 2900 + }, + { + "epoch": 0.63, + "learning_rate": 0.0002387178558493299, + "loss": 0.75, + "step": 2920 + }, + { + "epoch": 0.63, + "learning_rate": 0.00023828323071350957, + "loss": 0.7567, + "step": 2940 + }, + { + "epoch": 0.64, + "learning_rate": 0.0002378486055776892, + "loss": 0.7592, + "step": 2960 + }, + { + "epoch": 0.64, + "learning_rate": 0.00023741398044186884, + "loss": 0.7569, + "step": 2980 + }, + { + "epoch": 0.65, + "learning_rate": 0.0002369793553060485, + "loss": 0.7524, + "step": 3000 + }, + { + "epoch": 0.65, + "eval_loss": 0.7613279819488525, + "eval_runtime": 25.4837, + "eval_samples_per_second": 78.482, + "eval_steps_per_second": 1.256, + "step": 3000 + }, + { + "epoch": 0.65, + "learning_rate": 0.00023654473017022814, + "loss": 0.7593, + "step": 3020 + }, + { + "epoch": 0.66, + "learning_rate": 0.00023611010503440778, + "loss": 0.7516, + "step": 3040 + }, + { + "epoch": 0.66, + "learning_rate": 0.00023567547989858744, + "loss": 0.7525, + "step": 3060 + }, + { + "epoch": 0.66, + "learning_rate": 0.00023524085476276708, + "loss": 0.7583, + "step": 3080 + }, + { + "epoch": 0.67, + "learning_rate": 0.00023480622962694672, + "loss": 0.7535, + "step": 3100 + }, + { + "epoch": 0.67, + "learning_rate": 0.00023437160449112638, + "loss": 0.7528, + "step": 3120 + }, + { + "epoch": 0.68, + "learning_rate": 0.00023393697935530602, + "loss": 0.7418, + "step": 3140 + }, + { + "epoch": 0.68, + "learning_rate": 0.00023350235421948565, + "loss": 0.7496, + "step": 3160 + }, + { + "epoch": 0.69, + "learning_rate": 0.00023306772908366532, + "loss": 0.7537, + "step": 3180 + }, + { + "epoch": 0.69, + "learning_rate": 0.00023263310394784495, + "loss": 0.7569, + "step": 3200 + }, + { + "epoch": 0.69, + "eval_loss": 0.7581906914710999, + "eval_runtime": 25.4588, + "eval_samples_per_second": 78.558, + "eval_steps_per_second": 1.257, + "step": 3200 + }, + { + "epoch": 0.69, + "learning_rate": 0.0002321984788120246, + "loss": 0.7465, + "step": 3220 + }, + { + "epoch": 0.7, + "learning_rate": 0.00023176385367620425, + "loss": 0.7367, + "step": 3240 + }, + { + "epoch": 0.7, + "learning_rate": 0.0002313292285403839, + "loss": 0.7425, + "step": 3260 + }, + { + "epoch": 0.71, + "learning_rate": 0.00023089460340456353, + "loss": 0.7637, + "step": 3280 + }, + { + "epoch": 0.71, + "learning_rate": 0.0002304599782687432, + "loss": 0.7574, + "step": 3300 + }, + { + "epoch": 0.72, + "learning_rate": 0.00023002535313292283, + "loss": 0.7448, + "step": 3320 + }, + { + "epoch": 0.72, + "learning_rate": 0.00022959072799710246, + "loss": 0.7595, + "step": 3340 + }, + { + "epoch": 0.72, + "learning_rate": 0.00022915610286128213, + "loss": 0.7465, + "step": 3360 + }, + { + "epoch": 0.73, + "learning_rate": 0.00022872147772546176, + "loss": 0.7532, + "step": 3380 + }, + { + "epoch": 0.73, + "learning_rate": 0.0002282868525896414, + "loss": 0.7466, + "step": 3400 + }, + { + "epoch": 0.73, + "eval_loss": 0.7559078931808472, + "eval_runtime": 25.464, + "eval_samples_per_second": 78.542, + "eval_steps_per_second": 1.257, + "step": 3400 + }, + { + "epoch": 0.74, + "learning_rate": 0.00022785222745382106, + "loss": 0.753, + "step": 3420 + }, + { + "epoch": 0.74, + "learning_rate": 0.0002274176023180007, + "loss": 0.7459, + "step": 3440 + }, + { + "epoch": 0.75, + "learning_rate": 0.00022698297718218034, + "loss": 0.7519, + "step": 3460 + }, + { + "epoch": 0.75, + "learning_rate": 0.00022654835204636, + "loss": 0.7451, + "step": 3480 + }, + { + "epoch": 0.76, + "learning_rate": 0.00022611372691053964, + "loss": 0.7468, + "step": 3500 + }, + { + "epoch": 0.76, + "learning_rate": 0.00022567910177471927, + "loss": 0.7491, + "step": 3520 + }, + { + "epoch": 0.76, + "learning_rate": 0.00022524447663889894, + "loss": 0.7524, + "step": 3540 + }, + { + "epoch": 0.77, + "learning_rate": 0.00022480985150307857, + "loss": 0.7484, + "step": 3560 + }, + { + "epoch": 0.77, + "learning_rate": 0.0002243752263672582, + "loss": 0.7484, + "step": 3580 + }, + { + "epoch": 0.78, + "learning_rate": 0.00022394060123143787, + "loss": 0.7529, + "step": 3600 + }, + { + "epoch": 0.78, + "eval_loss": 0.7531791925430298, + "eval_runtime": 25.4572, + "eval_samples_per_second": 78.563, + "eval_steps_per_second": 1.257, + "step": 3600 + }, + { + "epoch": 0.78, + "learning_rate": 0.0002235059760956175, + "loss": 0.7475, + "step": 3620 + }, + { + "epoch": 0.79, + "learning_rate": 0.00022307135095979715, + "loss": 0.7518, + "step": 3640 + }, + { + "epoch": 0.79, + "learning_rate": 0.0002226367258239768, + "loss": 0.751, + "step": 3660 + }, + { + "epoch": 0.79, + "learning_rate": 0.00022220210068815645, + "loss": 0.7402, + "step": 3680 + }, + { + "epoch": 0.8, + "learning_rate": 0.00022176747555233608, + "loss": 0.755, + "step": 3700 + }, + { + "epoch": 0.8, + "learning_rate": 0.00022133285041651575, + "loss": 0.7441, + "step": 3720 + }, + { + "epoch": 0.81, + "learning_rate": 0.00022089822528069538, + "loss": 0.746, + "step": 3740 + }, + { + "epoch": 0.81, + "learning_rate": 0.00022046360014487502, + "loss": 0.7441, + "step": 3760 + }, + { + "epoch": 0.82, + "learning_rate": 0.00022002897500905468, + "loss": 0.7475, + "step": 3780 + }, + { + "epoch": 0.82, + "learning_rate": 0.00021959434987323432, + "loss": 0.7458, + "step": 3800 + }, + { + "epoch": 0.82, + "eval_loss": 0.7513870596885681, + "eval_runtime": 25.4906, + "eval_samples_per_second": 78.46, + "eval_steps_per_second": 1.255, + "step": 3800 + }, + { + "epoch": 0.82, + "learning_rate": 0.00021915972473741396, + "loss": 0.7436, + "step": 3820 + }, + { + "epoch": 0.83, + "learning_rate": 0.00021872509960159362, + "loss": 0.7451, + "step": 3840 + }, + { + "epoch": 0.83, + "learning_rate": 0.00021829047446577326, + "loss": 0.7475, + "step": 3860 + }, + { + "epoch": 0.84, + "learning_rate": 0.0002178558493299529, + "loss": 0.7424, + "step": 3880 + }, + { + "epoch": 0.84, + "learning_rate": 0.00021742122419413256, + "loss": 0.7503, + "step": 3900 + }, + { + "epoch": 0.85, + "learning_rate": 0.0002169865990583122, + "loss": 0.7334, + "step": 3920 + }, + { + "epoch": 0.85, + "learning_rate": 0.00021655197392249183, + "loss": 0.7436, + "step": 3940 + }, + { + "epoch": 0.85, + "learning_rate": 0.0002161173487866715, + "loss": 0.7453, + "step": 3960 + }, + { + "epoch": 0.86, + "learning_rate": 0.00021568272365085113, + "loss": 0.7424, + "step": 3980 + }, + { + "epoch": 0.86, + "learning_rate": 0.00021524809851503076, + "loss": 0.7509, + "step": 4000 + }, + { + "epoch": 0.86, + "eval_loss": 0.7488968968391418, + "eval_runtime": 25.492, + "eval_samples_per_second": 78.456, + "eval_steps_per_second": 1.255, + "step": 4000 + }, + { + "epoch": 0.87, + "learning_rate": 0.00021481347337921043, + "loss": 0.7445, + "step": 4020 + }, + { + "epoch": 0.87, + "learning_rate": 0.00021437884824339006, + "loss": 0.74, + "step": 4040 + }, + { + "epoch": 0.88, + "learning_rate": 0.0002139442231075697, + "loss": 0.7362, + "step": 4060 + }, + { + "epoch": 0.88, + "learning_rate": 0.00021350959797174936, + "loss": 0.7409, + "step": 4080 + }, + { + "epoch": 0.88, + "learning_rate": 0.000213074972835929, + "loss": 0.7315, + "step": 4100 + }, + { + "epoch": 0.89, + "learning_rate": 0.00021264034770010864, + "loss": 0.7488, + "step": 4120 + }, + { + "epoch": 0.89, + "learning_rate": 0.0002122057225642883, + "loss": 0.7375, + "step": 4140 + }, + { + "epoch": 0.9, + "learning_rate": 0.00021177109742846794, + "loss": 0.7481, + "step": 4160 + }, + { + "epoch": 0.9, + "learning_rate": 0.00021133647229264757, + "loss": 0.7524, + "step": 4180 + }, + { + "epoch": 0.91, + "learning_rate": 0.00021092357841361823, + "loss": 0.7403, + "step": 4200 + }, + { + "epoch": 0.91, + "eval_loss": 0.7469983100891113, + "eval_runtime": 25.4847, + "eval_samples_per_second": 78.479, + "eval_steps_per_second": 1.256, + "step": 4200 + }, + { + "epoch": 0.91, + "learning_rate": 0.00021048895327779787, + "loss": 0.7394, + "step": 4220 + }, + { + "epoch": 0.91, + "learning_rate": 0.0002100543281419775, + "loss": 0.7405, + "step": 4240 + }, + { + "epoch": 0.92, + "learning_rate": 0.00020961970300615717, + "loss": 0.7534, + "step": 4260 + }, + { + "epoch": 0.92, + "learning_rate": 0.0002091850778703368, + "loss": 0.7412, + "step": 4280 + }, + { + "epoch": 0.93, + "learning_rate": 0.00020875045273451644, + "loss": 0.7393, + "step": 4300 + }, + { + "epoch": 0.93, + "learning_rate": 0.0002083158275986961, + "loss": 0.7289, + "step": 4320 + }, + { + "epoch": 0.94, + "learning_rate": 0.00020788120246287574, + "loss": 0.7342, + "step": 4340 + }, + { + "epoch": 0.94, + "learning_rate": 0.00020744657732705538, + "loss": 0.7427, + "step": 4360 + }, + { + "epoch": 0.94, + "learning_rate": 0.00020701195219123504, + "loss": 0.7386, + "step": 4380 + }, + { + "epoch": 0.95, + "learning_rate": 0.00020657732705541468, + "loss": 0.7374, + "step": 4400 + }, + { + "epoch": 0.95, + "eval_loss": 0.7451291680335999, + "eval_runtime": 25.461, + "eval_samples_per_second": 78.552, + "eval_steps_per_second": 1.257, + "step": 4400 + }, + { + "epoch": 0.95, + "learning_rate": 0.0002061427019195943, + "loss": 0.7364, + "step": 4420 + }, + { + "epoch": 0.96, + "learning_rate": 0.00020570807678377398, + "loss": 0.7377, + "step": 4440 + }, + { + "epoch": 0.96, + "learning_rate": 0.0002052734516479536, + "loss": 0.7391, + "step": 4460 + }, + { + "epoch": 0.97, + "learning_rate": 0.00020483882651213325, + "loss": 0.731, + "step": 4480 + }, + { + "epoch": 0.97, + "learning_rate": 0.0002044042013763129, + "loss": 0.735, + "step": 4500 + }, + { + "epoch": 0.98, + "learning_rate": 0.00020396957624049255, + "loss": 0.7344, + "step": 4520 + }, + { + "epoch": 0.98, + "learning_rate": 0.00020353495110467219, + "loss": 0.7355, + "step": 4540 + }, + { + "epoch": 0.98, + "learning_rate": 0.00020310032596885185, + "loss": 0.7357, + "step": 4560 + }, + { + "epoch": 0.99, + "learning_rate": 0.00020266570083303149, + "loss": 0.7377, + "step": 4580 + }, + { + "epoch": 0.99, + "learning_rate": 0.00020223107569721112, + "loss": 0.7438, + "step": 4600 + }, + { + "epoch": 0.99, + "eval_loss": 0.7437875270843506, + "eval_runtime": 25.5255, + "eval_samples_per_second": 78.353, + "eval_steps_per_second": 1.254, + "step": 4600 + }, + { + "epoch": 1.0, + "learning_rate": 0.00020179645056139079, + "loss": 0.7343, + "step": 4620 + }, + { + "epoch": 1.0, + "learning_rate": 0.00020136182542557042, + "loss": 0.7473, + "step": 4640 + }, + { + "epoch": 1.01, + "learning_rate": 0.00020092720028975006, + "loss": 0.7305, + "step": 4660 + }, + { + "epoch": 1.01, + "learning_rate": 0.00020049257515392972, + "loss": 0.7284, + "step": 4680 + }, + { + "epoch": 1.01, + "learning_rate": 0.00020005795001810936, + "loss": 0.7335, + "step": 4700 + }, + { + "epoch": 1.02, + "learning_rate": 0.000199623324882289, + "loss": 0.7282, + "step": 4720 + }, + { + "epoch": 1.02, + "learning_rate": 0.00019918869974646866, + "loss": 0.7337, + "step": 4740 + }, + { + "epoch": 1.03, + "learning_rate": 0.0001987540746106483, + "loss": 0.7195, + "step": 4760 + }, + { + "epoch": 1.03, + "learning_rate": 0.00019831944947482793, + "loss": 0.7327, + "step": 4780 + }, + { + "epoch": 1.04, + "learning_rate": 0.0001978848243390076, + "loss": 0.7259, + "step": 4800 + }, + { + "epoch": 1.04, + "eval_loss": 0.7413464188575745, + "eval_runtime": 25.4959, + "eval_samples_per_second": 78.444, + "eval_steps_per_second": 1.255, + "step": 4800 + }, + { + "epoch": 1.04, + "learning_rate": 0.00019745019920318723, + "loss": 0.7263, + "step": 4820 + }, + { + "epoch": 1.04, + "learning_rate": 0.00019701557406736687, + "loss": 0.7341, + "step": 4840 + }, + { + "epoch": 1.05, + "learning_rate": 0.00019658094893154653, + "loss": 0.7406, + "step": 4860 + }, + { + "epoch": 1.05, + "learning_rate": 0.00019614632379572617, + "loss": 0.7309, + "step": 4880 + }, + { + "epoch": 1.06, + "learning_rate": 0.0001957116986599058, + "loss": 0.7274, + "step": 4900 + }, + { + "epoch": 1.06, + "learning_rate": 0.00019527707352408547, + "loss": 0.7241, + "step": 4920 + }, + { + "epoch": 1.07, + "learning_rate": 0.0001948424483882651, + "loss": 0.7368, + "step": 4940 + }, + { + "epoch": 1.07, + "learning_rate": 0.00019440782325244474, + "loss": 0.7445, + "step": 4960 + }, + { + "epoch": 1.07, + "learning_rate": 0.0001939731981166244, + "loss": 0.7347, + "step": 4980 + }, + { + "epoch": 1.08, + "learning_rate": 0.00019353857298080404, + "loss": 0.7436, + "step": 5000 + }, + { + "epoch": 1.08, + "eval_loss": 0.7399871945381165, + "eval_runtime": 25.5032, + "eval_samples_per_second": 78.422, + "eval_steps_per_second": 1.255, + "step": 5000 + }, + { + "epoch": 1.08, + "learning_rate": 0.00019310394784498368, + "loss": 0.7248, + "step": 5020 + }, + { + "epoch": 1.09, + "learning_rate": 0.00019266932270916334, + "loss": 0.7374, + "step": 5040 + }, + { + "epoch": 1.09, + "learning_rate": 0.00019223469757334298, + "loss": 0.7187, + "step": 5060 + }, + { + "epoch": 1.1, + "learning_rate": 0.00019180007243752261, + "loss": 0.7381, + "step": 5080 + }, + { + "epoch": 1.1, + "learning_rate": 0.00019136544730170228, + "loss": 0.7389, + "step": 5100 + }, + { + "epoch": 1.1, + "learning_rate": 0.00019093082216588191, + "loss": 0.7343, + "step": 5120 + }, + { + "epoch": 1.11, + "learning_rate": 0.00019049619703006155, + "loss": 0.7323, + "step": 5140 + }, + { + "epoch": 1.11, + "learning_rate": 0.00019006157189424121, + "loss": 0.723, + "step": 5160 + }, + { + "epoch": 1.12, + "learning_rate": 0.00018962694675842085, + "loss": 0.7236, + "step": 5180 + }, + { + "epoch": 1.12, + "learning_rate": 0.0001891923216226005, + "loss": 0.7399, + "step": 5200 + }, + { + "epoch": 1.12, + "eval_loss": 0.7393975257873535, + "eval_runtime": 25.6137, + "eval_samples_per_second": 78.083, + "eval_steps_per_second": 1.249, + "step": 5200 + }, + { + "epoch": 1.13, + "learning_rate": 0.00018875769648678015, + "loss": 0.7373, + "step": 5220 + }, + { + "epoch": 1.13, + "learning_rate": 0.0001883230713509598, + "loss": 0.7257, + "step": 5240 + }, + { + "epoch": 1.13, + "learning_rate": 0.00018788844621513942, + "loss": 0.7261, + "step": 5260 + }, + { + "epoch": 1.14, + "learning_rate": 0.0001874538210793191, + "loss": 0.7302, + "step": 5280 + }, + { + "epoch": 1.14, + "learning_rate": 0.00018701919594349872, + "loss": 0.7337, + "step": 5300 + }, + { + "epoch": 1.15, + "learning_rate": 0.00018658457080767836, + "loss": 0.7237, + "step": 5320 + }, + { + "epoch": 1.15, + "learning_rate": 0.00018614994567185802, + "loss": 0.7238, + "step": 5340 + }, + { + "epoch": 1.16, + "learning_rate": 0.00018571532053603766, + "loss": 0.7287, + "step": 5360 + }, + { + "epoch": 1.16, + "learning_rate": 0.0001852806954002173, + "loss": 0.7237, + "step": 5380 + }, + { + "epoch": 1.17, + "learning_rate": 0.00018484607026439696, + "loss": 0.7256, + "step": 5400 + }, + { + "epoch": 1.17, + "eval_loss": 0.7377527952194214, + "eval_runtime": 25.4964, + "eval_samples_per_second": 78.442, + "eval_steps_per_second": 1.255, + "step": 5400 + }, + { + "epoch": 1.17, + "learning_rate": 0.0001844114451285766, + "loss": 0.7279, + "step": 5420 + }, + { + "epoch": 1.17, + "learning_rate": 0.00018397681999275623, + "loss": 0.7226, + "step": 5440 + }, + { + "epoch": 1.18, + "learning_rate": 0.0001835421948569359, + "loss": 0.7167, + "step": 5460 + }, + { + "epoch": 1.18, + "learning_rate": 0.00018310756972111553, + "loss": 0.7268, + "step": 5480 + }, + { + "epoch": 1.19, + "learning_rate": 0.00018267294458529517, + "loss": 0.7398, + "step": 5500 + }, + { + "epoch": 1.19, + "learning_rate": 0.00018223831944947483, + "loss": 0.7331, + "step": 5520 + }, + { + "epoch": 1.2, + "learning_rate": 0.00018180369431365447, + "loss": 0.7372, + "step": 5540 + }, + { + "epoch": 1.2, + "learning_rate": 0.0001813690691778341, + "loss": 0.7321, + "step": 5560 + }, + { + "epoch": 1.2, + "learning_rate": 0.00018093444404201377, + "loss": 0.7346, + "step": 5580 + }, + { + "epoch": 1.21, + "learning_rate": 0.0001804998189061934, + "loss": 0.722, + "step": 5600 + }, + { + "epoch": 1.21, + "eval_loss": 0.7368175983428955, + "eval_runtime": 25.5045, + "eval_samples_per_second": 78.417, + "eval_steps_per_second": 1.255, + "step": 5600 + }, + { + "epoch": 1.21, + "learning_rate": 0.00018006519377037304, + "loss": 0.7279, + "step": 5620 + }, + { + "epoch": 1.22, + "learning_rate": 0.0001796305686345527, + "loss": 0.72, + "step": 5640 + }, + { + "epoch": 1.22, + "learning_rate": 0.00017919594349873234, + "loss": 0.7295, + "step": 5660 + }, + { + "epoch": 1.23, + "learning_rate": 0.00017876131836291198, + "loss": 0.7245, + "step": 5680 + }, + { + "epoch": 1.23, + "learning_rate": 0.00017832669322709164, + "loss": 0.7418, + "step": 5700 + }, + { + "epoch": 1.23, + "learning_rate": 0.00017789206809127128, + "loss": 0.7317, + "step": 5720 + }, + { + "epoch": 1.24, + "learning_rate": 0.00017745744295545092, + "loss": 0.7303, + "step": 5740 + }, + { + "epoch": 1.24, + "learning_rate": 0.00017702281781963058, + "loss": 0.7332, + "step": 5760 + }, + { + "epoch": 1.25, + "learning_rate": 0.00017658819268381022, + "loss": 0.7202, + "step": 5780 + }, + { + "epoch": 1.25, + "learning_rate": 0.00017615356754798983, + "loss": 0.7238, + "step": 5800 + }, + { + "epoch": 1.25, + "eval_loss": 0.7348505854606628, + "eval_runtime": 25.509, + "eval_samples_per_second": 78.404, + "eval_steps_per_second": 1.254, + "step": 5800 + }, + { + "epoch": 1.26, + "learning_rate": 0.00017571894241216946, + "loss": 0.724, + "step": 5820 + }, + { + "epoch": 1.26, + "learning_rate": 0.00017528431727634913, + "loss": 0.7258, + "step": 5840 + }, + { + "epoch": 1.26, + "learning_rate": 0.00017484969214052876, + "loss": 0.7217, + "step": 5860 + }, + { + "epoch": 1.27, + "learning_rate": 0.0001744150670047084, + "loss": 0.7209, + "step": 5880 + }, + { + "epoch": 1.27, + "learning_rate": 0.00017398044186888806, + "loss": 0.7276, + "step": 5900 + }, + { + "epoch": 1.28, + "learning_rate": 0.0001735458167330677, + "loss": 0.7287, + "step": 5920 + }, + { + "epoch": 1.28, + "learning_rate": 0.00017311119159724733, + "loss": 0.7244, + "step": 5940 + }, + { + "epoch": 1.29, + "learning_rate": 0.000172676566461427, + "loss": 0.7247, + "step": 5960 + }, + { + "epoch": 1.29, + "learning_rate": 0.00017224194132560663, + "loss": 0.7191, + "step": 5980 + }, + { + "epoch": 1.29, + "learning_rate": 0.00017180731618978627, + "loss": 0.7208, + "step": 6000 + }, + { + "epoch": 1.29, + "eval_loss": 0.7340711951255798, + "eval_runtime": 25.4669, + "eval_samples_per_second": 78.533, + "eval_steps_per_second": 1.257, + "step": 6000 + }, + { + "epoch": 1.3, + "learning_rate": 0.00017137269105396593, + "loss": 0.7285, + "step": 6020 + }, + { + "epoch": 1.3, + "learning_rate": 0.00017093806591814557, + "loss": 0.7294, + "step": 6040 + }, + { + "epoch": 1.31, + "learning_rate": 0.0001705034407823252, + "loss": 0.7365, + "step": 6060 + }, + { + "epoch": 1.31, + "learning_rate": 0.00017006881564650487, + "loss": 0.7149, + "step": 6080 + }, + { + "epoch": 1.32, + "learning_rate": 0.0001696341905106845, + "loss": 0.7229, + "step": 6100 + }, + { + "epoch": 1.32, + "learning_rate": 0.00016919956537486414, + "loss": 0.7253, + "step": 6120 + }, + { + "epoch": 1.32, + "learning_rate": 0.0001687649402390438, + "loss": 0.7188, + "step": 6140 + }, + { + "epoch": 1.33, + "learning_rate": 0.00016833031510322344, + "loss": 0.7308, + "step": 6160 + }, + { + "epoch": 1.33, + "learning_rate": 0.00016789568996740308, + "loss": 0.7186, + "step": 6180 + }, + { + "epoch": 1.34, + "learning_rate": 0.00016746106483158274, + "loss": 0.7121, + "step": 6200 + }, + { + "epoch": 1.34, + "eval_loss": 0.7324739694595337, + "eval_runtime": 25.5, + "eval_samples_per_second": 78.431, + "eval_steps_per_second": 1.255, + "step": 6200 + }, + { + "epoch": 1.34, + "learning_rate": 0.00016702643969576238, + "loss": 0.7286, + "step": 6220 + }, + { + "epoch": 1.35, + "learning_rate": 0.00016659181455994202, + "loss": 0.7246, + "step": 6240 + }, + { + "epoch": 1.35, + "learning_rate": 0.00016615718942412168, + "loss": 0.7234, + "step": 6260 + }, + { + "epoch": 1.35, + "learning_rate": 0.00016572256428830132, + "loss": 0.7245, + "step": 6280 + }, + { + "epoch": 1.36, + "learning_rate": 0.00016528793915248095, + "loss": 0.7252, + "step": 6300 + }, + { + "epoch": 1.36, + "learning_rate": 0.00016485331401666062, + "loss": 0.7259, + "step": 6320 + }, + { + "epoch": 1.37, + "learning_rate": 0.00016441868888084025, + "loss": 0.7173, + "step": 6340 + }, + { + "epoch": 1.37, + "learning_rate": 0.0001639840637450199, + "loss": 0.7222, + "step": 6360 + }, + { + "epoch": 1.38, + "learning_rate": 0.00016354943860919955, + "loss": 0.7113, + "step": 6380 + }, + { + "epoch": 1.38, + "learning_rate": 0.0001631148134733792, + "loss": 0.72, + "step": 6400 + }, + { + "epoch": 1.38, + "eval_loss": 0.7319995164871216, + "eval_runtime": 25.5112, + "eval_samples_per_second": 78.397, + "eval_steps_per_second": 1.254, + "step": 6400 + }, + { + "epoch": 1.39, + "learning_rate": 0.00016268018833755883, + "loss": 0.7333, + "step": 6420 + }, + { + "epoch": 1.39, + "learning_rate": 0.0001622455632017385, + "loss": 0.7208, + "step": 6440 + }, + { + "epoch": 1.39, + "learning_rate": 0.00016181093806591813, + "loss": 0.7161, + "step": 6460 + }, + { + "epoch": 1.4, + "learning_rate": 0.00016137631293009776, + "loss": 0.7171, + "step": 6480 + }, + { + "epoch": 1.4, + "learning_rate": 0.00016094168779427743, + "loss": 0.7297, + "step": 6500 + }, + { + "epoch": 1.41, + "learning_rate": 0.00016050706265845706, + "loss": 0.7156, + "step": 6520 + }, + { + "epoch": 1.41, + "learning_rate": 0.0001600724375226367, + "loss": 0.7175, + "step": 6540 + }, + { + "epoch": 1.42, + "learning_rate": 0.00015963781238681636, + "loss": 0.7152, + "step": 6560 + }, + { + "epoch": 1.42, + "learning_rate": 0.000159203187250996, + "loss": 0.7282, + "step": 6580 + }, + { + "epoch": 1.42, + "learning_rate": 0.00015876856211517564, + "loss": 0.722, + "step": 6600 + }, + { + "epoch": 1.42, + "eval_loss": 0.7307416796684265, + "eval_runtime": 25.4967, + "eval_samples_per_second": 78.442, + "eval_steps_per_second": 1.255, + "step": 6600 + }, + { + "epoch": 1.43, + "learning_rate": 0.0001583339369793553, + "loss": 0.7274, + "step": 6620 + }, + { + "epoch": 1.43, + "learning_rate": 0.00015789931184353494, + "loss": 0.7313, + "step": 6640 + }, + { + "epoch": 1.44, + "learning_rate": 0.00015746468670771457, + "loss": 0.7209, + "step": 6660 + }, + { + "epoch": 1.44, + "learning_rate": 0.00015703006157189424, + "loss": 0.7202, + "step": 6680 + }, + { + "epoch": 1.45, + "learning_rate": 0.00015659543643607387, + "loss": 0.7264, + "step": 6700 + }, + { + "epoch": 1.45, + "learning_rate": 0.0001561608113002535, + "loss": 0.7226, + "step": 6720 + }, + { + "epoch": 1.45, + "learning_rate": 0.00015572618616443317, + "loss": 0.711, + "step": 6740 + }, + { + "epoch": 1.46, + "learning_rate": 0.0001552915610286128, + "loss": 0.7216, + "step": 6760 + }, + { + "epoch": 1.46, + "learning_rate": 0.00015485693589279245, + "loss": 0.7184, + "step": 6780 + }, + { + "epoch": 1.47, + "learning_rate": 0.0001544223107569721, + "loss": 0.7216, + "step": 6800 + }, + { + "epoch": 1.47, + "eval_loss": 0.7297094464302063, + "eval_runtime": 25.4826, + "eval_samples_per_second": 78.485, + "eval_steps_per_second": 1.256, + "step": 6800 + }, + { + "epoch": 1.47, + "learning_rate": 0.00015398768562115175, + "loss": 0.7203, + "step": 6820 + }, + { + "epoch": 1.48, + "learning_rate": 0.00015355306048533138, + "loss": 0.7184, + "step": 6840 + }, + { + "epoch": 1.48, + "learning_rate": 0.00015311843534951105, + "loss": 0.7183, + "step": 6860 + }, + { + "epoch": 1.48, + "learning_rate": 0.00015268381021369068, + "loss": 0.7267, + "step": 6880 + }, + { + "epoch": 1.49, + "learning_rate": 0.00015224918507787032, + "loss": 0.7299, + "step": 6900 + }, + { + "epoch": 1.49, + "learning_rate": 0.00015181455994204998, + "loss": 0.719, + "step": 6920 + }, + { + "epoch": 1.5, + "learning_rate": 0.00015137993480622962, + "loss": 0.7229, + "step": 6940 + }, + { + "epoch": 1.5, + "learning_rate": 0.00015094530967040926, + "loss": 0.7231, + "step": 6960 + }, + { + "epoch": 1.51, + "learning_rate": 0.00015051068453458892, + "loss": 0.7279, + "step": 6980 + }, + { + "epoch": 1.51, + "learning_rate": 0.00015007605939876856, + "loss": 0.7252, + "step": 7000 + }, + { + "epoch": 1.51, + "eval_loss": 0.7288112640380859, + "eval_runtime": 25.4887, + "eval_samples_per_second": 78.466, + "eval_steps_per_second": 1.255, + "step": 7000 + }, + { + "epoch": 1.51, + "learning_rate": 0.0001496414342629482, + "loss": 0.7148, + "step": 7020 + }, + { + "epoch": 1.52, + "learning_rate": 0.00014920680912712786, + "loss": 0.7147, + "step": 7040 + }, + { + "epoch": 1.52, + "learning_rate": 0.0001487721839913075, + "loss": 0.7209, + "step": 7060 + }, + { + "epoch": 1.53, + "learning_rate": 0.00014833755885548713, + "loss": 0.724, + "step": 7080 + }, + { + "epoch": 1.53, + "learning_rate": 0.00014790293371966676, + "loss": 0.7256, + "step": 7100 + }, + { + "epoch": 1.54, + "learning_rate": 0.0001474683085838464, + "loss": 0.7246, + "step": 7120 + }, + { + "epoch": 1.54, + "learning_rate": 0.00014703368344802606, + "loss": 0.7103, + "step": 7140 + }, + { + "epoch": 1.54, + "learning_rate": 0.0001465990583122057, + "loss": 0.7223, + "step": 7160 + }, + { + "epoch": 1.55, + "learning_rate": 0.00014616443317638534, + "loss": 0.7149, + "step": 7180 + }, + { + "epoch": 1.55, + "learning_rate": 0.000145729808040565, + "loss": 0.7214, + "step": 7200 + }, + { + "epoch": 1.55, + "eval_loss": 0.7280930876731873, + "eval_runtime": 25.4883, + "eval_samples_per_second": 78.467, + "eval_steps_per_second": 1.255, + "step": 7200 + }, + { + "epoch": 1.56, + "learning_rate": 0.00014529518290474464, + "loss": 0.7118, + "step": 7220 + }, + { + "epoch": 1.56, + "learning_rate": 0.00014486055776892427, + "loss": 0.7171, + "step": 7240 + }, + { + "epoch": 1.57, + "learning_rate": 0.00014442593263310394, + "loss": 0.7191, + "step": 7260 + }, + { + "epoch": 1.57, + "learning_rate": 0.00014399130749728357, + "loss": 0.7155, + "step": 7280 + }, + { + "epoch": 1.57, + "learning_rate": 0.0001435566823614632, + "loss": 0.7198, + "step": 7300 + }, + { + "epoch": 1.58, + "learning_rate": 0.00014312205722564287, + "loss": 0.7188, + "step": 7320 + }, + { + "epoch": 1.58, + "learning_rate": 0.0001426874320898225, + "loss": 0.7236, + "step": 7340 + }, + { + "epoch": 1.59, + "learning_rate": 0.00014225280695400215, + "loss": 0.712, + "step": 7360 + }, + { + "epoch": 1.59, + "learning_rate": 0.0001418181818181818, + "loss": 0.7181, + "step": 7380 + }, + { + "epoch": 1.6, + "learning_rate": 0.00014138355668236145, + "loss": 0.7198, + "step": 7400 + }, + { + "epoch": 1.6, + "eval_loss": 0.7276077270507812, + "eval_runtime": 25.4843, + "eval_samples_per_second": 78.48, + "eval_steps_per_second": 1.256, + "step": 7400 + }, + { + "epoch": 1.6, + "learning_rate": 0.00014094893154654108, + "loss": 0.7187, + "step": 7420 + }, + { + "epoch": 1.61, + "learning_rate": 0.00014051430641072075, + "loss": 0.7153, + "step": 7440 + }, + { + "epoch": 1.61, + "learning_rate": 0.00014007968127490038, + "loss": 0.7208, + "step": 7460 + }, + { + "epoch": 1.61, + "learning_rate": 0.00013964505613908002, + "loss": 0.7153, + "step": 7480 + }, + { + "epoch": 1.62, + "learning_rate": 0.00013921043100325968, + "loss": 0.7207, + "step": 7500 + }, + { + "epoch": 1.62, + "learning_rate": 0.00013877580586743932, + "loss": 0.7167, + "step": 7520 + }, + { + "epoch": 1.63, + "learning_rate": 0.00013834118073161896, + "loss": 0.7183, + "step": 7540 + }, + { + "epoch": 1.63, + "learning_rate": 0.00013792828685258964, + "loss": 0.7196, + "step": 7560 + }, + { + "epoch": 1.64, + "learning_rate": 0.00013749366171676928, + "loss": 0.7233, + "step": 7580 + }, + { + "epoch": 1.64, + "learning_rate": 0.00013705903658094894, + "loss": 0.7237, + "step": 7600 + }, + { + "epoch": 1.64, + "eval_loss": 0.7260885238647461, + "eval_runtime": 25.503, + "eval_samples_per_second": 78.422, + "eval_steps_per_second": 1.255, + "step": 7600 + }, + { + "epoch": 1.64, + "learning_rate": 0.00013662441144512855, + "loss": 0.72, + "step": 7620 + }, + { + "epoch": 1.65, + "learning_rate": 0.0001361897863093082, + "loss": 0.7094, + "step": 7640 + }, + { + "epoch": 1.65, + "learning_rate": 0.00013575516117348785, + "loss": 0.7111, + "step": 7660 + }, + { + "epoch": 1.66, + "learning_rate": 0.00013532053603766749, + "loss": 0.7182, + "step": 7680 + }, + { + "epoch": 1.66, + "learning_rate": 0.00013488591090184715, + "loss": 0.7182, + "step": 7700 + }, + { + "epoch": 1.67, + "learning_rate": 0.00013445128576602679, + "loss": 0.7183, + "step": 7720 + }, + { + "epoch": 1.67, + "learning_rate": 0.00013401666063020642, + "loss": 0.7112, + "step": 7740 + }, + { + "epoch": 1.67, + "learning_rate": 0.00013358203549438609, + "loss": 0.7183, + "step": 7760 + }, + { + "epoch": 1.68, + "learning_rate": 0.00013314741035856572, + "loss": 0.7152, + "step": 7780 + }, + { + "epoch": 1.68, + "learning_rate": 0.00013271278522274536, + "loss": 0.7233, + "step": 7800 + }, + { + "epoch": 1.68, + "eval_loss": 0.7252987027168274, + "eval_runtime": 25.5066, + "eval_samples_per_second": 78.411, + "eval_steps_per_second": 1.255, + "step": 7800 + }, + { + "epoch": 1.69, + "learning_rate": 0.00013227816008692502, + "loss": 0.7124, + "step": 7820 + }, + { + "epoch": 1.69, + "learning_rate": 0.00013184353495110466, + "loss": 0.7109, + "step": 7840 + }, + { + "epoch": 1.7, + "learning_rate": 0.0001314089098152843, + "loss": 0.7132, + "step": 7860 + }, + { + "epoch": 1.7, + "learning_rate": 0.00013097428467946396, + "loss": 0.7157, + "step": 7880 + }, + { + "epoch": 1.7, + "learning_rate": 0.0001305396595436436, + "loss": 0.7237, + "step": 7900 + }, + { + "epoch": 1.71, + "learning_rate": 0.00013010503440782323, + "loss": 0.7176, + "step": 7920 + }, + { + "epoch": 1.71, + "learning_rate": 0.0001296704092720029, + "loss": 0.7199, + "step": 7940 + }, + { + "epoch": 1.72, + "learning_rate": 0.00012923578413618253, + "loss": 0.7119, + "step": 7960 + }, + { + "epoch": 1.72, + "learning_rate": 0.00012880115900036217, + "loss": 0.717, + "step": 7980 + }, + { + "epoch": 1.73, + "learning_rate": 0.00012836653386454183, + "loss": 0.7155, + "step": 8000 + }, + { + "epoch": 1.73, + "eval_loss": 0.7248360514640808, + "eval_runtime": 25.5301, + "eval_samples_per_second": 78.339, + "eval_steps_per_second": 1.253, + "step": 8000 + }, + { + "epoch": 1.73, + "learning_rate": 0.00012793190872872147, + "loss": 0.7085, + "step": 8020 + }, + { + "epoch": 1.73, + "learning_rate": 0.0001274972835929011, + "loss": 0.7174, + "step": 8040 + }, + { + "epoch": 1.74, + "learning_rate": 0.00012706265845708077, + "loss": 0.7224, + "step": 8060 + }, + { + "epoch": 1.74, + "learning_rate": 0.0001266280333212604, + "loss": 0.7169, + "step": 8080 + }, + { + "epoch": 1.75, + "learning_rate": 0.00012619340818544004, + "loss": 0.7191, + "step": 8100 + }, + { + "epoch": 1.75, + "learning_rate": 0.0001257587830496197, + "loss": 0.7179, + "step": 8120 + }, + { + "epoch": 1.76, + "learning_rate": 0.00012532415791379934, + "loss": 0.7208, + "step": 8140 + }, + { + "epoch": 1.76, + "learning_rate": 0.00012488953277797898, + "loss": 0.7168, + "step": 8160 + }, + { + "epoch": 1.76, + "learning_rate": 0.00012445490764215864, + "loss": 0.7101, + "step": 8180 + }, + { + "epoch": 1.77, + "learning_rate": 0.00012402028250633828, + "loss": 0.7167, + "step": 8200 + }, + { + "epoch": 1.77, + "eval_loss": 0.7242170572280884, + "eval_runtime": 25.4873, + "eval_samples_per_second": 78.47, + "eval_steps_per_second": 1.256, + "step": 8200 + }, + { + "epoch": 1.77, + "learning_rate": 0.00012358565737051791, + "loss": 0.7062, + "step": 8220 + }, + { + "epoch": 1.78, + "learning_rate": 0.00012315103223469758, + "loss": 0.7177, + "step": 8240 + }, + { + "epoch": 1.78, + "learning_rate": 0.00012271640709887721, + "loss": 0.7035, + "step": 8260 + }, + { + "epoch": 1.79, + "learning_rate": 0.00012228178196305685, + "loss": 0.7157, + "step": 8280 + }, + { + "epoch": 1.79, + "learning_rate": 0.0001218471568272365, + "loss": 0.7196, + "step": 8300 + }, + { + "epoch": 1.8, + "learning_rate": 0.00012141253169141615, + "loss": 0.7105, + "step": 8320 + }, + { + "epoch": 1.8, + "learning_rate": 0.00012097790655559579, + "loss": 0.7105, + "step": 8340 + }, + { + "epoch": 1.8, + "learning_rate": 0.00012054328141977544, + "loss": 0.7139, + "step": 8360 + }, + { + "epoch": 1.81, + "learning_rate": 0.00012010865628395509, + "loss": 0.7215, + "step": 8380 + }, + { + "epoch": 1.81, + "learning_rate": 0.00011967403114813472, + "loss": 0.725, + "step": 8400 + }, + { + "epoch": 1.81, + "eval_loss": 0.7237139344215393, + "eval_runtime": 25.506, + "eval_samples_per_second": 78.413, + "eval_steps_per_second": 1.255, + "step": 8400 + }, + { + "epoch": 1.82, + "learning_rate": 0.00011923940601231437, + "loss": 0.7107, + "step": 8420 + }, + { + "epoch": 1.82, + "learning_rate": 0.00011880478087649402, + "loss": 0.7095, + "step": 8440 + }, + { + "epoch": 1.83, + "learning_rate": 0.00011837015574067366, + "loss": 0.7061, + "step": 8460 + }, + { + "epoch": 1.83, + "learning_rate": 0.0001179355306048533, + "loss": 0.716, + "step": 8480 + }, + { + "epoch": 1.83, + "learning_rate": 0.00011750090546903295, + "loss": 0.7203, + "step": 8500 + }, + { + "epoch": 1.84, + "learning_rate": 0.00011706628033321258, + "loss": 0.7098, + "step": 8520 + }, + { + "epoch": 1.84, + "learning_rate": 0.00011663165519739223, + "loss": 0.7104, + "step": 8540 + }, + { + "epoch": 1.85, + "learning_rate": 0.00011619703006157188, + "loss": 0.7051, + "step": 8560 + }, + { + "epoch": 1.85, + "learning_rate": 0.00011576240492575152, + "loss": 0.7198, + "step": 8580 + }, + { + "epoch": 1.86, + "learning_rate": 0.00011532777978993117, + "loss": 0.7175, + "step": 8600 + }, + { + "epoch": 1.86, + "eval_loss": 0.7230754494667053, + "eval_runtime": 25.5133, + "eval_samples_per_second": 78.39, + "eval_steps_per_second": 1.254, + "step": 8600 + }, + { + "epoch": 1.86, + "learning_rate": 0.00011489315465411082, + "loss": 0.7046, + "step": 8620 + }, + { + "epoch": 1.86, + "learning_rate": 0.00011445852951829046, + "loss": 0.7176, + "step": 8640 + }, + { + "epoch": 1.87, + "learning_rate": 0.0001140239043824701, + "loss": 0.7193, + "step": 8660 + }, + { + "epoch": 1.87, + "learning_rate": 0.00011358927924664976, + "loss": 0.7046, + "step": 8680 + }, + { + "epoch": 1.88, + "learning_rate": 0.00011315465411082939, + "loss": 0.7116, + "step": 8700 + }, + { + "epoch": 1.88, + "learning_rate": 0.00011274176023180006, + "loss": 0.7152, + "step": 8720 + }, + { + "epoch": 1.89, + "learning_rate": 0.00011230713509597971, + "loss": 0.7164, + "step": 8740 + }, + { + "epoch": 1.89, + "learning_rate": 0.00011187250996015936, + "loss": 0.7192, + "step": 8760 + }, + { + "epoch": 1.89, + "learning_rate": 0.000111437884824339, + "loss": 0.7124, + "step": 8780 + }, + { + "epoch": 1.9, + "learning_rate": 0.00011100325968851865, + "loss": 0.7032, + "step": 8800 + }, + { + "epoch": 1.9, + "eval_loss": 0.7217770218849182, + "eval_runtime": 25.4723, + "eval_samples_per_second": 78.517, + "eval_steps_per_second": 1.256, + "step": 8800 + }, + { + "epoch": 1.9, + "learning_rate": 0.0001105686345526983, + "loss": 0.7157, + "step": 8820 + }, + { + "epoch": 1.91, + "learning_rate": 0.00011013400941687794, + "loss": 0.7115, + "step": 8840 + }, + { + "epoch": 1.91, + "learning_rate": 0.00010969938428105759, + "loss": 0.7137, + "step": 8860 + }, + { + "epoch": 1.92, + "learning_rate": 0.00010926475914523724, + "loss": 0.7176, + "step": 8880 + }, + { + "epoch": 1.92, + "learning_rate": 0.00010883013400941687, + "loss": 0.7081, + "step": 8900 + }, + { + "epoch": 1.92, + "learning_rate": 0.00010839550887359652, + "loss": 0.7233, + "step": 8920 + }, + { + "epoch": 1.93, + "learning_rate": 0.00010796088373777617, + "loss": 0.7058, + "step": 8940 + }, + { + "epoch": 1.93, + "learning_rate": 0.00010752625860195581, + "loss": 0.7154, + "step": 8960 + }, + { + "epoch": 1.94, + "learning_rate": 0.00010709163346613546, + "loss": 0.7135, + "step": 8980 + }, + { + "epoch": 1.94, + "learning_rate": 0.00010665700833031508, + "loss": 0.7078, + "step": 9000 + }, + { + "epoch": 1.94, + "eval_loss": 0.7215875387191772, + "eval_runtime": 25.484, + "eval_samples_per_second": 78.481, + "eval_steps_per_second": 1.256, + "step": 9000 + }, + { + "epoch": 1.95, + "learning_rate": 0.00010622238319449473, + "loss": 0.7061, + "step": 9020 + }, + { + "epoch": 1.95, + "learning_rate": 0.00010578775805867438, + "loss": 0.7174, + "step": 9040 + }, + { + "epoch": 1.95, + "learning_rate": 0.00010535313292285402, + "loss": 0.7132, + "step": 9060 + }, + { + "epoch": 1.96, + "learning_rate": 0.00010491850778703367, + "loss": 0.7247, + "step": 9080 + }, + { + "epoch": 1.96, + "learning_rate": 0.00010448388265121332, + "loss": 0.7064, + "step": 9100 + }, + { + "epoch": 1.97, + "learning_rate": 0.00010404925751539295, + "loss": 0.7098, + "step": 9120 + }, + { + "epoch": 1.97, + "learning_rate": 0.0001036146323795726, + "loss": 0.708, + "step": 9140 + }, + { + "epoch": 1.98, + "learning_rate": 0.00010318000724375225, + "loss": 0.7144, + "step": 9160 + }, + { + "epoch": 1.98, + "learning_rate": 0.00010274538210793189, + "loss": 0.7151, + "step": 9180 + }, + { + "epoch": 1.98, + "learning_rate": 0.00010231075697211154, + "loss": 0.718, + "step": 9200 + }, + { + "epoch": 1.98, + "eval_loss": 0.7208251357078552, + "eval_runtime": 25.5022, + "eval_samples_per_second": 78.425, + "eval_steps_per_second": 1.255, + "step": 9200 + }, + { + "epoch": 1.99, + "learning_rate": 0.00010187613183629119, + "loss": 0.7108, + "step": 9220 + }, + { + "epoch": 1.99, + "learning_rate": 0.00010144150670047083, + "loss": 0.6952, + "step": 9240 + }, + { + "epoch": 2.0, + "learning_rate": 0.00010100688156465048, + "loss": 0.7013, + "step": 9260 + }, + { + "epoch": 2.0, + "learning_rate": 0.00010057225642883013, + "loss": 0.7013, + "step": 9280 + }, + { + "epoch": 2.01, + "learning_rate": 0.00010013763129300976, + "loss": 0.7049, + "step": 9300 + }, + { + "epoch": 2.01, + "learning_rate": 9.970300615718941e-05, + "loss": 0.7093, + "step": 9320 + }, + { + "epoch": 2.02, + "learning_rate": 9.926838102136906e-05, + "loss": 0.713, + "step": 9340 + }, + { + "epoch": 2.02, + "learning_rate": 9.88337558855487e-05, + "loss": 0.7108, + "step": 9360 + }, + { + "epoch": 2.02, + "learning_rate": 9.839913074972835e-05, + "loss": 0.7115, + "step": 9380 + }, + { + "epoch": 2.03, + "learning_rate": 9.7964505613908e-05, + "loss": 0.7119, + "step": 9400 + }, + { + "epoch": 2.03, + "eval_loss": 0.7202969789505005, + "eval_runtime": 25.504, + "eval_samples_per_second": 78.419, + "eval_steps_per_second": 1.255, + "step": 9400 + }, + { + "epoch": 2.03, + "learning_rate": 9.752988047808764e-05, + "loss": 0.7107, + "step": 9420 + }, + { + "epoch": 2.04, + "learning_rate": 9.709525534226729e-05, + "loss": 0.7065, + "step": 9440 + }, + { + "epoch": 2.04, + "learning_rate": 9.666063020644694e-05, + "loss": 0.7121, + "step": 9460 + }, + { + "epoch": 2.05, + "learning_rate": 9.622600507062657e-05, + "loss": 0.7163, + "step": 9480 + }, + { + "epoch": 2.05, + "learning_rate": 9.579137993480622e-05, + "loss": 0.7026, + "step": 9500 + }, + { + "epoch": 2.05, + "learning_rate": 9.535675479898587e-05, + "loss": 0.7158, + "step": 9520 + }, + { + "epoch": 2.06, + "learning_rate": 9.492212966316551e-05, + "loss": 0.7016, + "step": 9540 + }, + { + "epoch": 2.06, + "learning_rate": 9.448750452734516e-05, + "loss": 0.7149, + "step": 9560 + }, + { + "epoch": 2.07, + "learning_rate": 9.405287939152481e-05, + "loss": 0.7079, + "step": 9580 + }, + { + "epoch": 2.07, + "learning_rate": 9.361825425570445e-05, + "loss": 0.709, + "step": 9600 + }, + { + "epoch": 2.07, + "eval_loss": 0.7194134593009949, + "eval_runtime": 25.5286, + "eval_samples_per_second": 78.343, + "eval_steps_per_second": 1.253, + "step": 9600 + }, + { + "epoch": 2.08, + "learning_rate": 9.31836291198841e-05, + "loss": 0.7127, + "step": 9620 + }, + { + "epoch": 2.08, + "learning_rate": 9.274900398406375e-05, + "loss": 0.7037, + "step": 9640 + }, + { + "epoch": 2.08, + "learning_rate": 9.231437884824338e-05, + "loss": 0.7114, + "step": 9660 + }, + { + "epoch": 2.09, + "learning_rate": 9.187975371242303e-05, + "loss": 0.706, + "step": 9680 + }, + { + "epoch": 2.09, + "learning_rate": 9.144512857660268e-05, + "loss": 0.7026, + "step": 9700 + }, + { + "epoch": 2.1, + "learning_rate": 9.101050344078232e-05, + "loss": 0.7079, + "step": 9720 + }, + { + "epoch": 2.1, + "learning_rate": 9.057587830496197e-05, + "loss": 0.7053, + "step": 9740 + }, + { + "epoch": 2.11, + "learning_rate": 9.014125316914162e-05, + "loss": 0.7125, + "step": 9760 + }, + { + "epoch": 2.11, + "learning_rate": 8.970662803332126e-05, + "loss": 0.7045, + "step": 9780 + }, + { + "epoch": 2.11, + "learning_rate": 8.92720028975009e-05, + "loss": 0.7109, + "step": 9800 + }, + { + "epoch": 2.11, + "eval_loss": 0.7186465859413147, + "eval_runtime": 25.5049, + "eval_samples_per_second": 78.416, + "eval_steps_per_second": 1.255, + "step": 9800 + }, + { + "epoch": 2.12, + "learning_rate": 8.883737776168056e-05, + "loss": 0.7035, + "step": 9820 + }, + { + "epoch": 2.12, + "learning_rate": 8.840275262586019e-05, + "loss": 0.7073, + "step": 9840 + }, + { + "epoch": 2.13, + "learning_rate": 8.796812749003983e-05, + "loss": 0.7114, + "step": 9860 + }, + { + "epoch": 2.13, + "learning_rate": 8.753350235421946e-05, + "loss": 0.7066, + "step": 9880 + }, + { + "epoch": 2.14, + "learning_rate": 8.709887721839911e-05, + "loss": 0.7055, + "step": 9900 + }, + { + "epoch": 2.14, + "learning_rate": 8.666425208257877e-05, + "loss": 0.7064, + "step": 9920 + }, + { + "epoch": 2.14, + "learning_rate": 8.62296269467584e-05, + "loss": 0.7154, + "step": 9940 + }, + { + "epoch": 2.15, + "learning_rate": 8.579500181093805e-05, + "loss": 0.7099, + "step": 9960 + }, + { + "epoch": 2.15, + "learning_rate": 8.53603766751177e-05, + "loss": 0.7112, + "step": 9980 + }, + { + "epoch": 2.16, + "learning_rate": 8.492575153929734e-05, + "loss": 0.7086, + "step": 10000 + }, + { + "epoch": 2.16, + "eval_loss": 0.7181739211082458, + "eval_runtime": 25.5087, + "eval_samples_per_second": 78.405, + "eval_steps_per_second": 1.254, + "step": 10000 + }, + { + "epoch": 2.16, + "learning_rate": 8.449112640347699e-05, + "loss": 0.7155, + "step": 10020 + }, + { + "epoch": 2.17, + "learning_rate": 8.405650126765664e-05, + "loss": 0.7097, + "step": 10040 + }, + { + "epoch": 2.17, + "learning_rate": 8.362187613183627e-05, + "loss": 0.7025, + "step": 10060 + }, + { + "epoch": 2.17, + "learning_rate": 8.318725099601592e-05, + "loss": 0.7065, + "step": 10080 + }, + { + "epoch": 2.18, + "learning_rate": 8.275262586019557e-05, + "loss": 0.6982, + "step": 10100 + }, + { + "epoch": 2.18, + "learning_rate": 8.231800072437521e-05, + "loss": 0.7039, + "step": 10120 + }, + { + "epoch": 2.19, + "learning_rate": 8.188337558855486e-05, + "loss": 0.7097, + "step": 10140 + }, + { + "epoch": 2.19, + "learning_rate": 8.144875045273451e-05, + "loss": 0.7089, + "step": 10160 + }, + { + "epoch": 2.2, + "learning_rate": 8.101412531691415e-05, + "loss": 0.7018, + "step": 10180 + }, + { + "epoch": 2.2, + "learning_rate": 8.05795001810938e-05, + "loss": 0.7025, + "step": 10200 + }, + { + "epoch": 2.2, + "eval_loss": 0.7179592251777649, + "eval_runtime": 25.4993, + "eval_samples_per_second": 78.433, + "eval_steps_per_second": 1.255, + "step": 10200 + }, + { + "epoch": 2.2, + "learning_rate": 8.014487504527345e-05, + "loss": 0.7067, + "step": 10220 + }, + { + "epoch": 2.21, + "learning_rate": 7.971024990945308e-05, + "loss": 0.71, + "step": 10240 + }, + { + "epoch": 2.21, + "learning_rate": 7.927562477363273e-05, + "loss": 0.7255, + "step": 10260 + }, + { + "epoch": 2.22, + "learning_rate": 7.884099963781238e-05, + "loss": 0.7065, + "step": 10280 + }, + { + "epoch": 2.22, + "learning_rate": 7.840637450199202e-05, + "loss": 0.712, + "step": 10300 + }, + { + "epoch": 2.23, + "learning_rate": 7.797174936617167e-05, + "loss": 0.7132, + "step": 10320 + }, + { + "epoch": 2.23, + "learning_rate": 7.753712423035132e-05, + "loss": 0.7106, + "step": 10340 + }, + { + "epoch": 2.24, + "learning_rate": 7.710249909453096e-05, + "loss": 0.708, + "step": 10360 + }, + { + "epoch": 2.24, + "learning_rate": 7.666787395871061e-05, + "loss": 0.7054, + "step": 10380 + }, + { + "epoch": 2.24, + "learning_rate": 7.623324882289026e-05, + "loss": 0.7087, + "step": 10400 + }, + { + "epoch": 2.24, + "eval_loss": 0.717901349067688, + "eval_runtime": 25.4862, + "eval_samples_per_second": 78.474, + "eval_steps_per_second": 1.256, + "step": 10400 + }, + { + "epoch": 2.25, + "learning_rate": 7.57986236870699e-05, + "loss": 0.7014, + "step": 10420 + }, + { + "epoch": 2.25, + "learning_rate": 7.536399855124954e-05, + "loss": 0.7103, + "step": 10440 + }, + { + "epoch": 2.26, + "learning_rate": 7.49293734154292e-05, + "loss": 0.7089, + "step": 10460 + }, + { + "epoch": 2.26, + "learning_rate": 7.449474827960883e-05, + "loss": 0.704, + "step": 10480 + }, + { + "epoch": 2.27, + "learning_rate": 7.406012314378847e-05, + "loss": 0.7074, + "step": 10500 + }, + { + "epoch": 2.27, + "learning_rate": 7.362549800796812e-05, + "loss": 0.7094, + "step": 10520 + }, + { + "epoch": 2.27, + "learning_rate": 7.319087287214777e-05, + "loss": 0.7069, + "step": 10540 + }, + { + "epoch": 2.28, + "learning_rate": 7.27562477363274e-05, + "loss": 0.7081, + "step": 10560 + }, + { + "epoch": 2.28, + "learning_rate": 7.232162260050705e-05, + "loss": 0.7036, + "step": 10580 + }, + { + "epoch": 2.29, + "learning_rate": 7.18869974646867e-05, + "loss": 0.6984, + "step": 10600 + }, + { + "epoch": 2.29, + "eval_loss": 0.7175166010856628, + "eval_runtime": 25.5016, + "eval_samples_per_second": 78.426, + "eval_steps_per_second": 1.255, + "step": 10600 + }, + { + "epoch": 2.29, + "learning_rate": 7.145237232886634e-05, + "loss": 0.7097, + "step": 10620 + }, + { + "epoch": 2.3, + "learning_rate": 7.101774719304599e-05, + "loss": 0.7143, + "step": 10640 + }, + { + "epoch": 2.3, + "learning_rate": 7.058312205722564e-05, + "loss": 0.7099, + "step": 10660 + }, + { + "epoch": 2.3, + "learning_rate": 7.014849692140528e-05, + "loss": 0.6994, + "step": 10680 + }, + { + "epoch": 2.31, + "learning_rate": 6.971387178558493e-05, + "loss": 0.7129, + "step": 10700 + }, + { + "epoch": 2.31, + "learning_rate": 6.927924664976458e-05, + "loss": 0.7067, + "step": 10720 + }, + { + "epoch": 2.32, + "learning_rate": 6.884462151394421e-05, + "loss": 0.7044, + "step": 10740 + }, + { + "epoch": 2.32, + "learning_rate": 6.840999637812386e-05, + "loss": 0.7092, + "step": 10760 + }, + { + "epoch": 2.33, + "learning_rate": 6.797537124230351e-05, + "loss": 0.7075, + "step": 10780 + }, + { + "epoch": 2.33, + "learning_rate": 6.754074610648315e-05, + "loss": 0.7073, + "step": 10800 + }, + { + "epoch": 2.33, + "eval_loss": 0.7168901562690735, + "eval_runtime": 25.5153, + "eval_samples_per_second": 78.384, + "eval_steps_per_second": 1.254, + "step": 10800 + }, + { + "epoch": 2.33, + "learning_rate": 6.71061209706628e-05, + "loss": 0.7088, + "step": 10820 + }, + { + "epoch": 2.34, + "learning_rate": 6.667149583484245e-05, + "loss": 0.7046, + "step": 10840 + }, + { + "epoch": 2.34, + "learning_rate": 6.623687069902209e-05, + "loss": 0.7029, + "step": 10860 + }, + { + "epoch": 2.35, + "learning_rate": 6.580224556320174e-05, + "loss": 0.7055, + "step": 10880 + }, + { + "epoch": 2.35, + "learning_rate": 6.536762042738139e-05, + "loss": 0.7095, + "step": 10900 + }, + { + "epoch": 2.36, + "learning_rate": 6.493299529156102e-05, + "loss": 0.7057, + "step": 10920 + }, + { + "epoch": 2.36, + "learning_rate": 6.449837015574066e-05, + "loss": 0.7064, + "step": 10940 + }, + { + "epoch": 2.36, + "learning_rate": 6.406374501992031e-05, + "loss": 0.7039, + "step": 10960 + }, + { + "epoch": 2.37, + "learning_rate": 6.362911988409996e-05, + "loss": 0.7109, + "step": 10980 + }, + { + "epoch": 2.37, + "learning_rate": 6.31944947482796e-05, + "loss": 0.7051, + "step": 11000 + }, + { + "epoch": 2.37, + "eval_loss": 0.7164381146430969, + "eval_runtime": 25.4817, + "eval_samples_per_second": 78.488, + "eval_steps_per_second": 1.256, + "step": 11000 + }, + { + "epoch": 2.38, + "learning_rate": 6.275986961245924e-05, + "loss": 0.7117, + "step": 11020 + }, + { + "epoch": 2.38, + "learning_rate": 6.23252444766389e-05, + "loss": 0.6972, + "step": 11040 + }, + { + "epoch": 2.39, + "learning_rate": 6.189061934081853e-05, + "loss": 0.7087, + "step": 11060 + }, + { + "epoch": 2.39, + "learning_rate": 6.145599420499818e-05, + "loss": 0.703, + "step": 11080 + }, + { + "epoch": 2.39, + "learning_rate": 6.1021369069177825e-05, + "loss": 0.7062, + "step": 11100 + }, + { + "epoch": 2.4, + "learning_rate": 6.0586743933357475e-05, + "loss": 0.7018, + "step": 11120 + }, + { + "epoch": 2.4, + "learning_rate": 6.015211879753712e-05, + "loss": 0.7003, + "step": 11140 + }, + { + "epoch": 2.41, + "learning_rate": 5.971749366171676e-05, + "loss": 0.7005, + "step": 11160 + }, + { + "epoch": 2.41, + "learning_rate": 5.928286852589641e-05, + "loss": 0.7099, + "step": 11180 + }, + { + "epoch": 2.42, + "learning_rate": 5.8848243390076054e-05, + "loss": 0.7002, + "step": 11200 + }, + { + "epoch": 2.42, + "eval_loss": 0.7161288857460022, + "eval_runtime": 25.5084, + "eval_samples_per_second": 78.406, + "eval_steps_per_second": 1.254, + "step": 11200 + }, + { + "epoch": 2.42, + "learning_rate": 5.84136182542557e-05, + "loss": 0.7071, + "step": 11220 + }, + { + "epoch": 2.43, + "learning_rate": 5.797899311843535e-05, + "loss": 0.7028, + "step": 11240 + }, + { + "epoch": 2.43, + "learning_rate": 5.754436798261499e-05, + "loss": 0.7199, + "step": 11260 + }, + { + "epoch": 2.43, + "learning_rate": 5.7109742846794634e-05, + "loss": 0.6974, + "step": 11280 + }, + { + "epoch": 2.44, + "learning_rate": 5.6675117710974284e-05, + "loss": 0.7003, + "step": 11300 + }, + { + "epoch": 2.44, + "learning_rate": 5.624049257515393e-05, + "loss": 0.7079, + "step": 11320 + }, + { + "epoch": 2.45, + "learning_rate": 5.580586743933357e-05, + "loss": 0.6988, + "step": 11340 + }, + { + "epoch": 2.45, + "learning_rate": 5.537124230351322e-05, + "loss": 0.7047, + "step": 11360 + }, + { + "epoch": 2.46, + "learning_rate": 5.493661716769286e-05, + "loss": 0.6946, + "step": 11380 + }, + { + "epoch": 2.46, + "learning_rate": 5.45019920318725e-05, + "loss": 0.7096, + "step": 11400 + }, + { + "epoch": 2.46, + "eval_loss": 0.7155815958976746, + "eval_runtime": 25.525, + "eval_samples_per_second": 78.355, + "eval_steps_per_second": 1.254, + "step": 11400 + }, + { + "epoch": 2.46, + "learning_rate": 5.406736689605215e-05, + "loss": 0.709, + "step": 11420 + }, + { + "epoch": 2.47, + "learning_rate": 5.3632741760231794e-05, + "loss": 0.7112, + "step": 11440 + }, + { + "epoch": 2.47, + "learning_rate": 5.319811662441144e-05, + "loss": 0.6983, + "step": 11460 + }, + { + "epoch": 2.48, + "learning_rate": 5.276349148859109e-05, + "loss": 0.7, + "step": 11480 + }, + { + "epoch": 2.48, + "learning_rate": 5.232886635277073e-05, + "loss": 0.7006, + "step": 11500 + }, + { + "epoch": 2.49, + "learning_rate": 5.189424121695037e-05, + "loss": 0.7068, + "step": 11520 + }, + { + "epoch": 2.49, + "learning_rate": 5.1459616081130023e-05, + "loss": 0.7012, + "step": 11540 + }, + { + "epoch": 2.49, + "learning_rate": 5.102499094530967e-05, + "loss": 0.7079, + "step": 11560 + }, + { + "epoch": 2.5, + "learning_rate": 5.059036580948931e-05, + "loss": 0.7031, + "step": 11580 + }, + { + "epoch": 2.5, + "learning_rate": 5.015574067366896e-05, + "loss": 0.7038, + "step": 11600 + }, + { + "epoch": 2.5, + "eval_loss": 0.7149330973625183, + "eval_runtime": 25.4843, + "eval_samples_per_second": 78.48, + "eval_steps_per_second": 1.256, + "step": 11600 + }, + { + "epoch": 2.51, + "learning_rate": 4.97211155378486e-05, + "loss": 0.6972, + "step": 11620 + }, + { + "epoch": 2.51, + "learning_rate": 4.9286490402028246e-05, + "loss": 0.7039, + "step": 11640 + }, + { + "epoch": 2.52, + "learning_rate": 4.885186526620789e-05, + "loss": 0.7052, + "step": 11660 + }, + { + "epoch": 2.52, + "learning_rate": 4.841724013038754e-05, + "loss": 0.7045, + "step": 11680 + }, + { + "epoch": 2.52, + "learning_rate": 4.798261499456718e-05, + "loss": 0.701, + "step": 11700 + }, + { + "epoch": 2.53, + "learning_rate": 4.7547989858746826e-05, + "loss": 0.7084, + "step": 11720 + }, + { + "epoch": 2.53, + "learning_rate": 4.7113364722926476e-05, + "loss": 0.6988, + "step": 11740 + }, + { + "epoch": 2.54, + "learning_rate": 4.667873958710612e-05, + "loss": 0.7155, + "step": 11760 + }, + { + "epoch": 2.54, + "learning_rate": 4.624411445128576e-05, + "loss": 0.7044, + "step": 11780 + }, + { + "epoch": 2.55, + "learning_rate": 4.5809489315465406e-05, + "loss": 0.7014, + "step": 11800 + }, + { + "epoch": 2.55, + "eval_loss": 0.714367151260376, + "eval_runtime": 25.4959, + "eval_samples_per_second": 78.444, + "eval_steps_per_second": 1.255, + "step": 11800 + }, + { + "epoch": 2.55, + "learning_rate": 4.537486417964505e-05, + "loss": 0.708, + "step": 11820 + }, + { + "epoch": 2.55, + "learning_rate": 4.494023904382469e-05, + "loss": 0.6976, + "step": 11840 + }, + { + "epoch": 2.56, + "learning_rate": 4.450561390800434e-05, + "loss": 0.7057, + "step": 11860 + }, + { + "epoch": 2.56, + "learning_rate": 4.4070988772183986e-05, + "loss": 0.7039, + "step": 11880 + }, + { + "epoch": 2.57, + "learning_rate": 4.363636363636363e-05, + "loss": 0.7089, + "step": 11900 + }, + { + "epoch": 2.57, + "learning_rate": 4.320173850054328e-05, + "loss": 0.7026, + "step": 11920 + }, + { + "epoch": 2.58, + "learning_rate": 4.276711336472292e-05, + "loss": 0.7023, + "step": 11940 + }, + { + "epoch": 2.58, + "learning_rate": 4.2332488228902565e-05, + "loss": 0.7006, + "step": 11960 + }, + { + "epoch": 2.58, + "learning_rate": 4.1897863093082215e-05, + "loss": 0.7008, + "step": 11980 + }, + { + "epoch": 2.59, + "learning_rate": 4.146323795726186e-05, + "loss": 0.7057, + "step": 12000 + }, + { + "epoch": 2.59, + "eval_loss": 0.7141902446746826, + "eval_runtime": 25.5019, + "eval_samples_per_second": 78.426, + "eval_steps_per_second": 1.255, + "step": 12000 + }, + { + "epoch": 2.59, + "learning_rate": 4.10286128214415e-05, + "loss": 0.7083, + "step": 12020 + }, + { + "epoch": 2.6, + "learning_rate": 4.059398768562115e-05, + "loss": 0.6986, + "step": 12040 + }, + { + "epoch": 2.6, + "learning_rate": 4.0159362549800795e-05, + "loss": 0.7076, + "step": 12060 + }, + { + "epoch": 2.61, + "learning_rate": 3.972473741398044e-05, + "loss": 0.7071, + "step": 12080 + }, + { + "epoch": 2.61, + "learning_rate": 3.929011227816009e-05, + "loss": 0.6984, + "step": 12100 + }, + { + "epoch": 2.61, + "learning_rate": 3.885548714233973e-05, + "loss": 0.7096, + "step": 12120 + }, + { + "epoch": 2.62, + "learning_rate": 3.8420862006519375e-05, + "loss": 0.7027, + "step": 12140 + }, + { + "epoch": 2.62, + "learning_rate": 3.7986236870699025e-05, + "loss": 0.7062, + "step": 12160 + }, + { + "epoch": 2.63, + "learning_rate": 3.755161173487867e-05, + "loss": 0.7049, + "step": 12180 + }, + { + "epoch": 2.63, + "learning_rate": 3.711698659905831e-05, + "loss": 0.7052, + "step": 12200 + }, + { + "epoch": 2.63, + "eval_loss": 0.7140177488327026, + "eval_runtime": 25.4673, + "eval_samples_per_second": 78.532, + "eval_steps_per_second": 1.257, + "step": 12200 + }, + { + "epoch": 2.64, + "learning_rate": 3.6682361463237955e-05, + "loss": 0.7011, + "step": 12220 + }, + { + "epoch": 2.64, + "learning_rate": 3.62477363274176e-05, + "loss": 0.7025, + "step": 12240 + }, + { + "epoch": 2.65, + "learning_rate": 3.581311119159725e-05, + "loss": 0.7006, + "step": 12260 + }, + { + "epoch": 2.65, + "learning_rate": 3.537848605577689e-05, + "loss": 0.7073, + "step": 12280 + }, + { + "epoch": 2.65, + "learning_rate": 3.4943860919956534e-05, + "loss": 0.7033, + "step": 12300 + }, + { + "epoch": 2.66, + "learning_rate": 3.4509235784136184e-05, + "loss": 0.6992, + "step": 12320 + }, + { + "epoch": 2.66, + "learning_rate": 3.407461064831582e-05, + "loss": 0.7043, + "step": 12340 + }, + { + "epoch": 2.67, + "learning_rate": 3.363998551249547e-05, + "loss": 0.7083, + "step": 12360 + }, + { + "epoch": 2.67, + "learning_rate": 3.3205360376675114e-05, + "loss": 0.7086, + "step": 12380 + }, + { + "epoch": 2.68, + "learning_rate": 3.277073524085476e-05, + "loss": 0.7168, + "step": 12400 + }, + { + "epoch": 2.68, + "eval_loss": 0.7138265371322632, + "eval_runtime": 25.5077, + "eval_samples_per_second": 78.408, + "eval_steps_per_second": 1.255, + "step": 12400 + }, + { + "epoch": 2.68, + "learning_rate": 3.233611010503441e-05, + "loss": 0.7026, + "step": 12420 + }, + { + "epoch": 2.68, + "learning_rate": 3.190148496921405e-05, + "loss": 0.7097, + "step": 12440 + }, + { + "epoch": 2.69, + "learning_rate": 3.1466859833393694e-05, + "loss": 0.7094, + "step": 12460 + }, + { + "epoch": 2.69, + "learning_rate": 3.1032234697573344e-05, + "loss": 0.6971, + "step": 12480 + }, + { + "epoch": 2.7, + "learning_rate": 3.059760956175299e-05, + "loss": 0.6977, + "step": 12500 + }, + { + "epoch": 2.7, + "learning_rate": 3.016298442593263e-05, + "loss": 0.6945, + "step": 12520 + }, + { + "epoch": 2.71, + "learning_rate": 2.9728359290112277e-05, + "loss": 0.6998, + "step": 12540 + }, + { + "epoch": 2.71, + "learning_rate": 2.929373415429192e-05, + "loss": 0.7067, + "step": 12560 + }, + { + "epoch": 2.71, + "learning_rate": 2.8859109018471563e-05, + "loss": 0.6935, + "step": 12580 + }, + { + "epoch": 2.72, + "learning_rate": 2.842448388265121e-05, + "loss": 0.6927, + "step": 12600 + }, + { + "epoch": 2.72, + "eval_loss": 0.7132371664047241, + "eval_runtime": 25.516, + "eval_samples_per_second": 78.382, + "eval_steps_per_second": 1.254, + "step": 12600 + }, + { + "epoch": 2.72, + "learning_rate": 2.7989858746830857e-05, + "loss": 0.7025, + "step": 12620 + }, + { + "epoch": 2.73, + "learning_rate": 2.75552336110105e-05, + "loss": 0.7098, + "step": 12640 + }, + { + "epoch": 2.73, + "learning_rate": 2.7120608475190147e-05, + "loss": 0.6939, + "step": 12660 + }, + { + "epoch": 2.74, + "learning_rate": 2.6685983339369793e-05, + "loss": 0.7038, + "step": 12680 + }, + { + "epoch": 2.74, + "learning_rate": 2.6251358203549436e-05, + "loss": 0.7039, + "step": 12700 + }, + { + "epoch": 2.74, + "learning_rate": 2.5816733067729083e-05, + "loss": 0.7018, + "step": 12720 + }, + { + "epoch": 2.75, + "learning_rate": 2.538210793190873e-05, + "loss": 0.6943, + "step": 12740 + }, + { + "epoch": 2.75, + "learning_rate": 2.4947482796088373e-05, + "loss": 0.7007, + "step": 12760 + }, + { + "epoch": 2.76, + "learning_rate": 2.4512857660268016e-05, + "loss": 0.7019, + "step": 12780 + }, + { + "epoch": 2.76, + "learning_rate": 2.407823252444766e-05, + "loss": 0.6957, + "step": 12800 + }, + { + "epoch": 2.76, + "eval_loss": 0.7126932144165039, + "eval_runtime": 25.4915, + "eval_samples_per_second": 78.458, + "eval_steps_per_second": 1.255, + "step": 12800 + }, + { + "epoch": 2.77, + "learning_rate": 2.3643607388627306e-05, + "loss": 0.6993, + "step": 12820 + }, + { + "epoch": 2.77, + "learning_rate": 2.3208982252806953e-05, + "loss": 0.6951, + "step": 12840 + }, + { + "epoch": 2.77, + "learning_rate": 2.2774357116986596e-05, + "loss": 0.7056, + "step": 12860 + }, + { + "epoch": 2.78, + "learning_rate": 2.2339731981166243e-05, + "loss": 0.7153, + "step": 12880 + }, + { + "epoch": 2.78, + "learning_rate": 2.190510684534589e-05, + "loss": 0.7022, + "step": 12900 + }, + { + "epoch": 2.79, + "learning_rate": 2.1470481709525532e-05, + "loss": 0.7078, + "step": 12920 + }, + { + "epoch": 2.79, + "learning_rate": 2.103585657370518e-05, + "loss": 0.6969, + "step": 12940 + }, + { + "epoch": 2.8, + "learning_rate": 2.0601231437884826e-05, + "loss": 0.7056, + "step": 12960 + }, + { + "epoch": 2.8, + "learning_rate": 2.016660630206447e-05, + "loss": 0.6975, + "step": 12980 + }, + { + "epoch": 2.8, + "learning_rate": 1.9731981166244112e-05, + "loss": 0.7065, + "step": 13000 + }, + { + "epoch": 2.8, + "eval_loss": 0.7130131721496582, + "eval_runtime": 25.4905, + "eval_samples_per_second": 78.461, + "eval_steps_per_second": 1.255, + "step": 13000 + }, + { + "epoch": 2.81, + "learning_rate": 1.9297356030423755e-05, + "loss": 0.7, + "step": 13020 + }, + { + "epoch": 2.81, + "learning_rate": 1.8862730894603402e-05, + "loss": 0.7144, + "step": 13040 + }, + { + "epoch": 2.82, + "learning_rate": 1.842810575878305e-05, + "loss": 0.6964, + "step": 13060 + }, + { + "epoch": 2.82, + "learning_rate": 1.7993480622962692e-05, + "loss": 0.6981, + "step": 13080 + }, + { + "epoch": 2.83, + "learning_rate": 1.755885548714234e-05, + "loss": 0.7102, + "step": 13100 + }, + { + "epoch": 2.83, + "learning_rate": 1.7124230351321985e-05, + "loss": 0.6975, + "step": 13120 + }, + { + "epoch": 2.83, + "learning_rate": 1.668960521550163e-05, + "loss": 0.7062, + "step": 13140 + }, + { + "epoch": 2.84, + "learning_rate": 1.625498007968127e-05, + "loss": 0.6956, + "step": 13160 + }, + { + "epoch": 2.84, + "learning_rate": 1.5820354943860918e-05, + "loss": 0.71, + "step": 13180 + }, + { + "epoch": 2.85, + "learning_rate": 1.5385729808040565e-05, + "loss": 0.7081, + "step": 13200 + }, + { + "epoch": 2.85, + "eval_loss": 0.7126001119613647, + "eval_runtime": 25.5102, + "eval_samples_per_second": 78.4, + "eval_steps_per_second": 1.254, + "step": 13200 + }, + { + "epoch": 2.85, + "learning_rate": 1.495110467222021e-05, + "loss": 0.6977, + "step": 13220 + }, + { + "epoch": 2.86, + "learning_rate": 1.4516479536399855e-05, + "loss": 0.705, + "step": 13240 + }, + { + "epoch": 2.86, + "learning_rate": 1.4081854400579498e-05, + "loss": 0.7016, + "step": 13260 + }, + { + "epoch": 2.87, + "learning_rate": 1.3647229264759143e-05, + "loss": 0.6922, + "step": 13280 + }, + { + "epoch": 2.87, + "learning_rate": 1.321260412893879e-05, + "loss": 0.6987, + "step": 13300 + }, + { + "epoch": 2.87, + "learning_rate": 1.2777978993118434e-05, + "loss": 0.7041, + "step": 13320 + }, + { + "epoch": 2.88, + "learning_rate": 1.234335385729808e-05, + "loss": 0.7101, + "step": 13340 + }, + { + "epoch": 2.88, + "learning_rate": 1.1908728721477723e-05, + "loss": 0.6976, + "step": 13360 + }, + { + "epoch": 2.89, + "learning_rate": 1.147410358565737e-05, + "loss": 0.7011, + "step": 13380 + }, + { + "epoch": 2.89, + "learning_rate": 1.1039478449837014e-05, + "loss": 0.6973, + "step": 13400 + }, + { + "epoch": 2.89, + "eval_loss": 0.7123447060585022, + "eval_runtime": 25.5029, + "eval_samples_per_second": 78.422, + "eval_steps_per_second": 1.255, + "step": 13400 + }, + { + "epoch": 2.9, + "learning_rate": 1.060485331401666e-05, + "loss": 0.7084, + "step": 13420 + }, + { + "epoch": 2.9, + "learning_rate": 1.0170228178196306e-05, + "loss": 0.7133, + "step": 13440 + }, + { + "epoch": 2.9, + "learning_rate": 9.73560304237595e-06, + "loss": 0.6988, + "step": 13460 + }, + { + "epoch": 2.91, + "learning_rate": 9.300977906555596e-06, + "loss": 0.7045, + "step": 13480 + }, + { + "epoch": 2.91, + "learning_rate": 8.866352770735239e-06, + "loss": 0.6985, + "step": 13500 + }, + { + "epoch": 2.92, + "learning_rate": 8.431727634914886e-06, + "loss": 0.6967, + "step": 13520 + }, + { + "epoch": 2.92, + "learning_rate": 7.99710249909453e-06, + "loss": 0.7008, + "step": 13540 + }, + { + "epoch": 2.93, + "learning_rate": 7.562477363274175e-06, + "loss": 0.6956, + "step": 13560 + }, + { + "epoch": 2.93, + "learning_rate": 7.12785222745382e-06, + "loss": 0.7065, + "step": 13580 + }, + { + "epoch": 2.93, + "learning_rate": 6.693227091633466e-06, + "loss": 0.7018, + "step": 13600 + }, + { + "epoch": 2.93, + "eval_loss": 0.712183952331543, + "eval_runtime": 25.5697, + "eval_samples_per_second": 78.218, + "eval_steps_per_second": 1.251, + "step": 13600 + } + ], + "max_steps": 13905, + "num_train_epochs": 3, + "total_flos": 6.876240637723253e+19, + "trial_name": null, + "trial_params": null +} diff --git a/adapters/saved-alpaca-belle13b/checkpoint-13600/training_args.bin b/adapters/saved-alpaca-belle13b/checkpoint-13600/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..d9fc651b09d1fbcbbf76356c2181acb1def32585 --- /dev/null +++ b/adapters/saved-alpaca-belle13b/checkpoint-13600/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6d94e5a4fbc2ed544893c730b1ef244fb2123fe494b95f38bd148f9dd38f68e0 +size 3643 diff --git a/adapters/saved-alpaca-belle13b/checkpoint-13800/optimizer.pt b/adapters/saved-alpaca-belle13b/checkpoint-13800/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..11253e3b0befbb890ec3f26a96bba4a905ce34aa --- /dev/null +++ b/adapters/saved-alpaca-belle13b/checkpoint-13800/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7081107a3788d570585493d08b0bb6afc55174835cce6b16fd6009738e2ccc8d +size 52523141 diff --git a/adapters/saved-alpaca-belle13b/checkpoint-13800/pytorch_model.bin b/adapters/saved-alpaca-belle13b/checkpoint-13800/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..ff9478e5318c4bd7373ffd18afd9070fdb8fc342 --- /dev/null +++ b/adapters/saved-alpaca-belle13b/checkpoint-13800/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d23de6648c8d3163fcec811cb9b6aa71ecc442c0603e6993e3542f17cd8acd08 +size 26271757 diff --git a/adapters/saved-alpaca-belle13b/checkpoint-13800/rng_state_0.pth b/adapters/saved-alpaca-belle13b/checkpoint-13800/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..1287ae6beb735eb64ec67e29fbf44d20c53549b1 --- /dev/null +++ b/adapters/saved-alpaca-belle13b/checkpoint-13800/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dfcfd3818bf536413ab4defe5b5ac5b89751a304586ff8717b62751ec485cf9f +size 14583 diff --git a/adapters/saved-alpaca-belle13b/checkpoint-13800/rng_state_1.pth b/adapters/saved-alpaca-belle13b/checkpoint-13800/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..9b921d8f65702475570de4a3e74dde44a9b3adb4 --- /dev/null +++ b/adapters/saved-alpaca-belle13b/checkpoint-13800/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8091ea6c44a20f67f6746e785279982f8af08dc67efb477cad9423373ca59a98 +size 14583 diff --git a/adapters/saved-alpaca-belle13b/checkpoint-13800/rng_state_2.pth b/adapters/saved-alpaca-belle13b/checkpoint-13800/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..6d11c23b6491fe62aed14bc5592d9e9353d48272 --- /dev/null +++ b/adapters/saved-alpaca-belle13b/checkpoint-13800/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d89a82db3eaf02463d9ea66caf5062a9924e9aa1ea0960714bb67ac6a9d1d5c1 +size 14583 diff --git a/adapters/saved-alpaca-belle13b/checkpoint-13800/rng_state_3.pth b/adapters/saved-alpaca-belle13b/checkpoint-13800/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..288105966d4cbdacb0e87ccae3a8a1ba089e21ff --- /dev/null +++ b/adapters/saved-alpaca-belle13b/checkpoint-13800/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c0cbcdaa240273ca9ebacb2d8df8e57f504aec85f721af29ec6431dfb14655e3 +size 14583 diff --git a/adapters/saved-alpaca-belle13b/checkpoint-13800/rng_state_4.pth b/adapters/saved-alpaca-belle13b/checkpoint-13800/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..ca0106feb032ae2565df904c603be9d8ce6c6aa7 --- /dev/null +++ b/adapters/saved-alpaca-belle13b/checkpoint-13800/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:876edf0dfdbe178457081da30f41a490121e521c838168dd2f32e97aafe289a2 +size 14583 diff --git a/adapters/saved-alpaca-belle13b/checkpoint-13800/rng_state_5.pth b/adapters/saved-alpaca-belle13b/checkpoint-13800/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..925da0489b127b6bc4956d1d9afd94735b46a980 --- /dev/null +++ b/adapters/saved-alpaca-belle13b/checkpoint-13800/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f8a348f6bbfbc9c4d4a7b4c5007639527bbd9ebf59a398d833497a417eaaf9d9 +size 14583 diff --git a/adapters/saved-alpaca-belle13b/checkpoint-13800/rng_state_6.pth b/adapters/saved-alpaca-belle13b/checkpoint-13800/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..2947a9352c36bdccc1a4b28e7d105049257f4ccd --- /dev/null +++ b/adapters/saved-alpaca-belle13b/checkpoint-13800/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5d5158c19ba9206de2d40c35a326fcff3a3a52d3ff029641807dab9f7f71b91a +size 14583 diff --git a/adapters/saved-alpaca-belle13b/checkpoint-13800/rng_state_7.pth b/adapters/saved-alpaca-belle13b/checkpoint-13800/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..e6a5bb9d201bb6f20252a6f7e6ee542ca6ecb88f --- /dev/null +++ b/adapters/saved-alpaca-belle13b/checkpoint-13800/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:330a5b972a34b49b47a65efc611ce042f3a7351c55964dcde96e80379bb3c989 +size 14583 diff --git a/adapters/saved-alpaca-belle13b/checkpoint-13800/scaler.pt b/adapters/saved-alpaca-belle13b/checkpoint-13800/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..50be494ebd6d6090a88213d32130f58d25d140ea --- /dev/null +++ b/adapters/saved-alpaca-belle13b/checkpoint-13800/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1df6e273fe71bcc8c09374a929e6e4693cf5586327d064c8cc4f4b3cb2200ca4 +size 557 diff --git a/adapters/saved-alpaca-belle13b/checkpoint-13800/scheduler.pt b/adapters/saved-alpaca-belle13b/checkpoint-13800/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..3cc4c4833afda594f410ea81fc7658b7a4f88176 --- /dev/null +++ b/adapters/saved-alpaca-belle13b/checkpoint-13800/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:796b21de149c9406ed9607f586e2c1cb51c0f1c158f06a67f77fb720ceafbad5 +size 627 diff --git a/adapters/saved-alpaca-belle13b/checkpoint-13800/trainer_state.json b/adapters/saved-alpaca-belle13b/checkpoint-13800/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..c794deb89aadf74cee4f9008526b1ad187c2d10e --- /dev/null +++ b/adapters/saved-alpaca-belle13b/checkpoint-13800/trainer_state.json @@ -0,0 +1,4708 @@ +{ + "best_metric": 0.7120471596717834, + "best_model_checkpoint": "/mnt/bn/qingyi-bn-lq/llama/saved-alpaca-belle13b/checkpoint-13800", + "epoch": 2.9773462783171523, + "global_step": 13800, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 5.9999999999999995e-05, + "loss": 1.6589, + "step": 20 + }, + { + "epoch": 0.01, + "learning_rate": 0.00011999999999999999, + "loss": 1.4071, + "step": 40 + }, + { + "epoch": 0.01, + "learning_rate": 0.00017999999999999998, + "loss": 1.044, + "step": 60 + }, + { + "epoch": 0.02, + "learning_rate": 0.00023999999999999998, + "loss": 0.9883, + "step": 80 + }, + { + "epoch": 0.02, + "learning_rate": 0.0003, + "loss": 0.9659, + "step": 100 + }, + { + "epoch": 0.03, + "learning_rate": 0.00029956537486417964, + "loss": 0.9505, + "step": 120 + }, + { + "epoch": 0.03, + "learning_rate": 0.00029913074972835925, + "loss": 0.9205, + "step": 140 + }, + { + "epoch": 0.03, + "learning_rate": 0.0002986961245925389, + "loss": 0.9168, + "step": 160 + }, + { + "epoch": 0.04, + "learning_rate": 0.0002982614994567186, + "loss": 0.9117, + "step": 180 + }, + { + "epoch": 0.04, + "learning_rate": 0.0002978268743208982, + "loss": 0.9064, + "step": 200 + }, + { + "epoch": 0.04, + "eval_loss": 0.9033477306365967, + "eval_runtime": 25.3136, + "eval_samples_per_second": 79.009, + "eval_steps_per_second": 1.264, + "step": 200 + }, + { + "epoch": 0.05, + "learning_rate": 0.00029739224918507785, + "loss": 0.8981, + "step": 220 + }, + { + "epoch": 0.05, + "learning_rate": 0.0002969576240492575, + "loss": 0.8912, + "step": 240 + }, + { + "epoch": 0.06, + "learning_rate": 0.0002965229989134371, + "loss": 0.8875, + "step": 260 + }, + { + "epoch": 0.06, + "learning_rate": 0.0002960883737776168, + "loss": 0.8907, + "step": 280 + }, + { + "epoch": 0.06, + "learning_rate": 0.00029565374864179645, + "loss": 0.8753, + "step": 300 + }, + { + "epoch": 0.07, + "learning_rate": 0.00029521912350597606, + "loss": 0.8782, + "step": 320 + }, + { + "epoch": 0.07, + "learning_rate": 0.0002947844983701557, + "loss": 0.8697, + "step": 340 + }, + { + "epoch": 0.08, + "learning_rate": 0.0002943498732343354, + "loss": 0.8745, + "step": 360 + }, + { + "epoch": 0.08, + "learning_rate": 0.000293915248098515, + "loss": 0.8725, + "step": 380 + }, + { + "epoch": 0.09, + "learning_rate": 0.00029348062296269466, + "loss": 0.8658, + "step": 400 + }, + { + "epoch": 0.09, + "eval_loss": 0.8655584454536438, + "eval_runtime": 25.3343, + "eval_samples_per_second": 78.944, + "eval_steps_per_second": 1.263, + "step": 400 + }, + { + "epoch": 0.09, + "learning_rate": 0.0002930459978268743, + "loss": 0.8641, + "step": 420 + }, + { + "epoch": 0.09, + "learning_rate": 0.00029261137269105393, + "loss": 0.8509, + "step": 440 + }, + { + "epoch": 0.1, + "learning_rate": 0.0002921767475552336, + "loss": 0.8541, + "step": 460 + }, + { + "epoch": 0.1, + "learning_rate": 0.00029174212241941326, + "loss": 0.8575, + "step": 480 + }, + { + "epoch": 0.11, + "learning_rate": 0.00029130749728359287, + "loss": 0.8482, + "step": 500 + }, + { + "epoch": 0.11, + "learning_rate": 0.00029087287214777253, + "loss": 0.8572, + "step": 520 + }, + { + "epoch": 0.12, + "learning_rate": 0.0002904382470119522, + "loss": 0.8489, + "step": 540 + }, + { + "epoch": 0.12, + "learning_rate": 0.0002900036218761318, + "loss": 0.8585, + "step": 560 + }, + { + "epoch": 0.13, + "learning_rate": 0.00028956899674031147, + "loss": 0.8387, + "step": 580 + }, + { + "epoch": 0.13, + "learning_rate": 0.00028913437160449113, + "loss": 0.8306, + "step": 600 + }, + { + "epoch": 0.13, + "eval_loss": 0.8434031009674072, + "eval_runtime": 25.3211, + "eval_samples_per_second": 78.986, + "eval_steps_per_second": 1.264, + "step": 600 + }, + { + "epoch": 0.13, + "learning_rate": 0.00028869974646867074, + "loss": 0.8331, + "step": 620 + }, + { + "epoch": 0.14, + "learning_rate": 0.0002882651213328504, + "loss": 0.8447, + "step": 640 + }, + { + "epoch": 0.14, + "learning_rate": 0.00028783049619703007, + "loss": 0.836, + "step": 660 + }, + { + "epoch": 0.15, + "learning_rate": 0.0002873958710612097, + "loss": 0.8436, + "step": 680 + }, + { + "epoch": 0.15, + "learning_rate": 0.00028696124592538934, + "loss": 0.8281, + "step": 700 + }, + { + "epoch": 0.16, + "learning_rate": 0.000286526620789569, + "loss": 0.8378, + "step": 720 + }, + { + "epoch": 0.16, + "learning_rate": 0.0002860919956537486, + "loss": 0.8338, + "step": 740 + }, + { + "epoch": 0.16, + "learning_rate": 0.0002856573705179283, + "loss": 0.8323, + "step": 760 + }, + { + "epoch": 0.17, + "learning_rate": 0.00028522274538210794, + "loss": 0.8153, + "step": 780 + }, + { + "epoch": 0.17, + "learning_rate": 0.00028478812024628755, + "loss": 0.8349, + "step": 800 + }, + { + "epoch": 0.17, + "eval_loss": 0.8282934427261353, + "eval_runtime": 25.4025, + "eval_samples_per_second": 78.733, + "eval_steps_per_second": 1.26, + "step": 800 + }, + { + "epoch": 0.18, + "learning_rate": 0.0002843534951104672, + "loss": 0.8198, + "step": 820 + }, + { + "epoch": 0.18, + "learning_rate": 0.0002839188699746469, + "loss": 0.8254, + "step": 840 + }, + { + "epoch": 0.19, + "learning_rate": 0.0002834842448388265, + "loss": 0.8165, + "step": 860 + }, + { + "epoch": 0.19, + "learning_rate": 0.00028304961970300615, + "loss": 0.8241, + "step": 880 + }, + { + "epoch": 0.19, + "learning_rate": 0.0002826149945671858, + "loss": 0.814, + "step": 900 + }, + { + "epoch": 0.2, + "learning_rate": 0.0002821803694313654, + "loss": 0.8222, + "step": 920 + }, + { + "epoch": 0.2, + "learning_rate": 0.0002817457442955451, + "loss": 0.825, + "step": 940 + }, + { + "epoch": 0.21, + "learning_rate": 0.00028131111915972475, + "loss": 0.8153, + "step": 960 + }, + { + "epoch": 0.21, + "learning_rate": 0.00028087649402390436, + "loss": 0.8229, + "step": 980 + }, + { + "epoch": 0.22, + "learning_rate": 0.00028044186888808397, + "loss": 0.8129, + "step": 1000 + }, + { + "epoch": 0.22, + "eval_loss": 0.816320538520813, + "eval_runtime": 25.4153, + "eval_samples_per_second": 78.693, + "eval_steps_per_second": 1.259, + "step": 1000 + }, + { + "epoch": 0.22, + "learning_rate": 0.00028000724375226363, + "loss": 0.8121, + "step": 1020 + }, + { + "epoch": 0.22, + "learning_rate": 0.0002795726186164433, + "loss": 0.8063, + "step": 1040 + }, + { + "epoch": 0.23, + "learning_rate": 0.0002791379934806229, + "loss": 0.8097, + "step": 1060 + }, + { + "epoch": 0.23, + "learning_rate": 0.00027870336834480257, + "loss": 0.8142, + "step": 1080 + }, + { + "epoch": 0.24, + "learning_rate": 0.00027826874320898223, + "loss": 0.8021, + "step": 1100 + }, + { + "epoch": 0.24, + "learning_rate": 0.00027783411807316184, + "loss": 0.8014, + "step": 1120 + }, + { + "epoch": 0.25, + "learning_rate": 0.0002773994929373415, + "loss": 0.8031, + "step": 1140 + }, + { + "epoch": 0.25, + "learning_rate": 0.00027696486780152117, + "loss": 0.8011, + "step": 1160 + }, + { + "epoch": 0.25, + "learning_rate": 0.0002765302426657008, + "loss": 0.7944, + "step": 1180 + }, + { + "epoch": 0.26, + "learning_rate": 0.00027609561752988044, + "loss": 0.8071, + "step": 1200 + }, + { + "epoch": 0.26, + "eval_loss": 0.8064733147621155, + "eval_runtime": 25.3901, + "eval_samples_per_second": 78.771, + "eval_steps_per_second": 1.26, + "step": 1200 + }, + { + "epoch": 0.26, + "learning_rate": 0.0002756609923940601, + "loss": 0.8025, + "step": 1220 + }, + { + "epoch": 0.27, + "learning_rate": 0.0002752263672582397, + "loss": 0.7954, + "step": 1240 + }, + { + "epoch": 0.27, + "learning_rate": 0.0002747917421224194, + "loss": 0.8013, + "step": 1260 + }, + { + "epoch": 0.28, + "learning_rate": 0.00027435711698659904, + "loss": 0.7967, + "step": 1280 + }, + { + "epoch": 0.28, + "learning_rate": 0.00027392249185077865, + "loss": 0.8132, + "step": 1300 + }, + { + "epoch": 0.28, + "learning_rate": 0.0002734878667149583, + "loss": 0.8017, + "step": 1320 + }, + { + "epoch": 0.29, + "learning_rate": 0.000273053241579138, + "loss": 0.7964, + "step": 1340 + }, + { + "epoch": 0.29, + "learning_rate": 0.0002726186164433176, + "loss": 0.8012, + "step": 1360 + }, + { + "epoch": 0.3, + "learning_rate": 0.00027218399130749725, + "loss": 0.7982, + "step": 1380 + }, + { + "epoch": 0.3, + "learning_rate": 0.0002717493661716769, + "loss": 0.8031, + "step": 1400 + }, + { + "epoch": 0.3, + "eval_loss": 0.798474133014679, + "eval_runtime": 25.432, + "eval_samples_per_second": 78.641, + "eval_steps_per_second": 1.258, + "step": 1400 + }, + { + "epoch": 0.31, + "learning_rate": 0.0002713147410358565, + "loss": 0.7925, + "step": 1420 + }, + { + "epoch": 0.31, + "learning_rate": 0.0002708801159000362, + "loss": 0.794, + "step": 1440 + }, + { + "epoch": 0.31, + "learning_rate": 0.00027044549076421585, + "loss": 0.804, + "step": 1460 + }, + { + "epoch": 0.32, + "learning_rate": 0.00027001086562839546, + "loss": 0.7942, + "step": 1480 + }, + { + "epoch": 0.32, + "learning_rate": 0.0002695762404925751, + "loss": 0.7872, + "step": 1500 + }, + { + "epoch": 0.33, + "learning_rate": 0.0002691416153567548, + "loss": 0.7962, + "step": 1520 + }, + { + "epoch": 0.33, + "learning_rate": 0.0002687069902209344, + "loss": 0.7898, + "step": 1540 + }, + { + "epoch": 0.34, + "learning_rate": 0.00026827236508511406, + "loss": 0.7886, + "step": 1560 + }, + { + "epoch": 0.34, + "learning_rate": 0.0002678377399492937, + "loss": 0.7904, + "step": 1580 + }, + { + "epoch": 0.35, + "learning_rate": 0.00026740311481347333, + "loss": 0.7892, + "step": 1600 + }, + { + "epoch": 0.35, + "eval_loss": 0.7912269234657288, + "eval_runtime": 25.444, + "eval_samples_per_second": 78.604, + "eval_steps_per_second": 1.258, + "step": 1600 + }, + { + "epoch": 0.35, + "learning_rate": 0.000266968489677653, + "loss": 0.7897, + "step": 1620 + }, + { + "epoch": 0.35, + "learning_rate": 0.00026653386454183266, + "loss": 0.7927, + "step": 1640 + }, + { + "epoch": 0.36, + "learning_rate": 0.00026609923940601227, + "loss": 0.7829, + "step": 1660 + }, + { + "epoch": 0.36, + "learning_rate": 0.00026566461427019193, + "loss": 0.7788, + "step": 1680 + }, + { + "epoch": 0.37, + "learning_rate": 0.0002652299891343716, + "loss": 0.786, + "step": 1700 + }, + { + "epoch": 0.37, + "learning_rate": 0.0002647953639985512, + "loss": 0.7828, + "step": 1720 + }, + { + "epoch": 0.38, + "learning_rate": 0.00026436073886273087, + "loss": 0.7788, + "step": 1740 + }, + { + "epoch": 0.38, + "learning_rate": 0.00026392611372691053, + "loss": 0.7851, + "step": 1760 + }, + { + "epoch": 0.38, + "learning_rate": 0.00026349148859109014, + "loss": 0.7936, + "step": 1780 + }, + { + "epoch": 0.39, + "learning_rate": 0.0002630568634552698, + "loss": 0.7758, + "step": 1800 + }, + { + "epoch": 0.39, + "eval_loss": 0.7854430675506592, + "eval_runtime": 25.4734, + "eval_samples_per_second": 78.513, + "eval_steps_per_second": 1.256, + "step": 1800 + }, + { + "epoch": 0.39, + "learning_rate": 0.00026262223831944947, + "loss": 0.787, + "step": 1820 + }, + { + "epoch": 0.4, + "learning_rate": 0.0002621876131836291, + "loss": 0.7779, + "step": 1840 + }, + { + "epoch": 0.4, + "learning_rate": 0.00026175298804780874, + "loss": 0.7792, + "step": 1860 + }, + { + "epoch": 0.41, + "learning_rate": 0.0002613183629119884, + "loss": 0.7728, + "step": 1880 + }, + { + "epoch": 0.41, + "learning_rate": 0.000260883737776168, + "loss": 0.7844, + "step": 1900 + }, + { + "epoch": 0.41, + "learning_rate": 0.0002604491126403477, + "loss": 0.7726, + "step": 1920 + }, + { + "epoch": 0.42, + "learning_rate": 0.00026001448750452734, + "loss": 0.7706, + "step": 1940 + }, + { + "epoch": 0.42, + "learning_rate": 0.00025957986236870695, + "loss": 0.7659, + "step": 1960 + }, + { + "epoch": 0.43, + "learning_rate": 0.0002591452372328866, + "loss": 0.7808, + "step": 1980 + }, + { + "epoch": 0.43, + "learning_rate": 0.0002587106120970663, + "loss": 0.7692, + "step": 2000 + }, + { + "epoch": 0.43, + "eval_loss": 0.7800412774085999, + "eval_runtime": 25.5146, + "eval_samples_per_second": 78.387, + "eval_steps_per_second": 1.254, + "step": 2000 + }, + { + "epoch": 0.44, + "learning_rate": 0.0002582759869612459, + "loss": 0.7665, + "step": 2020 + }, + { + "epoch": 0.44, + "learning_rate": 0.00025784136182542555, + "loss": 0.7795, + "step": 2040 + }, + { + "epoch": 0.44, + "learning_rate": 0.0002574067366896052, + "loss": 0.7846, + "step": 2060 + }, + { + "epoch": 0.45, + "learning_rate": 0.0002569721115537848, + "loss": 0.7639, + "step": 2080 + }, + { + "epoch": 0.45, + "learning_rate": 0.0002565374864179645, + "loss": 0.7827, + "step": 2100 + }, + { + "epoch": 0.46, + "learning_rate": 0.00025610286128214415, + "loss": 0.7751, + "step": 2120 + }, + { + "epoch": 0.46, + "learning_rate": 0.00025566823614632376, + "loss": 0.776, + "step": 2140 + }, + { + "epoch": 0.47, + "learning_rate": 0.0002552336110105034, + "loss": 0.7773, + "step": 2160 + }, + { + "epoch": 0.47, + "learning_rate": 0.0002547989858746831, + "loss": 0.7757, + "step": 2180 + }, + { + "epoch": 0.47, + "learning_rate": 0.0002543643607388627, + "loss": 0.7769, + "step": 2200 + }, + { + "epoch": 0.47, + "eval_loss": 0.7759379744529724, + "eval_runtime": 25.4789, + "eval_samples_per_second": 78.496, + "eval_steps_per_second": 1.256, + "step": 2200 + }, + { + "epoch": 0.48, + "learning_rate": 0.00025392973560304236, + "loss": 0.7657, + "step": 2220 + }, + { + "epoch": 0.48, + "learning_rate": 0.000253495110467222, + "loss": 0.7664, + "step": 2240 + }, + { + "epoch": 0.49, + "learning_rate": 0.00025306048533140163, + "loss": 0.7774, + "step": 2260 + }, + { + "epoch": 0.49, + "learning_rate": 0.0002526258601955813, + "loss": 0.7591, + "step": 2280 + }, + { + "epoch": 0.5, + "learning_rate": 0.00025219123505976096, + "loss": 0.7605, + "step": 2300 + }, + { + "epoch": 0.5, + "learning_rate": 0.00025175660992394057, + "loss": 0.7693, + "step": 2320 + }, + { + "epoch": 0.5, + "learning_rate": 0.00025132198478812023, + "loss": 0.7702, + "step": 2340 + }, + { + "epoch": 0.51, + "learning_rate": 0.0002508873596522999, + "loss": 0.7706, + "step": 2360 + }, + { + "epoch": 0.51, + "learning_rate": 0.0002504527345164795, + "loss": 0.7664, + "step": 2380 + }, + { + "epoch": 0.52, + "learning_rate": 0.00025001810938065917, + "loss": 0.76, + "step": 2400 + }, + { + "epoch": 0.52, + "eval_loss": 0.7723669409751892, + "eval_runtime": 25.4827, + "eval_samples_per_second": 78.485, + "eval_steps_per_second": 1.256, + "step": 2400 + }, + { + "epoch": 0.52, + "learning_rate": 0.00024958348424483883, + "loss": 0.7702, + "step": 2420 + }, + { + "epoch": 0.53, + "learning_rate": 0.00024914885910901844, + "loss": 0.7686, + "step": 2440 + }, + { + "epoch": 0.53, + "learning_rate": 0.0002487142339731981, + "loss": 0.762, + "step": 2460 + }, + { + "epoch": 0.54, + "learning_rate": 0.00024827960883737777, + "loss": 0.7719, + "step": 2480 + }, + { + "epoch": 0.54, + "learning_rate": 0.0002478449837015574, + "loss": 0.7612, + "step": 2500 + }, + { + "epoch": 0.54, + "learning_rate": 0.00024741035856573704, + "loss": 0.7565, + "step": 2520 + }, + { + "epoch": 0.55, + "learning_rate": 0.0002469757334299167, + "loss": 0.7719, + "step": 2540 + }, + { + "epoch": 0.55, + "learning_rate": 0.0002465411082940963, + "loss": 0.7619, + "step": 2560 + }, + { + "epoch": 0.56, + "learning_rate": 0.000246106483158276, + "loss": 0.7607, + "step": 2580 + }, + { + "epoch": 0.56, + "learning_rate": 0.00024567185802245564, + "loss": 0.7564, + "step": 2600 + }, + { + "epoch": 0.56, + "eval_loss": 0.7678729295730591, + "eval_runtime": 25.4455, + "eval_samples_per_second": 78.599, + "eval_steps_per_second": 1.258, + "step": 2600 + }, + { + "epoch": 0.57, + "learning_rate": 0.00024523723288663525, + "loss": 0.7613, + "step": 2620 + }, + { + "epoch": 0.57, + "learning_rate": 0.0002448026077508149, + "loss": 0.7525, + "step": 2640 + }, + { + "epoch": 0.57, + "learning_rate": 0.0002443679826149946, + "loss": 0.7563, + "step": 2660 + }, + { + "epoch": 0.58, + "learning_rate": 0.00024393335747917422, + "loss": 0.7601, + "step": 2680 + }, + { + "epoch": 0.58, + "learning_rate": 0.00024349873234335383, + "loss": 0.7633, + "step": 2700 + }, + { + "epoch": 0.59, + "learning_rate": 0.00024306410720753346, + "loss": 0.75, + "step": 2720 + }, + { + "epoch": 0.59, + "learning_rate": 0.0002426294820717131, + "loss": 0.7602, + "step": 2740 + }, + { + "epoch": 0.6, + "learning_rate": 0.00024219485693589276, + "loss": 0.7546, + "step": 2760 + }, + { + "epoch": 0.6, + "learning_rate": 0.0002417602318000724, + "loss": 0.7532, + "step": 2780 + }, + { + "epoch": 0.6, + "learning_rate": 0.00024132560666425203, + "loss": 0.7661, + "step": 2800 + }, + { + "epoch": 0.6, + "eval_loss": 0.7649803757667542, + "eval_runtime": 25.4783, + "eval_samples_per_second": 78.498, + "eval_steps_per_second": 1.256, + "step": 2800 + }, + { + "epoch": 0.61, + "learning_rate": 0.0002408909815284317, + "loss": 0.7587, + "step": 2820 + }, + { + "epoch": 0.61, + "learning_rate": 0.00024045635639261133, + "loss": 0.7543, + "step": 2840 + }, + { + "epoch": 0.62, + "learning_rate": 0.00024002173125679097, + "loss": 0.7672, + "step": 2860 + }, + { + "epoch": 0.62, + "learning_rate": 0.00023958710612097063, + "loss": 0.7623, + "step": 2880 + }, + { + "epoch": 0.63, + "learning_rate": 0.00023915248098515027, + "loss": 0.7487, + "step": 2900 + }, + { + "epoch": 0.63, + "learning_rate": 0.0002387178558493299, + "loss": 0.75, + "step": 2920 + }, + { + "epoch": 0.63, + "learning_rate": 0.00023828323071350957, + "loss": 0.7567, + "step": 2940 + }, + { + "epoch": 0.64, + "learning_rate": 0.0002378486055776892, + "loss": 0.7592, + "step": 2960 + }, + { + "epoch": 0.64, + "learning_rate": 0.00023741398044186884, + "loss": 0.7569, + "step": 2980 + }, + { + "epoch": 0.65, + "learning_rate": 0.0002369793553060485, + "loss": 0.7524, + "step": 3000 + }, + { + "epoch": 0.65, + "eval_loss": 0.7613279819488525, + "eval_runtime": 25.4837, + "eval_samples_per_second": 78.482, + "eval_steps_per_second": 1.256, + "step": 3000 + }, + { + "epoch": 0.65, + "learning_rate": 0.00023654473017022814, + "loss": 0.7593, + "step": 3020 + }, + { + "epoch": 0.66, + "learning_rate": 0.00023611010503440778, + "loss": 0.7516, + "step": 3040 + }, + { + "epoch": 0.66, + "learning_rate": 0.00023567547989858744, + "loss": 0.7525, + "step": 3060 + }, + { + "epoch": 0.66, + "learning_rate": 0.00023524085476276708, + "loss": 0.7583, + "step": 3080 + }, + { + "epoch": 0.67, + "learning_rate": 0.00023480622962694672, + "loss": 0.7535, + "step": 3100 + }, + { + "epoch": 0.67, + "learning_rate": 0.00023437160449112638, + "loss": 0.7528, + "step": 3120 + }, + { + "epoch": 0.68, + "learning_rate": 0.00023393697935530602, + "loss": 0.7418, + "step": 3140 + }, + { + "epoch": 0.68, + "learning_rate": 0.00023350235421948565, + "loss": 0.7496, + "step": 3160 + }, + { + "epoch": 0.69, + "learning_rate": 0.00023306772908366532, + "loss": 0.7537, + "step": 3180 + }, + { + "epoch": 0.69, + "learning_rate": 0.00023263310394784495, + "loss": 0.7569, + "step": 3200 + }, + { + "epoch": 0.69, + "eval_loss": 0.7581906914710999, + "eval_runtime": 25.4588, + "eval_samples_per_second": 78.558, + "eval_steps_per_second": 1.257, + "step": 3200 + }, + { + "epoch": 0.69, + "learning_rate": 0.0002321984788120246, + "loss": 0.7465, + "step": 3220 + }, + { + "epoch": 0.7, + "learning_rate": 0.00023176385367620425, + "loss": 0.7367, + "step": 3240 + }, + { + "epoch": 0.7, + "learning_rate": 0.0002313292285403839, + "loss": 0.7425, + "step": 3260 + }, + { + "epoch": 0.71, + "learning_rate": 0.00023089460340456353, + "loss": 0.7637, + "step": 3280 + }, + { + "epoch": 0.71, + "learning_rate": 0.0002304599782687432, + "loss": 0.7574, + "step": 3300 + }, + { + "epoch": 0.72, + "learning_rate": 0.00023002535313292283, + "loss": 0.7448, + "step": 3320 + }, + { + "epoch": 0.72, + "learning_rate": 0.00022959072799710246, + "loss": 0.7595, + "step": 3340 + }, + { + "epoch": 0.72, + "learning_rate": 0.00022915610286128213, + "loss": 0.7465, + "step": 3360 + }, + { + "epoch": 0.73, + "learning_rate": 0.00022872147772546176, + "loss": 0.7532, + "step": 3380 + }, + { + "epoch": 0.73, + "learning_rate": 0.0002282868525896414, + "loss": 0.7466, + "step": 3400 + }, + { + "epoch": 0.73, + "eval_loss": 0.7559078931808472, + "eval_runtime": 25.464, + "eval_samples_per_second": 78.542, + "eval_steps_per_second": 1.257, + "step": 3400 + }, + { + "epoch": 0.74, + "learning_rate": 0.00022785222745382106, + "loss": 0.753, + "step": 3420 + }, + { + "epoch": 0.74, + "learning_rate": 0.0002274176023180007, + "loss": 0.7459, + "step": 3440 + }, + { + "epoch": 0.75, + "learning_rate": 0.00022698297718218034, + "loss": 0.7519, + "step": 3460 + }, + { + "epoch": 0.75, + "learning_rate": 0.00022654835204636, + "loss": 0.7451, + "step": 3480 + }, + { + "epoch": 0.76, + "learning_rate": 0.00022611372691053964, + "loss": 0.7468, + "step": 3500 + }, + { + "epoch": 0.76, + "learning_rate": 0.00022567910177471927, + "loss": 0.7491, + "step": 3520 + }, + { + "epoch": 0.76, + "learning_rate": 0.00022524447663889894, + "loss": 0.7524, + "step": 3540 + }, + { + "epoch": 0.77, + "learning_rate": 0.00022480985150307857, + "loss": 0.7484, + "step": 3560 + }, + { + "epoch": 0.77, + "learning_rate": 0.0002243752263672582, + "loss": 0.7484, + "step": 3580 + }, + { + "epoch": 0.78, + "learning_rate": 0.00022394060123143787, + "loss": 0.7529, + "step": 3600 + }, + { + "epoch": 0.78, + "eval_loss": 0.7531791925430298, + "eval_runtime": 25.4572, + "eval_samples_per_second": 78.563, + "eval_steps_per_second": 1.257, + "step": 3600 + }, + { + "epoch": 0.78, + "learning_rate": 0.0002235059760956175, + "loss": 0.7475, + "step": 3620 + }, + { + "epoch": 0.79, + "learning_rate": 0.00022307135095979715, + "loss": 0.7518, + "step": 3640 + }, + { + "epoch": 0.79, + "learning_rate": 0.0002226367258239768, + "loss": 0.751, + "step": 3660 + }, + { + "epoch": 0.79, + "learning_rate": 0.00022220210068815645, + "loss": 0.7402, + "step": 3680 + }, + { + "epoch": 0.8, + "learning_rate": 0.00022176747555233608, + "loss": 0.755, + "step": 3700 + }, + { + "epoch": 0.8, + "learning_rate": 0.00022133285041651575, + "loss": 0.7441, + "step": 3720 + }, + { + "epoch": 0.81, + "learning_rate": 0.00022089822528069538, + "loss": 0.746, + "step": 3740 + }, + { + "epoch": 0.81, + "learning_rate": 0.00022046360014487502, + "loss": 0.7441, + "step": 3760 + }, + { + "epoch": 0.82, + "learning_rate": 0.00022002897500905468, + "loss": 0.7475, + "step": 3780 + }, + { + "epoch": 0.82, + "learning_rate": 0.00021959434987323432, + "loss": 0.7458, + "step": 3800 + }, + { + "epoch": 0.82, + "eval_loss": 0.7513870596885681, + "eval_runtime": 25.4906, + "eval_samples_per_second": 78.46, + "eval_steps_per_second": 1.255, + "step": 3800 + }, + { + "epoch": 0.82, + "learning_rate": 0.00021915972473741396, + "loss": 0.7436, + "step": 3820 + }, + { + "epoch": 0.83, + "learning_rate": 0.00021872509960159362, + "loss": 0.7451, + "step": 3840 + }, + { + "epoch": 0.83, + "learning_rate": 0.00021829047446577326, + "loss": 0.7475, + "step": 3860 + }, + { + "epoch": 0.84, + "learning_rate": 0.0002178558493299529, + "loss": 0.7424, + "step": 3880 + }, + { + "epoch": 0.84, + "learning_rate": 0.00021742122419413256, + "loss": 0.7503, + "step": 3900 + }, + { + "epoch": 0.85, + "learning_rate": 0.0002169865990583122, + "loss": 0.7334, + "step": 3920 + }, + { + "epoch": 0.85, + "learning_rate": 0.00021655197392249183, + "loss": 0.7436, + "step": 3940 + }, + { + "epoch": 0.85, + "learning_rate": 0.0002161173487866715, + "loss": 0.7453, + "step": 3960 + }, + { + "epoch": 0.86, + "learning_rate": 0.00021568272365085113, + "loss": 0.7424, + "step": 3980 + }, + { + "epoch": 0.86, + "learning_rate": 0.00021524809851503076, + "loss": 0.7509, + "step": 4000 + }, + { + "epoch": 0.86, + "eval_loss": 0.7488968968391418, + "eval_runtime": 25.492, + "eval_samples_per_second": 78.456, + "eval_steps_per_second": 1.255, + "step": 4000 + }, + { + "epoch": 0.87, + "learning_rate": 0.00021481347337921043, + "loss": 0.7445, + "step": 4020 + }, + { + "epoch": 0.87, + "learning_rate": 0.00021437884824339006, + "loss": 0.74, + "step": 4040 + }, + { + "epoch": 0.88, + "learning_rate": 0.0002139442231075697, + "loss": 0.7362, + "step": 4060 + }, + { + "epoch": 0.88, + "learning_rate": 0.00021350959797174936, + "loss": 0.7409, + "step": 4080 + }, + { + "epoch": 0.88, + "learning_rate": 0.000213074972835929, + "loss": 0.7315, + "step": 4100 + }, + { + "epoch": 0.89, + "learning_rate": 0.00021264034770010864, + "loss": 0.7488, + "step": 4120 + }, + { + "epoch": 0.89, + "learning_rate": 0.0002122057225642883, + "loss": 0.7375, + "step": 4140 + }, + { + "epoch": 0.9, + "learning_rate": 0.00021177109742846794, + "loss": 0.7481, + "step": 4160 + }, + { + "epoch": 0.9, + "learning_rate": 0.00021133647229264757, + "loss": 0.7524, + "step": 4180 + }, + { + "epoch": 0.91, + "learning_rate": 0.00021092357841361823, + "loss": 0.7403, + "step": 4200 + }, + { + "epoch": 0.91, + "eval_loss": 0.7469983100891113, + "eval_runtime": 25.4847, + "eval_samples_per_second": 78.479, + "eval_steps_per_second": 1.256, + "step": 4200 + }, + { + "epoch": 0.91, + "learning_rate": 0.00021048895327779787, + "loss": 0.7394, + "step": 4220 + }, + { + "epoch": 0.91, + "learning_rate": 0.0002100543281419775, + "loss": 0.7405, + "step": 4240 + }, + { + "epoch": 0.92, + "learning_rate": 0.00020961970300615717, + "loss": 0.7534, + "step": 4260 + }, + { + "epoch": 0.92, + "learning_rate": 0.0002091850778703368, + "loss": 0.7412, + "step": 4280 + }, + { + "epoch": 0.93, + "learning_rate": 0.00020875045273451644, + "loss": 0.7393, + "step": 4300 + }, + { + "epoch": 0.93, + "learning_rate": 0.0002083158275986961, + "loss": 0.7289, + "step": 4320 + }, + { + "epoch": 0.94, + "learning_rate": 0.00020788120246287574, + "loss": 0.7342, + "step": 4340 + }, + { + "epoch": 0.94, + "learning_rate": 0.00020744657732705538, + "loss": 0.7427, + "step": 4360 + }, + { + "epoch": 0.94, + "learning_rate": 0.00020701195219123504, + "loss": 0.7386, + "step": 4380 + }, + { + "epoch": 0.95, + "learning_rate": 0.00020657732705541468, + "loss": 0.7374, + "step": 4400 + }, + { + "epoch": 0.95, + "eval_loss": 0.7451291680335999, + "eval_runtime": 25.461, + "eval_samples_per_second": 78.552, + "eval_steps_per_second": 1.257, + "step": 4400 + }, + { + "epoch": 0.95, + "learning_rate": 0.0002061427019195943, + "loss": 0.7364, + "step": 4420 + }, + { + "epoch": 0.96, + "learning_rate": 0.00020570807678377398, + "loss": 0.7377, + "step": 4440 + }, + { + "epoch": 0.96, + "learning_rate": 0.0002052734516479536, + "loss": 0.7391, + "step": 4460 + }, + { + "epoch": 0.97, + "learning_rate": 0.00020483882651213325, + "loss": 0.731, + "step": 4480 + }, + { + "epoch": 0.97, + "learning_rate": 0.0002044042013763129, + "loss": 0.735, + "step": 4500 + }, + { + "epoch": 0.98, + "learning_rate": 0.00020396957624049255, + "loss": 0.7344, + "step": 4520 + }, + { + "epoch": 0.98, + "learning_rate": 0.00020353495110467219, + "loss": 0.7355, + "step": 4540 + }, + { + "epoch": 0.98, + "learning_rate": 0.00020310032596885185, + "loss": 0.7357, + "step": 4560 + }, + { + "epoch": 0.99, + "learning_rate": 0.00020266570083303149, + "loss": 0.7377, + "step": 4580 + }, + { + "epoch": 0.99, + "learning_rate": 0.00020223107569721112, + "loss": 0.7438, + "step": 4600 + }, + { + "epoch": 0.99, + "eval_loss": 0.7437875270843506, + "eval_runtime": 25.5255, + "eval_samples_per_second": 78.353, + "eval_steps_per_second": 1.254, + "step": 4600 + }, + { + "epoch": 1.0, + "learning_rate": 0.00020179645056139079, + "loss": 0.7343, + "step": 4620 + }, + { + "epoch": 1.0, + "learning_rate": 0.00020136182542557042, + "loss": 0.7473, + "step": 4640 + }, + { + "epoch": 1.01, + "learning_rate": 0.00020092720028975006, + "loss": 0.7305, + "step": 4660 + }, + { + "epoch": 1.01, + "learning_rate": 0.00020049257515392972, + "loss": 0.7284, + "step": 4680 + }, + { + "epoch": 1.01, + "learning_rate": 0.00020005795001810936, + "loss": 0.7335, + "step": 4700 + }, + { + "epoch": 1.02, + "learning_rate": 0.000199623324882289, + "loss": 0.7282, + "step": 4720 + }, + { + "epoch": 1.02, + "learning_rate": 0.00019918869974646866, + "loss": 0.7337, + "step": 4740 + }, + { + "epoch": 1.03, + "learning_rate": 0.0001987540746106483, + "loss": 0.7195, + "step": 4760 + }, + { + "epoch": 1.03, + "learning_rate": 0.00019831944947482793, + "loss": 0.7327, + "step": 4780 + }, + { + "epoch": 1.04, + "learning_rate": 0.0001978848243390076, + "loss": 0.7259, + "step": 4800 + }, + { + "epoch": 1.04, + "eval_loss": 0.7413464188575745, + "eval_runtime": 25.4959, + "eval_samples_per_second": 78.444, + "eval_steps_per_second": 1.255, + "step": 4800 + }, + { + "epoch": 1.04, + "learning_rate": 0.00019745019920318723, + "loss": 0.7263, + "step": 4820 + }, + { + "epoch": 1.04, + "learning_rate": 0.00019701557406736687, + "loss": 0.7341, + "step": 4840 + }, + { + "epoch": 1.05, + "learning_rate": 0.00019658094893154653, + "loss": 0.7406, + "step": 4860 + }, + { + "epoch": 1.05, + "learning_rate": 0.00019614632379572617, + "loss": 0.7309, + "step": 4880 + }, + { + "epoch": 1.06, + "learning_rate": 0.0001957116986599058, + "loss": 0.7274, + "step": 4900 + }, + { + "epoch": 1.06, + "learning_rate": 0.00019527707352408547, + "loss": 0.7241, + "step": 4920 + }, + { + "epoch": 1.07, + "learning_rate": 0.0001948424483882651, + "loss": 0.7368, + "step": 4940 + }, + { + "epoch": 1.07, + "learning_rate": 0.00019440782325244474, + "loss": 0.7445, + "step": 4960 + }, + { + "epoch": 1.07, + "learning_rate": 0.0001939731981166244, + "loss": 0.7347, + "step": 4980 + }, + { + "epoch": 1.08, + "learning_rate": 0.00019353857298080404, + "loss": 0.7436, + "step": 5000 + }, + { + "epoch": 1.08, + "eval_loss": 0.7399871945381165, + "eval_runtime": 25.5032, + "eval_samples_per_second": 78.422, + "eval_steps_per_second": 1.255, + "step": 5000 + }, + { + "epoch": 1.08, + "learning_rate": 0.00019310394784498368, + "loss": 0.7248, + "step": 5020 + }, + { + "epoch": 1.09, + "learning_rate": 0.00019266932270916334, + "loss": 0.7374, + "step": 5040 + }, + { + "epoch": 1.09, + "learning_rate": 0.00019223469757334298, + "loss": 0.7187, + "step": 5060 + }, + { + "epoch": 1.1, + "learning_rate": 0.00019180007243752261, + "loss": 0.7381, + "step": 5080 + }, + { + "epoch": 1.1, + "learning_rate": 0.00019136544730170228, + "loss": 0.7389, + "step": 5100 + }, + { + "epoch": 1.1, + "learning_rate": 0.00019093082216588191, + "loss": 0.7343, + "step": 5120 + }, + { + "epoch": 1.11, + "learning_rate": 0.00019049619703006155, + "loss": 0.7323, + "step": 5140 + }, + { + "epoch": 1.11, + "learning_rate": 0.00019006157189424121, + "loss": 0.723, + "step": 5160 + }, + { + "epoch": 1.12, + "learning_rate": 0.00018962694675842085, + "loss": 0.7236, + "step": 5180 + }, + { + "epoch": 1.12, + "learning_rate": 0.0001891923216226005, + "loss": 0.7399, + "step": 5200 + }, + { + "epoch": 1.12, + "eval_loss": 0.7393975257873535, + "eval_runtime": 25.6137, + "eval_samples_per_second": 78.083, + "eval_steps_per_second": 1.249, + "step": 5200 + }, + { + "epoch": 1.13, + "learning_rate": 0.00018875769648678015, + "loss": 0.7373, + "step": 5220 + }, + { + "epoch": 1.13, + "learning_rate": 0.0001883230713509598, + "loss": 0.7257, + "step": 5240 + }, + { + "epoch": 1.13, + "learning_rate": 0.00018788844621513942, + "loss": 0.7261, + "step": 5260 + }, + { + "epoch": 1.14, + "learning_rate": 0.0001874538210793191, + "loss": 0.7302, + "step": 5280 + }, + { + "epoch": 1.14, + "learning_rate": 0.00018701919594349872, + "loss": 0.7337, + "step": 5300 + }, + { + "epoch": 1.15, + "learning_rate": 0.00018658457080767836, + "loss": 0.7237, + "step": 5320 + }, + { + "epoch": 1.15, + "learning_rate": 0.00018614994567185802, + "loss": 0.7238, + "step": 5340 + }, + { + "epoch": 1.16, + "learning_rate": 0.00018571532053603766, + "loss": 0.7287, + "step": 5360 + }, + { + "epoch": 1.16, + "learning_rate": 0.0001852806954002173, + "loss": 0.7237, + "step": 5380 + }, + { + "epoch": 1.17, + "learning_rate": 0.00018484607026439696, + "loss": 0.7256, + "step": 5400 + }, + { + "epoch": 1.17, + "eval_loss": 0.7377527952194214, + "eval_runtime": 25.4964, + "eval_samples_per_second": 78.442, + "eval_steps_per_second": 1.255, + "step": 5400 + }, + { + "epoch": 1.17, + "learning_rate": 0.0001844114451285766, + "loss": 0.7279, + "step": 5420 + }, + { + "epoch": 1.17, + "learning_rate": 0.00018397681999275623, + "loss": 0.7226, + "step": 5440 + }, + { + "epoch": 1.18, + "learning_rate": 0.0001835421948569359, + "loss": 0.7167, + "step": 5460 + }, + { + "epoch": 1.18, + "learning_rate": 0.00018310756972111553, + "loss": 0.7268, + "step": 5480 + }, + { + "epoch": 1.19, + "learning_rate": 0.00018267294458529517, + "loss": 0.7398, + "step": 5500 + }, + { + "epoch": 1.19, + "learning_rate": 0.00018223831944947483, + "loss": 0.7331, + "step": 5520 + }, + { + "epoch": 1.2, + "learning_rate": 0.00018180369431365447, + "loss": 0.7372, + "step": 5540 + }, + { + "epoch": 1.2, + "learning_rate": 0.0001813690691778341, + "loss": 0.7321, + "step": 5560 + }, + { + "epoch": 1.2, + "learning_rate": 0.00018093444404201377, + "loss": 0.7346, + "step": 5580 + }, + { + "epoch": 1.21, + "learning_rate": 0.0001804998189061934, + "loss": 0.722, + "step": 5600 + }, + { + "epoch": 1.21, + "eval_loss": 0.7368175983428955, + "eval_runtime": 25.5045, + "eval_samples_per_second": 78.417, + "eval_steps_per_second": 1.255, + "step": 5600 + }, + { + "epoch": 1.21, + "learning_rate": 0.00018006519377037304, + "loss": 0.7279, + "step": 5620 + }, + { + "epoch": 1.22, + "learning_rate": 0.0001796305686345527, + "loss": 0.72, + "step": 5640 + }, + { + "epoch": 1.22, + "learning_rate": 0.00017919594349873234, + "loss": 0.7295, + "step": 5660 + }, + { + "epoch": 1.23, + "learning_rate": 0.00017876131836291198, + "loss": 0.7245, + "step": 5680 + }, + { + "epoch": 1.23, + "learning_rate": 0.00017832669322709164, + "loss": 0.7418, + "step": 5700 + }, + { + "epoch": 1.23, + "learning_rate": 0.00017789206809127128, + "loss": 0.7317, + "step": 5720 + }, + { + "epoch": 1.24, + "learning_rate": 0.00017745744295545092, + "loss": 0.7303, + "step": 5740 + }, + { + "epoch": 1.24, + "learning_rate": 0.00017702281781963058, + "loss": 0.7332, + "step": 5760 + }, + { + "epoch": 1.25, + "learning_rate": 0.00017658819268381022, + "loss": 0.7202, + "step": 5780 + }, + { + "epoch": 1.25, + "learning_rate": 0.00017615356754798983, + "loss": 0.7238, + "step": 5800 + }, + { + "epoch": 1.25, + "eval_loss": 0.7348505854606628, + "eval_runtime": 25.509, + "eval_samples_per_second": 78.404, + "eval_steps_per_second": 1.254, + "step": 5800 + }, + { + "epoch": 1.26, + "learning_rate": 0.00017571894241216946, + "loss": 0.724, + "step": 5820 + }, + { + "epoch": 1.26, + "learning_rate": 0.00017528431727634913, + "loss": 0.7258, + "step": 5840 + }, + { + "epoch": 1.26, + "learning_rate": 0.00017484969214052876, + "loss": 0.7217, + "step": 5860 + }, + { + "epoch": 1.27, + "learning_rate": 0.0001744150670047084, + "loss": 0.7209, + "step": 5880 + }, + { + "epoch": 1.27, + "learning_rate": 0.00017398044186888806, + "loss": 0.7276, + "step": 5900 + }, + { + "epoch": 1.28, + "learning_rate": 0.0001735458167330677, + "loss": 0.7287, + "step": 5920 + }, + { + "epoch": 1.28, + "learning_rate": 0.00017311119159724733, + "loss": 0.7244, + "step": 5940 + }, + { + "epoch": 1.29, + "learning_rate": 0.000172676566461427, + "loss": 0.7247, + "step": 5960 + }, + { + "epoch": 1.29, + "learning_rate": 0.00017224194132560663, + "loss": 0.7191, + "step": 5980 + }, + { + "epoch": 1.29, + "learning_rate": 0.00017180731618978627, + "loss": 0.7208, + "step": 6000 + }, + { + "epoch": 1.29, + "eval_loss": 0.7340711951255798, + "eval_runtime": 25.4669, + "eval_samples_per_second": 78.533, + "eval_steps_per_second": 1.257, + "step": 6000 + }, + { + "epoch": 1.3, + "learning_rate": 0.00017137269105396593, + "loss": 0.7285, + "step": 6020 + }, + { + "epoch": 1.3, + "learning_rate": 0.00017093806591814557, + "loss": 0.7294, + "step": 6040 + }, + { + "epoch": 1.31, + "learning_rate": 0.0001705034407823252, + "loss": 0.7365, + "step": 6060 + }, + { + "epoch": 1.31, + "learning_rate": 0.00017006881564650487, + "loss": 0.7149, + "step": 6080 + }, + { + "epoch": 1.32, + "learning_rate": 0.0001696341905106845, + "loss": 0.7229, + "step": 6100 + }, + { + "epoch": 1.32, + "learning_rate": 0.00016919956537486414, + "loss": 0.7253, + "step": 6120 + }, + { + "epoch": 1.32, + "learning_rate": 0.0001687649402390438, + "loss": 0.7188, + "step": 6140 + }, + { + "epoch": 1.33, + "learning_rate": 0.00016833031510322344, + "loss": 0.7308, + "step": 6160 + }, + { + "epoch": 1.33, + "learning_rate": 0.00016789568996740308, + "loss": 0.7186, + "step": 6180 + }, + { + "epoch": 1.34, + "learning_rate": 0.00016746106483158274, + "loss": 0.7121, + "step": 6200 + }, + { + "epoch": 1.34, + "eval_loss": 0.7324739694595337, + "eval_runtime": 25.5, + "eval_samples_per_second": 78.431, + "eval_steps_per_second": 1.255, + "step": 6200 + }, + { + "epoch": 1.34, + "learning_rate": 0.00016702643969576238, + "loss": 0.7286, + "step": 6220 + }, + { + "epoch": 1.35, + "learning_rate": 0.00016659181455994202, + "loss": 0.7246, + "step": 6240 + }, + { + "epoch": 1.35, + "learning_rate": 0.00016615718942412168, + "loss": 0.7234, + "step": 6260 + }, + { + "epoch": 1.35, + "learning_rate": 0.00016572256428830132, + "loss": 0.7245, + "step": 6280 + }, + { + "epoch": 1.36, + "learning_rate": 0.00016528793915248095, + "loss": 0.7252, + "step": 6300 + }, + { + "epoch": 1.36, + "learning_rate": 0.00016485331401666062, + "loss": 0.7259, + "step": 6320 + }, + { + "epoch": 1.37, + "learning_rate": 0.00016441868888084025, + "loss": 0.7173, + "step": 6340 + }, + { + "epoch": 1.37, + "learning_rate": 0.0001639840637450199, + "loss": 0.7222, + "step": 6360 + }, + { + "epoch": 1.38, + "learning_rate": 0.00016354943860919955, + "loss": 0.7113, + "step": 6380 + }, + { + "epoch": 1.38, + "learning_rate": 0.0001631148134733792, + "loss": 0.72, + "step": 6400 + }, + { + "epoch": 1.38, + "eval_loss": 0.7319995164871216, + "eval_runtime": 25.5112, + "eval_samples_per_second": 78.397, + "eval_steps_per_second": 1.254, + "step": 6400 + }, + { + "epoch": 1.39, + "learning_rate": 0.00016268018833755883, + "loss": 0.7333, + "step": 6420 + }, + { + "epoch": 1.39, + "learning_rate": 0.0001622455632017385, + "loss": 0.7208, + "step": 6440 + }, + { + "epoch": 1.39, + "learning_rate": 0.00016181093806591813, + "loss": 0.7161, + "step": 6460 + }, + { + "epoch": 1.4, + "learning_rate": 0.00016137631293009776, + "loss": 0.7171, + "step": 6480 + }, + { + "epoch": 1.4, + "learning_rate": 0.00016094168779427743, + "loss": 0.7297, + "step": 6500 + }, + { + "epoch": 1.41, + "learning_rate": 0.00016050706265845706, + "loss": 0.7156, + "step": 6520 + }, + { + "epoch": 1.41, + "learning_rate": 0.0001600724375226367, + "loss": 0.7175, + "step": 6540 + }, + { + "epoch": 1.42, + "learning_rate": 0.00015963781238681636, + "loss": 0.7152, + "step": 6560 + }, + { + "epoch": 1.42, + "learning_rate": 0.000159203187250996, + "loss": 0.7282, + "step": 6580 + }, + { + "epoch": 1.42, + "learning_rate": 0.00015876856211517564, + "loss": 0.722, + "step": 6600 + }, + { + "epoch": 1.42, + "eval_loss": 0.7307416796684265, + "eval_runtime": 25.4967, + "eval_samples_per_second": 78.442, + "eval_steps_per_second": 1.255, + "step": 6600 + }, + { + "epoch": 1.43, + "learning_rate": 0.0001583339369793553, + "loss": 0.7274, + "step": 6620 + }, + { + "epoch": 1.43, + "learning_rate": 0.00015789931184353494, + "loss": 0.7313, + "step": 6640 + }, + { + "epoch": 1.44, + "learning_rate": 0.00015746468670771457, + "loss": 0.7209, + "step": 6660 + }, + { + "epoch": 1.44, + "learning_rate": 0.00015703006157189424, + "loss": 0.7202, + "step": 6680 + }, + { + "epoch": 1.45, + "learning_rate": 0.00015659543643607387, + "loss": 0.7264, + "step": 6700 + }, + { + "epoch": 1.45, + "learning_rate": 0.0001561608113002535, + "loss": 0.7226, + "step": 6720 + }, + { + "epoch": 1.45, + "learning_rate": 0.00015572618616443317, + "loss": 0.711, + "step": 6740 + }, + { + "epoch": 1.46, + "learning_rate": 0.0001552915610286128, + "loss": 0.7216, + "step": 6760 + }, + { + "epoch": 1.46, + "learning_rate": 0.00015485693589279245, + "loss": 0.7184, + "step": 6780 + }, + { + "epoch": 1.47, + "learning_rate": 0.0001544223107569721, + "loss": 0.7216, + "step": 6800 + }, + { + "epoch": 1.47, + "eval_loss": 0.7297094464302063, + "eval_runtime": 25.4826, + "eval_samples_per_second": 78.485, + "eval_steps_per_second": 1.256, + "step": 6800 + }, + { + "epoch": 1.47, + "learning_rate": 0.00015398768562115175, + "loss": 0.7203, + "step": 6820 + }, + { + "epoch": 1.48, + "learning_rate": 0.00015355306048533138, + "loss": 0.7184, + "step": 6840 + }, + { + "epoch": 1.48, + "learning_rate": 0.00015311843534951105, + "loss": 0.7183, + "step": 6860 + }, + { + "epoch": 1.48, + "learning_rate": 0.00015268381021369068, + "loss": 0.7267, + "step": 6880 + }, + { + "epoch": 1.49, + "learning_rate": 0.00015224918507787032, + "loss": 0.7299, + "step": 6900 + }, + { + "epoch": 1.49, + "learning_rate": 0.00015181455994204998, + "loss": 0.719, + "step": 6920 + }, + { + "epoch": 1.5, + "learning_rate": 0.00015137993480622962, + "loss": 0.7229, + "step": 6940 + }, + { + "epoch": 1.5, + "learning_rate": 0.00015094530967040926, + "loss": 0.7231, + "step": 6960 + }, + { + "epoch": 1.51, + "learning_rate": 0.00015051068453458892, + "loss": 0.7279, + "step": 6980 + }, + { + "epoch": 1.51, + "learning_rate": 0.00015007605939876856, + "loss": 0.7252, + "step": 7000 + }, + { + "epoch": 1.51, + "eval_loss": 0.7288112640380859, + "eval_runtime": 25.4887, + "eval_samples_per_second": 78.466, + "eval_steps_per_second": 1.255, + "step": 7000 + }, + { + "epoch": 1.51, + "learning_rate": 0.0001496414342629482, + "loss": 0.7148, + "step": 7020 + }, + { + "epoch": 1.52, + "learning_rate": 0.00014920680912712786, + "loss": 0.7147, + "step": 7040 + }, + { + "epoch": 1.52, + "learning_rate": 0.0001487721839913075, + "loss": 0.7209, + "step": 7060 + }, + { + "epoch": 1.53, + "learning_rate": 0.00014833755885548713, + "loss": 0.724, + "step": 7080 + }, + { + "epoch": 1.53, + "learning_rate": 0.00014790293371966676, + "loss": 0.7256, + "step": 7100 + }, + { + "epoch": 1.54, + "learning_rate": 0.0001474683085838464, + "loss": 0.7246, + "step": 7120 + }, + { + "epoch": 1.54, + "learning_rate": 0.00014703368344802606, + "loss": 0.7103, + "step": 7140 + }, + { + "epoch": 1.54, + "learning_rate": 0.0001465990583122057, + "loss": 0.7223, + "step": 7160 + }, + { + "epoch": 1.55, + "learning_rate": 0.00014616443317638534, + "loss": 0.7149, + "step": 7180 + }, + { + "epoch": 1.55, + "learning_rate": 0.000145729808040565, + "loss": 0.7214, + "step": 7200 + }, + { + "epoch": 1.55, + "eval_loss": 0.7280930876731873, + "eval_runtime": 25.4883, + "eval_samples_per_second": 78.467, + "eval_steps_per_second": 1.255, + "step": 7200 + }, + { + "epoch": 1.56, + "learning_rate": 0.00014529518290474464, + "loss": 0.7118, + "step": 7220 + }, + { + "epoch": 1.56, + "learning_rate": 0.00014486055776892427, + "loss": 0.7171, + "step": 7240 + }, + { + "epoch": 1.57, + "learning_rate": 0.00014442593263310394, + "loss": 0.7191, + "step": 7260 + }, + { + "epoch": 1.57, + "learning_rate": 0.00014399130749728357, + "loss": 0.7155, + "step": 7280 + }, + { + "epoch": 1.57, + "learning_rate": 0.0001435566823614632, + "loss": 0.7198, + "step": 7300 + }, + { + "epoch": 1.58, + "learning_rate": 0.00014312205722564287, + "loss": 0.7188, + "step": 7320 + }, + { + "epoch": 1.58, + "learning_rate": 0.0001426874320898225, + "loss": 0.7236, + "step": 7340 + }, + { + "epoch": 1.59, + "learning_rate": 0.00014225280695400215, + "loss": 0.712, + "step": 7360 + }, + { + "epoch": 1.59, + "learning_rate": 0.0001418181818181818, + "loss": 0.7181, + "step": 7380 + }, + { + "epoch": 1.6, + "learning_rate": 0.00014138355668236145, + "loss": 0.7198, + "step": 7400 + }, + { + "epoch": 1.6, + "eval_loss": 0.7276077270507812, + "eval_runtime": 25.4843, + "eval_samples_per_second": 78.48, + "eval_steps_per_second": 1.256, + "step": 7400 + }, + { + "epoch": 1.6, + "learning_rate": 0.00014094893154654108, + "loss": 0.7187, + "step": 7420 + }, + { + "epoch": 1.61, + "learning_rate": 0.00014051430641072075, + "loss": 0.7153, + "step": 7440 + }, + { + "epoch": 1.61, + "learning_rate": 0.00014007968127490038, + "loss": 0.7208, + "step": 7460 + }, + { + "epoch": 1.61, + "learning_rate": 0.00013964505613908002, + "loss": 0.7153, + "step": 7480 + }, + { + "epoch": 1.62, + "learning_rate": 0.00013921043100325968, + "loss": 0.7207, + "step": 7500 + }, + { + "epoch": 1.62, + "learning_rate": 0.00013877580586743932, + "loss": 0.7167, + "step": 7520 + }, + { + "epoch": 1.63, + "learning_rate": 0.00013834118073161896, + "loss": 0.7183, + "step": 7540 + }, + { + "epoch": 1.63, + "learning_rate": 0.00013792828685258964, + "loss": 0.7196, + "step": 7560 + }, + { + "epoch": 1.64, + "learning_rate": 0.00013749366171676928, + "loss": 0.7233, + "step": 7580 + }, + { + "epoch": 1.64, + "learning_rate": 0.00013705903658094894, + "loss": 0.7237, + "step": 7600 + }, + { + "epoch": 1.64, + "eval_loss": 0.7260885238647461, + "eval_runtime": 25.503, + "eval_samples_per_second": 78.422, + "eval_steps_per_second": 1.255, + "step": 7600 + }, + { + "epoch": 1.64, + "learning_rate": 0.00013662441144512855, + "loss": 0.72, + "step": 7620 + }, + { + "epoch": 1.65, + "learning_rate": 0.0001361897863093082, + "loss": 0.7094, + "step": 7640 + }, + { + "epoch": 1.65, + "learning_rate": 0.00013575516117348785, + "loss": 0.7111, + "step": 7660 + }, + { + "epoch": 1.66, + "learning_rate": 0.00013532053603766749, + "loss": 0.7182, + "step": 7680 + }, + { + "epoch": 1.66, + "learning_rate": 0.00013488591090184715, + "loss": 0.7182, + "step": 7700 + }, + { + "epoch": 1.67, + "learning_rate": 0.00013445128576602679, + "loss": 0.7183, + "step": 7720 + }, + { + "epoch": 1.67, + "learning_rate": 0.00013401666063020642, + "loss": 0.7112, + "step": 7740 + }, + { + "epoch": 1.67, + "learning_rate": 0.00013358203549438609, + "loss": 0.7183, + "step": 7760 + }, + { + "epoch": 1.68, + "learning_rate": 0.00013314741035856572, + "loss": 0.7152, + "step": 7780 + }, + { + "epoch": 1.68, + "learning_rate": 0.00013271278522274536, + "loss": 0.7233, + "step": 7800 + }, + { + "epoch": 1.68, + "eval_loss": 0.7252987027168274, + "eval_runtime": 25.5066, + "eval_samples_per_second": 78.411, + "eval_steps_per_second": 1.255, + "step": 7800 + }, + { + "epoch": 1.69, + "learning_rate": 0.00013227816008692502, + "loss": 0.7124, + "step": 7820 + }, + { + "epoch": 1.69, + "learning_rate": 0.00013184353495110466, + "loss": 0.7109, + "step": 7840 + }, + { + "epoch": 1.7, + "learning_rate": 0.0001314089098152843, + "loss": 0.7132, + "step": 7860 + }, + { + "epoch": 1.7, + "learning_rate": 0.00013097428467946396, + "loss": 0.7157, + "step": 7880 + }, + { + "epoch": 1.7, + "learning_rate": 0.0001305396595436436, + "loss": 0.7237, + "step": 7900 + }, + { + "epoch": 1.71, + "learning_rate": 0.00013010503440782323, + "loss": 0.7176, + "step": 7920 + }, + { + "epoch": 1.71, + "learning_rate": 0.0001296704092720029, + "loss": 0.7199, + "step": 7940 + }, + { + "epoch": 1.72, + "learning_rate": 0.00012923578413618253, + "loss": 0.7119, + "step": 7960 + }, + { + "epoch": 1.72, + "learning_rate": 0.00012880115900036217, + "loss": 0.717, + "step": 7980 + }, + { + "epoch": 1.73, + "learning_rate": 0.00012836653386454183, + "loss": 0.7155, + "step": 8000 + }, + { + "epoch": 1.73, + "eval_loss": 0.7248360514640808, + "eval_runtime": 25.5301, + "eval_samples_per_second": 78.339, + "eval_steps_per_second": 1.253, + "step": 8000 + }, + { + "epoch": 1.73, + "learning_rate": 0.00012793190872872147, + "loss": 0.7085, + "step": 8020 + }, + { + "epoch": 1.73, + "learning_rate": 0.0001274972835929011, + "loss": 0.7174, + "step": 8040 + }, + { + "epoch": 1.74, + "learning_rate": 0.00012706265845708077, + "loss": 0.7224, + "step": 8060 + }, + { + "epoch": 1.74, + "learning_rate": 0.0001266280333212604, + "loss": 0.7169, + "step": 8080 + }, + { + "epoch": 1.75, + "learning_rate": 0.00012619340818544004, + "loss": 0.7191, + "step": 8100 + }, + { + "epoch": 1.75, + "learning_rate": 0.0001257587830496197, + "loss": 0.7179, + "step": 8120 + }, + { + "epoch": 1.76, + "learning_rate": 0.00012532415791379934, + "loss": 0.7208, + "step": 8140 + }, + { + "epoch": 1.76, + "learning_rate": 0.00012488953277797898, + "loss": 0.7168, + "step": 8160 + }, + { + "epoch": 1.76, + "learning_rate": 0.00012445490764215864, + "loss": 0.7101, + "step": 8180 + }, + { + "epoch": 1.77, + "learning_rate": 0.00012402028250633828, + "loss": 0.7167, + "step": 8200 + }, + { + "epoch": 1.77, + "eval_loss": 0.7242170572280884, + "eval_runtime": 25.4873, + "eval_samples_per_second": 78.47, + "eval_steps_per_second": 1.256, + "step": 8200 + }, + { + "epoch": 1.77, + "learning_rate": 0.00012358565737051791, + "loss": 0.7062, + "step": 8220 + }, + { + "epoch": 1.78, + "learning_rate": 0.00012315103223469758, + "loss": 0.7177, + "step": 8240 + }, + { + "epoch": 1.78, + "learning_rate": 0.00012271640709887721, + "loss": 0.7035, + "step": 8260 + }, + { + "epoch": 1.79, + "learning_rate": 0.00012228178196305685, + "loss": 0.7157, + "step": 8280 + }, + { + "epoch": 1.79, + "learning_rate": 0.0001218471568272365, + "loss": 0.7196, + "step": 8300 + }, + { + "epoch": 1.8, + "learning_rate": 0.00012141253169141615, + "loss": 0.7105, + "step": 8320 + }, + { + "epoch": 1.8, + "learning_rate": 0.00012097790655559579, + "loss": 0.7105, + "step": 8340 + }, + { + "epoch": 1.8, + "learning_rate": 0.00012054328141977544, + "loss": 0.7139, + "step": 8360 + }, + { + "epoch": 1.81, + "learning_rate": 0.00012010865628395509, + "loss": 0.7215, + "step": 8380 + }, + { + "epoch": 1.81, + "learning_rate": 0.00011967403114813472, + "loss": 0.725, + "step": 8400 + }, + { + "epoch": 1.81, + "eval_loss": 0.7237139344215393, + "eval_runtime": 25.506, + "eval_samples_per_second": 78.413, + "eval_steps_per_second": 1.255, + "step": 8400 + }, + { + "epoch": 1.82, + "learning_rate": 0.00011923940601231437, + "loss": 0.7107, + "step": 8420 + }, + { + "epoch": 1.82, + "learning_rate": 0.00011880478087649402, + "loss": 0.7095, + "step": 8440 + }, + { + "epoch": 1.83, + "learning_rate": 0.00011837015574067366, + "loss": 0.7061, + "step": 8460 + }, + { + "epoch": 1.83, + "learning_rate": 0.0001179355306048533, + "loss": 0.716, + "step": 8480 + }, + { + "epoch": 1.83, + "learning_rate": 0.00011750090546903295, + "loss": 0.7203, + "step": 8500 + }, + { + "epoch": 1.84, + "learning_rate": 0.00011706628033321258, + "loss": 0.7098, + "step": 8520 + }, + { + "epoch": 1.84, + "learning_rate": 0.00011663165519739223, + "loss": 0.7104, + "step": 8540 + }, + { + "epoch": 1.85, + "learning_rate": 0.00011619703006157188, + "loss": 0.7051, + "step": 8560 + }, + { + "epoch": 1.85, + "learning_rate": 0.00011576240492575152, + "loss": 0.7198, + "step": 8580 + }, + { + "epoch": 1.86, + "learning_rate": 0.00011532777978993117, + "loss": 0.7175, + "step": 8600 + }, + { + "epoch": 1.86, + "eval_loss": 0.7230754494667053, + "eval_runtime": 25.5133, + "eval_samples_per_second": 78.39, + "eval_steps_per_second": 1.254, + "step": 8600 + }, + { + "epoch": 1.86, + "learning_rate": 0.00011489315465411082, + "loss": 0.7046, + "step": 8620 + }, + { + "epoch": 1.86, + "learning_rate": 0.00011445852951829046, + "loss": 0.7176, + "step": 8640 + }, + { + "epoch": 1.87, + "learning_rate": 0.0001140239043824701, + "loss": 0.7193, + "step": 8660 + }, + { + "epoch": 1.87, + "learning_rate": 0.00011358927924664976, + "loss": 0.7046, + "step": 8680 + }, + { + "epoch": 1.88, + "learning_rate": 0.00011315465411082939, + "loss": 0.7116, + "step": 8700 + }, + { + "epoch": 1.88, + "learning_rate": 0.00011274176023180006, + "loss": 0.7152, + "step": 8720 + }, + { + "epoch": 1.89, + "learning_rate": 0.00011230713509597971, + "loss": 0.7164, + "step": 8740 + }, + { + "epoch": 1.89, + "learning_rate": 0.00011187250996015936, + "loss": 0.7192, + "step": 8760 + }, + { + "epoch": 1.89, + "learning_rate": 0.000111437884824339, + "loss": 0.7124, + "step": 8780 + }, + { + "epoch": 1.9, + "learning_rate": 0.00011100325968851865, + "loss": 0.7032, + "step": 8800 + }, + { + "epoch": 1.9, + "eval_loss": 0.7217770218849182, + "eval_runtime": 25.4723, + "eval_samples_per_second": 78.517, + "eval_steps_per_second": 1.256, + "step": 8800 + }, + { + "epoch": 1.9, + "learning_rate": 0.0001105686345526983, + "loss": 0.7157, + "step": 8820 + }, + { + "epoch": 1.91, + "learning_rate": 0.00011013400941687794, + "loss": 0.7115, + "step": 8840 + }, + { + "epoch": 1.91, + "learning_rate": 0.00010969938428105759, + "loss": 0.7137, + "step": 8860 + }, + { + "epoch": 1.92, + "learning_rate": 0.00010926475914523724, + "loss": 0.7176, + "step": 8880 + }, + { + "epoch": 1.92, + "learning_rate": 0.00010883013400941687, + "loss": 0.7081, + "step": 8900 + }, + { + "epoch": 1.92, + "learning_rate": 0.00010839550887359652, + "loss": 0.7233, + "step": 8920 + }, + { + "epoch": 1.93, + "learning_rate": 0.00010796088373777617, + "loss": 0.7058, + "step": 8940 + }, + { + "epoch": 1.93, + "learning_rate": 0.00010752625860195581, + "loss": 0.7154, + "step": 8960 + }, + { + "epoch": 1.94, + "learning_rate": 0.00010709163346613546, + "loss": 0.7135, + "step": 8980 + }, + { + "epoch": 1.94, + "learning_rate": 0.00010665700833031508, + "loss": 0.7078, + "step": 9000 + }, + { + "epoch": 1.94, + "eval_loss": 0.7215875387191772, + "eval_runtime": 25.484, + "eval_samples_per_second": 78.481, + "eval_steps_per_second": 1.256, + "step": 9000 + }, + { + "epoch": 1.95, + "learning_rate": 0.00010622238319449473, + "loss": 0.7061, + "step": 9020 + }, + { + "epoch": 1.95, + "learning_rate": 0.00010578775805867438, + "loss": 0.7174, + "step": 9040 + }, + { + "epoch": 1.95, + "learning_rate": 0.00010535313292285402, + "loss": 0.7132, + "step": 9060 + }, + { + "epoch": 1.96, + "learning_rate": 0.00010491850778703367, + "loss": 0.7247, + "step": 9080 + }, + { + "epoch": 1.96, + "learning_rate": 0.00010448388265121332, + "loss": 0.7064, + "step": 9100 + }, + { + "epoch": 1.97, + "learning_rate": 0.00010404925751539295, + "loss": 0.7098, + "step": 9120 + }, + { + "epoch": 1.97, + "learning_rate": 0.0001036146323795726, + "loss": 0.708, + "step": 9140 + }, + { + "epoch": 1.98, + "learning_rate": 0.00010318000724375225, + "loss": 0.7144, + "step": 9160 + }, + { + "epoch": 1.98, + "learning_rate": 0.00010274538210793189, + "loss": 0.7151, + "step": 9180 + }, + { + "epoch": 1.98, + "learning_rate": 0.00010231075697211154, + "loss": 0.718, + "step": 9200 + }, + { + "epoch": 1.98, + "eval_loss": 0.7208251357078552, + "eval_runtime": 25.5022, + "eval_samples_per_second": 78.425, + "eval_steps_per_second": 1.255, + "step": 9200 + }, + { + "epoch": 1.99, + "learning_rate": 0.00010187613183629119, + "loss": 0.7108, + "step": 9220 + }, + { + "epoch": 1.99, + "learning_rate": 0.00010144150670047083, + "loss": 0.6952, + "step": 9240 + }, + { + "epoch": 2.0, + "learning_rate": 0.00010100688156465048, + "loss": 0.7013, + "step": 9260 + }, + { + "epoch": 2.0, + "learning_rate": 0.00010057225642883013, + "loss": 0.7013, + "step": 9280 + }, + { + "epoch": 2.01, + "learning_rate": 0.00010013763129300976, + "loss": 0.7049, + "step": 9300 + }, + { + "epoch": 2.01, + "learning_rate": 9.970300615718941e-05, + "loss": 0.7093, + "step": 9320 + }, + { + "epoch": 2.02, + "learning_rate": 9.926838102136906e-05, + "loss": 0.713, + "step": 9340 + }, + { + "epoch": 2.02, + "learning_rate": 9.88337558855487e-05, + "loss": 0.7108, + "step": 9360 + }, + { + "epoch": 2.02, + "learning_rate": 9.839913074972835e-05, + "loss": 0.7115, + "step": 9380 + }, + { + "epoch": 2.03, + "learning_rate": 9.7964505613908e-05, + "loss": 0.7119, + "step": 9400 + }, + { + "epoch": 2.03, + "eval_loss": 0.7202969789505005, + "eval_runtime": 25.504, + "eval_samples_per_second": 78.419, + "eval_steps_per_second": 1.255, + "step": 9400 + }, + { + "epoch": 2.03, + "learning_rate": 9.752988047808764e-05, + "loss": 0.7107, + "step": 9420 + }, + { + "epoch": 2.04, + "learning_rate": 9.709525534226729e-05, + "loss": 0.7065, + "step": 9440 + }, + { + "epoch": 2.04, + "learning_rate": 9.666063020644694e-05, + "loss": 0.7121, + "step": 9460 + }, + { + "epoch": 2.05, + "learning_rate": 9.622600507062657e-05, + "loss": 0.7163, + "step": 9480 + }, + { + "epoch": 2.05, + "learning_rate": 9.579137993480622e-05, + "loss": 0.7026, + "step": 9500 + }, + { + "epoch": 2.05, + "learning_rate": 9.535675479898587e-05, + "loss": 0.7158, + "step": 9520 + }, + { + "epoch": 2.06, + "learning_rate": 9.492212966316551e-05, + "loss": 0.7016, + "step": 9540 + }, + { + "epoch": 2.06, + "learning_rate": 9.448750452734516e-05, + "loss": 0.7149, + "step": 9560 + }, + { + "epoch": 2.07, + "learning_rate": 9.405287939152481e-05, + "loss": 0.7079, + "step": 9580 + }, + { + "epoch": 2.07, + "learning_rate": 9.361825425570445e-05, + "loss": 0.709, + "step": 9600 + }, + { + "epoch": 2.07, + "eval_loss": 0.7194134593009949, + "eval_runtime": 25.5286, + "eval_samples_per_second": 78.343, + "eval_steps_per_second": 1.253, + "step": 9600 + }, + { + "epoch": 2.08, + "learning_rate": 9.31836291198841e-05, + "loss": 0.7127, + "step": 9620 + }, + { + "epoch": 2.08, + "learning_rate": 9.274900398406375e-05, + "loss": 0.7037, + "step": 9640 + }, + { + "epoch": 2.08, + "learning_rate": 9.231437884824338e-05, + "loss": 0.7114, + "step": 9660 + }, + { + "epoch": 2.09, + "learning_rate": 9.187975371242303e-05, + "loss": 0.706, + "step": 9680 + }, + { + "epoch": 2.09, + "learning_rate": 9.144512857660268e-05, + "loss": 0.7026, + "step": 9700 + }, + { + "epoch": 2.1, + "learning_rate": 9.101050344078232e-05, + "loss": 0.7079, + "step": 9720 + }, + { + "epoch": 2.1, + "learning_rate": 9.057587830496197e-05, + "loss": 0.7053, + "step": 9740 + }, + { + "epoch": 2.11, + "learning_rate": 9.014125316914162e-05, + "loss": 0.7125, + "step": 9760 + }, + { + "epoch": 2.11, + "learning_rate": 8.970662803332126e-05, + "loss": 0.7045, + "step": 9780 + }, + { + "epoch": 2.11, + "learning_rate": 8.92720028975009e-05, + "loss": 0.7109, + "step": 9800 + }, + { + "epoch": 2.11, + "eval_loss": 0.7186465859413147, + "eval_runtime": 25.5049, + "eval_samples_per_second": 78.416, + "eval_steps_per_second": 1.255, + "step": 9800 + }, + { + "epoch": 2.12, + "learning_rate": 8.883737776168056e-05, + "loss": 0.7035, + "step": 9820 + }, + { + "epoch": 2.12, + "learning_rate": 8.840275262586019e-05, + "loss": 0.7073, + "step": 9840 + }, + { + "epoch": 2.13, + "learning_rate": 8.796812749003983e-05, + "loss": 0.7114, + "step": 9860 + }, + { + "epoch": 2.13, + "learning_rate": 8.753350235421946e-05, + "loss": 0.7066, + "step": 9880 + }, + { + "epoch": 2.14, + "learning_rate": 8.709887721839911e-05, + "loss": 0.7055, + "step": 9900 + }, + { + "epoch": 2.14, + "learning_rate": 8.666425208257877e-05, + "loss": 0.7064, + "step": 9920 + }, + { + "epoch": 2.14, + "learning_rate": 8.62296269467584e-05, + "loss": 0.7154, + "step": 9940 + }, + { + "epoch": 2.15, + "learning_rate": 8.579500181093805e-05, + "loss": 0.7099, + "step": 9960 + }, + { + "epoch": 2.15, + "learning_rate": 8.53603766751177e-05, + "loss": 0.7112, + "step": 9980 + }, + { + "epoch": 2.16, + "learning_rate": 8.492575153929734e-05, + "loss": 0.7086, + "step": 10000 + }, + { + "epoch": 2.16, + "eval_loss": 0.7181739211082458, + "eval_runtime": 25.5087, + "eval_samples_per_second": 78.405, + "eval_steps_per_second": 1.254, + "step": 10000 + }, + { + "epoch": 2.16, + "learning_rate": 8.449112640347699e-05, + "loss": 0.7155, + "step": 10020 + }, + { + "epoch": 2.17, + "learning_rate": 8.405650126765664e-05, + "loss": 0.7097, + "step": 10040 + }, + { + "epoch": 2.17, + "learning_rate": 8.362187613183627e-05, + "loss": 0.7025, + "step": 10060 + }, + { + "epoch": 2.17, + "learning_rate": 8.318725099601592e-05, + "loss": 0.7065, + "step": 10080 + }, + { + "epoch": 2.18, + "learning_rate": 8.275262586019557e-05, + "loss": 0.6982, + "step": 10100 + }, + { + "epoch": 2.18, + "learning_rate": 8.231800072437521e-05, + "loss": 0.7039, + "step": 10120 + }, + { + "epoch": 2.19, + "learning_rate": 8.188337558855486e-05, + "loss": 0.7097, + "step": 10140 + }, + { + "epoch": 2.19, + "learning_rate": 8.144875045273451e-05, + "loss": 0.7089, + "step": 10160 + }, + { + "epoch": 2.2, + "learning_rate": 8.101412531691415e-05, + "loss": 0.7018, + "step": 10180 + }, + { + "epoch": 2.2, + "learning_rate": 8.05795001810938e-05, + "loss": 0.7025, + "step": 10200 + }, + { + "epoch": 2.2, + "eval_loss": 0.7179592251777649, + "eval_runtime": 25.4993, + "eval_samples_per_second": 78.433, + "eval_steps_per_second": 1.255, + "step": 10200 + }, + { + "epoch": 2.2, + "learning_rate": 8.014487504527345e-05, + "loss": 0.7067, + "step": 10220 + }, + { + "epoch": 2.21, + "learning_rate": 7.971024990945308e-05, + "loss": 0.71, + "step": 10240 + }, + { + "epoch": 2.21, + "learning_rate": 7.927562477363273e-05, + "loss": 0.7255, + "step": 10260 + }, + { + "epoch": 2.22, + "learning_rate": 7.884099963781238e-05, + "loss": 0.7065, + "step": 10280 + }, + { + "epoch": 2.22, + "learning_rate": 7.840637450199202e-05, + "loss": 0.712, + "step": 10300 + }, + { + "epoch": 2.23, + "learning_rate": 7.797174936617167e-05, + "loss": 0.7132, + "step": 10320 + }, + { + "epoch": 2.23, + "learning_rate": 7.753712423035132e-05, + "loss": 0.7106, + "step": 10340 + }, + { + "epoch": 2.24, + "learning_rate": 7.710249909453096e-05, + "loss": 0.708, + "step": 10360 + }, + { + "epoch": 2.24, + "learning_rate": 7.666787395871061e-05, + "loss": 0.7054, + "step": 10380 + }, + { + "epoch": 2.24, + "learning_rate": 7.623324882289026e-05, + "loss": 0.7087, + "step": 10400 + }, + { + "epoch": 2.24, + "eval_loss": 0.717901349067688, + "eval_runtime": 25.4862, + "eval_samples_per_second": 78.474, + "eval_steps_per_second": 1.256, + "step": 10400 + }, + { + "epoch": 2.25, + "learning_rate": 7.57986236870699e-05, + "loss": 0.7014, + "step": 10420 + }, + { + "epoch": 2.25, + "learning_rate": 7.536399855124954e-05, + "loss": 0.7103, + "step": 10440 + }, + { + "epoch": 2.26, + "learning_rate": 7.49293734154292e-05, + "loss": 0.7089, + "step": 10460 + }, + { + "epoch": 2.26, + "learning_rate": 7.449474827960883e-05, + "loss": 0.704, + "step": 10480 + }, + { + "epoch": 2.27, + "learning_rate": 7.406012314378847e-05, + "loss": 0.7074, + "step": 10500 + }, + { + "epoch": 2.27, + "learning_rate": 7.362549800796812e-05, + "loss": 0.7094, + "step": 10520 + }, + { + "epoch": 2.27, + "learning_rate": 7.319087287214777e-05, + "loss": 0.7069, + "step": 10540 + }, + { + "epoch": 2.28, + "learning_rate": 7.27562477363274e-05, + "loss": 0.7081, + "step": 10560 + }, + { + "epoch": 2.28, + "learning_rate": 7.232162260050705e-05, + "loss": 0.7036, + "step": 10580 + }, + { + "epoch": 2.29, + "learning_rate": 7.18869974646867e-05, + "loss": 0.6984, + "step": 10600 + }, + { + "epoch": 2.29, + "eval_loss": 0.7175166010856628, + "eval_runtime": 25.5016, + "eval_samples_per_second": 78.426, + "eval_steps_per_second": 1.255, + "step": 10600 + }, + { + "epoch": 2.29, + "learning_rate": 7.145237232886634e-05, + "loss": 0.7097, + "step": 10620 + }, + { + "epoch": 2.3, + "learning_rate": 7.101774719304599e-05, + "loss": 0.7143, + "step": 10640 + }, + { + "epoch": 2.3, + "learning_rate": 7.058312205722564e-05, + "loss": 0.7099, + "step": 10660 + }, + { + "epoch": 2.3, + "learning_rate": 7.014849692140528e-05, + "loss": 0.6994, + "step": 10680 + }, + { + "epoch": 2.31, + "learning_rate": 6.971387178558493e-05, + "loss": 0.7129, + "step": 10700 + }, + { + "epoch": 2.31, + "learning_rate": 6.927924664976458e-05, + "loss": 0.7067, + "step": 10720 + }, + { + "epoch": 2.32, + "learning_rate": 6.884462151394421e-05, + "loss": 0.7044, + "step": 10740 + }, + { + "epoch": 2.32, + "learning_rate": 6.840999637812386e-05, + "loss": 0.7092, + "step": 10760 + }, + { + "epoch": 2.33, + "learning_rate": 6.797537124230351e-05, + "loss": 0.7075, + "step": 10780 + }, + { + "epoch": 2.33, + "learning_rate": 6.754074610648315e-05, + "loss": 0.7073, + "step": 10800 + }, + { + "epoch": 2.33, + "eval_loss": 0.7168901562690735, + "eval_runtime": 25.5153, + "eval_samples_per_second": 78.384, + "eval_steps_per_second": 1.254, + "step": 10800 + }, + { + "epoch": 2.33, + "learning_rate": 6.71061209706628e-05, + "loss": 0.7088, + "step": 10820 + }, + { + "epoch": 2.34, + "learning_rate": 6.667149583484245e-05, + "loss": 0.7046, + "step": 10840 + }, + { + "epoch": 2.34, + "learning_rate": 6.623687069902209e-05, + "loss": 0.7029, + "step": 10860 + }, + { + "epoch": 2.35, + "learning_rate": 6.580224556320174e-05, + "loss": 0.7055, + "step": 10880 + }, + { + "epoch": 2.35, + "learning_rate": 6.536762042738139e-05, + "loss": 0.7095, + "step": 10900 + }, + { + "epoch": 2.36, + "learning_rate": 6.493299529156102e-05, + "loss": 0.7057, + "step": 10920 + }, + { + "epoch": 2.36, + "learning_rate": 6.449837015574066e-05, + "loss": 0.7064, + "step": 10940 + }, + { + "epoch": 2.36, + "learning_rate": 6.406374501992031e-05, + "loss": 0.7039, + "step": 10960 + }, + { + "epoch": 2.37, + "learning_rate": 6.362911988409996e-05, + "loss": 0.7109, + "step": 10980 + }, + { + "epoch": 2.37, + "learning_rate": 6.31944947482796e-05, + "loss": 0.7051, + "step": 11000 + }, + { + "epoch": 2.37, + "eval_loss": 0.7164381146430969, + "eval_runtime": 25.4817, + "eval_samples_per_second": 78.488, + "eval_steps_per_second": 1.256, + "step": 11000 + }, + { + "epoch": 2.38, + "learning_rate": 6.275986961245924e-05, + "loss": 0.7117, + "step": 11020 + }, + { + "epoch": 2.38, + "learning_rate": 6.23252444766389e-05, + "loss": 0.6972, + "step": 11040 + }, + { + "epoch": 2.39, + "learning_rate": 6.189061934081853e-05, + "loss": 0.7087, + "step": 11060 + }, + { + "epoch": 2.39, + "learning_rate": 6.145599420499818e-05, + "loss": 0.703, + "step": 11080 + }, + { + "epoch": 2.39, + "learning_rate": 6.1021369069177825e-05, + "loss": 0.7062, + "step": 11100 + }, + { + "epoch": 2.4, + "learning_rate": 6.0586743933357475e-05, + "loss": 0.7018, + "step": 11120 + }, + { + "epoch": 2.4, + "learning_rate": 6.015211879753712e-05, + "loss": 0.7003, + "step": 11140 + }, + { + "epoch": 2.41, + "learning_rate": 5.971749366171676e-05, + "loss": 0.7005, + "step": 11160 + }, + { + "epoch": 2.41, + "learning_rate": 5.928286852589641e-05, + "loss": 0.7099, + "step": 11180 + }, + { + "epoch": 2.42, + "learning_rate": 5.8848243390076054e-05, + "loss": 0.7002, + "step": 11200 + }, + { + "epoch": 2.42, + "eval_loss": 0.7161288857460022, + "eval_runtime": 25.5084, + "eval_samples_per_second": 78.406, + "eval_steps_per_second": 1.254, + "step": 11200 + }, + { + "epoch": 2.42, + "learning_rate": 5.84136182542557e-05, + "loss": 0.7071, + "step": 11220 + }, + { + "epoch": 2.43, + "learning_rate": 5.797899311843535e-05, + "loss": 0.7028, + "step": 11240 + }, + { + "epoch": 2.43, + "learning_rate": 5.754436798261499e-05, + "loss": 0.7199, + "step": 11260 + }, + { + "epoch": 2.43, + "learning_rate": 5.7109742846794634e-05, + "loss": 0.6974, + "step": 11280 + }, + { + "epoch": 2.44, + "learning_rate": 5.6675117710974284e-05, + "loss": 0.7003, + "step": 11300 + }, + { + "epoch": 2.44, + "learning_rate": 5.624049257515393e-05, + "loss": 0.7079, + "step": 11320 + }, + { + "epoch": 2.45, + "learning_rate": 5.580586743933357e-05, + "loss": 0.6988, + "step": 11340 + }, + { + "epoch": 2.45, + "learning_rate": 5.537124230351322e-05, + "loss": 0.7047, + "step": 11360 + }, + { + "epoch": 2.46, + "learning_rate": 5.493661716769286e-05, + "loss": 0.6946, + "step": 11380 + }, + { + "epoch": 2.46, + "learning_rate": 5.45019920318725e-05, + "loss": 0.7096, + "step": 11400 + }, + { + "epoch": 2.46, + "eval_loss": 0.7155815958976746, + "eval_runtime": 25.525, + "eval_samples_per_second": 78.355, + "eval_steps_per_second": 1.254, + "step": 11400 + }, + { + "epoch": 2.46, + "learning_rate": 5.406736689605215e-05, + "loss": 0.709, + "step": 11420 + }, + { + "epoch": 2.47, + "learning_rate": 5.3632741760231794e-05, + "loss": 0.7112, + "step": 11440 + }, + { + "epoch": 2.47, + "learning_rate": 5.319811662441144e-05, + "loss": 0.6983, + "step": 11460 + }, + { + "epoch": 2.48, + "learning_rate": 5.276349148859109e-05, + "loss": 0.7, + "step": 11480 + }, + { + "epoch": 2.48, + "learning_rate": 5.232886635277073e-05, + "loss": 0.7006, + "step": 11500 + }, + { + "epoch": 2.49, + "learning_rate": 5.189424121695037e-05, + "loss": 0.7068, + "step": 11520 + }, + { + "epoch": 2.49, + "learning_rate": 5.1459616081130023e-05, + "loss": 0.7012, + "step": 11540 + }, + { + "epoch": 2.49, + "learning_rate": 5.102499094530967e-05, + "loss": 0.7079, + "step": 11560 + }, + { + "epoch": 2.5, + "learning_rate": 5.059036580948931e-05, + "loss": 0.7031, + "step": 11580 + }, + { + "epoch": 2.5, + "learning_rate": 5.015574067366896e-05, + "loss": 0.7038, + "step": 11600 + }, + { + "epoch": 2.5, + "eval_loss": 0.7149330973625183, + "eval_runtime": 25.4843, + "eval_samples_per_second": 78.48, + "eval_steps_per_second": 1.256, + "step": 11600 + }, + { + "epoch": 2.51, + "learning_rate": 4.97211155378486e-05, + "loss": 0.6972, + "step": 11620 + }, + { + "epoch": 2.51, + "learning_rate": 4.9286490402028246e-05, + "loss": 0.7039, + "step": 11640 + }, + { + "epoch": 2.52, + "learning_rate": 4.885186526620789e-05, + "loss": 0.7052, + "step": 11660 + }, + { + "epoch": 2.52, + "learning_rate": 4.841724013038754e-05, + "loss": 0.7045, + "step": 11680 + }, + { + "epoch": 2.52, + "learning_rate": 4.798261499456718e-05, + "loss": 0.701, + "step": 11700 + }, + { + "epoch": 2.53, + "learning_rate": 4.7547989858746826e-05, + "loss": 0.7084, + "step": 11720 + }, + { + "epoch": 2.53, + "learning_rate": 4.7113364722926476e-05, + "loss": 0.6988, + "step": 11740 + }, + { + "epoch": 2.54, + "learning_rate": 4.667873958710612e-05, + "loss": 0.7155, + "step": 11760 + }, + { + "epoch": 2.54, + "learning_rate": 4.624411445128576e-05, + "loss": 0.7044, + "step": 11780 + }, + { + "epoch": 2.55, + "learning_rate": 4.5809489315465406e-05, + "loss": 0.7014, + "step": 11800 + }, + { + "epoch": 2.55, + "eval_loss": 0.714367151260376, + "eval_runtime": 25.4959, + "eval_samples_per_second": 78.444, + "eval_steps_per_second": 1.255, + "step": 11800 + }, + { + "epoch": 2.55, + "learning_rate": 4.537486417964505e-05, + "loss": 0.708, + "step": 11820 + }, + { + "epoch": 2.55, + "learning_rate": 4.494023904382469e-05, + "loss": 0.6976, + "step": 11840 + }, + { + "epoch": 2.56, + "learning_rate": 4.450561390800434e-05, + "loss": 0.7057, + "step": 11860 + }, + { + "epoch": 2.56, + "learning_rate": 4.4070988772183986e-05, + "loss": 0.7039, + "step": 11880 + }, + { + "epoch": 2.57, + "learning_rate": 4.363636363636363e-05, + "loss": 0.7089, + "step": 11900 + }, + { + "epoch": 2.57, + "learning_rate": 4.320173850054328e-05, + "loss": 0.7026, + "step": 11920 + }, + { + "epoch": 2.58, + "learning_rate": 4.276711336472292e-05, + "loss": 0.7023, + "step": 11940 + }, + { + "epoch": 2.58, + "learning_rate": 4.2332488228902565e-05, + "loss": 0.7006, + "step": 11960 + }, + { + "epoch": 2.58, + "learning_rate": 4.1897863093082215e-05, + "loss": 0.7008, + "step": 11980 + }, + { + "epoch": 2.59, + "learning_rate": 4.146323795726186e-05, + "loss": 0.7057, + "step": 12000 + }, + { + "epoch": 2.59, + "eval_loss": 0.7141902446746826, + "eval_runtime": 25.5019, + "eval_samples_per_second": 78.426, + "eval_steps_per_second": 1.255, + "step": 12000 + }, + { + "epoch": 2.59, + "learning_rate": 4.10286128214415e-05, + "loss": 0.7083, + "step": 12020 + }, + { + "epoch": 2.6, + "learning_rate": 4.059398768562115e-05, + "loss": 0.6986, + "step": 12040 + }, + { + "epoch": 2.6, + "learning_rate": 4.0159362549800795e-05, + "loss": 0.7076, + "step": 12060 + }, + { + "epoch": 2.61, + "learning_rate": 3.972473741398044e-05, + "loss": 0.7071, + "step": 12080 + }, + { + "epoch": 2.61, + "learning_rate": 3.929011227816009e-05, + "loss": 0.6984, + "step": 12100 + }, + { + "epoch": 2.61, + "learning_rate": 3.885548714233973e-05, + "loss": 0.7096, + "step": 12120 + }, + { + "epoch": 2.62, + "learning_rate": 3.8420862006519375e-05, + "loss": 0.7027, + "step": 12140 + }, + { + "epoch": 2.62, + "learning_rate": 3.7986236870699025e-05, + "loss": 0.7062, + "step": 12160 + }, + { + "epoch": 2.63, + "learning_rate": 3.755161173487867e-05, + "loss": 0.7049, + "step": 12180 + }, + { + "epoch": 2.63, + "learning_rate": 3.711698659905831e-05, + "loss": 0.7052, + "step": 12200 + }, + { + "epoch": 2.63, + "eval_loss": 0.7140177488327026, + "eval_runtime": 25.4673, + "eval_samples_per_second": 78.532, + "eval_steps_per_second": 1.257, + "step": 12200 + }, + { + "epoch": 2.64, + "learning_rate": 3.6682361463237955e-05, + "loss": 0.7011, + "step": 12220 + }, + { + "epoch": 2.64, + "learning_rate": 3.62477363274176e-05, + "loss": 0.7025, + "step": 12240 + }, + { + "epoch": 2.65, + "learning_rate": 3.581311119159725e-05, + "loss": 0.7006, + "step": 12260 + }, + { + "epoch": 2.65, + "learning_rate": 3.537848605577689e-05, + "loss": 0.7073, + "step": 12280 + }, + { + "epoch": 2.65, + "learning_rate": 3.4943860919956534e-05, + "loss": 0.7033, + "step": 12300 + }, + { + "epoch": 2.66, + "learning_rate": 3.4509235784136184e-05, + "loss": 0.6992, + "step": 12320 + }, + { + "epoch": 2.66, + "learning_rate": 3.407461064831582e-05, + "loss": 0.7043, + "step": 12340 + }, + { + "epoch": 2.67, + "learning_rate": 3.363998551249547e-05, + "loss": 0.7083, + "step": 12360 + }, + { + "epoch": 2.67, + "learning_rate": 3.3205360376675114e-05, + "loss": 0.7086, + "step": 12380 + }, + { + "epoch": 2.68, + "learning_rate": 3.277073524085476e-05, + "loss": 0.7168, + "step": 12400 + }, + { + "epoch": 2.68, + "eval_loss": 0.7138265371322632, + "eval_runtime": 25.5077, + "eval_samples_per_second": 78.408, + "eval_steps_per_second": 1.255, + "step": 12400 + }, + { + "epoch": 2.68, + "learning_rate": 3.233611010503441e-05, + "loss": 0.7026, + "step": 12420 + }, + { + "epoch": 2.68, + "learning_rate": 3.190148496921405e-05, + "loss": 0.7097, + "step": 12440 + }, + { + "epoch": 2.69, + "learning_rate": 3.1466859833393694e-05, + "loss": 0.7094, + "step": 12460 + }, + { + "epoch": 2.69, + "learning_rate": 3.1032234697573344e-05, + "loss": 0.6971, + "step": 12480 + }, + { + "epoch": 2.7, + "learning_rate": 3.059760956175299e-05, + "loss": 0.6977, + "step": 12500 + }, + { + "epoch": 2.7, + "learning_rate": 3.016298442593263e-05, + "loss": 0.6945, + "step": 12520 + }, + { + "epoch": 2.71, + "learning_rate": 2.9728359290112277e-05, + "loss": 0.6998, + "step": 12540 + }, + { + "epoch": 2.71, + "learning_rate": 2.929373415429192e-05, + "loss": 0.7067, + "step": 12560 + }, + { + "epoch": 2.71, + "learning_rate": 2.8859109018471563e-05, + "loss": 0.6935, + "step": 12580 + }, + { + "epoch": 2.72, + "learning_rate": 2.842448388265121e-05, + "loss": 0.6927, + "step": 12600 + }, + { + "epoch": 2.72, + "eval_loss": 0.7132371664047241, + "eval_runtime": 25.516, + "eval_samples_per_second": 78.382, + "eval_steps_per_second": 1.254, + "step": 12600 + }, + { + "epoch": 2.72, + "learning_rate": 2.7989858746830857e-05, + "loss": 0.7025, + "step": 12620 + }, + { + "epoch": 2.73, + "learning_rate": 2.75552336110105e-05, + "loss": 0.7098, + "step": 12640 + }, + { + "epoch": 2.73, + "learning_rate": 2.7120608475190147e-05, + "loss": 0.6939, + "step": 12660 + }, + { + "epoch": 2.74, + "learning_rate": 2.6685983339369793e-05, + "loss": 0.7038, + "step": 12680 + }, + { + "epoch": 2.74, + "learning_rate": 2.6251358203549436e-05, + "loss": 0.7039, + "step": 12700 + }, + { + "epoch": 2.74, + "learning_rate": 2.5816733067729083e-05, + "loss": 0.7018, + "step": 12720 + }, + { + "epoch": 2.75, + "learning_rate": 2.538210793190873e-05, + "loss": 0.6943, + "step": 12740 + }, + { + "epoch": 2.75, + "learning_rate": 2.4947482796088373e-05, + "loss": 0.7007, + "step": 12760 + }, + { + "epoch": 2.76, + "learning_rate": 2.4512857660268016e-05, + "loss": 0.7019, + "step": 12780 + }, + { + "epoch": 2.76, + "learning_rate": 2.407823252444766e-05, + "loss": 0.6957, + "step": 12800 + }, + { + "epoch": 2.76, + "eval_loss": 0.7126932144165039, + "eval_runtime": 25.4915, + "eval_samples_per_second": 78.458, + "eval_steps_per_second": 1.255, + "step": 12800 + }, + { + "epoch": 2.77, + "learning_rate": 2.3643607388627306e-05, + "loss": 0.6993, + "step": 12820 + }, + { + "epoch": 2.77, + "learning_rate": 2.3208982252806953e-05, + "loss": 0.6951, + "step": 12840 + }, + { + "epoch": 2.77, + "learning_rate": 2.2774357116986596e-05, + "loss": 0.7056, + "step": 12860 + }, + { + "epoch": 2.78, + "learning_rate": 2.2339731981166243e-05, + "loss": 0.7153, + "step": 12880 + }, + { + "epoch": 2.78, + "learning_rate": 2.190510684534589e-05, + "loss": 0.7022, + "step": 12900 + }, + { + "epoch": 2.79, + "learning_rate": 2.1470481709525532e-05, + "loss": 0.7078, + "step": 12920 + }, + { + "epoch": 2.79, + "learning_rate": 2.103585657370518e-05, + "loss": 0.6969, + "step": 12940 + }, + { + "epoch": 2.8, + "learning_rate": 2.0601231437884826e-05, + "loss": 0.7056, + "step": 12960 + }, + { + "epoch": 2.8, + "learning_rate": 2.016660630206447e-05, + "loss": 0.6975, + "step": 12980 + }, + { + "epoch": 2.8, + "learning_rate": 1.9731981166244112e-05, + "loss": 0.7065, + "step": 13000 + }, + { + "epoch": 2.8, + "eval_loss": 0.7130131721496582, + "eval_runtime": 25.4905, + "eval_samples_per_second": 78.461, + "eval_steps_per_second": 1.255, + "step": 13000 + }, + { + "epoch": 2.81, + "learning_rate": 1.9297356030423755e-05, + "loss": 0.7, + "step": 13020 + }, + { + "epoch": 2.81, + "learning_rate": 1.8862730894603402e-05, + "loss": 0.7144, + "step": 13040 + }, + { + "epoch": 2.82, + "learning_rate": 1.842810575878305e-05, + "loss": 0.6964, + "step": 13060 + }, + { + "epoch": 2.82, + "learning_rate": 1.7993480622962692e-05, + "loss": 0.6981, + "step": 13080 + }, + { + "epoch": 2.83, + "learning_rate": 1.755885548714234e-05, + "loss": 0.7102, + "step": 13100 + }, + { + "epoch": 2.83, + "learning_rate": 1.7124230351321985e-05, + "loss": 0.6975, + "step": 13120 + }, + { + "epoch": 2.83, + "learning_rate": 1.668960521550163e-05, + "loss": 0.7062, + "step": 13140 + }, + { + "epoch": 2.84, + "learning_rate": 1.625498007968127e-05, + "loss": 0.6956, + "step": 13160 + }, + { + "epoch": 2.84, + "learning_rate": 1.5820354943860918e-05, + "loss": 0.71, + "step": 13180 + }, + { + "epoch": 2.85, + "learning_rate": 1.5385729808040565e-05, + "loss": 0.7081, + "step": 13200 + }, + { + "epoch": 2.85, + "eval_loss": 0.7126001119613647, + "eval_runtime": 25.5102, + "eval_samples_per_second": 78.4, + "eval_steps_per_second": 1.254, + "step": 13200 + }, + { + "epoch": 2.85, + "learning_rate": 1.495110467222021e-05, + "loss": 0.6977, + "step": 13220 + }, + { + "epoch": 2.86, + "learning_rate": 1.4516479536399855e-05, + "loss": 0.705, + "step": 13240 + }, + { + "epoch": 2.86, + "learning_rate": 1.4081854400579498e-05, + "loss": 0.7016, + "step": 13260 + }, + { + "epoch": 2.87, + "learning_rate": 1.3647229264759143e-05, + "loss": 0.6922, + "step": 13280 + }, + { + "epoch": 2.87, + "learning_rate": 1.321260412893879e-05, + "loss": 0.6987, + "step": 13300 + }, + { + "epoch": 2.87, + "learning_rate": 1.2777978993118434e-05, + "loss": 0.7041, + "step": 13320 + }, + { + "epoch": 2.88, + "learning_rate": 1.234335385729808e-05, + "loss": 0.7101, + "step": 13340 + }, + { + "epoch": 2.88, + "learning_rate": 1.1908728721477723e-05, + "loss": 0.6976, + "step": 13360 + }, + { + "epoch": 2.89, + "learning_rate": 1.147410358565737e-05, + "loss": 0.7011, + "step": 13380 + }, + { + "epoch": 2.89, + "learning_rate": 1.1039478449837014e-05, + "loss": 0.6973, + "step": 13400 + }, + { + "epoch": 2.89, + "eval_loss": 0.7123447060585022, + "eval_runtime": 25.5029, + "eval_samples_per_second": 78.422, + "eval_steps_per_second": 1.255, + "step": 13400 + }, + { + "epoch": 2.9, + "learning_rate": 1.060485331401666e-05, + "loss": 0.7084, + "step": 13420 + }, + { + "epoch": 2.9, + "learning_rate": 1.0170228178196306e-05, + "loss": 0.7133, + "step": 13440 + }, + { + "epoch": 2.9, + "learning_rate": 9.73560304237595e-06, + "loss": 0.6988, + "step": 13460 + }, + { + "epoch": 2.91, + "learning_rate": 9.300977906555596e-06, + "loss": 0.7045, + "step": 13480 + }, + { + "epoch": 2.91, + "learning_rate": 8.866352770735239e-06, + "loss": 0.6985, + "step": 13500 + }, + { + "epoch": 2.92, + "learning_rate": 8.431727634914886e-06, + "loss": 0.6967, + "step": 13520 + }, + { + "epoch": 2.92, + "learning_rate": 7.99710249909453e-06, + "loss": 0.7008, + "step": 13540 + }, + { + "epoch": 2.93, + "learning_rate": 7.562477363274175e-06, + "loss": 0.6956, + "step": 13560 + }, + { + "epoch": 2.93, + "learning_rate": 7.12785222745382e-06, + "loss": 0.7065, + "step": 13580 + }, + { + "epoch": 2.93, + "learning_rate": 6.693227091633466e-06, + "loss": 0.7018, + "step": 13600 + }, + { + "epoch": 2.93, + "eval_loss": 0.712183952331543, + "eval_runtime": 25.5697, + "eval_samples_per_second": 78.218, + "eval_steps_per_second": 1.251, + "step": 13600 + }, + { + "epoch": 2.94, + "learning_rate": 6.25860195581311e-06, + "loss": 0.7083, + "step": 13620 + }, + { + "epoch": 2.94, + "learning_rate": 5.823976819992756e-06, + "loss": 0.7007, + "step": 13640 + }, + { + "epoch": 2.95, + "learning_rate": 5.3893516841724e-06, + "loss": 0.705, + "step": 13660 + }, + { + "epoch": 2.95, + "learning_rate": 4.954726548352046e-06, + "loss": 0.7003, + "step": 13680 + }, + { + "epoch": 2.96, + "learning_rate": 4.520101412531691e-06, + "loss": 0.6978, + "step": 13700 + }, + { + "epoch": 2.96, + "learning_rate": 4.085476276711337e-06, + "loss": 0.697, + "step": 13720 + }, + { + "epoch": 2.96, + "learning_rate": 3.650851140890981e-06, + "loss": 0.6976, + "step": 13740 + }, + { + "epoch": 2.97, + "learning_rate": 3.2162260050706265e-06, + "loss": 0.6998, + "step": 13760 + }, + { + "epoch": 2.97, + "learning_rate": 2.7816008692502714e-06, + "loss": 0.6999, + "step": 13780 + }, + { + "epoch": 2.98, + "learning_rate": 2.3469757334299168e-06, + "loss": 0.7046, + "step": 13800 + }, + { + "epoch": 2.98, + "eval_loss": 0.7120471596717834, + "eval_runtime": 25.4826, + "eval_samples_per_second": 78.485, + "eval_steps_per_second": 1.256, + "step": 13800 + } + ], + "max_steps": 13905, + "num_train_epochs": 3, + "total_flos": 6.977364611915422e+19, + "trial_name": null, + "trial_params": null +} diff --git a/adapters/saved-alpaca-belle13b/checkpoint-13800/training_args.bin b/adapters/saved-alpaca-belle13b/checkpoint-13800/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..d9fc651b09d1fbcbbf76356c2181acb1def32585 --- /dev/null +++ b/adapters/saved-alpaca-belle13b/checkpoint-13800/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6d94e5a4fbc2ed544893c730b1ef244fb2123fe494b95f38bd148f9dd38f68e0 +size 3643 diff --git a/adapters/saved-alpaca-belle30b/adapter_config.json b/adapters/saved-alpaca-belle30b/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..1c63d3d52c8ae53700f4a81186f7cb93f50a2d78 --- /dev/null +++ b/adapters/saved-alpaca-belle30b/adapter_config.json @@ -0,0 +1,18 @@ +{ + "base_model_name_or_path": "/mnt/bn/qingyi-bn-lq/llama/llama-30b-hf", + "bias": "none", + "enable_lora": null, + "fan_in_fan_out": false, + "inference_mode": true, + "lora_alpha": 16, + "lora_dropout": 0.05, + "merge_weights": false, + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/adapters/saved-alpaca-belle30b/adapter_model.bin b/adapters/saved-alpaca-belle30b/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..1d56136e544b707fc782b87977bdac6f058f463d --- /dev/null +++ b/adapters/saved-alpaca-belle30b/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:57f3a6e364021298bc72b7b435bced828e4036992741880d2f5eb469d718b77b +size 51204365 diff --git a/adapters/saved-alpaca-belle30b/checkpoint-13400/optimizer.pt b/adapters/saved-alpaca-belle30b/checkpoint-13400/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..0d379b1c99f2d47b0a081e545f0895343612c906 --- /dev/null +++ b/adapters/saved-alpaca-belle30b/checkpoint-13400/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4897b958c8b7b1437a18f680f29577a8d5c3963046c37578b80f4d81c23151fc +size 102377669 diff --git a/adapters/saved-alpaca-belle30b/checkpoint-13400/pytorch_model.bin b/adapters/saved-alpaca-belle30b/checkpoint-13400/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..8a5f389cc3743e20c0fbb32a3aa757fa3a1d0915 --- /dev/null +++ b/adapters/saved-alpaca-belle30b/checkpoint-13400/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a1b29f77aabf89ecd54f63130b79dc7bdb5922e8cbcb3859c0c657d1efa08632 +size 51204365 diff --git a/adapters/saved-alpaca-belle30b/checkpoint-13400/rng_state_0.pth b/adapters/saved-alpaca-belle30b/checkpoint-13400/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..91a306b064d820bcfd5d84c31912a622a67e4f16 --- /dev/null +++ b/adapters/saved-alpaca-belle30b/checkpoint-13400/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:323f207448f55969c5ee42c82f113b4f8c1edb993b56d03bf8847b02d4dfecfb +size 14583 diff --git a/adapters/saved-alpaca-belle30b/checkpoint-13400/rng_state_1.pth b/adapters/saved-alpaca-belle30b/checkpoint-13400/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..42348acc7aa4ebf8807eb229c29790bbfd115971 --- /dev/null +++ b/adapters/saved-alpaca-belle30b/checkpoint-13400/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7172821d4a1c34d6ef5975523a7e60543bda9f1702786b5432a751b2af3c1f08 +size 14583 diff --git a/adapters/saved-alpaca-belle30b/checkpoint-13400/rng_state_2.pth b/adapters/saved-alpaca-belle30b/checkpoint-13400/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..2a2469513b89caa5c3561eab76e9ede4e1deebc7 --- /dev/null +++ b/adapters/saved-alpaca-belle30b/checkpoint-13400/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6dd5d9e0d4bcb6534190c625bebf26b419ca6fa7371913316d8b429164ab499b +size 14583 diff --git a/adapters/saved-alpaca-belle30b/checkpoint-13400/rng_state_3.pth b/adapters/saved-alpaca-belle30b/checkpoint-13400/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..d2e19cd90b835a93036e3cfc2a0732c533ebf0a2 --- /dev/null +++ b/adapters/saved-alpaca-belle30b/checkpoint-13400/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:baf89a9c94be2a861e70f4f32104c33b5b0a527c2b712a2827170cfc613bc56b +size 14583 diff --git a/adapters/saved-alpaca-belle30b/checkpoint-13400/rng_state_4.pth b/adapters/saved-alpaca-belle30b/checkpoint-13400/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..9c3db27cc39ea5d8fa852ff0f5aefc228aa31088 --- /dev/null +++ b/adapters/saved-alpaca-belle30b/checkpoint-13400/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c0590512b4dd14782552cb447933f2bdc9c99eea6d1cd5b0345bb857fc6f91d1 +size 14583 diff --git a/adapters/saved-alpaca-belle30b/checkpoint-13400/rng_state_5.pth b/adapters/saved-alpaca-belle30b/checkpoint-13400/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..1151be8a0afa5e5d66609b1085b48d52914f7e4b --- /dev/null +++ b/adapters/saved-alpaca-belle30b/checkpoint-13400/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:89c6336be10cab499fe47e4539ec0ab644084f67cbf0ff43b602bfe9e7dfd486 +size 14583 diff --git a/adapters/saved-alpaca-belle30b/checkpoint-13400/rng_state_6.pth b/adapters/saved-alpaca-belle30b/checkpoint-13400/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..4a578fbda60d195c6ca62e6e190c267c14f7e8b3 --- /dev/null +++ b/adapters/saved-alpaca-belle30b/checkpoint-13400/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5366407e125c8f750d2fed36d2667734101b9eec8d1886863074dd29649b718e +size 14583 diff --git a/adapters/saved-alpaca-belle30b/checkpoint-13400/rng_state_7.pth b/adapters/saved-alpaca-belle30b/checkpoint-13400/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..952e5f28f277895016d52d2fe21d78a43aaa1d02 --- /dev/null +++ b/adapters/saved-alpaca-belle30b/checkpoint-13400/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4aae1fc406f7a46dbf8b0d320cf34f041e5008fc80158bff6e360e7e3919bb58 +size 14583 diff --git a/adapters/saved-alpaca-belle30b/checkpoint-13400/scaler.pt b/adapters/saved-alpaca-belle30b/checkpoint-13400/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..572ff10c1e3d766425e7fe7fbbd0b8041a6be139 --- /dev/null +++ b/adapters/saved-alpaca-belle30b/checkpoint-13400/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fd8c7cc60b46e2edfcd31a86e53ffaf5b84fd28ec10216de569ff531ee01beff +size 557 diff --git a/adapters/saved-alpaca-belle30b/checkpoint-13400/scheduler.pt b/adapters/saved-alpaca-belle30b/checkpoint-13400/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..353041f2729ee778b1cda18c517f3202145e5e43 --- /dev/null +++ b/adapters/saved-alpaca-belle30b/checkpoint-13400/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9fe5117a4c00d91b3d7b759161646a0319a8a02f8b7460ea47db6270298d3fe2 +size 627 diff --git a/adapters/saved-alpaca-belle30b/checkpoint-13400/trainer_state.json b/adapters/saved-alpaca-belle30b/checkpoint-13400/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..d39b84f12c7aadcffdef9646fa02c9468c412b7b --- /dev/null +++ b/adapters/saved-alpaca-belle30b/checkpoint-13400/trainer_state.json @@ -0,0 +1,4572 @@ +{ + "best_metric": 0.6676326990127563, + "best_model_checkpoint": "/mnt/bn/qingyi-bn-lq/llama/saved-alpaca-belle30b/checkpoint-13400", + "epoch": 2.8910463861920173, + "global_step": 13400, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 5.9999999999999995e-05, + "loss": 1.6143, + "step": 20 + }, + { + "epoch": 0.01, + "learning_rate": 0.00011999999999999999, + "loss": 1.2447, + "step": 40 + }, + { + "epoch": 0.01, + "learning_rate": 0.00017699999999999997, + "loss": 0.9529, + "step": 60 + }, + { + "epoch": 0.02, + "learning_rate": 0.000237, + "loss": 0.8899, + "step": 80 + }, + { + "epoch": 0.02, + "learning_rate": 0.00029699999999999996, + "loss": 0.8614, + "step": 100 + }, + { + "epoch": 0.03, + "learning_rate": 0.00029958710612097066, + "loss": 0.8402, + "step": 120 + }, + { + "epoch": 0.03, + "learning_rate": 0.00029915248098515027, + "loss": 0.8335, + "step": 140 + }, + { + "epoch": 0.03, + "learning_rate": 0.00029871785584932993, + "loss": 0.8303, + "step": 160 + }, + { + "epoch": 0.04, + "learning_rate": 0.0002982832307135096, + "loss": 0.8261, + "step": 180 + }, + { + "epoch": 0.04, + "learning_rate": 0.0002978486055776892, + "loss": 0.807, + "step": 200 + }, + { + "epoch": 0.04, + "eval_loss": 0.8271128535270691, + "eval_runtime": 49.877, + "eval_samples_per_second": 40.099, + "eval_steps_per_second": 0.642, + "step": 200 + }, + { + "epoch": 0.05, + "learning_rate": 0.00029741398044186887, + "loss": 0.808, + "step": 220 + }, + { + "epoch": 0.05, + "learning_rate": 0.00029697935530604853, + "loss": 0.8092, + "step": 240 + }, + { + "epoch": 0.06, + "learning_rate": 0.00029654473017022814, + "loss": 0.8045, + "step": 260 + }, + { + "epoch": 0.06, + "learning_rate": 0.0002961101050344078, + "loss": 0.8007, + "step": 280 + }, + { + "epoch": 0.06, + "learning_rate": 0.00029567547989858747, + "loss": 0.793, + "step": 300 + }, + { + "epoch": 0.07, + "learning_rate": 0.0002952408547627671, + "loss": 0.7886, + "step": 320 + }, + { + "epoch": 0.07, + "learning_rate": 0.00029480622962694674, + "loss": 0.7854, + "step": 340 + }, + { + "epoch": 0.08, + "learning_rate": 0.0002943716044911264, + "loss": 0.783, + "step": 360 + }, + { + "epoch": 0.08, + "learning_rate": 0.000293936979355306, + "loss": 0.7797, + "step": 380 + }, + { + "epoch": 0.09, + "learning_rate": 0.0002935023542194857, + "loss": 0.7801, + "step": 400 + }, + { + "epoch": 0.09, + "eval_loss": 0.793747067451477, + "eval_runtime": 49.8962, + "eval_samples_per_second": 40.083, + "eval_steps_per_second": 0.641, + "step": 400 + }, + { + "epoch": 0.09, + "learning_rate": 0.00029306772908366534, + "loss": 0.7879, + "step": 420 + }, + { + "epoch": 0.09, + "learning_rate": 0.00029263310394784495, + "loss": 0.7745, + "step": 440 + }, + { + "epoch": 0.1, + "learning_rate": 0.0002921984788120246, + "loss": 0.7725, + "step": 460 + }, + { + "epoch": 0.1, + "learning_rate": 0.0002917638536762043, + "loss": 0.7659, + "step": 480 + }, + { + "epoch": 0.11, + "learning_rate": 0.0002913292285403839, + "loss": 0.7658, + "step": 500 + }, + { + "epoch": 0.11, + "learning_rate": 0.00029089460340456355, + "loss": 0.7722, + "step": 520 + }, + { + "epoch": 0.12, + "learning_rate": 0.0002904599782687432, + "loss": 0.773, + "step": 540 + }, + { + "epoch": 0.12, + "learning_rate": 0.0002900253531329228, + "loss": 0.7749, + "step": 560 + }, + { + "epoch": 0.13, + "learning_rate": 0.0002895907279971025, + "loss": 0.7734, + "step": 580 + }, + { + "epoch": 0.13, + "learning_rate": 0.00028915610286128215, + "loss": 0.7607, + "step": 600 + }, + { + "epoch": 0.13, + "eval_loss": 0.7771433591842651, + "eval_runtime": 49.9486, + "eval_samples_per_second": 40.041, + "eval_steps_per_second": 0.641, + "step": 600 + }, + { + "epoch": 0.13, + "learning_rate": 0.00028872147772546176, + "loss": 0.7657, + "step": 620 + }, + { + "epoch": 0.14, + "learning_rate": 0.00028828685258964137, + "loss": 0.7602, + "step": 640 + }, + { + "epoch": 0.14, + "learning_rate": 0.00028785222745382103, + "loss": 0.7619, + "step": 660 + }, + { + "epoch": 0.15, + "learning_rate": 0.0002874176023180007, + "loss": 0.7587, + "step": 680 + }, + { + "epoch": 0.15, + "learning_rate": 0.0002869829771821803, + "loss": 0.7553, + "step": 700 + }, + { + "epoch": 0.16, + "learning_rate": 0.00028654835204635997, + "loss": 0.7565, + "step": 720 + }, + { + "epoch": 0.16, + "learning_rate": 0.00028611372691053963, + "loss": 0.7586, + "step": 740 + }, + { + "epoch": 0.16, + "learning_rate": 0.00028567910177471924, + "loss": 0.7556, + "step": 760 + }, + { + "epoch": 0.17, + "learning_rate": 0.0002852444766388989, + "loss": 0.7487, + "step": 780 + }, + { + "epoch": 0.17, + "learning_rate": 0.00028480985150307857, + "loss": 0.7516, + "step": 800 + }, + { + "epoch": 0.17, + "eval_loss": 0.7632888555526733, + "eval_runtime": 49.913, + "eval_samples_per_second": 40.07, + "eval_steps_per_second": 0.641, + "step": 800 + }, + { + "epoch": 0.18, + "learning_rate": 0.0002843752263672582, + "loss": 0.7527, + "step": 820 + }, + { + "epoch": 0.18, + "learning_rate": 0.00028394060123143784, + "loss": 0.7407, + "step": 840 + }, + { + "epoch": 0.19, + "learning_rate": 0.0002835059760956175, + "loss": 0.744, + "step": 860 + }, + { + "epoch": 0.19, + "learning_rate": 0.0002830713509597971, + "loss": 0.7456, + "step": 880 + }, + { + "epoch": 0.19, + "learning_rate": 0.0002826367258239768, + "loss": 0.7429, + "step": 900 + }, + { + "epoch": 0.2, + "learning_rate": 0.00028220210068815644, + "loss": 0.7516, + "step": 920 + }, + { + "epoch": 0.2, + "learning_rate": 0.00028176747555233605, + "loss": 0.7381, + "step": 940 + }, + { + "epoch": 0.21, + "learning_rate": 0.0002813328504165157, + "loss": 0.7256, + "step": 960 + }, + { + "epoch": 0.21, + "learning_rate": 0.0002808982252806954, + "loss": 0.7443, + "step": 980 + }, + { + "epoch": 0.22, + "learning_rate": 0.000280463600144875, + "loss": 0.7389, + "step": 1000 + }, + { + "epoch": 0.22, + "eval_loss": 0.7532852292060852, + "eval_runtime": 49.9829, + "eval_samples_per_second": 40.014, + "eval_steps_per_second": 0.64, + "step": 1000 + }, + { + "epoch": 0.22, + "learning_rate": 0.00028002897500905465, + "loss": 0.7374, + "step": 1020 + }, + { + "epoch": 0.22, + "learning_rate": 0.0002795943498732343, + "loss": 0.7296, + "step": 1040 + }, + { + "epoch": 0.23, + "learning_rate": 0.0002791597247374139, + "loss": 0.7424, + "step": 1060 + }, + { + "epoch": 0.23, + "learning_rate": 0.0002787250996015936, + "loss": 0.7328, + "step": 1080 + }, + { + "epoch": 0.24, + "learning_rate": 0.00027829047446577325, + "loss": 0.7367, + "step": 1100 + }, + { + "epoch": 0.24, + "learning_rate": 0.00027785584932995286, + "loss": 0.7419, + "step": 1120 + }, + { + "epoch": 0.25, + "learning_rate": 0.0002774212241941325, + "loss": 0.7347, + "step": 1140 + }, + { + "epoch": 0.25, + "learning_rate": 0.0002769865990583122, + "loss": 0.7292, + "step": 1160 + }, + { + "epoch": 0.25, + "learning_rate": 0.0002765519739224918, + "loss": 0.7394, + "step": 1180 + }, + { + "epoch": 0.26, + "learning_rate": 0.00027611734878667146, + "loss": 0.7358, + "step": 1200 + }, + { + "epoch": 0.26, + "eval_loss": 0.7463639974594116, + "eval_runtime": 49.9963, + "eval_samples_per_second": 40.003, + "eval_steps_per_second": 0.64, + "step": 1200 + }, + { + "epoch": 0.26, + "learning_rate": 0.0002756827236508511, + "loss": 0.7266, + "step": 1220 + }, + { + "epoch": 0.27, + "learning_rate": 0.00027524809851503073, + "loss": 0.7336, + "step": 1240 + }, + { + "epoch": 0.27, + "learning_rate": 0.0002748134733792104, + "loss": 0.7296, + "step": 1260 + }, + { + "epoch": 0.28, + "learning_rate": 0.00027437884824339006, + "loss": 0.73, + "step": 1280 + }, + { + "epoch": 0.28, + "learning_rate": 0.00027394422310756967, + "loss": 0.7312, + "step": 1300 + }, + { + "epoch": 0.28, + "learning_rate": 0.00027350959797174933, + "loss": 0.7307, + "step": 1320 + }, + { + "epoch": 0.29, + "learning_rate": 0.000273074972835929, + "loss": 0.7246, + "step": 1340 + }, + { + "epoch": 0.29, + "learning_rate": 0.0002726403477001086, + "loss": 0.7299, + "step": 1360 + }, + { + "epoch": 0.3, + "learning_rate": 0.00027220572256428827, + "loss": 0.7251, + "step": 1380 + }, + { + "epoch": 0.3, + "learning_rate": 0.00027177109742846793, + "loss": 0.7286, + "step": 1400 + }, + { + "epoch": 0.3, + "eval_loss": 0.7393819093704224, + "eval_runtime": 49.9896, + "eval_samples_per_second": 40.008, + "eval_steps_per_second": 0.64, + "step": 1400 + }, + { + "epoch": 0.31, + "learning_rate": 0.00027133647229264754, + "loss": 0.7186, + "step": 1420 + }, + { + "epoch": 0.31, + "learning_rate": 0.0002709018471568272, + "loss": 0.7215, + "step": 1440 + }, + { + "epoch": 0.31, + "learning_rate": 0.00027046722202100687, + "loss": 0.7295, + "step": 1460 + }, + { + "epoch": 0.32, + "learning_rate": 0.0002700325968851865, + "loss": 0.7198, + "step": 1480 + }, + { + "epoch": 0.32, + "learning_rate": 0.00026959797174936614, + "loss": 0.7184, + "step": 1500 + }, + { + "epoch": 0.33, + "learning_rate": 0.0002691633466135458, + "loss": 0.7283, + "step": 1520 + }, + { + "epoch": 0.33, + "learning_rate": 0.0002687287214777254, + "loss": 0.7378, + "step": 1540 + }, + { + "epoch": 0.34, + "learning_rate": 0.0002682940963419051, + "loss": 0.7196, + "step": 1560 + }, + { + "epoch": 0.34, + "learning_rate": 0.00026785947120608474, + "loss": 0.7152, + "step": 1580 + }, + { + "epoch": 0.35, + "learning_rate": 0.00026742484607026435, + "loss": 0.7184, + "step": 1600 + }, + { + "epoch": 0.35, + "eval_loss": 0.7342154383659363, + "eval_runtime": 49.9957, + "eval_samples_per_second": 40.003, + "eval_steps_per_second": 0.64, + "step": 1600 + }, + { + "epoch": 0.35, + "learning_rate": 0.000266990220934444, + "loss": 0.7164, + "step": 1620 + }, + { + "epoch": 0.35, + "learning_rate": 0.0002665555957986237, + "loss": 0.7136, + "step": 1640 + }, + { + "epoch": 0.36, + "learning_rate": 0.0002661209706628033, + "loss": 0.7203, + "step": 1660 + }, + { + "epoch": 0.36, + "learning_rate": 0.00026568634552698295, + "loss": 0.7158, + "step": 1680 + }, + { + "epoch": 0.37, + "learning_rate": 0.0002652517203911626, + "loss": 0.7145, + "step": 1700 + }, + { + "epoch": 0.37, + "learning_rate": 0.0002648170952553422, + "loss": 0.7111, + "step": 1720 + }, + { + "epoch": 0.38, + "learning_rate": 0.0002643824701195219, + "loss": 0.7155, + "step": 1740 + }, + { + "epoch": 0.38, + "learning_rate": 0.00026394784498370155, + "loss": 0.718, + "step": 1760 + }, + { + "epoch": 0.38, + "learning_rate": 0.00026351321984788116, + "loss": 0.7125, + "step": 1780 + }, + { + "epoch": 0.39, + "learning_rate": 0.0002630785947120608, + "loss": 0.7163, + "step": 1800 + }, + { + "epoch": 0.39, + "eval_loss": 0.7301950454711914, + "eval_runtime": 49.9689, + "eval_samples_per_second": 40.025, + "eval_steps_per_second": 0.64, + "step": 1800 + }, + { + "epoch": 0.39, + "learning_rate": 0.0002626439695762405, + "loss": 0.7121, + "step": 1820 + }, + { + "epoch": 0.4, + "learning_rate": 0.0002622093444404201, + "loss": 0.7092, + "step": 1840 + }, + { + "epoch": 0.4, + "learning_rate": 0.00026177471930459976, + "loss": 0.7133, + "step": 1860 + }, + { + "epoch": 0.41, + "learning_rate": 0.0002613400941687794, + "loss": 0.7171, + "step": 1880 + }, + { + "epoch": 0.41, + "learning_rate": 0.00026090546903295903, + "loss": 0.7235, + "step": 1900 + }, + { + "epoch": 0.41, + "learning_rate": 0.0002604708438971387, + "loss": 0.7086, + "step": 1920 + }, + { + "epoch": 0.42, + "learning_rate": 0.00026003621876131836, + "loss": 0.7136, + "step": 1940 + }, + { + "epoch": 0.42, + "learning_rate": 0.00025960159362549797, + "loss": 0.7031, + "step": 1960 + }, + { + "epoch": 0.43, + "learning_rate": 0.00025916696848967763, + "loss": 0.7084, + "step": 1980 + }, + { + "epoch": 0.43, + "learning_rate": 0.0002587323433538573, + "loss": 0.7091, + "step": 2000 + }, + { + "epoch": 0.43, + "eval_loss": 0.726446270942688, + "eval_runtime": 50.0519, + "eval_samples_per_second": 39.959, + "eval_steps_per_second": 0.639, + "step": 2000 + }, + { + "epoch": 0.44, + "learning_rate": 0.0002582977182180369, + "loss": 0.7119, + "step": 2020 + }, + { + "epoch": 0.44, + "learning_rate": 0.00025786309308221657, + "loss": 0.7186, + "step": 2040 + }, + { + "epoch": 0.44, + "learning_rate": 0.00025742846794639623, + "loss": 0.703, + "step": 2060 + }, + { + "epoch": 0.45, + "learning_rate": 0.00025699384281057584, + "loss": 0.7078, + "step": 2080 + }, + { + "epoch": 0.45, + "learning_rate": 0.0002565592176747555, + "loss": 0.7084, + "step": 2100 + }, + { + "epoch": 0.46, + "learning_rate": 0.00025612459253893517, + "loss": 0.7014, + "step": 2120 + }, + { + "epoch": 0.46, + "learning_rate": 0.0002556899674031148, + "loss": 0.7076, + "step": 2140 + }, + { + "epoch": 0.47, + "learning_rate": 0.00025525534226729444, + "loss": 0.7103, + "step": 2160 + }, + { + "epoch": 0.47, + "learning_rate": 0.0002548207171314741, + "loss": 0.7118, + "step": 2180 + }, + { + "epoch": 0.47, + "learning_rate": 0.0002543860919956537, + "loss": 0.7028, + "step": 2200 + }, + { + "epoch": 0.47, + "eval_loss": 0.7220268845558167, + "eval_runtime": 49.9937, + "eval_samples_per_second": 40.005, + "eval_steps_per_second": 0.64, + "step": 2200 + }, + { + "epoch": 0.48, + "learning_rate": 0.0002539514668598334, + "loss": 0.707, + "step": 2220 + }, + { + "epoch": 0.48, + "learning_rate": 0.00025351684172401304, + "loss": 0.7045, + "step": 2240 + }, + { + "epoch": 0.49, + "learning_rate": 0.00025308221658819265, + "loss": 0.6905, + "step": 2260 + }, + { + "epoch": 0.49, + "learning_rate": 0.0002526475914523723, + "loss": 0.6982, + "step": 2280 + }, + { + "epoch": 0.5, + "learning_rate": 0.000252212966316552, + "loss": 0.706, + "step": 2300 + }, + { + "epoch": 0.5, + "learning_rate": 0.0002517783411807316, + "loss": 0.6992, + "step": 2320 + }, + { + "epoch": 0.5, + "learning_rate": 0.00025134371604491125, + "loss": 0.6939, + "step": 2340 + }, + { + "epoch": 0.51, + "learning_rate": 0.00025090909090909086, + "loss": 0.7037, + "step": 2360 + }, + { + "epoch": 0.51, + "learning_rate": 0.0002504744657732705, + "loss": 0.7127, + "step": 2380 + }, + { + "epoch": 0.52, + "learning_rate": 0.00025003984063745014, + "loss": 0.702, + "step": 2400 + }, + { + "epoch": 0.52, + "eval_loss": 0.7191869020462036, + "eval_runtime": 50.0038, + "eval_samples_per_second": 39.997, + "eval_steps_per_second": 0.64, + "step": 2400 + }, + { + "epoch": 0.52, + "learning_rate": 0.0002496052155016298, + "loss": 0.7033, + "step": 2420 + }, + { + "epoch": 0.53, + "learning_rate": 0.00024917059036580946, + "loss": 0.7028, + "step": 2440 + }, + { + "epoch": 0.53, + "learning_rate": 0.00024873596522998907, + "loss": 0.6967, + "step": 2460 + }, + { + "epoch": 0.54, + "learning_rate": 0.00024830134009416874, + "loss": 0.7068, + "step": 2480 + }, + { + "epoch": 0.54, + "learning_rate": 0.0002478667149583484, + "loss": 0.7105, + "step": 2500 + }, + { + "epoch": 0.54, + "learning_rate": 0.000247432089822528, + "loss": 0.6968, + "step": 2520 + }, + { + "epoch": 0.55, + "learning_rate": 0.00024699746468670767, + "loss": 0.7025, + "step": 2540 + }, + { + "epoch": 0.55, + "learning_rate": 0.00024656283955088734, + "loss": 0.6942, + "step": 2560 + }, + { + "epoch": 0.56, + "learning_rate": 0.00024612821441506694, + "loss": 0.6948, + "step": 2580 + }, + { + "epoch": 0.56, + "learning_rate": 0.0002456935892792466, + "loss": 0.6979, + "step": 2600 + }, + { + "epoch": 0.56, + "eval_loss": 0.715853750705719, + "eval_runtime": 50.0426, + "eval_samples_per_second": 39.966, + "eval_steps_per_second": 0.639, + "step": 2600 + }, + { + "epoch": 0.57, + "learning_rate": 0.00024525896414342627, + "loss": 0.6967, + "step": 2620 + }, + { + "epoch": 0.57, + "learning_rate": 0.0002448243390076059, + "loss": 0.7012, + "step": 2640 + }, + { + "epoch": 0.57, + "learning_rate": 0.00024438971387178554, + "loss": 0.697, + "step": 2660 + }, + { + "epoch": 0.58, + "learning_rate": 0.0002439550887359652, + "loss": 0.6931, + "step": 2680 + }, + { + "epoch": 0.58, + "learning_rate": 0.00024352046360014485, + "loss": 0.6856, + "step": 2700 + }, + { + "epoch": 0.59, + "learning_rate": 0.00024308583846432448, + "loss": 0.697, + "step": 2720 + }, + { + "epoch": 0.59, + "learning_rate": 0.00024265121332850415, + "loss": 0.6996, + "step": 2740 + }, + { + "epoch": 0.6, + "learning_rate": 0.00024221658819268378, + "loss": 0.698, + "step": 2760 + }, + { + "epoch": 0.6, + "learning_rate": 0.00024178196305686342, + "loss": 0.6952, + "step": 2780 + }, + { + "epoch": 0.6, + "learning_rate": 0.00024134733792104308, + "loss": 0.7049, + "step": 2800 + }, + { + "epoch": 0.6, + "eval_loss": 0.7124837040901184, + "eval_runtime": 50.0654, + "eval_samples_per_second": 39.948, + "eval_steps_per_second": 0.639, + "step": 2800 + }, + { + "epoch": 0.61, + "learning_rate": 0.00024091271278522272, + "loss": 0.6927, + "step": 2820 + }, + { + "epoch": 0.61, + "learning_rate": 0.00024047808764940235, + "loss": 0.6996, + "step": 2840 + }, + { + "epoch": 0.62, + "learning_rate": 0.00024004346251358202, + "loss": 0.6921, + "step": 2860 + }, + { + "epoch": 0.62, + "learning_rate": 0.00023960883737776165, + "loss": 0.695, + "step": 2880 + }, + { + "epoch": 0.63, + "learning_rate": 0.0002391742122419413, + "loss": 0.6887, + "step": 2900 + }, + { + "epoch": 0.63, + "learning_rate": 0.00023873958710612095, + "loss": 0.6915, + "step": 2920 + }, + { + "epoch": 0.63, + "learning_rate": 0.0002383049619703006, + "loss": 0.6915, + "step": 2940 + }, + { + "epoch": 0.64, + "learning_rate": 0.00023787033683448023, + "loss": 0.6916, + "step": 2960 + }, + { + "epoch": 0.64, + "learning_rate": 0.0002374357116986599, + "loss": 0.687, + "step": 2980 + }, + { + "epoch": 0.65, + "learning_rate": 0.00023700108656283953, + "loss": 0.6997, + "step": 3000 + }, + { + "epoch": 0.65, + "eval_loss": 0.7098860144615173, + "eval_runtime": 50.0652, + "eval_samples_per_second": 39.948, + "eval_steps_per_second": 0.639, + "step": 3000 + }, + { + "epoch": 0.65, + "learning_rate": 0.00023656646142701916, + "loss": 0.6895, + "step": 3020 + }, + { + "epoch": 0.66, + "learning_rate": 0.00023613183629119883, + "loss": 0.6861, + "step": 3040 + }, + { + "epoch": 0.66, + "learning_rate": 0.00023569721115537846, + "loss": 0.6988, + "step": 3060 + }, + { + "epoch": 0.66, + "learning_rate": 0.0002352625860195581, + "loss": 0.6852, + "step": 3080 + }, + { + "epoch": 0.67, + "learning_rate": 0.00023482796088373776, + "loss": 0.6863, + "step": 3100 + }, + { + "epoch": 0.67, + "learning_rate": 0.0002343933357479174, + "loss": 0.6943, + "step": 3120 + }, + { + "epoch": 0.68, + "learning_rate": 0.00023395871061209704, + "loss": 0.686, + "step": 3140 + }, + { + "epoch": 0.68, + "learning_rate": 0.0002335240854762767, + "loss": 0.684, + "step": 3160 + }, + { + "epoch": 0.69, + "learning_rate": 0.00023308946034045634, + "loss": 0.6866, + "step": 3180 + }, + { + "epoch": 0.69, + "learning_rate": 0.00023265483520463597, + "loss": 0.6859, + "step": 3200 + }, + { + "epoch": 0.69, + "eval_loss": 0.7077216506004333, + "eval_runtime": 50.0526, + "eval_samples_per_second": 39.958, + "eval_steps_per_second": 0.639, + "step": 3200 + }, + { + "epoch": 0.69, + "learning_rate": 0.00023222021006881564, + "loss": 0.6845, + "step": 3220 + }, + { + "epoch": 0.7, + "learning_rate": 0.00023178558493299527, + "loss": 0.7011, + "step": 3240 + }, + { + "epoch": 0.7, + "learning_rate": 0.0002313509597971749, + "loss": 0.69, + "step": 3260 + }, + { + "epoch": 0.71, + "learning_rate": 0.00023091633466135457, + "loss": 0.6931, + "step": 3280 + }, + { + "epoch": 0.71, + "learning_rate": 0.0002304817095255342, + "loss": 0.6998, + "step": 3300 + }, + { + "epoch": 0.72, + "learning_rate": 0.00023004708438971385, + "loss": 0.6933, + "step": 3320 + }, + { + "epoch": 0.72, + "learning_rate": 0.0002296124592538935, + "loss": 0.6859, + "step": 3340 + }, + { + "epoch": 0.72, + "learning_rate": 0.00022917783411807315, + "loss": 0.6972, + "step": 3360 + }, + { + "epoch": 0.73, + "learning_rate": 0.00022874320898225278, + "loss": 0.6868, + "step": 3380 + }, + { + "epoch": 0.73, + "learning_rate": 0.00022830858384643245, + "loss": 0.6902, + "step": 3400 + }, + { + "epoch": 0.73, + "eval_loss": 0.7059928178787231, + "eval_runtime": 50.0118, + "eval_samples_per_second": 39.991, + "eval_steps_per_second": 0.64, + "step": 3400 + }, + { + "epoch": 0.74, + "learning_rate": 0.00022787395871061208, + "loss": 0.6819, + "step": 3420 + }, + { + "epoch": 0.74, + "learning_rate": 0.00022743933357479172, + "loss": 0.6833, + "step": 3440 + }, + { + "epoch": 0.75, + "learning_rate": 0.00022700470843897138, + "loss": 0.6826, + "step": 3460 + }, + { + "epoch": 0.75, + "learning_rate": 0.00022657008330315102, + "loss": 0.694, + "step": 3480 + }, + { + "epoch": 0.76, + "learning_rate": 0.00022613545816733066, + "loss": 0.6827, + "step": 3500 + }, + { + "epoch": 0.76, + "learning_rate": 0.00022570083303151032, + "loss": 0.6844, + "step": 3520 + }, + { + "epoch": 0.76, + "learning_rate": 0.00022526620789568996, + "loss": 0.6893, + "step": 3540 + }, + { + "epoch": 0.77, + "learning_rate": 0.0002248315827598696, + "loss": 0.6843, + "step": 3560 + }, + { + "epoch": 0.77, + "learning_rate": 0.00022439695762404926, + "loss": 0.6843, + "step": 3580 + }, + { + "epoch": 0.78, + "learning_rate": 0.0002239623324882289, + "loss": 0.691, + "step": 3600 + }, + { + "epoch": 0.78, + "eval_loss": 0.7041522264480591, + "eval_runtime": 50.0554, + "eval_samples_per_second": 39.956, + "eval_steps_per_second": 0.639, + "step": 3600 + }, + { + "epoch": 0.78, + "learning_rate": 0.00022352770735240853, + "loss": 0.6846, + "step": 3620 + }, + { + "epoch": 0.79, + "learning_rate": 0.0002230930822165882, + "loss": 0.689, + "step": 3640 + }, + { + "epoch": 0.79, + "learning_rate": 0.00022265845708076783, + "loss": 0.6777, + "step": 3660 + }, + { + "epoch": 0.79, + "learning_rate": 0.00022222383194494747, + "loss": 0.6903, + "step": 3680 + }, + { + "epoch": 0.8, + "learning_rate": 0.00022178920680912713, + "loss": 0.684, + "step": 3700 + }, + { + "epoch": 0.8, + "learning_rate": 0.00022135458167330677, + "loss": 0.6867, + "step": 3720 + }, + { + "epoch": 0.81, + "learning_rate": 0.0002209199565374864, + "loss": 0.6697, + "step": 3740 + }, + { + "epoch": 0.81, + "learning_rate": 0.00022048533140166607, + "loss": 0.6864, + "step": 3760 + }, + { + "epoch": 0.82, + "learning_rate": 0.0002200507062658457, + "loss": 0.6813, + "step": 3780 + }, + { + "epoch": 0.82, + "learning_rate": 0.00021961608113002534, + "loss": 0.6807, + "step": 3800 + }, + { + "epoch": 0.82, + "eval_loss": 0.7024796009063721, + "eval_runtime": 50.022, + "eval_samples_per_second": 39.982, + "eval_steps_per_second": 0.64, + "step": 3800 + }, + { + "epoch": 0.82, + "learning_rate": 0.000219181455994205, + "loss": 0.6824, + "step": 3820 + }, + { + "epoch": 0.83, + "learning_rate": 0.00021874683085838464, + "loss": 0.6814, + "step": 3840 + }, + { + "epoch": 0.83, + "learning_rate": 0.00021831220572256427, + "loss": 0.6789, + "step": 3860 + }, + { + "epoch": 0.84, + "learning_rate": 0.00021787758058674394, + "loss": 0.6752, + "step": 3880 + }, + { + "epoch": 0.84, + "learning_rate": 0.00021744295545092358, + "loss": 0.6826, + "step": 3900 + }, + { + "epoch": 0.85, + "learning_rate": 0.0002170083303151032, + "loss": 0.6874, + "step": 3920 + }, + { + "epoch": 0.85, + "learning_rate": 0.00021657370517928288, + "loss": 0.6761, + "step": 3940 + }, + { + "epoch": 0.85, + "learning_rate": 0.0002161390800434625, + "loss": 0.6795, + "step": 3960 + }, + { + "epoch": 0.86, + "learning_rate": 0.00021570445490764215, + "loss": 0.6781, + "step": 3980 + }, + { + "epoch": 0.86, + "learning_rate": 0.0002152698297718218, + "loss": 0.6754, + "step": 4000 + }, + { + "epoch": 0.86, + "eval_loss": 0.7004331350326538, + "eval_runtime": 50.0568, + "eval_samples_per_second": 39.955, + "eval_steps_per_second": 0.639, + "step": 4000 + }, + { + "epoch": 0.87, + "learning_rate": 0.00021483520463600145, + "loss": 0.6791, + "step": 4020 + }, + { + "epoch": 0.87, + "learning_rate": 0.00021440057950018108, + "loss": 0.6863, + "step": 4040 + }, + { + "epoch": 0.88, + "learning_rate": 0.00021396595436436075, + "loss": 0.6846, + "step": 4060 + }, + { + "epoch": 0.88, + "learning_rate": 0.00021353132922854036, + "loss": 0.6814, + "step": 4080 + }, + { + "epoch": 0.88, + "learning_rate": 0.00021309670409272, + "loss": 0.6825, + "step": 4100 + }, + { + "epoch": 0.89, + "learning_rate": 0.00021266207895689963, + "loss": 0.6827, + "step": 4120 + }, + { + "epoch": 0.89, + "learning_rate": 0.0002122274538210793, + "loss": 0.6769, + "step": 4140 + }, + { + "epoch": 0.9, + "learning_rate": 0.00021179282868525893, + "loss": 0.6869, + "step": 4160 + }, + { + "epoch": 0.9, + "learning_rate": 0.00021135820354943857, + "loss": 0.6815, + "step": 4180 + }, + { + "epoch": 0.91, + "learning_rate": 0.00021092357841361823, + "loss": 0.6725, + "step": 4200 + }, + { + "epoch": 0.91, + "eval_loss": 0.6981337666511536, + "eval_runtime": 50.0559, + "eval_samples_per_second": 39.955, + "eval_steps_per_second": 0.639, + "step": 4200 + }, + { + "epoch": 0.91, + "learning_rate": 0.00021051068453458889, + "loss": 0.6731, + "step": 4220 + }, + { + "epoch": 0.91, + "learning_rate": 0.00021007605939876855, + "loss": 0.6792, + "step": 4240 + }, + { + "epoch": 0.92, + "learning_rate": 0.00020964143426294819, + "loss": 0.6755, + "step": 4260 + }, + { + "epoch": 0.92, + "learning_rate": 0.00020920680912712782, + "loss": 0.6833, + "step": 4280 + }, + { + "epoch": 0.93, + "learning_rate": 0.0002087721839913075, + "loss": 0.6693, + "step": 4300 + }, + { + "epoch": 0.93, + "learning_rate": 0.00020833755885548712, + "loss": 0.6728, + "step": 4320 + }, + { + "epoch": 0.94, + "learning_rate": 0.00020790293371966676, + "loss": 0.6812, + "step": 4340 + }, + { + "epoch": 0.94, + "learning_rate": 0.00020746830858384642, + "loss": 0.6734, + "step": 4360 + }, + { + "epoch": 0.94, + "learning_rate": 0.00020703368344802606, + "loss": 0.6813, + "step": 4380 + }, + { + "epoch": 0.95, + "learning_rate": 0.0002065990583122057, + "loss": 0.6779, + "step": 4400 + }, + { + "epoch": 0.95, + "eval_loss": 0.6968498826026917, + "eval_runtime": 50.0697, + "eval_samples_per_second": 39.944, + "eval_steps_per_second": 0.639, + "step": 4400 + }, + { + "epoch": 0.95, + "learning_rate": 0.00020616443317638536, + "loss": 0.6712, + "step": 4420 + }, + { + "epoch": 0.96, + "learning_rate": 0.000205729808040565, + "loss": 0.6846, + "step": 4440 + }, + { + "epoch": 0.96, + "learning_rate": 0.00020529518290474463, + "loss": 0.6694, + "step": 4460 + }, + { + "epoch": 0.97, + "learning_rate": 0.0002048605577689243, + "loss": 0.6753, + "step": 4480 + }, + { + "epoch": 0.97, + "learning_rate": 0.00020442593263310393, + "loss": 0.6792, + "step": 4500 + }, + { + "epoch": 0.98, + "learning_rate": 0.00020399130749728357, + "loss": 0.6738, + "step": 4520 + }, + { + "epoch": 0.98, + "learning_rate": 0.00020355668236146323, + "loss": 0.6699, + "step": 4540 + }, + { + "epoch": 0.98, + "learning_rate": 0.00020312205722564287, + "loss": 0.6737, + "step": 4560 + }, + { + "epoch": 0.99, + "learning_rate": 0.0002026874320898225, + "loss": 0.6837, + "step": 4580 + }, + { + "epoch": 0.99, + "learning_rate": 0.00020225280695400217, + "loss": 0.6701, + "step": 4600 + }, + { + "epoch": 0.99, + "eval_loss": 0.6954157948493958, + "eval_runtime": 50.0724, + "eval_samples_per_second": 39.942, + "eval_steps_per_second": 0.639, + "step": 4600 + }, + { + "epoch": 1.0, + "learning_rate": 0.0002018181818181818, + "loss": 0.6677, + "step": 4620 + }, + { + "epoch": 1.0, + "learning_rate": 0.00020138355668236144, + "loss": 0.6706, + "step": 4640 + }, + { + "epoch": 1.01, + "learning_rate": 0.0002009489315465411, + "loss": 0.6741, + "step": 4660 + }, + { + "epoch": 1.01, + "learning_rate": 0.00020051430641072074, + "loss": 0.6757, + "step": 4680 + }, + { + "epoch": 1.01, + "learning_rate": 0.00020007968127490038, + "loss": 0.6773, + "step": 4700 + }, + { + "epoch": 1.02, + "learning_rate": 0.00019964505613908004, + "loss": 0.6728, + "step": 4720 + }, + { + "epoch": 1.02, + "learning_rate": 0.00019921043100325968, + "loss": 0.6715, + "step": 4740 + }, + { + "epoch": 1.03, + "learning_rate": 0.00019877580586743931, + "loss": 0.6679, + "step": 4760 + }, + { + "epoch": 1.03, + "learning_rate": 0.00019834118073161898, + "loss": 0.6729, + "step": 4780 + }, + { + "epoch": 1.04, + "learning_rate": 0.00019790655559579861, + "loss": 0.6749, + "step": 4800 + }, + { + "epoch": 1.04, + "eval_loss": 0.6941403746604919, + "eval_runtime": 50.0645, + "eval_samples_per_second": 39.948, + "eval_steps_per_second": 0.639, + "step": 4800 + }, + { + "epoch": 1.04, + "learning_rate": 0.00019747193045997825, + "loss": 0.6661, + "step": 4820 + }, + { + "epoch": 1.04, + "learning_rate": 0.0001970373053241579, + "loss": 0.6638, + "step": 4840 + }, + { + "epoch": 1.05, + "learning_rate": 0.00019660268018833755, + "loss": 0.6715, + "step": 4860 + }, + { + "epoch": 1.05, + "learning_rate": 0.0001961680550525172, + "loss": 0.6721, + "step": 4880 + }, + { + "epoch": 1.06, + "learning_rate": 0.00019573342991669682, + "loss": 0.6695, + "step": 4900 + }, + { + "epoch": 1.06, + "learning_rate": 0.0001952988047808765, + "loss": 0.6809, + "step": 4920 + }, + { + "epoch": 1.07, + "learning_rate": 0.00019486417964505612, + "loss": 0.6701, + "step": 4940 + }, + { + "epoch": 1.07, + "learning_rate": 0.00019442955450923576, + "loss": 0.6747, + "step": 4960 + }, + { + "epoch": 1.07, + "learning_rate": 0.00019399492937341542, + "loss": 0.6713, + "step": 4980 + }, + { + "epoch": 1.08, + "learning_rate": 0.00019356030423759506, + "loss": 0.6746, + "step": 5000 + }, + { + "epoch": 1.08, + "eval_loss": 0.6935788989067078, + "eval_runtime": 50.0137, + "eval_samples_per_second": 39.989, + "eval_steps_per_second": 0.64, + "step": 5000 + }, + { + "epoch": 1.08, + "learning_rate": 0.0001931256791017747, + "loss": 0.672, + "step": 5020 + }, + { + "epoch": 1.09, + "learning_rate": 0.00019269105396595436, + "loss": 0.6673, + "step": 5040 + }, + { + "epoch": 1.09, + "learning_rate": 0.000192256428830134, + "loss": 0.6706, + "step": 5060 + }, + { + "epoch": 1.1, + "learning_rate": 0.00019182180369431363, + "loss": 0.6677, + "step": 5080 + }, + { + "epoch": 1.1, + "learning_rate": 0.0001913871785584933, + "loss": 0.67, + "step": 5100 + }, + { + "epoch": 1.1, + "learning_rate": 0.00019095255342267293, + "loss": 0.6693, + "step": 5120 + }, + { + "epoch": 1.11, + "learning_rate": 0.00019051792828685257, + "loss": 0.671, + "step": 5140 + }, + { + "epoch": 1.11, + "learning_rate": 0.00019008330315103223, + "loss": 0.6748, + "step": 5160 + }, + { + "epoch": 1.12, + "learning_rate": 0.00018964867801521187, + "loss": 0.6698, + "step": 5180 + }, + { + "epoch": 1.12, + "learning_rate": 0.0001892140528793915, + "loss": 0.662, + "step": 5200 + }, + { + "epoch": 1.12, + "eval_loss": 0.6918168663978577, + "eval_runtime": 50.0897, + "eval_samples_per_second": 39.928, + "eval_steps_per_second": 0.639, + "step": 5200 + }, + { + "epoch": 1.13, + "learning_rate": 0.00018877942774357117, + "loss": 0.66, + "step": 5220 + }, + { + "epoch": 1.13, + "learning_rate": 0.0001883448026077508, + "loss": 0.6705, + "step": 5240 + }, + { + "epoch": 1.13, + "learning_rate": 0.00018791017747193044, + "loss": 0.6693, + "step": 5260 + }, + { + "epoch": 1.14, + "learning_rate": 0.0001874755523361101, + "loss": 0.6546, + "step": 5280 + }, + { + "epoch": 1.14, + "learning_rate": 0.00018704092720028974, + "loss": 0.6673, + "step": 5300 + }, + { + "epoch": 1.15, + "learning_rate": 0.00018660630206446938, + "loss": 0.671, + "step": 5320 + }, + { + "epoch": 1.15, + "learning_rate": 0.00018617167692864904, + "loss": 0.675, + "step": 5340 + }, + { + "epoch": 1.16, + "learning_rate": 0.00018573705179282868, + "loss": 0.6744, + "step": 5360 + }, + { + "epoch": 1.16, + "learning_rate": 0.00018530242665700832, + "loss": 0.6643, + "step": 5380 + }, + { + "epoch": 1.17, + "learning_rate": 0.00018486780152118798, + "loss": 0.6686, + "step": 5400 + }, + { + "epoch": 1.17, + "eval_loss": 0.6908227801322937, + "eval_runtime": 50.0742, + "eval_samples_per_second": 39.941, + "eval_steps_per_second": 0.639, + "step": 5400 + }, + { + "epoch": 1.17, + "learning_rate": 0.00018443317638536762, + "loss": 0.6666, + "step": 5420 + }, + { + "epoch": 1.17, + "learning_rate": 0.00018399855124954725, + "loss": 0.6658, + "step": 5440 + }, + { + "epoch": 1.18, + "learning_rate": 0.0001835639261137269, + "loss": 0.671, + "step": 5460 + }, + { + "epoch": 1.18, + "learning_rate": 0.00018312930097790653, + "loss": 0.6736, + "step": 5480 + }, + { + "epoch": 1.19, + "learning_rate": 0.00018269467584208616, + "loss": 0.6697, + "step": 5500 + }, + { + "epoch": 1.19, + "learning_rate": 0.00018226005070626583, + "loss": 0.6718, + "step": 5520 + }, + { + "epoch": 1.2, + "learning_rate": 0.00018182542557044546, + "loss": 0.6701, + "step": 5540 + }, + { + "epoch": 1.2, + "learning_rate": 0.0001813908004346251, + "loss": 0.6696, + "step": 5560 + }, + { + "epoch": 1.2, + "learning_rate": 0.00018095617529880476, + "loss": 0.6611, + "step": 5580 + }, + { + "epoch": 1.21, + "learning_rate": 0.0001805215501629844, + "loss": 0.6638, + "step": 5600 + }, + { + "epoch": 1.21, + "eval_loss": 0.689289927482605, + "eval_runtime": 50.1304, + "eval_samples_per_second": 39.896, + "eval_steps_per_second": 0.638, + "step": 5600 + }, + { + "epoch": 1.21, + "learning_rate": 0.00018008692502716404, + "loss": 0.6646, + "step": 5620 + }, + { + "epoch": 1.22, + "learning_rate": 0.0001796522998913437, + "loss": 0.6717, + "step": 5640 + }, + { + "epoch": 1.22, + "learning_rate": 0.00017921767475552334, + "loss": 0.6647, + "step": 5660 + }, + { + "epoch": 1.23, + "learning_rate": 0.00017878304961970297, + "loss": 0.672, + "step": 5680 + }, + { + "epoch": 1.23, + "learning_rate": 0.00017834842448388264, + "loss": 0.6645, + "step": 5700 + }, + { + "epoch": 1.23, + "learning_rate": 0.00017791379934806227, + "loss": 0.6768, + "step": 5720 + }, + { + "epoch": 1.24, + "learning_rate": 0.0001774791742122419, + "loss": 0.6748, + "step": 5740 + }, + { + "epoch": 1.24, + "learning_rate": 0.00017704454907642157, + "loss": 0.6722, + "step": 5760 + }, + { + "epoch": 1.25, + "learning_rate": 0.0001766099239406012, + "loss": 0.6631, + "step": 5780 + }, + { + "epoch": 1.25, + "learning_rate": 0.00017617529880478084, + "loss": 0.6647, + "step": 5800 + }, + { + "epoch": 1.25, + "eval_loss": 0.688850462436676, + "eval_runtime": 50.0542, + "eval_samples_per_second": 39.957, + "eval_steps_per_second": 0.639, + "step": 5800 + }, + { + "epoch": 1.26, + "learning_rate": 0.0001757406736689605, + "loss": 0.66, + "step": 5820 + }, + { + "epoch": 1.26, + "learning_rate": 0.00017530604853314014, + "loss": 0.6682, + "step": 5840 + }, + { + "epoch": 1.26, + "learning_rate": 0.00017487142339731978, + "loss": 0.6589, + "step": 5860 + }, + { + "epoch": 1.27, + "learning_rate": 0.00017443679826149944, + "loss": 0.6691, + "step": 5880 + }, + { + "epoch": 1.27, + "learning_rate": 0.00017400217312567908, + "loss": 0.6726, + "step": 5900 + }, + { + "epoch": 1.28, + "learning_rate": 0.00017356754798985872, + "loss": 0.6628, + "step": 5920 + }, + { + "epoch": 1.28, + "learning_rate": 0.00017313292285403838, + "loss": 0.6719, + "step": 5940 + }, + { + "epoch": 1.29, + "learning_rate": 0.00017269829771821802, + "loss": 0.6648, + "step": 5960 + }, + { + "epoch": 1.29, + "learning_rate": 0.00017226367258239765, + "loss": 0.6594, + "step": 5980 + }, + { + "epoch": 1.29, + "learning_rate": 0.00017182904744657732, + "loss": 0.6717, + "step": 6000 + }, + { + "epoch": 1.29, + "eval_loss": 0.6876093745231628, + "eval_runtime": 50.1763, + "eval_samples_per_second": 39.859, + "eval_steps_per_second": 0.638, + "step": 6000 + }, + { + "epoch": 1.3, + "learning_rate": 0.00017139442231075695, + "loss": 0.6632, + "step": 6020 + }, + { + "epoch": 1.3, + "learning_rate": 0.0001709597971749366, + "loss": 0.6619, + "step": 6040 + }, + { + "epoch": 1.31, + "learning_rate": 0.00017052517203911625, + "loss": 0.667, + "step": 6060 + }, + { + "epoch": 1.31, + "learning_rate": 0.0001700905469032959, + "loss": 0.6625, + "step": 6080 + }, + { + "epoch": 1.32, + "learning_rate": 0.00016965592176747553, + "loss": 0.6661, + "step": 6100 + }, + { + "epoch": 1.32, + "learning_rate": 0.0001692212966316552, + "loss": 0.656, + "step": 6120 + }, + { + "epoch": 1.32, + "learning_rate": 0.00016878667149583483, + "loss": 0.6668, + "step": 6140 + }, + { + "epoch": 1.33, + "learning_rate": 0.00016835204636001446, + "loss": 0.6669, + "step": 6160 + }, + { + "epoch": 1.33, + "learning_rate": 0.00016791742122419413, + "loss": 0.6662, + "step": 6180 + }, + { + "epoch": 1.34, + "learning_rate": 0.00016748279608837376, + "loss": 0.6692, + "step": 6200 + }, + { + "epoch": 1.34, + "eval_loss": 0.6869744658470154, + "eval_runtime": 50.1517, + "eval_samples_per_second": 39.879, + "eval_steps_per_second": 0.638, + "step": 6200 + }, + { + "epoch": 1.34, + "learning_rate": 0.0001670481709525534, + "loss": 0.6571, + "step": 6220 + }, + { + "epoch": 1.35, + "learning_rate": 0.00016661354581673306, + "loss": 0.6659, + "step": 6240 + }, + { + "epoch": 1.35, + "learning_rate": 0.0001661789206809127, + "loss": 0.6622, + "step": 6260 + }, + { + "epoch": 1.35, + "learning_rate": 0.00016574429554509234, + "loss": 0.6522, + "step": 6280 + }, + { + "epoch": 1.36, + "learning_rate": 0.000165309670409272, + "loss": 0.667, + "step": 6300 + }, + { + "epoch": 1.36, + "learning_rate": 0.00016487504527345164, + "loss": 0.6644, + "step": 6320 + }, + { + "epoch": 1.37, + "learning_rate": 0.00016444042013763127, + "loss": 0.6625, + "step": 6340 + }, + { + "epoch": 1.37, + "learning_rate": 0.00016400579500181094, + "loss": 0.6686, + "step": 6360 + }, + { + "epoch": 1.38, + "learning_rate": 0.00016357116986599057, + "loss": 0.6562, + "step": 6380 + }, + { + "epoch": 1.38, + "learning_rate": 0.0001631365447301702, + "loss": 0.6595, + "step": 6400 + }, + { + "epoch": 1.38, + "eval_loss": 0.685205340385437, + "eval_runtime": 50.162, + "eval_samples_per_second": 39.871, + "eval_steps_per_second": 0.638, + "step": 6400 + }, + { + "epoch": 1.39, + "learning_rate": 0.00016270191959434987, + "loss": 0.6595, + "step": 6420 + }, + { + "epoch": 1.39, + "learning_rate": 0.0001622672944585295, + "loss": 0.6644, + "step": 6440 + }, + { + "epoch": 1.39, + "learning_rate": 0.00016183266932270915, + "loss": 0.6647, + "step": 6460 + }, + { + "epoch": 1.4, + "learning_rate": 0.0001613980441868888, + "loss": 0.6655, + "step": 6480 + }, + { + "epoch": 1.4, + "learning_rate": 0.00016096341905106845, + "loss": 0.6564, + "step": 6500 + }, + { + "epoch": 1.41, + "learning_rate": 0.00016052879391524808, + "loss": 0.6578, + "step": 6520 + }, + { + "epoch": 1.41, + "learning_rate": 0.00016009416877942775, + "loss": 0.6624, + "step": 6540 + }, + { + "epoch": 1.42, + "learning_rate": 0.00015965954364360738, + "loss": 0.6633, + "step": 6560 + }, + { + "epoch": 1.42, + "learning_rate": 0.00015922491850778702, + "loss": 0.6616, + "step": 6580 + }, + { + "epoch": 1.42, + "learning_rate": 0.00015879029337196668, + "loss": 0.6607, + "step": 6600 + }, + { + "epoch": 1.42, + "eval_loss": 0.6847727298736572, + "eval_runtime": 50.1562, + "eval_samples_per_second": 39.875, + "eval_steps_per_second": 0.638, + "step": 6600 + }, + { + "epoch": 1.43, + "learning_rate": 0.00015835566823614632, + "loss": 0.6564, + "step": 6620 + }, + { + "epoch": 1.43, + "learning_rate": 0.00015792104310032596, + "loss": 0.66, + "step": 6640 + }, + { + "epoch": 1.44, + "learning_rate": 0.00015748641796450562, + "loss": 0.6589, + "step": 6660 + }, + { + "epoch": 1.44, + "learning_rate": 0.00015705179282868526, + "loss": 0.6596, + "step": 6680 + }, + { + "epoch": 1.45, + "learning_rate": 0.0001566171676928649, + "loss": 0.6663, + "step": 6700 + }, + { + "epoch": 1.45, + "learning_rate": 0.00015618254255704456, + "loss": 0.6603, + "step": 6720 + }, + { + "epoch": 1.45, + "learning_rate": 0.0001557479174212242, + "loss": 0.6674, + "step": 6740 + }, + { + "epoch": 1.46, + "learning_rate": 0.00015531329228540383, + "loss": 0.6603, + "step": 6760 + }, + { + "epoch": 1.46, + "learning_rate": 0.0001548786671495835, + "loss": 0.6612, + "step": 6780 + }, + { + "epoch": 1.47, + "learning_rate": 0.00015444404201376313, + "loss": 0.6609, + "step": 6800 + }, + { + "epoch": 1.47, + "eval_loss": 0.683903694152832, + "eval_runtime": 50.079, + "eval_samples_per_second": 39.937, + "eval_steps_per_second": 0.639, + "step": 6800 + }, + { + "epoch": 1.47, + "learning_rate": 0.00015400941687794277, + "loss": 0.6557, + "step": 6820 + }, + { + "epoch": 1.48, + "learning_rate": 0.00015357479174212243, + "loss": 0.6627, + "step": 6840 + }, + { + "epoch": 1.48, + "learning_rate": 0.00015314016660630207, + "loss": 0.6667, + "step": 6860 + }, + { + "epoch": 1.48, + "learning_rate": 0.0001527055414704817, + "loss": 0.6633, + "step": 6880 + }, + { + "epoch": 1.49, + "learning_rate": 0.00015227091633466137, + "loss": 0.6565, + "step": 6900 + }, + { + "epoch": 1.49, + "learning_rate": 0.000151836291198841, + "loss": 0.6588, + "step": 6920 + }, + { + "epoch": 1.5, + "learning_rate": 0.00015140166606302064, + "loss": 0.6687, + "step": 6940 + }, + { + "epoch": 1.5, + "learning_rate": 0.0001509670409272003, + "loss": 0.6611, + "step": 6960 + }, + { + "epoch": 1.51, + "learning_rate": 0.00015053241579137994, + "loss": 0.6576, + "step": 6980 + }, + { + "epoch": 1.51, + "learning_rate": 0.00015009779065555957, + "loss": 0.6576, + "step": 7000 + }, + { + "epoch": 1.51, + "eval_loss": 0.6830142736434937, + "eval_runtime": 50.1233, + "eval_samples_per_second": 39.902, + "eval_steps_per_second": 0.638, + "step": 7000 + }, + { + "epoch": 1.51, + "learning_rate": 0.0001496631655197392, + "loss": 0.6617, + "step": 7020 + }, + { + "epoch": 1.52, + "learning_rate": 0.00014922854038391885, + "loss": 0.6533, + "step": 7040 + }, + { + "epoch": 1.52, + "learning_rate": 0.0001487939152480985, + "loss": 0.6524, + "step": 7060 + }, + { + "epoch": 1.53, + "learning_rate": 0.00014835929011227815, + "loss": 0.6597, + "step": 7080 + }, + { + "epoch": 1.53, + "learning_rate": 0.00014792466497645778, + "loss": 0.656, + "step": 7100 + }, + { + "epoch": 1.54, + "learning_rate": 0.00014749003984063745, + "loss": 0.6501, + "step": 7120 + }, + { + "epoch": 1.54, + "learning_rate": 0.00014705541470481708, + "loss": 0.6563, + "step": 7140 + }, + { + "epoch": 1.54, + "learning_rate": 0.00014662078956899672, + "loss": 0.6496, + "step": 7160 + }, + { + "epoch": 1.55, + "learning_rate": 0.00014618616443317638, + "loss": 0.6602, + "step": 7180 + }, + { + "epoch": 1.55, + "learning_rate": 0.00014575153929735602, + "loss": 0.6617, + "step": 7200 + }, + { + "epoch": 1.55, + "eval_loss": 0.6818540096282959, + "eval_runtime": 50.1175, + "eval_samples_per_second": 39.906, + "eval_steps_per_second": 0.639, + "step": 7200 + }, + { + "epoch": 1.56, + "learning_rate": 0.00014531691416153566, + "loss": 0.6655, + "step": 7220 + }, + { + "epoch": 1.56, + "learning_rate": 0.00014488228902571532, + "loss": 0.6544, + "step": 7240 + }, + { + "epoch": 1.57, + "learning_rate": 0.00014444766388989496, + "loss": 0.655, + "step": 7260 + }, + { + "epoch": 1.57, + "learning_rate": 0.0001440130387540746, + "loss": 0.6535, + "step": 7280 + }, + { + "epoch": 1.57, + "learning_rate": 0.00014357841361825426, + "loss": 0.6584, + "step": 7300 + }, + { + "epoch": 1.58, + "learning_rate": 0.0001431437884824339, + "loss": 0.6602, + "step": 7320 + }, + { + "epoch": 1.58, + "learning_rate": 0.00014270916334661353, + "loss": 0.6689, + "step": 7340 + }, + { + "epoch": 1.59, + "learning_rate": 0.0001422745382107932, + "loss": 0.6613, + "step": 7360 + }, + { + "epoch": 1.59, + "learning_rate": 0.00014183991307497283, + "loss": 0.659, + "step": 7380 + }, + { + "epoch": 1.6, + "learning_rate": 0.00014140528793915247, + "loss": 0.6463, + "step": 7400 + }, + { + "epoch": 1.6, + "eval_loss": 0.681868851184845, + "eval_runtime": 50.1388, + "eval_samples_per_second": 39.889, + "eval_steps_per_second": 0.638, + "step": 7400 + }, + { + "epoch": 1.6, + "learning_rate": 0.00014097066280333213, + "loss": 0.6617, + "step": 7420 + }, + { + "epoch": 1.61, + "learning_rate": 0.00014053603766751177, + "loss": 0.6648, + "step": 7440 + }, + { + "epoch": 1.61, + "learning_rate": 0.0001401014125316914, + "loss": 0.6528, + "step": 7460 + }, + { + "epoch": 1.61, + "learning_rate": 0.00013966678739587107, + "loss": 0.6655, + "step": 7480 + }, + { + "epoch": 1.62, + "learning_rate": 0.0001392321622600507, + "loss": 0.6609, + "step": 7500 + }, + { + "epoch": 1.62, + "learning_rate": 0.00013879753712423034, + "loss": 0.6528, + "step": 7520 + }, + { + "epoch": 1.63, + "learning_rate": 0.00013836291198841, + "loss": 0.6561, + "step": 7540 + }, + { + "epoch": 1.63, + "learning_rate": 0.00013792828685258964, + "loss": 0.6682, + "step": 7560 + }, + { + "epoch": 1.64, + "learning_rate": 0.00013749366171676928, + "loss": 0.6677, + "step": 7580 + }, + { + "epoch": 1.64, + "learning_rate": 0.00013705903658094894, + "loss": 0.6599, + "step": 7600 + }, + { + "epoch": 1.64, + "eval_loss": 0.6807426810264587, + "eval_runtime": 50.3308, + "eval_samples_per_second": 39.737, + "eval_steps_per_second": 0.636, + "step": 7600 + }, + { + "epoch": 1.64, + "learning_rate": 0.00013662441144512855, + "loss": 0.6525, + "step": 7620 + }, + { + "epoch": 1.65, + "learning_rate": 0.0001361897863093082, + "loss": 0.6574, + "step": 7640 + }, + { + "epoch": 1.65, + "learning_rate": 0.00013575516117348785, + "loss": 0.6516, + "step": 7660 + }, + { + "epoch": 1.66, + "learning_rate": 0.00013532053603766749, + "loss": 0.6533, + "step": 7680 + }, + { + "epoch": 1.66, + "learning_rate": 0.00013488591090184715, + "loss": 0.6577, + "step": 7700 + }, + { + "epoch": 1.67, + "learning_rate": 0.00013445128576602679, + "loss": 0.6592, + "step": 7720 + }, + { + "epoch": 1.67, + "learning_rate": 0.00013401666063020642, + "loss": 0.6585, + "step": 7740 + }, + { + "epoch": 1.67, + "learning_rate": 0.00013358203549438609, + "loss": 0.6607, + "step": 7760 + }, + { + "epoch": 1.68, + "learning_rate": 0.00013314741035856572, + "loss": 0.6617, + "step": 7780 + }, + { + "epoch": 1.68, + "learning_rate": 0.00013271278522274536, + "loss": 0.6443, + "step": 7800 + }, + { + "epoch": 1.68, + "eval_loss": 0.6800745725631714, + "eval_runtime": 50.165, + "eval_samples_per_second": 39.868, + "eval_steps_per_second": 0.638, + "step": 7800 + }, + { + "epoch": 1.69, + "learning_rate": 0.00013227816008692502, + "loss": 0.6587, + "step": 7820 + }, + { + "epoch": 1.69, + "learning_rate": 0.00013184353495110466, + "loss": 0.6613, + "step": 7840 + }, + { + "epoch": 1.7, + "learning_rate": 0.0001314089098152843, + "loss": 0.654, + "step": 7860 + }, + { + "epoch": 1.7, + "learning_rate": 0.00013097428467946396, + "loss": 0.6523, + "step": 7880 + }, + { + "epoch": 1.7, + "learning_rate": 0.0001305396595436436, + "loss": 0.6563, + "step": 7900 + }, + { + "epoch": 1.71, + "learning_rate": 0.00013010503440782323, + "loss": 0.6524, + "step": 7920 + }, + { + "epoch": 1.71, + "learning_rate": 0.0001296704092720029, + "loss": 0.6523, + "step": 7940 + }, + { + "epoch": 1.72, + "learning_rate": 0.00012923578413618253, + "loss": 0.6493, + "step": 7960 + }, + { + "epoch": 1.72, + "learning_rate": 0.00012880115900036217, + "loss": 0.6538, + "step": 7980 + }, + { + "epoch": 1.73, + "learning_rate": 0.00012836653386454183, + "loss": 0.6512, + "step": 8000 + }, + { + "epoch": 1.73, + "eval_loss": 0.6790341734886169, + "eval_runtime": 50.1317, + "eval_samples_per_second": 39.895, + "eval_steps_per_second": 0.638, + "step": 8000 + }, + { + "epoch": 1.73, + "learning_rate": 0.00012793190872872147, + "loss": 0.6562, + "step": 8020 + }, + { + "epoch": 1.73, + "learning_rate": 0.0001274972835929011, + "loss": 0.6556, + "step": 8040 + }, + { + "epoch": 1.74, + "learning_rate": 0.00012706265845708077, + "loss": 0.65, + "step": 8060 + }, + { + "epoch": 1.74, + "learning_rate": 0.0001266280333212604, + "loss": 0.661, + "step": 8080 + }, + { + "epoch": 1.75, + "learning_rate": 0.00012619340818544004, + "loss": 0.655, + "step": 8100 + }, + { + "epoch": 1.75, + "learning_rate": 0.0001257587830496197, + "loss": 0.6534, + "step": 8120 + }, + { + "epoch": 1.76, + "learning_rate": 0.00012532415791379934, + "loss": 0.6517, + "step": 8140 + }, + { + "epoch": 1.76, + "learning_rate": 0.00012488953277797898, + "loss": 0.6605, + "step": 8160 + }, + { + "epoch": 1.76, + "learning_rate": 0.00012445490764215864, + "loss": 0.6556, + "step": 8180 + }, + { + "epoch": 1.77, + "learning_rate": 0.00012402028250633828, + "loss": 0.6492, + "step": 8200 + }, + { + "epoch": 1.77, + "eval_loss": 0.6781870126724243, + "eval_runtime": 50.0809, + "eval_samples_per_second": 39.935, + "eval_steps_per_second": 0.639, + "step": 8200 + }, + { + "epoch": 1.77, + "learning_rate": 0.00012358565737051791, + "loss": 0.6541, + "step": 8220 + }, + { + "epoch": 1.78, + "learning_rate": 0.00012315103223469758, + "loss": 0.6517, + "step": 8240 + }, + { + "epoch": 1.78, + "learning_rate": 0.00012271640709887721, + "loss": 0.6483, + "step": 8260 + }, + { + "epoch": 1.79, + "learning_rate": 0.00012228178196305685, + "loss": 0.6619, + "step": 8280 + }, + { + "epoch": 1.79, + "learning_rate": 0.0001218471568272365, + "loss": 0.6556, + "step": 8300 + }, + { + "epoch": 1.8, + "learning_rate": 0.00012141253169141615, + "loss": 0.6471, + "step": 8320 + }, + { + "epoch": 1.8, + "learning_rate": 0.00012097790655559579, + "loss": 0.6611, + "step": 8340 + }, + { + "epoch": 1.8, + "learning_rate": 0.00012054328141977544, + "loss": 0.6506, + "step": 8360 + }, + { + "epoch": 1.81, + "learning_rate": 0.00012010865628395509, + "loss": 0.6611, + "step": 8380 + }, + { + "epoch": 1.81, + "learning_rate": 0.00011967403114813472, + "loss": 0.6557, + "step": 8400 + }, + { + "epoch": 1.81, + "eval_loss": 0.6776989102363586, + "eval_runtime": 50.1344, + "eval_samples_per_second": 39.893, + "eval_steps_per_second": 0.638, + "step": 8400 + }, + { + "epoch": 1.82, + "learning_rate": 0.00011923940601231437, + "loss": 0.6504, + "step": 8420 + }, + { + "epoch": 1.82, + "learning_rate": 0.00011880478087649402, + "loss": 0.6552, + "step": 8440 + }, + { + "epoch": 1.83, + "learning_rate": 0.00011839188699746468, + "loss": 0.641, + "step": 8460 + }, + { + "epoch": 1.83, + "learning_rate": 0.00011795726186164432, + "loss": 0.6535, + "step": 8480 + }, + { + "epoch": 1.83, + "learning_rate": 0.00011752263672582397, + "loss": 0.6568, + "step": 8500 + }, + { + "epoch": 1.84, + "learning_rate": 0.00011708801159000362, + "loss": 0.6621, + "step": 8520 + }, + { + "epoch": 1.84, + "learning_rate": 0.00011665338645418325, + "loss": 0.6607, + "step": 8540 + }, + { + "epoch": 1.85, + "learning_rate": 0.0001162187613183629, + "loss": 0.6516, + "step": 8560 + }, + { + "epoch": 1.85, + "learning_rate": 0.00011578413618254255, + "loss": 0.6497, + "step": 8580 + }, + { + "epoch": 1.86, + "learning_rate": 0.00011534951104672219, + "loss": 0.6559, + "step": 8600 + }, + { + "epoch": 1.86, + "eval_loss": 0.6773191094398499, + "eval_runtime": 50.1605, + "eval_samples_per_second": 39.872, + "eval_steps_per_second": 0.638, + "step": 8600 + }, + { + "epoch": 1.86, + "learning_rate": 0.00011491488591090184, + "loss": 0.6595, + "step": 8620 + }, + { + "epoch": 1.86, + "learning_rate": 0.00011448026077508149, + "loss": 0.6495, + "step": 8640 + }, + { + "epoch": 1.87, + "learning_rate": 0.00011404563563926113, + "loss": 0.6518, + "step": 8660 + }, + { + "epoch": 1.87, + "learning_rate": 0.00011361101050344078, + "loss": 0.6511, + "step": 8680 + }, + { + "epoch": 1.88, + "learning_rate": 0.00011317638536762043, + "loss": 0.6495, + "step": 8700 + }, + { + "epoch": 1.88, + "learning_rate": 0.00011274176023180006, + "loss": 0.6485, + "step": 8720 + }, + { + "epoch": 1.89, + "learning_rate": 0.00011230713509597971, + "loss": 0.6543, + "step": 8740 + }, + { + "epoch": 1.89, + "learning_rate": 0.00011187250996015936, + "loss": 0.6509, + "step": 8760 + }, + { + "epoch": 1.89, + "learning_rate": 0.000111437884824339, + "loss": 0.656, + "step": 8780 + }, + { + "epoch": 1.9, + "learning_rate": 0.00011100325968851865, + "loss": 0.6557, + "step": 8800 + }, + { + "epoch": 1.9, + "eval_loss": 0.6773696541786194, + "eval_runtime": 50.1296, + "eval_samples_per_second": 39.897, + "eval_steps_per_second": 0.638, + "step": 8800 + }, + { + "epoch": 1.9, + "learning_rate": 0.0001105686345526983, + "loss": 0.6509, + "step": 8820 + }, + { + "epoch": 1.91, + "learning_rate": 0.00011013400941687794, + "loss": 0.65, + "step": 8840 + }, + { + "epoch": 1.91, + "learning_rate": 0.00010969938428105759, + "loss": 0.6447, + "step": 8860 + }, + { + "epoch": 1.92, + "learning_rate": 0.00010926475914523724, + "loss": 0.6563, + "step": 8880 + }, + { + "epoch": 1.92, + "learning_rate": 0.00010883013400941687, + "loss": 0.6545, + "step": 8900 + }, + { + "epoch": 1.92, + "learning_rate": 0.00010839550887359652, + "loss": 0.6509, + "step": 8920 + }, + { + "epoch": 1.93, + "learning_rate": 0.00010796088373777617, + "loss": 0.6434, + "step": 8940 + }, + { + "epoch": 1.93, + "learning_rate": 0.00010752625860195581, + "loss": 0.6412, + "step": 8960 + }, + { + "epoch": 1.94, + "learning_rate": 0.00010709163346613546, + "loss": 0.6512, + "step": 8980 + }, + { + "epoch": 1.94, + "learning_rate": 0.00010665700833031508, + "loss": 0.6478, + "step": 9000 + }, + { + "epoch": 1.94, + "eval_loss": 0.6760911345481873, + "eval_runtime": 50.1795, + "eval_samples_per_second": 39.857, + "eval_steps_per_second": 0.638, + "step": 9000 + }, + { + "epoch": 1.95, + "learning_rate": 0.00010622238319449473, + "loss": 0.6545, + "step": 9020 + }, + { + "epoch": 1.95, + "learning_rate": 0.00010578775805867438, + "loss": 0.6468, + "step": 9040 + }, + { + "epoch": 1.95, + "learning_rate": 0.00010535313292285402, + "loss": 0.6527, + "step": 9060 + }, + { + "epoch": 1.96, + "learning_rate": 0.00010491850778703367, + "loss": 0.6621, + "step": 9080 + }, + { + "epoch": 1.96, + "learning_rate": 0.00010448388265121332, + "loss": 0.6496, + "step": 9100 + }, + { + "epoch": 1.97, + "learning_rate": 0.00010404925751539295, + "loss": 0.6512, + "step": 9120 + }, + { + "epoch": 1.97, + "learning_rate": 0.0001036146323795726, + "loss": 0.6491, + "step": 9140 + }, + { + "epoch": 1.98, + "learning_rate": 0.00010318000724375225, + "loss": 0.6482, + "step": 9160 + }, + { + "epoch": 1.98, + "learning_rate": 0.00010274538210793189, + "loss": 0.6456, + "step": 9180 + }, + { + "epoch": 1.98, + "learning_rate": 0.00010231075697211154, + "loss": 0.6458, + "step": 9200 + }, + { + "epoch": 1.98, + "eval_loss": 0.6748936772346497, + "eval_runtime": 50.1856, + "eval_samples_per_second": 39.852, + "eval_steps_per_second": 0.638, + "step": 9200 + }, + { + "epoch": 1.99, + "learning_rate": 0.00010187613183629119, + "loss": 0.6473, + "step": 9220 + }, + { + "epoch": 1.99, + "learning_rate": 0.00010144150670047083, + "loss": 0.6496, + "step": 9240 + }, + { + "epoch": 2.0, + "learning_rate": 0.00010100688156465048, + "loss": 0.6566, + "step": 9260 + }, + { + "epoch": 2.0, + "learning_rate": 0.00010057225642883013, + "loss": 0.6475, + "step": 9280 + }, + { + "epoch": 2.01, + "learning_rate": 0.00010013763129300976, + "loss": 0.6536, + "step": 9300 + }, + { + "epoch": 2.01, + "learning_rate": 9.970300615718941e-05, + "loss": 0.646, + "step": 9320 + }, + { + "epoch": 2.02, + "learning_rate": 9.926838102136906e-05, + "loss": 0.6503, + "step": 9340 + }, + { + "epoch": 2.02, + "learning_rate": 9.88337558855487e-05, + "loss": 0.6527, + "step": 9360 + }, + { + "epoch": 2.02, + "learning_rate": 9.839913074972835e-05, + "loss": 0.6514, + "step": 9380 + }, + { + "epoch": 2.03, + "learning_rate": 9.7964505613908e-05, + "loss": 0.6548, + "step": 9400 + }, + { + "epoch": 2.03, + "eval_loss": 0.6744834780693054, + "eval_runtime": 50.1696, + "eval_samples_per_second": 39.865, + "eval_steps_per_second": 0.638, + "step": 9400 + }, + { + "epoch": 2.03, + "learning_rate": 9.752988047808764e-05, + "loss": 0.6483, + "step": 9420 + }, + { + "epoch": 2.04, + "learning_rate": 9.709525534226729e-05, + "loss": 0.6522, + "step": 9440 + }, + { + "epoch": 2.04, + "learning_rate": 9.666063020644694e-05, + "loss": 0.6538, + "step": 9460 + }, + { + "epoch": 2.05, + "learning_rate": 9.622600507062657e-05, + "loss": 0.6449, + "step": 9480 + }, + { + "epoch": 2.05, + "learning_rate": 9.579137993480622e-05, + "loss": 0.6451, + "step": 9500 + }, + { + "epoch": 2.05, + "learning_rate": 9.535675479898587e-05, + "loss": 0.6355, + "step": 9520 + }, + { + "epoch": 2.06, + "learning_rate": 9.492212966316551e-05, + "loss": 0.6494, + "step": 9540 + }, + { + "epoch": 2.06, + "learning_rate": 9.448750452734516e-05, + "loss": 0.6435, + "step": 9560 + }, + { + "epoch": 2.07, + "learning_rate": 9.405287939152481e-05, + "loss": 0.651, + "step": 9580 + }, + { + "epoch": 2.07, + "learning_rate": 9.361825425570445e-05, + "loss": 0.6493, + "step": 9600 + }, + { + "epoch": 2.07, + "eval_loss": 0.674017071723938, + "eval_runtime": 50.1402, + "eval_samples_per_second": 39.888, + "eval_steps_per_second": 0.638, + "step": 9600 + }, + { + "epoch": 2.08, + "learning_rate": 9.31836291198841e-05, + "loss": 0.6469, + "step": 9620 + }, + { + "epoch": 2.08, + "learning_rate": 9.274900398406375e-05, + "loss": 0.65, + "step": 9640 + }, + { + "epoch": 2.08, + "learning_rate": 9.231437884824338e-05, + "loss": 0.6536, + "step": 9660 + }, + { + "epoch": 2.09, + "learning_rate": 9.187975371242303e-05, + "loss": 0.6488, + "step": 9680 + }, + { + "epoch": 2.09, + "learning_rate": 9.144512857660268e-05, + "loss": 0.6391, + "step": 9700 + }, + { + "epoch": 2.1, + "learning_rate": 9.101050344078232e-05, + "loss": 0.644, + "step": 9720 + }, + { + "epoch": 2.1, + "learning_rate": 9.057587830496197e-05, + "loss": 0.6507, + "step": 9740 + }, + { + "epoch": 2.11, + "learning_rate": 9.014125316914162e-05, + "loss": 0.6404, + "step": 9760 + }, + { + "epoch": 2.11, + "learning_rate": 8.970662803332126e-05, + "loss": 0.6509, + "step": 9780 + }, + { + "epoch": 2.11, + "learning_rate": 8.92720028975009e-05, + "loss": 0.6435, + "step": 9800 + }, + { + "epoch": 2.11, + "eval_loss": 0.6735255122184753, + "eval_runtime": 50.1703, + "eval_samples_per_second": 39.864, + "eval_steps_per_second": 0.638, + "step": 9800 + }, + { + "epoch": 2.12, + "learning_rate": 8.883737776168056e-05, + "loss": 0.6374, + "step": 9820 + }, + { + "epoch": 2.12, + "learning_rate": 8.840275262586019e-05, + "loss": 0.6445, + "step": 9840 + }, + { + "epoch": 2.13, + "learning_rate": 8.796812749003983e-05, + "loss": 0.6495, + "step": 9860 + }, + { + "epoch": 2.13, + "learning_rate": 8.753350235421946e-05, + "loss": 0.6482, + "step": 9880 + }, + { + "epoch": 2.14, + "learning_rate": 8.709887721839911e-05, + "loss": 0.6441, + "step": 9900 + }, + { + "epoch": 2.14, + "learning_rate": 8.666425208257877e-05, + "loss": 0.6525, + "step": 9920 + }, + { + "epoch": 2.14, + "learning_rate": 8.62296269467584e-05, + "loss": 0.6453, + "step": 9940 + }, + { + "epoch": 2.15, + "learning_rate": 8.579500181093805e-05, + "loss": 0.6498, + "step": 9960 + }, + { + "epoch": 2.15, + "learning_rate": 8.53603766751177e-05, + "loss": 0.6471, + "step": 9980 + }, + { + "epoch": 2.16, + "learning_rate": 8.492575153929734e-05, + "loss": 0.6419, + "step": 10000 + }, + { + "epoch": 2.16, + "eval_loss": 0.6730753779411316, + "eval_runtime": 50.1885, + "eval_samples_per_second": 39.85, + "eval_steps_per_second": 0.638, + "step": 10000 + }, + { + "epoch": 2.16, + "learning_rate": 8.449112640347699e-05, + "loss": 0.6447, + "step": 10020 + }, + { + "epoch": 2.17, + "learning_rate": 8.405650126765664e-05, + "loss": 0.6444, + "step": 10040 + }, + { + "epoch": 2.17, + "learning_rate": 8.362187613183627e-05, + "loss": 0.6393, + "step": 10060 + }, + { + "epoch": 2.17, + "learning_rate": 8.318725099601592e-05, + "loss": 0.6464, + "step": 10080 + }, + { + "epoch": 2.18, + "learning_rate": 8.275262586019557e-05, + "loss": 0.6458, + "step": 10100 + }, + { + "epoch": 2.18, + "learning_rate": 8.231800072437521e-05, + "loss": 0.6402, + "step": 10120 + }, + { + "epoch": 2.19, + "learning_rate": 8.188337558855486e-05, + "loss": 0.6409, + "step": 10140 + }, + { + "epoch": 2.19, + "learning_rate": 8.144875045273451e-05, + "loss": 0.6512, + "step": 10160 + }, + { + "epoch": 2.2, + "learning_rate": 8.101412531691415e-05, + "loss": 0.6498, + "step": 10180 + }, + { + "epoch": 2.2, + "learning_rate": 8.05795001810938e-05, + "loss": 0.6393, + "step": 10200 + }, + { + "epoch": 2.2, + "eval_loss": 0.6726437211036682, + "eval_runtime": 50.1492, + "eval_samples_per_second": 39.881, + "eval_steps_per_second": 0.638, + "step": 10200 + }, + { + "epoch": 2.2, + "learning_rate": 8.014487504527345e-05, + "loss": 0.6458, + "step": 10220 + }, + { + "epoch": 2.21, + "learning_rate": 7.971024990945308e-05, + "loss": 0.6466, + "step": 10240 + }, + { + "epoch": 2.21, + "learning_rate": 7.927562477363273e-05, + "loss": 0.644, + "step": 10260 + }, + { + "epoch": 2.22, + "learning_rate": 7.884099963781238e-05, + "loss": 0.6467, + "step": 10280 + }, + { + "epoch": 2.22, + "learning_rate": 7.840637450199202e-05, + "loss": 0.6436, + "step": 10300 + }, + { + "epoch": 2.23, + "learning_rate": 7.797174936617167e-05, + "loss": 0.6422, + "step": 10320 + }, + { + "epoch": 2.23, + "learning_rate": 7.753712423035132e-05, + "loss": 0.645, + "step": 10340 + }, + { + "epoch": 2.24, + "learning_rate": 7.710249909453096e-05, + "loss": 0.6423, + "step": 10360 + }, + { + "epoch": 2.24, + "learning_rate": 7.666787395871061e-05, + "loss": 0.6557, + "step": 10380 + }, + { + "epoch": 2.24, + "learning_rate": 7.623324882289026e-05, + "loss": 0.646, + "step": 10400 + }, + { + "epoch": 2.24, + "eval_loss": 0.6725419759750366, + "eval_runtime": 50.1975, + "eval_samples_per_second": 39.843, + "eval_steps_per_second": 0.637, + "step": 10400 + }, + { + "epoch": 2.25, + "learning_rate": 7.57986236870699e-05, + "loss": 0.6503, + "step": 10420 + }, + { + "epoch": 2.25, + "learning_rate": 7.536399855124954e-05, + "loss": 0.6428, + "step": 10440 + }, + { + "epoch": 2.26, + "learning_rate": 7.49293734154292e-05, + "loss": 0.6438, + "step": 10460 + }, + { + "epoch": 2.26, + "learning_rate": 7.449474827960883e-05, + "loss": 0.6427, + "step": 10480 + }, + { + "epoch": 2.27, + "learning_rate": 7.406012314378847e-05, + "loss": 0.6458, + "step": 10500 + }, + { + "epoch": 2.27, + "learning_rate": 7.362549800796812e-05, + "loss": 0.6423, + "step": 10520 + }, + { + "epoch": 2.27, + "learning_rate": 7.319087287214777e-05, + "loss": 0.6466, + "step": 10540 + }, + { + "epoch": 2.28, + "learning_rate": 7.27562477363274e-05, + "loss": 0.6394, + "step": 10560 + }, + { + "epoch": 2.28, + "learning_rate": 7.232162260050705e-05, + "loss": 0.6362, + "step": 10580 + }, + { + "epoch": 2.29, + "learning_rate": 7.18869974646867e-05, + "loss": 0.6399, + "step": 10600 + }, + { + "epoch": 2.29, + "eval_loss": 0.6719211935997009, + "eval_runtime": 50.1808, + "eval_samples_per_second": 39.856, + "eval_steps_per_second": 0.638, + "step": 10600 + }, + { + "epoch": 2.29, + "learning_rate": 7.145237232886634e-05, + "loss": 0.6378, + "step": 10620 + }, + { + "epoch": 2.3, + "learning_rate": 7.101774719304599e-05, + "loss": 0.634, + "step": 10640 + }, + { + "epoch": 2.3, + "learning_rate": 7.058312205722564e-05, + "loss": 0.6374, + "step": 10660 + }, + { + "epoch": 2.3, + "learning_rate": 7.014849692140528e-05, + "loss": 0.6464, + "step": 10680 + }, + { + "epoch": 2.31, + "learning_rate": 6.971387178558493e-05, + "loss": 0.643, + "step": 10700 + }, + { + "epoch": 2.31, + "learning_rate": 6.927924664976458e-05, + "loss": 0.6384, + "step": 10720 + }, + { + "epoch": 2.32, + "learning_rate": 6.884462151394421e-05, + "loss": 0.6451, + "step": 10740 + }, + { + "epoch": 2.32, + "learning_rate": 6.840999637812386e-05, + "loss": 0.6465, + "step": 10760 + }, + { + "epoch": 2.33, + "learning_rate": 6.799710249909452e-05, + "loss": 0.646, + "step": 10780 + }, + { + "epoch": 2.33, + "learning_rate": 6.756247736327417e-05, + "loss": 0.6525, + "step": 10800 + }, + { + "epoch": 2.33, + "eval_loss": 0.6714358925819397, + "eval_runtime": 50.1294, + "eval_samples_per_second": 39.897, + "eval_steps_per_second": 0.638, + "step": 10800 + }, + { + "epoch": 2.33, + "learning_rate": 6.712785222745382e-05, + "loss": 0.6423, + "step": 10820 + }, + { + "epoch": 2.34, + "learning_rate": 6.669322709163345e-05, + "loss": 0.6449, + "step": 10840 + }, + { + "epoch": 2.34, + "learning_rate": 6.62586019558131e-05, + "loss": 0.6325, + "step": 10860 + }, + { + "epoch": 2.35, + "learning_rate": 6.582397681999275e-05, + "loss": 0.6558, + "step": 10880 + }, + { + "epoch": 2.35, + "learning_rate": 6.538935168417239e-05, + "loss": 0.6419, + "step": 10900 + }, + { + "epoch": 2.36, + "learning_rate": 6.495472654835204e-05, + "loss": 0.6466, + "step": 10920 + }, + { + "epoch": 2.36, + "learning_rate": 6.452010141253169e-05, + "loss": 0.6357, + "step": 10940 + }, + { + "epoch": 2.36, + "learning_rate": 6.408547627671133e-05, + "loss": 0.6366, + "step": 10960 + }, + { + "epoch": 2.37, + "learning_rate": 6.365085114089098e-05, + "loss": 0.6466, + "step": 10980 + }, + { + "epoch": 2.37, + "learning_rate": 6.321622600507063e-05, + "loss": 0.6542, + "step": 11000 + }, + { + "epoch": 2.37, + "eval_loss": 0.6710445880889893, + "eval_runtime": 50.2479, + "eval_samples_per_second": 39.803, + "eval_steps_per_second": 0.637, + "step": 11000 + }, + { + "epoch": 2.38, + "learning_rate": 6.278160086925026e-05, + "loss": 0.6481, + "step": 11020 + }, + { + "epoch": 2.38, + "learning_rate": 6.23469757334299e-05, + "loss": 0.6425, + "step": 11040 + }, + { + "epoch": 2.39, + "learning_rate": 6.191235059760955e-05, + "loss": 0.6439, + "step": 11060 + }, + { + "epoch": 2.39, + "learning_rate": 6.14777254617892e-05, + "loss": 0.6424, + "step": 11080 + }, + { + "epoch": 2.39, + "learning_rate": 6.104310032596884e-05, + "loss": 0.6404, + "step": 11100 + }, + { + "epoch": 2.4, + "learning_rate": 6.060847519014849e-05, + "loss": 0.6387, + "step": 11120 + }, + { + "epoch": 2.4, + "learning_rate": 6.017385005432814e-05, + "loss": 0.6462, + "step": 11140 + }, + { + "epoch": 2.41, + "learning_rate": 5.973922491850778e-05, + "loss": 0.6431, + "step": 11160 + }, + { + "epoch": 2.41, + "learning_rate": 5.9304599782687424e-05, + "loss": 0.638, + "step": 11180 + }, + { + "epoch": 2.42, + "learning_rate": 5.8869974646867074e-05, + "loss": 0.6344, + "step": 11200 + }, + { + "epoch": 2.42, + "eval_loss": 0.6704220771789551, + "eval_runtime": 50.1558, + "eval_samples_per_second": 39.876, + "eval_steps_per_second": 0.638, + "step": 11200 + }, + { + "epoch": 2.42, + "learning_rate": 5.843534951104672e-05, + "loss": 0.6448, + "step": 11220 + }, + { + "epoch": 2.43, + "learning_rate": 5.800072437522636e-05, + "loss": 0.6449, + "step": 11240 + }, + { + "epoch": 2.43, + "learning_rate": 5.756609923940601e-05, + "loss": 0.6399, + "step": 11260 + }, + { + "epoch": 2.43, + "learning_rate": 5.7131474103585654e-05, + "loss": 0.638, + "step": 11280 + }, + { + "epoch": 2.44, + "learning_rate": 5.66968489677653e-05, + "loss": 0.6418, + "step": 11300 + }, + { + "epoch": 2.44, + "learning_rate": 5.626222383194495e-05, + "loss": 0.6482, + "step": 11320 + }, + { + "epoch": 2.45, + "learning_rate": 5.582759869612459e-05, + "loss": 0.6392, + "step": 11340 + }, + { + "epoch": 2.45, + "learning_rate": 5.5392973560304233e-05, + "loss": 0.6363, + "step": 11360 + }, + { + "epoch": 2.46, + "learning_rate": 5.4958348424483883e-05, + "loss": 0.6503, + "step": 11380 + }, + { + "epoch": 2.46, + "learning_rate": 5.452372328866353e-05, + "loss": 0.6453, + "step": 11400 + }, + { + "epoch": 2.46, + "eval_loss": 0.670009195804596, + "eval_runtime": 50.155, + "eval_samples_per_second": 39.876, + "eval_steps_per_second": 0.638, + "step": 11400 + }, + { + "epoch": 2.46, + "learning_rate": 5.408909815284317e-05, + "loss": 0.6384, + "step": 11420 + }, + { + "epoch": 2.47, + "learning_rate": 5.365447301702282e-05, + "loss": 0.6449, + "step": 11440 + }, + { + "epoch": 2.47, + "learning_rate": 5.3219847881202456e-05, + "loss": 0.6406, + "step": 11460 + }, + { + "epoch": 2.48, + "learning_rate": 5.27852227453821e-05, + "loss": 0.6363, + "step": 11480 + }, + { + "epoch": 2.48, + "learning_rate": 5.235059760956174e-05, + "loss": 0.6482, + "step": 11500 + }, + { + "epoch": 2.49, + "learning_rate": 5.191597247374139e-05, + "loss": 0.6503, + "step": 11520 + }, + { + "epoch": 2.49, + "learning_rate": 5.1481347337921036e-05, + "loss": 0.6479, + "step": 11540 + }, + { + "epoch": 2.49, + "learning_rate": 5.10684534588917e-05, + "loss": 0.6437, + "step": 11560 + }, + { + "epoch": 2.5, + "learning_rate": 5.063382832307134e-05, + "loss": 0.6398, + "step": 11580 + }, + { + "epoch": 2.5, + "learning_rate": 5.0199203187250985e-05, + "loss": 0.6456, + "step": 11600 + }, + { + "epoch": 2.5, + "eval_loss": 0.6702134013175964, + "eval_runtime": 50.1834, + "eval_samples_per_second": 39.854, + "eval_steps_per_second": 0.638, + "step": 11600 + }, + { + "epoch": 2.51, + "learning_rate": 4.9764578051430635e-05, + "loss": 0.646, + "step": 11620 + }, + { + "epoch": 2.51, + "learning_rate": 4.932995291561028e-05, + "loss": 0.6375, + "step": 11640 + }, + { + "epoch": 2.52, + "learning_rate": 4.889532777978992e-05, + "loss": 0.6393, + "step": 11660 + }, + { + "epoch": 2.52, + "learning_rate": 4.846070264396957e-05, + "loss": 0.638, + "step": 11680 + }, + { + "epoch": 2.52, + "learning_rate": 4.8026077508149215e-05, + "loss": 0.6411, + "step": 11700 + }, + { + "epoch": 2.53, + "learning_rate": 4.759145237232886e-05, + "loss": 0.6467, + "step": 11720 + }, + { + "epoch": 2.53, + "learning_rate": 4.715682723650851e-05, + "loss": 0.6369, + "step": 11740 + }, + { + "epoch": 2.54, + "learning_rate": 4.672220210068815e-05, + "loss": 0.637, + "step": 11760 + }, + { + "epoch": 2.54, + "learning_rate": 4.6287576964867795e-05, + "loss": 0.6486, + "step": 11780 + }, + { + "epoch": 2.55, + "learning_rate": 4.5852951829047445e-05, + "loss": 0.637, + "step": 11800 + }, + { + "epoch": 2.55, + "eval_loss": 0.6698750257492065, + "eval_runtime": 50.1539, + "eval_samples_per_second": 39.877, + "eval_steps_per_second": 0.638, + "step": 11800 + }, + { + "epoch": 2.55, + "learning_rate": 4.541832669322709e-05, + "loss": 0.639, + "step": 11820 + }, + { + "epoch": 2.55, + "learning_rate": 4.498370155740673e-05, + "loss": 0.6366, + "step": 11840 + }, + { + "epoch": 2.56, + "learning_rate": 4.454907642158638e-05, + "loss": 0.6409, + "step": 11860 + }, + { + "epoch": 2.56, + "learning_rate": 4.4114451285766025e-05, + "loss": 0.6394, + "step": 11880 + }, + { + "epoch": 2.57, + "learning_rate": 4.367982614994567e-05, + "loss": 0.6351, + "step": 11900 + }, + { + "epoch": 2.57, + "learning_rate": 4.324520101412532e-05, + "loss": 0.6391, + "step": 11920 + }, + { + "epoch": 2.58, + "learning_rate": 4.281057587830496e-05, + "loss": 0.6267, + "step": 11940 + }, + { + "epoch": 2.58, + "learning_rate": 4.2375950742484604e-05, + "loss": 0.6461, + "step": 11960 + }, + { + "epoch": 2.58, + "learning_rate": 4.194132560666425e-05, + "loss": 0.6483, + "step": 11980 + }, + { + "epoch": 2.59, + "learning_rate": 4.150670047084389e-05, + "loss": 0.6461, + "step": 12000 + }, + { + "epoch": 2.59, + "eval_loss": 0.6692882180213928, + "eval_runtime": 50.1673, + "eval_samples_per_second": 39.867, + "eval_steps_per_second": 0.638, + "step": 12000 + }, + { + "epoch": 2.59, + "learning_rate": 4.1072075335023534e-05, + "loss": 0.6429, + "step": 12020 + }, + { + "epoch": 2.6, + "learning_rate": 4.0637450199203184e-05, + "loss": 0.6416, + "step": 12040 + }, + { + "epoch": 2.6, + "learning_rate": 4.020282506338283e-05, + "loss": 0.6356, + "step": 12060 + }, + { + "epoch": 2.61, + "learning_rate": 3.976819992756247e-05, + "loss": 0.6402, + "step": 12080 + }, + { + "epoch": 2.61, + "learning_rate": 3.933357479174212e-05, + "loss": 0.6395, + "step": 12100 + }, + { + "epoch": 2.61, + "learning_rate": 3.8898949655921764e-05, + "loss": 0.6432, + "step": 12120 + }, + { + "epoch": 2.62, + "learning_rate": 3.846432452010141e-05, + "loss": 0.6386, + "step": 12140 + }, + { + "epoch": 2.62, + "learning_rate": 3.802969938428106e-05, + "loss": 0.6396, + "step": 12160 + }, + { + "epoch": 2.63, + "learning_rate": 3.75950742484607e-05, + "loss": 0.6423, + "step": 12180 + }, + { + "epoch": 2.63, + "learning_rate": 3.7160449112640344e-05, + "loss": 0.649, + "step": 12200 + }, + { + "epoch": 2.63, + "eval_loss": 0.6691960096359253, + "eval_runtime": 50.1649, + "eval_samples_per_second": 39.869, + "eval_steps_per_second": 0.638, + "step": 12200 + }, + { + "epoch": 2.64, + "learning_rate": 3.672582397681999e-05, + "loss": 0.6547, + "step": 12220 + }, + { + "epoch": 2.64, + "learning_rate": 3.629119884099964e-05, + "loss": 0.642, + "step": 12240 + }, + { + "epoch": 2.65, + "learning_rate": 3.585657370517928e-05, + "loss": 0.634, + "step": 12260 + }, + { + "epoch": 2.65, + "learning_rate": 3.542194856935892e-05, + "loss": 0.6447, + "step": 12280 + }, + { + "epoch": 2.65, + "learning_rate": 3.498732343353857e-05, + "loss": 0.6285, + "step": 12300 + }, + { + "epoch": 2.66, + "learning_rate": 3.455269829771822e-05, + "loss": 0.6436, + "step": 12320 + }, + { + "epoch": 2.66, + "learning_rate": 3.411807316189786e-05, + "loss": 0.6349, + "step": 12340 + }, + { + "epoch": 2.67, + "learning_rate": 3.36834480260775e-05, + "loss": 0.6425, + "step": 12360 + }, + { + "epoch": 2.67, + "learning_rate": 3.324882289025715e-05, + "loss": 0.6393, + "step": 12380 + }, + { + "epoch": 2.68, + "learning_rate": 3.2814197754436796e-05, + "loss": 0.6367, + "step": 12400 + }, + { + "epoch": 2.68, + "eval_loss": 0.6687243580818176, + "eval_runtime": 50.3508, + "eval_samples_per_second": 39.721, + "eval_steps_per_second": 0.636, + "step": 12400 + }, + { + "epoch": 2.68, + "learning_rate": 3.237957261861644e-05, + "loss": 0.6386, + "step": 12420 + }, + { + "epoch": 2.68, + "learning_rate": 3.194494748279609e-05, + "loss": 0.6526, + "step": 12440 + }, + { + "epoch": 2.69, + "learning_rate": 3.151032234697573e-05, + "loss": 0.6357, + "step": 12460 + }, + { + "epoch": 2.69, + "learning_rate": 3.1075697211155376e-05, + "loss": 0.6353, + "step": 12480 + }, + { + "epoch": 2.7, + "learning_rate": 3.0641072075335026e-05, + "loss": 0.6449, + "step": 12500 + }, + { + "epoch": 2.7, + "learning_rate": 3.0206446939514663e-05, + "loss": 0.6425, + "step": 12520 + }, + { + "epoch": 2.71, + "learning_rate": 2.977182180369431e-05, + "loss": 0.6374, + "step": 12540 + }, + { + "epoch": 2.71, + "learning_rate": 2.9337196667873956e-05, + "loss": 0.6324, + "step": 12560 + }, + { + "epoch": 2.71, + "learning_rate": 2.89025715320536e-05, + "loss": 0.6502, + "step": 12580 + }, + { + "epoch": 2.72, + "learning_rate": 2.8467946396233246e-05, + "loss": 0.637, + "step": 12600 + }, + { + "epoch": 2.72, + "eval_loss": 0.6683821082115173, + "eval_runtime": 50.2054, + "eval_samples_per_second": 39.836, + "eval_steps_per_second": 0.637, + "step": 12600 + }, + { + "epoch": 2.72, + "learning_rate": 2.8033321260412892e-05, + "loss": 0.647, + "step": 12620 + }, + { + "epoch": 2.73, + "learning_rate": 2.7598696124592536e-05, + "loss": 0.632, + "step": 12640 + }, + { + "epoch": 2.73, + "learning_rate": 2.7164070988772182e-05, + "loss": 0.6411, + "step": 12660 + }, + { + "epoch": 2.74, + "learning_rate": 2.672944585295183e-05, + "loss": 0.632, + "step": 12680 + }, + { + "epoch": 2.74, + "learning_rate": 2.6294820717131472e-05, + "loss": 0.6389, + "step": 12700 + }, + { + "epoch": 2.74, + "learning_rate": 2.586019558131112e-05, + "loss": 0.6337, + "step": 12720 + }, + { + "epoch": 2.75, + "learning_rate": 2.542557044549076e-05, + "loss": 0.6439, + "step": 12740 + }, + { + "epoch": 2.75, + "learning_rate": 2.4990945309670405e-05, + "loss": 0.6364, + "step": 12760 + }, + { + "epoch": 2.76, + "learning_rate": 2.4556320173850052e-05, + "loss": 0.6402, + "step": 12780 + }, + { + "epoch": 2.76, + "learning_rate": 2.4121695038029695e-05, + "loss": 0.6376, + "step": 12800 + }, + { + "epoch": 2.76, + "eval_loss": 0.6680713295936584, + "eval_runtime": 50.1757, + "eval_samples_per_second": 39.86, + "eval_steps_per_second": 0.638, + "step": 12800 + }, + { + "epoch": 2.77, + "learning_rate": 2.3687069902209342e-05, + "loss": 0.6316, + "step": 12820 + }, + { + "epoch": 2.77, + "learning_rate": 2.325244476638899e-05, + "loss": 0.6393, + "step": 12840 + }, + { + "epoch": 2.77, + "learning_rate": 2.281781963056863e-05, + "loss": 0.6372, + "step": 12860 + }, + { + "epoch": 2.78, + "learning_rate": 2.2383194494748278e-05, + "loss": 0.6466, + "step": 12880 + }, + { + "epoch": 2.78, + "learning_rate": 2.1948569358927925e-05, + "loss": 0.6392, + "step": 12900 + }, + { + "epoch": 2.79, + "learning_rate": 2.1513944223107568e-05, + "loss": 0.6389, + "step": 12920 + }, + { + "epoch": 2.79, + "learning_rate": 2.107931908728721e-05, + "loss": 0.64, + "step": 12940 + }, + { + "epoch": 2.8, + "learning_rate": 2.0644693951466858e-05, + "loss": 0.6362, + "step": 12960 + }, + { + "epoch": 2.8, + "learning_rate": 2.02100688156465e-05, + "loss": 0.6364, + "step": 12980 + }, + { + "epoch": 2.8, + "learning_rate": 1.9775443679826148e-05, + "loss": 0.6372, + "step": 13000 + }, + { + "epoch": 2.8, + "eval_loss": 0.6680414080619812, + "eval_runtime": 50.2211, + "eval_samples_per_second": 39.824, + "eval_steps_per_second": 0.637, + "step": 13000 + }, + { + "epoch": 2.81, + "learning_rate": 1.9340818544005794e-05, + "loss": 0.6336, + "step": 13020 + }, + { + "epoch": 2.81, + "learning_rate": 1.8906193408185438e-05, + "loss": 0.6348, + "step": 13040 + }, + { + "epoch": 2.82, + "learning_rate": 1.8471568272365084e-05, + "loss": 0.6338, + "step": 13060 + }, + { + "epoch": 2.82, + "learning_rate": 1.8036943136544728e-05, + "loss": 0.6396, + "step": 13080 + }, + { + "epoch": 2.83, + "learning_rate": 1.7602318000724374e-05, + "loss": 0.641, + "step": 13100 + }, + { + "epoch": 2.83, + "learning_rate": 1.7167692864904017e-05, + "loss": 0.6369, + "step": 13120 + }, + { + "epoch": 2.83, + "learning_rate": 1.6733067729083664e-05, + "loss": 0.6345, + "step": 13140 + }, + { + "epoch": 2.84, + "learning_rate": 1.629844259326331e-05, + "loss": 0.649, + "step": 13160 + }, + { + "epoch": 2.84, + "learning_rate": 1.5863817457442954e-05, + "loss": 0.6409, + "step": 13180 + }, + { + "epoch": 2.85, + "learning_rate": 1.54291923216226e-05, + "loss": 0.63, + "step": 13200 + }, + { + "epoch": 2.85, + "eval_loss": 0.6678950190544128, + "eval_runtime": 50.1908, + "eval_samples_per_second": 39.848, + "eval_steps_per_second": 0.638, + "step": 13200 + }, + { + "epoch": 2.85, + "learning_rate": 1.4994567185802244e-05, + "loss": 0.6428, + "step": 13220 + }, + { + "epoch": 2.86, + "learning_rate": 1.4559942049981889e-05, + "loss": 0.645, + "step": 13240 + }, + { + "epoch": 2.86, + "learning_rate": 1.4125316914161534e-05, + "loss": 0.6434, + "step": 13260 + }, + { + "epoch": 2.87, + "learning_rate": 1.369069177834118e-05, + "loss": 0.6462, + "step": 13280 + }, + { + "epoch": 2.87, + "learning_rate": 1.3256066642520825e-05, + "loss": 0.6387, + "step": 13300 + }, + { + "epoch": 2.87, + "learning_rate": 1.2821441506700468e-05, + "loss": 0.6311, + "step": 13320 + }, + { + "epoch": 2.88, + "learning_rate": 1.2386816370880113e-05, + "loss": 0.6446, + "step": 13340 + }, + { + "epoch": 2.88, + "learning_rate": 1.195219123505976e-05, + "loss": 0.6426, + "step": 13360 + }, + { + "epoch": 2.89, + "learning_rate": 1.1517566099239405e-05, + "loss": 0.6369, + "step": 13380 + }, + { + "epoch": 2.89, + "learning_rate": 1.108294096341905e-05, + "loss": 0.6467, + "step": 13400 + }, + { + "epoch": 2.89, + "eval_loss": 0.6676326990127563, + "eval_runtime": 50.1589, + "eval_samples_per_second": 39.873, + "eval_steps_per_second": 0.638, + "step": 13400 + } + ], + "max_steps": 13905, + "num_train_epochs": 3, + "total_flos": 1.7033837289458893e+20, + "trial_name": null, + "trial_params": null +} diff --git a/adapters/saved-alpaca-belle30b/checkpoint-13400/training_args.bin b/adapters/saved-alpaca-belle30b/checkpoint-13400/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..12b801c69c900b96b8117a2e6bdeacc32be225f4 --- /dev/null +++ b/adapters/saved-alpaca-belle30b/checkpoint-13400/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0e26d4a9526d1384fcaa3dc3df4f56f03c822ab57b4abed652b7156aebfaccc3 +size 3643 diff --git a/adapters/saved-alpaca-belle30b/checkpoint-13600/optimizer.pt b/adapters/saved-alpaca-belle30b/checkpoint-13600/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..3ca4aca249f9e28d19d47d7fd717383ea5748702 --- /dev/null +++ b/adapters/saved-alpaca-belle30b/checkpoint-13600/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7838395cba8d02d021390f90dfc419e8965f498db20535b5271476aaca070761 +size 102377669 diff --git a/adapters/saved-alpaca-belle30b/checkpoint-13600/pytorch_model.bin b/adapters/saved-alpaca-belle30b/checkpoint-13600/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..f0bdf8f5a99895376727b2c06d2900c3c1f95441 --- /dev/null +++ b/adapters/saved-alpaca-belle30b/checkpoint-13600/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5e0fba25ca5d46f53d8ed21ef8d5c77ffeba1d216cd2bdbe821a19c02c21d450 +size 51204365 diff --git a/adapters/saved-alpaca-belle30b/checkpoint-13600/rng_state_0.pth b/adapters/saved-alpaca-belle30b/checkpoint-13600/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..302c263cd46e13177e0099fd53343ba182be82b8 --- /dev/null +++ b/adapters/saved-alpaca-belle30b/checkpoint-13600/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0e3d7d33499dff647183babf1a23ca419fc3b47b3b7c7ea3740f4c5780e3e016 +size 14583 diff --git a/adapters/saved-alpaca-belle30b/checkpoint-13600/rng_state_1.pth b/adapters/saved-alpaca-belle30b/checkpoint-13600/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..27c77033dd6d3f0dcc25589c04a12aab2204b876 --- /dev/null +++ b/adapters/saved-alpaca-belle30b/checkpoint-13600/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1cc4799ff28f268e43f45470a04be15607b25303c458bc4a6397f5a3630c2907 +size 14583 diff --git a/adapters/saved-alpaca-belle30b/checkpoint-13600/rng_state_2.pth b/adapters/saved-alpaca-belle30b/checkpoint-13600/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..a831f3f02cd67762853e2920e5f4ea024fcd4a91 --- /dev/null +++ b/adapters/saved-alpaca-belle30b/checkpoint-13600/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5ee9a9827c8a71165b63a674e9893fd7f06af56413c9df9395fd253e2727e6ec +size 14583 diff --git a/adapters/saved-alpaca-belle30b/checkpoint-13600/rng_state_3.pth b/adapters/saved-alpaca-belle30b/checkpoint-13600/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..1c1ab81880b4b60ea3cdebdb296fed7b0938602c --- /dev/null +++ b/adapters/saved-alpaca-belle30b/checkpoint-13600/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b78e36ff1ec07fe21710607bdd07238d8d4383236be829641b763218206fe1cf +size 14583 diff --git a/adapters/saved-alpaca-belle30b/checkpoint-13600/rng_state_4.pth b/adapters/saved-alpaca-belle30b/checkpoint-13600/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..8e780f9a02d8068a93faca9a99b6cba3ca591875 --- /dev/null +++ b/adapters/saved-alpaca-belle30b/checkpoint-13600/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9d969a972d3d9331bd84b4521221fd6f254014984a1105400f3577bf3721c9c0 +size 14583 diff --git a/adapters/saved-alpaca-belle30b/checkpoint-13600/rng_state_5.pth b/adapters/saved-alpaca-belle30b/checkpoint-13600/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..4036c638a6398ff1d5bffd35bc04774f34e8af64 --- /dev/null +++ b/adapters/saved-alpaca-belle30b/checkpoint-13600/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3d25ab6589842a8992cb31f9cde21455402d5ff5a87842ee96beed6830c722b5 +size 14583 diff --git a/adapters/saved-alpaca-belle30b/checkpoint-13600/rng_state_6.pth b/adapters/saved-alpaca-belle30b/checkpoint-13600/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..9c55346b78db22dde6f74b4d896dacb2be7dad16 --- /dev/null +++ b/adapters/saved-alpaca-belle30b/checkpoint-13600/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b4ab98def5d962c4c80e4ac0f8a6ae210cbdf6f1ca645cfcf9cc867602251e0f +size 14583 diff --git a/adapters/saved-alpaca-belle30b/checkpoint-13600/rng_state_7.pth b/adapters/saved-alpaca-belle30b/checkpoint-13600/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..f345475863ea0c283693a68518e7565d5443a09f --- /dev/null +++ b/adapters/saved-alpaca-belle30b/checkpoint-13600/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:799cabb4b724a2c2e66e43707e09ff520a5caa6db8776220ac29b0f33ddb2507 +size 14583 diff --git a/adapters/saved-alpaca-belle30b/checkpoint-13600/scaler.pt b/adapters/saved-alpaca-belle30b/checkpoint-13600/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..75b23e79e16c4184562ea71c3128dbd3904d620d --- /dev/null +++ b/adapters/saved-alpaca-belle30b/checkpoint-13600/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0e7b9e5573fbdc3d0a7b970150bca346b6b12e5ba73b2542b5dd8ecaacf3efa9 +size 557 diff --git a/adapters/saved-alpaca-belle30b/checkpoint-13600/scheduler.pt b/adapters/saved-alpaca-belle30b/checkpoint-13600/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..22e44c19c019472f9b4b5b4a85e40562ecc7e9a5 --- /dev/null +++ b/adapters/saved-alpaca-belle30b/checkpoint-13600/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9c9f19a3e30ad64c864f012ed09fbca7cbe334b10bf2f3410cad9a42e6bdfc27 +size 627 diff --git a/adapters/saved-alpaca-belle30b/checkpoint-13600/trainer_state.json b/adapters/saved-alpaca-belle30b/checkpoint-13600/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..220986392324b1cf11a12421736cd30c6673c3a9 --- /dev/null +++ b/adapters/saved-alpaca-belle30b/checkpoint-13600/trainer_state.json @@ -0,0 +1,4640 @@ +{ + "best_metric": 0.6675477027893066, + "best_model_checkpoint": "/mnt/bn/qingyi-bn-lq/llama/saved-alpaca-belle30b/checkpoint-13600", + "epoch": 2.9341963322545848, + "global_step": 13600, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 5.9999999999999995e-05, + "loss": 1.6143, + "step": 20 + }, + { + "epoch": 0.01, + "learning_rate": 0.00011999999999999999, + "loss": 1.2447, + "step": 40 + }, + { + "epoch": 0.01, + "learning_rate": 0.00017699999999999997, + "loss": 0.9529, + "step": 60 + }, + { + "epoch": 0.02, + "learning_rate": 0.000237, + "loss": 0.8899, + "step": 80 + }, + { + "epoch": 0.02, + "learning_rate": 0.00029699999999999996, + "loss": 0.8614, + "step": 100 + }, + { + "epoch": 0.03, + "learning_rate": 0.00029958710612097066, + "loss": 0.8402, + "step": 120 + }, + { + "epoch": 0.03, + "learning_rate": 0.00029915248098515027, + "loss": 0.8335, + "step": 140 + }, + { + "epoch": 0.03, + "learning_rate": 0.00029871785584932993, + "loss": 0.8303, + "step": 160 + }, + { + "epoch": 0.04, + "learning_rate": 0.0002982832307135096, + "loss": 0.8261, + "step": 180 + }, + { + "epoch": 0.04, + "learning_rate": 0.0002978486055776892, + "loss": 0.807, + "step": 200 + }, + { + "epoch": 0.04, + "eval_loss": 0.8271128535270691, + "eval_runtime": 49.877, + "eval_samples_per_second": 40.099, + "eval_steps_per_second": 0.642, + "step": 200 + }, + { + "epoch": 0.05, + "learning_rate": 0.00029741398044186887, + "loss": 0.808, + "step": 220 + }, + { + "epoch": 0.05, + "learning_rate": 0.00029697935530604853, + "loss": 0.8092, + "step": 240 + }, + { + "epoch": 0.06, + "learning_rate": 0.00029654473017022814, + "loss": 0.8045, + "step": 260 + }, + { + "epoch": 0.06, + "learning_rate": 0.0002961101050344078, + "loss": 0.8007, + "step": 280 + }, + { + "epoch": 0.06, + "learning_rate": 0.00029567547989858747, + "loss": 0.793, + "step": 300 + }, + { + "epoch": 0.07, + "learning_rate": 0.0002952408547627671, + "loss": 0.7886, + "step": 320 + }, + { + "epoch": 0.07, + "learning_rate": 0.00029480622962694674, + "loss": 0.7854, + "step": 340 + }, + { + "epoch": 0.08, + "learning_rate": 0.0002943716044911264, + "loss": 0.783, + "step": 360 + }, + { + "epoch": 0.08, + "learning_rate": 0.000293936979355306, + "loss": 0.7797, + "step": 380 + }, + { + "epoch": 0.09, + "learning_rate": 0.0002935023542194857, + "loss": 0.7801, + "step": 400 + }, + { + "epoch": 0.09, + "eval_loss": 0.793747067451477, + "eval_runtime": 49.8962, + "eval_samples_per_second": 40.083, + "eval_steps_per_second": 0.641, + "step": 400 + }, + { + "epoch": 0.09, + "learning_rate": 0.00029306772908366534, + "loss": 0.7879, + "step": 420 + }, + { + "epoch": 0.09, + "learning_rate": 0.00029263310394784495, + "loss": 0.7745, + "step": 440 + }, + { + "epoch": 0.1, + "learning_rate": 0.0002921984788120246, + "loss": 0.7725, + "step": 460 + }, + { + "epoch": 0.1, + "learning_rate": 0.0002917638536762043, + "loss": 0.7659, + "step": 480 + }, + { + "epoch": 0.11, + "learning_rate": 0.0002913292285403839, + "loss": 0.7658, + "step": 500 + }, + { + "epoch": 0.11, + "learning_rate": 0.00029089460340456355, + "loss": 0.7722, + "step": 520 + }, + { + "epoch": 0.12, + "learning_rate": 0.0002904599782687432, + "loss": 0.773, + "step": 540 + }, + { + "epoch": 0.12, + "learning_rate": 0.0002900253531329228, + "loss": 0.7749, + "step": 560 + }, + { + "epoch": 0.13, + "learning_rate": 0.0002895907279971025, + "loss": 0.7734, + "step": 580 + }, + { + "epoch": 0.13, + "learning_rate": 0.00028915610286128215, + "loss": 0.7607, + "step": 600 + }, + { + "epoch": 0.13, + "eval_loss": 0.7771433591842651, + "eval_runtime": 49.9486, + "eval_samples_per_second": 40.041, + "eval_steps_per_second": 0.641, + "step": 600 + }, + { + "epoch": 0.13, + "learning_rate": 0.00028872147772546176, + "loss": 0.7657, + "step": 620 + }, + { + "epoch": 0.14, + "learning_rate": 0.00028828685258964137, + "loss": 0.7602, + "step": 640 + }, + { + "epoch": 0.14, + "learning_rate": 0.00028785222745382103, + "loss": 0.7619, + "step": 660 + }, + { + "epoch": 0.15, + "learning_rate": 0.0002874176023180007, + "loss": 0.7587, + "step": 680 + }, + { + "epoch": 0.15, + "learning_rate": 0.0002869829771821803, + "loss": 0.7553, + "step": 700 + }, + { + "epoch": 0.16, + "learning_rate": 0.00028654835204635997, + "loss": 0.7565, + "step": 720 + }, + { + "epoch": 0.16, + "learning_rate": 0.00028611372691053963, + "loss": 0.7586, + "step": 740 + }, + { + "epoch": 0.16, + "learning_rate": 0.00028567910177471924, + "loss": 0.7556, + "step": 760 + }, + { + "epoch": 0.17, + "learning_rate": 0.0002852444766388989, + "loss": 0.7487, + "step": 780 + }, + { + "epoch": 0.17, + "learning_rate": 0.00028480985150307857, + "loss": 0.7516, + "step": 800 + }, + { + "epoch": 0.17, + "eval_loss": 0.7632888555526733, + "eval_runtime": 49.913, + "eval_samples_per_second": 40.07, + "eval_steps_per_second": 0.641, + "step": 800 + }, + { + "epoch": 0.18, + "learning_rate": 0.0002843752263672582, + "loss": 0.7527, + "step": 820 + }, + { + "epoch": 0.18, + "learning_rate": 0.00028394060123143784, + "loss": 0.7407, + "step": 840 + }, + { + "epoch": 0.19, + "learning_rate": 0.0002835059760956175, + "loss": 0.744, + "step": 860 + }, + { + "epoch": 0.19, + "learning_rate": 0.0002830713509597971, + "loss": 0.7456, + "step": 880 + }, + { + "epoch": 0.19, + "learning_rate": 0.0002826367258239768, + "loss": 0.7429, + "step": 900 + }, + { + "epoch": 0.2, + "learning_rate": 0.00028220210068815644, + "loss": 0.7516, + "step": 920 + }, + { + "epoch": 0.2, + "learning_rate": 0.00028176747555233605, + "loss": 0.7381, + "step": 940 + }, + { + "epoch": 0.21, + "learning_rate": 0.0002813328504165157, + "loss": 0.7256, + "step": 960 + }, + { + "epoch": 0.21, + "learning_rate": 0.0002808982252806954, + "loss": 0.7443, + "step": 980 + }, + { + "epoch": 0.22, + "learning_rate": 0.000280463600144875, + "loss": 0.7389, + "step": 1000 + }, + { + "epoch": 0.22, + "eval_loss": 0.7532852292060852, + "eval_runtime": 49.9829, + "eval_samples_per_second": 40.014, + "eval_steps_per_second": 0.64, + "step": 1000 + }, + { + "epoch": 0.22, + "learning_rate": 0.00028002897500905465, + "loss": 0.7374, + "step": 1020 + }, + { + "epoch": 0.22, + "learning_rate": 0.0002795943498732343, + "loss": 0.7296, + "step": 1040 + }, + { + "epoch": 0.23, + "learning_rate": 0.0002791597247374139, + "loss": 0.7424, + "step": 1060 + }, + { + "epoch": 0.23, + "learning_rate": 0.0002787250996015936, + "loss": 0.7328, + "step": 1080 + }, + { + "epoch": 0.24, + "learning_rate": 0.00027829047446577325, + "loss": 0.7367, + "step": 1100 + }, + { + "epoch": 0.24, + "learning_rate": 0.00027785584932995286, + "loss": 0.7419, + "step": 1120 + }, + { + "epoch": 0.25, + "learning_rate": 0.0002774212241941325, + "loss": 0.7347, + "step": 1140 + }, + { + "epoch": 0.25, + "learning_rate": 0.0002769865990583122, + "loss": 0.7292, + "step": 1160 + }, + { + "epoch": 0.25, + "learning_rate": 0.0002765519739224918, + "loss": 0.7394, + "step": 1180 + }, + { + "epoch": 0.26, + "learning_rate": 0.00027611734878667146, + "loss": 0.7358, + "step": 1200 + }, + { + "epoch": 0.26, + "eval_loss": 0.7463639974594116, + "eval_runtime": 49.9963, + "eval_samples_per_second": 40.003, + "eval_steps_per_second": 0.64, + "step": 1200 + }, + { + "epoch": 0.26, + "learning_rate": 0.0002756827236508511, + "loss": 0.7266, + "step": 1220 + }, + { + "epoch": 0.27, + "learning_rate": 0.00027524809851503073, + "loss": 0.7336, + "step": 1240 + }, + { + "epoch": 0.27, + "learning_rate": 0.0002748134733792104, + "loss": 0.7296, + "step": 1260 + }, + { + "epoch": 0.28, + "learning_rate": 0.00027437884824339006, + "loss": 0.73, + "step": 1280 + }, + { + "epoch": 0.28, + "learning_rate": 0.00027394422310756967, + "loss": 0.7312, + "step": 1300 + }, + { + "epoch": 0.28, + "learning_rate": 0.00027350959797174933, + "loss": 0.7307, + "step": 1320 + }, + { + "epoch": 0.29, + "learning_rate": 0.000273074972835929, + "loss": 0.7246, + "step": 1340 + }, + { + "epoch": 0.29, + "learning_rate": 0.0002726403477001086, + "loss": 0.7299, + "step": 1360 + }, + { + "epoch": 0.3, + "learning_rate": 0.00027220572256428827, + "loss": 0.7251, + "step": 1380 + }, + { + "epoch": 0.3, + "learning_rate": 0.00027177109742846793, + "loss": 0.7286, + "step": 1400 + }, + { + "epoch": 0.3, + "eval_loss": 0.7393819093704224, + "eval_runtime": 49.9896, + "eval_samples_per_second": 40.008, + "eval_steps_per_second": 0.64, + "step": 1400 + }, + { + "epoch": 0.31, + "learning_rate": 0.00027133647229264754, + "loss": 0.7186, + "step": 1420 + }, + { + "epoch": 0.31, + "learning_rate": 0.0002709018471568272, + "loss": 0.7215, + "step": 1440 + }, + { + "epoch": 0.31, + "learning_rate": 0.00027046722202100687, + "loss": 0.7295, + "step": 1460 + }, + { + "epoch": 0.32, + "learning_rate": 0.0002700325968851865, + "loss": 0.7198, + "step": 1480 + }, + { + "epoch": 0.32, + "learning_rate": 0.00026959797174936614, + "loss": 0.7184, + "step": 1500 + }, + { + "epoch": 0.33, + "learning_rate": 0.0002691633466135458, + "loss": 0.7283, + "step": 1520 + }, + { + "epoch": 0.33, + "learning_rate": 0.0002687287214777254, + "loss": 0.7378, + "step": 1540 + }, + { + "epoch": 0.34, + "learning_rate": 0.0002682940963419051, + "loss": 0.7196, + "step": 1560 + }, + { + "epoch": 0.34, + "learning_rate": 0.00026785947120608474, + "loss": 0.7152, + "step": 1580 + }, + { + "epoch": 0.35, + "learning_rate": 0.00026742484607026435, + "loss": 0.7184, + "step": 1600 + }, + { + "epoch": 0.35, + "eval_loss": 0.7342154383659363, + "eval_runtime": 49.9957, + "eval_samples_per_second": 40.003, + "eval_steps_per_second": 0.64, + "step": 1600 + }, + { + "epoch": 0.35, + "learning_rate": 0.000266990220934444, + "loss": 0.7164, + "step": 1620 + }, + { + "epoch": 0.35, + "learning_rate": 0.0002665555957986237, + "loss": 0.7136, + "step": 1640 + }, + { + "epoch": 0.36, + "learning_rate": 0.0002661209706628033, + "loss": 0.7203, + "step": 1660 + }, + { + "epoch": 0.36, + "learning_rate": 0.00026568634552698295, + "loss": 0.7158, + "step": 1680 + }, + { + "epoch": 0.37, + "learning_rate": 0.0002652517203911626, + "loss": 0.7145, + "step": 1700 + }, + { + "epoch": 0.37, + "learning_rate": 0.0002648170952553422, + "loss": 0.7111, + "step": 1720 + }, + { + "epoch": 0.38, + "learning_rate": 0.0002643824701195219, + "loss": 0.7155, + "step": 1740 + }, + { + "epoch": 0.38, + "learning_rate": 0.00026394784498370155, + "loss": 0.718, + "step": 1760 + }, + { + "epoch": 0.38, + "learning_rate": 0.00026351321984788116, + "loss": 0.7125, + "step": 1780 + }, + { + "epoch": 0.39, + "learning_rate": 0.0002630785947120608, + "loss": 0.7163, + "step": 1800 + }, + { + "epoch": 0.39, + "eval_loss": 0.7301950454711914, + "eval_runtime": 49.9689, + "eval_samples_per_second": 40.025, + "eval_steps_per_second": 0.64, + "step": 1800 + }, + { + "epoch": 0.39, + "learning_rate": 0.0002626439695762405, + "loss": 0.7121, + "step": 1820 + }, + { + "epoch": 0.4, + "learning_rate": 0.0002622093444404201, + "loss": 0.7092, + "step": 1840 + }, + { + "epoch": 0.4, + "learning_rate": 0.00026177471930459976, + "loss": 0.7133, + "step": 1860 + }, + { + "epoch": 0.41, + "learning_rate": 0.0002613400941687794, + "loss": 0.7171, + "step": 1880 + }, + { + "epoch": 0.41, + "learning_rate": 0.00026090546903295903, + "loss": 0.7235, + "step": 1900 + }, + { + "epoch": 0.41, + "learning_rate": 0.0002604708438971387, + "loss": 0.7086, + "step": 1920 + }, + { + "epoch": 0.42, + "learning_rate": 0.00026003621876131836, + "loss": 0.7136, + "step": 1940 + }, + { + "epoch": 0.42, + "learning_rate": 0.00025960159362549797, + "loss": 0.7031, + "step": 1960 + }, + { + "epoch": 0.43, + "learning_rate": 0.00025916696848967763, + "loss": 0.7084, + "step": 1980 + }, + { + "epoch": 0.43, + "learning_rate": 0.0002587323433538573, + "loss": 0.7091, + "step": 2000 + }, + { + "epoch": 0.43, + "eval_loss": 0.726446270942688, + "eval_runtime": 50.0519, + "eval_samples_per_second": 39.959, + "eval_steps_per_second": 0.639, + "step": 2000 + }, + { + "epoch": 0.44, + "learning_rate": 0.0002582977182180369, + "loss": 0.7119, + "step": 2020 + }, + { + "epoch": 0.44, + "learning_rate": 0.00025786309308221657, + "loss": 0.7186, + "step": 2040 + }, + { + "epoch": 0.44, + "learning_rate": 0.00025742846794639623, + "loss": 0.703, + "step": 2060 + }, + { + "epoch": 0.45, + "learning_rate": 0.00025699384281057584, + "loss": 0.7078, + "step": 2080 + }, + { + "epoch": 0.45, + "learning_rate": 0.0002565592176747555, + "loss": 0.7084, + "step": 2100 + }, + { + "epoch": 0.46, + "learning_rate": 0.00025612459253893517, + "loss": 0.7014, + "step": 2120 + }, + { + "epoch": 0.46, + "learning_rate": 0.0002556899674031148, + "loss": 0.7076, + "step": 2140 + }, + { + "epoch": 0.47, + "learning_rate": 0.00025525534226729444, + "loss": 0.7103, + "step": 2160 + }, + { + "epoch": 0.47, + "learning_rate": 0.0002548207171314741, + "loss": 0.7118, + "step": 2180 + }, + { + "epoch": 0.47, + "learning_rate": 0.0002543860919956537, + "loss": 0.7028, + "step": 2200 + }, + { + "epoch": 0.47, + "eval_loss": 0.7220268845558167, + "eval_runtime": 49.9937, + "eval_samples_per_second": 40.005, + "eval_steps_per_second": 0.64, + "step": 2200 + }, + { + "epoch": 0.48, + "learning_rate": 0.0002539514668598334, + "loss": 0.707, + "step": 2220 + }, + { + "epoch": 0.48, + "learning_rate": 0.00025351684172401304, + "loss": 0.7045, + "step": 2240 + }, + { + "epoch": 0.49, + "learning_rate": 0.00025308221658819265, + "loss": 0.6905, + "step": 2260 + }, + { + "epoch": 0.49, + "learning_rate": 0.0002526475914523723, + "loss": 0.6982, + "step": 2280 + }, + { + "epoch": 0.5, + "learning_rate": 0.000252212966316552, + "loss": 0.706, + "step": 2300 + }, + { + "epoch": 0.5, + "learning_rate": 0.0002517783411807316, + "loss": 0.6992, + "step": 2320 + }, + { + "epoch": 0.5, + "learning_rate": 0.00025134371604491125, + "loss": 0.6939, + "step": 2340 + }, + { + "epoch": 0.51, + "learning_rate": 0.00025090909090909086, + "loss": 0.7037, + "step": 2360 + }, + { + "epoch": 0.51, + "learning_rate": 0.0002504744657732705, + "loss": 0.7127, + "step": 2380 + }, + { + "epoch": 0.52, + "learning_rate": 0.00025003984063745014, + "loss": 0.702, + "step": 2400 + }, + { + "epoch": 0.52, + "eval_loss": 0.7191869020462036, + "eval_runtime": 50.0038, + "eval_samples_per_second": 39.997, + "eval_steps_per_second": 0.64, + "step": 2400 + }, + { + "epoch": 0.52, + "learning_rate": 0.0002496052155016298, + "loss": 0.7033, + "step": 2420 + }, + { + "epoch": 0.53, + "learning_rate": 0.00024917059036580946, + "loss": 0.7028, + "step": 2440 + }, + { + "epoch": 0.53, + "learning_rate": 0.00024873596522998907, + "loss": 0.6967, + "step": 2460 + }, + { + "epoch": 0.54, + "learning_rate": 0.00024830134009416874, + "loss": 0.7068, + "step": 2480 + }, + { + "epoch": 0.54, + "learning_rate": 0.0002478667149583484, + "loss": 0.7105, + "step": 2500 + }, + { + "epoch": 0.54, + "learning_rate": 0.000247432089822528, + "loss": 0.6968, + "step": 2520 + }, + { + "epoch": 0.55, + "learning_rate": 0.00024699746468670767, + "loss": 0.7025, + "step": 2540 + }, + { + "epoch": 0.55, + "learning_rate": 0.00024656283955088734, + "loss": 0.6942, + "step": 2560 + }, + { + "epoch": 0.56, + "learning_rate": 0.00024612821441506694, + "loss": 0.6948, + "step": 2580 + }, + { + "epoch": 0.56, + "learning_rate": 0.0002456935892792466, + "loss": 0.6979, + "step": 2600 + }, + { + "epoch": 0.56, + "eval_loss": 0.715853750705719, + "eval_runtime": 50.0426, + "eval_samples_per_second": 39.966, + "eval_steps_per_second": 0.639, + "step": 2600 + }, + { + "epoch": 0.57, + "learning_rate": 0.00024525896414342627, + "loss": 0.6967, + "step": 2620 + }, + { + "epoch": 0.57, + "learning_rate": 0.0002448243390076059, + "loss": 0.7012, + "step": 2640 + }, + { + "epoch": 0.57, + "learning_rate": 0.00024438971387178554, + "loss": 0.697, + "step": 2660 + }, + { + "epoch": 0.58, + "learning_rate": 0.0002439550887359652, + "loss": 0.6931, + "step": 2680 + }, + { + "epoch": 0.58, + "learning_rate": 0.00024352046360014485, + "loss": 0.6856, + "step": 2700 + }, + { + "epoch": 0.59, + "learning_rate": 0.00024308583846432448, + "loss": 0.697, + "step": 2720 + }, + { + "epoch": 0.59, + "learning_rate": 0.00024265121332850415, + "loss": 0.6996, + "step": 2740 + }, + { + "epoch": 0.6, + "learning_rate": 0.00024221658819268378, + "loss": 0.698, + "step": 2760 + }, + { + "epoch": 0.6, + "learning_rate": 0.00024178196305686342, + "loss": 0.6952, + "step": 2780 + }, + { + "epoch": 0.6, + "learning_rate": 0.00024134733792104308, + "loss": 0.7049, + "step": 2800 + }, + { + "epoch": 0.6, + "eval_loss": 0.7124837040901184, + "eval_runtime": 50.0654, + "eval_samples_per_second": 39.948, + "eval_steps_per_second": 0.639, + "step": 2800 + }, + { + "epoch": 0.61, + "learning_rate": 0.00024091271278522272, + "loss": 0.6927, + "step": 2820 + }, + { + "epoch": 0.61, + "learning_rate": 0.00024047808764940235, + "loss": 0.6996, + "step": 2840 + }, + { + "epoch": 0.62, + "learning_rate": 0.00024004346251358202, + "loss": 0.6921, + "step": 2860 + }, + { + "epoch": 0.62, + "learning_rate": 0.00023960883737776165, + "loss": 0.695, + "step": 2880 + }, + { + "epoch": 0.63, + "learning_rate": 0.0002391742122419413, + "loss": 0.6887, + "step": 2900 + }, + { + "epoch": 0.63, + "learning_rate": 0.00023873958710612095, + "loss": 0.6915, + "step": 2920 + }, + { + "epoch": 0.63, + "learning_rate": 0.0002383049619703006, + "loss": 0.6915, + "step": 2940 + }, + { + "epoch": 0.64, + "learning_rate": 0.00023787033683448023, + "loss": 0.6916, + "step": 2960 + }, + { + "epoch": 0.64, + "learning_rate": 0.0002374357116986599, + "loss": 0.687, + "step": 2980 + }, + { + "epoch": 0.65, + "learning_rate": 0.00023700108656283953, + "loss": 0.6997, + "step": 3000 + }, + { + "epoch": 0.65, + "eval_loss": 0.7098860144615173, + "eval_runtime": 50.0652, + "eval_samples_per_second": 39.948, + "eval_steps_per_second": 0.639, + "step": 3000 + }, + { + "epoch": 0.65, + "learning_rate": 0.00023656646142701916, + "loss": 0.6895, + "step": 3020 + }, + { + "epoch": 0.66, + "learning_rate": 0.00023613183629119883, + "loss": 0.6861, + "step": 3040 + }, + { + "epoch": 0.66, + "learning_rate": 0.00023569721115537846, + "loss": 0.6988, + "step": 3060 + }, + { + "epoch": 0.66, + "learning_rate": 0.0002352625860195581, + "loss": 0.6852, + "step": 3080 + }, + { + "epoch": 0.67, + "learning_rate": 0.00023482796088373776, + "loss": 0.6863, + "step": 3100 + }, + { + "epoch": 0.67, + "learning_rate": 0.0002343933357479174, + "loss": 0.6943, + "step": 3120 + }, + { + "epoch": 0.68, + "learning_rate": 0.00023395871061209704, + "loss": 0.686, + "step": 3140 + }, + { + "epoch": 0.68, + "learning_rate": 0.0002335240854762767, + "loss": 0.684, + "step": 3160 + }, + { + "epoch": 0.69, + "learning_rate": 0.00023308946034045634, + "loss": 0.6866, + "step": 3180 + }, + { + "epoch": 0.69, + "learning_rate": 0.00023265483520463597, + "loss": 0.6859, + "step": 3200 + }, + { + "epoch": 0.69, + "eval_loss": 0.7077216506004333, + "eval_runtime": 50.0526, + "eval_samples_per_second": 39.958, + "eval_steps_per_second": 0.639, + "step": 3200 + }, + { + "epoch": 0.69, + "learning_rate": 0.00023222021006881564, + "loss": 0.6845, + "step": 3220 + }, + { + "epoch": 0.7, + "learning_rate": 0.00023178558493299527, + "loss": 0.7011, + "step": 3240 + }, + { + "epoch": 0.7, + "learning_rate": 0.0002313509597971749, + "loss": 0.69, + "step": 3260 + }, + { + "epoch": 0.71, + "learning_rate": 0.00023091633466135457, + "loss": 0.6931, + "step": 3280 + }, + { + "epoch": 0.71, + "learning_rate": 0.0002304817095255342, + "loss": 0.6998, + "step": 3300 + }, + { + "epoch": 0.72, + "learning_rate": 0.00023004708438971385, + "loss": 0.6933, + "step": 3320 + }, + { + "epoch": 0.72, + "learning_rate": 0.0002296124592538935, + "loss": 0.6859, + "step": 3340 + }, + { + "epoch": 0.72, + "learning_rate": 0.00022917783411807315, + "loss": 0.6972, + "step": 3360 + }, + { + "epoch": 0.73, + "learning_rate": 0.00022874320898225278, + "loss": 0.6868, + "step": 3380 + }, + { + "epoch": 0.73, + "learning_rate": 0.00022830858384643245, + "loss": 0.6902, + "step": 3400 + }, + { + "epoch": 0.73, + "eval_loss": 0.7059928178787231, + "eval_runtime": 50.0118, + "eval_samples_per_second": 39.991, + "eval_steps_per_second": 0.64, + "step": 3400 + }, + { + "epoch": 0.74, + "learning_rate": 0.00022787395871061208, + "loss": 0.6819, + "step": 3420 + }, + { + "epoch": 0.74, + "learning_rate": 0.00022743933357479172, + "loss": 0.6833, + "step": 3440 + }, + { + "epoch": 0.75, + "learning_rate": 0.00022700470843897138, + "loss": 0.6826, + "step": 3460 + }, + { + "epoch": 0.75, + "learning_rate": 0.00022657008330315102, + "loss": 0.694, + "step": 3480 + }, + { + "epoch": 0.76, + "learning_rate": 0.00022613545816733066, + "loss": 0.6827, + "step": 3500 + }, + { + "epoch": 0.76, + "learning_rate": 0.00022570083303151032, + "loss": 0.6844, + "step": 3520 + }, + { + "epoch": 0.76, + "learning_rate": 0.00022526620789568996, + "loss": 0.6893, + "step": 3540 + }, + { + "epoch": 0.77, + "learning_rate": 0.0002248315827598696, + "loss": 0.6843, + "step": 3560 + }, + { + "epoch": 0.77, + "learning_rate": 0.00022439695762404926, + "loss": 0.6843, + "step": 3580 + }, + { + "epoch": 0.78, + "learning_rate": 0.0002239623324882289, + "loss": 0.691, + "step": 3600 + }, + { + "epoch": 0.78, + "eval_loss": 0.7041522264480591, + "eval_runtime": 50.0554, + "eval_samples_per_second": 39.956, + "eval_steps_per_second": 0.639, + "step": 3600 + }, + { + "epoch": 0.78, + "learning_rate": 0.00022352770735240853, + "loss": 0.6846, + "step": 3620 + }, + { + "epoch": 0.79, + "learning_rate": 0.0002230930822165882, + "loss": 0.689, + "step": 3640 + }, + { + "epoch": 0.79, + "learning_rate": 0.00022265845708076783, + "loss": 0.6777, + "step": 3660 + }, + { + "epoch": 0.79, + "learning_rate": 0.00022222383194494747, + "loss": 0.6903, + "step": 3680 + }, + { + "epoch": 0.8, + "learning_rate": 0.00022178920680912713, + "loss": 0.684, + "step": 3700 + }, + { + "epoch": 0.8, + "learning_rate": 0.00022135458167330677, + "loss": 0.6867, + "step": 3720 + }, + { + "epoch": 0.81, + "learning_rate": 0.0002209199565374864, + "loss": 0.6697, + "step": 3740 + }, + { + "epoch": 0.81, + "learning_rate": 0.00022048533140166607, + "loss": 0.6864, + "step": 3760 + }, + { + "epoch": 0.82, + "learning_rate": 0.0002200507062658457, + "loss": 0.6813, + "step": 3780 + }, + { + "epoch": 0.82, + "learning_rate": 0.00021961608113002534, + "loss": 0.6807, + "step": 3800 + }, + { + "epoch": 0.82, + "eval_loss": 0.7024796009063721, + "eval_runtime": 50.022, + "eval_samples_per_second": 39.982, + "eval_steps_per_second": 0.64, + "step": 3800 + }, + { + "epoch": 0.82, + "learning_rate": 0.000219181455994205, + "loss": 0.6824, + "step": 3820 + }, + { + "epoch": 0.83, + "learning_rate": 0.00021874683085838464, + "loss": 0.6814, + "step": 3840 + }, + { + "epoch": 0.83, + "learning_rate": 0.00021831220572256427, + "loss": 0.6789, + "step": 3860 + }, + { + "epoch": 0.84, + "learning_rate": 0.00021787758058674394, + "loss": 0.6752, + "step": 3880 + }, + { + "epoch": 0.84, + "learning_rate": 0.00021744295545092358, + "loss": 0.6826, + "step": 3900 + }, + { + "epoch": 0.85, + "learning_rate": 0.0002170083303151032, + "loss": 0.6874, + "step": 3920 + }, + { + "epoch": 0.85, + "learning_rate": 0.00021657370517928288, + "loss": 0.6761, + "step": 3940 + }, + { + "epoch": 0.85, + "learning_rate": 0.0002161390800434625, + "loss": 0.6795, + "step": 3960 + }, + { + "epoch": 0.86, + "learning_rate": 0.00021570445490764215, + "loss": 0.6781, + "step": 3980 + }, + { + "epoch": 0.86, + "learning_rate": 0.0002152698297718218, + "loss": 0.6754, + "step": 4000 + }, + { + "epoch": 0.86, + "eval_loss": 0.7004331350326538, + "eval_runtime": 50.0568, + "eval_samples_per_second": 39.955, + "eval_steps_per_second": 0.639, + "step": 4000 + }, + { + "epoch": 0.87, + "learning_rate": 0.00021483520463600145, + "loss": 0.6791, + "step": 4020 + }, + { + "epoch": 0.87, + "learning_rate": 0.00021440057950018108, + "loss": 0.6863, + "step": 4040 + }, + { + "epoch": 0.88, + "learning_rate": 0.00021396595436436075, + "loss": 0.6846, + "step": 4060 + }, + { + "epoch": 0.88, + "learning_rate": 0.00021353132922854036, + "loss": 0.6814, + "step": 4080 + }, + { + "epoch": 0.88, + "learning_rate": 0.00021309670409272, + "loss": 0.6825, + "step": 4100 + }, + { + "epoch": 0.89, + "learning_rate": 0.00021266207895689963, + "loss": 0.6827, + "step": 4120 + }, + { + "epoch": 0.89, + "learning_rate": 0.0002122274538210793, + "loss": 0.6769, + "step": 4140 + }, + { + "epoch": 0.9, + "learning_rate": 0.00021179282868525893, + "loss": 0.6869, + "step": 4160 + }, + { + "epoch": 0.9, + "learning_rate": 0.00021135820354943857, + "loss": 0.6815, + "step": 4180 + }, + { + "epoch": 0.91, + "learning_rate": 0.00021092357841361823, + "loss": 0.6725, + "step": 4200 + }, + { + "epoch": 0.91, + "eval_loss": 0.6981337666511536, + "eval_runtime": 50.0559, + "eval_samples_per_second": 39.955, + "eval_steps_per_second": 0.639, + "step": 4200 + }, + { + "epoch": 0.91, + "learning_rate": 0.00021051068453458889, + "loss": 0.6731, + "step": 4220 + }, + { + "epoch": 0.91, + "learning_rate": 0.00021007605939876855, + "loss": 0.6792, + "step": 4240 + }, + { + "epoch": 0.92, + "learning_rate": 0.00020964143426294819, + "loss": 0.6755, + "step": 4260 + }, + { + "epoch": 0.92, + "learning_rate": 0.00020920680912712782, + "loss": 0.6833, + "step": 4280 + }, + { + "epoch": 0.93, + "learning_rate": 0.0002087721839913075, + "loss": 0.6693, + "step": 4300 + }, + { + "epoch": 0.93, + "learning_rate": 0.00020833755885548712, + "loss": 0.6728, + "step": 4320 + }, + { + "epoch": 0.94, + "learning_rate": 0.00020790293371966676, + "loss": 0.6812, + "step": 4340 + }, + { + "epoch": 0.94, + "learning_rate": 0.00020746830858384642, + "loss": 0.6734, + "step": 4360 + }, + { + "epoch": 0.94, + "learning_rate": 0.00020703368344802606, + "loss": 0.6813, + "step": 4380 + }, + { + "epoch": 0.95, + "learning_rate": 0.0002065990583122057, + "loss": 0.6779, + "step": 4400 + }, + { + "epoch": 0.95, + "eval_loss": 0.6968498826026917, + "eval_runtime": 50.0697, + "eval_samples_per_second": 39.944, + "eval_steps_per_second": 0.639, + "step": 4400 + }, + { + "epoch": 0.95, + "learning_rate": 0.00020616443317638536, + "loss": 0.6712, + "step": 4420 + }, + { + "epoch": 0.96, + "learning_rate": 0.000205729808040565, + "loss": 0.6846, + "step": 4440 + }, + { + "epoch": 0.96, + "learning_rate": 0.00020529518290474463, + "loss": 0.6694, + "step": 4460 + }, + { + "epoch": 0.97, + "learning_rate": 0.0002048605577689243, + "loss": 0.6753, + "step": 4480 + }, + { + "epoch": 0.97, + "learning_rate": 0.00020442593263310393, + "loss": 0.6792, + "step": 4500 + }, + { + "epoch": 0.98, + "learning_rate": 0.00020399130749728357, + "loss": 0.6738, + "step": 4520 + }, + { + "epoch": 0.98, + "learning_rate": 0.00020355668236146323, + "loss": 0.6699, + "step": 4540 + }, + { + "epoch": 0.98, + "learning_rate": 0.00020312205722564287, + "loss": 0.6737, + "step": 4560 + }, + { + "epoch": 0.99, + "learning_rate": 0.0002026874320898225, + "loss": 0.6837, + "step": 4580 + }, + { + "epoch": 0.99, + "learning_rate": 0.00020225280695400217, + "loss": 0.6701, + "step": 4600 + }, + { + "epoch": 0.99, + "eval_loss": 0.6954157948493958, + "eval_runtime": 50.0724, + "eval_samples_per_second": 39.942, + "eval_steps_per_second": 0.639, + "step": 4600 + }, + { + "epoch": 1.0, + "learning_rate": 0.0002018181818181818, + "loss": 0.6677, + "step": 4620 + }, + { + "epoch": 1.0, + "learning_rate": 0.00020138355668236144, + "loss": 0.6706, + "step": 4640 + }, + { + "epoch": 1.01, + "learning_rate": 0.0002009489315465411, + "loss": 0.6741, + "step": 4660 + }, + { + "epoch": 1.01, + "learning_rate": 0.00020051430641072074, + "loss": 0.6757, + "step": 4680 + }, + { + "epoch": 1.01, + "learning_rate": 0.00020007968127490038, + "loss": 0.6773, + "step": 4700 + }, + { + "epoch": 1.02, + "learning_rate": 0.00019964505613908004, + "loss": 0.6728, + "step": 4720 + }, + { + "epoch": 1.02, + "learning_rate": 0.00019921043100325968, + "loss": 0.6715, + "step": 4740 + }, + { + "epoch": 1.03, + "learning_rate": 0.00019877580586743931, + "loss": 0.6679, + "step": 4760 + }, + { + "epoch": 1.03, + "learning_rate": 0.00019834118073161898, + "loss": 0.6729, + "step": 4780 + }, + { + "epoch": 1.04, + "learning_rate": 0.00019790655559579861, + "loss": 0.6749, + "step": 4800 + }, + { + "epoch": 1.04, + "eval_loss": 0.6941403746604919, + "eval_runtime": 50.0645, + "eval_samples_per_second": 39.948, + "eval_steps_per_second": 0.639, + "step": 4800 + }, + { + "epoch": 1.04, + "learning_rate": 0.00019747193045997825, + "loss": 0.6661, + "step": 4820 + }, + { + "epoch": 1.04, + "learning_rate": 0.0001970373053241579, + "loss": 0.6638, + "step": 4840 + }, + { + "epoch": 1.05, + "learning_rate": 0.00019660268018833755, + "loss": 0.6715, + "step": 4860 + }, + { + "epoch": 1.05, + "learning_rate": 0.0001961680550525172, + "loss": 0.6721, + "step": 4880 + }, + { + "epoch": 1.06, + "learning_rate": 0.00019573342991669682, + "loss": 0.6695, + "step": 4900 + }, + { + "epoch": 1.06, + "learning_rate": 0.0001952988047808765, + "loss": 0.6809, + "step": 4920 + }, + { + "epoch": 1.07, + "learning_rate": 0.00019486417964505612, + "loss": 0.6701, + "step": 4940 + }, + { + "epoch": 1.07, + "learning_rate": 0.00019442955450923576, + "loss": 0.6747, + "step": 4960 + }, + { + "epoch": 1.07, + "learning_rate": 0.00019399492937341542, + "loss": 0.6713, + "step": 4980 + }, + { + "epoch": 1.08, + "learning_rate": 0.00019356030423759506, + "loss": 0.6746, + "step": 5000 + }, + { + "epoch": 1.08, + "eval_loss": 0.6935788989067078, + "eval_runtime": 50.0137, + "eval_samples_per_second": 39.989, + "eval_steps_per_second": 0.64, + "step": 5000 + }, + { + "epoch": 1.08, + "learning_rate": 0.0001931256791017747, + "loss": 0.672, + "step": 5020 + }, + { + "epoch": 1.09, + "learning_rate": 0.00019269105396595436, + "loss": 0.6673, + "step": 5040 + }, + { + "epoch": 1.09, + "learning_rate": 0.000192256428830134, + "loss": 0.6706, + "step": 5060 + }, + { + "epoch": 1.1, + "learning_rate": 0.00019182180369431363, + "loss": 0.6677, + "step": 5080 + }, + { + "epoch": 1.1, + "learning_rate": 0.0001913871785584933, + "loss": 0.67, + "step": 5100 + }, + { + "epoch": 1.1, + "learning_rate": 0.00019095255342267293, + "loss": 0.6693, + "step": 5120 + }, + { + "epoch": 1.11, + "learning_rate": 0.00019051792828685257, + "loss": 0.671, + "step": 5140 + }, + { + "epoch": 1.11, + "learning_rate": 0.00019008330315103223, + "loss": 0.6748, + "step": 5160 + }, + { + "epoch": 1.12, + "learning_rate": 0.00018964867801521187, + "loss": 0.6698, + "step": 5180 + }, + { + "epoch": 1.12, + "learning_rate": 0.0001892140528793915, + "loss": 0.662, + "step": 5200 + }, + { + "epoch": 1.12, + "eval_loss": 0.6918168663978577, + "eval_runtime": 50.0897, + "eval_samples_per_second": 39.928, + "eval_steps_per_second": 0.639, + "step": 5200 + }, + { + "epoch": 1.13, + "learning_rate": 0.00018877942774357117, + "loss": 0.66, + "step": 5220 + }, + { + "epoch": 1.13, + "learning_rate": 0.0001883448026077508, + "loss": 0.6705, + "step": 5240 + }, + { + "epoch": 1.13, + "learning_rate": 0.00018791017747193044, + "loss": 0.6693, + "step": 5260 + }, + { + "epoch": 1.14, + "learning_rate": 0.0001874755523361101, + "loss": 0.6546, + "step": 5280 + }, + { + "epoch": 1.14, + "learning_rate": 0.00018704092720028974, + "loss": 0.6673, + "step": 5300 + }, + { + "epoch": 1.15, + "learning_rate": 0.00018660630206446938, + "loss": 0.671, + "step": 5320 + }, + { + "epoch": 1.15, + "learning_rate": 0.00018617167692864904, + "loss": 0.675, + "step": 5340 + }, + { + "epoch": 1.16, + "learning_rate": 0.00018573705179282868, + "loss": 0.6744, + "step": 5360 + }, + { + "epoch": 1.16, + "learning_rate": 0.00018530242665700832, + "loss": 0.6643, + "step": 5380 + }, + { + "epoch": 1.17, + "learning_rate": 0.00018486780152118798, + "loss": 0.6686, + "step": 5400 + }, + { + "epoch": 1.17, + "eval_loss": 0.6908227801322937, + "eval_runtime": 50.0742, + "eval_samples_per_second": 39.941, + "eval_steps_per_second": 0.639, + "step": 5400 + }, + { + "epoch": 1.17, + "learning_rate": 0.00018443317638536762, + "loss": 0.6666, + "step": 5420 + }, + { + "epoch": 1.17, + "learning_rate": 0.00018399855124954725, + "loss": 0.6658, + "step": 5440 + }, + { + "epoch": 1.18, + "learning_rate": 0.0001835639261137269, + "loss": 0.671, + "step": 5460 + }, + { + "epoch": 1.18, + "learning_rate": 0.00018312930097790653, + "loss": 0.6736, + "step": 5480 + }, + { + "epoch": 1.19, + "learning_rate": 0.00018269467584208616, + "loss": 0.6697, + "step": 5500 + }, + { + "epoch": 1.19, + "learning_rate": 0.00018226005070626583, + "loss": 0.6718, + "step": 5520 + }, + { + "epoch": 1.2, + "learning_rate": 0.00018182542557044546, + "loss": 0.6701, + "step": 5540 + }, + { + "epoch": 1.2, + "learning_rate": 0.0001813908004346251, + "loss": 0.6696, + "step": 5560 + }, + { + "epoch": 1.2, + "learning_rate": 0.00018095617529880476, + "loss": 0.6611, + "step": 5580 + }, + { + "epoch": 1.21, + "learning_rate": 0.0001805215501629844, + "loss": 0.6638, + "step": 5600 + }, + { + "epoch": 1.21, + "eval_loss": 0.689289927482605, + "eval_runtime": 50.1304, + "eval_samples_per_second": 39.896, + "eval_steps_per_second": 0.638, + "step": 5600 + }, + { + "epoch": 1.21, + "learning_rate": 0.00018008692502716404, + "loss": 0.6646, + "step": 5620 + }, + { + "epoch": 1.22, + "learning_rate": 0.0001796522998913437, + "loss": 0.6717, + "step": 5640 + }, + { + "epoch": 1.22, + "learning_rate": 0.00017921767475552334, + "loss": 0.6647, + "step": 5660 + }, + { + "epoch": 1.23, + "learning_rate": 0.00017878304961970297, + "loss": 0.672, + "step": 5680 + }, + { + "epoch": 1.23, + "learning_rate": 0.00017834842448388264, + "loss": 0.6645, + "step": 5700 + }, + { + "epoch": 1.23, + "learning_rate": 0.00017791379934806227, + "loss": 0.6768, + "step": 5720 + }, + { + "epoch": 1.24, + "learning_rate": 0.0001774791742122419, + "loss": 0.6748, + "step": 5740 + }, + { + "epoch": 1.24, + "learning_rate": 0.00017704454907642157, + "loss": 0.6722, + "step": 5760 + }, + { + "epoch": 1.25, + "learning_rate": 0.0001766099239406012, + "loss": 0.6631, + "step": 5780 + }, + { + "epoch": 1.25, + "learning_rate": 0.00017617529880478084, + "loss": 0.6647, + "step": 5800 + }, + { + "epoch": 1.25, + "eval_loss": 0.688850462436676, + "eval_runtime": 50.0542, + "eval_samples_per_second": 39.957, + "eval_steps_per_second": 0.639, + "step": 5800 + }, + { + "epoch": 1.26, + "learning_rate": 0.0001757406736689605, + "loss": 0.66, + "step": 5820 + }, + { + "epoch": 1.26, + "learning_rate": 0.00017530604853314014, + "loss": 0.6682, + "step": 5840 + }, + { + "epoch": 1.26, + "learning_rate": 0.00017487142339731978, + "loss": 0.6589, + "step": 5860 + }, + { + "epoch": 1.27, + "learning_rate": 0.00017443679826149944, + "loss": 0.6691, + "step": 5880 + }, + { + "epoch": 1.27, + "learning_rate": 0.00017400217312567908, + "loss": 0.6726, + "step": 5900 + }, + { + "epoch": 1.28, + "learning_rate": 0.00017356754798985872, + "loss": 0.6628, + "step": 5920 + }, + { + "epoch": 1.28, + "learning_rate": 0.00017313292285403838, + "loss": 0.6719, + "step": 5940 + }, + { + "epoch": 1.29, + "learning_rate": 0.00017269829771821802, + "loss": 0.6648, + "step": 5960 + }, + { + "epoch": 1.29, + "learning_rate": 0.00017226367258239765, + "loss": 0.6594, + "step": 5980 + }, + { + "epoch": 1.29, + "learning_rate": 0.00017182904744657732, + "loss": 0.6717, + "step": 6000 + }, + { + "epoch": 1.29, + "eval_loss": 0.6876093745231628, + "eval_runtime": 50.1763, + "eval_samples_per_second": 39.859, + "eval_steps_per_second": 0.638, + "step": 6000 + }, + { + "epoch": 1.3, + "learning_rate": 0.00017139442231075695, + "loss": 0.6632, + "step": 6020 + }, + { + "epoch": 1.3, + "learning_rate": 0.0001709597971749366, + "loss": 0.6619, + "step": 6040 + }, + { + "epoch": 1.31, + "learning_rate": 0.00017052517203911625, + "loss": 0.667, + "step": 6060 + }, + { + "epoch": 1.31, + "learning_rate": 0.0001700905469032959, + "loss": 0.6625, + "step": 6080 + }, + { + "epoch": 1.32, + "learning_rate": 0.00016965592176747553, + "loss": 0.6661, + "step": 6100 + }, + { + "epoch": 1.32, + "learning_rate": 0.0001692212966316552, + "loss": 0.656, + "step": 6120 + }, + { + "epoch": 1.32, + "learning_rate": 0.00016878667149583483, + "loss": 0.6668, + "step": 6140 + }, + { + "epoch": 1.33, + "learning_rate": 0.00016835204636001446, + "loss": 0.6669, + "step": 6160 + }, + { + "epoch": 1.33, + "learning_rate": 0.00016791742122419413, + "loss": 0.6662, + "step": 6180 + }, + { + "epoch": 1.34, + "learning_rate": 0.00016748279608837376, + "loss": 0.6692, + "step": 6200 + }, + { + "epoch": 1.34, + "eval_loss": 0.6869744658470154, + "eval_runtime": 50.1517, + "eval_samples_per_second": 39.879, + "eval_steps_per_second": 0.638, + "step": 6200 + }, + { + "epoch": 1.34, + "learning_rate": 0.0001670481709525534, + "loss": 0.6571, + "step": 6220 + }, + { + "epoch": 1.35, + "learning_rate": 0.00016661354581673306, + "loss": 0.6659, + "step": 6240 + }, + { + "epoch": 1.35, + "learning_rate": 0.0001661789206809127, + "loss": 0.6622, + "step": 6260 + }, + { + "epoch": 1.35, + "learning_rate": 0.00016574429554509234, + "loss": 0.6522, + "step": 6280 + }, + { + "epoch": 1.36, + "learning_rate": 0.000165309670409272, + "loss": 0.667, + "step": 6300 + }, + { + "epoch": 1.36, + "learning_rate": 0.00016487504527345164, + "loss": 0.6644, + "step": 6320 + }, + { + "epoch": 1.37, + "learning_rate": 0.00016444042013763127, + "loss": 0.6625, + "step": 6340 + }, + { + "epoch": 1.37, + "learning_rate": 0.00016400579500181094, + "loss": 0.6686, + "step": 6360 + }, + { + "epoch": 1.38, + "learning_rate": 0.00016357116986599057, + "loss": 0.6562, + "step": 6380 + }, + { + "epoch": 1.38, + "learning_rate": 0.0001631365447301702, + "loss": 0.6595, + "step": 6400 + }, + { + "epoch": 1.38, + "eval_loss": 0.685205340385437, + "eval_runtime": 50.162, + "eval_samples_per_second": 39.871, + "eval_steps_per_second": 0.638, + "step": 6400 + }, + { + "epoch": 1.39, + "learning_rate": 0.00016270191959434987, + "loss": 0.6595, + "step": 6420 + }, + { + "epoch": 1.39, + "learning_rate": 0.0001622672944585295, + "loss": 0.6644, + "step": 6440 + }, + { + "epoch": 1.39, + "learning_rate": 0.00016183266932270915, + "loss": 0.6647, + "step": 6460 + }, + { + "epoch": 1.4, + "learning_rate": 0.0001613980441868888, + "loss": 0.6655, + "step": 6480 + }, + { + "epoch": 1.4, + "learning_rate": 0.00016096341905106845, + "loss": 0.6564, + "step": 6500 + }, + { + "epoch": 1.41, + "learning_rate": 0.00016052879391524808, + "loss": 0.6578, + "step": 6520 + }, + { + "epoch": 1.41, + "learning_rate": 0.00016009416877942775, + "loss": 0.6624, + "step": 6540 + }, + { + "epoch": 1.42, + "learning_rate": 0.00015965954364360738, + "loss": 0.6633, + "step": 6560 + }, + { + "epoch": 1.42, + "learning_rate": 0.00015922491850778702, + "loss": 0.6616, + "step": 6580 + }, + { + "epoch": 1.42, + "learning_rate": 0.00015879029337196668, + "loss": 0.6607, + "step": 6600 + }, + { + "epoch": 1.42, + "eval_loss": 0.6847727298736572, + "eval_runtime": 50.1562, + "eval_samples_per_second": 39.875, + "eval_steps_per_second": 0.638, + "step": 6600 + }, + { + "epoch": 1.43, + "learning_rate": 0.00015835566823614632, + "loss": 0.6564, + "step": 6620 + }, + { + "epoch": 1.43, + "learning_rate": 0.00015792104310032596, + "loss": 0.66, + "step": 6640 + }, + { + "epoch": 1.44, + "learning_rate": 0.00015748641796450562, + "loss": 0.6589, + "step": 6660 + }, + { + "epoch": 1.44, + "learning_rate": 0.00015705179282868526, + "loss": 0.6596, + "step": 6680 + }, + { + "epoch": 1.45, + "learning_rate": 0.0001566171676928649, + "loss": 0.6663, + "step": 6700 + }, + { + "epoch": 1.45, + "learning_rate": 0.00015618254255704456, + "loss": 0.6603, + "step": 6720 + }, + { + "epoch": 1.45, + "learning_rate": 0.0001557479174212242, + "loss": 0.6674, + "step": 6740 + }, + { + "epoch": 1.46, + "learning_rate": 0.00015531329228540383, + "loss": 0.6603, + "step": 6760 + }, + { + "epoch": 1.46, + "learning_rate": 0.0001548786671495835, + "loss": 0.6612, + "step": 6780 + }, + { + "epoch": 1.47, + "learning_rate": 0.00015444404201376313, + "loss": 0.6609, + "step": 6800 + }, + { + "epoch": 1.47, + "eval_loss": 0.683903694152832, + "eval_runtime": 50.079, + "eval_samples_per_second": 39.937, + "eval_steps_per_second": 0.639, + "step": 6800 + }, + { + "epoch": 1.47, + "learning_rate": 0.00015400941687794277, + "loss": 0.6557, + "step": 6820 + }, + { + "epoch": 1.48, + "learning_rate": 0.00015357479174212243, + "loss": 0.6627, + "step": 6840 + }, + { + "epoch": 1.48, + "learning_rate": 0.00015314016660630207, + "loss": 0.6667, + "step": 6860 + }, + { + "epoch": 1.48, + "learning_rate": 0.0001527055414704817, + "loss": 0.6633, + "step": 6880 + }, + { + "epoch": 1.49, + "learning_rate": 0.00015227091633466137, + "loss": 0.6565, + "step": 6900 + }, + { + "epoch": 1.49, + "learning_rate": 0.000151836291198841, + "loss": 0.6588, + "step": 6920 + }, + { + "epoch": 1.5, + "learning_rate": 0.00015140166606302064, + "loss": 0.6687, + "step": 6940 + }, + { + "epoch": 1.5, + "learning_rate": 0.0001509670409272003, + "loss": 0.6611, + "step": 6960 + }, + { + "epoch": 1.51, + "learning_rate": 0.00015053241579137994, + "loss": 0.6576, + "step": 6980 + }, + { + "epoch": 1.51, + "learning_rate": 0.00015009779065555957, + "loss": 0.6576, + "step": 7000 + }, + { + "epoch": 1.51, + "eval_loss": 0.6830142736434937, + "eval_runtime": 50.1233, + "eval_samples_per_second": 39.902, + "eval_steps_per_second": 0.638, + "step": 7000 + }, + { + "epoch": 1.51, + "learning_rate": 0.0001496631655197392, + "loss": 0.6617, + "step": 7020 + }, + { + "epoch": 1.52, + "learning_rate": 0.00014922854038391885, + "loss": 0.6533, + "step": 7040 + }, + { + "epoch": 1.52, + "learning_rate": 0.0001487939152480985, + "loss": 0.6524, + "step": 7060 + }, + { + "epoch": 1.53, + "learning_rate": 0.00014835929011227815, + "loss": 0.6597, + "step": 7080 + }, + { + "epoch": 1.53, + "learning_rate": 0.00014792466497645778, + "loss": 0.656, + "step": 7100 + }, + { + "epoch": 1.54, + "learning_rate": 0.00014749003984063745, + "loss": 0.6501, + "step": 7120 + }, + { + "epoch": 1.54, + "learning_rate": 0.00014705541470481708, + "loss": 0.6563, + "step": 7140 + }, + { + "epoch": 1.54, + "learning_rate": 0.00014662078956899672, + "loss": 0.6496, + "step": 7160 + }, + { + "epoch": 1.55, + "learning_rate": 0.00014618616443317638, + "loss": 0.6602, + "step": 7180 + }, + { + "epoch": 1.55, + "learning_rate": 0.00014575153929735602, + "loss": 0.6617, + "step": 7200 + }, + { + "epoch": 1.55, + "eval_loss": 0.6818540096282959, + "eval_runtime": 50.1175, + "eval_samples_per_second": 39.906, + "eval_steps_per_second": 0.639, + "step": 7200 + }, + { + "epoch": 1.56, + "learning_rate": 0.00014531691416153566, + "loss": 0.6655, + "step": 7220 + }, + { + "epoch": 1.56, + "learning_rate": 0.00014488228902571532, + "loss": 0.6544, + "step": 7240 + }, + { + "epoch": 1.57, + "learning_rate": 0.00014444766388989496, + "loss": 0.655, + "step": 7260 + }, + { + "epoch": 1.57, + "learning_rate": 0.0001440130387540746, + "loss": 0.6535, + "step": 7280 + }, + { + "epoch": 1.57, + "learning_rate": 0.00014357841361825426, + "loss": 0.6584, + "step": 7300 + }, + { + "epoch": 1.58, + "learning_rate": 0.0001431437884824339, + "loss": 0.6602, + "step": 7320 + }, + { + "epoch": 1.58, + "learning_rate": 0.00014270916334661353, + "loss": 0.6689, + "step": 7340 + }, + { + "epoch": 1.59, + "learning_rate": 0.0001422745382107932, + "loss": 0.6613, + "step": 7360 + }, + { + "epoch": 1.59, + "learning_rate": 0.00014183991307497283, + "loss": 0.659, + "step": 7380 + }, + { + "epoch": 1.6, + "learning_rate": 0.00014140528793915247, + "loss": 0.6463, + "step": 7400 + }, + { + "epoch": 1.6, + "eval_loss": 0.681868851184845, + "eval_runtime": 50.1388, + "eval_samples_per_second": 39.889, + "eval_steps_per_second": 0.638, + "step": 7400 + }, + { + "epoch": 1.6, + "learning_rate": 0.00014097066280333213, + "loss": 0.6617, + "step": 7420 + }, + { + "epoch": 1.61, + "learning_rate": 0.00014053603766751177, + "loss": 0.6648, + "step": 7440 + }, + { + "epoch": 1.61, + "learning_rate": 0.0001401014125316914, + "loss": 0.6528, + "step": 7460 + }, + { + "epoch": 1.61, + "learning_rate": 0.00013966678739587107, + "loss": 0.6655, + "step": 7480 + }, + { + "epoch": 1.62, + "learning_rate": 0.0001392321622600507, + "loss": 0.6609, + "step": 7500 + }, + { + "epoch": 1.62, + "learning_rate": 0.00013879753712423034, + "loss": 0.6528, + "step": 7520 + }, + { + "epoch": 1.63, + "learning_rate": 0.00013836291198841, + "loss": 0.6561, + "step": 7540 + }, + { + "epoch": 1.63, + "learning_rate": 0.00013792828685258964, + "loss": 0.6682, + "step": 7560 + }, + { + "epoch": 1.64, + "learning_rate": 0.00013749366171676928, + "loss": 0.6677, + "step": 7580 + }, + { + "epoch": 1.64, + "learning_rate": 0.00013705903658094894, + "loss": 0.6599, + "step": 7600 + }, + { + "epoch": 1.64, + "eval_loss": 0.6807426810264587, + "eval_runtime": 50.3308, + "eval_samples_per_second": 39.737, + "eval_steps_per_second": 0.636, + "step": 7600 + }, + { + "epoch": 1.64, + "learning_rate": 0.00013662441144512855, + "loss": 0.6525, + "step": 7620 + }, + { + "epoch": 1.65, + "learning_rate": 0.0001361897863093082, + "loss": 0.6574, + "step": 7640 + }, + { + "epoch": 1.65, + "learning_rate": 0.00013575516117348785, + "loss": 0.6516, + "step": 7660 + }, + { + "epoch": 1.66, + "learning_rate": 0.00013532053603766749, + "loss": 0.6533, + "step": 7680 + }, + { + "epoch": 1.66, + "learning_rate": 0.00013488591090184715, + "loss": 0.6577, + "step": 7700 + }, + { + "epoch": 1.67, + "learning_rate": 0.00013445128576602679, + "loss": 0.6592, + "step": 7720 + }, + { + "epoch": 1.67, + "learning_rate": 0.00013401666063020642, + "loss": 0.6585, + "step": 7740 + }, + { + "epoch": 1.67, + "learning_rate": 0.00013358203549438609, + "loss": 0.6607, + "step": 7760 + }, + { + "epoch": 1.68, + "learning_rate": 0.00013314741035856572, + "loss": 0.6617, + "step": 7780 + }, + { + "epoch": 1.68, + "learning_rate": 0.00013271278522274536, + "loss": 0.6443, + "step": 7800 + }, + { + "epoch": 1.68, + "eval_loss": 0.6800745725631714, + "eval_runtime": 50.165, + "eval_samples_per_second": 39.868, + "eval_steps_per_second": 0.638, + "step": 7800 + }, + { + "epoch": 1.69, + "learning_rate": 0.00013227816008692502, + "loss": 0.6587, + "step": 7820 + }, + { + "epoch": 1.69, + "learning_rate": 0.00013184353495110466, + "loss": 0.6613, + "step": 7840 + }, + { + "epoch": 1.7, + "learning_rate": 0.0001314089098152843, + "loss": 0.654, + "step": 7860 + }, + { + "epoch": 1.7, + "learning_rate": 0.00013097428467946396, + "loss": 0.6523, + "step": 7880 + }, + { + "epoch": 1.7, + "learning_rate": 0.0001305396595436436, + "loss": 0.6563, + "step": 7900 + }, + { + "epoch": 1.71, + "learning_rate": 0.00013010503440782323, + "loss": 0.6524, + "step": 7920 + }, + { + "epoch": 1.71, + "learning_rate": 0.0001296704092720029, + "loss": 0.6523, + "step": 7940 + }, + { + "epoch": 1.72, + "learning_rate": 0.00012923578413618253, + "loss": 0.6493, + "step": 7960 + }, + { + "epoch": 1.72, + "learning_rate": 0.00012880115900036217, + "loss": 0.6538, + "step": 7980 + }, + { + "epoch": 1.73, + "learning_rate": 0.00012836653386454183, + "loss": 0.6512, + "step": 8000 + }, + { + "epoch": 1.73, + "eval_loss": 0.6790341734886169, + "eval_runtime": 50.1317, + "eval_samples_per_second": 39.895, + "eval_steps_per_second": 0.638, + "step": 8000 + }, + { + "epoch": 1.73, + "learning_rate": 0.00012793190872872147, + "loss": 0.6562, + "step": 8020 + }, + { + "epoch": 1.73, + "learning_rate": 0.0001274972835929011, + "loss": 0.6556, + "step": 8040 + }, + { + "epoch": 1.74, + "learning_rate": 0.00012706265845708077, + "loss": 0.65, + "step": 8060 + }, + { + "epoch": 1.74, + "learning_rate": 0.0001266280333212604, + "loss": 0.661, + "step": 8080 + }, + { + "epoch": 1.75, + "learning_rate": 0.00012619340818544004, + "loss": 0.655, + "step": 8100 + }, + { + "epoch": 1.75, + "learning_rate": 0.0001257587830496197, + "loss": 0.6534, + "step": 8120 + }, + { + "epoch": 1.76, + "learning_rate": 0.00012532415791379934, + "loss": 0.6517, + "step": 8140 + }, + { + "epoch": 1.76, + "learning_rate": 0.00012488953277797898, + "loss": 0.6605, + "step": 8160 + }, + { + "epoch": 1.76, + "learning_rate": 0.00012445490764215864, + "loss": 0.6556, + "step": 8180 + }, + { + "epoch": 1.77, + "learning_rate": 0.00012402028250633828, + "loss": 0.6492, + "step": 8200 + }, + { + "epoch": 1.77, + "eval_loss": 0.6781870126724243, + "eval_runtime": 50.0809, + "eval_samples_per_second": 39.935, + "eval_steps_per_second": 0.639, + "step": 8200 + }, + { + "epoch": 1.77, + "learning_rate": 0.00012358565737051791, + "loss": 0.6541, + "step": 8220 + }, + { + "epoch": 1.78, + "learning_rate": 0.00012315103223469758, + "loss": 0.6517, + "step": 8240 + }, + { + "epoch": 1.78, + "learning_rate": 0.00012271640709887721, + "loss": 0.6483, + "step": 8260 + }, + { + "epoch": 1.79, + "learning_rate": 0.00012228178196305685, + "loss": 0.6619, + "step": 8280 + }, + { + "epoch": 1.79, + "learning_rate": 0.0001218471568272365, + "loss": 0.6556, + "step": 8300 + }, + { + "epoch": 1.8, + "learning_rate": 0.00012141253169141615, + "loss": 0.6471, + "step": 8320 + }, + { + "epoch": 1.8, + "learning_rate": 0.00012097790655559579, + "loss": 0.6611, + "step": 8340 + }, + { + "epoch": 1.8, + "learning_rate": 0.00012054328141977544, + "loss": 0.6506, + "step": 8360 + }, + { + "epoch": 1.81, + "learning_rate": 0.00012010865628395509, + "loss": 0.6611, + "step": 8380 + }, + { + "epoch": 1.81, + "learning_rate": 0.00011967403114813472, + "loss": 0.6557, + "step": 8400 + }, + { + "epoch": 1.81, + "eval_loss": 0.6776989102363586, + "eval_runtime": 50.1344, + "eval_samples_per_second": 39.893, + "eval_steps_per_second": 0.638, + "step": 8400 + }, + { + "epoch": 1.82, + "learning_rate": 0.00011923940601231437, + "loss": 0.6504, + "step": 8420 + }, + { + "epoch": 1.82, + "learning_rate": 0.00011880478087649402, + "loss": 0.6552, + "step": 8440 + }, + { + "epoch": 1.83, + "learning_rate": 0.00011839188699746468, + "loss": 0.641, + "step": 8460 + }, + { + "epoch": 1.83, + "learning_rate": 0.00011795726186164432, + "loss": 0.6535, + "step": 8480 + }, + { + "epoch": 1.83, + "learning_rate": 0.00011752263672582397, + "loss": 0.6568, + "step": 8500 + }, + { + "epoch": 1.84, + "learning_rate": 0.00011708801159000362, + "loss": 0.6621, + "step": 8520 + }, + { + "epoch": 1.84, + "learning_rate": 0.00011665338645418325, + "loss": 0.6607, + "step": 8540 + }, + { + "epoch": 1.85, + "learning_rate": 0.0001162187613183629, + "loss": 0.6516, + "step": 8560 + }, + { + "epoch": 1.85, + "learning_rate": 0.00011578413618254255, + "loss": 0.6497, + "step": 8580 + }, + { + "epoch": 1.86, + "learning_rate": 0.00011534951104672219, + "loss": 0.6559, + "step": 8600 + }, + { + "epoch": 1.86, + "eval_loss": 0.6773191094398499, + "eval_runtime": 50.1605, + "eval_samples_per_second": 39.872, + "eval_steps_per_second": 0.638, + "step": 8600 + }, + { + "epoch": 1.86, + "learning_rate": 0.00011491488591090184, + "loss": 0.6595, + "step": 8620 + }, + { + "epoch": 1.86, + "learning_rate": 0.00011448026077508149, + "loss": 0.6495, + "step": 8640 + }, + { + "epoch": 1.87, + "learning_rate": 0.00011404563563926113, + "loss": 0.6518, + "step": 8660 + }, + { + "epoch": 1.87, + "learning_rate": 0.00011361101050344078, + "loss": 0.6511, + "step": 8680 + }, + { + "epoch": 1.88, + "learning_rate": 0.00011317638536762043, + "loss": 0.6495, + "step": 8700 + }, + { + "epoch": 1.88, + "learning_rate": 0.00011274176023180006, + "loss": 0.6485, + "step": 8720 + }, + { + "epoch": 1.89, + "learning_rate": 0.00011230713509597971, + "loss": 0.6543, + "step": 8740 + }, + { + "epoch": 1.89, + "learning_rate": 0.00011187250996015936, + "loss": 0.6509, + "step": 8760 + }, + { + "epoch": 1.89, + "learning_rate": 0.000111437884824339, + "loss": 0.656, + "step": 8780 + }, + { + "epoch": 1.9, + "learning_rate": 0.00011100325968851865, + "loss": 0.6557, + "step": 8800 + }, + { + "epoch": 1.9, + "eval_loss": 0.6773696541786194, + "eval_runtime": 50.1296, + "eval_samples_per_second": 39.897, + "eval_steps_per_second": 0.638, + "step": 8800 + }, + { + "epoch": 1.9, + "learning_rate": 0.0001105686345526983, + "loss": 0.6509, + "step": 8820 + }, + { + "epoch": 1.91, + "learning_rate": 0.00011013400941687794, + "loss": 0.65, + "step": 8840 + }, + { + "epoch": 1.91, + "learning_rate": 0.00010969938428105759, + "loss": 0.6447, + "step": 8860 + }, + { + "epoch": 1.92, + "learning_rate": 0.00010926475914523724, + "loss": 0.6563, + "step": 8880 + }, + { + "epoch": 1.92, + "learning_rate": 0.00010883013400941687, + "loss": 0.6545, + "step": 8900 + }, + { + "epoch": 1.92, + "learning_rate": 0.00010839550887359652, + "loss": 0.6509, + "step": 8920 + }, + { + "epoch": 1.93, + "learning_rate": 0.00010796088373777617, + "loss": 0.6434, + "step": 8940 + }, + { + "epoch": 1.93, + "learning_rate": 0.00010752625860195581, + "loss": 0.6412, + "step": 8960 + }, + { + "epoch": 1.94, + "learning_rate": 0.00010709163346613546, + "loss": 0.6512, + "step": 8980 + }, + { + "epoch": 1.94, + "learning_rate": 0.00010665700833031508, + "loss": 0.6478, + "step": 9000 + }, + { + "epoch": 1.94, + "eval_loss": 0.6760911345481873, + "eval_runtime": 50.1795, + "eval_samples_per_second": 39.857, + "eval_steps_per_second": 0.638, + "step": 9000 + }, + { + "epoch": 1.95, + "learning_rate": 0.00010622238319449473, + "loss": 0.6545, + "step": 9020 + }, + { + "epoch": 1.95, + "learning_rate": 0.00010578775805867438, + "loss": 0.6468, + "step": 9040 + }, + { + "epoch": 1.95, + "learning_rate": 0.00010535313292285402, + "loss": 0.6527, + "step": 9060 + }, + { + "epoch": 1.96, + "learning_rate": 0.00010491850778703367, + "loss": 0.6621, + "step": 9080 + }, + { + "epoch": 1.96, + "learning_rate": 0.00010448388265121332, + "loss": 0.6496, + "step": 9100 + }, + { + "epoch": 1.97, + "learning_rate": 0.00010404925751539295, + "loss": 0.6512, + "step": 9120 + }, + { + "epoch": 1.97, + "learning_rate": 0.0001036146323795726, + "loss": 0.6491, + "step": 9140 + }, + { + "epoch": 1.98, + "learning_rate": 0.00010318000724375225, + "loss": 0.6482, + "step": 9160 + }, + { + "epoch": 1.98, + "learning_rate": 0.00010274538210793189, + "loss": 0.6456, + "step": 9180 + }, + { + "epoch": 1.98, + "learning_rate": 0.00010231075697211154, + "loss": 0.6458, + "step": 9200 + }, + { + "epoch": 1.98, + "eval_loss": 0.6748936772346497, + "eval_runtime": 50.1856, + "eval_samples_per_second": 39.852, + "eval_steps_per_second": 0.638, + "step": 9200 + }, + { + "epoch": 1.99, + "learning_rate": 0.00010187613183629119, + "loss": 0.6473, + "step": 9220 + }, + { + "epoch": 1.99, + "learning_rate": 0.00010144150670047083, + "loss": 0.6496, + "step": 9240 + }, + { + "epoch": 2.0, + "learning_rate": 0.00010100688156465048, + "loss": 0.6566, + "step": 9260 + }, + { + "epoch": 2.0, + "learning_rate": 0.00010057225642883013, + "loss": 0.6475, + "step": 9280 + }, + { + "epoch": 2.01, + "learning_rate": 0.00010013763129300976, + "loss": 0.6536, + "step": 9300 + }, + { + "epoch": 2.01, + "learning_rate": 9.970300615718941e-05, + "loss": 0.646, + "step": 9320 + }, + { + "epoch": 2.02, + "learning_rate": 9.926838102136906e-05, + "loss": 0.6503, + "step": 9340 + }, + { + "epoch": 2.02, + "learning_rate": 9.88337558855487e-05, + "loss": 0.6527, + "step": 9360 + }, + { + "epoch": 2.02, + "learning_rate": 9.839913074972835e-05, + "loss": 0.6514, + "step": 9380 + }, + { + "epoch": 2.03, + "learning_rate": 9.7964505613908e-05, + "loss": 0.6548, + "step": 9400 + }, + { + "epoch": 2.03, + "eval_loss": 0.6744834780693054, + "eval_runtime": 50.1696, + "eval_samples_per_second": 39.865, + "eval_steps_per_second": 0.638, + "step": 9400 + }, + { + "epoch": 2.03, + "learning_rate": 9.752988047808764e-05, + "loss": 0.6483, + "step": 9420 + }, + { + "epoch": 2.04, + "learning_rate": 9.709525534226729e-05, + "loss": 0.6522, + "step": 9440 + }, + { + "epoch": 2.04, + "learning_rate": 9.666063020644694e-05, + "loss": 0.6538, + "step": 9460 + }, + { + "epoch": 2.05, + "learning_rate": 9.622600507062657e-05, + "loss": 0.6449, + "step": 9480 + }, + { + "epoch": 2.05, + "learning_rate": 9.579137993480622e-05, + "loss": 0.6451, + "step": 9500 + }, + { + "epoch": 2.05, + "learning_rate": 9.535675479898587e-05, + "loss": 0.6355, + "step": 9520 + }, + { + "epoch": 2.06, + "learning_rate": 9.492212966316551e-05, + "loss": 0.6494, + "step": 9540 + }, + { + "epoch": 2.06, + "learning_rate": 9.448750452734516e-05, + "loss": 0.6435, + "step": 9560 + }, + { + "epoch": 2.07, + "learning_rate": 9.405287939152481e-05, + "loss": 0.651, + "step": 9580 + }, + { + "epoch": 2.07, + "learning_rate": 9.361825425570445e-05, + "loss": 0.6493, + "step": 9600 + }, + { + "epoch": 2.07, + "eval_loss": 0.674017071723938, + "eval_runtime": 50.1402, + "eval_samples_per_second": 39.888, + "eval_steps_per_second": 0.638, + "step": 9600 + }, + { + "epoch": 2.08, + "learning_rate": 9.31836291198841e-05, + "loss": 0.6469, + "step": 9620 + }, + { + "epoch": 2.08, + "learning_rate": 9.274900398406375e-05, + "loss": 0.65, + "step": 9640 + }, + { + "epoch": 2.08, + "learning_rate": 9.231437884824338e-05, + "loss": 0.6536, + "step": 9660 + }, + { + "epoch": 2.09, + "learning_rate": 9.187975371242303e-05, + "loss": 0.6488, + "step": 9680 + }, + { + "epoch": 2.09, + "learning_rate": 9.144512857660268e-05, + "loss": 0.6391, + "step": 9700 + }, + { + "epoch": 2.1, + "learning_rate": 9.101050344078232e-05, + "loss": 0.644, + "step": 9720 + }, + { + "epoch": 2.1, + "learning_rate": 9.057587830496197e-05, + "loss": 0.6507, + "step": 9740 + }, + { + "epoch": 2.11, + "learning_rate": 9.014125316914162e-05, + "loss": 0.6404, + "step": 9760 + }, + { + "epoch": 2.11, + "learning_rate": 8.970662803332126e-05, + "loss": 0.6509, + "step": 9780 + }, + { + "epoch": 2.11, + "learning_rate": 8.92720028975009e-05, + "loss": 0.6435, + "step": 9800 + }, + { + "epoch": 2.11, + "eval_loss": 0.6735255122184753, + "eval_runtime": 50.1703, + "eval_samples_per_second": 39.864, + "eval_steps_per_second": 0.638, + "step": 9800 + }, + { + "epoch": 2.12, + "learning_rate": 8.883737776168056e-05, + "loss": 0.6374, + "step": 9820 + }, + { + "epoch": 2.12, + "learning_rate": 8.840275262586019e-05, + "loss": 0.6445, + "step": 9840 + }, + { + "epoch": 2.13, + "learning_rate": 8.796812749003983e-05, + "loss": 0.6495, + "step": 9860 + }, + { + "epoch": 2.13, + "learning_rate": 8.753350235421946e-05, + "loss": 0.6482, + "step": 9880 + }, + { + "epoch": 2.14, + "learning_rate": 8.709887721839911e-05, + "loss": 0.6441, + "step": 9900 + }, + { + "epoch": 2.14, + "learning_rate": 8.666425208257877e-05, + "loss": 0.6525, + "step": 9920 + }, + { + "epoch": 2.14, + "learning_rate": 8.62296269467584e-05, + "loss": 0.6453, + "step": 9940 + }, + { + "epoch": 2.15, + "learning_rate": 8.579500181093805e-05, + "loss": 0.6498, + "step": 9960 + }, + { + "epoch": 2.15, + "learning_rate": 8.53603766751177e-05, + "loss": 0.6471, + "step": 9980 + }, + { + "epoch": 2.16, + "learning_rate": 8.492575153929734e-05, + "loss": 0.6419, + "step": 10000 + }, + { + "epoch": 2.16, + "eval_loss": 0.6730753779411316, + "eval_runtime": 50.1885, + "eval_samples_per_second": 39.85, + "eval_steps_per_second": 0.638, + "step": 10000 + }, + { + "epoch": 2.16, + "learning_rate": 8.449112640347699e-05, + "loss": 0.6447, + "step": 10020 + }, + { + "epoch": 2.17, + "learning_rate": 8.405650126765664e-05, + "loss": 0.6444, + "step": 10040 + }, + { + "epoch": 2.17, + "learning_rate": 8.362187613183627e-05, + "loss": 0.6393, + "step": 10060 + }, + { + "epoch": 2.17, + "learning_rate": 8.318725099601592e-05, + "loss": 0.6464, + "step": 10080 + }, + { + "epoch": 2.18, + "learning_rate": 8.275262586019557e-05, + "loss": 0.6458, + "step": 10100 + }, + { + "epoch": 2.18, + "learning_rate": 8.231800072437521e-05, + "loss": 0.6402, + "step": 10120 + }, + { + "epoch": 2.19, + "learning_rate": 8.188337558855486e-05, + "loss": 0.6409, + "step": 10140 + }, + { + "epoch": 2.19, + "learning_rate": 8.144875045273451e-05, + "loss": 0.6512, + "step": 10160 + }, + { + "epoch": 2.2, + "learning_rate": 8.101412531691415e-05, + "loss": 0.6498, + "step": 10180 + }, + { + "epoch": 2.2, + "learning_rate": 8.05795001810938e-05, + "loss": 0.6393, + "step": 10200 + }, + { + "epoch": 2.2, + "eval_loss": 0.6726437211036682, + "eval_runtime": 50.1492, + "eval_samples_per_second": 39.881, + "eval_steps_per_second": 0.638, + "step": 10200 + }, + { + "epoch": 2.2, + "learning_rate": 8.014487504527345e-05, + "loss": 0.6458, + "step": 10220 + }, + { + "epoch": 2.21, + "learning_rate": 7.971024990945308e-05, + "loss": 0.6466, + "step": 10240 + }, + { + "epoch": 2.21, + "learning_rate": 7.927562477363273e-05, + "loss": 0.644, + "step": 10260 + }, + { + "epoch": 2.22, + "learning_rate": 7.884099963781238e-05, + "loss": 0.6467, + "step": 10280 + }, + { + "epoch": 2.22, + "learning_rate": 7.840637450199202e-05, + "loss": 0.6436, + "step": 10300 + }, + { + "epoch": 2.23, + "learning_rate": 7.797174936617167e-05, + "loss": 0.6422, + "step": 10320 + }, + { + "epoch": 2.23, + "learning_rate": 7.753712423035132e-05, + "loss": 0.645, + "step": 10340 + }, + { + "epoch": 2.24, + "learning_rate": 7.710249909453096e-05, + "loss": 0.6423, + "step": 10360 + }, + { + "epoch": 2.24, + "learning_rate": 7.666787395871061e-05, + "loss": 0.6557, + "step": 10380 + }, + { + "epoch": 2.24, + "learning_rate": 7.623324882289026e-05, + "loss": 0.646, + "step": 10400 + }, + { + "epoch": 2.24, + "eval_loss": 0.6725419759750366, + "eval_runtime": 50.1975, + "eval_samples_per_second": 39.843, + "eval_steps_per_second": 0.637, + "step": 10400 + }, + { + "epoch": 2.25, + "learning_rate": 7.57986236870699e-05, + "loss": 0.6503, + "step": 10420 + }, + { + "epoch": 2.25, + "learning_rate": 7.536399855124954e-05, + "loss": 0.6428, + "step": 10440 + }, + { + "epoch": 2.26, + "learning_rate": 7.49293734154292e-05, + "loss": 0.6438, + "step": 10460 + }, + { + "epoch": 2.26, + "learning_rate": 7.449474827960883e-05, + "loss": 0.6427, + "step": 10480 + }, + { + "epoch": 2.27, + "learning_rate": 7.406012314378847e-05, + "loss": 0.6458, + "step": 10500 + }, + { + "epoch": 2.27, + "learning_rate": 7.362549800796812e-05, + "loss": 0.6423, + "step": 10520 + }, + { + "epoch": 2.27, + "learning_rate": 7.319087287214777e-05, + "loss": 0.6466, + "step": 10540 + }, + { + "epoch": 2.28, + "learning_rate": 7.27562477363274e-05, + "loss": 0.6394, + "step": 10560 + }, + { + "epoch": 2.28, + "learning_rate": 7.232162260050705e-05, + "loss": 0.6362, + "step": 10580 + }, + { + "epoch": 2.29, + "learning_rate": 7.18869974646867e-05, + "loss": 0.6399, + "step": 10600 + }, + { + "epoch": 2.29, + "eval_loss": 0.6719211935997009, + "eval_runtime": 50.1808, + "eval_samples_per_second": 39.856, + "eval_steps_per_second": 0.638, + "step": 10600 + }, + { + "epoch": 2.29, + "learning_rate": 7.145237232886634e-05, + "loss": 0.6378, + "step": 10620 + }, + { + "epoch": 2.3, + "learning_rate": 7.101774719304599e-05, + "loss": 0.634, + "step": 10640 + }, + { + "epoch": 2.3, + "learning_rate": 7.058312205722564e-05, + "loss": 0.6374, + "step": 10660 + }, + { + "epoch": 2.3, + "learning_rate": 7.014849692140528e-05, + "loss": 0.6464, + "step": 10680 + }, + { + "epoch": 2.31, + "learning_rate": 6.971387178558493e-05, + "loss": 0.643, + "step": 10700 + }, + { + "epoch": 2.31, + "learning_rate": 6.927924664976458e-05, + "loss": 0.6384, + "step": 10720 + }, + { + "epoch": 2.32, + "learning_rate": 6.884462151394421e-05, + "loss": 0.6451, + "step": 10740 + }, + { + "epoch": 2.32, + "learning_rate": 6.840999637812386e-05, + "loss": 0.6465, + "step": 10760 + }, + { + "epoch": 2.33, + "learning_rate": 6.799710249909452e-05, + "loss": 0.646, + "step": 10780 + }, + { + "epoch": 2.33, + "learning_rate": 6.756247736327417e-05, + "loss": 0.6525, + "step": 10800 + }, + { + "epoch": 2.33, + "eval_loss": 0.6714358925819397, + "eval_runtime": 50.1294, + "eval_samples_per_second": 39.897, + "eval_steps_per_second": 0.638, + "step": 10800 + }, + { + "epoch": 2.33, + "learning_rate": 6.712785222745382e-05, + "loss": 0.6423, + "step": 10820 + }, + { + "epoch": 2.34, + "learning_rate": 6.669322709163345e-05, + "loss": 0.6449, + "step": 10840 + }, + { + "epoch": 2.34, + "learning_rate": 6.62586019558131e-05, + "loss": 0.6325, + "step": 10860 + }, + { + "epoch": 2.35, + "learning_rate": 6.582397681999275e-05, + "loss": 0.6558, + "step": 10880 + }, + { + "epoch": 2.35, + "learning_rate": 6.538935168417239e-05, + "loss": 0.6419, + "step": 10900 + }, + { + "epoch": 2.36, + "learning_rate": 6.495472654835204e-05, + "loss": 0.6466, + "step": 10920 + }, + { + "epoch": 2.36, + "learning_rate": 6.452010141253169e-05, + "loss": 0.6357, + "step": 10940 + }, + { + "epoch": 2.36, + "learning_rate": 6.408547627671133e-05, + "loss": 0.6366, + "step": 10960 + }, + { + "epoch": 2.37, + "learning_rate": 6.365085114089098e-05, + "loss": 0.6466, + "step": 10980 + }, + { + "epoch": 2.37, + "learning_rate": 6.321622600507063e-05, + "loss": 0.6542, + "step": 11000 + }, + { + "epoch": 2.37, + "eval_loss": 0.6710445880889893, + "eval_runtime": 50.2479, + "eval_samples_per_second": 39.803, + "eval_steps_per_second": 0.637, + "step": 11000 + }, + { + "epoch": 2.38, + "learning_rate": 6.278160086925026e-05, + "loss": 0.6481, + "step": 11020 + }, + { + "epoch": 2.38, + "learning_rate": 6.23469757334299e-05, + "loss": 0.6425, + "step": 11040 + }, + { + "epoch": 2.39, + "learning_rate": 6.191235059760955e-05, + "loss": 0.6439, + "step": 11060 + }, + { + "epoch": 2.39, + "learning_rate": 6.14777254617892e-05, + "loss": 0.6424, + "step": 11080 + }, + { + "epoch": 2.39, + "learning_rate": 6.104310032596884e-05, + "loss": 0.6404, + "step": 11100 + }, + { + "epoch": 2.4, + "learning_rate": 6.060847519014849e-05, + "loss": 0.6387, + "step": 11120 + }, + { + "epoch": 2.4, + "learning_rate": 6.017385005432814e-05, + "loss": 0.6462, + "step": 11140 + }, + { + "epoch": 2.41, + "learning_rate": 5.973922491850778e-05, + "loss": 0.6431, + "step": 11160 + }, + { + "epoch": 2.41, + "learning_rate": 5.9304599782687424e-05, + "loss": 0.638, + "step": 11180 + }, + { + "epoch": 2.42, + "learning_rate": 5.8869974646867074e-05, + "loss": 0.6344, + "step": 11200 + }, + { + "epoch": 2.42, + "eval_loss": 0.6704220771789551, + "eval_runtime": 50.1558, + "eval_samples_per_second": 39.876, + "eval_steps_per_second": 0.638, + "step": 11200 + }, + { + "epoch": 2.42, + "learning_rate": 5.843534951104672e-05, + "loss": 0.6448, + "step": 11220 + }, + { + "epoch": 2.43, + "learning_rate": 5.800072437522636e-05, + "loss": 0.6449, + "step": 11240 + }, + { + "epoch": 2.43, + "learning_rate": 5.756609923940601e-05, + "loss": 0.6399, + "step": 11260 + }, + { + "epoch": 2.43, + "learning_rate": 5.7131474103585654e-05, + "loss": 0.638, + "step": 11280 + }, + { + "epoch": 2.44, + "learning_rate": 5.66968489677653e-05, + "loss": 0.6418, + "step": 11300 + }, + { + "epoch": 2.44, + "learning_rate": 5.626222383194495e-05, + "loss": 0.6482, + "step": 11320 + }, + { + "epoch": 2.45, + "learning_rate": 5.582759869612459e-05, + "loss": 0.6392, + "step": 11340 + }, + { + "epoch": 2.45, + "learning_rate": 5.5392973560304233e-05, + "loss": 0.6363, + "step": 11360 + }, + { + "epoch": 2.46, + "learning_rate": 5.4958348424483883e-05, + "loss": 0.6503, + "step": 11380 + }, + { + "epoch": 2.46, + "learning_rate": 5.452372328866353e-05, + "loss": 0.6453, + "step": 11400 + }, + { + "epoch": 2.46, + "eval_loss": 0.670009195804596, + "eval_runtime": 50.155, + "eval_samples_per_second": 39.876, + "eval_steps_per_second": 0.638, + "step": 11400 + }, + { + "epoch": 2.46, + "learning_rate": 5.408909815284317e-05, + "loss": 0.6384, + "step": 11420 + }, + { + "epoch": 2.47, + "learning_rate": 5.365447301702282e-05, + "loss": 0.6449, + "step": 11440 + }, + { + "epoch": 2.47, + "learning_rate": 5.3219847881202456e-05, + "loss": 0.6406, + "step": 11460 + }, + { + "epoch": 2.48, + "learning_rate": 5.27852227453821e-05, + "loss": 0.6363, + "step": 11480 + }, + { + "epoch": 2.48, + "learning_rate": 5.235059760956174e-05, + "loss": 0.6482, + "step": 11500 + }, + { + "epoch": 2.49, + "learning_rate": 5.191597247374139e-05, + "loss": 0.6503, + "step": 11520 + }, + { + "epoch": 2.49, + "learning_rate": 5.1481347337921036e-05, + "loss": 0.6479, + "step": 11540 + }, + { + "epoch": 2.49, + "learning_rate": 5.10684534588917e-05, + "loss": 0.6437, + "step": 11560 + }, + { + "epoch": 2.5, + "learning_rate": 5.063382832307134e-05, + "loss": 0.6398, + "step": 11580 + }, + { + "epoch": 2.5, + "learning_rate": 5.0199203187250985e-05, + "loss": 0.6456, + "step": 11600 + }, + { + "epoch": 2.5, + "eval_loss": 0.6702134013175964, + "eval_runtime": 50.1834, + "eval_samples_per_second": 39.854, + "eval_steps_per_second": 0.638, + "step": 11600 + }, + { + "epoch": 2.51, + "learning_rate": 4.9764578051430635e-05, + "loss": 0.646, + "step": 11620 + }, + { + "epoch": 2.51, + "learning_rate": 4.932995291561028e-05, + "loss": 0.6375, + "step": 11640 + }, + { + "epoch": 2.52, + "learning_rate": 4.889532777978992e-05, + "loss": 0.6393, + "step": 11660 + }, + { + "epoch": 2.52, + "learning_rate": 4.846070264396957e-05, + "loss": 0.638, + "step": 11680 + }, + { + "epoch": 2.52, + "learning_rate": 4.8026077508149215e-05, + "loss": 0.6411, + "step": 11700 + }, + { + "epoch": 2.53, + "learning_rate": 4.759145237232886e-05, + "loss": 0.6467, + "step": 11720 + }, + { + "epoch": 2.53, + "learning_rate": 4.715682723650851e-05, + "loss": 0.6369, + "step": 11740 + }, + { + "epoch": 2.54, + "learning_rate": 4.672220210068815e-05, + "loss": 0.637, + "step": 11760 + }, + { + "epoch": 2.54, + "learning_rate": 4.6287576964867795e-05, + "loss": 0.6486, + "step": 11780 + }, + { + "epoch": 2.55, + "learning_rate": 4.5852951829047445e-05, + "loss": 0.637, + "step": 11800 + }, + { + "epoch": 2.55, + "eval_loss": 0.6698750257492065, + "eval_runtime": 50.1539, + "eval_samples_per_second": 39.877, + "eval_steps_per_second": 0.638, + "step": 11800 + }, + { + "epoch": 2.55, + "learning_rate": 4.541832669322709e-05, + "loss": 0.639, + "step": 11820 + }, + { + "epoch": 2.55, + "learning_rate": 4.498370155740673e-05, + "loss": 0.6366, + "step": 11840 + }, + { + "epoch": 2.56, + "learning_rate": 4.454907642158638e-05, + "loss": 0.6409, + "step": 11860 + }, + { + "epoch": 2.56, + "learning_rate": 4.4114451285766025e-05, + "loss": 0.6394, + "step": 11880 + }, + { + "epoch": 2.57, + "learning_rate": 4.367982614994567e-05, + "loss": 0.6351, + "step": 11900 + }, + { + "epoch": 2.57, + "learning_rate": 4.324520101412532e-05, + "loss": 0.6391, + "step": 11920 + }, + { + "epoch": 2.58, + "learning_rate": 4.281057587830496e-05, + "loss": 0.6267, + "step": 11940 + }, + { + "epoch": 2.58, + "learning_rate": 4.2375950742484604e-05, + "loss": 0.6461, + "step": 11960 + }, + { + "epoch": 2.58, + "learning_rate": 4.194132560666425e-05, + "loss": 0.6483, + "step": 11980 + }, + { + "epoch": 2.59, + "learning_rate": 4.150670047084389e-05, + "loss": 0.6461, + "step": 12000 + }, + { + "epoch": 2.59, + "eval_loss": 0.6692882180213928, + "eval_runtime": 50.1673, + "eval_samples_per_second": 39.867, + "eval_steps_per_second": 0.638, + "step": 12000 + }, + { + "epoch": 2.59, + "learning_rate": 4.1072075335023534e-05, + "loss": 0.6429, + "step": 12020 + }, + { + "epoch": 2.6, + "learning_rate": 4.0637450199203184e-05, + "loss": 0.6416, + "step": 12040 + }, + { + "epoch": 2.6, + "learning_rate": 4.020282506338283e-05, + "loss": 0.6356, + "step": 12060 + }, + { + "epoch": 2.61, + "learning_rate": 3.976819992756247e-05, + "loss": 0.6402, + "step": 12080 + }, + { + "epoch": 2.61, + "learning_rate": 3.933357479174212e-05, + "loss": 0.6395, + "step": 12100 + }, + { + "epoch": 2.61, + "learning_rate": 3.8898949655921764e-05, + "loss": 0.6432, + "step": 12120 + }, + { + "epoch": 2.62, + "learning_rate": 3.846432452010141e-05, + "loss": 0.6386, + "step": 12140 + }, + { + "epoch": 2.62, + "learning_rate": 3.802969938428106e-05, + "loss": 0.6396, + "step": 12160 + }, + { + "epoch": 2.63, + "learning_rate": 3.75950742484607e-05, + "loss": 0.6423, + "step": 12180 + }, + { + "epoch": 2.63, + "learning_rate": 3.7160449112640344e-05, + "loss": 0.649, + "step": 12200 + }, + { + "epoch": 2.63, + "eval_loss": 0.6691960096359253, + "eval_runtime": 50.1649, + "eval_samples_per_second": 39.869, + "eval_steps_per_second": 0.638, + "step": 12200 + }, + { + "epoch": 2.64, + "learning_rate": 3.672582397681999e-05, + "loss": 0.6547, + "step": 12220 + }, + { + "epoch": 2.64, + "learning_rate": 3.629119884099964e-05, + "loss": 0.642, + "step": 12240 + }, + { + "epoch": 2.65, + "learning_rate": 3.585657370517928e-05, + "loss": 0.634, + "step": 12260 + }, + { + "epoch": 2.65, + "learning_rate": 3.542194856935892e-05, + "loss": 0.6447, + "step": 12280 + }, + { + "epoch": 2.65, + "learning_rate": 3.498732343353857e-05, + "loss": 0.6285, + "step": 12300 + }, + { + "epoch": 2.66, + "learning_rate": 3.455269829771822e-05, + "loss": 0.6436, + "step": 12320 + }, + { + "epoch": 2.66, + "learning_rate": 3.411807316189786e-05, + "loss": 0.6349, + "step": 12340 + }, + { + "epoch": 2.67, + "learning_rate": 3.36834480260775e-05, + "loss": 0.6425, + "step": 12360 + }, + { + "epoch": 2.67, + "learning_rate": 3.324882289025715e-05, + "loss": 0.6393, + "step": 12380 + }, + { + "epoch": 2.68, + "learning_rate": 3.2814197754436796e-05, + "loss": 0.6367, + "step": 12400 + }, + { + "epoch": 2.68, + "eval_loss": 0.6687243580818176, + "eval_runtime": 50.3508, + "eval_samples_per_second": 39.721, + "eval_steps_per_second": 0.636, + "step": 12400 + }, + { + "epoch": 2.68, + "learning_rate": 3.237957261861644e-05, + "loss": 0.6386, + "step": 12420 + }, + { + "epoch": 2.68, + "learning_rate": 3.194494748279609e-05, + "loss": 0.6526, + "step": 12440 + }, + { + "epoch": 2.69, + "learning_rate": 3.151032234697573e-05, + "loss": 0.6357, + "step": 12460 + }, + { + "epoch": 2.69, + "learning_rate": 3.1075697211155376e-05, + "loss": 0.6353, + "step": 12480 + }, + { + "epoch": 2.7, + "learning_rate": 3.0641072075335026e-05, + "loss": 0.6449, + "step": 12500 + }, + { + "epoch": 2.7, + "learning_rate": 3.0206446939514663e-05, + "loss": 0.6425, + "step": 12520 + }, + { + "epoch": 2.71, + "learning_rate": 2.977182180369431e-05, + "loss": 0.6374, + "step": 12540 + }, + { + "epoch": 2.71, + "learning_rate": 2.9337196667873956e-05, + "loss": 0.6324, + "step": 12560 + }, + { + "epoch": 2.71, + "learning_rate": 2.89025715320536e-05, + "loss": 0.6502, + "step": 12580 + }, + { + "epoch": 2.72, + "learning_rate": 2.8467946396233246e-05, + "loss": 0.637, + "step": 12600 + }, + { + "epoch": 2.72, + "eval_loss": 0.6683821082115173, + "eval_runtime": 50.2054, + "eval_samples_per_second": 39.836, + "eval_steps_per_second": 0.637, + "step": 12600 + }, + { + "epoch": 2.72, + "learning_rate": 2.8033321260412892e-05, + "loss": 0.647, + "step": 12620 + }, + { + "epoch": 2.73, + "learning_rate": 2.7598696124592536e-05, + "loss": 0.632, + "step": 12640 + }, + { + "epoch": 2.73, + "learning_rate": 2.7164070988772182e-05, + "loss": 0.6411, + "step": 12660 + }, + { + "epoch": 2.74, + "learning_rate": 2.672944585295183e-05, + "loss": 0.632, + "step": 12680 + }, + { + "epoch": 2.74, + "learning_rate": 2.6294820717131472e-05, + "loss": 0.6389, + "step": 12700 + }, + { + "epoch": 2.74, + "learning_rate": 2.586019558131112e-05, + "loss": 0.6337, + "step": 12720 + }, + { + "epoch": 2.75, + "learning_rate": 2.542557044549076e-05, + "loss": 0.6439, + "step": 12740 + }, + { + "epoch": 2.75, + "learning_rate": 2.4990945309670405e-05, + "loss": 0.6364, + "step": 12760 + }, + { + "epoch": 2.76, + "learning_rate": 2.4556320173850052e-05, + "loss": 0.6402, + "step": 12780 + }, + { + "epoch": 2.76, + "learning_rate": 2.4121695038029695e-05, + "loss": 0.6376, + "step": 12800 + }, + { + "epoch": 2.76, + "eval_loss": 0.6680713295936584, + "eval_runtime": 50.1757, + "eval_samples_per_second": 39.86, + "eval_steps_per_second": 0.638, + "step": 12800 + }, + { + "epoch": 2.77, + "learning_rate": 2.3687069902209342e-05, + "loss": 0.6316, + "step": 12820 + }, + { + "epoch": 2.77, + "learning_rate": 2.325244476638899e-05, + "loss": 0.6393, + "step": 12840 + }, + { + "epoch": 2.77, + "learning_rate": 2.281781963056863e-05, + "loss": 0.6372, + "step": 12860 + }, + { + "epoch": 2.78, + "learning_rate": 2.2383194494748278e-05, + "loss": 0.6466, + "step": 12880 + }, + { + "epoch": 2.78, + "learning_rate": 2.1948569358927925e-05, + "loss": 0.6392, + "step": 12900 + }, + { + "epoch": 2.79, + "learning_rate": 2.1513944223107568e-05, + "loss": 0.6389, + "step": 12920 + }, + { + "epoch": 2.79, + "learning_rate": 2.107931908728721e-05, + "loss": 0.64, + "step": 12940 + }, + { + "epoch": 2.8, + "learning_rate": 2.0644693951466858e-05, + "loss": 0.6362, + "step": 12960 + }, + { + "epoch": 2.8, + "learning_rate": 2.02100688156465e-05, + "loss": 0.6364, + "step": 12980 + }, + { + "epoch": 2.8, + "learning_rate": 1.9775443679826148e-05, + "loss": 0.6372, + "step": 13000 + }, + { + "epoch": 2.8, + "eval_loss": 0.6680414080619812, + "eval_runtime": 50.2211, + "eval_samples_per_second": 39.824, + "eval_steps_per_second": 0.637, + "step": 13000 + }, + { + "epoch": 2.81, + "learning_rate": 1.9340818544005794e-05, + "loss": 0.6336, + "step": 13020 + }, + { + "epoch": 2.81, + "learning_rate": 1.8906193408185438e-05, + "loss": 0.6348, + "step": 13040 + }, + { + "epoch": 2.82, + "learning_rate": 1.8471568272365084e-05, + "loss": 0.6338, + "step": 13060 + }, + { + "epoch": 2.82, + "learning_rate": 1.8036943136544728e-05, + "loss": 0.6396, + "step": 13080 + }, + { + "epoch": 2.83, + "learning_rate": 1.7602318000724374e-05, + "loss": 0.641, + "step": 13100 + }, + { + "epoch": 2.83, + "learning_rate": 1.7167692864904017e-05, + "loss": 0.6369, + "step": 13120 + }, + { + "epoch": 2.83, + "learning_rate": 1.6733067729083664e-05, + "loss": 0.6345, + "step": 13140 + }, + { + "epoch": 2.84, + "learning_rate": 1.629844259326331e-05, + "loss": 0.649, + "step": 13160 + }, + { + "epoch": 2.84, + "learning_rate": 1.5863817457442954e-05, + "loss": 0.6409, + "step": 13180 + }, + { + "epoch": 2.85, + "learning_rate": 1.54291923216226e-05, + "loss": 0.63, + "step": 13200 + }, + { + "epoch": 2.85, + "eval_loss": 0.6678950190544128, + "eval_runtime": 50.1908, + "eval_samples_per_second": 39.848, + "eval_steps_per_second": 0.638, + "step": 13200 + }, + { + "epoch": 2.85, + "learning_rate": 1.4994567185802244e-05, + "loss": 0.6428, + "step": 13220 + }, + { + "epoch": 2.86, + "learning_rate": 1.4559942049981889e-05, + "loss": 0.645, + "step": 13240 + }, + { + "epoch": 2.86, + "learning_rate": 1.4125316914161534e-05, + "loss": 0.6434, + "step": 13260 + }, + { + "epoch": 2.87, + "learning_rate": 1.369069177834118e-05, + "loss": 0.6462, + "step": 13280 + }, + { + "epoch": 2.87, + "learning_rate": 1.3256066642520825e-05, + "loss": 0.6387, + "step": 13300 + }, + { + "epoch": 2.87, + "learning_rate": 1.2821441506700468e-05, + "loss": 0.6311, + "step": 13320 + }, + { + "epoch": 2.88, + "learning_rate": 1.2386816370880113e-05, + "loss": 0.6446, + "step": 13340 + }, + { + "epoch": 2.88, + "learning_rate": 1.195219123505976e-05, + "loss": 0.6426, + "step": 13360 + }, + { + "epoch": 2.89, + "learning_rate": 1.1517566099239405e-05, + "loss": 0.6369, + "step": 13380 + }, + { + "epoch": 2.89, + "learning_rate": 1.108294096341905e-05, + "loss": 0.6467, + "step": 13400 + }, + { + "epoch": 2.89, + "eval_loss": 0.6676326990127563, + "eval_runtime": 50.1589, + "eval_samples_per_second": 39.873, + "eval_steps_per_second": 0.638, + "step": 13400 + }, + { + "epoch": 2.9, + "learning_rate": 1.0648315827598697e-05, + "loss": 0.6347, + "step": 13420 + }, + { + "epoch": 2.9, + "learning_rate": 1.021369069177834e-05, + "loss": 0.6364, + "step": 13440 + }, + { + "epoch": 2.9, + "learning_rate": 9.779065555957985e-06, + "loss": 0.6309, + "step": 13460 + }, + { + "epoch": 2.91, + "learning_rate": 9.34444042013763e-06, + "loss": 0.6407, + "step": 13480 + }, + { + "epoch": 2.91, + "learning_rate": 8.909815284317276e-06, + "loss": 0.6389, + "step": 13500 + }, + { + "epoch": 2.92, + "learning_rate": 8.475190148496921e-06, + "loss": 0.6378, + "step": 13520 + }, + { + "epoch": 2.92, + "learning_rate": 8.040565012676566e-06, + "loss": 0.6359, + "step": 13540 + }, + { + "epoch": 2.93, + "learning_rate": 7.60593987685621e-06, + "loss": 0.6282, + "step": 13560 + }, + { + "epoch": 2.93, + "learning_rate": 7.171314741035856e-06, + "loss": 0.6409, + "step": 13580 + }, + { + "epoch": 2.93, + "learning_rate": 6.736689605215501e-06, + "loss": 0.6339, + "step": 13600 + }, + { + "epoch": 2.93, + "eval_loss": 0.6675477027893066, + "eval_runtime": 50.3638, + "eval_samples_per_second": 39.711, + "eval_steps_per_second": 0.635, + "step": 13600 + } + ], + "max_steps": 13905, + "num_train_epochs": 3, + "total_flos": 1.7288080781943505e+20, + "trial_name": null, + "trial_params": null +} diff --git a/adapters/saved-alpaca-belle30b/checkpoint-13600/training_args.bin b/adapters/saved-alpaca-belle30b/checkpoint-13600/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..12b801c69c900b96b8117a2e6bdeacc32be225f4 --- /dev/null +++ b/adapters/saved-alpaca-belle30b/checkpoint-13600/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0e26d4a9526d1384fcaa3dc3df4f56f03c822ab57b4abed652b7156aebfaccc3 +size 3643 diff --git a/adapters/saved-alpaca-belle30b/checkpoint-13800/optimizer.pt b/adapters/saved-alpaca-belle30b/checkpoint-13800/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..65d5d0b4b61816b9a4d47efca86e910691027508 --- /dev/null +++ b/adapters/saved-alpaca-belle30b/checkpoint-13800/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9a00789d3ee0613501e5714fb050734282b431fb616f653ec99228daa079208f +size 102377669 diff --git a/adapters/saved-alpaca-belle30b/checkpoint-13800/pytorch_model.bin b/adapters/saved-alpaca-belle30b/checkpoint-13800/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..767fae8b1b34ae05a37cff847aeef8e86f2b5e90 --- /dev/null +++ b/adapters/saved-alpaca-belle30b/checkpoint-13800/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:48987c243d171a73cbc344a320baf0cd1f7c0941b03478ec04c62949c0294f54 +size 51204365 diff --git a/adapters/saved-alpaca-belle30b/checkpoint-13800/rng_state_0.pth b/adapters/saved-alpaca-belle30b/checkpoint-13800/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..74befe1291a1d95eb1c8953f8d65ca222c4fb194 --- /dev/null +++ b/adapters/saved-alpaca-belle30b/checkpoint-13800/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b184905375b02f82efecc0a5d431ac5cd3d7631a0a2149909e0251cd8c6fcff0 +size 14583 diff --git a/adapters/saved-alpaca-belle30b/checkpoint-13800/rng_state_1.pth b/adapters/saved-alpaca-belle30b/checkpoint-13800/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..1c194eb40899ab4d0337db936583a2d72a4b8377 --- /dev/null +++ b/adapters/saved-alpaca-belle30b/checkpoint-13800/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d2722fbf018f8f75f60d466358819b2ca1b82eba755247f876fb1a3fc6b2d9c0 +size 14583 diff --git a/adapters/saved-alpaca-belle30b/checkpoint-13800/rng_state_2.pth b/adapters/saved-alpaca-belle30b/checkpoint-13800/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..4c88755051944274a0961901fe1e0101e626323a --- /dev/null +++ b/adapters/saved-alpaca-belle30b/checkpoint-13800/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b4ce86f397d10ef78c38ea9438e4e5e744c30c6d77bace84c4ee6cfdf2d19dc9 +size 14583 diff --git a/adapters/saved-alpaca-belle30b/checkpoint-13800/rng_state_3.pth b/adapters/saved-alpaca-belle30b/checkpoint-13800/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..1e388c7d9e52f330e5808cda1d52a8b561f52943 --- /dev/null +++ b/adapters/saved-alpaca-belle30b/checkpoint-13800/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:989d30f24242381ac754b61ca18c89ccb87337a137b4b87b55e3ee1d4c37276a +size 14583 diff --git a/adapters/saved-alpaca-belle30b/checkpoint-13800/rng_state_4.pth b/adapters/saved-alpaca-belle30b/checkpoint-13800/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..0b52b2514e098e3f0c2841c67785170cbb264bd5 --- /dev/null +++ b/adapters/saved-alpaca-belle30b/checkpoint-13800/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:97a5e3adb74ae4591f2a4bd30bb59466838dc598eef70e38a63560c53a2ff23b +size 14583 diff --git a/adapters/saved-alpaca-belle30b/checkpoint-13800/rng_state_5.pth b/adapters/saved-alpaca-belle30b/checkpoint-13800/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..374b38d6d7f896a446089928f9c7719bf3d8570a --- /dev/null +++ b/adapters/saved-alpaca-belle30b/checkpoint-13800/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2a7f60affca6785c6d42e2837d6e3cf42e59a3fb09c48c39ac0c0fd066f92ff4 +size 14583 diff --git a/adapters/saved-alpaca-belle30b/checkpoint-13800/rng_state_6.pth b/adapters/saved-alpaca-belle30b/checkpoint-13800/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..a0c946c0b5f0b9ff3b2050c670bf44f4fe13d4f5 --- /dev/null +++ b/adapters/saved-alpaca-belle30b/checkpoint-13800/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d0983a72ddec3674e3b1a987a644c8113b57b9d83e9a1eddc933fac722f1dafe +size 14583 diff --git a/adapters/saved-alpaca-belle30b/checkpoint-13800/rng_state_7.pth b/adapters/saved-alpaca-belle30b/checkpoint-13800/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..d4989d119296ddb879675acc4eef2b3dd6d24e55 --- /dev/null +++ b/adapters/saved-alpaca-belle30b/checkpoint-13800/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:45b4af35367d962046439e5c81fc1dd699f986d22f9023ea9a75a1095de488cc +size 14583 diff --git a/adapters/saved-alpaca-belle30b/checkpoint-13800/scaler.pt b/adapters/saved-alpaca-belle30b/checkpoint-13800/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..1113dc95a27b862c41cbd8066c0a2430b72f5556 --- /dev/null +++ b/adapters/saved-alpaca-belle30b/checkpoint-13800/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0e6d9882752f5a6883700db2c76ffdc18498ee8ecfcc2a6a80d86f5e77f881be +size 557 diff --git a/adapters/saved-alpaca-belle30b/checkpoint-13800/scheduler.pt b/adapters/saved-alpaca-belle30b/checkpoint-13800/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..ce1dd0ff161f56598ecc45230c8a5ec1e10b0c75 --- /dev/null +++ b/adapters/saved-alpaca-belle30b/checkpoint-13800/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9dca20206c1f9d4a2e61887fef76a89983f3927a0112e0a2fafd97a71ce2cbe8 +size 627 diff --git a/adapters/saved-alpaca-belle30b/checkpoint-13800/trainer_state.json b/adapters/saved-alpaca-belle30b/checkpoint-13800/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..e0d9110f44d5787aa90b768be80f963ee5585ae9 --- /dev/null +++ b/adapters/saved-alpaca-belle30b/checkpoint-13800/trainer_state.json @@ -0,0 +1,4708 @@ +{ + "best_metric": 0.6671983599662781, + "best_model_checkpoint": "/mnt/bn/qingyi-bn-lq/llama/saved-alpaca-belle30b/checkpoint-13800", + "epoch": 2.9773462783171523, + "global_step": 13800, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 5.9999999999999995e-05, + "loss": 1.6143, + "step": 20 + }, + { + "epoch": 0.01, + "learning_rate": 0.00011999999999999999, + "loss": 1.2447, + "step": 40 + }, + { + "epoch": 0.01, + "learning_rate": 0.00017699999999999997, + "loss": 0.9529, + "step": 60 + }, + { + "epoch": 0.02, + "learning_rate": 0.000237, + "loss": 0.8899, + "step": 80 + }, + { + "epoch": 0.02, + "learning_rate": 0.00029699999999999996, + "loss": 0.8614, + "step": 100 + }, + { + "epoch": 0.03, + "learning_rate": 0.00029958710612097066, + "loss": 0.8402, + "step": 120 + }, + { + "epoch": 0.03, + "learning_rate": 0.00029915248098515027, + "loss": 0.8335, + "step": 140 + }, + { + "epoch": 0.03, + "learning_rate": 0.00029871785584932993, + "loss": 0.8303, + "step": 160 + }, + { + "epoch": 0.04, + "learning_rate": 0.0002982832307135096, + "loss": 0.8261, + "step": 180 + }, + { + "epoch": 0.04, + "learning_rate": 0.0002978486055776892, + "loss": 0.807, + "step": 200 + }, + { + "epoch": 0.04, + "eval_loss": 0.8271128535270691, + "eval_runtime": 49.877, + "eval_samples_per_second": 40.099, + "eval_steps_per_second": 0.642, + "step": 200 + }, + { + "epoch": 0.05, + "learning_rate": 0.00029741398044186887, + "loss": 0.808, + "step": 220 + }, + { + "epoch": 0.05, + "learning_rate": 0.00029697935530604853, + "loss": 0.8092, + "step": 240 + }, + { + "epoch": 0.06, + "learning_rate": 0.00029654473017022814, + "loss": 0.8045, + "step": 260 + }, + { + "epoch": 0.06, + "learning_rate": 0.0002961101050344078, + "loss": 0.8007, + "step": 280 + }, + { + "epoch": 0.06, + "learning_rate": 0.00029567547989858747, + "loss": 0.793, + "step": 300 + }, + { + "epoch": 0.07, + "learning_rate": 0.0002952408547627671, + "loss": 0.7886, + "step": 320 + }, + { + "epoch": 0.07, + "learning_rate": 0.00029480622962694674, + "loss": 0.7854, + "step": 340 + }, + { + "epoch": 0.08, + "learning_rate": 0.0002943716044911264, + "loss": 0.783, + "step": 360 + }, + { + "epoch": 0.08, + "learning_rate": 0.000293936979355306, + "loss": 0.7797, + "step": 380 + }, + { + "epoch": 0.09, + "learning_rate": 0.0002935023542194857, + "loss": 0.7801, + "step": 400 + }, + { + "epoch": 0.09, + "eval_loss": 0.793747067451477, + "eval_runtime": 49.8962, + "eval_samples_per_second": 40.083, + "eval_steps_per_second": 0.641, + "step": 400 + }, + { + "epoch": 0.09, + "learning_rate": 0.00029306772908366534, + "loss": 0.7879, + "step": 420 + }, + { + "epoch": 0.09, + "learning_rate": 0.00029263310394784495, + "loss": 0.7745, + "step": 440 + }, + { + "epoch": 0.1, + "learning_rate": 0.0002921984788120246, + "loss": 0.7725, + "step": 460 + }, + { + "epoch": 0.1, + "learning_rate": 0.0002917638536762043, + "loss": 0.7659, + "step": 480 + }, + { + "epoch": 0.11, + "learning_rate": 0.0002913292285403839, + "loss": 0.7658, + "step": 500 + }, + { + "epoch": 0.11, + "learning_rate": 0.00029089460340456355, + "loss": 0.7722, + "step": 520 + }, + { + "epoch": 0.12, + "learning_rate": 0.0002904599782687432, + "loss": 0.773, + "step": 540 + }, + { + "epoch": 0.12, + "learning_rate": 0.0002900253531329228, + "loss": 0.7749, + "step": 560 + }, + { + "epoch": 0.13, + "learning_rate": 0.0002895907279971025, + "loss": 0.7734, + "step": 580 + }, + { + "epoch": 0.13, + "learning_rate": 0.00028915610286128215, + "loss": 0.7607, + "step": 600 + }, + { + "epoch": 0.13, + "eval_loss": 0.7771433591842651, + "eval_runtime": 49.9486, + "eval_samples_per_second": 40.041, + "eval_steps_per_second": 0.641, + "step": 600 + }, + { + "epoch": 0.13, + "learning_rate": 0.00028872147772546176, + "loss": 0.7657, + "step": 620 + }, + { + "epoch": 0.14, + "learning_rate": 0.00028828685258964137, + "loss": 0.7602, + "step": 640 + }, + { + "epoch": 0.14, + "learning_rate": 0.00028785222745382103, + "loss": 0.7619, + "step": 660 + }, + { + "epoch": 0.15, + "learning_rate": 0.0002874176023180007, + "loss": 0.7587, + "step": 680 + }, + { + "epoch": 0.15, + "learning_rate": 0.0002869829771821803, + "loss": 0.7553, + "step": 700 + }, + { + "epoch": 0.16, + "learning_rate": 0.00028654835204635997, + "loss": 0.7565, + "step": 720 + }, + { + "epoch": 0.16, + "learning_rate": 0.00028611372691053963, + "loss": 0.7586, + "step": 740 + }, + { + "epoch": 0.16, + "learning_rate": 0.00028567910177471924, + "loss": 0.7556, + "step": 760 + }, + { + "epoch": 0.17, + "learning_rate": 0.0002852444766388989, + "loss": 0.7487, + "step": 780 + }, + { + "epoch": 0.17, + "learning_rate": 0.00028480985150307857, + "loss": 0.7516, + "step": 800 + }, + { + "epoch": 0.17, + "eval_loss": 0.7632888555526733, + "eval_runtime": 49.913, + "eval_samples_per_second": 40.07, + "eval_steps_per_second": 0.641, + "step": 800 + }, + { + "epoch": 0.18, + "learning_rate": 0.0002843752263672582, + "loss": 0.7527, + "step": 820 + }, + { + "epoch": 0.18, + "learning_rate": 0.00028394060123143784, + "loss": 0.7407, + "step": 840 + }, + { + "epoch": 0.19, + "learning_rate": 0.0002835059760956175, + "loss": 0.744, + "step": 860 + }, + { + "epoch": 0.19, + "learning_rate": 0.0002830713509597971, + "loss": 0.7456, + "step": 880 + }, + { + "epoch": 0.19, + "learning_rate": 0.0002826367258239768, + "loss": 0.7429, + "step": 900 + }, + { + "epoch": 0.2, + "learning_rate": 0.00028220210068815644, + "loss": 0.7516, + "step": 920 + }, + { + "epoch": 0.2, + "learning_rate": 0.00028176747555233605, + "loss": 0.7381, + "step": 940 + }, + { + "epoch": 0.21, + "learning_rate": 0.0002813328504165157, + "loss": 0.7256, + "step": 960 + }, + { + "epoch": 0.21, + "learning_rate": 0.0002808982252806954, + "loss": 0.7443, + "step": 980 + }, + { + "epoch": 0.22, + "learning_rate": 0.000280463600144875, + "loss": 0.7389, + "step": 1000 + }, + { + "epoch": 0.22, + "eval_loss": 0.7532852292060852, + "eval_runtime": 49.9829, + "eval_samples_per_second": 40.014, + "eval_steps_per_second": 0.64, + "step": 1000 + }, + { + "epoch": 0.22, + "learning_rate": 0.00028002897500905465, + "loss": 0.7374, + "step": 1020 + }, + { + "epoch": 0.22, + "learning_rate": 0.0002795943498732343, + "loss": 0.7296, + "step": 1040 + }, + { + "epoch": 0.23, + "learning_rate": 0.0002791597247374139, + "loss": 0.7424, + "step": 1060 + }, + { + "epoch": 0.23, + "learning_rate": 0.0002787250996015936, + "loss": 0.7328, + "step": 1080 + }, + { + "epoch": 0.24, + "learning_rate": 0.00027829047446577325, + "loss": 0.7367, + "step": 1100 + }, + { + "epoch": 0.24, + "learning_rate": 0.00027785584932995286, + "loss": 0.7419, + "step": 1120 + }, + { + "epoch": 0.25, + "learning_rate": 0.0002774212241941325, + "loss": 0.7347, + "step": 1140 + }, + { + "epoch": 0.25, + "learning_rate": 0.0002769865990583122, + "loss": 0.7292, + "step": 1160 + }, + { + "epoch": 0.25, + "learning_rate": 0.0002765519739224918, + "loss": 0.7394, + "step": 1180 + }, + { + "epoch": 0.26, + "learning_rate": 0.00027611734878667146, + "loss": 0.7358, + "step": 1200 + }, + { + "epoch": 0.26, + "eval_loss": 0.7463639974594116, + "eval_runtime": 49.9963, + "eval_samples_per_second": 40.003, + "eval_steps_per_second": 0.64, + "step": 1200 + }, + { + "epoch": 0.26, + "learning_rate": 0.0002756827236508511, + "loss": 0.7266, + "step": 1220 + }, + { + "epoch": 0.27, + "learning_rate": 0.00027524809851503073, + "loss": 0.7336, + "step": 1240 + }, + { + "epoch": 0.27, + "learning_rate": 0.0002748134733792104, + "loss": 0.7296, + "step": 1260 + }, + { + "epoch": 0.28, + "learning_rate": 0.00027437884824339006, + "loss": 0.73, + "step": 1280 + }, + { + "epoch": 0.28, + "learning_rate": 0.00027394422310756967, + "loss": 0.7312, + "step": 1300 + }, + { + "epoch": 0.28, + "learning_rate": 0.00027350959797174933, + "loss": 0.7307, + "step": 1320 + }, + { + "epoch": 0.29, + "learning_rate": 0.000273074972835929, + "loss": 0.7246, + "step": 1340 + }, + { + "epoch": 0.29, + "learning_rate": 0.0002726403477001086, + "loss": 0.7299, + "step": 1360 + }, + { + "epoch": 0.3, + "learning_rate": 0.00027220572256428827, + "loss": 0.7251, + "step": 1380 + }, + { + "epoch": 0.3, + "learning_rate": 0.00027177109742846793, + "loss": 0.7286, + "step": 1400 + }, + { + "epoch": 0.3, + "eval_loss": 0.7393819093704224, + "eval_runtime": 49.9896, + "eval_samples_per_second": 40.008, + "eval_steps_per_second": 0.64, + "step": 1400 + }, + { + "epoch": 0.31, + "learning_rate": 0.00027133647229264754, + "loss": 0.7186, + "step": 1420 + }, + { + "epoch": 0.31, + "learning_rate": 0.0002709018471568272, + "loss": 0.7215, + "step": 1440 + }, + { + "epoch": 0.31, + "learning_rate": 0.00027046722202100687, + "loss": 0.7295, + "step": 1460 + }, + { + "epoch": 0.32, + "learning_rate": 0.0002700325968851865, + "loss": 0.7198, + "step": 1480 + }, + { + "epoch": 0.32, + "learning_rate": 0.00026959797174936614, + "loss": 0.7184, + "step": 1500 + }, + { + "epoch": 0.33, + "learning_rate": 0.0002691633466135458, + "loss": 0.7283, + "step": 1520 + }, + { + "epoch": 0.33, + "learning_rate": 0.0002687287214777254, + "loss": 0.7378, + "step": 1540 + }, + { + "epoch": 0.34, + "learning_rate": 0.0002682940963419051, + "loss": 0.7196, + "step": 1560 + }, + { + "epoch": 0.34, + "learning_rate": 0.00026785947120608474, + "loss": 0.7152, + "step": 1580 + }, + { + "epoch": 0.35, + "learning_rate": 0.00026742484607026435, + "loss": 0.7184, + "step": 1600 + }, + { + "epoch": 0.35, + "eval_loss": 0.7342154383659363, + "eval_runtime": 49.9957, + "eval_samples_per_second": 40.003, + "eval_steps_per_second": 0.64, + "step": 1600 + }, + { + "epoch": 0.35, + "learning_rate": 0.000266990220934444, + "loss": 0.7164, + "step": 1620 + }, + { + "epoch": 0.35, + "learning_rate": 0.0002665555957986237, + "loss": 0.7136, + "step": 1640 + }, + { + "epoch": 0.36, + "learning_rate": 0.0002661209706628033, + "loss": 0.7203, + "step": 1660 + }, + { + "epoch": 0.36, + "learning_rate": 0.00026568634552698295, + "loss": 0.7158, + "step": 1680 + }, + { + "epoch": 0.37, + "learning_rate": 0.0002652517203911626, + "loss": 0.7145, + "step": 1700 + }, + { + "epoch": 0.37, + "learning_rate": 0.0002648170952553422, + "loss": 0.7111, + "step": 1720 + }, + { + "epoch": 0.38, + "learning_rate": 0.0002643824701195219, + "loss": 0.7155, + "step": 1740 + }, + { + "epoch": 0.38, + "learning_rate": 0.00026394784498370155, + "loss": 0.718, + "step": 1760 + }, + { + "epoch": 0.38, + "learning_rate": 0.00026351321984788116, + "loss": 0.7125, + "step": 1780 + }, + { + "epoch": 0.39, + "learning_rate": 0.0002630785947120608, + "loss": 0.7163, + "step": 1800 + }, + { + "epoch": 0.39, + "eval_loss": 0.7301950454711914, + "eval_runtime": 49.9689, + "eval_samples_per_second": 40.025, + "eval_steps_per_second": 0.64, + "step": 1800 + }, + { + "epoch": 0.39, + "learning_rate": 0.0002626439695762405, + "loss": 0.7121, + "step": 1820 + }, + { + "epoch": 0.4, + "learning_rate": 0.0002622093444404201, + "loss": 0.7092, + "step": 1840 + }, + { + "epoch": 0.4, + "learning_rate": 0.00026177471930459976, + "loss": 0.7133, + "step": 1860 + }, + { + "epoch": 0.41, + "learning_rate": 0.0002613400941687794, + "loss": 0.7171, + "step": 1880 + }, + { + "epoch": 0.41, + "learning_rate": 0.00026090546903295903, + "loss": 0.7235, + "step": 1900 + }, + { + "epoch": 0.41, + "learning_rate": 0.0002604708438971387, + "loss": 0.7086, + "step": 1920 + }, + { + "epoch": 0.42, + "learning_rate": 0.00026003621876131836, + "loss": 0.7136, + "step": 1940 + }, + { + "epoch": 0.42, + "learning_rate": 0.00025960159362549797, + "loss": 0.7031, + "step": 1960 + }, + { + "epoch": 0.43, + "learning_rate": 0.00025916696848967763, + "loss": 0.7084, + "step": 1980 + }, + { + "epoch": 0.43, + "learning_rate": 0.0002587323433538573, + "loss": 0.7091, + "step": 2000 + }, + { + "epoch": 0.43, + "eval_loss": 0.726446270942688, + "eval_runtime": 50.0519, + "eval_samples_per_second": 39.959, + "eval_steps_per_second": 0.639, + "step": 2000 + }, + { + "epoch": 0.44, + "learning_rate": 0.0002582977182180369, + "loss": 0.7119, + "step": 2020 + }, + { + "epoch": 0.44, + "learning_rate": 0.00025786309308221657, + "loss": 0.7186, + "step": 2040 + }, + { + "epoch": 0.44, + "learning_rate": 0.00025742846794639623, + "loss": 0.703, + "step": 2060 + }, + { + "epoch": 0.45, + "learning_rate": 0.00025699384281057584, + "loss": 0.7078, + "step": 2080 + }, + { + "epoch": 0.45, + "learning_rate": 0.0002565592176747555, + "loss": 0.7084, + "step": 2100 + }, + { + "epoch": 0.46, + "learning_rate": 0.00025612459253893517, + "loss": 0.7014, + "step": 2120 + }, + { + "epoch": 0.46, + "learning_rate": 0.0002556899674031148, + "loss": 0.7076, + "step": 2140 + }, + { + "epoch": 0.47, + "learning_rate": 0.00025525534226729444, + "loss": 0.7103, + "step": 2160 + }, + { + "epoch": 0.47, + "learning_rate": 0.0002548207171314741, + "loss": 0.7118, + "step": 2180 + }, + { + "epoch": 0.47, + "learning_rate": 0.0002543860919956537, + "loss": 0.7028, + "step": 2200 + }, + { + "epoch": 0.47, + "eval_loss": 0.7220268845558167, + "eval_runtime": 49.9937, + "eval_samples_per_second": 40.005, + "eval_steps_per_second": 0.64, + "step": 2200 + }, + { + "epoch": 0.48, + "learning_rate": 0.0002539514668598334, + "loss": 0.707, + "step": 2220 + }, + { + "epoch": 0.48, + "learning_rate": 0.00025351684172401304, + "loss": 0.7045, + "step": 2240 + }, + { + "epoch": 0.49, + "learning_rate": 0.00025308221658819265, + "loss": 0.6905, + "step": 2260 + }, + { + "epoch": 0.49, + "learning_rate": 0.0002526475914523723, + "loss": 0.6982, + "step": 2280 + }, + { + "epoch": 0.5, + "learning_rate": 0.000252212966316552, + "loss": 0.706, + "step": 2300 + }, + { + "epoch": 0.5, + "learning_rate": 0.0002517783411807316, + "loss": 0.6992, + "step": 2320 + }, + { + "epoch": 0.5, + "learning_rate": 0.00025134371604491125, + "loss": 0.6939, + "step": 2340 + }, + { + "epoch": 0.51, + "learning_rate": 0.00025090909090909086, + "loss": 0.7037, + "step": 2360 + }, + { + "epoch": 0.51, + "learning_rate": 0.0002504744657732705, + "loss": 0.7127, + "step": 2380 + }, + { + "epoch": 0.52, + "learning_rate": 0.00025003984063745014, + "loss": 0.702, + "step": 2400 + }, + { + "epoch": 0.52, + "eval_loss": 0.7191869020462036, + "eval_runtime": 50.0038, + "eval_samples_per_second": 39.997, + "eval_steps_per_second": 0.64, + "step": 2400 + }, + { + "epoch": 0.52, + "learning_rate": 0.0002496052155016298, + "loss": 0.7033, + "step": 2420 + }, + { + "epoch": 0.53, + "learning_rate": 0.00024917059036580946, + "loss": 0.7028, + "step": 2440 + }, + { + "epoch": 0.53, + "learning_rate": 0.00024873596522998907, + "loss": 0.6967, + "step": 2460 + }, + { + "epoch": 0.54, + "learning_rate": 0.00024830134009416874, + "loss": 0.7068, + "step": 2480 + }, + { + "epoch": 0.54, + "learning_rate": 0.0002478667149583484, + "loss": 0.7105, + "step": 2500 + }, + { + "epoch": 0.54, + "learning_rate": 0.000247432089822528, + "loss": 0.6968, + "step": 2520 + }, + { + "epoch": 0.55, + "learning_rate": 0.00024699746468670767, + "loss": 0.7025, + "step": 2540 + }, + { + "epoch": 0.55, + "learning_rate": 0.00024656283955088734, + "loss": 0.6942, + "step": 2560 + }, + { + "epoch": 0.56, + "learning_rate": 0.00024612821441506694, + "loss": 0.6948, + "step": 2580 + }, + { + "epoch": 0.56, + "learning_rate": 0.0002456935892792466, + "loss": 0.6979, + "step": 2600 + }, + { + "epoch": 0.56, + "eval_loss": 0.715853750705719, + "eval_runtime": 50.0426, + "eval_samples_per_second": 39.966, + "eval_steps_per_second": 0.639, + "step": 2600 + }, + { + "epoch": 0.57, + "learning_rate": 0.00024525896414342627, + "loss": 0.6967, + "step": 2620 + }, + { + "epoch": 0.57, + "learning_rate": 0.0002448243390076059, + "loss": 0.7012, + "step": 2640 + }, + { + "epoch": 0.57, + "learning_rate": 0.00024438971387178554, + "loss": 0.697, + "step": 2660 + }, + { + "epoch": 0.58, + "learning_rate": 0.0002439550887359652, + "loss": 0.6931, + "step": 2680 + }, + { + "epoch": 0.58, + "learning_rate": 0.00024352046360014485, + "loss": 0.6856, + "step": 2700 + }, + { + "epoch": 0.59, + "learning_rate": 0.00024308583846432448, + "loss": 0.697, + "step": 2720 + }, + { + "epoch": 0.59, + "learning_rate": 0.00024265121332850415, + "loss": 0.6996, + "step": 2740 + }, + { + "epoch": 0.6, + "learning_rate": 0.00024221658819268378, + "loss": 0.698, + "step": 2760 + }, + { + "epoch": 0.6, + "learning_rate": 0.00024178196305686342, + "loss": 0.6952, + "step": 2780 + }, + { + "epoch": 0.6, + "learning_rate": 0.00024134733792104308, + "loss": 0.7049, + "step": 2800 + }, + { + "epoch": 0.6, + "eval_loss": 0.7124837040901184, + "eval_runtime": 50.0654, + "eval_samples_per_second": 39.948, + "eval_steps_per_second": 0.639, + "step": 2800 + }, + { + "epoch": 0.61, + "learning_rate": 0.00024091271278522272, + "loss": 0.6927, + "step": 2820 + }, + { + "epoch": 0.61, + "learning_rate": 0.00024047808764940235, + "loss": 0.6996, + "step": 2840 + }, + { + "epoch": 0.62, + "learning_rate": 0.00024004346251358202, + "loss": 0.6921, + "step": 2860 + }, + { + "epoch": 0.62, + "learning_rate": 0.00023960883737776165, + "loss": 0.695, + "step": 2880 + }, + { + "epoch": 0.63, + "learning_rate": 0.0002391742122419413, + "loss": 0.6887, + "step": 2900 + }, + { + "epoch": 0.63, + "learning_rate": 0.00023873958710612095, + "loss": 0.6915, + "step": 2920 + }, + { + "epoch": 0.63, + "learning_rate": 0.0002383049619703006, + "loss": 0.6915, + "step": 2940 + }, + { + "epoch": 0.64, + "learning_rate": 0.00023787033683448023, + "loss": 0.6916, + "step": 2960 + }, + { + "epoch": 0.64, + "learning_rate": 0.0002374357116986599, + "loss": 0.687, + "step": 2980 + }, + { + "epoch": 0.65, + "learning_rate": 0.00023700108656283953, + "loss": 0.6997, + "step": 3000 + }, + { + "epoch": 0.65, + "eval_loss": 0.7098860144615173, + "eval_runtime": 50.0652, + "eval_samples_per_second": 39.948, + "eval_steps_per_second": 0.639, + "step": 3000 + }, + { + "epoch": 0.65, + "learning_rate": 0.00023656646142701916, + "loss": 0.6895, + "step": 3020 + }, + { + "epoch": 0.66, + "learning_rate": 0.00023613183629119883, + "loss": 0.6861, + "step": 3040 + }, + { + "epoch": 0.66, + "learning_rate": 0.00023569721115537846, + "loss": 0.6988, + "step": 3060 + }, + { + "epoch": 0.66, + "learning_rate": 0.0002352625860195581, + "loss": 0.6852, + "step": 3080 + }, + { + "epoch": 0.67, + "learning_rate": 0.00023482796088373776, + "loss": 0.6863, + "step": 3100 + }, + { + "epoch": 0.67, + "learning_rate": 0.0002343933357479174, + "loss": 0.6943, + "step": 3120 + }, + { + "epoch": 0.68, + "learning_rate": 0.00023395871061209704, + "loss": 0.686, + "step": 3140 + }, + { + "epoch": 0.68, + "learning_rate": 0.0002335240854762767, + "loss": 0.684, + "step": 3160 + }, + { + "epoch": 0.69, + "learning_rate": 0.00023308946034045634, + "loss": 0.6866, + "step": 3180 + }, + { + "epoch": 0.69, + "learning_rate": 0.00023265483520463597, + "loss": 0.6859, + "step": 3200 + }, + { + "epoch": 0.69, + "eval_loss": 0.7077216506004333, + "eval_runtime": 50.0526, + "eval_samples_per_second": 39.958, + "eval_steps_per_second": 0.639, + "step": 3200 + }, + { + "epoch": 0.69, + "learning_rate": 0.00023222021006881564, + "loss": 0.6845, + "step": 3220 + }, + { + "epoch": 0.7, + "learning_rate": 0.00023178558493299527, + "loss": 0.7011, + "step": 3240 + }, + { + "epoch": 0.7, + "learning_rate": 0.0002313509597971749, + "loss": 0.69, + "step": 3260 + }, + { + "epoch": 0.71, + "learning_rate": 0.00023091633466135457, + "loss": 0.6931, + "step": 3280 + }, + { + "epoch": 0.71, + "learning_rate": 0.0002304817095255342, + "loss": 0.6998, + "step": 3300 + }, + { + "epoch": 0.72, + "learning_rate": 0.00023004708438971385, + "loss": 0.6933, + "step": 3320 + }, + { + "epoch": 0.72, + "learning_rate": 0.0002296124592538935, + "loss": 0.6859, + "step": 3340 + }, + { + "epoch": 0.72, + "learning_rate": 0.00022917783411807315, + "loss": 0.6972, + "step": 3360 + }, + { + "epoch": 0.73, + "learning_rate": 0.00022874320898225278, + "loss": 0.6868, + "step": 3380 + }, + { + "epoch": 0.73, + "learning_rate": 0.00022830858384643245, + "loss": 0.6902, + "step": 3400 + }, + { + "epoch": 0.73, + "eval_loss": 0.7059928178787231, + "eval_runtime": 50.0118, + "eval_samples_per_second": 39.991, + "eval_steps_per_second": 0.64, + "step": 3400 + }, + { + "epoch": 0.74, + "learning_rate": 0.00022787395871061208, + "loss": 0.6819, + "step": 3420 + }, + { + "epoch": 0.74, + "learning_rate": 0.00022743933357479172, + "loss": 0.6833, + "step": 3440 + }, + { + "epoch": 0.75, + "learning_rate": 0.00022700470843897138, + "loss": 0.6826, + "step": 3460 + }, + { + "epoch": 0.75, + "learning_rate": 0.00022657008330315102, + "loss": 0.694, + "step": 3480 + }, + { + "epoch": 0.76, + "learning_rate": 0.00022613545816733066, + "loss": 0.6827, + "step": 3500 + }, + { + "epoch": 0.76, + "learning_rate": 0.00022570083303151032, + "loss": 0.6844, + "step": 3520 + }, + { + "epoch": 0.76, + "learning_rate": 0.00022526620789568996, + "loss": 0.6893, + "step": 3540 + }, + { + "epoch": 0.77, + "learning_rate": 0.0002248315827598696, + "loss": 0.6843, + "step": 3560 + }, + { + "epoch": 0.77, + "learning_rate": 0.00022439695762404926, + "loss": 0.6843, + "step": 3580 + }, + { + "epoch": 0.78, + "learning_rate": 0.0002239623324882289, + "loss": 0.691, + "step": 3600 + }, + { + "epoch": 0.78, + "eval_loss": 0.7041522264480591, + "eval_runtime": 50.0554, + "eval_samples_per_second": 39.956, + "eval_steps_per_second": 0.639, + "step": 3600 + }, + { + "epoch": 0.78, + "learning_rate": 0.00022352770735240853, + "loss": 0.6846, + "step": 3620 + }, + { + "epoch": 0.79, + "learning_rate": 0.0002230930822165882, + "loss": 0.689, + "step": 3640 + }, + { + "epoch": 0.79, + "learning_rate": 0.00022265845708076783, + "loss": 0.6777, + "step": 3660 + }, + { + "epoch": 0.79, + "learning_rate": 0.00022222383194494747, + "loss": 0.6903, + "step": 3680 + }, + { + "epoch": 0.8, + "learning_rate": 0.00022178920680912713, + "loss": 0.684, + "step": 3700 + }, + { + "epoch": 0.8, + "learning_rate": 0.00022135458167330677, + "loss": 0.6867, + "step": 3720 + }, + { + "epoch": 0.81, + "learning_rate": 0.0002209199565374864, + "loss": 0.6697, + "step": 3740 + }, + { + "epoch": 0.81, + "learning_rate": 0.00022048533140166607, + "loss": 0.6864, + "step": 3760 + }, + { + "epoch": 0.82, + "learning_rate": 0.0002200507062658457, + "loss": 0.6813, + "step": 3780 + }, + { + "epoch": 0.82, + "learning_rate": 0.00021961608113002534, + "loss": 0.6807, + "step": 3800 + }, + { + "epoch": 0.82, + "eval_loss": 0.7024796009063721, + "eval_runtime": 50.022, + "eval_samples_per_second": 39.982, + "eval_steps_per_second": 0.64, + "step": 3800 + }, + { + "epoch": 0.82, + "learning_rate": 0.000219181455994205, + "loss": 0.6824, + "step": 3820 + }, + { + "epoch": 0.83, + "learning_rate": 0.00021874683085838464, + "loss": 0.6814, + "step": 3840 + }, + { + "epoch": 0.83, + "learning_rate": 0.00021831220572256427, + "loss": 0.6789, + "step": 3860 + }, + { + "epoch": 0.84, + "learning_rate": 0.00021787758058674394, + "loss": 0.6752, + "step": 3880 + }, + { + "epoch": 0.84, + "learning_rate": 0.00021744295545092358, + "loss": 0.6826, + "step": 3900 + }, + { + "epoch": 0.85, + "learning_rate": 0.0002170083303151032, + "loss": 0.6874, + "step": 3920 + }, + { + "epoch": 0.85, + "learning_rate": 0.00021657370517928288, + "loss": 0.6761, + "step": 3940 + }, + { + "epoch": 0.85, + "learning_rate": 0.0002161390800434625, + "loss": 0.6795, + "step": 3960 + }, + { + "epoch": 0.86, + "learning_rate": 0.00021570445490764215, + "loss": 0.6781, + "step": 3980 + }, + { + "epoch": 0.86, + "learning_rate": 0.0002152698297718218, + "loss": 0.6754, + "step": 4000 + }, + { + "epoch": 0.86, + "eval_loss": 0.7004331350326538, + "eval_runtime": 50.0568, + "eval_samples_per_second": 39.955, + "eval_steps_per_second": 0.639, + "step": 4000 + }, + { + "epoch": 0.87, + "learning_rate": 0.00021483520463600145, + "loss": 0.6791, + "step": 4020 + }, + { + "epoch": 0.87, + "learning_rate": 0.00021440057950018108, + "loss": 0.6863, + "step": 4040 + }, + { + "epoch": 0.88, + "learning_rate": 0.00021396595436436075, + "loss": 0.6846, + "step": 4060 + }, + { + "epoch": 0.88, + "learning_rate": 0.00021353132922854036, + "loss": 0.6814, + "step": 4080 + }, + { + "epoch": 0.88, + "learning_rate": 0.00021309670409272, + "loss": 0.6825, + "step": 4100 + }, + { + "epoch": 0.89, + "learning_rate": 0.00021266207895689963, + "loss": 0.6827, + "step": 4120 + }, + { + "epoch": 0.89, + "learning_rate": 0.0002122274538210793, + "loss": 0.6769, + "step": 4140 + }, + { + "epoch": 0.9, + "learning_rate": 0.00021179282868525893, + "loss": 0.6869, + "step": 4160 + }, + { + "epoch": 0.9, + "learning_rate": 0.00021135820354943857, + "loss": 0.6815, + "step": 4180 + }, + { + "epoch": 0.91, + "learning_rate": 0.00021092357841361823, + "loss": 0.6725, + "step": 4200 + }, + { + "epoch": 0.91, + "eval_loss": 0.6981337666511536, + "eval_runtime": 50.0559, + "eval_samples_per_second": 39.955, + "eval_steps_per_second": 0.639, + "step": 4200 + }, + { + "epoch": 0.91, + "learning_rate": 0.00021051068453458889, + "loss": 0.6731, + "step": 4220 + }, + { + "epoch": 0.91, + "learning_rate": 0.00021007605939876855, + "loss": 0.6792, + "step": 4240 + }, + { + "epoch": 0.92, + "learning_rate": 0.00020964143426294819, + "loss": 0.6755, + "step": 4260 + }, + { + "epoch": 0.92, + "learning_rate": 0.00020920680912712782, + "loss": 0.6833, + "step": 4280 + }, + { + "epoch": 0.93, + "learning_rate": 0.0002087721839913075, + "loss": 0.6693, + "step": 4300 + }, + { + "epoch": 0.93, + "learning_rate": 0.00020833755885548712, + "loss": 0.6728, + "step": 4320 + }, + { + "epoch": 0.94, + "learning_rate": 0.00020790293371966676, + "loss": 0.6812, + "step": 4340 + }, + { + "epoch": 0.94, + "learning_rate": 0.00020746830858384642, + "loss": 0.6734, + "step": 4360 + }, + { + "epoch": 0.94, + "learning_rate": 0.00020703368344802606, + "loss": 0.6813, + "step": 4380 + }, + { + "epoch": 0.95, + "learning_rate": 0.0002065990583122057, + "loss": 0.6779, + "step": 4400 + }, + { + "epoch": 0.95, + "eval_loss": 0.6968498826026917, + "eval_runtime": 50.0697, + "eval_samples_per_second": 39.944, + "eval_steps_per_second": 0.639, + "step": 4400 + }, + { + "epoch": 0.95, + "learning_rate": 0.00020616443317638536, + "loss": 0.6712, + "step": 4420 + }, + { + "epoch": 0.96, + "learning_rate": 0.000205729808040565, + "loss": 0.6846, + "step": 4440 + }, + { + "epoch": 0.96, + "learning_rate": 0.00020529518290474463, + "loss": 0.6694, + "step": 4460 + }, + { + "epoch": 0.97, + "learning_rate": 0.0002048605577689243, + "loss": 0.6753, + "step": 4480 + }, + { + "epoch": 0.97, + "learning_rate": 0.00020442593263310393, + "loss": 0.6792, + "step": 4500 + }, + { + "epoch": 0.98, + "learning_rate": 0.00020399130749728357, + "loss": 0.6738, + "step": 4520 + }, + { + "epoch": 0.98, + "learning_rate": 0.00020355668236146323, + "loss": 0.6699, + "step": 4540 + }, + { + "epoch": 0.98, + "learning_rate": 0.00020312205722564287, + "loss": 0.6737, + "step": 4560 + }, + { + "epoch": 0.99, + "learning_rate": 0.0002026874320898225, + "loss": 0.6837, + "step": 4580 + }, + { + "epoch": 0.99, + "learning_rate": 0.00020225280695400217, + "loss": 0.6701, + "step": 4600 + }, + { + "epoch": 0.99, + "eval_loss": 0.6954157948493958, + "eval_runtime": 50.0724, + "eval_samples_per_second": 39.942, + "eval_steps_per_second": 0.639, + "step": 4600 + }, + { + "epoch": 1.0, + "learning_rate": 0.0002018181818181818, + "loss": 0.6677, + "step": 4620 + }, + { + "epoch": 1.0, + "learning_rate": 0.00020138355668236144, + "loss": 0.6706, + "step": 4640 + }, + { + "epoch": 1.01, + "learning_rate": 0.0002009489315465411, + "loss": 0.6741, + "step": 4660 + }, + { + "epoch": 1.01, + "learning_rate": 0.00020051430641072074, + "loss": 0.6757, + "step": 4680 + }, + { + "epoch": 1.01, + "learning_rate": 0.00020007968127490038, + "loss": 0.6773, + "step": 4700 + }, + { + "epoch": 1.02, + "learning_rate": 0.00019964505613908004, + "loss": 0.6728, + "step": 4720 + }, + { + "epoch": 1.02, + "learning_rate": 0.00019921043100325968, + "loss": 0.6715, + "step": 4740 + }, + { + "epoch": 1.03, + "learning_rate": 0.00019877580586743931, + "loss": 0.6679, + "step": 4760 + }, + { + "epoch": 1.03, + "learning_rate": 0.00019834118073161898, + "loss": 0.6729, + "step": 4780 + }, + { + "epoch": 1.04, + "learning_rate": 0.00019790655559579861, + "loss": 0.6749, + "step": 4800 + }, + { + "epoch": 1.04, + "eval_loss": 0.6941403746604919, + "eval_runtime": 50.0645, + "eval_samples_per_second": 39.948, + "eval_steps_per_second": 0.639, + "step": 4800 + }, + { + "epoch": 1.04, + "learning_rate": 0.00019747193045997825, + "loss": 0.6661, + "step": 4820 + }, + { + "epoch": 1.04, + "learning_rate": 0.0001970373053241579, + "loss": 0.6638, + "step": 4840 + }, + { + "epoch": 1.05, + "learning_rate": 0.00019660268018833755, + "loss": 0.6715, + "step": 4860 + }, + { + "epoch": 1.05, + "learning_rate": 0.0001961680550525172, + "loss": 0.6721, + "step": 4880 + }, + { + "epoch": 1.06, + "learning_rate": 0.00019573342991669682, + "loss": 0.6695, + "step": 4900 + }, + { + "epoch": 1.06, + "learning_rate": 0.0001952988047808765, + "loss": 0.6809, + "step": 4920 + }, + { + "epoch": 1.07, + "learning_rate": 0.00019486417964505612, + "loss": 0.6701, + "step": 4940 + }, + { + "epoch": 1.07, + "learning_rate": 0.00019442955450923576, + "loss": 0.6747, + "step": 4960 + }, + { + "epoch": 1.07, + "learning_rate": 0.00019399492937341542, + "loss": 0.6713, + "step": 4980 + }, + { + "epoch": 1.08, + "learning_rate": 0.00019356030423759506, + "loss": 0.6746, + "step": 5000 + }, + { + "epoch": 1.08, + "eval_loss": 0.6935788989067078, + "eval_runtime": 50.0137, + "eval_samples_per_second": 39.989, + "eval_steps_per_second": 0.64, + "step": 5000 + }, + { + "epoch": 1.08, + "learning_rate": 0.0001931256791017747, + "loss": 0.672, + "step": 5020 + }, + { + "epoch": 1.09, + "learning_rate": 0.00019269105396595436, + "loss": 0.6673, + "step": 5040 + }, + { + "epoch": 1.09, + "learning_rate": 0.000192256428830134, + "loss": 0.6706, + "step": 5060 + }, + { + "epoch": 1.1, + "learning_rate": 0.00019182180369431363, + "loss": 0.6677, + "step": 5080 + }, + { + "epoch": 1.1, + "learning_rate": 0.0001913871785584933, + "loss": 0.67, + "step": 5100 + }, + { + "epoch": 1.1, + "learning_rate": 0.00019095255342267293, + "loss": 0.6693, + "step": 5120 + }, + { + "epoch": 1.11, + "learning_rate": 0.00019051792828685257, + "loss": 0.671, + "step": 5140 + }, + { + "epoch": 1.11, + "learning_rate": 0.00019008330315103223, + "loss": 0.6748, + "step": 5160 + }, + { + "epoch": 1.12, + "learning_rate": 0.00018964867801521187, + "loss": 0.6698, + "step": 5180 + }, + { + "epoch": 1.12, + "learning_rate": 0.0001892140528793915, + "loss": 0.662, + "step": 5200 + }, + { + "epoch": 1.12, + "eval_loss": 0.6918168663978577, + "eval_runtime": 50.0897, + "eval_samples_per_second": 39.928, + "eval_steps_per_second": 0.639, + "step": 5200 + }, + { + "epoch": 1.13, + "learning_rate": 0.00018877942774357117, + "loss": 0.66, + "step": 5220 + }, + { + "epoch": 1.13, + "learning_rate": 0.0001883448026077508, + "loss": 0.6705, + "step": 5240 + }, + { + "epoch": 1.13, + "learning_rate": 0.00018791017747193044, + "loss": 0.6693, + "step": 5260 + }, + { + "epoch": 1.14, + "learning_rate": 0.0001874755523361101, + "loss": 0.6546, + "step": 5280 + }, + { + "epoch": 1.14, + "learning_rate": 0.00018704092720028974, + "loss": 0.6673, + "step": 5300 + }, + { + "epoch": 1.15, + "learning_rate": 0.00018660630206446938, + "loss": 0.671, + "step": 5320 + }, + { + "epoch": 1.15, + "learning_rate": 0.00018617167692864904, + "loss": 0.675, + "step": 5340 + }, + { + "epoch": 1.16, + "learning_rate": 0.00018573705179282868, + "loss": 0.6744, + "step": 5360 + }, + { + "epoch": 1.16, + "learning_rate": 0.00018530242665700832, + "loss": 0.6643, + "step": 5380 + }, + { + "epoch": 1.17, + "learning_rate": 0.00018486780152118798, + "loss": 0.6686, + "step": 5400 + }, + { + "epoch": 1.17, + "eval_loss": 0.6908227801322937, + "eval_runtime": 50.0742, + "eval_samples_per_second": 39.941, + "eval_steps_per_second": 0.639, + "step": 5400 + }, + { + "epoch": 1.17, + "learning_rate": 0.00018443317638536762, + "loss": 0.6666, + "step": 5420 + }, + { + "epoch": 1.17, + "learning_rate": 0.00018399855124954725, + "loss": 0.6658, + "step": 5440 + }, + { + "epoch": 1.18, + "learning_rate": 0.0001835639261137269, + "loss": 0.671, + "step": 5460 + }, + { + "epoch": 1.18, + "learning_rate": 0.00018312930097790653, + "loss": 0.6736, + "step": 5480 + }, + { + "epoch": 1.19, + "learning_rate": 0.00018269467584208616, + "loss": 0.6697, + "step": 5500 + }, + { + "epoch": 1.19, + "learning_rate": 0.00018226005070626583, + "loss": 0.6718, + "step": 5520 + }, + { + "epoch": 1.2, + "learning_rate": 0.00018182542557044546, + "loss": 0.6701, + "step": 5540 + }, + { + "epoch": 1.2, + "learning_rate": 0.0001813908004346251, + "loss": 0.6696, + "step": 5560 + }, + { + "epoch": 1.2, + "learning_rate": 0.00018095617529880476, + "loss": 0.6611, + "step": 5580 + }, + { + "epoch": 1.21, + "learning_rate": 0.0001805215501629844, + "loss": 0.6638, + "step": 5600 + }, + { + "epoch": 1.21, + "eval_loss": 0.689289927482605, + "eval_runtime": 50.1304, + "eval_samples_per_second": 39.896, + "eval_steps_per_second": 0.638, + "step": 5600 + }, + { + "epoch": 1.21, + "learning_rate": 0.00018008692502716404, + "loss": 0.6646, + "step": 5620 + }, + { + "epoch": 1.22, + "learning_rate": 0.0001796522998913437, + "loss": 0.6717, + "step": 5640 + }, + { + "epoch": 1.22, + "learning_rate": 0.00017921767475552334, + "loss": 0.6647, + "step": 5660 + }, + { + "epoch": 1.23, + "learning_rate": 0.00017878304961970297, + "loss": 0.672, + "step": 5680 + }, + { + "epoch": 1.23, + "learning_rate": 0.00017834842448388264, + "loss": 0.6645, + "step": 5700 + }, + { + "epoch": 1.23, + "learning_rate": 0.00017791379934806227, + "loss": 0.6768, + "step": 5720 + }, + { + "epoch": 1.24, + "learning_rate": 0.0001774791742122419, + "loss": 0.6748, + "step": 5740 + }, + { + "epoch": 1.24, + "learning_rate": 0.00017704454907642157, + "loss": 0.6722, + "step": 5760 + }, + { + "epoch": 1.25, + "learning_rate": 0.0001766099239406012, + "loss": 0.6631, + "step": 5780 + }, + { + "epoch": 1.25, + "learning_rate": 0.00017617529880478084, + "loss": 0.6647, + "step": 5800 + }, + { + "epoch": 1.25, + "eval_loss": 0.688850462436676, + "eval_runtime": 50.0542, + "eval_samples_per_second": 39.957, + "eval_steps_per_second": 0.639, + "step": 5800 + }, + { + "epoch": 1.26, + "learning_rate": 0.0001757406736689605, + "loss": 0.66, + "step": 5820 + }, + { + "epoch": 1.26, + "learning_rate": 0.00017530604853314014, + "loss": 0.6682, + "step": 5840 + }, + { + "epoch": 1.26, + "learning_rate": 0.00017487142339731978, + "loss": 0.6589, + "step": 5860 + }, + { + "epoch": 1.27, + "learning_rate": 0.00017443679826149944, + "loss": 0.6691, + "step": 5880 + }, + { + "epoch": 1.27, + "learning_rate": 0.00017400217312567908, + "loss": 0.6726, + "step": 5900 + }, + { + "epoch": 1.28, + "learning_rate": 0.00017356754798985872, + "loss": 0.6628, + "step": 5920 + }, + { + "epoch": 1.28, + "learning_rate": 0.00017313292285403838, + "loss": 0.6719, + "step": 5940 + }, + { + "epoch": 1.29, + "learning_rate": 0.00017269829771821802, + "loss": 0.6648, + "step": 5960 + }, + { + "epoch": 1.29, + "learning_rate": 0.00017226367258239765, + "loss": 0.6594, + "step": 5980 + }, + { + "epoch": 1.29, + "learning_rate": 0.00017182904744657732, + "loss": 0.6717, + "step": 6000 + }, + { + "epoch": 1.29, + "eval_loss": 0.6876093745231628, + "eval_runtime": 50.1763, + "eval_samples_per_second": 39.859, + "eval_steps_per_second": 0.638, + "step": 6000 + }, + { + "epoch": 1.3, + "learning_rate": 0.00017139442231075695, + "loss": 0.6632, + "step": 6020 + }, + { + "epoch": 1.3, + "learning_rate": 0.0001709597971749366, + "loss": 0.6619, + "step": 6040 + }, + { + "epoch": 1.31, + "learning_rate": 0.00017052517203911625, + "loss": 0.667, + "step": 6060 + }, + { + "epoch": 1.31, + "learning_rate": 0.0001700905469032959, + "loss": 0.6625, + "step": 6080 + }, + { + "epoch": 1.32, + "learning_rate": 0.00016965592176747553, + "loss": 0.6661, + "step": 6100 + }, + { + "epoch": 1.32, + "learning_rate": 0.0001692212966316552, + "loss": 0.656, + "step": 6120 + }, + { + "epoch": 1.32, + "learning_rate": 0.00016878667149583483, + "loss": 0.6668, + "step": 6140 + }, + { + "epoch": 1.33, + "learning_rate": 0.00016835204636001446, + "loss": 0.6669, + "step": 6160 + }, + { + "epoch": 1.33, + "learning_rate": 0.00016791742122419413, + "loss": 0.6662, + "step": 6180 + }, + { + "epoch": 1.34, + "learning_rate": 0.00016748279608837376, + "loss": 0.6692, + "step": 6200 + }, + { + "epoch": 1.34, + "eval_loss": 0.6869744658470154, + "eval_runtime": 50.1517, + "eval_samples_per_second": 39.879, + "eval_steps_per_second": 0.638, + "step": 6200 + }, + { + "epoch": 1.34, + "learning_rate": 0.0001670481709525534, + "loss": 0.6571, + "step": 6220 + }, + { + "epoch": 1.35, + "learning_rate": 0.00016661354581673306, + "loss": 0.6659, + "step": 6240 + }, + { + "epoch": 1.35, + "learning_rate": 0.0001661789206809127, + "loss": 0.6622, + "step": 6260 + }, + { + "epoch": 1.35, + "learning_rate": 0.00016574429554509234, + "loss": 0.6522, + "step": 6280 + }, + { + "epoch": 1.36, + "learning_rate": 0.000165309670409272, + "loss": 0.667, + "step": 6300 + }, + { + "epoch": 1.36, + "learning_rate": 0.00016487504527345164, + "loss": 0.6644, + "step": 6320 + }, + { + "epoch": 1.37, + "learning_rate": 0.00016444042013763127, + "loss": 0.6625, + "step": 6340 + }, + { + "epoch": 1.37, + "learning_rate": 0.00016400579500181094, + "loss": 0.6686, + "step": 6360 + }, + { + "epoch": 1.38, + "learning_rate": 0.00016357116986599057, + "loss": 0.6562, + "step": 6380 + }, + { + "epoch": 1.38, + "learning_rate": 0.0001631365447301702, + "loss": 0.6595, + "step": 6400 + }, + { + "epoch": 1.38, + "eval_loss": 0.685205340385437, + "eval_runtime": 50.162, + "eval_samples_per_second": 39.871, + "eval_steps_per_second": 0.638, + "step": 6400 + }, + { + "epoch": 1.39, + "learning_rate": 0.00016270191959434987, + "loss": 0.6595, + "step": 6420 + }, + { + "epoch": 1.39, + "learning_rate": 0.0001622672944585295, + "loss": 0.6644, + "step": 6440 + }, + { + "epoch": 1.39, + "learning_rate": 0.00016183266932270915, + "loss": 0.6647, + "step": 6460 + }, + { + "epoch": 1.4, + "learning_rate": 0.0001613980441868888, + "loss": 0.6655, + "step": 6480 + }, + { + "epoch": 1.4, + "learning_rate": 0.00016096341905106845, + "loss": 0.6564, + "step": 6500 + }, + { + "epoch": 1.41, + "learning_rate": 0.00016052879391524808, + "loss": 0.6578, + "step": 6520 + }, + { + "epoch": 1.41, + "learning_rate": 0.00016009416877942775, + "loss": 0.6624, + "step": 6540 + }, + { + "epoch": 1.42, + "learning_rate": 0.00015965954364360738, + "loss": 0.6633, + "step": 6560 + }, + { + "epoch": 1.42, + "learning_rate": 0.00015922491850778702, + "loss": 0.6616, + "step": 6580 + }, + { + "epoch": 1.42, + "learning_rate": 0.00015879029337196668, + "loss": 0.6607, + "step": 6600 + }, + { + "epoch": 1.42, + "eval_loss": 0.6847727298736572, + "eval_runtime": 50.1562, + "eval_samples_per_second": 39.875, + "eval_steps_per_second": 0.638, + "step": 6600 + }, + { + "epoch": 1.43, + "learning_rate": 0.00015835566823614632, + "loss": 0.6564, + "step": 6620 + }, + { + "epoch": 1.43, + "learning_rate": 0.00015792104310032596, + "loss": 0.66, + "step": 6640 + }, + { + "epoch": 1.44, + "learning_rate": 0.00015748641796450562, + "loss": 0.6589, + "step": 6660 + }, + { + "epoch": 1.44, + "learning_rate": 0.00015705179282868526, + "loss": 0.6596, + "step": 6680 + }, + { + "epoch": 1.45, + "learning_rate": 0.0001566171676928649, + "loss": 0.6663, + "step": 6700 + }, + { + "epoch": 1.45, + "learning_rate": 0.00015618254255704456, + "loss": 0.6603, + "step": 6720 + }, + { + "epoch": 1.45, + "learning_rate": 0.0001557479174212242, + "loss": 0.6674, + "step": 6740 + }, + { + "epoch": 1.46, + "learning_rate": 0.00015531329228540383, + "loss": 0.6603, + "step": 6760 + }, + { + "epoch": 1.46, + "learning_rate": 0.0001548786671495835, + "loss": 0.6612, + "step": 6780 + }, + { + "epoch": 1.47, + "learning_rate": 0.00015444404201376313, + "loss": 0.6609, + "step": 6800 + }, + { + "epoch": 1.47, + "eval_loss": 0.683903694152832, + "eval_runtime": 50.079, + "eval_samples_per_second": 39.937, + "eval_steps_per_second": 0.639, + "step": 6800 + }, + { + "epoch": 1.47, + "learning_rate": 0.00015400941687794277, + "loss": 0.6557, + "step": 6820 + }, + { + "epoch": 1.48, + "learning_rate": 0.00015357479174212243, + "loss": 0.6627, + "step": 6840 + }, + { + "epoch": 1.48, + "learning_rate": 0.00015314016660630207, + "loss": 0.6667, + "step": 6860 + }, + { + "epoch": 1.48, + "learning_rate": 0.0001527055414704817, + "loss": 0.6633, + "step": 6880 + }, + { + "epoch": 1.49, + "learning_rate": 0.00015227091633466137, + "loss": 0.6565, + "step": 6900 + }, + { + "epoch": 1.49, + "learning_rate": 0.000151836291198841, + "loss": 0.6588, + "step": 6920 + }, + { + "epoch": 1.5, + "learning_rate": 0.00015140166606302064, + "loss": 0.6687, + "step": 6940 + }, + { + "epoch": 1.5, + "learning_rate": 0.0001509670409272003, + "loss": 0.6611, + "step": 6960 + }, + { + "epoch": 1.51, + "learning_rate": 0.00015053241579137994, + "loss": 0.6576, + "step": 6980 + }, + { + "epoch": 1.51, + "learning_rate": 0.00015009779065555957, + "loss": 0.6576, + "step": 7000 + }, + { + "epoch": 1.51, + "eval_loss": 0.6830142736434937, + "eval_runtime": 50.1233, + "eval_samples_per_second": 39.902, + "eval_steps_per_second": 0.638, + "step": 7000 + }, + { + "epoch": 1.51, + "learning_rate": 0.0001496631655197392, + "loss": 0.6617, + "step": 7020 + }, + { + "epoch": 1.52, + "learning_rate": 0.00014922854038391885, + "loss": 0.6533, + "step": 7040 + }, + { + "epoch": 1.52, + "learning_rate": 0.0001487939152480985, + "loss": 0.6524, + "step": 7060 + }, + { + "epoch": 1.53, + "learning_rate": 0.00014835929011227815, + "loss": 0.6597, + "step": 7080 + }, + { + "epoch": 1.53, + "learning_rate": 0.00014792466497645778, + "loss": 0.656, + "step": 7100 + }, + { + "epoch": 1.54, + "learning_rate": 0.00014749003984063745, + "loss": 0.6501, + "step": 7120 + }, + { + "epoch": 1.54, + "learning_rate": 0.00014705541470481708, + "loss": 0.6563, + "step": 7140 + }, + { + "epoch": 1.54, + "learning_rate": 0.00014662078956899672, + "loss": 0.6496, + "step": 7160 + }, + { + "epoch": 1.55, + "learning_rate": 0.00014618616443317638, + "loss": 0.6602, + "step": 7180 + }, + { + "epoch": 1.55, + "learning_rate": 0.00014575153929735602, + "loss": 0.6617, + "step": 7200 + }, + { + "epoch": 1.55, + "eval_loss": 0.6818540096282959, + "eval_runtime": 50.1175, + "eval_samples_per_second": 39.906, + "eval_steps_per_second": 0.639, + "step": 7200 + }, + { + "epoch": 1.56, + "learning_rate": 0.00014531691416153566, + "loss": 0.6655, + "step": 7220 + }, + { + "epoch": 1.56, + "learning_rate": 0.00014488228902571532, + "loss": 0.6544, + "step": 7240 + }, + { + "epoch": 1.57, + "learning_rate": 0.00014444766388989496, + "loss": 0.655, + "step": 7260 + }, + { + "epoch": 1.57, + "learning_rate": 0.0001440130387540746, + "loss": 0.6535, + "step": 7280 + }, + { + "epoch": 1.57, + "learning_rate": 0.00014357841361825426, + "loss": 0.6584, + "step": 7300 + }, + { + "epoch": 1.58, + "learning_rate": 0.0001431437884824339, + "loss": 0.6602, + "step": 7320 + }, + { + "epoch": 1.58, + "learning_rate": 0.00014270916334661353, + "loss": 0.6689, + "step": 7340 + }, + { + "epoch": 1.59, + "learning_rate": 0.0001422745382107932, + "loss": 0.6613, + "step": 7360 + }, + { + "epoch": 1.59, + "learning_rate": 0.00014183991307497283, + "loss": 0.659, + "step": 7380 + }, + { + "epoch": 1.6, + "learning_rate": 0.00014140528793915247, + "loss": 0.6463, + "step": 7400 + }, + { + "epoch": 1.6, + "eval_loss": 0.681868851184845, + "eval_runtime": 50.1388, + "eval_samples_per_second": 39.889, + "eval_steps_per_second": 0.638, + "step": 7400 + }, + { + "epoch": 1.6, + "learning_rate": 0.00014097066280333213, + "loss": 0.6617, + "step": 7420 + }, + { + "epoch": 1.61, + "learning_rate": 0.00014053603766751177, + "loss": 0.6648, + "step": 7440 + }, + { + "epoch": 1.61, + "learning_rate": 0.0001401014125316914, + "loss": 0.6528, + "step": 7460 + }, + { + "epoch": 1.61, + "learning_rate": 0.00013966678739587107, + "loss": 0.6655, + "step": 7480 + }, + { + "epoch": 1.62, + "learning_rate": 0.0001392321622600507, + "loss": 0.6609, + "step": 7500 + }, + { + "epoch": 1.62, + "learning_rate": 0.00013879753712423034, + "loss": 0.6528, + "step": 7520 + }, + { + "epoch": 1.63, + "learning_rate": 0.00013836291198841, + "loss": 0.6561, + "step": 7540 + }, + { + "epoch": 1.63, + "learning_rate": 0.00013792828685258964, + "loss": 0.6682, + "step": 7560 + }, + { + "epoch": 1.64, + "learning_rate": 0.00013749366171676928, + "loss": 0.6677, + "step": 7580 + }, + { + "epoch": 1.64, + "learning_rate": 0.00013705903658094894, + "loss": 0.6599, + "step": 7600 + }, + { + "epoch": 1.64, + "eval_loss": 0.6807426810264587, + "eval_runtime": 50.3308, + "eval_samples_per_second": 39.737, + "eval_steps_per_second": 0.636, + "step": 7600 + }, + { + "epoch": 1.64, + "learning_rate": 0.00013662441144512855, + "loss": 0.6525, + "step": 7620 + }, + { + "epoch": 1.65, + "learning_rate": 0.0001361897863093082, + "loss": 0.6574, + "step": 7640 + }, + { + "epoch": 1.65, + "learning_rate": 0.00013575516117348785, + "loss": 0.6516, + "step": 7660 + }, + { + "epoch": 1.66, + "learning_rate": 0.00013532053603766749, + "loss": 0.6533, + "step": 7680 + }, + { + "epoch": 1.66, + "learning_rate": 0.00013488591090184715, + "loss": 0.6577, + "step": 7700 + }, + { + "epoch": 1.67, + "learning_rate": 0.00013445128576602679, + "loss": 0.6592, + "step": 7720 + }, + { + "epoch": 1.67, + "learning_rate": 0.00013401666063020642, + "loss": 0.6585, + "step": 7740 + }, + { + "epoch": 1.67, + "learning_rate": 0.00013358203549438609, + "loss": 0.6607, + "step": 7760 + }, + { + "epoch": 1.68, + "learning_rate": 0.00013314741035856572, + "loss": 0.6617, + "step": 7780 + }, + { + "epoch": 1.68, + "learning_rate": 0.00013271278522274536, + "loss": 0.6443, + "step": 7800 + }, + { + "epoch": 1.68, + "eval_loss": 0.6800745725631714, + "eval_runtime": 50.165, + "eval_samples_per_second": 39.868, + "eval_steps_per_second": 0.638, + "step": 7800 + }, + { + "epoch": 1.69, + "learning_rate": 0.00013227816008692502, + "loss": 0.6587, + "step": 7820 + }, + { + "epoch": 1.69, + "learning_rate": 0.00013184353495110466, + "loss": 0.6613, + "step": 7840 + }, + { + "epoch": 1.7, + "learning_rate": 0.0001314089098152843, + "loss": 0.654, + "step": 7860 + }, + { + "epoch": 1.7, + "learning_rate": 0.00013097428467946396, + "loss": 0.6523, + "step": 7880 + }, + { + "epoch": 1.7, + "learning_rate": 0.0001305396595436436, + "loss": 0.6563, + "step": 7900 + }, + { + "epoch": 1.71, + "learning_rate": 0.00013010503440782323, + "loss": 0.6524, + "step": 7920 + }, + { + "epoch": 1.71, + "learning_rate": 0.0001296704092720029, + "loss": 0.6523, + "step": 7940 + }, + { + "epoch": 1.72, + "learning_rate": 0.00012923578413618253, + "loss": 0.6493, + "step": 7960 + }, + { + "epoch": 1.72, + "learning_rate": 0.00012880115900036217, + "loss": 0.6538, + "step": 7980 + }, + { + "epoch": 1.73, + "learning_rate": 0.00012836653386454183, + "loss": 0.6512, + "step": 8000 + }, + { + "epoch": 1.73, + "eval_loss": 0.6790341734886169, + "eval_runtime": 50.1317, + "eval_samples_per_second": 39.895, + "eval_steps_per_second": 0.638, + "step": 8000 + }, + { + "epoch": 1.73, + "learning_rate": 0.00012793190872872147, + "loss": 0.6562, + "step": 8020 + }, + { + "epoch": 1.73, + "learning_rate": 0.0001274972835929011, + "loss": 0.6556, + "step": 8040 + }, + { + "epoch": 1.74, + "learning_rate": 0.00012706265845708077, + "loss": 0.65, + "step": 8060 + }, + { + "epoch": 1.74, + "learning_rate": 0.0001266280333212604, + "loss": 0.661, + "step": 8080 + }, + { + "epoch": 1.75, + "learning_rate": 0.00012619340818544004, + "loss": 0.655, + "step": 8100 + }, + { + "epoch": 1.75, + "learning_rate": 0.0001257587830496197, + "loss": 0.6534, + "step": 8120 + }, + { + "epoch": 1.76, + "learning_rate": 0.00012532415791379934, + "loss": 0.6517, + "step": 8140 + }, + { + "epoch": 1.76, + "learning_rate": 0.00012488953277797898, + "loss": 0.6605, + "step": 8160 + }, + { + "epoch": 1.76, + "learning_rate": 0.00012445490764215864, + "loss": 0.6556, + "step": 8180 + }, + { + "epoch": 1.77, + "learning_rate": 0.00012402028250633828, + "loss": 0.6492, + "step": 8200 + }, + { + "epoch": 1.77, + "eval_loss": 0.6781870126724243, + "eval_runtime": 50.0809, + "eval_samples_per_second": 39.935, + "eval_steps_per_second": 0.639, + "step": 8200 + }, + { + "epoch": 1.77, + "learning_rate": 0.00012358565737051791, + "loss": 0.6541, + "step": 8220 + }, + { + "epoch": 1.78, + "learning_rate": 0.00012315103223469758, + "loss": 0.6517, + "step": 8240 + }, + { + "epoch": 1.78, + "learning_rate": 0.00012271640709887721, + "loss": 0.6483, + "step": 8260 + }, + { + "epoch": 1.79, + "learning_rate": 0.00012228178196305685, + "loss": 0.6619, + "step": 8280 + }, + { + "epoch": 1.79, + "learning_rate": 0.0001218471568272365, + "loss": 0.6556, + "step": 8300 + }, + { + "epoch": 1.8, + "learning_rate": 0.00012141253169141615, + "loss": 0.6471, + "step": 8320 + }, + { + "epoch": 1.8, + "learning_rate": 0.00012097790655559579, + "loss": 0.6611, + "step": 8340 + }, + { + "epoch": 1.8, + "learning_rate": 0.00012054328141977544, + "loss": 0.6506, + "step": 8360 + }, + { + "epoch": 1.81, + "learning_rate": 0.00012010865628395509, + "loss": 0.6611, + "step": 8380 + }, + { + "epoch": 1.81, + "learning_rate": 0.00011967403114813472, + "loss": 0.6557, + "step": 8400 + }, + { + "epoch": 1.81, + "eval_loss": 0.6776989102363586, + "eval_runtime": 50.1344, + "eval_samples_per_second": 39.893, + "eval_steps_per_second": 0.638, + "step": 8400 + }, + { + "epoch": 1.82, + "learning_rate": 0.00011923940601231437, + "loss": 0.6504, + "step": 8420 + }, + { + "epoch": 1.82, + "learning_rate": 0.00011880478087649402, + "loss": 0.6552, + "step": 8440 + }, + { + "epoch": 1.83, + "learning_rate": 0.00011839188699746468, + "loss": 0.641, + "step": 8460 + }, + { + "epoch": 1.83, + "learning_rate": 0.00011795726186164432, + "loss": 0.6535, + "step": 8480 + }, + { + "epoch": 1.83, + "learning_rate": 0.00011752263672582397, + "loss": 0.6568, + "step": 8500 + }, + { + "epoch": 1.84, + "learning_rate": 0.00011708801159000362, + "loss": 0.6621, + "step": 8520 + }, + { + "epoch": 1.84, + "learning_rate": 0.00011665338645418325, + "loss": 0.6607, + "step": 8540 + }, + { + "epoch": 1.85, + "learning_rate": 0.0001162187613183629, + "loss": 0.6516, + "step": 8560 + }, + { + "epoch": 1.85, + "learning_rate": 0.00011578413618254255, + "loss": 0.6497, + "step": 8580 + }, + { + "epoch": 1.86, + "learning_rate": 0.00011534951104672219, + "loss": 0.6559, + "step": 8600 + }, + { + "epoch": 1.86, + "eval_loss": 0.6773191094398499, + "eval_runtime": 50.1605, + "eval_samples_per_second": 39.872, + "eval_steps_per_second": 0.638, + "step": 8600 + }, + { + "epoch": 1.86, + "learning_rate": 0.00011491488591090184, + "loss": 0.6595, + "step": 8620 + }, + { + "epoch": 1.86, + "learning_rate": 0.00011448026077508149, + "loss": 0.6495, + "step": 8640 + }, + { + "epoch": 1.87, + "learning_rate": 0.00011404563563926113, + "loss": 0.6518, + "step": 8660 + }, + { + "epoch": 1.87, + "learning_rate": 0.00011361101050344078, + "loss": 0.6511, + "step": 8680 + }, + { + "epoch": 1.88, + "learning_rate": 0.00011317638536762043, + "loss": 0.6495, + "step": 8700 + }, + { + "epoch": 1.88, + "learning_rate": 0.00011274176023180006, + "loss": 0.6485, + "step": 8720 + }, + { + "epoch": 1.89, + "learning_rate": 0.00011230713509597971, + "loss": 0.6543, + "step": 8740 + }, + { + "epoch": 1.89, + "learning_rate": 0.00011187250996015936, + "loss": 0.6509, + "step": 8760 + }, + { + "epoch": 1.89, + "learning_rate": 0.000111437884824339, + "loss": 0.656, + "step": 8780 + }, + { + "epoch": 1.9, + "learning_rate": 0.00011100325968851865, + "loss": 0.6557, + "step": 8800 + }, + { + "epoch": 1.9, + "eval_loss": 0.6773696541786194, + "eval_runtime": 50.1296, + "eval_samples_per_second": 39.897, + "eval_steps_per_second": 0.638, + "step": 8800 + }, + { + "epoch": 1.9, + "learning_rate": 0.0001105686345526983, + "loss": 0.6509, + "step": 8820 + }, + { + "epoch": 1.91, + "learning_rate": 0.00011013400941687794, + "loss": 0.65, + "step": 8840 + }, + { + "epoch": 1.91, + "learning_rate": 0.00010969938428105759, + "loss": 0.6447, + "step": 8860 + }, + { + "epoch": 1.92, + "learning_rate": 0.00010926475914523724, + "loss": 0.6563, + "step": 8880 + }, + { + "epoch": 1.92, + "learning_rate": 0.00010883013400941687, + "loss": 0.6545, + "step": 8900 + }, + { + "epoch": 1.92, + "learning_rate": 0.00010839550887359652, + "loss": 0.6509, + "step": 8920 + }, + { + "epoch": 1.93, + "learning_rate": 0.00010796088373777617, + "loss": 0.6434, + "step": 8940 + }, + { + "epoch": 1.93, + "learning_rate": 0.00010752625860195581, + "loss": 0.6412, + "step": 8960 + }, + { + "epoch": 1.94, + "learning_rate": 0.00010709163346613546, + "loss": 0.6512, + "step": 8980 + }, + { + "epoch": 1.94, + "learning_rate": 0.00010665700833031508, + "loss": 0.6478, + "step": 9000 + }, + { + "epoch": 1.94, + "eval_loss": 0.6760911345481873, + "eval_runtime": 50.1795, + "eval_samples_per_second": 39.857, + "eval_steps_per_second": 0.638, + "step": 9000 + }, + { + "epoch": 1.95, + "learning_rate": 0.00010622238319449473, + "loss": 0.6545, + "step": 9020 + }, + { + "epoch": 1.95, + "learning_rate": 0.00010578775805867438, + "loss": 0.6468, + "step": 9040 + }, + { + "epoch": 1.95, + "learning_rate": 0.00010535313292285402, + "loss": 0.6527, + "step": 9060 + }, + { + "epoch": 1.96, + "learning_rate": 0.00010491850778703367, + "loss": 0.6621, + "step": 9080 + }, + { + "epoch": 1.96, + "learning_rate": 0.00010448388265121332, + "loss": 0.6496, + "step": 9100 + }, + { + "epoch": 1.97, + "learning_rate": 0.00010404925751539295, + "loss": 0.6512, + "step": 9120 + }, + { + "epoch": 1.97, + "learning_rate": 0.0001036146323795726, + "loss": 0.6491, + "step": 9140 + }, + { + "epoch": 1.98, + "learning_rate": 0.00010318000724375225, + "loss": 0.6482, + "step": 9160 + }, + { + "epoch": 1.98, + "learning_rate": 0.00010274538210793189, + "loss": 0.6456, + "step": 9180 + }, + { + "epoch": 1.98, + "learning_rate": 0.00010231075697211154, + "loss": 0.6458, + "step": 9200 + }, + { + "epoch": 1.98, + "eval_loss": 0.6748936772346497, + "eval_runtime": 50.1856, + "eval_samples_per_second": 39.852, + "eval_steps_per_second": 0.638, + "step": 9200 + }, + { + "epoch": 1.99, + "learning_rate": 0.00010187613183629119, + "loss": 0.6473, + "step": 9220 + }, + { + "epoch": 1.99, + "learning_rate": 0.00010144150670047083, + "loss": 0.6496, + "step": 9240 + }, + { + "epoch": 2.0, + "learning_rate": 0.00010100688156465048, + "loss": 0.6566, + "step": 9260 + }, + { + "epoch": 2.0, + "learning_rate": 0.00010057225642883013, + "loss": 0.6475, + "step": 9280 + }, + { + "epoch": 2.01, + "learning_rate": 0.00010013763129300976, + "loss": 0.6536, + "step": 9300 + }, + { + "epoch": 2.01, + "learning_rate": 9.970300615718941e-05, + "loss": 0.646, + "step": 9320 + }, + { + "epoch": 2.02, + "learning_rate": 9.926838102136906e-05, + "loss": 0.6503, + "step": 9340 + }, + { + "epoch": 2.02, + "learning_rate": 9.88337558855487e-05, + "loss": 0.6527, + "step": 9360 + }, + { + "epoch": 2.02, + "learning_rate": 9.839913074972835e-05, + "loss": 0.6514, + "step": 9380 + }, + { + "epoch": 2.03, + "learning_rate": 9.7964505613908e-05, + "loss": 0.6548, + "step": 9400 + }, + { + "epoch": 2.03, + "eval_loss": 0.6744834780693054, + "eval_runtime": 50.1696, + "eval_samples_per_second": 39.865, + "eval_steps_per_second": 0.638, + "step": 9400 + }, + { + "epoch": 2.03, + "learning_rate": 9.752988047808764e-05, + "loss": 0.6483, + "step": 9420 + }, + { + "epoch": 2.04, + "learning_rate": 9.709525534226729e-05, + "loss": 0.6522, + "step": 9440 + }, + { + "epoch": 2.04, + "learning_rate": 9.666063020644694e-05, + "loss": 0.6538, + "step": 9460 + }, + { + "epoch": 2.05, + "learning_rate": 9.622600507062657e-05, + "loss": 0.6449, + "step": 9480 + }, + { + "epoch": 2.05, + "learning_rate": 9.579137993480622e-05, + "loss": 0.6451, + "step": 9500 + }, + { + "epoch": 2.05, + "learning_rate": 9.535675479898587e-05, + "loss": 0.6355, + "step": 9520 + }, + { + "epoch": 2.06, + "learning_rate": 9.492212966316551e-05, + "loss": 0.6494, + "step": 9540 + }, + { + "epoch": 2.06, + "learning_rate": 9.448750452734516e-05, + "loss": 0.6435, + "step": 9560 + }, + { + "epoch": 2.07, + "learning_rate": 9.405287939152481e-05, + "loss": 0.651, + "step": 9580 + }, + { + "epoch": 2.07, + "learning_rate": 9.361825425570445e-05, + "loss": 0.6493, + "step": 9600 + }, + { + "epoch": 2.07, + "eval_loss": 0.674017071723938, + "eval_runtime": 50.1402, + "eval_samples_per_second": 39.888, + "eval_steps_per_second": 0.638, + "step": 9600 + }, + { + "epoch": 2.08, + "learning_rate": 9.31836291198841e-05, + "loss": 0.6469, + "step": 9620 + }, + { + "epoch": 2.08, + "learning_rate": 9.274900398406375e-05, + "loss": 0.65, + "step": 9640 + }, + { + "epoch": 2.08, + "learning_rate": 9.231437884824338e-05, + "loss": 0.6536, + "step": 9660 + }, + { + "epoch": 2.09, + "learning_rate": 9.187975371242303e-05, + "loss": 0.6488, + "step": 9680 + }, + { + "epoch": 2.09, + "learning_rate": 9.144512857660268e-05, + "loss": 0.6391, + "step": 9700 + }, + { + "epoch": 2.1, + "learning_rate": 9.101050344078232e-05, + "loss": 0.644, + "step": 9720 + }, + { + "epoch": 2.1, + "learning_rate": 9.057587830496197e-05, + "loss": 0.6507, + "step": 9740 + }, + { + "epoch": 2.11, + "learning_rate": 9.014125316914162e-05, + "loss": 0.6404, + "step": 9760 + }, + { + "epoch": 2.11, + "learning_rate": 8.970662803332126e-05, + "loss": 0.6509, + "step": 9780 + }, + { + "epoch": 2.11, + "learning_rate": 8.92720028975009e-05, + "loss": 0.6435, + "step": 9800 + }, + { + "epoch": 2.11, + "eval_loss": 0.6735255122184753, + "eval_runtime": 50.1703, + "eval_samples_per_second": 39.864, + "eval_steps_per_second": 0.638, + "step": 9800 + }, + { + "epoch": 2.12, + "learning_rate": 8.883737776168056e-05, + "loss": 0.6374, + "step": 9820 + }, + { + "epoch": 2.12, + "learning_rate": 8.840275262586019e-05, + "loss": 0.6445, + "step": 9840 + }, + { + "epoch": 2.13, + "learning_rate": 8.796812749003983e-05, + "loss": 0.6495, + "step": 9860 + }, + { + "epoch": 2.13, + "learning_rate": 8.753350235421946e-05, + "loss": 0.6482, + "step": 9880 + }, + { + "epoch": 2.14, + "learning_rate": 8.709887721839911e-05, + "loss": 0.6441, + "step": 9900 + }, + { + "epoch": 2.14, + "learning_rate": 8.666425208257877e-05, + "loss": 0.6525, + "step": 9920 + }, + { + "epoch": 2.14, + "learning_rate": 8.62296269467584e-05, + "loss": 0.6453, + "step": 9940 + }, + { + "epoch": 2.15, + "learning_rate": 8.579500181093805e-05, + "loss": 0.6498, + "step": 9960 + }, + { + "epoch": 2.15, + "learning_rate": 8.53603766751177e-05, + "loss": 0.6471, + "step": 9980 + }, + { + "epoch": 2.16, + "learning_rate": 8.492575153929734e-05, + "loss": 0.6419, + "step": 10000 + }, + { + "epoch": 2.16, + "eval_loss": 0.6730753779411316, + "eval_runtime": 50.1885, + "eval_samples_per_second": 39.85, + "eval_steps_per_second": 0.638, + "step": 10000 + }, + { + "epoch": 2.16, + "learning_rate": 8.449112640347699e-05, + "loss": 0.6447, + "step": 10020 + }, + { + "epoch": 2.17, + "learning_rate": 8.405650126765664e-05, + "loss": 0.6444, + "step": 10040 + }, + { + "epoch": 2.17, + "learning_rate": 8.362187613183627e-05, + "loss": 0.6393, + "step": 10060 + }, + { + "epoch": 2.17, + "learning_rate": 8.318725099601592e-05, + "loss": 0.6464, + "step": 10080 + }, + { + "epoch": 2.18, + "learning_rate": 8.275262586019557e-05, + "loss": 0.6458, + "step": 10100 + }, + { + "epoch": 2.18, + "learning_rate": 8.231800072437521e-05, + "loss": 0.6402, + "step": 10120 + }, + { + "epoch": 2.19, + "learning_rate": 8.188337558855486e-05, + "loss": 0.6409, + "step": 10140 + }, + { + "epoch": 2.19, + "learning_rate": 8.144875045273451e-05, + "loss": 0.6512, + "step": 10160 + }, + { + "epoch": 2.2, + "learning_rate": 8.101412531691415e-05, + "loss": 0.6498, + "step": 10180 + }, + { + "epoch": 2.2, + "learning_rate": 8.05795001810938e-05, + "loss": 0.6393, + "step": 10200 + }, + { + "epoch": 2.2, + "eval_loss": 0.6726437211036682, + "eval_runtime": 50.1492, + "eval_samples_per_second": 39.881, + "eval_steps_per_second": 0.638, + "step": 10200 + }, + { + "epoch": 2.2, + "learning_rate": 8.014487504527345e-05, + "loss": 0.6458, + "step": 10220 + }, + { + "epoch": 2.21, + "learning_rate": 7.971024990945308e-05, + "loss": 0.6466, + "step": 10240 + }, + { + "epoch": 2.21, + "learning_rate": 7.927562477363273e-05, + "loss": 0.644, + "step": 10260 + }, + { + "epoch": 2.22, + "learning_rate": 7.884099963781238e-05, + "loss": 0.6467, + "step": 10280 + }, + { + "epoch": 2.22, + "learning_rate": 7.840637450199202e-05, + "loss": 0.6436, + "step": 10300 + }, + { + "epoch": 2.23, + "learning_rate": 7.797174936617167e-05, + "loss": 0.6422, + "step": 10320 + }, + { + "epoch": 2.23, + "learning_rate": 7.753712423035132e-05, + "loss": 0.645, + "step": 10340 + }, + { + "epoch": 2.24, + "learning_rate": 7.710249909453096e-05, + "loss": 0.6423, + "step": 10360 + }, + { + "epoch": 2.24, + "learning_rate": 7.666787395871061e-05, + "loss": 0.6557, + "step": 10380 + }, + { + "epoch": 2.24, + "learning_rate": 7.623324882289026e-05, + "loss": 0.646, + "step": 10400 + }, + { + "epoch": 2.24, + "eval_loss": 0.6725419759750366, + "eval_runtime": 50.1975, + "eval_samples_per_second": 39.843, + "eval_steps_per_second": 0.637, + "step": 10400 + }, + { + "epoch": 2.25, + "learning_rate": 7.57986236870699e-05, + "loss": 0.6503, + "step": 10420 + }, + { + "epoch": 2.25, + "learning_rate": 7.536399855124954e-05, + "loss": 0.6428, + "step": 10440 + }, + { + "epoch": 2.26, + "learning_rate": 7.49293734154292e-05, + "loss": 0.6438, + "step": 10460 + }, + { + "epoch": 2.26, + "learning_rate": 7.449474827960883e-05, + "loss": 0.6427, + "step": 10480 + }, + { + "epoch": 2.27, + "learning_rate": 7.406012314378847e-05, + "loss": 0.6458, + "step": 10500 + }, + { + "epoch": 2.27, + "learning_rate": 7.362549800796812e-05, + "loss": 0.6423, + "step": 10520 + }, + { + "epoch": 2.27, + "learning_rate": 7.319087287214777e-05, + "loss": 0.6466, + "step": 10540 + }, + { + "epoch": 2.28, + "learning_rate": 7.27562477363274e-05, + "loss": 0.6394, + "step": 10560 + }, + { + "epoch": 2.28, + "learning_rate": 7.232162260050705e-05, + "loss": 0.6362, + "step": 10580 + }, + { + "epoch": 2.29, + "learning_rate": 7.18869974646867e-05, + "loss": 0.6399, + "step": 10600 + }, + { + "epoch": 2.29, + "eval_loss": 0.6719211935997009, + "eval_runtime": 50.1808, + "eval_samples_per_second": 39.856, + "eval_steps_per_second": 0.638, + "step": 10600 + }, + { + "epoch": 2.29, + "learning_rate": 7.145237232886634e-05, + "loss": 0.6378, + "step": 10620 + }, + { + "epoch": 2.3, + "learning_rate": 7.101774719304599e-05, + "loss": 0.634, + "step": 10640 + }, + { + "epoch": 2.3, + "learning_rate": 7.058312205722564e-05, + "loss": 0.6374, + "step": 10660 + }, + { + "epoch": 2.3, + "learning_rate": 7.014849692140528e-05, + "loss": 0.6464, + "step": 10680 + }, + { + "epoch": 2.31, + "learning_rate": 6.971387178558493e-05, + "loss": 0.643, + "step": 10700 + }, + { + "epoch": 2.31, + "learning_rate": 6.927924664976458e-05, + "loss": 0.6384, + "step": 10720 + }, + { + "epoch": 2.32, + "learning_rate": 6.884462151394421e-05, + "loss": 0.6451, + "step": 10740 + }, + { + "epoch": 2.32, + "learning_rate": 6.840999637812386e-05, + "loss": 0.6465, + "step": 10760 + }, + { + "epoch": 2.33, + "learning_rate": 6.799710249909452e-05, + "loss": 0.646, + "step": 10780 + }, + { + "epoch": 2.33, + "learning_rate": 6.756247736327417e-05, + "loss": 0.6525, + "step": 10800 + }, + { + "epoch": 2.33, + "eval_loss": 0.6714358925819397, + "eval_runtime": 50.1294, + "eval_samples_per_second": 39.897, + "eval_steps_per_second": 0.638, + "step": 10800 + }, + { + "epoch": 2.33, + "learning_rate": 6.712785222745382e-05, + "loss": 0.6423, + "step": 10820 + }, + { + "epoch": 2.34, + "learning_rate": 6.669322709163345e-05, + "loss": 0.6449, + "step": 10840 + }, + { + "epoch": 2.34, + "learning_rate": 6.62586019558131e-05, + "loss": 0.6325, + "step": 10860 + }, + { + "epoch": 2.35, + "learning_rate": 6.582397681999275e-05, + "loss": 0.6558, + "step": 10880 + }, + { + "epoch": 2.35, + "learning_rate": 6.538935168417239e-05, + "loss": 0.6419, + "step": 10900 + }, + { + "epoch": 2.36, + "learning_rate": 6.495472654835204e-05, + "loss": 0.6466, + "step": 10920 + }, + { + "epoch": 2.36, + "learning_rate": 6.452010141253169e-05, + "loss": 0.6357, + "step": 10940 + }, + { + "epoch": 2.36, + "learning_rate": 6.408547627671133e-05, + "loss": 0.6366, + "step": 10960 + }, + { + "epoch": 2.37, + "learning_rate": 6.365085114089098e-05, + "loss": 0.6466, + "step": 10980 + }, + { + "epoch": 2.37, + "learning_rate": 6.321622600507063e-05, + "loss": 0.6542, + "step": 11000 + }, + { + "epoch": 2.37, + "eval_loss": 0.6710445880889893, + "eval_runtime": 50.2479, + "eval_samples_per_second": 39.803, + "eval_steps_per_second": 0.637, + "step": 11000 + }, + { + "epoch": 2.38, + "learning_rate": 6.278160086925026e-05, + "loss": 0.6481, + "step": 11020 + }, + { + "epoch": 2.38, + "learning_rate": 6.23469757334299e-05, + "loss": 0.6425, + "step": 11040 + }, + { + "epoch": 2.39, + "learning_rate": 6.191235059760955e-05, + "loss": 0.6439, + "step": 11060 + }, + { + "epoch": 2.39, + "learning_rate": 6.14777254617892e-05, + "loss": 0.6424, + "step": 11080 + }, + { + "epoch": 2.39, + "learning_rate": 6.104310032596884e-05, + "loss": 0.6404, + "step": 11100 + }, + { + "epoch": 2.4, + "learning_rate": 6.060847519014849e-05, + "loss": 0.6387, + "step": 11120 + }, + { + "epoch": 2.4, + "learning_rate": 6.017385005432814e-05, + "loss": 0.6462, + "step": 11140 + }, + { + "epoch": 2.41, + "learning_rate": 5.973922491850778e-05, + "loss": 0.6431, + "step": 11160 + }, + { + "epoch": 2.41, + "learning_rate": 5.9304599782687424e-05, + "loss": 0.638, + "step": 11180 + }, + { + "epoch": 2.42, + "learning_rate": 5.8869974646867074e-05, + "loss": 0.6344, + "step": 11200 + }, + { + "epoch": 2.42, + "eval_loss": 0.6704220771789551, + "eval_runtime": 50.1558, + "eval_samples_per_second": 39.876, + "eval_steps_per_second": 0.638, + "step": 11200 + }, + { + "epoch": 2.42, + "learning_rate": 5.843534951104672e-05, + "loss": 0.6448, + "step": 11220 + }, + { + "epoch": 2.43, + "learning_rate": 5.800072437522636e-05, + "loss": 0.6449, + "step": 11240 + }, + { + "epoch": 2.43, + "learning_rate": 5.756609923940601e-05, + "loss": 0.6399, + "step": 11260 + }, + { + "epoch": 2.43, + "learning_rate": 5.7131474103585654e-05, + "loss": 0.638, + "step": 11280 + }, + { + "epoch": 2.44, + "learning_rate": 5.66968489677653e-05, + "loss": 0.6418, + "step": 11300 + }, + { + "epoch": 2.44, + "learning_rate": 5.626222383194495e-05, + "loss": 0.6482, + "step": 11320 + }, + { + "epoch": 2.45, + "learning_rate": 5.582759869612459e-05, + "loss": 0.6392, + "step": 11340 + }, + { + "epoch": 2.45, + "learning_rate": 5.5392973560304233e-05, + "loss": 0.6363, + "step": 11360 + }, + { + "epoch": 2.46, + "learning_rate": 5.4958348424483883e-05, + "loss": 0.6503, + "step": 11380 + }, + { + "epoch": 2.46, + "learning_rate": 5.452372328866353e-05, + "loss": 0.6453, + "step": 11400 + }, + { + "epoch": 2.46, + "eval_loss": 0.670009195804596, + "eval_runtime": 50.155, + "eval_samples_per_second": 39.876, + "eval_steps_per_second": 0.638, + "step": 11400 + }, + { + "epoch": 2.46, + "learning_rate": 5.408909815284317e-05, + "loss": 0.6384, + "step": 11420 + }, + { + "epoch": 2.47, + "learning_rate": 5.365447301702282e-05, + "loss": 0.6449, + "step": 11440 + }, + { + "epoch": 2.47, + "learning_rate": 5.3219847881202456e-05, + "loss": 0.6406, + "step": 11460 + }, + { + "epoch": 2.48, + "learning_rate": 5.27852227453821e-05, + "loss": 0.6363, + "step": 11480 + }, + { + "epoch": 2.48, + "learning_rate": 5.235059760956174e-05, + "loss": 0.6482, + "step": 11500 + }, + { + "epoch": 2.49, + "learning_rate": 5.191597247374139e-05, + "loss": 0.6503, + "step": 11520 + }, + { + "epoch": 2.49, + "learning_rate": 5.1481347337921036e-05, + "loss": 0.6479, + "step": 11540 + }, + { + "epoch": 2.49, + "learning_rate": 5.10684534588917e-05, + "loss": 0.6437, + "step": 11560 + }, + { + "epoch": 2.5, + "learning_rate": 5.063382832307134e-05, + "loss": 0.6398, + "step": 11580 + }, + { + "epoch": 2.5, + "learning_rate": 5.0199203187250985e-05, + "loss": 0.6456, + "step": 11600 + }, + { + "epoch": 2.5, + "eval_loss": 0.6702134013175964, + "eval_runtime": 50.1834, + "eval_samples_per_second": 39.854, + "eval_steps_per_second": 0.638, + "step": 11600 + }, + { + "epoch": 2.51, + "learning_rate": 4.9764578051430635e-05, + "loss": 0.646, + "step": 11620 + }, + { + "epoch": 2.51, + "learning_rate": 4.932995291561028e-05, + "loss": 0.6375, + "step": 11640 + }, + { + "epoch": 2.52, + "learning_rate": 4.889532777978992e-05, + "loss": 0.6393, + "step": 11660 + }, + { + "epoch": 2.52, + "learning_rate": 4.846070264396957e-05, + "loss": 0.638, + "step": 11680 + }, + { + "epoch": 2.52, + "learning_rate": 4.8026077508149215e-05, + "loss": 0.6411, + "step": 11700 + }, + { + "epoch": 2.53, + "learning_rate": 4.759145237232886e-05, + "loss": 0.6467, + "step": 11720 + }, + { + "epoch": 2.53, + "learning_rate": 4.715682723650851e-05, + "loss": 0.6369, + "step": 11740 + }, + { + "epoch": 2.54, + "learning_rate": 4.672220210068815e-05, + "loss": 0.637, + "step": 11760 + }, + { + "epoch": 2.54, + "learning_rate": 4.6287576964867795e-05, + "loss": 0.6486, + "step": 11780 + }, + { + "epoch": 2.55, + "learning_rate": 4.5852951829047445e-05, + "loss": 0.637, + "step": 11800 + }, + { + "epoch": 2.55, + "eval_loss": 0.6698750257492065, + "eval_runtime": 50.1539, + "eval_samples_per_second": 39.877, + "eval_steps_per_second": 0.638, + "step": 11800 + }, + { + "epoch": 2.55, + "learning_rate": 4.541832669322709e-05, + "loss": 0.639, + "step": 11820 + }, + { + "epoch": 2.55, + "learning_rate": 4.498370155740673e-05, + "loss": 0.6366, + "step": 11840 + }, + { + "epoch": 2.56, + "learning_rate": 4.454907642158638e-05, + "loss": 0.6409, + "step": 11860 + }, + { + "epoch": 2.56, + "learning_rate": 4.4114451285766025e-05, + "loss": 0.6394, + "step": 11880 + }, + { + "epoch": 2.57, + "learning_rate": 4.367982614994567e-05, + "loss": 0.6351, + "step": 11900 + }, + { + "epoch": 2.57, + "learning_rate": 4.324520101412532e-05, + "loss": 0.6391, + "step": 11920 + }, + { + "epoch": 2.58, + "learning_rate": 4.281057587830496e-05, + "loss": 0.6267, + "step": 11940 + }, + { + "epoch": 2.58, + "learning_rate": 4.2375950742484604e-05, + "loss": 0.6461, + "step": 11960 + }, + { + "epoch": 2.58, + "learning_rate": 4.194132560666425e-05, + "loss": 0.6483, + "step": 11980 + }, + { + "epoch": 2.59, + "learning_rate": 4.150670047084389e-05, + "loss": 0.6461, + "step": 12000 + }, + { + "epoch": 2.59, + "eval_loss": 0.6692882180213928, + "eval_runtime": 50.1673, + "eval_samples_per_second": 39.867, + "eval_steps_per_second": 0.638, + "step": 12000 + }, + { + "epoch": 2.59, + "learning_rate": 4.1072075335023534e-05, + "loss": 0.6429, + "step": 12020 + }, + { + "epoch": 2.6, + "learning_rate": 4.0637450199203184e-05, + "loss": 0.6416, + "step": 12040 + }, + { + "epoch": 2.6, + "learning_rate": 4.020282506338283e-05, + "loss": 0.6356, + "step": 12060 + }, + { + "epoch": 2.61, + "learning_rate": 3.976819992756247e-05, + "loss": 0.6402, + "step": 12080 + }, + { + "epoch": 2.61, + "learning_rate": 3.933357479174212e-05, + "loss": 0.6395, + "step": 12100 + }, + { + "epoch": 2.61, + "learning_rate": 3.8898949655921764e-05, + "loss": 0.6432, + "step": 12120 + }, + { + "epoch": 2.62, + "learning_rate": 3.846432452010141e-05, + "loss": 0.6386, + "step": 12140 + }, + { + "epoch": 2.62, + "learning_rate": 3.802969938428106e-05, + "loss": 0.6396, + "step": 12160 + }, + { + "epoch": 2.63, + "learning_rate": 3.75950742484607e-05, + "loss": 0.6423, + "step": 12180 + }, + { + "epoch": 2.63, + "learning_rate": 3.7160449112640344e-05, + "loss": 0.649, + "step": 12200 + }, + { + "epoch": 2.63, + "eval_loss": 0.6691960096359253, + "eval_runtime": 50.1649, + "eval_samples_per_second": 39.869, + "eval_steps_per_second": 0.638, + "step": 12200 + }, + { + "epoch": 2.64, + "learning_rate": 3.672582397681999e-05, + "loss": 0.6547, + "step": 12220 + }, + { + "epoch": 2.64, + "learning_rate": 3.629119884099964e-05, + "loss": 0.642, + "step": 12240 + }, + { + "epoch": 2.65, + "learning_rate": 3.585657370517928e-05, + "loss": 0.634, + "step": 12260 + }, + { + "epoch": 2.65, + "learning_rate": 3.542194856935892e-05, + "loss": 0.6447, + "step": 12280 + }, + { + "epoch": 2.65, + "learning_rate": 3.498732343353857e-05, + "loss": 0.6285, + "step": 12300 + }, + { + "epoch": 2.66, + "learning_rate": 3.455269829771822e-05, + "loss": 0.6436, + "step": 12320 + }, + { + "epoch": 2.66, + "learning_rate": 3.411807316189786e-05, + "loss": 0.6349, + "step": 12340 + }, + { + "epoch": 2.67, + "learning_rate": 3.36834480260775e-05, + "loss": 0.6425, + "step": 12360 + }, + { + "epoch": 2.67, + "learning_rate": 3.324882289025715e-05, + "loss": 0.6393, + "step": 12380 + }, + { + "epoch": 2.68, + "learning_rate": 3.2814197754436796e-05, + "loss": 0.6367, + "step": 12400 + }, + { + "epoch": 2.68, + "eval_loss": 0.6687243580818176, + "eval_runtime": 50.3508, + "eval_samples_per_second": 39.721, + "eval_steps_per_second": 0.636, + "step": 12400 + }, + { + "epoch": 2.68, + "learning_rate": 3.237957261861644e-05, + "loss": 0.6386, + "step": 12420 + }, + { + "epoch": 2.68, + "learning_rate": 3.194494748279609e-05, + "loss": 0.6526, + "step": 12440 + }, + { + "epoch": 2.69, + "learning_rate": 3.151032234697573e-05, + "loss": 0.6357, + "step": 12460 + }, + { + "epoch": 2.69, + "learning_rate": 3.1075697211155376e-05, + "loss": 0.6353, + "step": 12480 + }, + { + "epoch": 2.7, + "learning_rate": 3.0641072075335026e-05, + "loss": 0.6449, + "step": 12500 + }, + { + "epoch": 2.7, + "learning_rate": 3.0206446939514663e-05, + "loss": 0.6425, + "step": 12520 + }, + { + "epoch": 2.71, + "learning_rate": 2.977182180369431e-05, + "loss": 0.6374, + "step": 12540 + }, + { + "epoch": 2.71, + "learning_rate": 2.9337196667873956e-05, + "loss": 0.6324, + "step": 12560 + }, + { + "epoch": 2.71, + "learning_rate": 2.89025715320536e-05, + "loss": 0.6502, + "step": 12580 + }, + { + "epoch": 2.72, + "learning_rate": 2.8467946396233246e-05, + "loss": 0.637, + "step": 12600 + }, + { + "epoch": 2.72, + "eval_loss": 0.6683821082115173, + "eval_runtime": 50.2054, + "eval_samples_per_second": 39.836, + "eval_steps_per_second": 0.637, + "step": 12600 + }, + { + "epoch": 2.72, + "learning_rate": 2.8033321260412892e-05, + "loss": 0.647, + "step": 12620 + }, + { + "epoch": 2.73, + "learning_rate": 2.7598696124592536e-05, + "loss": 0.632, + "step": 12640 + }, + { + "epoch": 2.73, + "learning_rate": 2.7164070988772182e-05, + "loss": 0.6411, + "step": 12660 + }, + { + "epoch": 2.74, + "learning_rate": 2.672944585295183e-05, + "loss": 0.632, + "step": 12680 + }, + { + "epoch": 2.74, + "learning_rate": 2.6294820717131472e-05, + "loss": 0.6389, + "step": 12700 + }, + { + "epoch": 2.74, + "learning_rate": 2.586019558131112e-05, + "loss": 0.6337, + "step": 12720 + }, + { + "epoch": 2.75, + "learning_rate": 2.542557044549076e-05, + "loss": 0.6439, + "step": 12740 + }, + { + "epoch": 2.75, + "learning_rate": 2.4990945309670405e-05, + "loss": 0.6364, + "step": 12760 + }, + { + "epoch": 2.76, + "learning_rate": 2.4556320173850052e-05, + "loss": 0.6402, + "step": 12780 + }, + { + "epoch": 2.76, + "learning_rate": 2.4121695038029695e-05, + "loss": 0.6376, + "step": 12800 + }, + { + "epoch": 2.76, + "eval_loss": 0.6680713295936584, + "eval_runtime": 50.1757, + "eval_samples_per_second": 39.86, + "eval_steps_per_second": 0.638, + "step": 12800 + }, + { + "epoch": 2.77, + "learning_rate": 2.3687069902209342e-05, + "loss": 0.6316, + "step": 12820 + }, + { + "epoch": 2.77, + "learning_rate": 2.325244476638899e-05, + "loss": 0.6393, + "step": 12840 + }, + { + "epoch": 2.77, + "learning_rate": 2.281781963056863e-05, + "loss": 0.6372, + "step": 12860 + }, + { + "epoch": 2.78, + "learning_rate": 2.2383194494748278e-05, + "loss": 0.6466, + "step": 12880 + }, + { + "epoch": 2.78, + "learning_rate": 2.1948569358927925e-05, + "loss": 0.6392, + "step": 12900 + }, + { + "epoch": 2.79, + "learning_rate": 2.1513944223107568e-05, + "loss": 0.6389, + "step": 12920 + }, + { + "epoch": 2.79, + "learning_rate": 2.107931908728721e-05, + "loss": 0.64, + "step": 12940 + }, + { + "epoch": 2.8, + "learning_rate": 2.0644693951466858e-05, + "loss": 0.6362, + "step": 12960 + }, + { + "epoch": 2.8, + "learning_rate": 2.02100688156465e-05, + "loss": 0.6364, + "step": 12980 + }, + { + "epoch": 2.8, + "learning_rate": 1.9775443679826148e-05, + "loss": 0.6372, + "step": 13000 + }, + { + "epoch": 2.8, + "eval_loss": 0.6680414080619812, + "eval_runtime": 50.2211, + "eval_samples_per_second": 39.824, + "eval_steps_per_second": 0.637, + "step": 13000 + }, + { + "epoch": 2.81, + "learning_rate": 1.9340818544005794e-05, + "loss": 0.6336, + "step": 13020 + }, + { + "epoch": 2.81, + "learning_rate": 1.8906193408185438e-05, + "loss": 0.6348, + "step": 13040 + }, + { + "epoch": 2.82, + "learning_rate": 1.8471568272365084e-05, + "loss": 0.6338, + "step": 13060 + }, + { + "epoch": 2.82, + "learning_rate": 1.8036943136544728e-05, + "loss": 0.6396, + "step": 13080 + }, + { + "epoch": 2.83, + "learning_rate": 1.7602318000724374e-05, + "loss": 0.641, + "step": 13100 + }, + { + "epoch": 2.83, + "learning_rate": 1.7167692864904017e-05, + "loss": 0.6369, + "step": 13120 + }, + { + "epoch": 2.83, + "learning_rate": 1.6733067729083664e-05, + "loss": 0.6345, + "step": 13140 + }, + { + "epoch": 2.84, + "learning_rate": 1.629844259326331e-05, + "loss": 0.649, + "step": 13160 + }, + { + "epoch": 2.84, + "learning_rate": 1.5863817457442954e-05, + "loss": 0.6409, + "step": 13180 + }, + { + "epoch": 2.85, + "learning_rate": 1.54291923216226e-05, + "loss": 0.63, + "step": 13200 + }, + { + "epoch": 2.85, + "eval_loss": 0.6678950190544128, + "eval_runtime": 50.1908, + "eval_samples_per_second": 39.848, + "eval_steps_per_second": 0.638, + "step": 13200 + }, + { + "epoch": 2.85, + "learning_rate": 1.4994567185802244e-05, + "loss": 0.6428, + "step": 13220 + }, + { + "epoch": 2.86, + "learning_rate": 1.4559942049981889e-05, + "loss": 0.645, + "step": 13240 + }, + { + "epoch": 2.86, + "learning_rate": 1.4125316914161534e-05, + "loss": 0.6434, + "step": 13260 + }, + { + "epoch": 2.87, + "learning_rate": 1.369069177834118e-05, + "loss": 0.6462, + "step": 13280 + }, + { + "epoch": 2.87, + "learning_rate": 1.3256066642520825e-05, + "loss": 0.6387, + "step": 13300 + }, + { + "epoch": 2.87, + "learning_rate": 1.2821441506700468e-05, + "loss": 0.6311, + "step": 13320 + }, + { + "epoch": 2.88, + "learning_rate": 1.2386816370880113e-05, + "loss": 0.6446, + "step": 13340 + }, + { + "epoch": 2.88, + "learning_rate": 1.195219123505976e-05, + "loss": 0.6426, + "step": 13360 + }, + { + "epoch": 2.89, + "learning_rate": 1.1517566099239405e-05, + "loss": 0.6369, + "step": 13380 + }, + { + "epoch": 2.89, + "learning_rate": 1.108294096341905e-05, + "loss": 0.6467, + "step": 13400 + }, + { + "epoch": 2.89, + "eval_loss": 0.6676326990127563, + "eval_runtime": 50.1589, + "eval_samples_per_second": 39.873, + "eval_steps_per_second": 0.638, + "step": 13400 + }, + { + "epoch": 2.9, + "learning_rate": 1.0648315827598697e-05, + "loss": 0.6347, + "step": 13420 + }, + { + "epoch": 2.9, + "learning_rate": 1.021369069177834e-05, + "loss": 0.6364, + "step": 13440 + }, + { + "epoch": 2.9, + "learning_rate": 9.779065555957985e-06, + "loss": 0.6309, + "step": 13460 + }, + { + "epoch": 2.91, + "learning_rate": 9.34444042013763e-06, + "loss": 0.6407, + "step": 13480 + }, + { + "epoch": 2.91, + "learning_rate": 8.909815284317276e-06, + "loss": 0.6389, + "step": 13500 + }, + { + "epoch": 2.92, + "learning_rate": 8.475190148496921e-06, + "loss": 0.6378, + "step": 13520 + }, + { + "epoch": 2.92, + "learning_rate": 8.040565012676566e-06, + "loss": 0.6359, + "step": 13540 + }, + { + "epoch": 2.93, + "learning_rate": 7.60593987685621e-06, + "loss": 0.6282, + "step": 13560 + }, + { + "epoch": 2.93, + "learning_rate": 7.171314741035856e-06, + "loss": 0.6409, + "step": 13580 + }, + { + "epoch": 2.93, + "learning_rate": 6.736689605215501e-06, + "loss": 0.6339, + "step": 13600 + }, + { + "epoch": 2.93, + "eval_loss": 0.6675477027893066, + "eval_runtime": 50.3638, + "eval_samples_per_second": 39.711, + "eval_steps_per_second": 0.635, + "step": 13600 + }, + { + "epoch": 2.94, + "learning_rate": 6.302064469395146e-06, + "loss": 0.6306, + "step": 13620 + }, + { + "epoch": 2.94, + "learning_rate": 5.867439333574791e-06, + "loss": 0.6438, + "step": 13640 + }, + { + "epoch": 2.95, + "learning_rate": 5.432814197754437e-06, + "loss": 0.6372, + "step": 13660 + }, + { + "epoch": 2.95, + "learning_rate": 4.9981890619340815e-06, + "loss": 0.6373, + "step": 13680 + }, + { + "epoch": 2.96, + "learning_rate": 4.5635639261137265e-06, + "loss": 0.6441, + "step": 13700 + }, + { + "epoch": 2.96, + "learning_rate": 4.128938790293371e-06, + "loss": 0.6486, + "step": 13720 + }, + { + "epoch": 2.96, + "learning_rate": 3.6943136544730164e-06, + "loss": 0.6359, + "step": 13740 + }, + { + "epoch": 2.97, + "learning_rate": 3.259688518652662e-06, + "loss": 0.6401, + "step": 13760 + }, + { + "epoch": 2.97, + "learning_rate": 2.825063382832307e-06, + "loss": 0.6493, + "step": 13780 + }, + { + "epoch": 2.98, + "learning_rate": 2.390438247011952e-06, + "loss": 0.6368, + "step": 13800 + }, + { + "epoch": 2.98, + "eval_loss": 0.6671983599662781, + "eval_runtime": 50.179, + "eval_samples_per_second": 39.857, + "eval_steps_per_second": 0.638, + "step": 13800 + } + ], + "max_steps": 13905, + "num_train_epochs": 3, + "total_flos": 1.7542324274428117e+20, + "trial_name": null, + "trial_params": null +} diff --git a/adapters/saved-alpaca-belle30b/checkpoint-13800/training_args.bin b/adapters/saved-alpaca-belle30b/checkpoint-13800/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..12b801c69c900b96b8117a2e6bdeacc32be225f4 --- /dev/null +++ b/adapters/saved-alpaca-belle30b/checkpoint-13800/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0e26d4a9526d1384fcaa3dc3df4f56f03c822ab57b4abed652b7156aebfaccc3 +size 3643 diff --git a/adapters/saved-alpaca-belle7b/adapter_config.json b/adapters/saved-alpaca-belle7b/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..6e56f2ae8f10fadfeec6c730ac6b119025824443 --- /dev/null +++ b/adapters/saved-alpaca-belle7b/adapter_config.json @@ -0,0 +1,18 @@ +{ + "base_model_name_or_path": "/mnt/bn/qingyi-bn-lq/llama/llama-7b-hf", + "bias": "none", + "enable_lora": null, + "fan_in_fan_out": false, + "inference_mode": true, + "lora_alpha": 16, + "lora_dropout": 0.05, + "merge_weights": false, + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/adapters/saved-alpaca-belle7b/adapter_model.bin b/adapters/saved-alpaca-belle7b/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..d8540a80a81868134c30173dd0c7d21ccfaad2f3 --- /dev/null +++ b/adapters/saved-alpaca-belle7b/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e886712eaa102351dd321ca543f1301223e6f3fcc19e0fb91c3b5f884114ecfb +size 16822989 diff --git a/adapters/saved-alpaca-belle7b/checkpoint-6400/optimizer.pt b/adapters/saved-alpaca-belle7b/checkpoint-6400/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..da341bc5f250b3957267ba27555fa4cd82f24f25 --- /dev/null +++ b/adapters/saved-alpaca-belle7b/checkpoint-6400/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:20f3a79d48e8cf6d44d450046f596cd9424436c76dcdea6e4dd44e6f671f418a +size 33629893 diff --git a/adapters/saved-alpaca-belle7b/checkpoint-6400/pytorch_model.bin b/adapters/saved-alpaca-belle7b/checkpoint-6400/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..3832b981b98db07141a3cffd85d424b2230af61b --- /dev/null +++ b/adapters/saved-alpaca-belle7b/checkpoint-6400/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c6cae8b69673aae3424511e142ef8bd2495df4810728af6b6420ab1c97b7d739 +size 16822989 diff --git a/adapters/saved-alpaca-belle7b/checkpoint-6400/rng_state_0.pth b/adapters/saved-alpaca-belle7b/checkpoint-6400/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..d75cf1ab0f00ea9f53305f011786d827b5f425e6 --- /dev/null +++ b/adapters/saved-alpaca-belle7b/checkpoint-6400/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:739d31b652593f3799ea8994a74474712959cd79efe6128ae43f7518054f58f6 +size 14583 diff --git a/adapters/saved-alpaca-belle7b/checkpoint-6400/rng_state_1.pth b/adapters/saved-alpaca-belle7b/checkpoint-6400/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..9d306230d78bf0d5f0e9dae329de711a1a95e557 --- /dev/null +++ b/adapters/saved-alpaca-belle7b/checkpoint-6400/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:83a93d150605afa83ed7110ec58322dd8bd7a1482ed83cfbf17c97bc92cc068f +size 14583 diff --git a/adapters/saved-alpaca-belle7b/checkpoint-6400/rng_state_2.pth b/adapters/saved-alpaca-belle7b/checkpoint-6400/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..42998024066b0f81ac5e7b43aa26ff0881acd56a --- /dev/null +++ b/adapters/saved-alpaca-belle7b/checkpoint-6400/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ac1d72f4cf20b04f39c77b807827cf5d21267eec3906a2a711cbf9efd34ce3ad +size 14583 diff --git a/adapters/saved-alpaca-belle7b/checkpoint-6400/rng_state_3.pth b/adapters/saved-alpaca-belle7b/checkpoint-6400/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..c20e6df0960687614f7177f0f849b35daddd7de9 --- /dev/null +++ b/adapters/saved-alpaca-belle7b/checkpoint-6400/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:96667a84944a39c1b7fead5e5352c87156477a12fcbdc007115e5dbd8986412e +size 14583 diff --git a/adapters/saved-alpaca-belle7b/checkpoint-6400/rng_state_4.pth b/adapters/saved-alpaca-belle7b/checkpoint-6400/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..beddecd19707029a74822895530d9b45fef5d689 --- /dev/null +++ b/adapters/saved-alpaca-belle7b/checkpoint-6400/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d3b079ce086d6deaf3e98a8d64eb0e98ed9f14959d10eac8f06622c88b8054f9 +size 14583 diff --git a/adapters/saved-alpaca-belle7b/checkpoint-6400/rng_state_5.pth b/adapters/saved-alpaca-belle7b/checkpoint-6400/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..a2e70646fd57f4420831c602a73c011404a9afd8 --- /dev/null +++ b/adapters/saved-alpaca-belle7b/checkpoint-6400/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:77e19f18c59f5ae311f803df2ad60fb6c5600892105a0f87c65f7897c60be0ab +size 14583 diff --git a/adapters/saved-alpaca-belle7b/checkpoint-6400/rng_state_6.pth b/adapters/saved-alpaca-belle7b/checkpoint-6400/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..f803863c248fe92e9ddb8ea9f74f8abb137b2393 --- /dev/null +++ b/adapters/saved-alpaca-belle7b/checkpoint-6400/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:237aa77bd71e6ed2274646189b7bb923be8bf6dbd6d6457d02ed1b79ac18e412 +size 14583 diff --git a/adapters/saved-alpaca-belle7b/checkpoint-6400/rng_state_7.pth b/adapters/saved-alpaca-belle7b/checkpoint-6400/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..3524332e259f134546ffa3ed4c7b792b9d4aae0e --- /dev/null +++ b/adapters/saved-alpaca-belle7b/checkpoint-6400/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:625752c32e66ef13a62b1f3b709e903f29d38333c81e74d7dd1dd5ea16fb8c88 +size 14583 diff --git a/adapters/saved-alpaca-belle7b/checkpoint-6400/scaler.pt b/adapters/saved-alpaca-belle7b/checkpoint-6400/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..73e0c05c242efd0c1cf6430829dd3275e599eeb0 --- /dev/null +++ b/adapters/saved-alpaca-belle7b/checkpoint-6400/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f443ab621e0e81b4551add2cd7570d8cf383b3a4163b34788cb579b8a803f826 +size 557 diff --git a/adapters/saved-alpaca-belle7b/checkpoint-6400/scheduler.pt b/adapters/saved-alpaca-belle7b/checkpoint-6400/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..91360c203db1c5f1d16d538068068815c3f21087 --- /dev/null +++ b/adapters/saved-alpaca-belle7b/checkpoint-6400/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bdd2bf54bab481e01fb013139e03bd02851b31bd33df8b196e28ce5d1671b791 +size 627 diff --git a/adapters/saved-alpaca-belle7b/checkpoint-6400/trainer_state.json b/adapters/saved-alpaca-belle7b/checkpoint-6400/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..e32822634daa31ecd03f86d8bd89de4f31b97872 --- /dev/null +++ b/adapters/saved-alpaca-belle7b/checkpoint-6400/trainer_state.json @@ -0,0 +1,2192 @@ +{ + "best_metric": 0.7553005218505859, + "best_model_checkpoint": "/mnt/bn/qingyi-bn-lq/llama/saved-alpaca-belle7b/checkpoint-6400", + "epoch": 2.7615965480043148, + "global_step": 6400, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.01, + "learning_rate": 5.9999999999999995e-05, + "loss": 1.8997, + "step": 20 + }, + { + "epoch": 0.02, + "learning_rate": 0.00011999999999999999, + "loss": 1.5651, + "step": 40 + }, + { + "epoch": 0.03, + "learning_rate": 0.00017999999999999998, + "loss": 1.1268, + "step": 60 + }, + { + "epoch": 0.03, + "learning_rate": 0.00023999999999999998, + "loss": 1.041, + "step": 80 + }, + { + "epoch": 0.04, + "learning_rate": 0.0003, + "loss": 1.0094, + "step": 100 + }, + { + "epoch": 0.05, + "learning_rate": 0.000299124215443001, + "loss": 0.9948, + "step": 120 + }, + { + "epoch": 0.06, + "learning_rate": 0.00029824843088600204, + "loss": 0.974, + "step": 140 + }, + { + "epoch": 0.07, + "learning_rate": 0.000297372646329003, + "loss": 0.9594, + "step": 160 + }, + { + "epoch": 0.08, + "learning_rate": 0.00029649686177200406, + "loss": 0.9494, + "step": 180 + }, + { + "epoch": 0.09, + "learning_rate": 0.0002956210772150051, + "loss": 0.9473, + "step": 200 + }, + { + "epoch": 0.09, + "eval_loss": 0.9373907446861267, + "eval_runtime": 9.7698, + "eval_samples_per_second": 204.713, + "eval_steps_per_second": 3.275, + "step": 200 + }, + { + "epoch": 0.09, + "learning_rate": 0.00029474529265800613, + "loss": 0.9377, + "step": 220 + }, + { + "epoch": 0.1, + "learning_rate": 0.0002938695081010071, + "loss": 0.9311, + "step": 240 + }, + { + "epoch": 0.11, + "learning_rate": 0.00029299372354400815, + "loss": 0.9268, + "step": 260 + }, + { + "epoch": 0.12, + "learning_rate": 0.0002921179389870092, + "loss": 0.9145, + "step": 280 + }, + { + "epoch": 0.13, + "learning_rate": 0.0002912421544300102, + "loss": 0.9099, + "step": 300 + }, + { + "epoch": 0.14, + "learning_rate": 0.00029036636987301125, + "loss": 0.9076, + "step": 320 + }, + { + "epoch": 0.15, + "learning_rate": 0.00028949058531601223, + "loss": 0.901, + "step": 340 + }, + { + "epoch": 0.16, + "learning_rate": 0.00028861480075901327, + "loss": 0.901, + "step": 360 + }, + { + "epoch": 0.16, + "learning_rate": 0.0002877390162020143, + "loss": 0.8973, + "step": 380 + }, + { + "epoch": 0.17, + "learning_rate": 0.00028686323164501534, + "loss": 0.8904, + "step": 400 + }, + { + "epoch": 0.17, + "eval_loss": 0.8943666815757751, + "eval_runtime": 9.7563, + "eval_samples_per_second": 204.995, + "eval_steps_per_second": 3.28, + "step": 400 + }, + { + "epoch": 0.18, + "learning_rate": 0.0002859874470880163, + "loss": 0.8905, + "step": 420 + }, + { + "epoch": 0.19, + "learning_rate": 0.00028511166253101735, + "loss": 0.8856, + "step": 440 + }, + { + "epoch": 0.2, + "learning_rate": 0.0002842358779740184, + "loss": 0.883, + "step": 460 + }, + { + "epoch": 0.21, + "learning_rate": 0.00028336009341701937, + "loss": 0.8846, + "step": 480 + }, + { + "epoch": 0.22, + "learning_rate": 0.0002824843088600204, + "loss": 0.8777, + "step": 500 + }, + { + "epoch": 0.22, + "learning_rate": 0.00028160852430302144, + "loss": 0.883, + "step": 520 + }, + { + "epoch": 0.23, + "learning_rate": 0.0002807327397460225, + "loss": 0.8819, + "step": 540 + }, + { + "epoch": 0.24, + "learning_rate": 0.00027985695518902346, + "loss": 0.8761, + "step": 560 + }, + { + "epoch": 0.25, + "learning_rate": 0.0002789811706320245, + "loss": 0.8738, + "step": 580 + }, + { + "epoch": 0.26, + "learning_rate": 0.0002781053860750255, + "loss": 0.8636, + "step": 600 + }, + { + "epoch": 0.26, + "eval_loss": 0.8673094511032104, + "eval_runtime": 9.8373, + "eval_samples_per_second": 203.309, + "eval_steps_per_second": 3.253, + "step": 600 + }, + { + "epoch": 0.27, + "learning_rate": 0.0002772296015180265, + "loss": 0.8627, + "step": 620 + }, + { + "epoch": 0.28, + "learning_rate": 0.00027635381696102754, + "loss": 0.8615, + "step": 640 + }, + { + "epoch": 0.28, + "learning_rate": 0.0002754780324040286, + "loss": 0.8647, + "step": 660 + }, + { + "epoch": 0.29, + "learning_rate": 0.0002746022478470296, + "loss": 0.8635, + "step": 680 + }, + { + "epoch": 0.3, + "learning_rate": 0.0002737264632900306, + "loss": 0.8619, + "step": 700 + }, + { + "epoch": 0.31, + "learning_rate": 0.00027285067873303163, + "loss": 0.8477, + "step": 720 + }, + { + "epoch": 0.32, + "learning_rate": 0.00027197489417603266, + "loss": 0.8564, + "step": 740 + }, + { + "epoch": 0.33, + "learning_rate": 0.0002710991096190337, + "loss": 0.8513, + "step": 760 + }, + { + "epoch": 0.34, + "learning_rate": 0.0002702233250620347, + "loss": 0.855, + "step": 780 + }, + { + "epoch": 0.35, + "learning_rate": 0.0002693475405050357, + "loss": 0.8462, + "step": 800 + }, + { + "epoch": 0.35, + "eval_loss": 0.8500058650970459, + "eval_runtime": 9.7922, + "eval_samples_per_second": 204.244, + "eval_steps_per_second": 3.268, + "step": 800 + }, + { + "epoch": 0.35, + "learning_rate": 0.00026847175594803675, + "loss": 0.852, + "step": 820 + }, + { + "epoch": 0.36, + "learning_rate": 0.0002675959713910378, + "loss": 0.8455, + "step": 840 + }, + { + "epoch": 0.37, + "learning_rate": 0.0002667201868340388, + "loss": 0.8479, + "step": 860 + }, + { + "epoch": 0.38, + "learning_rate": 0.0002658444022770398, + "loss": 0.8423, + "step": 880 + }, + { + "epoch": 0.39, + "learning_rate": 0.00026496861772004084, + "loss": 0.8404, + "step": 900 + }, + { + "epoch": 0.4, + "learning_rate": 0.00026409283316304187, + "loss": 0.8434, + "step": 920 + }, + { + "epoch": 0.41, + "learning_rate": 0.0002632170486060429, + "loss": 0.8371, + "step": 940 + }, + { + "epoch": 0.41, + "learning_rate": 0.0002623412640490439, + "loss": 0.8397, + "step": 960 + }, + { + "epoch": 0.42, + "learning_rate": 0.0002614654794920449, + "loss": 0.8394, + "step": 980 + }, + { + "epoch": 0.43, + "learning_rate": 0.00026058969493504596, + "loss": 0.8368, + "step": 1000 + }, + { + "epoch": 0.43, + "eval_loss": 0.8375310301780701, + "eval_runtime": 9.8151, + "eval_samples_per_second": 203.767, + "eval_steps_per_second": 3.26, + "step": 1000 + }, + { + "epoch": 0.44, + "learning_rate": 0.000259713910378047, + "loss": 0.8376, + "step": 1020 + }, + { + "epoch": 0.45, + "learning_rate": 0.00025883812582104803, + "loss": 0.8307, + "step": 1040 + }, + { + "epoch": 0.46, + "learning_rate": 0.000257962341264049, + "loss": 0.8254, + "step": 1060 + }, + { + "epoch": 0.47, + "learning_rate": 0.00025708655670705005, + "loss": 0.8347, + "step": 1080 + }, + { + "epoch": 0.47, + "learning_rate": 0.0002562107721500511, + "loss": 0.8273, + "step": 1100 + }, + { + "epoch": 0.48, + "learning_rate": 0.0002553349875930521, + "loss": 0.8252, + "step": 1120 + }, + { + "epoch": 0.49, + "learning_rate": 0.0002544592030360531, + "loss": 0.8245, + "step": 1140 + }, + { + "epoch": 0.5, + "learning_rate": 0.00025358341847905413, + "loss": 0.8213, + "step": 1160 + }, + { + "epoch": 0.51, + "learning_rate": 0.00025270763392205517, + "loss": 0.8269, + "step": 1180 + }, + { + "epoch": 0.52, + "learning_rate": 0.0002518318493650562, + "loss": 0.8218, + "step": 1200 + }, + { + "epoch": 0.52, + "eval_loss": 0.8258803486824036, + "eval_runtime": 9.7909, + "eval_samples_per_second": 204.271, + "eval_steps_per_second": 3.268, + "step": 1200 + }, + { + "epoch": 0.53, + "learning_rate": 0.00025095606480805724, + "loss": 0.8141, + "step": 1220 + }, + { + "epoch": 0.54, + "learning_rate": 0.0002500802802510582, + "loss": 0.822, + "step": 1240 + }, + { + "epoch": 0.54, + "learning_rate": 0.00024920449569405925, + "loss": 0.82, + "step": 1260 + }, + { + "epoch": 0.55, + "learning_rate": 0.0002483287111370603, + "loss": 0.8133, + "step": 1280 + }, + { + "epoch": 0.56, + "learning_rate": 0.00024745292658006127, + "loss": 0.8248, + "step": 1300 + }, + { + "epoch": 0.57, + "learning_rate": 0.0002465771420230623, + "loss": 0.814, + "step": 1320 + }, + { + "epoch": 0.58, + "learning_rate": 0.00024570135746606334, + "loss": 0.8157, + "step": 1340 + }, + { + "epoch": 0.59, + "learning_rate": 0.0002448255729090644, + "loss": 0.8109, + "step": 1360 + }, + { + "epoch": 0.6, + "learning_rate": 0.00024394978835206538, + "loss": 0.8142, + "step": 1380 + }, + { + "epoch": 0.6, + "learning_rate": 0.00024307400379506642, + "loss": 0.8107, + "step": 1400 + }, + { + "epoch": 0.6, + "eval_loss": 0.8169026970863342, + "eval_runtime": 9.8327, + "eval_samples_per_second": 203.402, + "eval_steps_per_second": 3.254, + "step": 1400 + }, + { + "epoch": 0.61, + "learning_rate": 0.0002421982192380674, + "loss": 0.8133, + "step": 1420 + }, + { + "epoch": 0.62, + "learning_rate": 0.00024132243468106843, + "loss": 0.8107, + "step": 1440 + }, + { + "epoch": 0.63, + "learning_rate": 0.00024044665012406947, + "loss": 0.8086, + "step": 1460 + }, + { + "epoch": 0.64, + "learning_rate": 0.0002395708655670705, + "loss": 0.8142, + "step": 1480 + }, + { + "epoch": 0.65, + "learning_rate": 0.00023869508101007149, + "loss": 0.8154, + "step": 1500 + }, + { + "epoch": 0.66, + "learning_rate": 0.00023781929645307252, + "loss": 0.8088, + "step": 1520 + }, + { + "epoch": 0.66, + "learning_rate": 0.00023694351189607356, + "loss": 0.8052, + "step": 1540 + }, + { + "epoch": 0.67, + "learning_rate": 0.00023606772733907456, + "loss": 0.8146, + "step": 1560 + }, + { + "epoch": 0.68, + "learning_rate": 0.0002351919427820756, + "loss": 0.802, + "step": 1580 + }, + { + "epoch": 0.69, + "learning_rate": 0.0002343161582250766, + "loss": 0.8115, + "step": 1600 + }, + { + "epoch": 0.69, + "eval_loss": 0.8092362284660339, + "eval_runtime": 9.7925, + "eval_samples_per_second": 204.237, + "eval_steps_per_second": 3.268, + "step": 1600 + }, + { + "epoch": 0.7, + "learning_rate": 0.00023344037366807764, + "loss": 0.7997, + "step": 1620 + }, + { + "epoch": 0.71, + "learning_rate": 0.00023256458911107865, + "loss": 0.807, + "step": 1640 + }, + { + "epoch": 0.72, + "learning_rate": 0.00023168880455407968, + "loss": 0.7951, + "step": 1660 + }, + { + "epoch": 0.72, + "learning_rate": 0.0002308130199970807, + "loss": 0.8027, + "step": 1680 + }, + { + "epoch": 0.73, + "learning_rate": 0.0002299372354400817, + "loss": 0.8119, + "step": 1700 + }, + { + "epoch": 0.74, + "learning_rate": 0.00022906145088308274, + "loss": 0.8024, + "step": 1720 + }, + { + "epoch": 0.75, + "learning_rate": 0.00022818566632608377, + "loss": 0.8043, + "step": 1740 + }, + { + "epoch": 0.76, + "learning_rate": 0.0002273098817690848, + "loss": 0.8005, + "step": 1760 + }, + { + "epoch": 0.77, + "learning_rate": 0.0002264340972120858, + "loss": 0.8024, + "step": 1780 + }, + { + "epoch": 0.78, + "learning_rate": 0.00022555831265508682, + "loss": 0.7949, + "step": 1800 + }, + { + "epoch": 0.78, + "eval_loss": 0.8031176924705505, + "eval_runtime": 9.8716, + "eval_samples_per_second": 202.602, + "eval_steps_per_second": 3.242, + "step": 1800 + }, + { + "epoch": 0.79, + "learning_rate": 0.00022468252809808786, + "loss": 0.7939, + "step": 1820 + }, + { + "epoch": 0.79, + "learning_rate": 0.0002238067435410889, + "loss": 0.7958, + "step": 1840 + }, + { + "epoch": 0.8, + "learning_rate": 0.00022293095898408987, + "loss": 0.8019, + "step": 1860 + }, + { + "epoch": 0.81, + "learning_rate": 0.0002220551744270909, + "loss": 0.7933, + "step": 1880 + }, + { + "epoch": 0.82, + "learning_rate": 0.00022117938987009194, + "loss": 0.7994, + "step": 1900 + }, + { + "epoch": 0.83, + "learning_rate": 0.00022030360531309298, + "loss": 0.7938, + "step": 1920 + }, + { + "epoch": 0.84, + "learning_rate": 0.000219427820756094, + "loss": 0.7914, + "step": 1940 + }, + { + "epoch": 0.85, + "learning_rate": 0.000218552036199095, + "loss": 0.7921, + "step": 1960 + }, + { + "epoch": 0.85, + "learning_rate": 0.00021767625164209603, + "loss": 0.8021, + "step": 1980 + }, + { + "epoch": 0.86, + "learning_rate": 0.00021680046708509704, + "loss": 0.7961, + "step": 2000 + }, + { + "epoch": 0.86, + "eval_loss": 0.7977419495582581, + "eval_runtime": 9.8406, + "eval_samples_per_second": 203.239, + "eval_steps_per_second": 3.252, + "step": 2000 + }, + { + "epoch": 0.87, + "learning_rate": 0.00021592468252809807, + "loss": 0.7969, + "step": 2020 + }, + { + "epoch": 0.88, + "learning_rate": 0.00021504889797109908, + "loss": 0.7942, + "step": 2040 + }, + { + "epoch": 0.89, + "learning_rate": 0.00021417311341410012, + "loss": 0.7882, + "step": 2060 + }, + { + "epoch": 0.9, + "learning_rate": 0.00021329732885710112, + "loss": 0.79, + "step": 2080 + }, + { + "epoch": 0.91, + "learning_rate": 0.00021242154430010216, + "loss": 0.7903, + "step": 2100 + }, + { + "epoch": 0.91, + "learning_rate": 0.0002115457597431032, + "loss": 0.7885, + "step": 2120 + }, + { + "epoch": 0.92, + "learning_rate": 0.0002106699751861042, + "loss": 0.7925, + "step": 2140 + }, + { + "epoch": 0.93, + "learning_rate": 0.0002097941906291052, + "loss": 0.7896, + "step": 2160 + }, + { + "epoch": 0.94, + "learning_rate": 0.00020891840607210625, + "loss": 0.7892, + "step": 2180 + }, + { + "epoch": 0.95, + "learning_rate": 0.00020804262151510728, + "loss": 0.7901, + "step": 2200 + }, + { + "epoch": 0.95, + "eval_loss": 0.7927260994911194, + "eval_runtime": 9.9093, + "eval_samples_per_second": 201.83, + "eval_steps_per_second": 3.229, + "step": 2200 + }, + { + "epoch": 0.96, + "learning_rate": 0.00020716683695810826, + "loss": 0.7937, + "step": 2220 + }, + { + "epoch": 0.97, + "learning_rate": 0.0002062910524011093, + "loss": 0.7904, + "step": 2240 + }, + { + "epoch": 0.98, + "learning_rate": 0.00020541526784411033, + "loss": 0.7886, + "step": 2260 + }, + { + "epoch": 0.98, + "learning_rate": 0.00020453948328711137, + "loss": 0.7837, + "step": 2280 + }, + { + "epoch": 0.99, + "learning_rate": 0.0002036636987301124, + "loss": 0.79, + "step": 2300 + }, + { + "epoch": 1.0, + "learning_rate": 0.00020278791417311338, + "loss": 0.7886, + "step": 2320 + }, + { + "epoch": 1.01, + "learning_rate": 0.00020191212961611442, + "loss": 0.7816, + "step": 2340 + }, + { + "epoch": 1.02, + "learning_rate": 0.00020103634505911545, + "loss": 0.7774, + "step": 2360 + }, + { + "epoch": 1.03, + "learning_rate": 0.00020016056050211646, + "loss": 0.7811, + "step": 2380 + }, + { + "epoch": 1.04, + "learning_rate": 0.00019928477594511747, + "loss": 0.788, + "step": 2400 + }, + { + "epoch": 1.04, + "eval_loss": 0.7887651920318604, + "eval_runtime": 10.1462, + "eval_samples_per_second": 197.119, + "eval_steps_per_second": 3.154, + "step": 2400 + }, + { + "epoch": 1.04, + "learning_rate": 0.0001984089913881185, + "loss": 0.7818, + "step": 2420 + }, + { + "epoch": 1.05, + "learning_rate": 0.00019753320683111954, + "loss": 0.7864, + "step": 2440 + }, + { + "epoch": 1.06, + "learning_rate": 0.00019665742227412055, + "loss": 0.7745, + "step": 2460 + }, + { + "epoch": 1.07, + "learning_rate": 0.00019578163771712158, + "loss": 0.7743, + "step": 2480 + }, + { + "epoch": 1.08, + "learning_rate": 0.0001949058531601226, + "loss": 0.7832, + "step": 2500 + }, + { + "epoch": 1.09, + "learning_rate": 0.0001940300686031236, + "loss": 0.7803, + "step": 2520 + }, + { + "epoch": 1.1, + "learning_rate": 0.00019315428404612463, + "loss": 0.7817, + "step": 2540 + }, + { + "epoch": 1.1, + "learning_rate": 0.00019227849948912567, + "loss": 0.7843, + "step": 2560 + }, + { + "epoch": 1.11, + "learning_rate": 0.00019140271493212668, + "loss": 0.7755, + "step": 2580 + }, + { + "epoch": 1.12, + "learning_rate": 0.00019052693037512769, + "loss": 0.7816, + "step": 2600 + }, + { + "epoch": 1.12, + "eval_loss": 0.7844015955924988, + "eval_runtime": 9.9343, + "eval_samples_per_second": 201.323, + "eval_steps_per_second": 3.221, + "step": 2600 + }, + { + "epoch": 1.13, + "learning_rate": 0.00018965114581812872, + "loss": 0.7753, + "step": 2620 + }, + { + "epoch": 1.14, + "learning_rate": 0.00018877536126112976, + "loss": 0.7782, + "step": 2640 + }, + { + "epoch": 1.15, + "learning_rate": 0.0001878995767041308, + "loss": 0.7776, + "step": 2660 + }, + { + "epoch": 1.16, + "learning_rate": 0.00018702379214713177, + "loss": 0.7773, + "step": 2680 + }, + { + "epoch": 1.17, + "learning_rate": 0.0001861480075901328, + "loss": 0.7771, + "step": 2700 + }, + { + "epoch": 1.17, + "learning_rate": 0.00018527222303313384, + "loss": 0.7762, + "step": 2720 + }, + { + "epoch": 1.18, + "learning_rate": 0.00018439643847613488, + "loss": 0.7694, + "step": 2740 + }, + { + "epoch": 1.19, + "learning_rate": 0.00018352065391913586, + "loss": 0.7789, + "step": 2760 + }, + { + "epoch": 1.2, + "learning_rate": 0.0001826448693621369, + "loss": 0.7808, + "step": 2780 + }, + { + "epoch": 1.21, + "learning_rate": 0.00018176908480513793, + "loss": 0.7743, + "step": 2800 + }, + { + "epoch": 1.21, + "eval_loss": 0.7819436192512512, + "eval_runtime": 9.8637, + "eval_samples_per_second": 202.764, + "eval_steps_per_second": 3.244, + "step": 2800 + }, + { + "epoch": 1.22, + "learning_rate": 0.00018089330024813894, + "loss": 0.7785, + "step": 2820 + }, + { + "epoch": 1.23, + "learning_rate": 0.00018001751569113997, + "loss": 0.7737, + "step": 2840 + }, + { + "epoch": 1.23, + "learning_rate": 0.00017914173113414098, + "loss": 0.7793, + "step": 2860 + }, + { + "epoch": 1.24, + "learning_rate": 0.00017826594657714202, + "loss": 0.7741, + "step": 2880 + }, + { + "epoch": 1.25, + "learning_rate": 0.00017739016202014302, + "loss": 0.7758, + "step": 2900 + }, + { + "epoch": 1.26, + "learning_rate": 0.00017651437746314406, + "loss": 0.7742, + "step": 2920 + }, + { + "epoch": 1.27, + "learning_rate": 0.00017563859290614507, + "loss": 0.7678, + "step": 2940 + }, + { + "epoch": 1.28, + "learning_rate": 0.0001747628083491461, + "loss": 0.7717, + "step": 2960 + }, + { + "epoch": 1.29, + "learning_rate": 0.0001738870237921471, + "loss": 0.7752, + "step": 2980 + }, + { + "epoch": 1.29, + "learning_rate": 0.00017301123923514814, + "loss": 0.7733, + "step": 3000 + }, + { + "epoch": 1.29, + "eval_loss": 0.7789211273193359, + "eval_runtime": 9.8687, + "eval_samples_per_second": 202.66, + "eval_steps_per_second": 3.243, + "step": 3000 + }, + { + "epoch": 1.3, + "learning_rate": 0.00017213545467814918, + "loss": 0.772, + "step": 3020 + }, + { + "epoch": 1.31, + "learning_rate": 0.00017125967012115016, + "loss": 0.7726, + "step": 3040 + }, + { + "epoch": 1.32, + "learning_rate": 0.0001703838855641512, + "loss": 0.7669, + "step": 3060 + }, + { + "epoch": 1.33, + "learning_rate": 0.00016950810100715223, + "loss": 0.7716, + "step": 3080 + }, + { + "epoch": 1.34, + "learning_rate": 0.00016863231645015327, + "loss": 0.7736, + "step": 3100 + }, + { + "epoch": 1.35, + "learning_rate": 0.00016775653189315425, + "loss": 0.7652, + "step": 3120 + }, + { + "epoch": 1.35, + "learning_rate": 0.00016688074733615528, + "loss": 0.7675, + "step": 3140 + }, + { + "epoch": 1.36, + "learning_rate": 0.00016600496277915632, + "loss": 0.7729, + "step": 3160 + }, + { + "epoch": 1.37, + "learning_rate": 0.00016512917822215735, + "loss": 0.7748, + "step": 3180 + }, + { + "epoch": 1.38, + "learning_rate": 0.00016425339366515836, + "loss": 0.7728, + "step": 3200 + }, + { + "epoch": 1.38, + "eval_loss": 0.7762572169303894, + "eval_runtime": 9.9106, + "eval_samples_per_second": 201.803, + "eval_steps_per_second": 3.229, + "step": 3200 + }, + { + "epoch": 1.39, + "learning_rate": 0.00016337760910815937, + "loss": 0.7644, + "step": 3220 + }, + { + "epoch": 1.4, + "learning_rate": 0.0001625018245511604, + "loss": 0.7701, + "step": 3240 + }, + { + "epoch": 1.41, + "learning_rate": 0.00016162603999416144, + "loss": 0.7712, + "step": 3260 + }, + { + "epoch": 1.42, + "learning_rate": 0.00016075025543716245, + "loss": 0.7665, + "step": 3280 + }, + { + "epoch": 1.42, + "learning_rate": 0.00015987447088016346, + "loss": 0.7649, + "step": 3300 + }, + { + "epoch": 1.43, + "learning_rate": 0.0001589986863231645, + "loss": 0.7636, + "step": 3320 + }, + { + "epoch": 1.44, + "learning_rate": 0.0001581229017661655, + "loss": 0.7679, + "step": 3340 + }, + { + "epoch": 1.45, + "learning_rate": 0.00015724711720916653, + "loss": 0.7686, + "step": 3360 + }, + { + "epoch": 1.46, + "learning_rate": 0.00015637133265216757, + "loss": 0.7723, + "step": 3380 + }, + { + "epoch": 1.47, + "learning_rate": 0.00015549554809516858, + "loss": 0.7672, + "step": 3400 + }, + { + "epoch": 1.47, + "eval_loss": 0.7736611366271973, + "eval_runtime": 9.8349, + "eval_samples_per_second": 203.357, + "eval_steps_per_second": 3.254, + "step": 3400 + }, + { + "epoch": 1.48, + "learning_rate": 0.00015461976353816958, + "loss": 0.7633, + "step": 3420 + }, + { + "epoch": 1.48, + "learning_rate": 0.00015374397898117062, + "loss": 0.762, + "step": 3440 + }, + { + "epoch": 1.49, + "learning_rate": 0.00015286819442417166, + "loss": 0.7647, + "step": 3460 + }, + { + "epoch": 1.5, + "learning_rate": 0.00015199240986717264, + "loss": 0.7626, + "step": 3480 + }, + { + "epoch": 1.51, + "learning_rate": 0.00015111662531017367, + "loss": 0.7683, + "step": 3500 + }, + { + "epoch": 1.52, + "learning_rate": 0.0001502408407531747, + "loss": 0.7633, + "step": 3520 + }, + { + "epoch": 1.53, + "learning_rate": 0.00014936505619617571, + "loss": 0.7641, + "step": 3540 + }, + { + "epoch": 1.54, + "learning_rate": 0.00014848927163917675, + "loss": 0.7702, + "step": 3560 + }, + { + "epoch": 1.54, + "learning_rate": 0.00014761348708217778, + "loss": 0.772, + "step": 3580 + }, + { + "epoch": 1.55, + "learning_rate": 0.0001467377025251788, + "loss": 0.7759, + "step": 3600 + }, + { + "epoch": 1.55, + "eval_loss": 0.7713318467140198, + "eval_runtime": 9.9228, + "eval_samples_per_second": 201.556, + "eval_steps_per_second": 3.225, + "step": 3600 + }, + { + "epoch": 1.56, + "learning_rate": 0.00014586191796817983, + "loss": 0.7647, + "step": 3620 + }, + { + "epoch": 1.57, + "learning_rate": 0.00014498613341118084, + "loss": 0.7597, + "step": 3640 + }, + { + "epoch": 1.58, + "learning_rate": 0.00014411034885418187, + "loss": 0.7615, + "step": 3660 + }, + { + "epoch": 1.59, + "learning_rate": 0.00014323456429718288, + "loss": 0.7588, + "step": 3680 + }, + { + "epoch": 1.6, + "learning_rate": 0.00014235877974018391, + "loss": 0.7549, + "step": 3700 + }, + { + "epoch": 1.61, + "learning_rate": 0.00014148299518318492, + "loss": 0.7687, + "step": 3720 + }, + { + "epoch": 1.61, + "learning_rate": 0.00014060721062618596, + "loss": 0.7684, + "step": 3740 + }, + { + "epoch": 1.62, + "learning_rate": 0.00013973142606918697, + "loss": 0.7664, + "step": 3760 + }, + { + "epoch": 1.63, + "learning_rate": 0.000138855641512188, + "loss": 0.7638, + "step": 3780 + }, + { + "epoch": 1.64, + "learning_rate": 0.000137979856955189, + "loss": 0.763, + "step": 3800 + }, + { + "epoch": 1.64, + "eval_loss": 0.7696812748908997, + "eval_runtime": 9.9103, + "eval_samples_per_second": 201.81, + "eval_steps_per_second": 3.229, + "step": 3800 + }, + { + "epoch": 1.65, + "learning_rate": 0.00013710407239819002, + "loss": 0.7594, + "step": 3820 + }, + { + "epoch": 1.66, + "learning_rate": 0.00013622828784119105, + "loss": 0.7579, + "step": 3840 + }, + { + "epoch": 1.67, + "learning_rate": 0.00013535250328419206, + "loss": 0.7651, + "step": 3860 + }, + { + "epoch": 1.67, + "learning_rate": 0.0001344767187271931, + "loss": 0.7665, + "step": 3880 + }, + { + "epoch": 1.68, + "learning_rate": 0.0001336009341701941, + "loss": 0.7595, + "step": 3900 + }, + { + "epoch": 1.69, + "learning_rate": 0.00013272514961319514, + "loss": 0.7635, + "step": 3920 + }, + { + "epoch": 1.7, + "learning_rate": 0.00013184936505619617, + "loss": 0.7599, + "step": 3940 + }, + { + "epoch": 1.71, + "learning_rate": 0.00013097358049919718, + "loss": 0.7581, + "step": 3960 + }, + { + "epoch": 1.72, + "learning_rate": 0.00013009779594219822, + "loss": 0.7514, + "step": 3980 + }, + { + "epoch": 1.73, + "learning_rate": 0.00012922201138519922, + "loss": 0.7597, + "step": 4000 + }, + { + "epoch": 1.73, + "eval_loss": 0.7680906057357788, + "eval_runtime": 10.2283, + "eval_samples_per_second": 195.536, + "eval_steps_per_second": 3.129, + "step": 4000 + }, + { + "epoch": 1.73, + "learning_rate": 0.00012834622682820026, + "loss": 0.7664, + "step": 4020 + }, + { + "epoch": 1.74, + "learning_rate": 0.00012747044227120127, + "loss": 0.7613, + "step": 4040 + }, + { + "epoch": 1.75, + "learning_rate": 0.0001265946577142023, + "loss": 0.759, + "step": 4060 + }, + { + "epoch": 1.76, + "learning_rate": 0.0001257188731572033, + "loss": 0.7609, + "step": 4080 + }, + { + "epoch": 1.77, + "learning_rate": 0.00012484308860020435, + "loss": 0.765, + "step": 4100 + }, + { + "epoch": 1.78, + "learning_rate": 0.00012396730404320538, + "loss": 0.7559, + "step": 4120 + }, + { + "epoch": 1.79, + "learning_rate": 0.0001230915194862064, + "loss": 0.7575, + "step": 4140 + }, + { + "epoch": 1.8, + "learning_rate": 0.0001222157349292074, + "loss": 0.7596, + "step": 4160 + }, + { + "epoch": 1.8, + "learning_rate": 0.00012133995037220842, + "loss": 0.7692, + "step": 4180 + }, + { + "epoch": 1.81, + "learning_rate": 0.00012046416581520945, + "loss": 0.7627, + "step": 4200 + }, + { + "epoch": 1.81, + "eval_loss": 0.7660259008407593, + "eval_runtime": 9.861, + "eval_samples_per_second": 202.82, + "eval_steps_per_second": 3.245, + "step": 4200 + }, + { + "epoch": 1.82, + "learning_rate": 0.00011958838125821046, + "loss": 0.7642, + "step": 4220 + }, + { + "epoch": 1.83, + "learning_rate": 0.0001187125967012115, + "loss": 0.7612, + "step": 4240 + }, + { + "epoch": 1.84, + "learning_rate": 0.0001178368121442125, + "loss": 0.7576, + "step": 4260 + }, + { + "epoch": 1.85, + "learning_rate": 0.00011696102758721354, + "loss": 0.7592, + "step": 4280 + }, + { + "epoch": 1.86, + "learning_rate": 0.00011608524303021456, + "loss": 0.7614, + "step": 4300 + }, + { + "epoch": 1.86, + "learning_rate": 0.00011520945847321558, + "loss": 0.7564, + "step": 4320 + }, + { + "epoch": 1.87, + "learning_rate": 0.0001143336739162166, + "loss": 0.7604, + "step": 4340 + }, + { + "epoch": 1.88, + "learning_rate": 0.00011345788935921761, + "loss": 0.7547, + "step": 4360 + }, + { + "epoch": 1.89, + "learning_rate": 0.00011258210480221865, + "loss": 0.7602, + "step": 4380 + }, + { + "epoch": 1.9, + "learning_rate": 0.00011170632024521966, + "loss": 0.7569, + "step": 4400 + }, + { + "epoch": 1.9, + "eval_loss": 0.764076828956604, + "eval_runtime": 9.9349, + "eval_samples_per_second": 201.31, + "eval_steps_per_second": 3.221, + "step": 4400 + }, + { + "epoch": 1.91, + "learning_rate": 0.00011083053568822069, + "loss": 0.7545, + "step": 4420 + }, + { + "epoch": 1.92, + "learning_rate": 0.0001099547511312217, + "loss": 0.7552, + "step": 4440 + }, + { + "epoch": 1.92, + "learning_rate": 0.00010907896657422273, + "loss": 0.7579, + "step": 4460 + }, + { + "epoch": 1.93, + "learning_rate": 0.00010820318201722376, + "loss": 0.7547, + "step": 4480 + }, + { + "epoch": 1.94, + "learning_rate": 0.00010732739746022478, + "loss": 0.7581, + "step": 4500 + }, + { + "epoch": 1.95, + "learning_rate": 0.0001064516129032258, + "loss": 0.7554, + "step": 4520 + }, + { + "epoch": 1.96, + "learning_rate": 0.00010557582834622682, + "loss": 0.7563, + "step": 4540 + }, + { + "epoch": 1.97, + "learning_rate": 0.00010470004378922784, + "loss": 0.7569, + "step": 4560 + }, + { + "epoch": 1.98, + "learning_rate": 0.00010382425923222885, + "loss": 0.7571, + "step": 4580 + }, + { + "epoch": 1.98, + "learning_rate": 0.00010294847467522989, + "loss": 0.7616, + "step": 4600 + }, + { + "epoch": 1.98, + "eval_loss": 0.7628415822982788, + "eval_runtime": 10.0234, + "eval_samples_per_second": 199.534, + "eval_steps_per_second": 3.193, + "step": 4600 + }, + { + "epoch": 1.99, + "learning_rate": 0.0001020726901182309, + "loss": 0.7587, + "step": 4620 + }, + { + "epoch": 2.0, + "learning_rate": 0.00010119690556123193, + "loss": 0.7536, + "step": 4640 + }, + { + "epoch": 2.01, + "learning_rate": 0.00010032112100423295, + "loss": 0.7575, + "step": 4660 + }, + { + "epoch": 2.02, + "learning_rate": 9.944533644723397e-05, + "loss": 0.7572, + "step": 4680 + }, + { + "epoch": 2.03, + "learning_rate": 9.8569551890235e-05, + "loss": 0.7578, + "step": 4700 + }, + { + "epoch": 2.04, + "learning_rate": 9.769376733323602e-05, + "loss": 0.7545, + "step": 4720 + }, + { + "epoch": 2.05, + "learning_rate": 9.681798277623704e-05, + "loss": 0.7616, + "step": 4740 + }, + { + "epoch": 2.05, + "learning_rate": 9.594219821923806e-05, + "loss": 0.7549, + "step": 4760 + }, + { + "epoch": 2.06, + "learning_rate": 9.506641366223908e-05, + "loss": 0.7571, + "step": 4780 + }, + { + "epoch": 2.07, + "learning_rate": 9.41906291052401e-05, + "loss": 0.7528, + "step": 4800 + }, + { + "epoch": 2.07, + "eval_loss": 0.7622065544128418, + "eval_runtime": 9.9086, + "eval_samples_per_second": 201.845, + "eval_steps_per_second": 3.23, + "step": 4800 + }, + { + "epoch": 2.08, + "learning_rate": 9.331484454824112e-05, + "loss": 0.7561, + "step": 4820 + }, + { + "epoch": 2.09, + "learning_rate": 9.243905999124216e-05, + "loss": 0.7528, + "step": 4840 + }, + { + "epoch": 2.1, + "learning_rate": 9.156327543424317e-05, + "loss": 0.7496, + "step": 4860 + }, + { + "epoch": 2.11, + "learning_rate": 9.06874908772442e-05, + "loss": 0.7594, + "step": 4880 + }, + { + "epoch": 2.11, + "learning_rate": 8.981170632024521e-05, + "loss": 0.7539, + "step": 4900 + }, + { + "epoch": 2.12, + "learning_rate": 8.893592176324623e-05, + "loss": 0.7437, + "step": 4920 + }, + { + "epoch": 2.13, + "learning_rate": 8.806013720624725e-05, + "loss": 0.7475, + "step": 4940 + }, + { + "epoch": 2.14, + "learning_rate": 8.718435264924827e-05, + "loss": 0.75, + "step": 4960 + }, + { + "epoch": 2.15, + "learning_rate": 8.63085680922493e-05, + "loss": 0.7552, + "step": 4980 + }, + { + "epoch": 2.16, + "learning_rate": 8.543278353525032e-05, + "loss": 0.753, + "step": 5000 + }, + { + "epoch": 2.16, + "eval_loss": 0.7612386345863342, + "eval_runtime": 9.9196, + "eval_samples_per_second": 201.621, + "eval_steps_per_second": 3.226, + "step": 5000 + }, + { + "epoch": 2.17, + "learning_rate": 8.455699897825135e-05, + "loss": 0.7546, + "step": 5020 + }, + { + "epoch": 2.17, + "learning_rate": 8.368121442125236e-05, + "loss": 0.7562, + "step": 5040 + }, + { + "epoch": 2.18, + "learning_rate": 8.28054298642534e-05, + "loss": 0.7495, + "step": 5060 + }, + { + "epoch": 2.19, + "learning_rate": 8.19296453072544e-05, + "loss": 0.751, + "step": 5080 + }, + { + "epoch": 2.2, + "learning_rate": 8.105386075025544e-05, + "loss": 0.7459, + "step": 5100 + }, + { + "epoch": 2.21, + "learning_rate": 8.017807619325645e-05, + "loss": 0.7553, + "step": 5120 + }, + { + "epoch": 2.22, + "learning_rate": 7.930229163625748e-05, + "loss": 0.7504, + "step": 5140 + }, + { + "epoch": 2.23, + "learning_rate": 7.842650707925849e-05, + "loss": 0.7565, + "step": 5160 + }, + { + "epoch": 2.24, + "learning_rate": 7.755072252225951e-05, + "loss": 0.7471, + "step": 5180 + }, + { + "epoch": 2.24, + "learning_rate": 7.667493796526055e-05, + "loss": 0.7507, + "step": 5200 + }, + { + "epoch": 2.24, + "eval_loss": 0.7600537538528442, + "eval_runtime": 9.8816, + "eval_samples_per_second": 202.397, + "eval_steps_per_second": 3.238, + "step": 5200 + }, + { + "epoch": 2.25, + "learning_rate": 7.579915340826155e-05, + "loss": 0.7537, + "step": 5220 + }, + { + "epoch": 2.26, + "learning_rate": 7.492336885126258e-05, + "loss": 0.7537, + "step": 5240 + }, + { + "epoch": 2.27, + "learning_rate": 7.40475842942636e-05, + "loss": 0.7516, + "step": 5260 + }, + { + "epoch": 2.28, + "learning_rate": 7.317179973726463e-05, + "loss": 0.7459, + "step": 5280 + }, + { + "epoch": 2.29, + "learning_rate": 7.229601518026565e-05, + "loss": 0.7509, + "step": 5300 + }, + { + "epoch": 2.3, + "learning_rate": 7.142023062326668e-05, + "loss": 0.7519, + "step": 5320 + }, + { + "epoch": 2.3, + "learning_rate": 7.05444460662677e-05, + "loss": 0.7426, + "step": 5340 + }, + { + "epoch": 2.31, + "learning_rate": 6.966866150926872e-05, + "loss": 0.7451, + "step": 5360 + }, + { + "epoch": 2.32, + "learning_rate": 6.879287695226974e-05, + "loss": 0.7493, + "step": 5380 + }, + { + "epoch": 2.33, + "learning_rate": 6.791709239527075e-05, + "loss": 0.748, + "step": 5400 + }, + { + "epoch": 2.33, + "eval_loss": 0.7588484287261963, + "eval_runtime": 9.902, + "eval_samples_per_second": 201.98, + "eval_steps_per_second": 3.232, + "step": 5400 + }, + { + "epoch": 2.34, + "learning_rate": 6.704130783827177e-05, + "loss": 0.75, + "step": 5420 + }, + { + "epoch": 2.35, + "learning_rate": 6.616552328127279e-05, + "loss": 0.7491, + "step": 5440 + }, + { + "epoch": 2.36, + "learning_rate": 6.528973872427383e-05, + "loss": 0.747, + "step": 5460 + }, + { + "epoch": 2.36, + "learning_rate": 6.441395416727485e-05, + "loss": 0.7512, + "step": 5480 + }, + { + "epoch": 2.37, + "learning_rate": 6.353816961027587e-05, + "loss": 0.7557, + "step": 5500 + }, + { + "epoch": 2.38, + "learning_rate": 6.266238505327689e-05, + "loss": 0.7529, + "step": 5520 + }, + { + "epoch": 2.39, + "learning_rate": 6.178660049627791e-05, + "loss": 0.7465, + "step": 5540 + }, + { + "epoch": 2.4, + "learning_rate": 6.091081593927893e-05, + "loss": 0.7462, + "step": 5560 + }, + { + "epoch": 2.41, + "learning_rate": 6.003503138227995e-05, + "loss": 0.7476, + "step": 5580 + }, + { + "epoch": 2.42, + "learning_rate": 5.915924682528097e-05, + "loss": 0.7478, + "step": 5600 + }, + { + "epoch": 2.42, + "eval_loss": 0.7580318450927734, + "eval_runtime": 9.8692, + "eval_samples_per_second": 202.65, + "eval_steps_per_second": 3.242, + "step": 5600 + }, + { + "epoch": 2.43, + "learning_rate": 5.8283462268281994e-05, + "loss": 0.7504, + "step": 5620 + }, + { + "epoch": 2.43, + "learning_rate": 5.740767771128302e-05, + "loss": 0.7491, + "step": 5640 + }, + { + "epoch": 2.44, + "learning_rate": 5.6531893154284043e-05, + "loss": 0.7464, + "step": 5660 + }, + { + "epoch": 2.45, + "learning_rate": 5.5656108597285065e-05, + "loss": 0.7474, + "step": 5680 + }, + { + "epoch": 2.46, + "learning_rate": 5.478032404028609e-05, + "loss": 0.7488, + "step": 5700 + }, + { + "epoch": 2.47, + "learning_rate": 5.390453948328711e-05, + "loss": 0.7484, + "step": 5720 + }, + { + "epoch": 2.48, + "learning_rate": 5.302875492628813e-05, + "loss": 0.7486, + "step": 5740 + }, + { + "epoch": 2.49, + "learning_rate": 5.215297036928915e-05, + "loss": 0.7487, + "step": 5760 + }, + { + "epoch": 2.49, + "learning_rate": 5.127718581229017e-05, + "loss": 0.747, + "step": 5780 + }, + { + "epoch": 2.5, + "learning_rate": 5.040140125529119e-05, + "loss": 0.7511, + "step": 5800 + }, + { + "epoch": 2.5, + "eval_loss": 0.7569240927696228, + "eval_runtime": 9.868, + "eval_samples_per_second": 202.676, + "eval_steps_per_second": 3.243, + "step": 5800 + }, + { + "epoch": 2.51, + "learning_rate": 4.952561669829222e-05, + "loss": 0.7475, + "step": 5820 + }, + { + "epoch": 2.52, + "learning_rate": 4.864983214129324e-05, + "loss": 0.7402, + "step": 5840 + }, + { + "epoch": 2.53, + "learning_rate": 4.777404758429426e-05, + "loss": 0.7497, + "step": 5860 + }, + { + "epoch": 2.54, + "learning_rate": 4.689826302729528e-05, + "loss": 0.7481, + "step": 5880 + }, + { + "epoch": 2.55, + "learning_rate": 4.60224784702963e-05, + "loss": 0.7488, + "step": 5900 + }, + { + "epoch": 2.55, + "learning_rate": 4.5146693913297324e-05, + "loss": 0.747, + "step": 5920 + }, + { + "epoch": 2.56, + "learning_rate": 4.4270909356298346e-05, + "loss": 0.7538, + "step": 5940 + }, + { + "epoch": 2.57, + "learning_rate": 4.339512479929937e-05, + "loss": 0.7465, + "step": 5960 + }, + { + "epoch": 2.58, + "learning_rate": 4.251934024230039e-05, + "loss": 0.7499, + "step": 5980 + }, + { + "epoch": 2.59, + "learning_rate": 4.164355568530142e-05, + "loss": 0.7477, + "step": 6000 + }, + { + "epoch": 2.59, + "eval_loss": 0.7565082907676697, + "eval_runtime": 9.8679, + "eval_samples_per_second": 202.678, + "eval_steps_per_second": 3.243, + "step": 6000 + }, + { + "epoch": 2.6, + "learning_rate": 4.076777112830244e-05, + "loss": 0.7455, + "step": 6020 + }, + { + "epoch": 2.61, + "learning_rate": 3.989198657130346e-05, + "loss": 0.749, + "step": 6040 + }, + { + "epoch": 2.61, + "learning_rate": 3.901620201430448e-05, + "loss": 0.751, + "step": 6060 + }, + { + "epoch": 2.62, + "learning_rate": 3.81404174573055e-05, + "loss": 0.7471, + "step": 6080 + }, + { + "epoch": 2.63, + "learning_rate": 3.726463290030652e-05, + "loss": 0.7473, + "step": 6100 + }, + { + "epoch": 2.64, + "learning_rate": 3.638884834330754e-05, + "loss": 0.7515, + "step": 6120 + }, + { + "epoch": 2.65, + "learning_rate": 3.551306378630857e-05, + "loss": 0.7449, + "step": 6140 + }, + { + "epoch": 2.66, + "learning_rate": 3.463727922930959e-05, + "loss": 0.7433, + "step": 6160 + }, + { + "epoch": 2.67, + "learning_rate": 3.376149467231061e-05, + "loss": 0.7504, + "step": 6180 + }, + { + "epoch": 2.68, + "learning_rate": 3.2885710115311626e-05, + "loss": 0.7508, + "step": 6200 + }, + { + "epoch": 2.68, + "eval_loss": 0.7557815909385681, + "eval_runtime": 9.9145, + "eval_samples_per_second": 201.725, + "eval_steps_per_second": 3.228, + "step": 6200 + }, + { + "epoch": 2.68, + "learning_rate": 3.200992555831265e-05, + "loss": 0.7461, + "step": 6220 + }, + { + "epoch": 2.69, + "learning_rate": 3.1134141001313676e-05, + "loss": 0.7411, + "step": 6240 + }, + { + "epoch": 2.7, + "learning_rate": 3.0258356444314698e-05, + "loss": 0.7461, + "step": 6260 + }, + { + "epoch": 2.71, + "learning_rate": 2.9382571887315716e-05, + "loss": 0.7465, + "step": 6280 + }, + { + "epoch": 2.72, + "learning_rate": 2.8506787330316738e-05, + "loss": 0.7515, + "step": 6300 + }, + { + "epoch": 2.73, + "learning_rate": 2.7631002773317763e-05, + "loss": 0.7444, + "step": 6320 + }, + { + "epoch": 2.74, + "learning_rate": 2.6755218216318784e-05, + "loss": 0.7488, + "step": 6340 + }, + { + "epoch": 2.74, + "learning_rate": 2.5879433659319806e-05, + "loss": 0.7471, + "step": 6360 + }, + { + "epoch": 2.75, + "learning_rate": 2.5003649102320828e-05, + "loss": 0.7494, + "step": 6380 + }, + { + "epoch": 2.76, + "learning_rate": 2.41716537731718e-05, + "loss": 0.7552, + "step": 6400 + }, + { + "epoch": 2.76, + "eval_loss": 0.7553005218505859, + "eval_runtime": 9.9033, + "eval_samples_per_second": 201.953, + "eval_steps_per_second": 3.231, + "step": 6400 + } + ], + "max_steps": 6951, + "num_train_epochs": 3, + "total_flos": 1.6637993191991149e+19, + "trial_name": null, + "trial_params": null +} diff --git a/adapters/saved-alpaca-belle7b/checkpoint-6400/training_args.bin b/adapters/saved-alpaca-belle7b/checkpoint-6400/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..37751b82699e4ce9ea9e31699dc8564113a8dd87 --- /dev/null +++ b/adapters/saved-alpaca-belle7b/checkpoint-6400/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:07526739f82df92f9a9bb721a1bac0aa5b54e880e798fc8e04003c255ebe3f76 +size 3643 diff --git a/adapters/saved-alpaca-belle7b/checkpoint-6600/optimizer.pt b/adapters/saved-alpaca-belle7b/checkpoint-6600/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..03e08a784b68c612ed376b66cfe044c9b8a9e219 --- /dev/null +++ b/adapters/saved-alpaca-belle7b/checkpoint-6600/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1773109de8ccd30fc84f7025fc0fcf56070c6b2d1f7e9a9ddfedf46cae6b254e +size 33629893 diff --git a/adapters/saved-alpaca-belle7b/checkpoint-6600/pytorch_model.bin b/adapters/saved-alpaca-belle7b/checkpoint-6600/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..f83f4d4f924a32237e165beb1ecf0c047480af0a --- /dev/null +++ b/adapters/saved-alpaca-belle7b/checkpoint-6600/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:acfd4c36e85c995f9e13ee5d3e1ed4bc74cb42ca3352a948dde0f400d7f9c9c1 +size 16822989 diff --git a/adapters/saved-alpaca-belle7b/checkpoint-6600/rng_state_0.pth b/adapters/saved-alpaca-belle7b/checkpoint-6600/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..ca7b35b675e2f5f43a6e554b76b43549586bee61 --- /dev/null +++ b/adapters/saved-alpaca-belle7b/checkpoint-6600/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e38f6af0dda4efb932568567054e67d8e0ae510b4a5e4dd84e4e60e5644133c +size 14583 diff --git a/adapters/saved-alpaca-belle7b/checkpoint-6600/rng_state_1.pth b/adapters/saved-alpaca-belle7b/checkpoint-6600/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..6fa645c7bd756abb33e47f96ba021928813e04e3 --- /dev/null +++ b/adapters/saved-alpaca-belle7b/checkpoint-6600/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a8452b228a5b65045b9b84b54ca41823c52313d4fd4bad9da3e4961d0b5bd90f +size 14583 diff --git a/adapters/saved-alpaca-belle7b/checkpoint-6600/rng_state_2.pth b/adapters/saved-alpaca-belle7b/checkpoint-6600/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..a70e6c65bfdfa19793761eb20cefae9df3c2334f --- /dev/null +++ b/adapters/saved-alpaca-belle7b/checkpoint-6600/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e57cdd9dde0edf9251bf40e2e2f215655ba06b466f891c3bd611f546e70f8026 +size 14583 diff --git a/adapters/saved-alpaca-belle7b/checkpoint-6600/rng_state_3.pth b/adapters/saved-alpaca-belle7b/checkpoint-6600/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..8a5f38cbe647f172eb551add473e1d3daa4158ae --- /dev/null +++ b/adapters/saved-alpaca-belle7b/checkpoint-6600/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:423fe7c2b6e920f1db82c4c19eca6adc2ed7d34970e358640150dd24c2822b04 +size 14583 diff --git a/adapters/saved-alpaca-belle7b/checkpoint-6600/rng_state_4.pth b/adapters/saved-alpaca-belle7b/checkpoint-6600/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..ac89f22487835e0b7624da1337a7b7b3f2b28e2f --- /dev/null +++ b/adapters/saved-alpaca-belle7b/checkpoint-6600/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0834110184a81b6d9c91f6e2aa567a4d30544053d485e0730d10b9522cbfee36 +size 14583 diff --git a/adapters/saved-alpaca-belle7b/checkpoint-6600/rng_state_5.pth b/adapters/saved-alpaca-belle7b/checkpoint-6600/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..3fc3104aa1425ab7935b78a2ec0eac3831c7ec7f --- /dev/null +++ b/adapters/saved-alpaca-belle7b/checkpoint-6600/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a90d6a4ef4985fb58e852894139be45be655a50ddf2143b9ea12d8a195ccd14b +size 14583 diff --git a/adapters/saved-alpaca-belle7b/checkpoint-6600/rng_state_6.pth b/adapters/saved-alpaca-belle7b/checkpoint-6600/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..488727651b83bf71417c2b8cdfde029ec4fb82ad --- /dev/null +++ b/adapters/saved-alpaca-belle7b/checkpoint-6600/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9f44ce502d8b74654b16ced8c9211e76b7c0f36fe647f3e8b874d2a907b6b471 +size 14583 diff --git a/adapters/saved-alpaca-belle7b/checkpoint-6600/rng_state_7.pth b/adapters/saved-alpaca-belle7b/checkpoint-6600/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..0414db8e09f94efa45ef8ba8b085505f844db7f0 --- /dev/null +++ b/adapters/saved-alpaca-belle7b/checkpoint-6600/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f607eb3020a8a013e442b459ab398e00acbdec3ccdf5d83433eb2d2007609b9 +size 14583 diff --git a/adapters/saved-alpaca-belle7b/checkpoint-6600/scaler.pt b/adapters/saved-alpaca-belle7b/checkpoint-6600/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..c4627685a44a9c999b115aa38a03e67896b77586 --- /dev/null +++ b/adapters/saved-alpaca-belle7b/checkpoint-6600/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b1db78af08d46e4b49e1f6482c7dc00852ee24dcad65f736caeec043f0e904c8 +size 557 diff --git a/adapters/saved-alpaca-belle7b/checkpoint-6600/scheduler.pt b/adapters/saved-alpaca-belle7b/checkpoint-6600/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..69a065329d2ac02b34b6774dffbf223395c3edca --- /dev/null +++ b/adapters/saved-alpaca-belle7b/checkpoint-6600/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fb265a1458a0bf20053e939d8e967e241ca813629490f14a159c6e431a47166c +size 627 diff --git a/adapters/saved-alpaca-belle7b/checkpoint-6600/trainer_state.json b/adapters/saved-alpaca-belle7b/checkpoint-6600/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..0dcb472afa084eaed3d85df65f94649f2c366db5 --- /dev/null +++ b/adapters/saved-alpaca-belle7b/checkpoint-6600/trainer_state.json @@ -0,0 +1,2260 @@ +{ + "best_metric": 0.7546943426132202, + "best_model_checkpoint": "/mnt/bn/qingyi-bn-lq/llama/saved-alpaca-belle7b/checkpoint-6600", + "epoch": 2.8478964401294498, + "global_step": 6600, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.01, + "learning_rate": 5.9999999999999995e-05, + "loss": 1.8997, + "step": 20 + }, + { + "epoch": 0.02, + "learning_rate": 0.00011999999999999999, + "loss": 1.5651, + "step": 40 + }, + { + "epoch": 0.03, + "learning_rate": 0.00017999999999999998, + "loss": 1.1268, + "step": 60 + }, + { + "epoch": 0.03, + "learning_rate": 0.00023999999999999998, + "loss": 1.041, + "step": 80 + }, + { + "epoch": 0.04, + "learning_rate": 0.0003, + "loss": 1.0094, + "step": 100 + }, + { + "epoch": 0.05, + "learning_rate": 0.000299124215443001, + "loss": 0.9948, + "step": 120 + }, + { + "epoch": 0.06, + "learning_rate": 0.00029824843088600204, + "loss": 0.974, + "step": 140 + }, + { + "epoch": 0.07, + "learning_rate": 0.000297372646329003, + "loss": 0.9594, + "step": 160 + }, + { + "epoch": 0.08, + "learning_rate": 0.00029649686177200406, + "loss": 0.9494, + "step": 180 + }, + { + "epoch": 0.09, + "learning_rate": 0.0002956210772150051, + "loss": 0.9473, + "step": 200 + }, + { + "epoch": 0.09, + "eval_loss": 0.9373907446861267, + "eval_runtime": 9.7698, + "eval_samples_per_second": 204.713, + "eval_steps_per_second": 3.275, + "step": 200 + }, + { + "epoch": 0.09, + "learning_rate": 0.00029474529265800613, + "loss": 0.9377, + "step": 220 + }, + { + "epoch": 0.1, + "learning_rate": 0.0002938695081010071, + "loss": 0.9311, + "step": 240 + }, + { + "epoch": 0.11, + "learning_rate": 0.00029299372354400815, + "loss": 0.9268, + "step": 260 + }, + { + "epoch": 0.12, + "learning_rate": 0.0002921179389870092, + "loss": 0.9145, + "step": 280 + }, + { + "epoch": 0.13, + "learning_rate": 0.0002912421544300102, + "loss": 0.9099, + "step": 300 + }, + { + "epoch": 0.14, + "learning_rate": 0.00029036636987301125, + "loss": 0.9076, + "step": 320 + }, + { + "epoch": 0.15, + "learning_rate": 0.00028949058531601223, + "loss": 0.901, + "step": 340 + }, + { + "epoch": 0.16, + "learning_rate": 0.00028861480075901327, + "loss": 0.901, + "step": 360 + }, + { + "epoch": 0.16, + "learning_rate": 0.0002877390162020143, + "loss": 0.8973, + "step": 380 + }, + { + "epoch": 0.17, + "learning_rate": 0.00028686323164501534, + "loss": 0.8904, + "step": 400 + }, + { + "epoch": 0.17, + "eval_loss": 0.8943666815757751, + "eval_runtime": 9.7563, + "eval_samples_per_second": 204.995, + "eval_steps_per_second": 3.28, + "step": 400 + }, + { + "epoch": 0.18, + "learning_rate": 0.0002859874470880163, + "loss": 0.8905, + "step": 420 + }, + { + "epoch": 0.19, + "learning_rate": 0.00028511166253101735, + "loss": 0.8856, + "step": 440 + }, + { + "epoch": 0.2, + "learning_rate": 0.0002842358779740184, + "loss": 0.883, + "step": 460 + }, + { + "epoch": 0.21, + "learning_rate": 0.00028336009341701937, + "loss": 0.8846, + "step": 480 + }, + { + "epoch": 0.22, + "learning_rate": 0.0002824843088600204, + "loss": 0.8777, + "step": 500 + }, + { + "epoch": 0.22, + "learning_rate": 0.00028160852430302144, + "loss": 0.883, + "step": 520 + }, + { + "epoch": 0.23, + "learning_rate": 0.0002807327397460225, + "loss": 0.8819, + "step": 540 + }, + { + "epoch": 0.24, + "learning_rate": 0.00027985695518902346, + "loss": 0.8761, + "step": 560 + }, + { + "epoch": 0.25, + "learning_rate": 0.0002789811706320245, + "loss": 0.8738, + "step": 580 + }, + { + "epoch": 0.26, + "learning_rate": 0.0002781053860750255, + "loss": 0.8636, + "step": 600 + }, + { + "epoch": 0.26, + "eval_loss": 0.8673094511032104, + "eval_runtime": 9.8373, + "eval_samples_per_second": 203.309, + "eval_steps_per_second": 3.253, + "step": 600 + }, + { + "epoch": 0.27, + "learning_rate": 0.0002772296015180265, + "loss": 0.8627, + "step": 620 + }, + { + "epoch": 0.28, + "learning_rate": 0.00027635381696102754, + "loss": 0.8615, + "step": 640 + }, + { + "epoch": 0.28, + "learning_rate": 0.0002754780324040286, + "loss": 0.8647, + "step": 660 + }, + { + "epoch": 0.29, + "learning_rate": 0.0002746022478470296, + "loss": 0.8635, + "step": 680 + }, + { + "epoch": 0.3, + "learning_rate": 0.0002737264632900306, + "loss": 0.8619, + "step": 700 + }, + { + "epoch": 0.31, + "learning_rate": 0.00027285067873303163, + "loss": 0.8477, + "step": 720 + }, + { + "epoch": 0.32, + "learning_rate": 0.00027197489417603266, + "loss": 0.8564, + "step": 740 + }, + { + "epoch": 0.33, + "learning_rate": 0.0002710991096190337, + "loss": 0.8513, + "step": 760 + }, + { + "epoch": 0.34, + "learning_rate": 0.0002702233250620347, + "loss": 0.855, + "step": 780 + }, + { + "epoch": 0.35, + "learning_rate": 0.0002693475405050357, + "loss": 0.8462, + "step": 800 + }, + { + "epoch": 0.35, + "eval_loss": 0.8500058650970459, + "eval_runtime": 9.7922, + "eval_samples_per_second": 204.244, + "eval_steps_per_second": 3.268, + "step": 800 + }, + { + "epoch": 0.35, + "learning_rate": 0.00026847175594803675, + "loss": 0.852, + "step": 820 + }, + { + "epoch": 0.36, + "learning_rate": 0.0002675959713910378, + "loss": 0.8455, + "step": 840 + }, + { + "epoch": 0.37, + "learning_rate": 0.0002667201868340388, + "loss": 0.8479, + "step": 860 + }, + { + "epoch": 0.38, + "learning_rate": 0.0002658444022770398, + "loss": 0.8423, + "step": 880 + }, + { + "epoch": 0.39, + "learning_rate": 0.00026496861772004084, + "loss": 0.8404, + "step": 900 + }, + { + "epoch": 0.4, + "learning_rate": 0.00026409283316304187, + "loss": 0.8434, + "step": 920 + }, + { + "epoch": 0.41, + "learning_rate": 0.0002632170486060429, + "loss": 0.8371, + "step": 940 + }, + { + "epoch": 0.41, + "learning_rate": 0.0002623412640490439, + "loss": 0.8397, + "step": 960 + }, + { + "epoch": 0.42, + "learning_rate": 0.0002614654794920449, + "loss": 0.8394, + "step": 980 + }, + { + "epoch": 0.43, + "learning_rate": 0.00026058969493504596, + "loss": 0.8368, + "step": 1000 + }, + { + "epoch": 0.43, + "eval_loss": 0.8375310301780701, + "eval_runtime": 9.8151, + "eval_samples_per_second": 203.767, + "eval_steps_per_second": 3.26, + "step": 1000 + }, + { + "epoch": 0.44, + "learning_rate": 0.000259713910378047, + "loss": 0.8376, + "step": 1020 + }, + { + "epoch": 0.45, + "learning_rate": 0.00025883812582104803, + "loss": 0.8307, + "step": 1040 + }, + { + "epoch": 0.46, + "learning_rate": 0.000257962341264049, + "loss": 0.8254, + "step": 1060 + }, + { + "epoch": 0.47, + "learning_rate": 0.00025708655670705005, + "loss": 0.8347, + "step": 1080 + }, + { + "epoch": 0.47, + "learning_rate": 0.0002562107721500511, + "loss": 0.8273, + "step": 1100 + }, + { + "epoch": 0.48, + "learning_rate": 0.0002553349875930521, + "loss": 0.8252, + "step": 1120 + }, + { + "epoch": 0.49, + "learning_rate": 0.0002544592030360531, + "loss": 0.8245, + "step": 1140 + }, + { + "epoch": 0.5, + "learning_rate": 0.00025358341847905413, + "loss": 0.8213, + "step": 1160 + }, + { + "epoch": 0.51, + "learning_rate": 0.00025270763392205517, + "loss": 0.8269, + "step": 1180 + }, + { + "epoch": 0.52, + "learning_rate": 0.0002518318493650562, + "loss": 0.8218, + "step": 1200 + }, + { + "epoch": 0.52, + "eval_loss": 0.8258803486824036, + "eval_runtime": 9.7909, + "eval_samples_per_second": 204.271, + "eval_steps_per_second": 3.268, + "step": 1200 + }, + { + "epoch": 0.53, + "learning_rate": 0.00025095606480805724, + "loss": 0.8141, + "step": 1220 + }, + { + "epoch": 0.54, + "learning_rate": 0.0002500802802510582, + "loss": 0.822, + "step": 1240 + }, + { + "epoch": 0.54, + "learning_rate": 0.00024920449569405925, + "loss": 0.82, + "step": 1260 + }, + { + "epoch": 0.55, + "learning_rate": 0.0002483287111370603, + "loss": 0.8133, + "step": 1280 + }, + { + "epoch": 0.56, + "learning_rate": 0.00024745292658006127, + "loss": 0.8248, + "step": 1300 + }, + { + "epoch": 0.57, + "learning_rate": 0.0002465771420230623, + "loss": 0.814, + "step": 1320 + }, + { + "epoch": 0.58, + "learning_rate": 0.00024570135746606334, + "loss": 0.8157, + "step": 1340 + }, + { + "epoch": 0.59, + "learning_rate": 0.0002448255729090644, + "loss": 0.8109, + "step": 1360 + }, + { + "epoch": 0.6, + "learning_rate": 0.00024394978835206538, + "loss": 0.8142, + "step": 1380 + }, + { + "epoch": 0.6, + "learning_rate": 0.00024307400379506642, + "loss": 0.8107, + "step": 1400 + }, + { + "epoch": 0.6, + "eval_loss": 0.8169026970863342, + "eval_runtime": 9.8327, + "eval_samples_per_second": 203.402, + "eval_steps_per_second": 3.254, + "step": 1400 + }, + { + "epoch": 0.61, + "learning_rate": 0.0002421982192380674, + "loss": 0.8133, + "step": 1420 + }, + { + "epoch": 0.62, + "learning_rate": 0.00024132243468106843, + "loss": 0.8107, + "step": 1440 + }, + { + "epoch": 0.63, + "learning_rate": 0.00024044665012406947, + "loss": 0.8086, + "step": 1460 + }, + { + "epoch": 0.64, + "learning_rate": 0.0002395708655670705, + "loss": 0.8142, + "step": 1480 + }, + { + "epoch": 0.65, + "learning_rate": 0.00023869508101007149, + "loss": 0.8154, + "step": 1500 + }, + { + "epoch": 0.66, + "learning_rate": 0.00023781929645307252, + "loss": 0.8088, + "step": 1520 + }, + { + "epoch": 0.66, + "learning_rate": 0.00023694351189607356, + "loss": 0.8052, + "step": 1540 + }, + { + "epoch": 0.67, + "learning_rate": 0.00023606772733907456, + "loss": 0.8146, + "step": 1560 + }, + { + "epoch": 0.68, + "learning_rate": 0.0002351919427820756, + "loss": 0.802, + "step": 1580 + }, + { + "epoch": 0.69, + "learning_rate": 0.0002343161582250766, + "loss": 0.8115, + "step": 1600 + }, + { + "epoch": 0.69, + "eval_loss": 0.8092362284660339, + "eval_runtime": 9.7925, + "eval_samples_per_second": 204.237, + "eval_steps_per_second": 3.268, + "step": 1600 + }, + { + "epoch": 0.7, + "learning_rate": 0.00023344037366807764, + "loss": 0.7997, + "step": 1620 + }, + { + "epoch": 0.71, + "learning_rate": 0.00023256458911107865, + "loss": 0.807, + "step": 1640 + }, + { + "epoch": 0.72, + "learning_rate": 0.00023168880455407968, + "loss": 0.7951, + "step": 1660 + }, + { + "epoch": 0.72, + "learning_rate": 0.0002308130199970807, + "loss": 0.8027, + "step": 1680 + }, + { + "epoch": 0.73, + "learning_rate": 0.0002299372354400817, + "loss": 0.8119, + "step": 1700 + }, + { + "epoch": 0.74, + "learning_rate": 0.00022906145088308274, + "loss": 0.8024, + "step": 1720 + }, + { + "epoch": 0.75, + "learning_rate": 0.00022818566632608377, + "loss": 0.8043, + "step": 1740 + }, + { + "epoch": 0.76, + "learning_rate": 0.0002273098817690848, + "loss": 0.8005, + "step": 1760 + }, + { + "epoch": 0.77, + "learning_rate": 0.0002264340972120858, + "loss": 0.8024, + "step": 1780 + }, + { + "epoch": 0.78, + "learning_rate": 0.00022555831265508682, + "loss": 0.7949, + "step": 1800 + }, + { + "epoch": 0.78, + "eval_loss": 0.8031176924705505, + "eval_runtime": 9.8716, + "eval_samples_per_second": 202.602, + "eval_steps_per_second": 3.242, + "step": 1800 + }, + { + "epoch": 0.79, + "learning_rate": 0.00022468252809808786, + "loss": 0.7939, + "step": 1820 + }, + { + "epoch": 0.79, + "learning_rate": 0.0002238067435410889, + "loss": 0.7958, + "step": 1840 + }, + { + "epoch": 0.8, + "learning_rate": 0.00022293095898408987, + "loss": 0.8019, + "step": 1860 + }, + { + "epoch": 0.81, + "learning_rate": 0.0002220551744270909, + "loss": 0.7933, + "step": 1880 + }, + { + "epoch": 0.82, + "learning_rate": 0.00022117938987009194, + "loss": 0.7994, + "step": 1900 + }, + { + "epoch": 0.83, + "learning_rate": 0.00022030360531309298, + "loss": 0.7938, + "step": 1920 + }, + { + "epoch": 0.84, + "learning_rate": 0.000219427820756094, + "loss": 0.7914, + "step": 1940 + }, + { + "epoch": 0.85, + "learning_rate": 0.000218552036199095, + "loss": 0.7921, + "step": 1960 + }, + { + "epoch": 0.85, + "learning_rate": 0.00021767625164209603, + "loss": 0.8021, + "step": 1980 + }, + { + "epoch": 0.86, + "learning_rate": 0.00021680046708509704, + "loss": 0.7961, + "step": 2000 + }, + { + "epoch": 0.86, + "eval_loss": 0.7977419495582581, + "eval_runtime": 9.8406, + "eval_samples_per_second": 203.239, + "eval_steps_per_second": 3.252, + "step": 2000 + }, + { + "epoch": 0.87, + "learning_rate": 0.00021592468252809807, + "loss": 0.7969, + "step": 2020 + }, + { + "epoch": 0.88, + "learning_rate": 0.00021504889797109908, + "loss": 0.7942, + "step": 2040 + }, + { + "epoch": 0.89, + "learning_rate": 0.00021417311341410012, + "loss": 0.7882, + "step": 2060 + }, + { + "epoch": 0.9, + "learning_rate": 0.00021329732885710112, + "loss": 0.79, + "step": 2080 + }, + { + "epoch": 0.91, + "learning_rate": 0.00021242154430010216, + "loss": 0.7903, + "step": 2100 + }, + { + "epoch": 0.91, + "learning_rate": 0.0002115457597431032, + "loss": 0.7885, + "step": 2120 + }, + { + "epoch": 0.92, + "learning_rate": 0.0002106699751861042, + "loss": 0.7925, + "step": 2140 + }, + { + "epoch": 0.93, + "learning_rate": 0.0002097941906291052, + "loss": 0.7896, + "step": 2160 + }, + { + "epoch": 0.94, + "learning_rate": 0.00020891840607210625, + "loss": 0.7892, + "step": 2180 + }, + { + "epoch": 0.95, + "learning_rate": 0.00020804262151510728, + "loss": 0.7901, + "step": 2200 + }, + { + "epoch": 0.95, + "eval_loss": 0.7927260994911194, + "eval_runtime": 9.9093, + "eval_samples_per_second": 201.83, + "eval_steps_per_second": 3.229, + "step": 2200 + }, + { + "epoch": 0.96, + "learning_rate": 0.00020716683695810826, + "loss": 0.7937, + "step": 2220 + }, + { + "epoch": 0.97, + "learning_rate": 0.0002062910524011093, + "loss": 0.7904, + "step": 2240 + }, + { + "epoch": 0.98, + "learning_rate": 0.00020541526784411033, + "loss": 0.7886, + "step": 2260 + }, + { + "epoch": 0.98, + "learning_rate": 0.00020453948328711137, + "loss": 0.7837, + "step": 2280 + }, + { + "epoch": 0.99, + "learning_rate": 0.0002036636987301124, + "loss": 0.79, + "step": 2300 + }, + { + "epoch": 1.0, + "learning_rate": 0.00020278791417311338, + "loss": 0.7886, + "step": 2320 + }, + { + "epoch": 1.01, + "learning_rate": 0.00020191212961611442, + "loss": 0.7816, + "step": 2340 + }, + { + "epoch": 1.02, + "learning_rate": 0.00020103634505911545, + "loss": 0.7774, + "step": 2360 + }, + { + "epoch": 1.03, + "learning_rate": 0.00020016056050211646, + "loss": 0.7811, + "step": 2380 + }, + { + "epoch": 1.04, + "learning_rate": 0.00019928477594511747, + "loss": 0.788, + "step": 2400 + }, + { + "epoch": 1.04, + "eval_loss": 0.7887651920318604, + "eval_runtime": 10.1462, + "eval_samples_per_second": 197.119, + "eval_steps_per_second": 3.154, + "step": 2400 + }, + { + "epoch": 1.04, + "learning_rate": 0.0001984089913881185, + "loss": 0.7818, + "step": 2420 + }, + { + "epoch": 1.05, + "learning_rate": 0.00019753320683111954, + "loss": 0.7864, + "step": 2440 + }, + { + "epoch": 1.06, + "learning_rate": 0.00019665742227412055, + "loss": 0.7745, + "step": 2460 + }, + { + "epoch": 1.07, + "learning_rate": 0.00019578163771712158, + "loss": 0.7743, + "step": 2480 + }, + { + "epoch": 1.08, + "learning_rate": 0.0001949058531601226, + "loss": 0.7832, + "step": 2500 + }, + { + "epoch": 1.09, + "learning_rate": 0.0001940300686031236, + "loss": 0.7803, + "step": 2520 + }, + { + "epoch": 1.1, + "learning_rate": 0.00019315428404612463, + "loss": 0.7817, + "step": 2540 + }, + { + "epoch": 1.1, + "learning_rate": 0.00019227849948912567, + "loss": 0.7843, + "step": 2560 + }, + { + "epoch": 1.11, + "learning_rate": 0.00019140271493212668, + "loss": 0.7755, + "step": 2580 + }, + { + "epoch": 1.12, + "learning_rate": 0.00019052693037512769, + "loss": 0.7816, + "step": 2600 + }, + { + "epoch": 1.12, + "eval_loss": 0.7844015955924988, + "eval_runtime": 9.9343, + "eval_samples_per_second": 201.323, + "eval_steps_per_second": 3.221, + "step": 2600 + }, + { + "epoch": 1.13, + "learning_rate": 0.00018965114581812872, + "loss": 0.7753, + "step": 2620 + }, + { + "epoch": 1.14, + "learning_rate": 0.00018877536126112976, + "loss": 0.7782, + "step": 2640 + }, + { + "epoch": 1.15, + "learning_rate": 0.0001878995767041308, + "loss": 0.7776, + "step": 2660 + }, + { + "epoch": 1.16, + "learning_rate": 0.00018702379214713177, + "loss": 0.7773, + "step": 2680 + }, + { + "epoch": 1.17, + "learning_rate": 0.0001861480075901328, + "loss": 0.7771, + "step": 2700 + }, + { + "epoch": 1.17, + "learning_rate": 0.00018527222303313384, + "loss": 0.7762, + "step": 2720 + }, + { + "epoch": 1.18, + "learning_rate": 0.00018439643847613488, + "loss": 0.7694, + "step": 2740 + }, + { + "epoch": 1.19, + "learning_rate": 0.00018352065391913586, + "loss": 0.7789, + "step": 2760 + }, + { + "epoch": 1.2, + "learning_rate": 0.0001826448693621369, + "loss": 0.7808, + "step": 2780 + }, + { + "epoch": 1.21, + "learning_rate": 0.00018176908480513793, + "loss": 0.7743, + "step": 2800 + }, + { + "epoch": 1.21, + "eval_loss": 0.7819436192512512, + "eval_runtime": 9.8637, + "eval_samples_per_second": 202.764, + "eval_steps_per_second": 3.244, + "step": 2800 + }, + { + "epoch": 1.22, + "learning_rate": 0.00018089330024813894, + "loss": 0.7785, + "step": 2820 + }, + { + "epoch": 1.23, + "learning_rate": 0.00018001751569113997, + "loss": 0.7737, + "step": 2840 + }, + { + "epoch": 1.23, + "learning_rate": 0.00017914173113414098, + "loss": 0.7793, + "step": 2860 + }, + { + "epoch": 1.24, + "learning_rate": 0.00017826594657714202, + "loss": 0.7741, + "step": 2880 + }, + { + "epoch": 1.25, + "learning_rate": 0.00017739016202014302, + "loss": 0.7758, + "step": 2900 + }, + { + "epoch": 1.26, + "learning_rate": 0.00017651437746314406, + "loss": 0.7742, + "step": 2920 + }, + { + "epoch": 1.27, + "learning_rate": 0.00017563859290614507, + "loss": 0.7678, + "step": 2940 + }, + { + "epoch": 1.28, + "learning_rate": 0.0001747628083491461, + "loss": 0.7717, + "step": 2960 + }, + { + "epoch": 1.29, + "learning_rate": 0.0001738870237921471, + "loss": 0.7752, + "step": 2980 + }, + { + "epoch": 1.29, + "learning_rate": 0.00017301123923514814, + "loss": 0.7733, + "step": 3000 + }, + { + "epoch": 1.29, + "eval_loss": 0.7789211273193359, + "eval_runtime": 9.8687, + "eval_samples_per_second": 202.66, + "eval_steps_per_second": 3.243, + "step": 3000 + }, + { + "epoch": 1.3, + "learning_rate": 0.00017213545467814918, + "loss": 0.772, + "step": 3020 + }, + { + "epoch": 1.31, + "learning_rate": 0.00017125967012115016, + "loss": 0.7726, + "step": 3040 + }, + { + "epoch": 1.32, + "learning_rate": 0.0001703838855641512, + "loss": 0.7669, + "step": 3060 + }, + { + "epoch": 1.33, + "learning_rate": 0.00016950810100715223, + "loss": 0.7716, + "step": 3080 + }, + { + "epoch": 1.34, + "learning_rate": 0.00016863231645015327, + "loss": 0.7736, + "step": 3100 + }, + { + "epoch": 1.35, + "learning_rate": 0.00016775653189315425, + "loss": 0.7652, + "step": 3120 + }, + { + "epoch": 1.35, + "learning_rate": 0.00016688074733615528, + "loss": 0.7675, + "step": 3140 + }, + { + "epoch": 1.36, + "learning_rate": 0.00016600496277915632, + "loss": 0.7729, + "step": 3160 + }, + { + "epoch": 1.37, + "learning_rate": 0.00016512917822215735, + "loss": 0.7748, + "step": 3180 + }, + { + "epoch": 1.38, + "learning_rate": 0.00016425339366515836, + "loss": 0.7728, + "step": 3200 + }, + { + "epoch": 1.38, + "eval_loss": 0.7762572169303894, + "eval_runtime": 9.9106, + "eval_samples_per_second": 201.803, + "eval_steps_per_second": 3.229, + "step": 3200 + }, + { + "epoch": 1.39, + "learning_rate": 0.00016337760910815937, + "loss": 0.7644, + "step": 3220 + }, + { + "epoch": 1.4, + "learning_rate": 0.0001625018245511604, + "loss": 0.7701, + "step": 3240 + }, + { + "epoch": 1.41, + "learning_rate": 0.00016162603999416144, + "loss": 0.7712, + "step": 3260 + }, + { + "epoch": 1.42, + "learning_rate": 0.00016075025543716245, + "loss": 0.7665, + "step": 3280 + }, + { + "epoch": 1.42, + "learning_rate": 0.00015987447088016346, + "loss": 0.7649, + "step": 3300 + }, + { + "epoch": 1.43, + "learning_rate": 0.0001589986863231645, + "loss": 0.7636, + "step": 3320 + }, + { + "epoch": 1.44, + "learning_rate": 0.0001581229017661655, + "loss": 0.7679, + "step": 3340 + }, + { + "epoch": 1.45, + "learning_rate": 0.00015724711720916653, + "loss": 0.7686, + "step": 3360 + }, + { + "epoch": 1.46, + "learning_rate": 0.00015637133265216757, + "loss": 0.7723, + "step": 3380 + }, + { + "epoch": 1.47, + "learning_rate": 0.00015549554809516858, + "loss": 0.7672, + "step": 3400 + }, + { + "epoch": 1.47, + "eval_loss": 0.7736611366271973, + "eval_runtime": 9.8349, + "eval_samples_per_second": 203.357, + "eval_steps_per_second": 3.254, + "step": 3400 + }, + { + "epoch": 1.48, + "learning_rate": 0.00015461976353816958, + "loss": 0.7633, + "step": 3420 + }, + { + "epoch": 1.48, + "learning_rate": 0.00015374397898117062, + "loss": 0.762, + "step": 3440 + }, + { + "epoch": 1.49, + "learning_rate": 0.00015286819442417166, + "loss": 0.7647, + "step": 3460 + }, + { + "epoch": 1.5, + "learning_rate": 0.00015199240986717264, + "loss": 0.7626, + "step": 3480 + }, + { + "epoch": 1.51, + "learning_rate": 0.00015111662531017367, + "loss": 0.7683, + "step": 3500 + }, + { + "epoch": 1.52, + "learning_rate": 0.0001502408407531747, + "loss": 0.7633, + "step": 3520 + }, + { + "epoch": 1.53, + "learning_rate": 0.00014936505619617571, + "loss": 0.7641, + "step": 3540 + }, + { + "epoch": 1.54, + "learning_rate": 0.00014848927163917675, + "loss": 0.7702, + "step": 3560 + }, + { + "epoch": 1.54, + "learning_rate": 0.00014761348708217778, + "loss": 0.772, + "step": 3580 + }, + { + "epoch": 1.55, + "learning_rate": 0.0001467377025251788, + "loss": 0.7759, + "step": 3600 + }, + { + "epoch": 1.55, + "eval_loss": 0.7713318467140198, + "eval_runtime": 9.9228, + "eval_samples_per_second": 201.556, + "eval_steps_per_second": 3.225, + "step": 3600 + }, + { + "epoch": 1.56, + "learning_rate": 0.00014586191796817983, + "loss": 0.7647, + "step": 3620 + }, + { + "epoch": 1.57, + "learning_rate": 0.00014498613341118084, + "loss": 0.7597, + "step": 3640 + }, + { + "epoch": 1.58, + "learning_rate": 0.00014411034885418187, + "loss": 0.7615, + "step": 3660 + }, + { + "epoch": 1.59, + "learning_rate": 0.00014323456429718288, + "loss": 0.7588, + "step": 3680 + }, + { + "epoch": 1.6, + "learning_rate": 0.00014235877974018391, + "loss": 0.7549, + "step": 3700 + }, + { + "epoch": 1.61, + "learning_rate": 0.00014148299518318492, + "loss": 0.7687, + "step": 3720 + }, + { + "epoch": 1.61, + "learning_rate": 0.00014060721062618596, + "loss": 0.7684, + "step": 3740 + }, + { + "epoch": 1.62, + "learning_rate": 0.00013973142606918697, + "loss": 0.7664, + "step": 3760 + }, + { + "epoch": 1.63, + "learning_rate": 0.000138855641512188, + "loss": 0.7638, + "step": 3780 + }, + { + "epoch": 1.64, + "learning_rate": 0.000137979856955189, + "loss": 0.763, + "step": 3800 + }, + { + "epoch": 1.64, + "eval_loss": 0.7696812748908997, + "eval_runtime": 9.9103, + "eval_samples_per_second": 201.81, + "eval_steps_per_second": 3.229, + "step": 3800 + }, + { + "epoch": 1.65, + "learning_rate": 0.00013710407239819002, + "loss": 0.7594, + "step": 3820 + }, + { + "epoch": 1.66, + "learning_rate": 0.00013622828784119105, + "loss": 0.7579, + "step": 3840 + }, + { + "epoch": 1.67, + "learning_rate": 0.00013535250328419206, + "loss": 0.7651, + "step": 3860 + }, + { + "epoch": 1.67, + "learning_rate": 0.0001344767187271931, + "loss": 0.7665, + "step": 3880 + }, + { + "epoch": 1.68, + "learning_rate": 0.0001336009341701941, + "loss": 0.7595, + "step": 3900 + }, + { + "epoch": 1.69, + "learning_rate": 0.00013272514961319514, + "loss": 0.7635, + "step": 3920 + }, + { + "epoch": 1.7, + "learning_rate": 0.00013184936505619617, + "loss": 0.7599, + "step": 3940 + }, + { + "epoch": 1.71, + "learning_rate": 0.00013097358049919718, + "loss": 0.7581, + "step": 3960 + }, + { + "epoch": 1.72, + "learning_rate": 0.00013009779594219822, + "loss": 0.7514, + "step": 3980 + }, + { + "epoch": 1.73, + "learning_rate": 0.00012922201138519922, + "loss": 0.7597, + "step": 4000 + }, + { + "epoch": 1.73, + "eval_loss": 0.7680906057357788, + "eval_runtime": 10.2283, + "eval_samples_per_second": 195.536, + "eval_steps_per_second": 3.129, + "step": 4000 + }, + { + "epoch": 1.73, + "learning_rate": 0.00012834622682820026, + "loss": 0.7664, + "step": 4020 + }, + { + "epoch": 1.74, + "learning_rate": 0.00012747044227120127, + "loss": 0.7613, + "step": 4040 + }, + { + "epoch": 1.75, + "learning_rate": 0.0001265946577142023, + "loss": 0.759, + "step": 4060 + }, + { + "epoch": 1.76, + "learning_rate": 0.0001257188731572033, + "loss": 0.7609, + "step": 4080 + }, + { + "epoch": 1.77, + "learning_rate": 0.00012484308860020435, + "loss": 0.765, + "step": 4100 + }, + { + "epoch": 1.78, + "learning_rate": 0.00012396730404320538, + "loss": 0.7559, + "step": 4120 + }, + { + "epoch": 1.79, + "learning_rate": 0.0001230915194862064, + "loss": 0.7575, + "step": 4140 + }, + { + "epoch": 1.8, + "learning_rate": 0.0001222157349292074, + "loss": 0.7596, + "step": 4160 + }, + { + "epoch": 1.8, + "learning_rate": 0.00012133995037220842, + "loss": 0.7692, + "step": 4180 + }, + { + "epoch": 1.81, + "learning_rate": 0.00012046416581520945, + "loss": 0.7627, + "step": 4200 + }, + { + "epoch": 1.81, + "eval_loss": 0.7660259008407593, + "eval_runtime": 9.861, + "eval_samples_per_second": 202.82, + "eval_steps_per_second": 3.245, + "step": 4200 + }, + { + "epoch": 1.82, + "learning_rate": 0.00011958838125821046, + "loss": 0.7642, + "step": 4220 + }, + { + "epoch": 1.83, + "learning_rate": 0.0001187125967012115, + "loss": 0.7612, + "step": 4240 + }, + { + "epoch": 1.84, + "learning_rate": 0.0001178368121442125, + "loss": 0.7576, + "step": 4260 + }, + { + "epoch": 1.85, + "learning_rate": 0.00011696102758721354, + "loss": 0.7592, + "step": 4280 + }, + { + "epoch": 1.86, + "learning_rate": 0.00011608524303021456, + "loss": 0.7614, + "step": 4300 + }, + { + "epoch": 1.86, + "learning_rate": 0.00011520945847321558, + "loss": 0.7564, + "step": 4320 + }, + { + "epoch": 1.87, + "learning_rate": 0.0001143336739162166, + "loss": 0.7604, + "step": 4340 + }, + { + "epoch": 1.88, + "learning_rate": 0.00011345788935921761, + "loss": 0.7547, + "step": 4360 + }, + { + "epoch": 1.89, + "learning_rate": 0.00011258210480221865, + "loss": 0.7602, + "step": 4380 + }, + { + "epoch": 1.9, + "learning_rate": 0.00011170632024521966, + "loss": 0.7569, + "step": 4400 + }, + { + "epoch": 1.9, + "eval_loss": 0.764076828956604, + "eval_runtime": 9.9349, + "eval_samples_per_second": 201.31, + "eval_steps_per_second": 3.221, + "step": 4400 + }, + { + "epoch": 1.91, + "learning_rate": 0.00011083053568822069, + "loss": 0.7545, + "step": 4420 + }, + { + "epoch": 1.92, + "learning_rate": 0.0001099547511312217, + "loss": 0.7552, + "step": 4440 + }, + { + "epoch": 1.92, + "learning_rate": 0.00010907896657422273, + "loss": 0.7579, + "step": 4460 + }, + { + "epoch": 1.93, + "learning_rate": 0.00010820318201722376, + "loss": 0.7547, + "step": 4480 + }, + { + "epoch": 1.94, + "learning_rate": 0.00010732739746022478, + "loss": 0.7581, + "step": 4500 + }, + { + "epoch": 1.95, + "learning_rate": 0.0001064516129032258, + "loss": 0.7554, + "step": 4520 + }, + { + "epoch": 1.96, + "learning_rate": 0.00010557582834622682, + "loss": 0.7563, + "step": 4540 + }, + { + "epoch": 1.97, + "learning_rate": 0.00010470004378922784, + "loss": 0.7569, + "step": 4560 + }, + { + "epoch": 1.98, + "learning_rate": 0.00010382425923222885, + "loss": 0.7571, + "step": 4580 + }, + { + "epoch": 1.98, + "learning_rate": 0.00010294847467522989, + "loss": 0.7616, + "step": 4600 + }, + { + "epoch": 1.98, + "eval_loss": 0.7628415822982788, + "eval_runtime": 10.0234, + "eval_samples_per_second": 199.534, + "eval_steps_per_second": 3.193, + "step": 4600 + }, + { + "epoch": 1.99, + "learning_rate": 0.0001020726901182309, + "loss": 0.7587, + "step": 4620 + }, + { + "epoch": 2.0, + "learning_rate": 0.00010119690556123193, + "loss": 0.7536, + "step": 4640 + }, + { + "epoch": 2.01, + "learning_rate": 0.00010032112100423295, + "loss": 0.7575, + "step": 4660 + }, + { + "epoch": 2.02, + "learning_rate": 9.944533644723397e-05, + "loss": 0.7572, + "step": 4680 + }, + { + "epoch": 2.03, + "learning_rate": 9.8569551890235e-05, + "loss": 0.7578, + "step": 4700 + }, + { + "epoch": 2.04, + "learning_rate": 9.769376733323602e-05, + "loss": 0.7545, + "step": 4720 + }, + { + "epoch": 2.05, + "learning_rate": 9.681798277623704e-05, + "loss": 0.7616, + "step": 4740 + }, + { + "epoch": 2.05, + "learning_rate": 9.594219821923806e-05, + "loss": 0.7549, + "step": 4760 + }, + { + "epoch": 2.06, + "learning_rate": 9.506641366223908e-05, + "loss": 0.7571, + "step": 4780 + }, + { + "epoch": 2.07, + "learning_rate": 9.41906291052401e-05, + "loss": 0.7528, + "step": 4800 + }, + { + "epoch": 2.07, + "eval_loss": 0.7622065544128418, + "eval_runtime": 9.9086, + "eval_samples_per_second": 201.845, + "eval_steps_per_second": 3.23, + "step": 4800 + }, + { + "epoch": 2.08, + "learning_rate": 9.331484454824112e-05, + "loss": 0.7561, + "step": 4820 + }, + { + "epoch": 2.09, + "learning_rate": 9.243905999124216e-05, + "loss": 0.7528, + "step": 4840 + }, + { + "epoch": 2.1, + "learning_rate": 9.156327543424317e-05, + "loss": 0.7496, + "step": 4860 + }, + { + "epoch": 2.11, + "learning_rate": 9.06874908772442e-05, + "loss": 0.7594, + "step": 4880 + }, + { + "epoch": 2.11, + "learning_rate": 8.981170632024521e-05, + "loss": 0.7539, + "step": 4900 + }, + { + "epoch": 2.12, + "learning_rate": 8.893592176324623e-05, + "loss": 0.7437, + "step": 4920 + }, + { + "epoch": 2.13, + "learning_rate": 8.806013720624725e-05, + "loss": 0.7475, + "step": 4940 + }, + { + "epoch": 2.14, + "learning_rate": 8.718435264924827e-05, + "loss": 0.75, + "step": 4960 + }, + { + "epoch": 2.15, + "learning_rate": 8.63085680922493e-05, + "loss": 0.7552, + "step": 4980 + }, + { + "epoch": 2.16, + "learning_rate": 8.543278353525032e-05, + "loss": 0.753, + "step": 5000 + }, + { + "epoch": 2.16, + "eval_loss": 0.7612386345863342, + "eval_runtime": 9.9196, + "eval_samples_per_second": 201.621, + "eval_steps_per_second": 3.226, + "step": 5000 + }, + { + "epoch": 2.17, + "learning_rate": 8.455699897825135e-05, + "loss": 0.7546, + "step": 5020 + }, + { + "epoch": 2.17, + "learning_rate": 8.368121442125236e-05, + "loss": 0.7562, + "step": 5040 + }, + { + "epoch": 2.18, + "learning_rate": 8.28054298642534e-05, + "loss": 0.7495, + "step": 5060 + }, + { + "epoch": 2.19, + "learning_rate": 8.19296453072544e-05, + "loss": 0.751, + "step": 5080 + }, + { + "epoch": 2.2, + "learning_rate": 8.105386075025544e-05, + "loss": 0.7459, + "step": 5100 + }, + { + "epoch": 2.21, + "learning_rate": 8.017807619325645e-05, + "loss": 0.7553, + "step": 5120 + }, + { + "epoch": 2.22, + "learning_rate": 7.930229163625748e-05, + "loss": 0.7504, + "step": 5140 + }, + { + "epoch": 2.23, + "learning_rate": 7.842650707925849e-05, + "loss": 0.7565, + "step": 5160 + }, + { + "epoch": 2.24, + "learning_rate": 7.755072252225951e-05, + "loss": 0.7471, + "step": 5180 + }, + { + "epoch": 2.24, + "learning_rate": 7.667493796526055e-05, + "loss": 0.7507, + "step": 5200 + }, + { + "epoch": 2.24, + "eval_loss": 0.7600537538528442, + "eval_runtime": 9.8816, + "eval_samples_per_second": 202.397, + "eval_steps_per_second": 3.238, + "step": 5200 + }, + { + "epoch": 2.25, + "learning_rate": 7.579915340826155e-05, + "loss": 0.7537, + "step": 5220 + }, + { + "epoch": 2.26, + "learning_rate": 7.492336885126258e-05, + "loss": 0.7537, + "step": 5240 + }, + { + "epoch": 2.27, + "learning_rate": 7.40475842942636e-05, + "loss": 0.7516, + "step": 5260 + }, + { + "epoch": 2.28, + "learning_rate": 7.317179973726463e-05, + "loss": 0.7459, + "step": 5280 + }, + { + "epoch": 2.29, + "learning_rate": 7.229601518026565e-05, + "loss": 0.7509, + "step": 5300 + }, + { + "epoch": 2.3, + "learning_rate": 7.142023062326668e-05, + "loss": 0.7519, + "step": 5320 + }, + { + "epoch": 2.3, + "learning_rate": 7.05444460662677e-05, + "loss": 0.7426, + "step": 5340 + }, + { + "epoch": 2.31, + "learning_rate": 6.966866150926872e-05, + "loss": 0.7451, + "step": 5360 + }, + { + "epoch": 2.32, + "learning_rate": 6.879287695226974e-05, + "loss": 0.7493, + "step": 5380 + }, + { + "epoch": 2.33, + "learning_rate": 6.791709239527075e-05, + "loss": 0.748, + "step": 5400 + }, + { + "epoch": 2.33, + "eval_loss": 0.7588484287261963, + "eval_runtime": 9.902, + "eval_samples_per_second": 201.98, + "eval_steps_per_second": 3.232, + "step": 5400 + }, + { + "epoch": 2.34, + "learning_rate": 6.704130783827177e-05, + "loss": 0.75, + "step": 5420 + }, + { + "epoch": 2.35, + "learning_rate": 6.616552328127279e-05, + "loss": 0.7491, + "step": 5440 + }, + { + "epoch": 2.36, + "learning_rate": 6.528973872427383e-05, + "loss": 0.747, + "step": 5460 + }, + { + "epoch": 2.36, + "learning_rate": 6.441395416727485e-05, + "loss": 0.7512, + "step": 5480 + }, + { + "epoch": 2.37, + "learning_rate": 6.353816961027587e-05, + "loss": 0.7557, + "step": 5500 + }, + { + "epoch": 2.38, + "learning_rate": 6.266238505327689e-05, + "loss": 0.7529, + "step": 5520 + }, + { + "epoch": 2.39, + "learning_rate": 6.178660049627791e-05, + "loss": 0.7465, + "step": 5540 + }, + { + "epoch": 2.4, + "learning_rate": 6.091081593927893e-05, + "loss": 0.7462, + "step": 5560 + }, + { + "epoch": 2.41, + "learning_rate": 6.003503138227995e-05, + "loss": 0.7476, + "step": 5580 + }, + { + "epoch": 2.42, + "learning_rate": 5.915924682528097e-05, + "loss": 0.7478, + "step": 5600 + }, + { + "epoch": 2.42, + "eval_loss": 0.7580318450927734, + "eval_runtime": 9.8692, + "eval_samples_per_second": 202.65, + "eval_steps_per_second": 3.242, + "step": 5600 + }, + { + "epoch": 2.43, + "learning_rate": 5.8283462268281994e-05, + "loss": 0.7504, + "step": 5620 + }, + { + "epoch": 2.43, + "learning_rate": 5.740767771128302e-05, + "loss": 0.7491, + "step": 5640 + }, + { + "epoch": 2.44, + "learning_rate": 5.6531893154284043e-05, + "loss": 0.7464, + "step": 5660 + }, + { + "epoch": 2.45, + "learning_rate": 5.5656108597285065e-05, + "loss": 0.7474, + "step": 5680 + }, + { + "epoch": 2.46, + "learning_rate": 5.478032404028609e-05, + "loss": 0.7488, + "step": 5700 + }, + { + "epoch": 2.47, + "learning_rate": 5.390453948328711e-05, + "loss": 0.7484, + "step": 5720 + }, + { + "epoch": 2.48, + "learning_rate": 5.302875492628813e-05, + "loss": 0.7486, + "step": 5740 + }, + { + "epoch": 2.49, + "learning_rate": 5.215297036928915e-05, + "loss": 0.7487, + "step": 5760 + }, + { + "epoch": 2.49, + "learning_rate": 5.127718581229017e-05, + "loss": 0.747, + "step": 5780 + }, + { + "epoch": 2.5, + "learning_rate": 5.040140125529119e-05, + "loss": 0.7511, + "step": 5800 + }, + { + "epoch": 2.5, + "eval_loss": 0.7569240927696228, + "eval_runtime": 9.868, + "eval_samples_per_second": 202.676, + "eval_steps_per_second": 3.243, + "step": 5800 + }, + { + "epoch": 2.51, + "learning_rate": 4.952561669829222e-05, + "loss": 0.7475, + "step": 5820 + }, + { + "epoch": 2.52, + "learning_rate": 4.864983214129324e-05, + "loss": 0.7402, + "step": 5840 + }, + { + "epoch": 2.53, + "learning_rate": 4.777404758429426e-05, + "loss": 0.7497, + "step": 5860 + }, + { + "epoch": 2.54, + "learning_rate": 4.689826302729528e-05, + "loss": 0.7481, + "step": 5880 + }, + { + "epoch": 2.55, + "learning_rate": 4.60224784702963e-05, + "loss": 0.7488, + "step": 5900 + }, + { + "epoch": 2.55, + "learning_rate": 4.5146693913297324e-05, + "loss": 0.747, + "step": 5920 + }, + { + "epoch": 2.56, + "learning_rate": 4.4270909356298346e-05, + "loss": 0.7538, + "step": 5940 + }, + { + "epoch": 2.57, + "learning_rate": 4.339512479929937e-05, + "loss": 0.7465, + "step": 5960 + }, + { + "epoch": 2.58, + "learning_rate": 4.251934024230039e-05, + "loss": 0.7499, + "step": 5980 + }, + { + "epoch": 2.59, + "learning_rate": 4.164355568530142e-05, + "loss": 0.7477, + "step": 6000 + }, + { + "epoch": 2.59, + "eval_loss": 0.7565082907676697, + "eval_runtime": 9.8679, + "eval_samples_per_second": 202.678, + "eval_steps_per_second": 3.243, + "step": 6000 + }, + { + "epoch": 2.6, + "learning_rate": 4.076777112830244e-05, + "loss": 0.7455, + "step": 6020 + }, + { + "epoch": 2.61, + "learning_rate": 3.989198657130346e-05, + "loss": 0.749, + "step": 6040 + }, + { + "epoch": 2.61, + "learning_rate": 3.901620201430448e-05, + "loss": 0.751, + "step": 6060 + }, + { + "epoch": 2.62, + "learning_rate": 3.81404174573055e-05, + "loss": 0.7471, + "step": 6080 + }, + { + "epoch": 2.63, + "learning_rate": 3.726463290030652e-05, + "loss": 0.7473, + "step": 6100 + }, + { + "epoch": 2.64, + "learning_rate": 3.638884834330754e-05, + "loss": 0.7515, + "step": 6120 + }, + { + "epoch": 2.65, + "learning_rate": 3.551306378630857e-05, + "loss": 0.7449, + "step": 6140 + }, + { + "epoch": 2.66, + "learning_rate": 3.463727922930959e-05, + "loss": 0.7433, + "step": 6160 + }, + { + "epoch": 2.67, + "learning_rate": 3.376149467231061e-05, + "loss": 0.7504, + "step": 6180 + }, + { + "epoch": 2.68, + "learning_rate": 3.2885710115311626e-05, + "loss": 0.7508, + "step": 6200 + }, + { + "epoch": 2.68, + "eval_loss": 0.7557815909385681, + "eval_runtime": 9.9145, + "eval_samples_per_second": 201.725, + "eval_steps_per_second": 3.228, + "step": 6200 + }, + { + "epoch": 2.68, + "learning_rate": 3.200992555831265e-05, + "loss": 0.7461, + "step": 6220 + }, + { + "epoch": 2.69, + "learning_rate": 3.1134141001313676e-05, + "loss": 0.7411, + "step": 6240 + }, + { + "epoch": 2.7, + "learning_rate": 3.0258356444314698e-05, + "loss": 0.7461, + "step": 6260 + }, + { + "epoch": 2.71, + "learning_rate": 2.9382571887315716e-05, + "loss": 0.7465, + "step": 6280 + }, + { + "epoch": 2.72, + "learning_rate": 2.8506787330316738e-05, + "loss": 0.7515, + "step": 6300 + }, + { + "epoch": 2.73, + "learning_rate": 2.7631002773317763e-05, + "loss": 0.7444, + "step": 6320 + }, + { + "epoch": 2.74, + "learning_rate": 2.6755218216318784e-05, + "loss": 0.7488, + "step": 6340 + }, + { + "epoch": 2.74, + "learning_rate": 2.5879433659319806e-05, + "loss": 0.7471, + "step": 6360 + }, + { + "epoch": 2.75, + "learning_rate": 2.5003649102320828e-05, + "loss": 0.7494, + "step": 6380 + }, + { + "epoch": 2.76, + "learning_rate": 2.41716537731718e-05, + "loss": 0.7552, + "step": 6400 + }, + { + "epoch": 2.76, + "eval_loss": 0.7553005218505859, + "eval_runtime": 9.9033, + "eval_samples_per_second": 201.953, + "eval_steps_per_second": 3.231, + "step": 6400 + }, + { + "epoch": 2.77, + "learning_rate": 2.329586921617282e-05, + "loss": 0.7425, + "step": 6420 + }, + { + "epoch": 2.78, + "learning_rate": 2.242008465917384e-05, + "loss": 0.744, + "step": 6440 + }, + { + "epoch": 2.79, + "learning_rate": 2.1544300102174863e-05, + "loss": 0.7431, + "step": 6460 + }, + { + "epoch": 2.8, + "learning_rate": 2.0668515545175885e-05, + "loss": 0.7421, + "step": 6480 + }, + { + "epoch": 2.8, + "learning_rate": 1.9792730988176907e-05, + "loss": 0.7481, + "step": 6500 + }, + { + "epoch": 2.81, + "learning_rate": 1.8916946431177928e-05, + "loss": 0.7332, + "step": 6520 + }, + { + "epoch": 2.82, + "learning_rate": 1.804116187417895e-05, + "loss": 0.7462, + "step": 6540 + }, + { + "epoch": 2.83, + "learning_rate": 1.716537731717997e-05, + "loss": 0.7487, + "step": 6560 + }, + { + "epoch": 2.84, + "learning_rate": 1.6289592760180993e-05, + "loss": 0.7493, + "step": 6580 + }, + { + "epoch": 2.85, + "learning_rate": 1.5413808203182015e-05, + "loss": 0.75, + "step": 6600 + }, + { + "epoch": 2.85, + "eval_loss": 0.7546943426132202, + "eval_runtime": 9.8229, + "eval_samples_per_second": 203.606, + "eval_steps_per_second": 3.258, + "step": 6600 + } + ], + "max_steps": 6951, + "num_train_epochs": 3, + "total_flos": 1.715794571241613e+19, + "trial_name": null, + "trial_params": null +} diff --git a/adapters/saved-alpaca-belle7b/checkpoint-6600/training_args.bin b/adapters/saved-alpaca-belle7b/checkpoint-6600/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..37751b82699e4ce9ea9e31699dc8564113a8dd87 --- /dev/null +++ b/adapters/saved-alpaca-belle7b/checkpoint-6600/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:07526739f82df92f9a9bb721a1bac0aa5b54e880e798fc8e04003c255ebe3f76 +size 3643 diff --git a/adapters/saved-alpaca-belle7b/checkpoint-6800/optimizer.pt b/adapters/saved-alpaca-belle7b/checkpoint-6800/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..05845c605caf1e3a03df94f43ee3016a6bab31b4 --- /dev/null +++ b/adapters/saved-alpaca-belle7b/checkpoint-6800/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d98c796cd689386e0d5c6113d69b24d596ecc4efc41a549eae807168bf555980 +size 33629893 diff --git a/adapters/saved-alpaca-belle7b/checkpoint-6800/pytorch_model.bin b/adapters/saved-alpaca-belle7b/checkpoint-6800/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..b5a0292c61a31e7103e863dfcd1cec431a49e74e --- /dev/null +++ b/adapters/saved-alpaca-belle7b/checkpoint-6800/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a5d662aaeb0e859967343aced859f6ada0d3664c2388305e8a9fc5b9f0e30ce1 +size 16822989 diff --git a/adapters/saved-alpaca-belle7b/checkpoint-6800/rng_state_0.pth b/adapters/saved-alpaca-belle7b/checkpoint-6800/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..21b6d70eadf61bd0ccf26de79f698234a3abe723 --- /dev/null +++ b/adapters/saved-alpaca-belle7b/checkpoint-6800/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f36d4f5a1046f41f8f47d4282921ccf2fe6a410d18d232288db18fe3479119a0 +size 14583 diff --git a/adapters/saved-alpaca-belle7b/checkpoint-6800/rng_state_1.pth b/adapters/saved-alpaca-belle7b/checkpoint-6800/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..deb15b6f65677ab9e77012ba1d4713c9b8161a2a --- /dev/null +++ b/adapters/saved-alpaca-belle7b/checkpoint-6800/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ccc906c604c1ef15afdef9b869ef24b228ff84c37984d32f74177f57f1370a8f +size 14583 diff --git a/adapters/saved-alpaca-belle7b/checkpoint-6800/rng_state_2.pth b/adapters/saved-alpaca-belle7b/checkpoint-6800/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..bac3d519fa578c7469f4d5d1503ce22369a02cfd --- /dev/null +++ b/adapters/saved-alpaca-belle7b/checkpoint-6800/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f730bcf510efce6eb96fc8e6938f14a7fb0a91c6162f4bc92b49f3d74718ea47 +size 14583 diff --git a/adapters/saved-alpaca-belle7b/checkpoint-6800/rng_state_3.pth b/adapters/saved-alpaca-belle7b/checkpoint-6800/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..f6295c3f7528a5701b0e66d45326ed1ec6065679 --- /dev/null +++ b/adapters/saved-alpaca-belle7b/checkpoint-6800/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:498551a0ce52469c2f7e478d7fa170f8476e269e02876619d095dda789619912 +size 14583 diff --git a/adapters/saved-alpaca-belle7b/checkpoint-6800/rng_state_4.pth b/adapters/saved-alpaca-belle7b/checkpoint-6800/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..4fb0f7c38ef17c25d9bd7d48c117464e92cbd13e --- /dev/null +++ b/adapters/saved-alpaca-belle7b/checkpoint-6800/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2722f4ed8abd13fb426e0bf8f3f6fd75ab0ffc6c653dd3e3600ebd6be1f96f06 +size 14583 diff --git a/adapters/saved-alpaca-belle7b/checkpoint-6800/rng_state_5.pth b/adapters/saved-alpaca-belle7b/checkpoint-6800/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..489ac72dd953f37042e943edac19143937260674 --- /dev/null +++ b/adapters/saved-alpaca-belle7b/checkpoint-6800/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:27ffb802f41db531520802e02b685b7be7fb39da115748c3fcbdead2a42a8ecb +size 14583 diff --git a/adapters/saved-alpaca-belle7b/checkpoint-6800/rng_state_6.pth b/adapters/saved-alpaca-belle7b/checkpoint-6800/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..af99ff522b2f2d03095f0e0c6dcb81e9e545f205 --- /dev/null +++ b/adapters/saved-alpaca-belle7b/checkpoint-6800/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:33901f2421f03a5d56c3023639676dfaa12fac3526362c6c5119790ce4c70e5b +size 14583 diff --git a/adapters/saved-alpaca-belle7b/checkpoint-6800/rng_state_7.pth b/adapters/saved-alpaca-belle7b/checkpoint-6800/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..48d3a32889feef5e51c963ece11f20aeb47e06e8 --- /dev/null +++ b/adapters/saved-alpaca-belle7b/checkpoint-6800/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:af48f9edb24d769318b0cfd5fcd22f920beaeda47ed5c1c71aa1330b32f283ab +size 14583 diff --git a/adapters/saved-alpaca-belle7b/checkpoint-6800/scaler.pt b/adapters/saved-alpaca-belle7b/checkpoint-6800/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..f5b65abcc1cc251ab54f954b5767e5c7249570ce --- /dev/null +++ b/adapters/saved-alpaca-belle7b/checkpoint-6800/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3d0edd16e32f7fc081b6ff8df79111703a30b6733e1ab2d4376bfa34efcf9b48 +size 557 diff --git a/adapters/saved-alpaca-belle7b/checkpoint-6800/scheduler.pt b/adapters/saved-alpaca-belle7b/checkpoint-6800/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..e7832db575ecd0bd4a2384c889e61d1e6be46ca6 --- /dev/null +++ b/adapters/saved-alpaca-belle7b/checkpoint-6800/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5f3bcb636b9f47ff2d75120ed8c8d96bdbc7a1b5dd3da72497bada51f61c7fe4 +size 627 diff --git a/adapters/saved-alpaca-belle7b/checkpoint-6800/trainer_state.json b/adapters/saved-alpaca-belle7b/checkpoint-6800/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..8febe33433bc5527d1e350b2b38bc7e44ed4d964 --- /dev/null +++ b/adapters/saved-alpaca-belle7b/checkpoint-6800/trainer_state.json @@ -0,0 +1,2328 @@ +{ + "best_metric": 0.7542949318885803, + "best_model_checkpoint": "/mnt/bn/qingyi-bn-lq/llama/saved-alpaca-belle7b/checkpoint-6800", + "epoch": 2.9341963322545848, + "global_step": 6800, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.01, + "learning_rate": 5.9999999999999995e-05, + "loss": 1.8997, + "step": 20 + }, + { + "epoch": 0.02, + "learning_rate": 0.00011999999999999999, + "loss": 1.5651, + "step": 40 + }, + { + "epoch": 0.03, + "learning_rate": 0.00017999999999999998, + "loss": 1.1268, + "step": 60 + }, + { + "epoch": 0.03, + "learning_rate": 0.00023999999999999998, + "loss": 1.041, + "step": 80 + }, + { + "epoch": 0.04, + "learning_rate": 0.0003, + "loss": 1.0094, + "step": 100 + }, + { + "epoch": 0.05, + "learning_rate": 0.000299124215443001, + "loss": 0.9948, + "step": 120 + }, + { + "epoch": 0.06, + "learning_rate": 0.00029824843088600204, + "loss": 0.974, + "step": 140 + }, + { + "epoch": 0.07, + "learning_rate": 0.000297372646329003, + "loss": 0.9594, + "step": 160 + }, + { + "epoch": 0.08, + "learning_rate": 0.00029649686177200406, + "loss": 0.9494, + "step": 180 + }, + { + "epoch": 0.09, + "learning_rate": 0.0002956210772150051, + "loss": 0.9473, + "step": 200 + }, + { + "epoch": 0.09, + "eval_loss": 0.9373907446861267, + "eval_runtime": 9.7698, + "eval_samples_per_second": 204.713, + "eval_steps_per_second": 3.275, + "step": 200 + }, + { + "epoch": 0.09, + "learning_rate": 0.00029474529265800613, + "loss": 0.9377, + "step": 220 + }, + { + "epoch": 0.1, + "learning_rate": 0.0002938695081010071, + "loss": 0.9311, + "step": 240 + }, + { + "epoch": 0.11, + "learning_rate": 0.00029299372354400815, + "loss": 0.9268, + "step": 260 + }, + { + "epoch": 0.12, + "learning_rate": 0.0002921179389870092, + "loss": 0.9145, + "step": 280 + }, + { + "epoch": 0.13, + "learning_rate": 0.0002912421544300102, + "loss": 0.9099, + "step": 300 + }, + { + "epoch": 0.14, + "learning_rate": 0.00029036636987301125, + "loss": 0.9076, + "step": 320 + }, + { + "epoch": 0.15, + "learning_rate": 0.00028949058531601223, + "loss": 0.901, + "step": 340 + }, + { + "epoch": 0.16, + "learning_rate": 0.00028861480075901327, + "loss": 0.901, + "step": 360 + }, + { + "epoch": 0.16, + "learning_rate": 0.0002877390162020143, + "loss": 0.8973, + "step": 380 + }, + { + "epoch": 0.17, + "learning_rate": 0.00028686323164501534, + "loss": 0.8904, + "step": 400 + }, + { + "epoch": 0.17, + "eval_loss": 0.8943666815757751, + "eval_runtime": 9.7563, + "eval_samples_per_second": 204.995, + "eval_steps_per_second": 3.28, + "step": 400 + }, + { + "epoch": 0.18, + "learning_rate": 0.0002859874470880163, + "loss": 0.8905, + "step": 420 + }, + { + "epoch": 0.19, + "learning_rate": 0.00028511166253101735, + "loss": 0.8856, + "step": 440 + }, + { + "epoch": 0.2, + "learning_rate": 0.0002842358779740184, + "loss": 0.883, + "step": 460 + }, + { + "epoch": 0.21, + "learning_rate": 0.00028336009341701937, + "loss": 0.8846, + "step": 480 + }, + { + "epoch": 0.22, + "learning_rate": 0.0002824843088600204, + "loss": 0.8777, + "step": 500 + }, + { + "epoch": 0.22, + "learning_rate": 0.00028160852430302144, + "loss": 0.883, + "step": 520 + }, + { + "epoch": 0.23, + "learning_rate": 0.0002807327397460225, + "loss": 0.8819, + "step": 540 + }, + { + "epoch": 0.24, + "learning_rate": 0.00027985695518902346, + "loss": 0.8761, + "step": 560 + }, + { + "epoch": 0.25, + "learning_rate": 0.0002789811706320245, + "loss": 0.8738, + "step": 580 + }, + { + "epoch": 0.26, + "learning_rate": 0.0002781053860750255, + "loss": 0.8636, + "step": 600 + }, + { + "epoch": 0.26, + "eval_loss": 0.8673094511032104, + "eval_runtime": 9.8373, + "eval_samples_per_second": 203.309, + "eval_steps_per_second": 3.253, + "step": 600 + }, + { + "epoch": 0.27, + "learning_rate": 0.0002772296015180265, + "loss": 0.8627, + "step": 620 + }, + { + "epoch": 0.28, + "learning_rate": 0.00027635381696102754, + "loss": 0.8615, + "step": 640 + }, + { + "epoch": 0.28, + "learning_rate": 0.0002754780324040286, + "loss": 0.8647, + "step": 660 + }, + { + "epoch": 0.29, + "learning_rate": 0.0002746022478470296, + "loss": 0.8635, + "step": 680 + }, + { + "epoch": 0.3, + "learning_rate": 0.0002737264632900306, + "loss": 0.8619, + "step": 700 + }, + { + "epoch": 0.31, + "learning_rate": 0.00027285067873303163, + "loss": 0.8477, + "step": 720 + }, + { + "epoch": 0.32, + "learning_rate": 0.00027197489417603266, + "loss": 0.8564, + "step": 740 + }, + { + "epoch": 0.33, + "learning_rate": 0.0002710991096190337, + "loss": 0.8513, + "step": 760 + }, + { + "epoch": 0.34, + "learning_rate": 0.0002702233250620347, + "loss": 0.855, + "step": 780 + }, + { + "epoch": 0.35, + "learning_rate": 0.0002693475405050357, + "loss": 0.8462, + "step": 800 + }, + { + "epoch": 0.35, + "eval_loss": 0.8500058650970459, + "eval_runtime": 9.7922, + "eval_samples_per_second": 204.244, + "eval_steps_per_second": 3.268, + "step": 800 + }, + { + "epoch": 0.35, + "learning_rate": 0.00026847175594803675, + "loss": 0.852, + "step": 820 + }, + { + "epoch": 0.36, + "learning_rate": 0.0002675959713910378, + "loss": 0.8455, + "step": 840 + }, + { + "epoch": 0.37, + "learning_rate": 0.0002667201868340388, + "loss": 0.8479, + "step": 860 + }, + { + "epoch": 0.38, + "learning_rate": 0.0002658444022770398, + "loss": 0.8423, + "step": 880 + }, + { + "epoch": 0.39, + "learning_rate": 0.00026496861772004084, + "loss": 0.8404, + "step": 900 + }, + { + "epoch": 0.4, + "learning_rate": 0.00026409283316304187, + "loss": 0.8434, + "step": 920 + }, + { + "epoch": 0.41, + "learning_rate": 0.0002632170486060429, + "loss": 0.8371, + "step": 940 + }, + { + "epoch": 0.41, + "learning_rate": 0.0002623412640490439, + "loss": 0.8397, + "step": 960 + }, + { + "epoch": 0.42, + "learning_rate": 0.0002614654794920449, + "loss": 0.8394, + "step": 980 + }, + { + "epoch": 0.43, + "learning_rate": 0.00026058969493504596, + "loss": 0.8368, + "step": 1000 + }, + { + "epoch": 0.43, + "eval_loss": 0.8375310301780701, + "eval_runtime": 9.8151, + "eval_samples_per_second": 203.767, + "eval_steps_per_second": 3.26, + "step": 1000 + }, + { + "epoch": 0.44, + "learning_rate": 0.000259713910378047, + "loss": 0.8376, + "step": 1020 + }, + { + "epoch": 0.45, + "learning_rate": 0.00025883812582104803, + "loss": 0.8307, + "step": 1040 + }, + { + "epoch": 0.46, + "learning_rate": 0.000257962341264049, + "loss": 0.8254, + "step": 1060 + }, + { + "epoch": 0.47, + "learning_rate": 0.00025708655670705005, + "loss": 0.8347, + "step": 1080 + }, + { + "epoch": 0.47, + "learning_rate": 0.0002562107721500511, + "loss": 0.8273, + "step": 1100 + }, + { + "epoch": 0.48, + "learning_rate": 0.0002553349875930521, + "loss": 0.8252, + "step": 1120 + }, + { + "epoch": 0.49, + "learning_rate": 0.0002544592030360531, + "loss": 0.8245, + "step": 1140 + }, + { + "epoch": 0.5, + "learning_rate": 0.00025358341847905413, + "loss": 0.8213, + "step": 1160 + }, + { + "epoch": 0.51, + "learning_rate": 0.00025270763392205517, + "loss": 0.8269, + "step": 1180 + }, + { + "epoch": 0.52, + "learning_rate": 0.0002518318493650562, + "loss": 0.8218, + "step": 1200 + }, + { + "epoch": 0.52, + "eval_loss": 0.8258803486824036, + "eval_runtime": 9.7909, + "eval_samples_per_second": 204.271, + "eval_steps_per_second": 3.268, + "step": 1200 + }, + { + "epoch": 0.53, + "learning_rate": 0.00025095606480805724, + "loss": 0.8141, + "step": 1220 + }, + { + "epoch": 0.54, + "learning_rate": 0.0002500802802510582, + "loss": 0.822, + "step": 1240 + }, + { + "epoch": 0.54, + "learning_rate": 0.00024920449569405925, + "loss": 0.82, + "step": 1260 + }, + { + "epoch": 0.55, + "learning_rate": 0.0002483287111370603, + "loss": 0.8133, + "step": 1280 + }, + { + "epoch": 0.56, + "learning_rate": 0.00024745292658006127, + "loss": 0.8248, + "step": 1300 + }, + { + "epoch": 0.57, + "learning_rate": 0.0002465771420230623, + "loss": 0.814, + "step": 1320 + }, + { + "epoch": 0.58, + "learning_rate": 0.00024570135746606334, + "loss": 0.8157, + "step": 1340 + }, + { + "epoch": 0.59, + "learning_rate": 0.0002448255729090644, + "loss": 0.8109, + "step": 1360 + }, + { + "epoch": 0.6, + "learning_rate": 0.00024394978835206538, + "loss": 0.8142, + "step": 1380 + }, + { + "epoch": 0.6, + "learning_rate": 0.00024307400379506642, + "loss": 0.8107, + "step": 1400 + }, + { + "epoch": 0.6, + "eval_loss": 0.8169026970863342, + "eval_runtime": 9.8327, + "eval_samples_per_second": 203.402, + "eval_steps_per_second": 3.254, + "step": 1400 + }, + { + "epoch": 0.61, + "learning_rate": 0.0002421982192380674, + "loss": 0.8133, + "step": 1420 + }, + { + "epoch": 0.62, + "learning_rate": 0.00024132243468106843, + "loss": 0.8107, + "step": 1440 + }, + { + "epoch": 0.63, + "learning_rate": 0.00024044665012406947, + "loss": 0.8086, + "step": 1460 + }, + { + "epoch": 0.64, + "learning_rate": 0.0002395708655670705, + "loss": 0.8142, + "step": 1480 + }, + { + "epoch": 0.65, + "learning_rate": 0.00023869508101007149, + "loss": 0.8154, + "step": 1500 + }, + { + "epoch": 0.66, + "learning_rate": 0.00023781929645307252, + "loss": 0.8088, + "step": 1520 + }, + { + "epoch": 0.66, + "learning_rate": 0.00023694351189607356, + "loss": 0.8052, + "step": 1540 + }, + { + "epoch": 0.67, + "learning_rate": 0.00023606772733907456, + "loss": 0.8146, + "step": 1560 + }, + { + "epoch": 0.68, + "learning_rate": 0.0002351919427820756, + "loss": 0.802, + "step": 1580 + }, + { + "epoch": 0.69, + "learning_rate": 0.0002343161582250766, + "loss": 0.8115, + "step": 1600 + }, + { + "epoch": 0.69, + "eval_loss": 0.8092362284660339, + "eval_runtime": 9.7925, + "eval_samples_per_second": 204.237, + "eval_steps_per_second": 3.268, + "step": 1600 + }, + { + "epoch": 0.7, + "learning_rate": 0.00023344037366807764, + "loss": 0.7997, + "step": 1620 + }, + { + "epoch": 0.71, + "learning_rate": 0.00023256458911107865, + "loss": 0.807, + "step": 1640 + }, + { + "epoch": 0.72, + "learning_rate": 0.00023168880455407968, + "loss": 0.7951, + "step": 1660 + }, + { + "epoch": 0.72, + "learning_rate": 0.0002308130199970807, + "loss": 0.8027, + "step": 1680 + }, + { + "epoch": 0.73, + "learning_rate": 0.0002299372354400817, + "loss": 0.8119, + "step": 1700 + }, + { + "epoch": 0.74, + "learning_rate": 0.00022906145088308274, + "loss": 0.8024, + "step": 1720 + }, + { + "epoch": 0.75, + "learning_rate": 0.00022818566632608377, + "loss": 0.8043, + "step": 1740 + }, + { + "epoch": 0.76, + "learning_rate": 0.0002273098817690848, + "loss": 0.8005, + "step": 1760 + }, + { + "epoch": 0.77, + "learning_rate": 0.0002264340972120858, + "loss": 0.8024, + "step": 1780 + }, + { + "epoch": 0.78, + "learning_rate": 0.00022555831265508682, + "loss": 0.7949, + "step": 1800 + }, + { + "epoch": 0.78, + "eval_loss": 0.8031176924705505, + "eval_runtime": 9.8716, + "eval_samples_per_second": 202.602, + "eval_steps_per_second": 3.242, + "step": 1800 + }, + { + "epoch": 0.79, + "learning_rate": 0.00022468252809808786, + "loss": 0.7939, + "step": 1820 + }, + { + "epoch": 0.79, + "learning_rate": 0.0002238067435410889, + "loss": 0.7958, + "step": 1840 + }, + { + "epoch": 0.8, + "learning_rate": 0.00022293095898408987, + "loss": 0.8019, + "step": 1860 + }, + { + "epoch": 0.81, + "learning_rate": 0.0002220551744270909, + "loss": 0.7933, + "step": 1880 + }, + { + "epoch": 0.82, + "learning_rate": 0.00022117938987009194, + "loss": 0.7994, + "step": 1900 + }, + { + "epoch": 0.83, + "learning_rate": 0.00022030360531309298, + "loss": 0.7938, + "step": 1920 + }, + { + "epoch": 0.84, + "learning_rate": 0.000219427820756094, + "loss": 0.7914, + "step": 1940 + }, + { + "epoch": 0.85, + "learning_rate": 0.000218552036199095, + "loss": 0.7921, + "step": 1960 + }, + { + "epoch": 0.85, + "learning_rate": 0.00021767625164209603, + "loss": 0.8021, + "step": 1980 + }, + { + "epoch": 0.86, + "learning_rate": 0.00021680046708509704, + "loss": 0.7961, + "step": 2000 + }, + { + "epoch": 0.86, + "eval_loss": 0.7977419495582581, + "eval_runtime": 9.8406, + "eval_samples_per_second": 203.239, + "eval_steps_per_second": 3.252, + "step": 2000 + }, + { + "epoch": 0.87, + "learning_rate": 0.00021592468252809807, + "loss": 0.7969, + "step": 2020 + }, + { + "epoch": 0.88, + "learning_rate": 0.00021504889797109908, + "loss": 0.7942, + "step": 2040 + }, + { + "epoch": 0.89, + "learning_rate": 0.00021417311341410012, + "loss": 0.7882, + "step": 2060 + }, + { + "epoch": 0.9, + "learning_rate": 0.00021329732885710112, + "loss": 0.79, + "step": 2080 + }, + { + "epoch": 0.91, + "learning_rate": 0.00021242154430010216, + "loss": 0.7903, + "step": 2100 + }, + { + "epoch": 0.91, + "learning_rate": 0.0002115457597431032, + "loss": 0.7885, + "step": 2120 + }, + { + "epoch": 0.92, + "learning_rate": 0.0002106699751861042, + "loss": 0.7925, + "step": 2140 + }, + { + "epoch": 0.93, + "learning_rate": 0.0002097941906291052, + "loss": 0.7896, + "step": 2160 + }, + { + "epoch": 0.94, + "learning_rate": 0.00020891840607210625, + "loss": 0.7892, + "step": 2180 + }, + { + "epoch": 0.95, + "learning_rate": 0.00020804262151510728, + "loss": 0.7901, + "step": 2200 + }, + { + "epoch": 0.95, + "eval_loss": 0.7927260994911194, + "eval_runtime": 9.9093, + "eval_samples_per_second": 201.83, + "eval_steps_per_second": 3.229, + "step": 2200 + }, + { + "epoch": 0.96, + "learning_rate": 0.00020716683695810826, + "loss": 0.7937, + "step": 2220 + }, + { + "epoch": 0.97, + "learning_rate": 0.0002062910524011093, + "loss": 0.7904, + "step": 2240 + }, + { + "epoch": 0.98, + "learning_rate": 0.00020541526784411033, + "loss": 0.7886, + "step": 2260 + }, + { + "epoch": 0.98, + "learning_rate": 0.00020453948328711137, + "loss": 0.7837, + "step": 2280 + }, + { + "epoch": 0.99, + "learning_rate": 0.0002036636987301124, + "loss": 0.79, + "step": 2300 + }, + { + "epoch": 1.0, + "learning_rate": 0.00020278791417311338, + "loss": 0.7886, + "step": 2320 + }, + { + "epoch": 1.01, + "learning_rate": 0.00020191212961611442, + "loss": 0.7816, + "step": 2340 + }, + { + "epoch": 1.02, + "learning_rate": 0.00020103634505911545, + "loss": 0.7774, + "step": 2360 + }, + { + "epoch": 1.03, + "learning_rate": 0.00020016056050211646, + "loss": 0.7811, + "step": 2380 + }, + { + "epoch": 1.04, + "learning_rate": 0.00019928477594511747, + "loss": 0.788, + "step": 2400 + }, + { + "epoch": 1.04, + "eval_loss": 0.7887651920318604, + "eval_runtime": 10.1462, + "eval_samples_per_second": 197.119, + "eval_steps_per_second": 3.154, + "step": 2400 + }, + { + "epoch": 1.04, + "learning_rate": 0.0001984089913881185, + "loss": 0.7818, + "step": 2420 + }, + { + "epoch": 1.05, + "learning_rate": 0.00019753320683111954, + "loss": 0.7864, + "step": 2440 + }, + { + "epoch": 1.06, + "learning_rate": 0.00019665742227412055, + "loss": 0.7745, + "step": 2460 + }, + { + "epoch": 1.07, + "learning_rate": 0.00019578163771712158, + "loss": 0.7743, + "step": 2480 + }, + { + "epoch": 1.08, + "learning_rate": 0.0001949058531601226, + "loss": 0.7832, + "step": 2500 + }, + { + "epoch": 1.09, + "learning_rate": 0.0001940300686031236, + "loss": 0.7803, + "step": 2520 + }, + { + "epoch": 1.1, + "learning_rate": 0.00019315428404612463, + "loss": 0.7817, + "step": 2540 + }, + { + "epoch": 1.1, + "learning_rate": 0.00019227849948912567, + "loss": 0.7843, + "step": 2560 + }, + { + "epoch": 1.11, + "learning_rate": 0.00019140271493212668, + "loss": 0.7755, + "step": 2580 + }, + { + "epoch": 1.12, + "learning_rate": 0.00019052693037512769, + "loss": 0.7816, + "step": 2600 + }, + { + "epoch": 1.12, + "eval_loss": 0.7844015955924988, + "eval_runtime": 9.9343, + "eval_samples_per_second": 201.323, + "eval_steps_per_second": 3.221, + "step": 2600 + }, + { + "epoch": 1.13, + "learning_rate": 0.00018965114581812872, + "loss": 0.7753, + "step": 2620 + }, + { + "epoch": 1.14, + "learning_rate": 0.00018877536126112976, + "loss": 0.7782, + "step": 2640 + }, + { + "epoch": 1.15, + "learning_rate": 0.0001878995767041308, + "loss": 0.7776, + "step": 2660 + }, + { + "epoch": 1.16, + "learning_rate": 0.00018702379214713177, + "loss": 0.7773, + "step": 2680 + }, + { + "epoch": 1.17, + "learning_rate": 0.0001861480075901328, + "loss": 0.7771, + "step": 2700 + }, + { + "epoch": 1.17, + "learning_rate": 0.00018527222303313384, + "loss": 0.7762, + "step": 2720 + }, + { + "epoch": 1.18, + "learning_rate": 0.00018439643847613488, + "loss": 0.7694, + "step": 2740 + }, + { + "epoch": 1.19, + "learning_rate": 0.00018352065391913586, + "loss": 0.7789, + "step": 2760 + }, + { + "epoch": 1.2, + "learning_rate": 0.0001826448693621369, + "loss": 0.7808, + "step": 2780 + }, + { + "epoch": 1.21, + "learning_rate": 0.00018176908480513793, + "loss": 0.7743, + "step": 2800 + }, + { + "epoch": 1.21, + "eval_loss": 0.7819436192512512, + "eval_runtime": 9.8637, + "eval_samples_per_second": 202.764, + "eval_steps_per_second": 3.244, + "step": 2800 + }, + { + "epoch": 1.22, + "learning_rate": 0.00018089330024813894, + "loss": 0.7785, + "step": 2820 + }, + { + "epoch": 1.23, + "learning_rate": 0.00018001751569113997, + "loss": 0.7737, + "step": 2840 + }, + { + "epoch": 1.23, + "learning_rate": 0.00017914173113414098, + "loss": 0.7793, + "step": 2860 + }, + { + "epoch": 1.24, + "learning_rate": 0.00017826594657714202, + "loss": 0.7741, + "step": 2880 + }, + { + "epoch": 1.25, + "learning_rate": 0.00017739016202014302, + "loss": 0.7758, + "step": 2900 + }, + { + "epoch": 1.26, + "learning_rate": 0.00017651437746314406, + "loss": 0.7742, + "step": 2920 + }, + { + "epoch": 1.27, + "learning_rate": 0.00017563859290614507, + "loss": 0.7678, + "step": 2940 + }, + { + "epoch": 1.28, + "learning_rate": 0.0001747628083491461, + "loss": 0.7717, + "step": 2960 + }, + { + "epoch": 1.29, + "learning_rate": 0.0001738870237921471, + "loss": 0.7752, + "step": 2980 + }, + { + "epoch": 1.29, + "learning_rate": 0.00017301123923514814, + "loss": 0.7733, + "step": 3000 + }, + { + "epoch": 1.29, + "eval_loss": 0.7789211273193359, + "eval_runtime": 9.8687, + "eval_samples_per_second": 202.66, + "eval_steps_per_second": 3.243, + "step": 3000 + }, + { + "epoch": 1.3, + "learning_rate": 0.00017213545467814918, + "loss": 0.772, + "step": 3020 + }, + { + "epoch": 1.31, + "learning_rate": 0.00017125967012115016, + "loss": 0.7726, + "step": 3040 + }, + { + "epoch": 1.32, + "learning_rate": 0.0001703838855641512, + "loss": 0.7669, + "step": 3060 + }, + { + "epoch": 1.33, + "learning_rate": 0.00016950810100715223, + "loss": 0.7716, + "step": 3080 + }, + { + "epoch": 1.34, + "learning_rate": 0.00016863231645015327, + "loss": 0.7736, + "step": 3100 + }, + { + "epoch": 1.35, + "learning_rate": 0.00016775653189315425, + "loss": 0.7652, + "step": 3120 + }, + { + "epoch": 1.35, + "learning_rate": 0.00016688074733615528, + "loss": 0.7675, + "step": 3140 + }, + { + "epoch": 1.36, + "learning_rate": 0.00016600496277915632, + "loss": 0.7729, + "step": 3160 + }, + { + "epoch": 1.37, + "learning_rate": 0.00016512917822215735, + "loss": 0.7748, + "step": 3180 + }, + { + "epoch": 1.38, + "learning_rate": 0.00016425339366515836, + "loss": 0.7728, + "step": 3200 + }, + { + "epoch": 1.38, + "eval_loss": 0.7762572169303894, + "eval_runtime": 9.9106, + "eval_samples_per_second": 201.803, + "eval_steps_per_second": 3.229, + "step": 3200 + }, + { + "epoch": 1.39, + "learning_rate": 0.00016337760910815937, + "loss": 0.7644, + "step": 3220 + }, + { + "epoch": 1.4, + "learning_rate": 0.0001625018245511604, + "loss": 0.7701, + "step": 3240 + }, + { + "epoch": 1.41, + "learning_rate": 0.00016162603999416144, + "loss": 0.7712, + "step": 3260 + }, + { + "epoch": 1.42, + "learning_rate": 0.00016075025543716245, + "loss": 0.7665, + "step": 3280 + }, + { + "epoch": 1.42, + "learning_rate": 0.00015987447088016346, + "loss": 0.7649, + "step": 3300 + }, + { + "epoch": 1.43, + "learning_rate": 0.0001589986863231645, + "loss": 0.7636, + "step": 3320 + }, + { + "epoch": 1.44, + "learning_rate": 0.0001581229017661655, + "loss": 0.7679, + "step": 3340 + }, + { + "epoch": 1.45, + "learning_rate": 0.00015724711720916653, + "loss": 0.7686, + "step": 3360 + }, + { + "epoch": 1.46, + "learning_rate": 0.00015637133265216757, + "loss": 0.7723, + "step": 3380 + }, + { + "epoch": 1.47, + "learning_rate": 0.00015549554809516858, + "loss": 0.7672, + "step": 3400 + }, + { + "epoch": 1.47, + "eval_loss": 0.7736611366271973, + "eval_runtime": 9.8349, + "eval_samples_per_second": 203.357, + "eval_steps_per_second": 3.254, + "step": 3400 + }, + { + "epoch": 1.48, + "learning_rate": 0.00015461976353816958, + "loss": 0.7633, + "step": 3420 + }, + { + "epoch": 1.48, + "learning_rate": 0.00015374397898117062, + "loss": 0.762, + "step": 3440 + }, + { + "epoch": 1.49, + "learning_rate": 0.00015286819442417166, + "loss": 0.7647, + "step": 3460 + }, + { + "epoch": 1.5, + "learning_rate": 0.00015199240986717264, + "loss": 0.7626, + "step": 3480 + }, + { + "epoch": 1.51, + "learning_rate": 0.00015111662531017367, + "loss": 0.7683, + "step": 3500 + }, + { + "epoch": 1.52, + "learning_rate": 0.0001502408407531747, + "loss": 0.7633, + "step": 3520 + }, + { + "epoch": 1.53, + "learning_rate": 0.00014936505619617571, + "loss": 0.7641, + "step": 3540 + }, + { + "epoch": 1.54, + "learning_rate": 0.00014848927163917675, + "loss": 0.7702, + "step": 3560 + }, + { + "epoch": 1.54, + "learning_rate": 0.00014761348708217778, + "loss": 0.772, + "step": 3580 + }, + { + "epoch": 1.55, + "learning_rate": 0.0001467377025251788, + "loss": 0.7759, + "step": 3600 + }, + { + "epoch": 1.55, + "eval_loss": 0.7713318467140198, + "eval_runtime": 9.9228, + "eval_samples_per_second": 201.556, + "eval_steps_per_second": 3.225, + "step": 3600 + }, + { + "epoch": 1.56, + "learning_rate": 0.00014586191796817983, + "loss": 0.7647, + "step": 3620 + }, + { + "epoch": 1.57, + "learning_rate": 0.00014498613341118084, + "loss": 0.7597, + "step": 3640 + }, + { + "epoch": 1.58, + "learning_rate": 0.00014411034885418187, + "loss": 0.7615, + "step": 3660 + }, + { + "epoch": 1.59, + "learning_rate": 0.00014323456429718288, + "loss": 0.7588, + "step": 3680 + }, + { + "epoch": 1.6, + "learning_rate": 0.00014235877974018391, + "loss": 0.7549, + "step": 3700 + }, + { + "epoch": 1.61, + "learning_rate": 0.00014148299518318492, + "loss": 0.7687, + "step": 3720 + }, + { + "epoch": 1.61, + "learning_rate": 0.00014060721062618596, + "loss": 0.7684, + "step": 3740 + }, + { + "epoch": 1.62, + "learning_rate": 0.00013973142606918697, + "loss": 0.7664, + "step": 3760 + }, + { + "epoch": 1.63, + "learning_rate": 0.000138855641512188, + "loss": 0.7638, + "step": 3780 + }, + { + "epoch": 1.64, + "learning_rate": 0.000137979856955189, + "loss": 0.763, + "step": 3800 + }, + { + "epoch": 1.64, + "eval_loss": 0.7696812748908997, + "eval_runtime": 9.9103, + "eval_samples_per_second": 201.81, + "eval_steps_per_second": 3.229, + "step": 3800 + }, + { + "epoch": 1.65, + "learning_rate": 0.00013710407239819002, + "loss": 0.7594, + "step": 3820 + }, + { + "epoch": 1.66, + "learning_rate": 0.00013622828784119105, + "loss": 0.7579, + "step": 3840 + }, + { + "epoch": 1.67, + "learning_rate": 0.00013535250328419206, + "loss": 0.7651, + "step": 3860 + }, + { + "epoch": 1.67, + "learning_rate": 0.0001344767187271931, + "loss": 0.7665, + "step": 3880 + }, + { + "epoch": 1.68, + "learning_rate": 0.0001336009341701941, + "loss": 0.7595, + "step": 3900 + }, + { + "epoch": 1.69, + "learning_rate": 0.00013272514961319514, + "loss": 0.7635, + "step": 3920 + }, + { + "epoch": 1.7, + "learning_rate": 0.00013184936505619617, + "loss": 0.7599, + "step": 3940 + }, + { + "epoch": 1.71, + "learning_rate": 0.00013097358049919718, + "loss": 0.7581, + "step": 3960 + }, + { + "epoch": 1.72, + "learning_rate": 0.00013009779594219822, + "loss": 0.7514, + "step": 3980 + }, + { + "epoch": 1.73, + "learning_rate": 0.00012922201138519922, + "loss": 0.7597, + "step": 4000 + }, + { + "epoch": 1.73, + "eval_loss": 0.7680906057357788, + "eval_runtime": 10.2283, + "eval_samples_per_second": 195.536, + "eval_steps_per_second": 3.129, + "step": 4000 + }, + { + "epoch": 1.73, + "learning_rate": 0.00012834622682820026, + "loss": 0.7664, + "step": 4020 + }, + { + "epoch": 1.74, + "learning_rate": 0.00012747044227120127, + "loss": 0.7613, + "step": 4040 + }, + { + "epoch": 1.75, + "learning_rate": 0.0001265946577142023, + "loss": 0.759, + "step": 4060 + }, + { + "epoch": 1.76, + "learning_rate": 0.0001257188731572033, + "loss": 0.7609, + "step": 4080 + }, + { + "epoch": 1.77, + "learning_rate": 0.00012484308860020435, + "loss": 0.765, + "step": 4100 + }, + { + "epoch": 1.78, + "learning_rate": 0.00012396730404320538, + "loss": 0.7559, + "step": 4120 + }, + { + "epoch": 1.79, + "learning_rate": 0.0001230915194862064, + "loss": 0.7575, + "step": 4140 + }, + { + "epoch": 1.8, + "learning_rate": 0.0001222157349292074, + "loss": 0.7596, + "step": 4160 + }, + { + "epoch": 1.8, + "learning_rate": 0.00012133995037220842, + "loss": 0.7692, + "step": 4180 + }, + { + "epoch": 1.81, + "learning_rate": 0.00012046416581520945, + "loss": 0.7627, + "step": 4200 + }, + { + "epoch": 1.81, + "eval_loss": 0.7660259008407593, + "eval_runtime": 9.861, + "eval_samples_per_second": 202.82, + "eval_steps_per_second": 3.245, + "step": 4200 + }, + { + "epoch": 1.82, + "learning_rate": 0.00011958838125821046, + "loss": 0.7642, + "step": 4220 + }, + { + "epoch": 1.83, + "learning_rate": 0.0001187125967012115, + "loss": 0.7612, + "step": 4240 + }, + { + "epoch": 1.84, + "learning_rate": 0.0001178368121442125, + "loss": 0.7576, + "step": 4260 + }, + { + "epoch": 1.85, + "learning_rate": 0.00011696102758721354, + "loss": 0.7592, + "step": 4280 + }, + { + "epoch": 1.86, + "learning_rate": 0.00011608524303021456, + "loss": 0.7614, + "step": 4300 + }, + { + "epoch": 1.86, + "learning_rate": 0.00011520945847321558, + "loss": 0.7564, + "step": 4320 + }, + { + "epoch": 1.87, + "learning_rate": 0.0001143336739162166, + "loss": 0.7604, + "step": 4340 + }, + { + "epoch": 1.88, + "learning_rate": 0.00011345788935921761, + "loss": 0.7547, + "step": 4360 + }, + { + "epoch": 1.89, + "learning_rate": 0.00011258210480221865, + "loss": 0.7602, + "step": 4380 + }, + { + "epoch": 1.9, + "learning_rate": 0.00011170632024521966, + "loss": 0.7569, + "step": 4400 + }, + { + "epoch": 1.9, + "eval_loss": 0.764076828956604, + "eval_runtime": 9.9349, + "eval_samples_per_second": 201.31, + "eval_steps_per_second": 3.221, + "step": 4400 + }, + { + "epoch": 1.91, + "learning_rate": 0.00011083053568822069, + "loss": 0.7545, + "step": 4420 + }, + { + "epoch": 1.92, + "learning_rate": 0.0001099547511312217, + "loss": 0.7552, + "step": 4440 + }, + { + "epoch": 1.92, + "learning_rate": 0.00010907896657422273, + "loss": 0.7579, + "step": 4460 + }, + { + "epoch": 1.93, + "learning_rate": 0.00010820318201722376, + "loss": 0.7547, + "step": 4480 + }, + { + "epoch": 1.94, + "learning_rate": 0.00010732739746022478, + "loss": 0.7581, + "step": 4500 + }, + { + "epoch": 1.95, + "learning_rate": 0.0001064516129032258, + "loss": 0.7554, + "step": 4520 + }, + { + "epoch": 1.96, + "learning_rate": 0.00010557582834622682, + "loss": 0.7563, + "step": 4540 + }, + { + "epoch": 1.97, + "learning_rate": 0.00010470004378922784, + "loss": 0.7569, + "step": 4560 + }, + { + "epoch": 1.98, + "learning_rate": 0.00010382425923222885, + "loss": 0.7571, + "step": 4580 + }, + { + "epoch": 1.98, + "learning_rate": 0.00010294847467522989, + "loss": 0.7616, + "step": 4600 + }, + { + "epoch": 1.98, + "eval_loss": 0.7628415822982788, + "eval_runtime": 10.0234, + "eval_samples_per_second": 199.534, + "eval_steps_per_second": 3.193, + "step": 4600 + }, + { + "epoch": 1.99, + "learning_rate": 0.0001020726901182309, + "loss": 0.7587, + "step": 4620 + }, + { + "epoch": 2.0, + "learning_rate": 0.00010119690556123193, + "loss": 0.7536, + "step": 4640 + }, + { + "epoch": 2.01, + "learning_rate": 0.00010032112100423295, + "loss": 0.7575, + "step": 4660 + }, + { + "epoch": 2.02, + "learning_rate": 9.944533644723397e-05, + "loss": 0.7572, + "step": 4680 + }, + { + "epoch": 2.03, + "learning_rate": 9.8569551890235e-05, + "loss": 0.7578, + "step": 4700 + }, + { + "epoch": 2.04, + "learning_rate": 9.769376733323602e-05, + "loss": 0.7545, + "step": 4720 + }, + { + "epoch": 2.05, + "learning_rate": 9.681798277623704e-05, + "loss": 0.7616, + "step": 4740 + }, + { + "epoch": 2.05, + "learning_rate": 9.594219821923806e-05, + "loss": 0.7549, + "step": 4760 + }, + { + "epoch": 2.06, + "learning_rate": 9.506641366223908e-05, + "loss": 0.7571, + "step": 4780 + }, + { + "epoch": 2.07, + "learning_rate": 9.41906291052401e-05, + "loss": 0.7528, + "step": 4800 + }, + { + "epoch": 2.07, + "eval_loss": 0.7622065544128418, + "eval_runtime": 9.9086, + "eval_samples_per_second": 201.845, + "eval_steps_per_second": 3.23, + "step": 4800 + }, + { + "epoch": 2.08, + "learning_rate": 9.331484454824112e-05, + "loss": 0.7561, + "step": 4820 + }, + { + "epoch": 2.09, + "learning_rate": 9.243905999124216e-05, + "loss": 0.7528, + "step": 4840 + }, + { + "epoch": 2.1, + "learning_rate": 9.156327543424317e-05, + "loss": 0.7496, + "step": 4860 + }, + { + "epoch": 2.11, + "learning_rate": 9.06874908772442e-05, + "loss": 0.7594, + "step": 4880 + }, + { + "epoch": 2.11, + "learning_rate": 8.981170632024521e-05, + "loss": 0.7539, + "step": 4900 + }, + { + "epoch": 2.12, + "learning_rate": 8.893592176324623e-05, + "loss": 0.7437, + "step": 4920 + }, + { + "epoch": 2.13, + "learning_rate": 8.806013720624725e-05, + "loss": 0.7475, + "step": 4940 + }, + { + "epoch": 2.14, + "learning_rate": 8.718435264924827e-05, + "loss": 0.75, + "step": 4960 + }, + { + "epoch": 2.15, + "learning_rate": 8.63085680922493e-05, + "loss": 0.7552, + "step": 4980 + }, + { + "epoch": 2.16, + "learning_rate": 8.543278353525032e-05, + "loss": 0.753, + "step": 5000 + }, + { + "epoch": 2.16, + "eval_loss": 0.7612386345863342, + "eval_runtime": 9.9196, + "eval_samples_per_second": 201.621, + "eval_steps_per_second": 3.226, + "step": 5000 + }, + { + "epoch": 2.17, + "learning_rate": 8.455699897825135e-05, + "loss": 0.7546, + "step": 5020 + }, + { + "epoch": 2.17, + "learning_rate": 8.368121442125236e-05, + "loss": 0.7562, + "step": 5040 + }, + { + "epoch": 2.18, + "learning_rate": 8.28054298642534e-05, + "loss": 0.7495, + "step": 5060 + }, + { + "epoch": 2.19, + "learning_rate": 8.19296453072544e-05, + "loss": 0.751, + "step": 5080 + }, + { + "epoch": 2.2, + "learning_rate": 8.105386075025544e-05, + "loss": 0.7459, + "step": 5100 + }, + { + "epoch": 2.21, + "learning_rate": 8.017807619325645e-05, + "loss": 0.7553, + "step": 5120 + }, + { + "epoch": 2.22, + "learning_rate": 7.930229163625748e-05, + "loss": 0.7504, + "step": 5140 + }, + { + "epoch": 2.23, + "learning_rate": 7.842650707925849e-05, + "loss": 0.7565, + "step": 5160 + }, + { + "epoch": 2.24, + "learning_rate": 7.755072252225951e-05, + "loss": 0.7471, + "step": 5180 + }, + { + "epoch": 2.24, + "learning_rate": 7.667493796526055e-05, + "loss": 0.7507, + "step": 5200 + }, + { + "epoch": 2.24, + "eval_loss": 0.7600537538528442, + "eval_runtime": 9.8816, + "eval_samples_per_second": 202.397, + "eval_steps_per_second": 3.238, + "step": 5200 + }, + { + "epoch": 2.25, + "learning_rate": 7.579915340826155e-05, + "loss": 0.7537, + "step": 5220 + }, + { + "epoch": 2.26, + "learning_rate": 7.492336885126258e-05, + "loss": 0.7537, + "step": 5240 + }, + { + "epoch": 2.27, + "learning_rate": 7.40475842942636e-05, + "loss": 0.7516, + "step": 5260 + }, + { + "epoch": 2.28, + "learning_rate": 7.317179973726463e-05, + "loss": 0.7459, + "step": 5280 + }, + { + "epoch": 2.29, + "learning_rate": 7.229601518026565e-05, + "loss": 0.7509, + "step": 5300 + }, + { + "epoch": 2.3, + "learning_rate": 7.142023062326668e-05, + "loss": 0.7519, + "step": 5320 + }, + { + "epoch": 2.3, + "learning_rate": 7.05444460662677e-05, + "loss": 0.7426, + "step": 5340 + }, + { + "epoch": 2.31, + "learning_rate": 6.966866150926872e-05, + "loss": 0.7451, + "step": 5360 + }, + { + "epoch": 2.32, + "learning_rate": 6.879287695226974e-05, + "loss": 0.7493, + "step": 5380 + }, + { + "epoch": 2.33, + "learning_rate": 6.791709239527075e-05, + "loss": 0.748, + "step": 5400 + }, + { + "epoch": 2.33, + "eval_loss": 0.7588484287261963, + "eval_runtime": 9.902, + "eval_samples_per_second": 201.98, + "eval_steps_per_second": 3.232, + "step": 5400 + }, + { + "epoch": 2.34, + "learning_rate": 6.704130783827177e-05, + "loss": 0.75, + "step": 5420 + }, + { + "epoch": 2.35, + "learning_rate": 6.616552328127279e-05, + "loss": 0.7491, + "step": 5440 + }, + { + "epoch": 2.36, + "learning_rate": 6.528973872427383e-05, + "loss": 0.747, + "step": 5460 + }, + { + "epoch": 2.36, + "learning_rate": 6.441395416727485e-05, + "loss": 0.7512, + "step": 5480 + }, + { + "epoch": 2.37, + "learning_rate": 6.353816961027587e-05, + "loss": 0.7557, + "step": 5500 + }, + { + "epoch": 2.38, + "learning_rate": 6.266238505327689e-05, + "loss": 0.7529, + "step": 5520 + }, + { + "epoch": 2.39, + "learning_rate": 6.178660049627791e-05, + "loss": 0.7465, + "step": 5540 + }, + { + "epoch": 2.4, + "learning_rate": 6.091081593927893e-05, + "loss": 0.7462, + "step": 5560 + }, + { + "epoch": 2.41, + "learning_rate": 6.003503138227995e-05, + "loss": 0.7476, + "step": 5580 + }, + { + "epoch": 2.42, + "learning_rate": 5.915924682528097e-05, + "loss": 0.7478, + "step": 5600 + }, + { + "epoch": 2.42, + "eval_loss": 0.7580318450927734, + "eval_runtime": 9.8692, + "eval_samples_per_second": 202.65, + "eval_steps_per_second": 3.242, + "step": 5600 + }, + { + "epoch": 2.43, + "learning_rate": 5.8283462268281994e-05, + "loss": 0.7504, + "step": 5620 + }, + { + "epoch": 2.43, + "learning_rate": 5.740767771128302e-05, + "loss": 0.7491, + "step": 5640 + }, + { + "epoch": 2.44, + "learning_rate": 5.6531893154284043e-05, + "loss": 0.7464, + "step": 5660 + }, + { + "epoch": 2.45, + "learning_rate": 5.5656108597285065e-05, + "loss": 0.7474, + "step": 5680 + }, + { + "epoch": 2.46, + "learning_rate": 5.478032404028609e-05, + "loss": 0.7488, + "step": 5700 + }, + { + "epoch": 2.47, + "learning_rate": 5.390453948328711e-05, + "loss": 0.7484, + "step": 5720 + }, + { + "epoch": 2.48, + "learning_rate": 5.302875492628813e-05, + "loss": 0.7486, + "step": 5740 + }, + { + "epoch": 2.49, + "learning_rate": 5.215297036928915e-05, + "loss": 0.7487, + "step": 5760 + }, + { + "epoch": 2.49, + "learning_rate": 5.127718581229017e-05, + "loss": 0.747, + "step": 5780 + }, + { + "epoch": 2.5, + "learning_rate": 5.040140125529119e-05, + "loss": 0.7511, + "step": 5800 + }, + { + "epoch": 2.5, + "eval_loss": 0.7569240927696228, + "eval_runtime": 9.868, + "eval_samples_per_second": 202.676, + "eval_steps_per_second": 3.243, + "step": 5800 + }, + { + "epoch": 2.51, + "learning_rate": 4.952561669829222e-05, + "loss": 0.7475, + "step": 5820 + }, + { + "epoch": 2.52, + "learning_rate": 4.864983214129324e-05, + "loss": 0.7402, + "step": 5840 + }, + { + "epoch": 2.53, + "learning_rate": 4.777404758429426e-05, + "loss": 0.7497, + "step": 5860 + }, + { + "epoch": 2.54, + "learning_rate": 4.689826302729528e-05, + "loss": 0.7481, + "step": 5880 + }, + { + "epoch": 2.55, + "learning_rate": 4.60224784702963e-05, + "loss": 0.7488, + "step": 5900 + }, + { + "epoch": 2.55, + "learning_rate": 4.5146693913297324e-05, + "loss": 0.747, + "step": 5920 + }, + { + "epoch": 2.56, + "learning_rate": 4.4270909356298346e-05, + "loss": 0.7538, + "step": 5940 + }, + { + "epoch": 2.57, + "learning_rate": 4.339512479929937e-05, + "loss": 0.7465, + "step": 5960 + }, + { + "epoch": 2.58, + "learning_rate": 4.251934024230039e-05, + "loss": 0.7499, + "step": 5980 + }, + { + "epoch": 2.59, + "learning_rate": 4.164355568530142e-05, + "loss": 0.7477, + "step": 6000 + }, + { + "epoch": 2.59, + "eval_loss": 0.7565082907676697, + "eval_runtime": 9.8679, + "eval_samples_per_second": 202.678, + "eval_steps_per_second": 3.243, + "step": 6000 + }, + { + "epoch": 2.6, + "learning_rate": 4.076777112830244e-05, + "loss": 0.7455, + "step": 6020 + }, + { + "epoch": 2.61, + "learning_rate": 3.989198657130346e-05, + "loss": 0.749, + "step": 6040 + }, + { + "epoch": 2.61, + "learning_rate": 3.901620201430448e-05, + "loss": 0.751, + "step": 6060 + }, + { + "epoch": 2.62, + "learning_rate": 3.81404174573055e-05, + "loss": 0.7471, + "step": 6080 + }, + { + "epoch": 2.63, + "learning_rate": 3.726463290030652e-05, + "loss": 0.7473, + "step": 6100 + }, + { + "epoch": 2.64, + "learning_rate": 3.638884834330754e-05, + "loss": 0.7515, + "step": 6120 + }, + { + "epoch": 2.65, + "learning_rate": 3.551306378630857e-05, + "loss": 0.7449, + "step": 6140 + }, + { + "epoch": 2.66, + "learning_rate": 3.463727922930959e-05, + "loss": 0.7433, + "step": 6160 + }, + { + "epoch": 2.67, + "learning_rate": 3.376149467231061e-05, + "loss": 0.7504, + "step": 6180 + }, + { + "epoch": 2.68, + "learning_rate": 3.2885710115311626e-05, + "loss": 0.7508, + "step": 6200 + }, + { + "epoch": 2.68, + "eval_loss": 0.7557815909385681, + "eval_runtime": 9.9145, + "eval_samples_per_second": 201.725, + "eval_steps_per_second": 3.228, + "step": 6200 + }, + { + "epoch": 2.68, + "learning_rate": 3.200992555831265e-05, + "loss": 0.7461, + "step": 6220 + }, + { + "epoch": 2.69, + "learning_rate": 3.1134141001313676e-05, + "loss": 0.7411, + "step": 6240 + }, + { + "epoch": 2.7, + "learning_rate": 3.0258356444314698e-05, + "loss": 0.7461, + "step": 6260 + }, + { + "epoch": 2.71, + "learning_rate": 2.9382571887315716e-05, + "loss": 0.7465, + "step": 6280 + }, + { + "epoch": 2.72, + "learning_rate": 2.8506787330316738e-05, + "loss": 0.7515, + "step": 6300 + }, + { + "epoch": 2.73, + "learning_rate": 2.7631002773317763e-05, + "loss": 0.7444, + "step": 6320 + }, + { + "epoch": 2.74, + "learning_rate": 2.6755218216318784e-05, + "loss": 0.7488, + "step": 6340 + }, + { + "epoch": 2.74, + "learning_rate": 2.5879433659319806e-05, + "loss": 0.7471, + "step": 6360 + }, + { + "epoch": 2.75, + "learning_rate": 2.5003649102320828e-05, + "loss": 0.7494, + "step": 6380 + }, + { + "epoch": 2.76, + "learning_rate": 2.41716537731718e-05, + "loss": 0.7552, + "step": 6400 + }, + { + "epoch": 2.76, + "eval_loss": 0.7553005218505859, + "eval_runtime": 9.9033, + "eval_samples_per_second": 201.953, + "eval_steps_per_second": 3.231, + "step": 6400 + }, + { + "epoch": 2.77, + "learning_rate": 2.329586921617282e-05, + "loss": 0.7425, + "step": 6420 + }, + { + "epoch": 2.78, + "learning_rate": 2.242008465917384e-05, + "loss": 0.744, + "step": 6440 + }, + { + "epoch": 2.79, + "learning_rate": 2.1544300102174863e-05, + "loss": 0.7431, + "step": 6460 + }, + { + "epoch": 2.8, + "learning_rate": 2.0668515545175885e-05, + "loss": 0.7421, + "step": 6480 + }, + { + "epoch": 2.8, + "learning_rate": 1.9792730988176907e-05, + "loss": 0.7481, + "step": 6500 + }, + { + "epoch": 2.81, + "learning_rate": 1.8916946431177928e-05, + "loss": 0.7332, + "step": 6520 + }, + { + "epoch": 2.82, + "learning_rate": 1.804116187417895e-05, + "loss": 0.7462, + "step": 6540 + }, + { + "epoch": 2.83, + "learning_rate": 1.716537731717997e-05, + "loss": 0.7487, + "step": 6560 + }, + { + "epoch": 2.84, + "learning_rate": 1.6289592760180993e-05, + "loss": 0.7493, + "step": 6580 + }, + { + "epoch": 2.85, + "learning_rate": 1.5413808203182015e-05, + "loss": 0.75, + "step": 6600 + }, + { + "epoch": 2.85, + "eval_loss": 0.7546943426132202, + "eval_runtime": 9.8229, + "eval_samples_per_second": 203.606, + "eval_steps_per_second": 3.258, + "step": 6600 + }, + { + "epoch": 2.86, + "learning_rate": 1.4538023646183038e-05, + "loss": 0.7489, + "step": 6620 + }, + { + "epoch": 2.87, + "learning_rate": 1.366223908918406e-05, + "loss": 0.7544, + "step": 6640 + }, + { + "epoch": 2.87, + "learning_rate": 1.2786454532185083e-05, + "loss": 0.7459, + "step": 6660 + }, + { + "epoch": 2.88, + "learning_rate": 1.1910669975186103e-05, + "loss": 0.7427, + "step": 6680 + }, + { + "epoch": 2.89, + "learning_rate": 1.1034885418187124e-05, + "loss": 0.7482, + "step": 6700 + }, + { + "epoch": 2.9, + "learning_rate": 1.0159100861188148e-05, + "loss": 0.7454, + "step": 6720 + }, + { + "epoch": 2.91, + "learning_rate": 9.283316304189169e-06, + "loss": 0.7576, + "step": 6740 + }, + { + "epoch": 2.92, + "learning_rate": 8.40753174719019e-06, + "loss": 0.7385, + "step": 6760 + }, + { + "epoch": 2.93, + "learning_rate": 7.5317471901912114e-06, + "loss": 0.742, + "step": 6780 + }, + { + "epoch": 2.93, + "learning_rate": 6.655962633192234e-06, + "loss": 0.7499, + "step": 6800 + }, + { + "epoch": 2.93, + "eval_loss": 0.7542949318885803, + "eval_runtime": 9.9073, + "eval_samples_per_second": 201.871, + "eval_steps_per_second": 3.23, + "step": 6800 + } + ], + "max_steps": 6951, + "num_train_epochs": 3, + "total_flos": 1.767789823284111e+19, + "trial_name": null, + "trial_params": null +} diff --git a/adapters/saved-alpaca-belle7b/checkpoint-6800/training_args.bin b/adapters/saved-alpaca-belle7b/checkpoint-6800/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..37751b82699e4ce9ea9e31699dc8564113a8dd87 --- /dev/null +++ b/adapters/saved-alpaca-belle7b/checkpoint-6800/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:07526739f82df92f9a9bb721a1bac0aa5b54e880e798fc8e04003c255ebe3f76 +size 3643 diff --git a/adapters/saved-alpaca-cot13b/adapter_config.json b/adapters/saved-alpaca-cot13b/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..4ba49948a8f8232ee95452e47fcf9bd523635048 --- /dev/null +++ b/adapters/saved-alpaca-cot13b/adapter_config.json @@ -0,0 +1,18 @@ +{ + "base_model_name_or_path": "/mnt/bn/qingyi-bn-lq/llama/llama-13b-hf", + "bias": "none", + "enable_lora": null, + "fan_in_fan_out": false, + "inference_mode": true, + "lora_alpha": 16, + "lora_dropout": 0.05, + "merge_weights": false, + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/adapters/saved-alpaca-cot13b/adapter_model.bin b/adapters/saved-alpaca-cot13b/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..d6ab4c1a3900f66c4a253b717185db206e4f91ee --- /dev/null +++ b/adapters/saved-alpaca-cot13b/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3b56905b7e2f488ceab32ac0967ad8522fbf8e4a9f88cbf02e5c25dfc4e3f0e8 +size 26271757 diff --git a/adapters/saved-alpaca-cot13b/checkpoint-2400/optimizer.pt b/adapters/saved-alpaca-cot13b/checkpoint-2400/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..41413ea25002ea6fcf56be8d4e03c85a4db17ff3 --- /dev/null +++ b/adapters/saved-alpaca-cot13b/checkpoint-2400/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:054fe809ce967112d4e653ae2f21c805f6d7b0368cfd5602284c0266894cbfb4 +size 52523141 diff --git a/adapters/saved-alpaca-cot13b/checkpoint-2400/pytorch_model.bin b/adapters/saved-alpaca-cot13b/checkpoint-2400/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..43e3ed763801f2ad83340fee7f1541683ae03384 --- /dev/null +++ b/adapters/saved-alpaca-cot13b/checkpoint-2400/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:acc3df90a12e27370b63e734ab5f270b70b35767eeb146a0d88015b7236d7876 +size 26271757 diff --git a/adapters/saved-alpaca-cot13b/checkpoint-2400/rng_state_0.pth b/adapters/saved-alpaca-cot13b/checkpoint-2400/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..13bd83a6019b3c5cd351cd74af0afb6894e1c11a --- /dev/null +++ b/adapters/saved-alpaca-cot13b/checkpoint-2400/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:54fa73c6f0c833d64f575ff57aefcc62aec6e6c7ecb637e623a24f6e9c5d44e6 +size 14583 diff --git a/adapters/saved-alpaca-cot13b/checkpoint-2400/rng_state_1.pth b/adapters/saved-alpaca-cot13b/checkpoint-2400/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..e8f9e11b4a2d108d8c83603d6623451faa2f9270 --- /dev/null +++ b/adapters/saved-alpaca-cot13b/checkpoint-2400/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b3f90fc2ed461905d187fc8c24d8918edf78eb79818f6b420d2f231a4ce5a958 +size 14583 diff --git a/adapters/saved-alpaca-cot13b/checkpoint-2400/rng_state_2.pth b/adapters/saved-alpaca-cot13b/checkpoint-2400/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..d23652974f2ad80ab403eb10a029d3888ed6e1ac --- /dev/null +++ b/adapters/saved-alpaca-cot13b/checkpoint-2400/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:72094c5961a0921b3cc3ac48be4a00dc3f86c04c6a560269c66c58c63ad5586a +size 14583 diff --git a/adapters/saved-alpaca-cot13b/checkpoint-2400/rng_state_3.pth b/adapters/saved-alpaca-cot13b/checkpoint-2400/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..81d0c5398c0cae219a05fc2531ce3b9942fa84e7 --- /dev/null +++ b/adapters/saved-alpaca-cot13b/checkpoint-2400/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:69c3a0ddef4d689023f62c31ac3b28292e2d4aa4c60ca0ad4dff51e5f212be62 +size 14583 diff --git a/adapters/saved-alpaca-cot13b/checkpoint-2400/rng_state_4.pth b/adapters/saved-alpaca-cot13b/checkpoint-2400/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..13bb7adc8d2f2199a26530db3fa5dfc1d84de61c --- /dev/null +++ b/adapters/saved-alpaca-cot13b/checkpoint-2400/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ff1224a5f06cc6cb86c19ac81fef9b77b1992ca4227b6bd5f331e11ca6a69b29 +size 14583 diff --git a/adapters/saved-alpaca-cot13b/checkpoint-2400/rng_state_5.pth b/adapters/saved-alpaca-cot13b/checkpoint-2400/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..575625130f2f0ce24e4cbb3a898263e0cfc6c33f --- /dev/null +++ b/adapters/saved-alpaca-cot13b/checkpoint-2400/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:269bf94c7b4d8558d0fef2f3eed1552a168af4094e411e4a0fc9f52e68a31c91 +size 14583 diff --git a/adapters/saved-alpaca-cot13b/checkpoint-2400/rng_state_6.pth b/adapters/saved-alpaca-cot13b/checkpoint-2400/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..2d3a15e3b37fb2c9191279ca1da9a3d93dc5c733 --- /dev/null +++ b/adapters/saved-alpaca-cot13b/checkpoint-2400/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ae6521d87014f3005fe60ac302d109a475ae9723a102181279f487bbefb270d2 +size 14583 diff --git a/adapters/saved-alpaca-cot13b/checkpoint-2400/rng_state_7.pth b/adapters/saved-alpaca-cot13b/checkpoint-2400/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..341a3668a82bbc4b6b34090d606dd478142fe298 --- /dev/null +++ b/adapters/saved-alpaca-cot13b/checkpoint-2400/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c12793ea3d0a2873e7072288ace4e634e35427e265bb304eb40d9f447c682283 +size 14583 diff --git a/adapters/saved-alpaca-cot13b/checkpoint-2400/scaler.pt b/adapters/saved-alpaca-cot13b/checkpoint-2400/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..05c991fe0d31f36923f65fa09bb9d7c1bde541bd --- /dev/null +++ b/adapters/saved-alpaca-cot13b/checkpoint-2400/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e3394d96409ceafdf6f72a31a1eab4e95f434c26b3e6eb0029414a0b03634c63 +size 557 diff --git a/adapters/saved-alpaca-cot13b/checkpoint-2400/scheduler.pt b/adapters/saved-alpaca-cot13b/checkpoint-2400/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..4a1fd7c9fcf2ec4cb06ab5a2f9ac3098d334d0af --- /dev/null +++ b/adapters/saved-alpaca-cot13b/checkpoint-2400/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:62e98ec51e41fcc325644e63fe47526fa6163e15a725bfa8fb640aaa4a532796 +size 627 diff --git a/adapters/saved-alpaca-cot13b/checkpoint-2400/trainer_state.json b/adapters/saved-alpaca-cot13b/checkpoint-2400/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..cc322925e61b4ac3a548fd1aae309b6d175e143c --- /dev/null +++ b/adapters/saved-alpaca-cot13b/checkpoint-2400/trainer_state.json @@ -0,0 +1,832 @@ +{ + "best_metric": 0.7276496291160583, + "best_model_checkpoint": "/mnt/bn/qingyi-bn-lq/llama/saved-alpaca-cot13b/checkpoint-2400", + "epoch": 2.4628014366341713, + "global_step": 2400, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.02, + "learning_rate": 5.9999999999999995e-05, + "loss": 2.1559, + "step": 20 + }, + { + "epoch": 0.04, + "learning_rate": 0.00011999999999999999, + "loss": 1.736, + "step": 40 + }, + { + "epoch": 0.06, + "learning_rate": 0.00017999999999999998, + "loss": 1.2009, + "step": 60 + }, + { + "epoch": 0.08, + "learning_rate": 0.00023999999999999998, + "loss": 0.9775, + "step": 80 + }, + { + "epoch": 0.1, + "learning_rate": 0.0003, + "loss": 0.8852, + "step": 100 + }, + { + "epoch": 0.12, + "learning_rate": 0.0002978738483345145, + "loss": 0.8508, + "step": 120 + }, + { + "epoch": 0.14, + "learning_rate": 0.000295747696669029, + "loss": 0.8276, + "step": 140 + }, + { + "epoch": 0.16, + "learning_rate": 0.00029362154500354353, + "loss": 0.8102, + "step": 160 + }, + { + "epoch": 0.18, + "learning_rate": 0.0002914953933380581, + "loss": 0.7959, + "step": 180 + }, + { + "epoch": 0.21, + "learning_rate": 0.0002893692416725726, + "loss": 0.7816, + "step": 200 + }, + { + "epoch": 0.21, + "eval_loss": 0.7917931079864502, + "eval_runtime": 25.1681, + "eval_samples_per_second": 79.466, + "eval_steps_per_second": 1.271, + "step": 200 + }, + { + "epoch": 0.23, + "learning_rate": 0.00028724309000708714, + "loss": 0.7793, + "step": 220 + }, + { + "epoch": 0.25, + "learning_rate": 0.00028511693834160166, + "loss": 0.7778, + "step": 240 + }, + { + "epoch": 0.27, + "learning_rate": 0.0002829907866761162, + "loss": 0.7828, + "step": 260 + }, + { + "epoch": 0.29, + "learning_rate": 0.00028086463501063075, + "loss": 0.7716, + "step": 280 + }, + { + "epoch": 0.31, + "learning_rate": 0.00027873848334514527, + "loss": 0.7686, + "step": 300 + }, + { + "epoch": 0.33, + "learning_rate": 0.0002766123316796598, + "loss": 0.7618, + "step": 320 + }, + { + "epoch": 0.35, + "learning_rate": 0.0002744861800141743, + "loss": 0.7568, + "step": 340 + }, + { + "epoch": 0.37, + "learning_rate": 0.0002723600283486889, + "loss": 0.7679, + "step": 360 + }, + { + "epoch": 0.39, + "learning_rate": 0.0002702338766832034, + "loss": 0.7713, + "step": 380 + }, + { + "epoch": 0.41, + "learning_rate": 0.0002681077250177179, + "loss": 0.7516, + "step": 400 + }, + { + "epoch": 0.41, + "eval_loss": 0.7647445201873779, + "eval_runtime": 25.1482, + "eval_samples_per_second": 79.529, + "eval_steps_per_second": 1.272, + "step": 400 + }, + { + "epoch": 0.43, + "learning_rate": 0.00026598157335223243, + "loss": 0.7608, + "step": 420 + }, + { + "epoch": 0.45, + "learning_rate": 0.00026385542168674695, + "loss": 0.758, + "step": 440 + }, + { + "epoch": 0.47, + "learning_rate": 0.0002617292700212615, + "loss": 0.7547, + "step": 460 + }, + { + "epoch": 0.49, + "learning_rate": 0.00025960311835577604, + "loss": 0.7593, + "step": 480 + }, + { + "epoch": 0.51, + "learning_rate": 0.00025747696669029056, + "loss": 0.7537, + "step": 500 + }, + { + "epoch": 0.53, + "learning_rate": 0.0002553508150248051, + "loss": 0.7586, + "step": 520 + }, + { + "epoch": 0.55, + "learning_rate": 0.0002532246633593196, + "loss": 0.7726, + "step": 540 + }, + { + "epoch": 0.57, + "learning_rate": 0.0002510985116938341, + "loss": 0.756, + "step": 560 + }, + { + "epoch": 0.6, + "learning_rate": 0.0002489723600283487, + "loss": 0.7437, + "step": 580 + }, + { + "epoch": 0.62, + "learning_rate": 0.0002468462083628632, + "loss": 0.7498, + "step": 600 + }, + { + "epoch": 0.62, + "eval_loss": 0.7551296353340149, + "eval_runtime": 25.165, + "eval_samples_per_second": 79.475, + "eval_steps_per_second": 1.272, + "step": 600 + }, + { + "epoch": 0.64, + "learning_rate": 0.0002447200566973777, + "loss": 0.749, + "step": 620 + }, + { + "epoch": 0.66, + "learning_rate": 0.00024259390503189224, + "loss": 0.7431, + "step": 640 + }, + { + "epoch": 0.68, + "learning_rate": 0.0002404677533664068, + "loss": 0.7533, + "step": 660 + }, + { + "epoch": 0.7, + "learning_rate": 0.0002383416017009213, + "loss": 0.7486, + "step": 680 + }, + { + "epoch": 0.72, + "learning_rate": 0.00023621545003543583, + "loss": 0.737, + "step": 700 + }, + { + "epoch": 0.74, + "learning_rate": 0.0002340892983699504, + "loss": 0.7463, + "step": 720 + }, + { + "epoch": 0.76, + "learning_rate": 0.00023196314670446492, + "loss": 0.7419, + "step": 740 + }, + { + "epoch": 0.78, + "learning_rate": 0.00022983699503897943, + "loss": 0.7414, + "step": 760 + }, + { + "epoch": 0.8, + "learning_rate": 0.00022771084337349395, + "loss": 0.7502, + "step": 780 + }, + { + "epoch": 0.82, + "learning_rate": 0.00022558469170800847, + "loss": 0.7397, + "step": 800 + }, + { + "epoch": 0.82, + "eval_loss": 0.7478033900260925, + "eval_runtime": 25.1278, + "eval_samples_per_second": 79.593, + "eval_steps_per_second": 1.273, + "step": 800 + }, + { + "epoch": 0.84, + "learning_rate": 0.00022345854004252302, + "loss": 0.7456, + "step": 820 + }, + { + "epoch": 0.86, + "learning_rate": 0.00022133238837703754, + "loss": 0.7317, + "step": 840 + }, + { + "epoch": 0.88, + "learning_rate": 0.00021920623671155208, + "loss": 0.7377, + "step": 860 + }, + { + "epoch": 0.9, + "learning_rate": 0.0002170800850460666, + "loss": 0.7439, + "step": 880 + }, + { + "epoch": 0.92, + "learning_rate": 0.00021495393338058114, + "loss": 0.7394, + "step": 900 + }, + { + "epoch": 0.94, + "learning_rate": 0.00021282778171509566, + "loss": 0.7397, + "step": 920 + }, + { + "epoch": 0.96, + "learning_rate": 0.00021070163004961018, + "loss": 0.7369, + "step": 940 + }, + { + "epoch": 0.99, + "learning_rate": 0.0002085754783841247, + "loss": 0.7365, + "step": 960 + }, + { + "epoch": 1.01, + "learning_rate": 0.00020644932671863922, + "loss": 0.7443, + "step": 980 + }, + { + "epoch": 1.03, + "learning_rate": 0.0002043231750531538, + "loss": 0.7364, + "step": 1000 + }, + { + "epoch": 1.03, + "eval_loss": 0.7435225248336792, + "eval_runtime": 25.1054, + "eval_samples_per_second": 79.664, + "eval_steps_per_second": 1.275, + "step": 1000 + }, + { + "epoch": 1.05, + "learning_rate": 0.0002021970233876683, + "loss": 0.7369, + "step": 1020 + }, + { + "epoch": 1.07, + "learning_rate": 0.00020007087172218283, + "loss": 0.7318, + "step": 1040 + }, + { + "epoch": 1.09, + "learning_rate": 0.00019794472005669735, + "loss": 0.734, + "step": 1060 + }, + { + "epoch": 1.11, + "learning_rate": 0.0001958185683912119, + "loss": 0.7299, + "step": 1080 + }, + { + "epoch": 1.13, + "learning_rate": 0.0001936924167257264, + "loss": 0.7352, + "step": 1100 + }, + { + "epoch": 1.15, + "learning_rate": 0.00019156626506024093, + "loss": 0.727, + "step": 1120 + }, + { + "epoch": 1.17, + "learning_rate": 0.0001894401133947555, + "loss": 0.7262, + "step": 1140 + }, + { + "epoch": 1.19, + "learning_rate": 0.00018731396172927002, + "loss": 0.7332, + "step": 1160 + }, + { + "epoch": 1.21, + "learning_rate": 0.00018518781006378454, + "loss": 0.7315, + "step": 1180 + }, + { + "epoch": 1.23, + "learning_rate": 0.00018306165839829906, + "loss": 0.7391, + "step": 1200 + }, + { + "epoch": 1.23, + "eval_loss": 0.739651620388031, + "eval_runtime": 25.1539, + "eval_samples_per_second": 79.511, + "eval_steps_per_second": 1.272, + "step": 1200 + }, + { + "epoch": 1.25, + "learning_rate": 0.00018093550673281358, + "loss": 0.726, + "step": 1220 + }, + { + "epoch": 1.27, + "learning_rate": 0.00017880935506732812, + "loss": 0.7358, + "step": 1240 + }, + { + "epoch": 1.29, + "learning_rate": 0.00017668320340184267, + "loss": 0.7195, + "step": 1260 + }, + { + "epoch": 1.31, + "learning_rate": 0.00017455705173635719, + "loss": 0.7304, + "step": 1280 + }, + { + "epoch": 1.33, + "learning_rate": 0.0001724309000708717, + "loss": 0.7281, + "step": 1300 + }, + { + "epoch": 1.35, + "learning_rate": 0.00017030474840538625, + "loss": 0.7216, + "step": 1320 + }, + { + "epoch": 1.38, + "learning_rate": 0.00016817859673990077, + "loss": 0.7247, + "step": 1340 + }, + { + "epoch": 1.4, + "learning_rate": 0.00016605244507441529, + "loss": 0.7251, + "step": 1360 + }, + { + "epoch": 1.42, + "learning_rate": 0.0001639262934089298, + "loss": 0.7266, + "step": 1380 + }, + { + "epoch": 1.44, + "learning_rate": 0.00016180014174344438, + "loss": 0.7339, + "step": 1400 + }, + { + "epoch": 1.44, + "eval_loss": 0.737014651298523, + "eval_runtime": 25.1254, + "eval_samples_per_second": 79.601, + "eval_steps_per_second": 1.274, + "step": 1400 + }, + { + "epoch": 1.46, + "learning_rate": 0.0001596739900779589, + "loss": 0.7336, + "step": 1420 + }, + { + "epoch": 1.48, + "learning_rate": 0.00015754783841247341, + "loss": 0.7316, + "step": 1440 + }, + { + "epoch": 1.5, + "learning_rate": 0.00015542168674698793, + "loss": 0.7317, + "step": 1460 + }, + { + "epoch": 1.52, + "learning_rate": 0.00015329553508150245, + "loss": 0.7289, + "step": 1480 + }, + { + "epoch": 1.54, + "learning_rate": 0.000151169383416017, + "loss": 0.7321, + "step": 1500 + }, + { + "epoch": 1.56, + "learning_rate": 0.00014904323175053151, + "loss": 0.7213, + "step": 1520 + }, + { + "epoch": 1.58, + "learning_rate": 0.00014691708008504606, + "loss": 0.7313, + "step": 1540 + }, + { + "epoch": 1.6, + "learning_rate": 0.00014479092841956058, + "loss": 0.7232, + "step": 1560 + }, + { + "epoch": 1.62, + "learning_rate": 0.00014266477675407512, + "loss": 0.7298, + "step": 1580 + }, + { + "epoch": 1.64, + "learning_rate": 0.00014053862508858964, + "loss": 0.7203, + "step": 1600 + }, + { + "epoch": 1.64, + "eval_loss": 0.7359894514083862, + "eval_runtime": 25.1784, + "eval_samples_per_second": 79.433, + "eval_steps_per_second": 1.271, + "step": 1600 + }, + { + "epoch": 1.66, + "learning_rate": 0.00013841247342310416, + "loss": 0.7187, + "step": 1620 + }, + { + "epoch": 1.68, + "learning_rate": 0.0001362863217576187, + "loss": 0.708, + "step": 1640 + }, + { + "epoch": 1.7, + "learning_rate": 0.00013416017009213323, + "loss": 0.7252, + "step": 1660 + }, + { + "epoch": 1.72, + "learning_rate": 0.00013203401842664774, + "loss": 0.717, + "step": 1680 + }, + { + "epoch": 1.74, + "learning_rate": 0.0001299078667611623, + "loss": 0.7273, + "step": 1700 + }, + { + "epoch": 1.77, + "learning_rate": 0.0001277817150956768, + "loss": 0.7244, + "step": 1720 + }, + { + "epoch": 1.79, + "learning_rate": 0.00012565556343019135, + "loss": 0.7261, + "step": 1740 + }, + { + "epoch": 1.81, + "learning_rate": 0.00012352941176470587, + "loss": 0.7191, + "step": 1760 + }, + { + "epoch": 1.83, + "learning_rate": 0.0001214032600992204, + "loss": 0.724, + "step": 1780 + }, + { + "epoch": 1.85, + "learning_rate": 0.00011927710843373494, + "loss": 0.714, + "step": 1800 + }, + { + "epoch": 1.85, + "eval_loss": 0.7326391935348511, + "eval_runtime": 25.1862, + "eval_samples_per_second": 79.409, + "eval_steps_per_second": 1.271, + "step": 1800 + }, + { + "epoch": 1.87, + "learning_rate": 0.00011715095676824945, + "loss": 0.7196, + "step": 1820 + }, + { + "epoch": 1.89, + "learning_rate": 0.000115024805102764, + "loss": 0.7188, + "step": 1840 + }, + { + "epoch": 1.91, + "learning_rate": 0.00011289865343727852, + "loss": 0.7223, + "step": 1860 + }, + { + "epoch": 1.93, + "learning_rate": 0.00011077250177179304, + "loss": 0.7311, + "step": 1880 + }, + { + "epoch": 1.95, + "learning_rate": 0.00010864635010630757, + "loss": 0.7217, + "step": 1900 + }, + { + "epoch": 1.97, + "learning_rate": 0.00010652019844082211, + "loss": 0.7205, + "step": 1920 + }, + { + "epoch": 1.99, + "learning_rate": 0.00010439404677533663, + "loss": 0.7175, + "step": 1940 + }, + { + "epoch": 2.01, + "learning_rate": 0.00010226789510985115, + "loss": 0.7235, + "step": 1960 + }, + { + "epoch": 2.03, + "learning_rate": 0.0001001417434443657, + "loss": 0.7158, + "step": 1980 + }, + { + "epoch": 2.05, + "learning_rate": 9.801559177888021e-05, + "loss": 0.7084, + "step": 2000 + }, + { + "epoch": 2.05, + "eval_loss": 0.730781614780426, + "eval_runtime": 25.1365, + "eval_samples_per_second": 79.565, + "eval_steps_per_second": 1.273, + "step": 2000 + }, + { + "epoch": 2.07, + "learning_rate": 9.588944011339475e-05, + "loss": 0.7159, + "step": 2020 + }, + { + "epoch": 2.09, + "learning_rate": 9.376328844790927e-05, + "loss": 0.7207, + "step": 2040 + }, + { + "epoch": 2.11, + "learning_rate": 9.163713678242381e-05, + "loss": 0.7107, + "step": 2060 + }, + { + "epoch": 2.13, + "learning_rate": 8.951098511693833e-05, + "loss": 0.7151, + "step": 2080 + }, + { + "epoch": 2.15, + "learning_rate": 8.738483345145286e-05, + "loss": 0.7157, + "step": 2100 + }, + { + "epoch": 2.18, + "learning_rate": 8.525868178596739e-05, + "loss": 0.7163, + "step": 2120 + }, + { + "epoch": 2.2, + "learning_rate": 8.313253012048193e-05, + "loss": 0.7152, + "step": 2140 + }, + { + "epoch": 2.22, + "learning_rate": 8.100637845499644e-05, + "loss": 0.7159, + "step": 2160 + }, + { + "epoch": 2.24, + "learning_rate": 7.888022678951099e-05, + "loss": 0.709, + "step": 2180 + }, + { + "epoch": 2.26, + "learning_rate": 7.675407512402551e-05, + "loss": 0.7247, + "step": 2200 + }, + { + "epoch": 2.26, + "eval_loss": 0.7292709350585938, + "eval_runtime": 25.1853, + "eval_samples_per_second": 79.412, + "eval_steps_per_second": 1.271, + "step": 2200 + }, + { + "epoch": 2.28, + "learning_rate": 7.462792345854004e-05, + "loss": 0.7214, + "step": 2220 + }, + { + "epoch": 2.3, + "learning_rate": 7.250177179305457e-05, + "loss": 0.7067, + "step": 2240 + }, + { + "epoch": 2.32, + "learning_rate": 7.037562012756909e-05, + "loss": 0.7264, + "step": 2260 + }, + { + "epoch": 2.34, + "learning_rate": 6.824946846208362e-05, + "loss": 0.7144, + "step": 2280 + }, + { + "epoch": 2.36, + "learning_rate": 6.612331679659815e-05, + "loss": 0.7184, + "step": 2300 + }, + { + "epoch": 2.38, + "learning_rate": 6.399716513111269e-05, + "loss": 0.7181, + "step": 2320 + }, + { + "epoch": 2.4, + "learning_rate": 6.187101346562722e-05, + "loss": 0.7165, + "step": 2340 + }, + { + "epoch": 2.42, + "learning_rate": 5.9744861800141736e-05, + "loss": 0.7229, + "step": 2360 + }, + { + "epoch": 2.44, + "learning_rate": 5.761871013465627e-05, + "loss": 0.7176, + "step": 2380 + }, + { + "epoch": 2.46, + "learning_rate": 5.5492558469170794e-05, + "loss": 0.7049, + "step": 2400 + }, + { + "epoch": 2.46, + "eval_loss": 0.7276496291160583, + "eval_runtime": 25.206, + "eval_samples_per_second": 79.346, + "eval_steps_per_second": 1.27, + "step": 2400 + } + ], + "max_steps": 2922, + "num_train_epochs": 3, + "total_flos": 1.2133612842645455e+19, + "trial_name": null, + "trial_params": null +} diff --git a/adapters/saved-alpaca-cot13b/checkpoint-2400/training_args.bin b/adapters/saved-alpaca-cot13b/checkpoint-2400/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..318c748fceb0df0b87683d008843fc06e8b214b8 --- /dev/null +++ b/adapters/saved-alpaca-cot13b/checkpoint-2400/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:849f12b57e926b6a1ebcfdbf5b3122f6fc883616b79c739cdf46fd55da383327 +size 3643 diff --git a/adapters/saved-alpaca-cot13b/checkpoint-2600/optimizer.pt b/adapters/saved-alpaca-cot13b/checkpoint-2600/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..dbc66c078fbc02dca1dfe12e618563ad2817f149 --- /dev/null +++ b/adapters/saved-alpaca-cot13b/checkpoint-2600/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:82b66fd10d37a51de29bfd384e4bfdf2a7b6dea1ab2822e8762674781d0ea3b0 +size 52523141 diff --git a/adapters/saved-alpaca-cot13b/checkpoint-2600/pytorch_model.bin b/adapters/saved-alpaca-cot13b/checkpoint-2600/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..d22c49044a1f7886e405c7097d72fedeb10edc82 --- /dev/null +++ b/adapters/saved-alpaca-cot13b/checkpoint-2600/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b780ea34a56ed162c5ed90e83f4906e239096632d064c94ada80183e0acbb6ae +size 26271757 diff --git a/adapters/saved-alpaca-cot13b/checkpoint-2600/rng_state_0.pth b/adapters/saved-alpaca-cot13b/checkpoint-2600/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..120b123ec64bf21b444c244ea093d2d7993ed271 --- /dev/null +++ b/adapters/saved-alpaca-cot13b/checkpoint-2600/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:83aa2ef01b6c56c53a40587f1ab1b1bffa348766ef18a5fabd74e0e97dbd3c29 +size 14583 diff --git a/adapters/saved-alpaca-cot13b/checkpoint-2600/rng_state_1.pth b/adapters/saved-alpaca-cot13b/checkpoint-2600/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..6c5ff4f9148f914d6a7f6998ea92dfa3e763c272 --- /dev/null +++ b/adapters/saved-alpaca-cot13b/checkpoint-2600/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b880bc55fe95696e954795e77c3e9ad1bf89748dd940bff33812d3e4576f582c +size 14583 diff --git a/adapters/saved-alpaca-cot13b/checkpoint-2600/rng_state_2.pth b/adapters/saved-alpaca-cot13b/checkpoint-2600/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..b0f1a1528077352080c7ee17a570e3a359b34241 --- /dev/null +++ b/adapters/saved-alpaca-cot13b/checkpoint-2600/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:279c789d6560d6282cba1d898ca6570c3d5565715829c9583fe3adc751f4c1c0 +size 14583 diff --git a/adapters/saved-alpaca-cot13b/checkpoint-2600/rng_state_3.pth b/adapters/saved-alpaca-cot13b/checkpoint-2600/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..016bf450c786053b383aeee57e92cc686c737385 --- /dev/null +++ b/adapters/saved-alpaca-cot13b/checkpoint-2600/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:549fa395aebd07dde1e3a21bcd7be74f4113e576df2d35dc259775b07998e88e +size 14583 diff --git a/adapters/saved-alpaca-cot13b/checkpoint-2600/rng_state_4.pth b/adapters/saved-alpaca-cot13b/checkpoint-2600/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..046c58f02eb7a667d8246f154d648a484f441cc5 --- /dev/null +++ b/adapters/saved-alpaca-cot13b/checkpoint-2600/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e2fe931dfb60dc59bcf49e77076c7ba740c3731dec691fac4e868149b95d9777 +size 14583 diff --git a/adapters/saved-alpaca-cot13b/checkpoint-2600/rng_state_5.pth b/adapters/saved-alpaca-cot13b/checkpoint-2600/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..927ef1feeeefc0df55ce8a0a1db07f45e2f28626 --- /dev/null +++ b/adapters/saved-alpaca-cot13b/checkpoint-2600/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8fa92dc8822b917dccca56309f190254b2c7cc47d716c693349548438d0552e9 +size 14583 diff --git a/adapters/saved-alpaca-cot13b/checkpoint-2600/rng_state_6.pth b/adapters/saved-alpaca-cot13b/checkpoint-2600/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..90ba611233679f39527cf7fdc55a0daef872c7a5 --- /dev/null +++ b/adapters/saved-alpaca-cot13b/checkpoint-2600/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:85f1e5a3d9fea0400428e4a217e8966804dc93e49b55c5b2bfd05994761df851 +size 14583 diff --git a/adapters/saved-alpaca-cot13b/checkpoint-2600/rng_state_7.pth b/adapters/saved-alpaca-cot13b/checkpoint-2600/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..cd8e276988c275f392d4f51da7debfe9301a30d9 --- /dev/null +++ b/adapters/saved-alpaca-cot13b/checkpoint-2600/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:428563b5362f5e748794542f50a110bdc12821b502c0bf66d212ae9ac0a21838 +size 14583 diff --git a/adapters/saved-alpaca-cot13b/checkpoint-2600/scaler.pt b/adapters/saved-alpaca-cot13b/checkpoint-2600/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..65894f4d214482c93818b7f185d1810082ab9e0b --- /dev/null +++ b/adapters/saved-alpaca-cot13b/checkpoint-2600/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d5dc513ac70929e4303afe4c21d0bcbe3b91ca4fff6f6fae86bd776ec9758c08 +size 557 diff --git a/adapters/saved-alpaca-cot13b/checkpoint-2600/scheduler.pt b/adapters/saved-alpaca-cot13b/checkpoint-2600/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..58fac1653f7a2184e7e2542a7669d48723477f06 --- /dev/null +++ b/adapters/saved-alpaca-cot13b/checkpoint-2600/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b83666cda7a50df6110dc11db4e2def5794ca16d2bead42b96dc9f481eaba35f +size 627 diff --git a/adapters/saved-alpaca-cot13b/checkpoint-2600/trainer_state.json b/adapters/saved-alpaca-cot13b/checkpoint-2600/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..55d4f8462a9896e0a528564e0cbc4277a7d58f60 --- /dev/null +++ b/adapters/saved-alpaca-cot13b/checkpoint-2600/trainer_state.json @@ -0,0 +1,900 @@ +{ + "best_metric": 0.727011501789093, + "best_model_checkpoint": "/mnt/bn/qingyi-bn-lq/llama/saved-alpaca-cot13b/checkpoint-2600", + "epoch": 2.6680348896870187, + "global_step": 2600, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.02, + "learning_rate": 5.9999999999999995e-05, + "loss": 2.1559, + "step": 20 + }, + { + "epoch": 0.04, + "learning_rate": 0.00011999999999999999, + "loss": 1.736, + "step": 40 + }, + { + "epoch": 0.06, + "learning_rate": 0.00017999999999999998, + "loss": 1.2009, + "step": 60 + }, + { + "epoch": 0.08, + "learning_rate": 0.00023999999999999998, + "loss": 0.9775, + "step": 80 + }, + { + "epoch": 0.1, + "learning_rate": 0.0003, + "loss": 0.8852, + "step": 100 + }, + { + "epoch": 0.12, + "learning_rate": 0.0002978738483345145, + "loss": 0.8508, + "step": 120 + }, + { + "epoch": 0.14, + "learning_rate": 0.000295747696669029, + "loss": 0.8276, + "step": 140 + }, + { + "epoch": 0.16, + "learning_rate": 0.00029362154500354353, + "loss": 0.8102, + "step": 160 + }, + { + "epoch": 0.18, + "learning_rate": 0.0002914953933380581, + "loss": 0.7959, + "step": 180 + }, + { + "epoch": 0.21, + "learning_rate": 0.0002893692416725726, + "loss": 0.7816, + "step": 200 + }, + { + "epoch": 0.21, + "eval_loss": 0.7917931079864502, + "eval_runtime": 25.1681, + "eval_samples_per_second": 79.466, + "eval_steps_per_second": 1.271, + "step": 200 + }, + { + "epoch": 0.23, + "learning_rate": 0.00028724309000708714, + "loss": 0.7793, + "step": 220 + }, + { + "epoch": 0.25, + "learning_rate": 0.00028511693834160166, + "loss": 0.7778, + "step": 240 + }, + { + "epoch": 0.27, + "learning_rate": 0.0002829907866761162, + "loss": 0.7828, + "step": 260 + }, + { + "epoch": 0.29, + "learning_rate": 0.00028086463501063075, + "loss": 0.7716, + "step": 280 + }, + { + "epoch": 0.31, + "learning_rate": 0.00027873848334514527, + "loss": 0.7686, + "step": 300 + }, + { + "epoch": 0.33, + "learning_rate": 0.0002766123316796598, + "loss": 0.7618, + "step": 320 + }, + { + "epoch": 0.35, + "learning_rate": 0.0002744861800141743, + "loss": 0.7568, + "step": 340 + }, + { + "epoch": 0.37, + "learning_rate": 0.0002723600283486889, + "loss": 0.7679, + "step": 360 + }, + { + "epoch": 0.39, + "learning_rate": 0.0002702338766832034, + "loss": 0.7713, + "step": 380 + }, + { + "epoch": 0.41, + "learning_rate": 0.0002681077250177179, + "loss": 0.7516, + "step": 400 + }, + { + "epoch": 0.41, + "eval_loss": 0.7647445201873779, + "eval_runtime": 25.1482, + "eval_samples_per_second": 79.529, + "eval_steps_per_second": 1.272, + "step": 400 + }, + { + "epoch": 0.43, + "learning_rate": 0.00026598157335223243, + "loss": 0.7608, + "step": 420 + }, + { + "epoch": 0.45, + "learning_rate": 0.00026385542168674695, + "loss": 0.758, + "step": 440 + }, + { + "epoch": 0.47, + "learning_rate": 0.0002617292700212615, + "loss": 0.7547, + "step": 460 + }, + { + "epoch": 0.49, + "learning_rate": 0.00025960311835577604, + "loss": 0.7593, + "step": 480 + }, + { + "epoch": 0.51, + "learning_rate": 0.00025747696669029056, + "loss": 0.7537, + "step": 500 + }, + { + "epoch": 0.53, + "learning_rate": 0.0002553508150248051, + "loss": 0.7586, + "step": 520 + }, + { + "epoch": 0.55, + "learning_rate": 0.0002532246633593196, + "loss": 0.7726, + "step": 540 + }, + { + "epoch": 0.57, + "learning_rate": 0.0002510985116938341, + "loss": 0.756, + "step": 560 + }, + { + "epoch": 0.6, + "learning_rate": 0.0002489723600283487, + "loss": 0.7437, + "step": 580 + }, + { + "epoch": 0.62, + "learning_rate": 0.0002468462083628632, + "loss": 0.7498, + "step": 600 + }, + { + "epoch": 0.62, + "eval_loss": 0.7551296353340149, + "eval_runtime": 25.165, + "eval_samples_per_second": 79.475, + "eval_steps_per_second": 1.272, + "step": 600 + }, + { + "epoch": 0.64, + "learning_rate": 0.0002447200566973777, + "loss": 0.749, + "step": 620 + }, + { + "epoch": 0.66, + "learning_rate": 0.00024259390503189224, + "loss": 0.7431, + "step": 640 + }, + { + "epoch": 0.68, + "learning_rate": 0.0002404677533664068, + "loss": 0.7533, + "step": 660 + }, + { + "epoch": 0.7, + "learning_rate": 0.0002383416017009213, + "loss": 0.7486, + "step": 680 + }, + { + "epoch": 0.72, + "learning_rate": 0.00023621545003543583, + "loss": 0.737, + "step": 700 + }, + { + "epoch": 0.74, + "learning_rate": 0.0002340892983699504, + "loss": 0.7463, + "step": 720 + }, + { + "epoch": 0.76, + "learning_rate": 0.00023196314670446492, + "loss": 0.7419, + "step": 740 + }, + { + "epoch": 0.78, + "learning_rate": 0.00022983699503897943, + "loss": 0.7414, + "step": 760 + }, + { + "epoch": 0.8, + "learning_rate": 0.00022771084337349395, + "loss": 0.7502, + "step": 780 + }, + { + "epoch": 0.82, + "learning_rate": 0.00022558469170800847, + "loss": 0.7397, + "step": 800 + }, + { + "epoch": 0.82, + "eval_loss": 0.7478033900260925, + "eval_runtime": 25.1278, + "eval_samples_per_second": 79.593, + "eval_steps_per_second": 1.273, + "step": 800 + }, + { + "epoch": 0.84, + "learning_rate": 0.00022345854004252302, + "loss": 0.7456, + "step": 820 + }, + { + "epoch": 0.86, + "learning_rate": 0.00022133238837703754, + "loss": 0.7317, + "step": 840 + }, + { + "epoch": 0.88, + "learning_rate": 0.00021920623671155208, + "loss": 0.7377, + "step": 860 + }, + { + "epoch": 0.9, + "learning_rate": 0.0002170800850460666, + "loss": 0.7439, + "step": 880 + }, + { + "epoch": 0.92, + "learning_rate": 0.00021495393338058114, + "loss": 0.7394, + "step": 900 + }, + { + "epoch": 0.94, + "learning_rate": 0.00021282778171509566, + "loss": 0.7397, + "step": 920 + }, + { + "epoch": 0.96, + "learning_rate": 0.00021070163004961018, + "loss": 0.7369, + "step": 940 + }, + { + "epoch": 0.99, + "learning_rate": 0.0002085754783841247, + "loss": 0.7365, + "step": 960 + }, + { + "epoch": 1.01, + "learning_rate": 0.00020644932671863922, + "loss": 0.7443, + "step": 980 + }, + { + "epoch": 1.03, + "learning_rate": 0.0002043231750531538, + "loss": 0.7364, + "step": 1000 + }, + { + "epoch": 1.03, + "eval_loss": 0.7435225248336792, + "eval_runtime": 25.1054, + "eval_samples_per_second": 79.664, + "eval_steps_per_second": 1.275, + "step": 1000 + }, + { + "epoch": 1.05, + "learning_rate": 0.0002021970233876683, + "loss": 0.7369, + "step": 1020 + }, + { + "epoch": 1.07, + "learning_rate": 0.00020007087172218283, + "loss": 0.7318, + "step": 1040 + }, + { + "epoch": 1.09, + "learning_rate": 0.00019794472005669735, + "loss": 0.734, + "step": 1060 + }, + { + "epoch": 1.11, + "learning_rate": 0.0001958185683912119, + "loss": 0.7299, + "step": 1080 + }, + { + "epoch": 1.13, + "learning_rate": 0.0001936924167257264, + "loss": 0.7352, + "step": 1100 + }, + { + "epoch": 1.15, + "learning_rate": 0.00019156626506024093, + "loss": 0.727, + "step": 1120 + }, + { + "epoch": 1.17, + "learning_rate": 0.0001894401133947555, + "loss": 0.7262, + "step": 1140 + }, + { + "epoch": 1.19, + "learning_rate": 0.00018731396172927002, + "loss": 0.7332, + "step": 1160 + }, + { + "epoch": 1.21, + "learning_rate": 0.00018518781006378454, + "loss": 0.7315, + "step": 1180 + }, + { + "epoch": 1.23, + "learning_rate": 0.00018306165839829906, + "loss": 0.7391, + "step": 1200 + }, + { + "epoch": 1.23, + "eval_loss": 0.739651620388031, + "eval_runtime": 25.1539, + "eval_samples_per_second": 79.511, + "eval_steps_per_second": 1.272, + "step": 1200 + }, + { + "epoch": 1.25, + "learning_rate": 0.00018093550673281358, + "loss": 0.726, + "step": 1220 + }, + { + "epoch": 1.27, + "learning_rate": 0.00017880935506732812, + "loss": 0.7358, + "step": 1240 + }, + { + "epoch": 1.29, + "learning_rate": 0.00017668320340184267, + "loss": 0.7195, + "step": 1260 + }, + { + "epoch": 1.31, + "learning_rate": 0.00017455705173635719, + "loss": 0.7304, + "step": 1280 + }, + { + "epoch": 1.33, + "learning_rate": 0.0001724309000708717, + "loss": 0.7281, + "step": 1300 + }, + { + "epoch": 1.35, + "learning_rate": 0.00017030474840538625, + "loss": 0.7216, + "step": 1320 + }, + { + "epoch": 1.38, + "learning_rate": 0.00016817859673990077, + "loss": 0.7247, + "step": 1340 + }, + { + "epoch": 1.4, + "learning_rate": 0.00016605244507441529, + "loss": 0.7251, + "step": 1360 + }, + { + "epoch": 1.42, + "learning_rate": 0.0001639262934089298, + "loss": 0.7266, + "step": 1380 + }, + { + "epoch": 1.44, + "learning_rate": 0.00016180014174344438, + "loss": 0.7339, + "step": 1400 + }, + { + "epoch": 1.44, + "eval_loss": 0.737014651298523, + "eval_runtime": 25.1254, + "eval_samples_per_second": 79.601, + "eval_steps_per_second": 1.274, + "step": 1400 + }, + { + "epoch": 1.46, + "learning_rate": 0.0001596739900779589, + "loss": 0.7336, + "step": 1420 + }, + { + "epoch": 1.48, + "learning_rate": 0.00015754783841247341, + "loss": 0.7316, + "step": 1440 + }, + { + "epoch": 1.5, + "learning_rate": 0.00015542168674698793, + "loss": 0.7317, + "step": 1460 + }, + { + "epoch": 1.52, + "learning_rate": 0.00015329553508150245, + "loss": 0.7289, + "step": 1480 + }, + { + "epoch": 1.54, + "learning_rate": 0.000151169383416017, + "loss": 0.7321, + "step": 1500 + }, + { + "epoch": 1.56, + "learning_rate": 0.00014904323175053151, + "loss": 0.7213, + "step": 1520 + }, + { + "epoch": 1.58, + "learning_rate": 0.00014691708008504606, + "loss": 0.7313, + "step": 1540 + }, + { + "epoch": 1.6, + "learning_rate": 0.00014479092841956058, + "loss": 0.7232, + "step": 1560 + }, + { + "epoch": 1.62, + "learning_rate": 0.00014266477675407512, + "loss": 0.7298, + "step": 1580 + }, + { + "epoch": 1.64, + "learning_rate": 0.00014053862508858964, + "loss": 0.7203, + "step": 1600 + }, + { + "epoch": 1.64, + "eval_loss": 0.7359894514083862, + "eval_runtime": 25.1784, + "eval_samples_per_second": 79.433, + "eval_steps_per_second": 1.271, + "step": 1600 + }, + { + "epoch": 1.66, + "learning_rate": 0.00013841247342310416, + "loss": 0.7187, + "step": 1620 + }, + { + "epoch": 1.68, + "learning_rate": 0.0001362863217576187, + "loss": 0.708, + "step": 1640 + }, + { + "epoch": 1.7, + "learning_rate": 0.00013416017009213323, + "loss": 0.7252, + "step": 1660 + }, + { + "epoch": 1.72, + "learning_rate": 0.00013203401842664774, + "loss": 0.717, + "step": 1680 + }, + { + "epoch": 1.74, + "learning_rate": 0.0001299078667611623, + "loss": 0.7273, + "step": 1700 + }, + { + "epoch": 1.77, + "learning_rate": 0.0001277817150956768, + "loss": 0.7244, + "step": 1720 + }, + { + "epoch": 1.79, + "learning_rate": 0.00012565556343019135, + "loss": 0.7261, + "step": 1740 + }, + { + "epoch": 1.81, + "learning_rate": 0.00012352941176470587, + "loss": 0.7191, + "step": 1760 + }, + { + "epoch": 1.83, + "learning_rate": 0.0001214032600992204, + "loss": 0.724, + "step": 1780 + }, + { + "epoch": 1.85, + "learning_rate": 0.00011927710843373494, + "loss": 0.714, + "step": 1800 + }, + { + "epoch": 1.85, + "eval_loss": 0.7326391935348511, + "eval_runtime": 25.1862, + "eval_samples_per_second": 79.409, + "eval_steps_per_second": 1.271, + "step": 1800 + }, + { + "epoch": 1.87, + "learning_rate": 0.00011715095676824945, + "loss": 0.7196, + "step": 1820 + }, + { + "epoch": 1.89, + "learning_rate": 0.000115024805102764, + "loss": 0.7188, + "step": 1840 + }, + { + "epoch": 1.91, + "learning_rate": 0.00011289865343727852, + "loss": 0.7223, + "step": 1860 + }, + { + "epoch": 1.93, + "learning_rate": 0.00011077250177179304, + "loss": 0.7311, + "step": 1880 + }, + { + "epoch": 1.95, + "learning_rate": 0.00010864635010630757, + "loss": 0.7217, + "step": 1900 + }, + { + "epoch": 1.97, + "learning_rate": 0.00010652019844082211, + "loss": 0.7205, + "step": 1920 + }, + { + "epoch": 1.99, + "learning_rate": 0.00010439404677533663, + "loss": 0.7175, + "step": 1940 + }, + { + "epoch": 2.01, + "learning_rate": 0.00010226789510985115, + "loss": 0.7235, + "step": 1960 + }, + { + "epoch": 2.03, + "learning_rate": 0.0001001417434443657, + "loss": 0.7158, + "step": 1980 + }, + { + "epoch": 2.05, + "learning_rate": 9.801559177888021e-05, + "loss": 0.7084, + "step": 2000 + }, + { + "epoch": 2.05, + "eval_loss": 0.730781614780426, + "eval_runtime": 25.1365, + "eval_samples_per_second": 79.565, + "eval_steps_per_second": 1.273, + "step": 2000 + }, + { + "epoch": 2.07, + "learning_rate": 9.588944011339475e-05, + "loss": 0.7159, + "step": 2020 + }, + { + "epoch": 2.09, + "learning_rate": 9.376328844790927e-05, + "loss": 0.7207, + "step": 2040 + }, + { + "epoch": 2.11, + "learning_rate": 9.163713678242381e-05, + "loss": 0.7107, + "step": 2060 + }, + { + "epoch": 2.13, + "learning_rate": 8.951098511693833e-05, + "loss": 0.7151, + "step": 2080 + }, + { + "epoch": 2.15, + "learning_rate": 8.738483345145286e-05, + "loss": 0.7157, + "step": 2100 + }, + { + "epoch": 2.18, + "learning_rate": 8.525868178596739e-05, + "loss": 0.7163, + "step": 2120 + }, + { + "epoch": 2.2, + "learning_rate": 8.313253012048193e-05, + "loss": 0.7152, + "step": 2140 + }, + { + "epoch": 2.22, + "learning_rate": 8.100637845499644e-05, + "loss": 0.7159, + "step": 2160 + }, + { + "epoch": 2.24, + "learning_rate": 7.888022678951099e-05, + "loss": 0.709, + "step": 2180 + }, + { + "epoch": 2.26, + "learning_rate": 7.675407512402551e-05, + "loss": 0.7247, + "step": 2200 + }, + { + "epoch": 2.26, + "eval_loss": 0.7292709350585938, + "eval_runtime": 25.1853, + "eval_samples_per_second": 79.412, + "eval_steps_per_second": 1.271, + "step": 2200 + }, + { + "epoch": 2.28, + "learning_rate": 7.462792345854004e-05, + "loss": 0.7214, + "step": 2220 + }, + { + "epoch": 2.3, + "learning_rate": 7.250177179305457e-05, + "loss": 0.7067, + "step": 2240 + }, + { + "epoch": 2.32, + "learning_rate": 7.037562012756909e-05, + "loss": 0.7264, + "step": 2260 + }, + { + "epoch": 2.34, + "learning_rate": 6.824946846208362e-05, + "loss": 0.7144, + "step": 2280 + }, + { + "epoch": 2.36, + "learning_rate": 6.612331679659815e-05, + "loss": 0.7184, + "step": 2300 + }, + { + "epoch": 2.38, + "learning_rate": 6.399716513111269e-05, + "loss": 0.7181, + "step": 2320 + }, + { + "epoch": 2.4, + "learning_rate": 6.187101346562722e-05, + "loss": 0.7165, + "step": 2340 + }, + { + "epoch": 2.42, + "learning_rate": 5.9744861800141736e-05, + "loss": 0.7229, + "step": 2360 + }, + { + "epoch": 2.44, + "learning_rate": 5.761871013465627e-05, + "loss": 0.7176, + "step": 2380 + }, + { + "epoch": 2.46, + "learning_rate": 5.5492558469170794e-05, + "loss": 0.7049, + "step": 2400 + }, + { + "epoch": 2.46, + "eval_loss": 0.7276496291160583, + "eval_runtime": 25.206, + "eval_samples_per_second": 79.346, + "eval_steps_per_second": 1.27, + "step": 2400 + }, + { + "epoch": 2.48, + "learning_rate": 5.3366406803685326e-05, + "loss": 0.7163, + "step": 2420 + }, + { + "epoch": 2.5, + "learning_rate": 5.124025513819986e-05, + "loss": 0.7128, + "step": 2440 + }, + { + "epoch": 2.52, + "learning_rate": 4.911410347271438e-05, + "loss": 0.7087, + "step": 2460 + }, + { + "epoch": 2.54, + "learning_rate": 4.6987951807228915e-05, + "loss": 0.7136, + "step": 2480 + }, + { + "epoch": 2.57, + "learning_rate": 4.486180014174344e-05, + "loss": 0.713, + "step": 2500 + }, + { + "epoch": 2.59, + "learning_rate": 4.273564847625797e-05, + "loss": 0.7085, + "step": 2520 + }, + { + "epoch": 2.61, + "learning_rate": 4.0609496810772504e-05, + "loss": 0.7159, + "step": 2540 + }, + { + "epoch": 2.63, + "learning_rate": 3.848334514528702e-05, + "loss": 0.7049, + "step": 2560 + }, + { + "epoch": 2.65, + "learning_rate": 3.635719347980156e-05, + "loss": 0.7083, + "step": 2580 + }, + { + "epoch": 2.67, + "learning_rate": 3.4231041814316086e-05, + "loss": 0.6997, + "step": 2600 + }, + { + "epoch": 2.67, + "eval_loss": 0.727011501789093, + "eval_runtime": 25.2042, + "eval_samples_per_second": 79.352, + "eval_steps_per_second": 1.27, + "step": 2600 + } + ], + "max_steps": 2922, + "num_train_epochs": 3, + "total_flos": 1.3144852584567144e+19, + "trial_name": null, + "trial_params": null +} diff --git a/adapters/saved-alpaca-cot13b/checkpoint-2600/training_args.bin b/adapters/saved-alpaca-cot13b/checkpoint-2600/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..318c748fceb0df0b87683d008843fc06e8b214b8 --- /dev/null +++ b/adapters/saved-alpaca-cot13b/checkpoint-2600/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:849f12b57e926b6a1ebcfdbf5b3122f6fc883616b79c739cdf46fd55da383327 +size 3643 diff --git a/adapters/saved-alpaca-cot13b/checkpoint-2800/optimizer.pt b/adapters/saved-alpaca-cot13b/checkpoint-2800/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..d4837ddeddfc91d41e09058af8ff034afad74f5c --- /dev/null +++ b/adapters/saved-alpaca-cot13b/checkpoint-2800/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ff0744f2363caf7258815af1c26d91fb9335c02eaee50ddb33a467c3604cac2a +size 52523141 diff --git a/adapters/saved-alpaca-cot13b/checkpoint-2800/pytorch_model.bin b/adapters/saved-alpaca-cot13b/checkpoint-2800/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..b5bea887f9822a793e40206fff66f369de3fd8e6 --- /dev/null +++ b/adapters/saved-alpaca-cot13b/checkpoint-2800/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d3c94b9822f21e14f55c369d5fbd37d76cc369d139875303643c3a8037894e53 +size 26271757 diff --git a/adapters/saved-alpaca-cot13b/checkpoint-2800/rng_state_0.pth b/adapters/saved-alpaca-cot13b/checkpoint-2800/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..e93880bd2a153adf25c28a8585a1f893207438ac --- /dev/null +++ b/adapters/saved-alpaca-cot13b/checkpoint-2800/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b65eb535a6f1a23878d9f6568e944961e26bc3885413fbf18978b71882d1dac +size 14583 diff --git a/adapters/saved-alpaca-cot13b/checkpoint-2800/rng_state_1.pth b/adapters/saved-alpaca-cot13b/checkpoint-2800/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..ee927f2089d470d9c54035e89e0914a122b0882e --- /dev/null +++ b/adapters/saved-alpaca-cot13b/checkpoint-2800/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dfc74a1e9932d4a1e5538058b4aca1b121c0df1fac944246204fc43595d4e2b5 +size 14583 diff --git a/adapters/saved-alpaca-cot13b/checkpoint-2800/rng_state_2.pth b/adapters/saved-alpaca-cot13b/checkpoint-2800/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..a039006f1156d86096e8867b5b3814b59b0147f2 --- /dev/null +++ b/adapters/saved-alpaca-cot13b/checkpoint-2800/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a9c03c614dbd0e287ebaa0a5df8cc88cc119253472294a5e2cf12905e6ecfe00 +size 14583 diff --git a/adapters/saved-alpaca-cot13b/checkpoint-2800/rng_state_3.pth b/adapters/saved-alpaca-cot13b/checkpoint-2800/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..dc73626caa9ba7f4dd0cc9ecef5dec626cc2485f --- /dev/null +++ b/adapters/saved-alpaca-cot13b/checkpoint-2800/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc20ecbfec304b3830a12a26c20da722ee40cb47ced67d007152071a913483d3 +size 14583 diff --git a/adapters/saved-alpaca-cot13b/checkpoint-2800/rng_state_4.pth b/adapters/saved-alpaca-cot13b/checkpoint-2800/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..207fa63056674ae09d3b3c3925437ab483457a09 --- /dev/null +++ b/adapters/saved-alpaca-cot13b/checkpoint-2800/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c33e16b34eb92994ecb7cf5c36425b24f35e253c6f8cb5ac618157af855fa5db +size 14583 diff --git a/adapters/saved-alpaca-cot13b/checkpoint-2800/rng_state_5.pth b/adapters/saved-alpaca-cot13b/checkpoint-2800/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..089c7b16fc90afbbf5841f4fdadce0c7aad177b2 --- /dev/null +++ b/adapters/saved-alpaca-cot13b/checkpoint-2800/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c44ee76ac2daa18c520861dfce60c2adc2c0c5317bdfec0c979325153248b834 +size 14583 diff --git a/adapters/saved-alpaca-cot13b/checkpoint-2800/rng_state_6.pth b/adapters/saved-alpaca-cot13b/checkpoint-2800/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..75e6c7f432f59f5b385b3cc763112cd74e3a2e28 --- /dev/null +++ b/adapters/saved-alpaca-cot13b/checkpoint-2800/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:19e2dc7fc7a68d8abc8216173aa019bf97cc4e40b09625fb974de74084b974dc +size 14583 diff --git a/adapters/saved-alpaca-cot13b/checkpoint-2800/rng_state_7.pth b/adapters/saved-alpaca-cot13b/checkpoint-2800/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..f2342b2eb13e4fb93277a576dbefd1c5819efd07 --- /dev/null +++ b/adapters/saved-alpaca-cot13b/checkpoint-2800/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f353b44c39cbcea0f95d2136a1baaf6258deeb862eb2da9688e57ad7256fe057 +size 14583 diff --git a/adapters/saved-alpaca-cot13b/checkpoint-2800/scaler.pt b/adapters/saved-alpaca-cot13b/checkpoint-2800/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..f7dd356e9b99ca534f1a188cab8067b6d7ce3b8f --- /dev/null +++ b/adapters/saved-alpaca-cot13b/checkpoint-2800/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dd6a3bb8c8d63cf885c60242a57958c538d7aa14ca7f199dd8ce9059bb1b68f8 +size 557 diff --git a/adapters/saved-alpaca-cot13b/checkpoint-2800/scheduler.pt b/adapters/saved-alpaca-cot13b/checkpoint-2800/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..79cfac3fb5ec6892d654c38e0bedc5a0d79cedca --- /dev/null +++ b/adapters/saved-alpaca-cot13b/checkpoint-2800/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a4095e4f3f07e2c29dc5d82b7d5c217691e90eda7037db035604e10afb78fb2 +size 627 diff --git a/adapters/saved-alpaca-cot13b/checkpoint-2800/trainer_state.json b/adapters/saved-alpaca-cot13b/checkpoint-2800/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..580be2006f25a85f60aced33d2fdc520b6fc7699 --- /dev/null +++ b/adapters/saved-alpaca-cot13b/checkpoint-2800/trainer_state.json @@ -0,0 +1,968 @@ +{ + "best_metric": 0.7261727452278137, + "best_model_checkpoint": "/mnt/bn/qingyi-bn-lq/llama/saved-alpaca-cot13b/checkpoint-2800", + "epoch": 2.8732683427398666, + "global_step": 2800, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.02, + "learning_rate": 5.9999999999999995e-05, + "loss": 2.1559, + "step": 20 + }, + { + "epoch": 0.04, + "learning_rate": 0.00011999999999999999, + "loss": 1.736, + "step": 40 + }, + { + "epoch": 0.06, + "learning_rate": 0.00017999999999999998, + "loss": 1.2009, + "step": 60 + }, + { + "epoch": 0.08, + "learning_rate": 0.00023999999999999998, + "loss": 0.9775, + "step": 80 + }, + { + "epoch": 0.1, + "learning_rate": 0.0003, + "loss": 0.8852, + "step": 100 + }, + { + "epoch": 0.12, + "learning_rate": 0.0002978738483345145, + "loss": 0.8508, + "step": 120 + }, + { + "epoch": 0.14, + "learning_rate": 0.000295747696669029, + "loss": 0.8276, + "step": 140 + }, + { + "epoch": 0.16, + "learning_rate": 0.00029362154500354353, + "loss": 0.8102, + "step": 160 + }, + { + "epoch": 0.18, + "learning_rate": 0.0002914953933380581, + "loss": 0.7959, + "step": 180 + }, + { + "epoch": 0.21, + "learning_rate": 0.0002893692416725726, + "loss": 0.7816, + "step": 200 + }, + { + "epoch": 0.21, + "eval_loss": 0.7917931079864502, + "eval_runtime": 25.1681, + "eval_samples_per_second": 79.466, + "eval_steps_per_second": 1.271, + "step": 200 + }, + { + "epoch": 0.23, + "learning_rate": 0.00028724309000708714, + "loss": 0.7793, + "step": 220 + }, + { + "epoch": 0.25, + "learning_rate": 0.00028511693834160166, + "loss": 0.7778, + "step": 240 + }, + { + "epoch": 0.27, + "learning_rate": 0.0002829907866761162, + "loss": 0.7828, + "step": 260 + }, + { + "epoch": 0.29, + "learning_rate": 0.00028086463501063075, + "loss": 0.7716, + "step": 280 + }, + { + "epoch": 0.31, + "learning_rate": 0.00027873848334514527, + "loss": 0.7686, + "step": 300 + }, + { + "epoch": 0.33, + "learning_rate": 0.0002766123316796598, + "loss": 0.7618, + "step": 320 + }, + { + "epoch": 0.35, + "learning_rate": 0.0002744861800141743, + "loss": 0.7568, + "step": 340 + }, + { + "epoch": 0.37, + "learning_rate": 0.0002723600283486889, + "loss": 0.7679, + "step": 360 + }, + { + "epoch": 0.39, + "learning_rate": 0.0002702338766832034, + "loss": 0.7713, + "step": 380 + }, + { + "epoch": 0.41, + "learning_rate": 0.0002681077250177179, + "loss": 0.7516, + "step": 400 + }, + { + "epoch": 0.41, + "eval_loss": 0.7647445201873779, + "eval_runtime": 25.1482, + "eval_samples_per_second": 79.529, + "eval_steps_per_second": 1.272, + "step": 400 + }, + { + "epoch": 0.43, + "learning_rate": 0.00026598157335223243, + "loss": 0.7608, + "step": 420 + }, + { + "epoch": 0.45, + "learning_rate": 0.00026385542168674695, + "loss": 0.758, + "step": 440 + }, + { + "epoch": 0.47, + "learning_rate": 0.0002617292700212615, + "loss": 0.7547, + "step": 460 + }, + { + "epoch": 0.49, + "learning_rate": 0.00025960311835577604, + "loss": 0.7593, + "step": 480 + }, + { + "epoch": 0.51, + "learning_rate": 0.00025747696669029056, + "loss": 0.7537, + "step": 500 + }, + { + "epoch": 0.53, + "learning_rate": 0.0002553508150248051, + "loss": 0.7586, + "step": 520 + }, + { + "epoch": 0.55, + "learning_rate": 0.0002532246633593196, + "loss": 0.7726, + "step": 540 + }, + { + "epoch": 0.57, + "learning_rate": 0.0002510985116938341, + "loss": 0.756, + "step": 560 + }, + { + "epoch": 0.6, + "learning_rate": 0.0002489723600283487, + "loss": 0.7437, + "step": 580 + }, + { + "epoch": 0.62, + "learning_rate": 0.0002468462083628632, + "loss": 0.7498, + "step": 600 + }, + { + "epoch": 0.62, + "eval_loss": 0.7551296353340149, + "eval_runtime": 25.165, + "eval_samples_per_second": 79.475, + "eval_steps_per_second": 1.272, + "step": 600 + }, + { + "epoch": 0.64, + "learning_rate": 0.0002447200566973777, + "loss": 0.749, + "step": 620 + }, + { + "epoch": 0.66, + "learning_rate": 0.00024259390503189224, + "loss": 0.7431, + "step": 640 + }, + { + "epoch": 0.68, + "learning_rate": 0.0002404677533664068, + "loss": 0.7533, + "step": 660 + }, + { + "epoch": 0.7, + "learning_rate": 0.0002383416017009213, + "loss": 0.7486, + "step": 680 + }, + { + "epoch": 0.72, + "learning_rate": 0.00023621545003543583, + "loss": 0.737, + "step": 700 + }, + { + "epoch": 0.74, + "learning_rate": 0.0002340892983699504, + "loss": 0.7463, + "step": 720 + }, + { + "epoch": 0.76, + "learning_rate": 0.00023196314670446492, + "loss": 0.7419, + "step": 740 + }, + { + "epoch": 0.78, + "learning_rate": 0.00022983699503897943, + "loss": 0.7414, + "step": 760 + }, + { + "epoch": 0.8, + "learning_rate": 0.00022771084337349395, + "loss": 0.7502, + "step": 780 + }, + { + "epoch": 0.82, + "learning_rate": 0.00022558469170800847, + "loss": 0.7397, + "step": 800 + }, + { + "epoch": 0.82, + "eval_loss": 0.7478033900260925, + "eval_runtime": 25.1278, + "eval_samples_per_second": 79.593, + "eval_steps_per_second": 1.273, + "step": 800 + }, + { + "epoch": 0.84, + "learning_rate": 0.00022345854004252302, + "loss": 0.7456, + "step": 820 + }, + { + "epoch": 0.86, + "learning_rate": 0.00022133238837703754, + "loss": 0.7317, + "step": 840 + }, + { + "epoch": 0.88, + "learning_rate": 0.00021920623671155208, + "loss": 0.7377, + "step": 860 + }, + { + "epoch": 0.9, + "learning_rate": 0.0002170800850460666, + "loss": 0.7439, + "step": 880 + }, + { + "epoch": 0.92, + "learning_rate": 0.00021495393338058114, + "loss": 0.7394, + "step": 900 + }, + { + "epoch": 0.94, + "learning_rate": 0.00021282778171509566, + "loss": 0.7397, + "step": 920 + }, + { + "epoch": 0.96, + "learning_rate": 0.00021070163004961018, + "loss": 0.7369, + "step": 940 + }, + { + "epoch": 0.99, + "learning_rate": 0.0002085754783841247, + "loss": 0.7365, + "step": 960 + }, + { + "epoch": 1.01, + "learning_rate": 0.00020644932671863922, + "loss": 0.7443, + "step": 980 + }, + { + "epoch": 1.03, + "learning_rate": 0.0002043231750531538, + "loss": 0.7364, + "step": 1000 + }, + { + "epoch": 1.03, + "eval_loss": 0.7435225248336792, + "eval_runtime": 25.1054, + "eval_samples_per_second": 79.664, + "eval_steps_per_second": 1.275, + "step": 1000 + }, + { + "epoch": 1.05, + "learning_rate": 0.0002021970233876683, + "loss": 0.7369, + "step": 1020 + }, + { + "epoch": 1.07, + "learning_rate": 0.00020007087172218283, + "loss": 0.7318, + "step": 1040 + }, + { + "epoch": 1.09, + "learning_rate": 0.00019794472005669735, + "loss": 0.734, + "step": 1060 + }, + { + "epoch": 1.11, + "learning_rate": 0.0001958185683912119, + "loss": 0.7299, + "step": 1080 + }, + { + "epoch": 1.13, + "learning_rate": 0.0001936924167257264, + "loss": 0.7352, + "step": 1100 + }, + { + "epoch": 1.15, + "learning_rate": 0.00019156626506024093, + "loss": 0.727, + "step": 1120 + }, + { + "epoch": 1.17, + "learning_rate": 0.0001894401133947555, + "loss": 0.7262, + "step": 1140 + }, + { + "epoch": 1.19, + "learning_rate": 0.00018731396172927002, + "loss": 0.7332, + "step": 1160 + }, + { + "epoch": 1.21, + "learning_rate": 0.00018518781006378454, + "loss": 0.7315, + "step": 1180 + }, + { + "epoch": 1.23, + "learning_rate": 0.00018306165839829906, + "loss": 0.7391, + "step": 1200 + }, + { + "epoch": 1.23, + "eval_loss": 0.739651620388031, + "eval_runtime": 25.1539, + "eval_samples_per_second": 79.511, + "eval_steps_per_second": 1.272, + "step": 1200 + }, + { + "epoch": 1.25, + "learning_rate": 0.00018093550673281358, + "loss": 0.726, + "step": 1220 + }, + { + "epoch": 1.27, + "learning_rate": 0.00017880935506732812, + "loss": 0.7358, + "step": 1240 + }, + { + "epoch": 1.29, + "learning_rate": 0.00017668320340184267, + "loss": 0.7195, + "step": 1260 + }, + { + "epoch": 1.31, + "learning_rate": 0.00017455705173635719, + "loss": 0.7304, + "step": 1280 + }, + { + "epoch": 1.33, + "learning_rate": 0.0001724309000708717, + "loss": 0.7281, + "step": 1300 + }, + { + "epoch": 1.35, + "learning_rate": 0.00017030474840538625, + "loss": 0.7216, + "step": 1320 + }, + { + "epoch": 1.38, + "learning_rate": 0.00016817859673990077, + "loss": 0.7247, + "step": 1340 + }, + { + "epoch": 1.4, + "learning_rate": 0.00016605244507441529, + "loss": 0.7251, + "step": 1360 + }, + { + "epoch": 1.42, + "learning_rate": 0.0001639262934089298, + "loss": 0.7266, + "step": 1380 + }, + { + "epoch": 1.44, + "learning_rate": 0.00016180014174344438, + "loss": 0.7339, + "step": 1400 + }, + { + "epoch": 1.44, + "eval_loss": 0.737014651298523, + "eval_runtime": 25.1254, + "eval_samples_per_second": 79.601, + "eval_steps_per_second": 1.274, + "step": 1400 + }, + { + "epoch": 1.46, + "learning_rate": 0.0001596739900779589, + "loss": 0.7336, + "step": 1420 + }, + { + "epoch": 1.48, + "learning_rate": 0.00015754783841247341, + "loss": 0.7316, + "step": 1440 + }, + { + "epoch": 1.5, + "learning_rate": 0.00015542168674698793, + "loss": 0.7317, + "step": 1460 + }, + { + "epoch": 1.52, + "learning_rate": 0.00015329553508150245, + "loss": 0.7289, + "step": 1480 + }, + { + "epoch": 1.54, + "learning_rate": 0.000151169383416017, + "loss": 0.7321, + "step": 1500 + }, + { + "epoch": 1.56, + "learning_rate": 0.00014904323175053151, + "loss": 0.7213, + "step": 1520 + }, + { + "epoch": 1.58, + "learning_rate": 0.00014691708008504606, + "loss": 0.7313, + "step": 1540 + }, + { + "epoch": 1.6, + "learning_rate": 0.00014479092841956058, + "loss": 0.7232, + "step": 1560 + }, + { + "epoch": 1.62, + "learning_rate": 0.00014266477675407512, + "loss": 0.7298, + "step": 1580 + }, + { + "epoch": 1.64, + "learning_rate": 0.00014053862508858964, + "loss": 0.7203, + "step": 1600 + }, + { + "epoch": 1.64, + "eval_loss": 0.7359894514083862, + "eval_runtime": 25.1784, + "eval_samples_per_second": 79.433, + "eval_steps_per_second": 1.271, + "step": 1600 + }, + { + "epoch": 1.66, + "learning_rate": 0.00013841247342310416, + "loss": 0.7187, + "step": 1620 + }, + { + "epoch": 1.68, + "learning_rate": 0.0001362863217576187, + "loss": 0.708, + "step": 1640 + }, + { + "epoch": 1.7, + "learning_rate": 0.00013416017009213323, + "loss": 0.7252, + "step": 1660 + }, + { + "epoch": 1.72, + "learning_rate": 0.00013203401842664774, + "loss": 0.717, + "step": 1680 + }, + { + "epoch": 1.74, + "learning_rate": 0.0001299078667611623, + "loss": 0.7273, + "step": 1700 + }, + { + "epoch": 1.77, + "learning_rate": 0.0001277817150956768, + "loss": 0.7244, + "step": 1720 + }, + { + "epoch": 1.79, + "learning_rate": 0.00012565556343019135, + "loss": 0.7261, + "step": 1740 + }, + { + "epoch": 1.81, + "learning_rate": 0.00012352941176470587, + "loss": 0.7191, + "step": 1760 + }, + { + "epoch": 1.83, + "learning_rate": 0.0001214032600992204, + "loss": 0.724, + "step": 1780 + }, + { + "epoch": 1.85, + "learning_rate": 0.00011927710843373494, + "loss": 0.714, + "step": 1800 + }, + { + "epoch": 1.85, + "eval_loss": 0.7326391935348511, + "eval_runtime": 25.1862, + "eval_samples_per_second": 79.409, + "eval_steps_per_second": 1.271, + "step": 1800 + }, + { + "epoch": 1.87, + "learning_rate": 0.00011715095676824945, + "loss": 0.7196, + "step": 1820 + }, + { + "epoch": 1.89, + "learning_rate": 0.000115024805102764, + "loss": 0.7188, + "step": 1840 + }, + { + "epoch": 1.91, + "learning_rate": 0.00011289865343727852, + "loss": 0.7223, + "step": 1860 + }, + { + "epoch": 1.93, + "learning_rate": 0.00011077250177179304, + "loss": 0.7311, + "step": 1880 + }, + { + "epoch": 1.95, + "learning_rate": 0.00010864635010630757, + "loss": 0.7217, + "step": 1900 + }, + { + "epoch": 1.97, + "learning_rate": 0.00010652019844082211, + "loss": 0.7205, + "step": 1920 + }, + { + "epoch": 1.99, + "learning_rate": 0.00010439404677533663, + "loss": 0.7175, + "step": 1940 + }, + { + "epoch": 2.01, + "learning_rate": 0.00010226789510985115, + "loss": 0.7235, + "step": 1960 + }, + { + "epoch": 2.03, + "learning_rate": 0.0001001417434443657, + "loss": 0.7158, + "step": 1980 + }, + { + "epoch": 2.05, + "learning_rate": 9.801559177888021e-05, + "loss": 0.7084, + "step": 2000 + }, + { + "epoch": 2.05, + "eval_loss": 0.730781614780426, + "eval_runtime": 25.1365, + "eval_samples_per_second": 79.565, + "eval_steps_per_second": 1.273, + "step": 2000 + }, + { + "epoch": 2.07, + "learning_rate": 9.588944011339475e-05, + "loss": 0.7159, + "step": 2020 + }, + { + "epoch": 2.09, + "learning_rate": 9.376328844790927e-05, + "loss": 0.7207, + "step": 2040 + }, + { + "epoch": 2.11, + "learning_rate": 9.163713678242381e-05, + "loss": 0.7107, + "step": 2060 + }, + { + "epoch": 2.13, + "learning_rate": 8.951098511693833e-05, + "loss": 0.7151, + "step": 2080 + }, + { + "epoch": 2.15, + "learning_rate": 8.738483345145286e-05, + "loss": 0.7157, + "step": 2100 + }, + { + "epoch": 2.18, + "learning_rate": 8.525868178596739e-05, + "loss": 0.7163, + "step": 2120 + }, + { + "epoch": 2.2, + "learning_rate": 8.313253012048193e-05, + "loss": 0.7152, + "step": 2140 + }, + { + "epoch": 2.22, + "learning_rate": 8.100637845499644e-05, + "loss": 0.7159, + "step": 2160 + }, + { + "epoch": 2.24, + "learning_rate": 7.888022678951099e-05, + "loss": 0.709, + "step": 2180 + }, + { + "epoch": 2.26, + "learning_rate": 7.675407512402551e-05, + "loss": 0.7247, + "step": 2200 + }, + { + "epoch": 2.26, + "eval_loss": 0.7292709350585938, + "eval_runtime": 25.1853, + "eval_samples_per_second": 79.412, + "eval_steps_per_second": 1.271, + "step": 2200 + }, + { + "epoch": 2.28, + "learning_rate": 7.462792345854004e-05, + "loss": 0.7214, + "step": 2220 + }, + { + "epoch": 2.3, + "learning_rate": 7.250177179305457e-05, + "loss": 0.7067, + "step": 2240 + }, + { + "epoch": 2.32, + "learning_rate": 7.037562012756909e-05, + "loss": 0.7264, + "step": 2260 + }, + { + "epoch": 2.34, + "learning_rate": 6.824946846208362e-05, + "loss": 0.7144, + "step": 2280 + }, + { + "epoch": 2.36, + "learning_rate": 6.612331679659815e-05, + "loss": 0.7184, + "step": 2300 + }, + { + "epoch": 2.38, + "learning_rate": 6.399716513111269e-05, + "loss": 0.7181, + "step": 2320 + }, + { + "epoch": 2.4, + "learning_rate": 6.187101346562722e-05, + "loss": 0.7165, + "step": 2340 + }, + { + "epoch": 2.42, + "learning_rate": 5.9744861800141736e-05, + "loss": 0.7229, + "step": 2360 + }, + { + "epoch": 2.44, + "learning_rate": 5.761871013465627e-05, + "loss": 0.7176, + "step": 2380 + }, + { + "epoch": 2.46, + "learning_rate": 5.5492558469170794e-05, + "loss": 0.7049, + "step": 2400 + }, + { + "epoch": 2.46, + "eval_loss": 0.7276496291160583, + "eval_runtime": 25.206, + "eval_samples_per_second": 79.346, + "eval_steps_per_second": 1.27, + "step": 2400 + }, + { + "epoch": 2.48, + "learning_rate": 5.3366406803685326e-05, + "loss": 0.7163, + "step": 2420 + }, + { + "epoch": 2.5, + "learning_rate": 5.124025513819986e-05, + "loss": 0.7128, + "step": 2440 + }, + { + "epoch": 2.52, + "learning_rate": 4.911410347271438e-05, + "loss": 0.7087, + "step": 2460 + }, + { + "epoch": 2.54, + "learning_rate": 4.6987951807228915e-05, + "loss": 0.7136, + "step": 2480 + }, + { + "epoch": 2.57, + "learning_rate": 4.486180014174344e-05, + "loss": 0.713, + "step": 2500 + }, + { + "epoch": 2.59, + "learning_rate": 4.273564847625797e-05, + "loss": 0.7085, + "step": 2520 + }, + { + "epoch": 2.61, + "learning_rate": 4.0609496810772504e-05, + "loss": 0.7159, + "step": 2540 + }, + { + "epoch": 2.63, + "learning_rate": 3.848334514528702e-05, + "loss": 0.7049, + "step": 2560 + }, + { + "epoch": 2.65, + "learning_rate": 3.635719347980156e-05, + "loss": 0.7083, + "step": 2580 + }, + { + "epoch": 2.67, + "learning_rate": 3.4231041814316086e-05, + "loss": 0.6997, + "step": 2600 + }, + { + "epoch": 2.67, + "eval_loss": 0.727011501789093, + "eval_runtime": 25.2042, + "eval_samples_per_second": 79.352, + "eval_steps_per_second": 1.27, + "step": 2600 + }, + { + "epoch": 2.69, + "learning_rate": 3.210489014883061e-05, + "loss": 0.7161, + "step": 2620 + }, + { + "epoch": 2.71, + "learning_rate": 2.9978738483345144e-05, + "loss": 0.7127, + "step": 2640 + }, + { + "epoch": 2.73, + "learning_rate": 2.7852586817859672e-05, + "loss": 0.713, + "step": 2660 + }, + { + "epoch": 2.75, + "learning_rate": 2.5726435152374197e-05, + "loss": 0.7111, + "step": 2680 + }, + { + "epoch": 2.77, + "learning_rate": 2.360028348688873e-05, + "loss": 0.7183, + "step": 2700 + }, + { + "epoch": 2.79, + "learning_rate": 2.1474131821403258e-05, + "loss": 0.7122, + "step": 2720 + }, + { + "epoch": 2.81, + "learning_rate": 1.9347980155917787e-05, + "loss": 0.7124, + "step": 2740 + }, + { + "epoch": 2.83, + "learning_rate": 1.7221828490432315e-05, + "loss": 0.7073, + "step": 2760 + }, + { + "epoch": 2.85, + "learning_rate": 1.5095676824946846e-05, + "loss": 0.7076, + "step": 2780 + }, + { + "epoch": 2.87, + "learning_rate": 1.2969525159461374e-05, + "loss": 0.7047, + "step": 2800 + }, + { + "epoch": 2.87, + "eval_loss": 0.7261727452278137, + "eval_runtime": 25.1883, + "eval_samples_per_second": 79.402, + "eval_steps_per_second": 1.27, + "step": 2800 + } + ], + "max_steps": 2922, + "num_train_epochs": 3, + "total_flos": 1.4156092326488834e+19, + "trial_name": null, + "trial_params": null +} diff --git a/adapters/saved-alpaca-cot13b/checkpoint-2800/training_args.bin b/adapters/saved-alpaca-cot13b/checkpoint-2800/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..318c748fceb0df0b87683d008843fc06e8b214b8 --- /dev/null +++ b/adapters/saved-alpaca-cot13b/checkpoint-2800/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:849f12b57e926b6a1ebcfdbf5b3122f6fc883616b79c739cdf46fd55da383327 +size 3643 diff --git a/adapters/saved-alpaca-cot7b/adapter_config.json b/adapters/saved-alpaca-cot7b/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..6e56f2ae8f10fadfeec6c730ac6b119025824443 --- /dev/null +++ b/adapters/saved-alpaca-cot7b/adapter_config.json @@ -0,0 +1,18 @@ +{ + "base_model_name_or_path": "/mnt/bn/qingyi-bn-lq/llama/llama-7b-hf", + "bias": "none", + "enable_lora": null, + "fan_in_fan_out": false, + "inference_mode": true, + "lora_alpha": 16, + "lora_dropout": 0.05, + "merge_weights": false, + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/adapters/saved-alpaca-cot7b/adapter_model.bin b/adapters/saved-alpaca-cot7b/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..0e2374256eab6e847d799693e7158f5ddf390732 --- /dev/null +++ b/adapters/saved-alpaca-cot7b/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:32cd8a94d0d1d703a4af78201953a545b2b3ba363403a0d74709f355fc1a2262 +size 16822989 diff --git a/adapters/saved-alpaca-cot7b/checkpoint-2400/optimizer.pt b/adapters/saved-alpaca-cot7b/checkpoint-2400/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..6a1a31f0cfb6753629abb3dedc2b16b11191ea3a --- /dev/null +++ b/adapters/saved-alpaca-cot7b/checkpoint-2400/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9b6813bdeefd50dd6faddf43ebe7ed860f80165e5e5094dc1ca89f795096d140 +size 33629893 diff --git a/adapters/saved-alpaca-cot7b/checkpoint-2400/pytorch_model.bin b/adapters/saved-alpaca-cot7b/checkpoint-2400/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..c51455ebb5de391d30e0824af009350da64c8cd1 --- /dev/null +++ b/adapters/saved-alpaca-cot7b/checkpoint-2400/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cd452b9aa1488cdd8f1fec667d09514a99f35cce8a4c5e153517ac0c4b49655b +size 16822989 diff --git a/adapters/saved-alpaca-cot7b/checkpoint-2400/rng_state_0.pth b/adapters/saved-alpaca-cot7b/checkpoint-2400/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..411eaad99bd57cdfaa5624c9dce44d249ce6f261 --- /dev/null +++ b/adapters/saved-alpaca-cot7b/checkpoint-2400/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dbf1f42ae986f964d4e7546b4e6882197b23eb8216c9a8de889c888473fd7244 +size 14583 diff --git a/adapters/saved-alpaca-cot7b/checkpoint-2400/rng_state_1.pth b/adapters/saved-alpaca-cot7b/checkpoint-2400/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..ac91f8cea6ea1054fb3f55d00797e611e5717793 --- /dev/null +++ b/adapters/saved-alpaca-cot7b/checkpoint-2400/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3bac94831f1983b182ea32957cf3c9716847335c3b25f4b2d010cad03476ae3b +size 14583 diff --git a/adapters/saved-alpaca-cot7b/checkpoint-2400/rng_state_2.pth b/adapters/saved-alpaca-cot7b/checkpoint-2400/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..54c855057c7494a8f37571a34741ecc106ee7e3e --- /dev/null +++ b/adapters/saved-alpaca-cot7b/checkpoint-2400/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b93c06e4687bc9022dcd44bc198e4e307d65739c113822e27aa00564bc8736c7 +size 14583 diff --git a/adapters/saved-alpaca-cot7b/checkpoint-2400/rng_state_3.pth b/adapters/saved-alpaca-cot7b/checkpoint-2400/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..d6f580f76df14579ac327af6960b6079614e9868 --- /dev/null +++ b/adapters/saved-alpaca-cot7b/checkpoint-2400/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:48a932b49cf7d546dcbed78e479786c84cbda72525e2ed1e4c2f7347aa86a3f3 +size 14583 diff --git a/adapters/saved-alpaca-cot7b/checkpoint-2400/rng_state_4.pth b/adapters/saved-alpaca-cot7b/checkpoint-2400/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..c999add7bcc23a5e1475d5203b6e5e1559790118 --- /dev/null +++ b/adapters/saved-alpaca-cot7b/checkpoint-2400/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ec8e8361285bf684c7ccaad4f88821deaec7add4e1d46a9e7026e3ccb468f9fd +size 14583 diff --git a/adapters/saved-alpaca-cot7b/checkpoint-2400/rng_state_5.pth b/adapters/saved-alpaca-cot7b/checkpoint-2400/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..0ce9be9a16d3bae9e7f8aa4ab376b70b10675400 --- /dev/null +++ b/adapters/saved-alpaca-cot7b/checkpoint-2400/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c615bf74fb27eedd11991597e5281d75c39ac4d6088111127090fe3692004da2 +size 14583 diff --git a/adapters/saved-alpaca-cot7b/checkpoint-2400/rng_state_6.pth b/adapters/saved-alpaca-cot7b/checkpoint-2400/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..08def15ce6c76f554c30fc166a98e83d4be3ef83 --- /dev/null +++ b/adapters/saved-alpaca-cot7b/checkpoint-2400/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7e8db7f84be4b71af527b2ec9778f08d8cc87d6fef1691bfce35adea419236c1 +size 14583 diff --git a/adapters/saved-alpaca-cot7b/checkpoint-2400/rng_state_7.pth b/adapters/saved-alpaca-cot7b/checkpoint-2400/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..a88ee77b5356103c2de048cffe87d574ebb1ca41 --- /dev/null +++ b/adapters/saved-alpaca-cot7b/checkpoint-2400/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:91d927c785d536fe174581ad17fdcd996463db43c38afe1b171389d700fd6405 +size 14583 diff --git a/adapters/saved-alpaca-cot7b/checkpoint-2400/scaler.pt b/adapters/saved-alpaca-cot7b/checkpoint-2400/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..05c991fe0d31f36923f65fa09bb9d7c1bde541bd --- /dev/null +++ b/adapters/saved-alpaca-cot7b/checkpoint-2400/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e3394d96409ceafdf6f72a31a1eab4e95f434c26b3e6eb0029414a0b03634c63 +size 557 diff --git a/adapters/saved-alpaca-cot7b/checkpoint-2400/scheduler.pt b/adapters/saved-alpaca-cot7b/checkpoint-2400/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..4a1fd7c9fcf2ec4cb06ab5a2f9ac3098d334d0af --- /dev/null +++ b/adapters/saved-alpaca-cot7b/checkpoint-2400/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:62e98ec51e41fcc325644e63fe47526fa6163e15a725bfa8fb640aaa4a532796 +size 627 diff --git a/adapters/saved-alpaca-cot7b/checkpoint-2400/trainer_state.json b/adapters/saved-alpaca-cot7b/checkpoint-2400/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..c86af303063b810186044b85af0a44668bdc5a80 --- /dev/null +++ b/adapters/saved-alpaca-cot7b/checkpoint-2400/trainer_state.json @@ -0,0 +1,832 @@ +{ + "best_metric": 0.7678119540214539, + "best_model_checkpoint": "/mnt/bn/qingyi-bn-lq/llama/saved-alpaca-cot7b/checkpoint-2400", + "epoch": 2.4628014366341713, + "global_step": 2400, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.02, + "learning_rate": 5.9999999999999995e-05, + "loss": 2.2274, + "step": 20 + }, + { + "epoch": 0.04, + "learning_rate": 0.00011999999999999999, + "loss": 1.8266, + "step": 40 + }, + { + "epoch": 0.06, + "learning_rate": 0.00017999999999999998, + "loss": 1.2659, + "step": 60 + }, + { + "epoch": 0.08, + "learning_rate": 0.00023999999999999998, + "loss": 1.0441, + "step": 80 + }, + { + "epoch": 0.1, + "learning_rate": 0.0003, + "loss": 0.9422, + "step": 100 + }, + { + "epoch": 0.12, + "learning_rate": 0.0002978738483345145, + "loss": 0.8938, + "step": 120 + }, + { + "epoch": 0.14, + "learning_rate": 0.000295747696669029, + "loss": 0.8698, + "step": 140 + }, + { + "epoch": 0.16, + "learning_rate": 0.00029362154500354353, + "loss": 0.8406, + "step": 160 + }, + { + "epoch": 0.18, + "learning_rate": 0.0002914953933380581, + "loss": 0.835, + "step": 180 + }, + { + "epoch": 0.21, + "learning_rate": 0.0002893692416725726, + "loss": 0.8199, + "step": 200 + }, + { + "epoch": 0.21, + "eval_loss": 0.8377243876457214, + "eval_runtime": 16.2911, + "eval_samples_per_second": 122.767, + "eval_steps_per_second": 1.964, + "step": 200 + }, + { + "epoch": 0.23, + "learning_rate": 0.00028724309000708714, + "loss": 0.8119, + "step": 220 + }, + { + "epoch": 0.25, + "learning_rate": 0.00028511693834160166, + "loss": 0.808, + "step": 240 + }, + { + "epoch": 0.27, + "learning_rate": 0.0002829907866761162, + "loss": 0.8067, + "step": 260 + }, + { + "epoch": 0.29, + "learning_rate": 0.00028086463501063075, + "loss": 0.8058, + "step": 280 + }, + { + "epoch": 0.31, + "learning_rate": 0.00027873848334514527, + "loss": 0.7993, + "step": 300 + }, + { + "epoch": 0.33, + "learning_rate": 0.0002766123316796598, + "loss": 0.802, + "step": 320 + }, + { + "epoch": 0.35, + "learning_rate": 0.0002744861800141743, + "loss": 0.7932, + "step": 340 + }, + { + "epoch": 0.37, + "learning_rate": 0.0002723600283486889, + "loss": 0.7873, + "step": 360 + }, + { + "epoch": 0.39, + "learning_rate": 0.0002702338766832034, + "loss": 0.7864, + "step": 380 + }, + { + "epoch": 0.41, + "learning_rate": 0.0002681077250177179, + "loss": 0.7925, + "step": 400 + }, + { + "epoch": 0.41, + "eval_loss": 0.8077966570854187, + "eval_runtime": 16.2968, + "eval_samples_per_second": 122.724, + "eval_steps_per_second": 1.964, + "step": 400 + }, + { + "epoch": 0.43, + "learning_rate": 0.00026598157335223243, + "loss": 0.7862, + "step": 420 + }, + { + "epoch": 0.45, + "learning_rate": 0.00026385542168674695, + "loss": 0.7825, + "step": 440 + }, + { + "epoch": 0.47, + "learning_rate": 0.0002617292700212615, + "loss": 0.7914, + "step": 460 + }, + { + "epoch": 0.49, + "learning_rate": 0.00025960311835577604, + "loss": 0.7951, + "step": 480 + }, + { + "epoch": 0.51, + "learning_rate": 0.00025747696669029056, + "loss": 0.7824, + "step": 500 + }, + { + "epoch": 0.53, + "learning_rate": 0.0002553508150248051, + "loss": 0.7808, + "step": 520 + }, + { + "epoch": 0.55, + "learning_rate": 0.0002532246633593196, + "loss": 0.7811, + "step": 540 + }, + { + "epoch": 0.57, + "learning_rate": 0.0002510985116938341, + "loss": 0.7826, + "step": 560 + }, + { + "epoch": 0.6, + "learning_rate": 0.0002489723600283487, + "loss": 0.7758, + "step": 580 + }, + { + "epoch": 0.62, + "learning_rate": 0.0002468462083628632, + "loss": 0.7827, + "step": 600 + }, + { + "epoch": 0.62, + "eval_loss": 0.796688973903656, + "eval_runtime": 16.4193, + "eval_samples_per_second": 121.808, + "eval_steps_per_second": 1.949, + "step": 600 + }, + { + "epoch": 0.64, + "learning_rate": 0.0002447200566973777, + "loss": 0.7732, + "step": 620 + }, + { + "epoch": 0.66, + "learning_rate": 0.00024259390503189224, + "loss": 0.7784, + "step": 640 + }, + { + "epoch": 0.68, + "learning_rate": 0.0002404677533664068, + "loss": 0.78, + "step": 660 + }, + { + "epoch": 0.7, + "learning_rate": 0.0002383416017009213, + "loss": 0.7789, + "step": 680 + }, + { + "epoch": 0.72, + "learning_rate": 0.00023621545003543583, + "loss": 0.7762, + "step": 700 + }, + { + "epoch": 0.74, + "learning_rate": 0.0002340892983699504, + "loss": 0.7802, + "step": 720 + }, + { + "epoch": 0.76, + "learning_rate": 0.00023196314670446492, + "loss": 0.7749, + "step": 740 + }, + { + "epoch": 0.78, + "learning_rate": 0.00022983699503897943, + "loss": 0.7643, + "step": 760 + }, + { + "epoch": 0.8, + "learning_rate": 0.00022771084337349395, + "loss": 0.7648, + "step": 780 + }, + { + "epoch": 0.82, + "learning_rate": 0.00022558469170800847, + "loss": 0.767, + "step": 800 + }, + { + "epoch": 0.82, + "eval_loss": 0.7896291613578796, + "eval_runtime": 16.2986, + "eval_samples_per_second": 122.71, + "eval_steps_per_second": 1.963, + "step": 800 + }, + { + "epoch": 0.84, + "learning_rate": 0.00022345854004252302, + "loss": 0.7707, + "step": 820 + }, + { + "epoch": 0.86, + "learning_rate": 0.00022133238837703754, + "loss": 0.76, + "step": 840 + }, + { + "epoch": 0.88, + "learning_rate": 0.00021920623671155208, + "loss": 0.7644, + "step": 860 + }, + { + "epoch": 0.9, + "learning_rate": 0.0002170800850460666, + "loss": 0.7741, + "step": 880 + }, + { + "epoch": 0.92, + "learning_rate": 0.00021495393338058114, + "loss": 0.7678, + "step": 900 + }, + { + "epoch": 0.94, + "learning_rate": 0.00021282778171509566, + "loss": 0.76, + "step": 920 + }, + { + "epoch": 0.96, + "learning_rate": 0.00021070163004961018, + "loss": 0.779, + "step": 940 + }, + { + "epoch": 0.99, + "learning_rate": 0.0002085754783841247, + "loss": 0.7605, + "step": 960 + }, + { + "epoch": 1.01, + "learning_rate": 0.00020644932671863922, + "loss": 0.7571, + "step": 980 + }, + { + "epoch": 1.03, + "learning_rate": 0.0002043231750531538, + "loss": 0.7595, + "step": 1000 + }, + { + "epoch": 1.03, + "eval_loss": 0.7847340106964111, + "eval_runtime": 16.2989, + "eval_samples_per_second": 122.707, + "eval_steps_per_second": 1.963, + "step": 1000 + }, + { + "epoch": 1.05, + "learning_rate": 0.0002021970233876683, + "loss": 0.7569, + "step": 1020 + }, + { + "epoch": 1.07, + "learning_rate": 0.00020007087172218283, + "loss": 0.7523, + "step": 1040 + }, + { + "epoch": 1.09, + "learning_rate": 0.00019794472005669735, + "loss": 0.7621, + "step": 1060 + }, + { + "epoch": 1.11, + "learning_rate": 0.0001958185683912119, + "loss": 0.7556, + "step": 1080 + }, + { + "epoch": 1.13, + "learning_rate": 0.0001936924167257264, + "loss": 0.7594, + "step": 1100 + }, + { + "epoch": 1.15, + "learning_rate": 0.00019156626506024093, + "loss": 0.7612, + "step": 1120 + }, + { + "epoch": 1.17, + "learning_rate": 0.0001894401133947555, + "loss": 0.7587, + "step": 1140 + }, + { + "epoch": 1.19, + "learning_rate": 0.00018731396172927002, + "loss": 0.7533, + "step": 1160 + }, + { + "epoch": 1.21, + "learning_rate": 0.00018518781006378454, + "loss": 0.7573, + "step": 1180 + }, + { + "epoch": 1.23, + "learning_rate": 0.00018306165839829906, + "loss": 0.761, + "step": 1200 + }, + { + "epoch": 1.23, + "eval_loss": 0.7821407318115234, + "eval_runtime": 16.2951, + "eval_samples_per_second": 122.736, + "eval_steps_per_second": 1.964, + "step": 1200 + }, + { + "epoch": 1.25, + "learning_rate": 0.00018093550673281358, + "loss": 0.7548, + "step": 1220 + }, + { + "epoch": 1.27, + "learning_rate": 0.00017880935506732812, + "loss": 0.7556, + "step": 1240 + }, + { + "epoch": 1.29, + "learning_rate": 0.00017668320340184267, + "loss": 0.7605, + "step": 1260 + }, + { + "epoch": 1.31, + "learning_rate": 0.00017455705173635719, + "loss": 0.7528, + "step": 1280 + }, + { + "epoch": 1.33, + "learning_rate": 0.0001724309000708717, + "loss": 0.7584, + "step": 1300 + }, + { + "epoch": 1.35, + "learning_rate": 0.00017030474840538625, + "loss": 0.7596, + "step": 1320 + }, + { + "epoch": 1.38, + "learning_rate": 0.00016817859673990077, + "loss": 0.7514, + "step": 1340 + }, + { + "epoch": 1.4, + "learning_rate": 0.00016605244507441529, + "loss": 0.746, + "step": 1360 + }, + { + "epoch": 1.42, + "learning_rate": 0.0001639262934089298, + "loss": 0.7581, + "step": 1380 + }, + { + "epoch": 1.44, + "learning_rate": 0.00016180014174344438, + "loss": 0.7619, + "step": 1400 + }, + { + "epoch": 1.44, + "eval_loss": 0.777686357498169, + "eval_runtime": 16.3364, + "eval_samples_per_second": 122.426, + "eval_steps_per_second": 1.959, + "step": 1400 + }, + { + "epoch": 1.46, + "learning_rate": 0.0001596739900779589, + "loss": 0.7421, + "step": 1420 + }, + { + "epoch": 1.48, + "learning_rate": 0.00015754783841247341, + "loss": 0.745, + "step": 1440 + }, + { + "epoch": 1.5, + "learning_rate": 0.00015542168674698793, + "loss": 0.7631, + "step": 1460 + }, + { + "epoch": 1.52, + "learning_rate": 0.00015329553508150245, + "loss": 0.7532, + "step": 1480 + }, + { + "epoch": 1.54, + "learning_rate": 0.000151169383416017, + "loss": 0.7491, + "step": 1500 + }, + { + "epoch": 1.56, + "learning_rate": 0.00014904323175053151, + "loss": 0.743, + "step": 1520 + }, + { + "epoch": 1.58, + "learning_rate": 0.00014691708008504606, + "loss": 0.7575, + "step": 1540 + }, + { + "epoch": 1.6, + "learning_rate": 0.00014479092841956058, + "loss": 0.7519, + "step": 1560 + }, + { + "epoch": 1.62, + "learning_rate": 0.00014266477675407512, + "loss": 0.7504, + "step": 1580 + }, + { + "epoch": 1.64, + "learning_rate": 0.00014053862508858964, + "loss": 0.748, + "step": 1600 + }, + { + "epoch": 1.64, + "eval_loss": 0.7752982378005981, + "eval_runtime": 16.3209, + "eval_samples_per_second": 122.542, + "eval_steps_per_second": 1.961, + "step": 1600 + }, + { + "epoch": 1.66, + "learning_rate": 0.00013841247342310416, + "loss": 0.7536, + "step": 1620 + }, + { + "epoch": 1.68, + "learning_rate": 0.0001362863217576187, + "loss": 0.7455, + "step": 1640 + }, + { + "epoch": 1.7, + "learning_rate": 0.00013416017009213323, + "loss": 0.7509, + "step": 1660 + }, + { + "epoch": 1.72, + "learning_rate": 0.00013203401842664774, + "loss": 0.7529, + "step": 1680 + }, + { + "epoch": 1.74, + "learning_rate": 0.0001299078667611623, + "loss": 0.7505, + "step": 1700 + }, + { + "epoch": 1.77, + "learning_rate": 0.0001277817150956768, + "loss": 0.7384, + "step": 1720 + }, + { + "epoch": 1.79, + "learning_rate": 0.00012565556343019135, + "loss": 0.7396, + "step": 1740 + }, + { + "epoch": 1.81, + "learning_rate": 0.00012352941176470587, + "loss": 0.7552, + "step": 1760 + }, + { + "epoch": 1.83, + "learning_rate": 0.0001214032600992204, + "loss": 0.7459, + "step": 1780 + }, + { + "epoch": 1.85, + "learning_rate": 0.00011927710843373494, + "loss": 0.7494, + "step": 1800 + }, + { + "epoch": 1.85, + "eval_loss": 0.7731354832649231, + "eval_runtime": 16.3191, + "eval_samples_per_second": 122.556, + "eval_steps_per_second": 1.961, + "step": 1800 + }, + { + "epoch": 1.87, + "learning_rate": 0.00011715095676824945, + "loss": 0.7498, + "step": 1820 + }, + { + "epoch": 1.89, + "learning_rate": 0.000115024805102764, + "loss": 0.7544, + "step": 1840 + }, + { + "epoch": 1.91, + "learning_rate": 0.00011289865343727852, + "loss": 0.751, + "step": 1860 + }, + { + "epoch": 1.93, + "learning_rate": 0.00011077250177179304, + "loss": 0.745, + "step": 1880 + }, + { + "epoch": 1.95, + "learning_rate": 0.00010864635010630757, + "loss": 0.742, + "step": 1900 + }, + { + "epoch": 1.97, + "learning_rate": 0.00010652019844082211, + "loss": 0.745, + "step": 1920 + }, + { + "epoch": 1.99, + "learning_rate": 0.00010439404677533663, + "loss": 0.7457, + "step": 1940 + }, + { + "epoch": 2.01, + "learning_rate": 0.00010226789510985115, + "loss": 0.7475, + "step": 1960 + }, + { + "epoch": 2.03, + "learning_rate": 0.0001001417434443657, + "loss": 0.742, + "step": 1980 + }, + { + "epoch": 2.05, + "learning_rate": 9.801559177888021e-05, + "loss": 0.7365, + "step": 2000 + }, + { + "epoch": 2.05, + "eval_loss": 0.7715820074081421, + "eval_runtime": 16.3224, + "eval_samples_per_second": 122.531, + "eval_steps_per_second": 1.96, + "step": 2000 + }, + { + "epoch": 2.07, + "learning_rate": 9.588944011339475e-05, + "loss": 0.7431, + "step": 2020 + }, + { + "epoch": 2.09, + "learning_rate": 9.376328844790927e-05, + "loss": 0.7515, + "step": 2040 + }, + { + "epoch": 2.11, + "learning_rate": 9.163713678242381e-05, + "loss": 0.7483, + "step": 2060 + }, + { + "epoch": 2.13, + "learning_rate": 8.951098511693833e-05, + "loss": 0.7355, + "step": 2080 + }, + { + "epoch": 2.15, + "learning_rate": 8.738483345145286e-05, + "loss": 0.7404, + "step": 2100 + }, + { + "epoch": 2.18, + "learning_rate": 8.525868178596739e-05, + "loss": 0.7403, + "step": 2120 + }, + { + "epoch": 2.2, + "learning_rate": 8.313253012048193e-05, + "loss": 0.7435, + "step": 2140 + }, + { + "epoch": 2.22, + "learning_rate": 8.100637845499644e-05, + "loss": 0.7503, + "step": 2160 + }, + { + "epoch": 2.24, + "learning_rate": 7.888022678951099e-05, + "loss": 0.7335, + "step": 2180 + }, + { + "epoch": 2.26, + "learning_rate": 7.675407512402551e-05, + "loss": 0.7481, + "step": 2200 + }, + { + "epoch": 2.26, + "eval_loss": 0.769282341003418, + "eval_runtime": 16.3527, + "eval_samples_per_second": 122.304, + "eval_steps_per_second": 1.957, + "step": 2200 + }, + { + "epoch": 2.28, + "learning_rate": 7.462792345854004e-05, + "loss": 0.7396, + "step": 2220 + }, + { + "epoch": 2.3, + "learning_rate": 7.250177179305457e-05, + "loss": 0.736, + "step": 2240 + }, + { + "epoch": 2.32, + "learning_rate": 7.037562012756909e-05, + "loss": 0.7443, + "step": 2260 + }, + { + "epoch": 2.34, + "learning_rate": 6.824946846208362e-05, + "loss": 0.7501, + "step": 2280 + }, + { + "epoch": 2.36, + "learning_rate": 6.612331679659815e-05, + "loss": 0.7335, + "step": 2300 + }, + { + "epoch": 2.38, + "learning_rate": 6.399716513111269e-05, + "loss": 0.7507, + "step": 2320 + }, + { + "epoch": 2.4, + "learning_rate": 6.187101346562722e-05, + "loss": 0.7449, + "step": 2340 + }, + { + "epoch": 2.42, + "learning_rate": 5.9744861800141736e-05, + "loss": 0.7397, + "step": 2360 + }, + { + "epoch": 2.44, + "learning_rate": 5.761871013465627e-05, + "loss": 0.743, + "step": 2380 + }, + { + "epoch": 2.46, + "learning_rate": 5.5492558469170794e-05, + "loss": 0.7416, + "step": 2400 + }, + { + "epoch": 2.46, + "eval_loss": 0.7678119540214539, + "eval_runtime": 16.3147, + "eval_samples_per_second": 122.589, + "eval_steps_per_second": 1.961, + "step": 2400 + } + ], + "max_steps": 2922, + "num_train_epochs": 3, + "total_flos": 6.238780296288797e+18, + "trial_name": null, + "trial_params": null +} diff --git a/adapters/saved-alpaca-cot7b/checkpoint-2400/training_args.bin b/adapters/saved-alpaca-cot7b/checkpoint-2400/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..5717c8a116166edb6610e6f7e767e232221a8890 --- /dev/null +++ b/adapters/saved-alpaca-cot7b/checkpoint-2400/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fb1b135b6618e07a419e53ff8d6d816f32941463416673f997f2bf9bbe4c42db +size 3643 diff --git a/adapters/saved-alpaca-cot7b/checkpoint-2600/optimizer.pt b/adapters/saved-alpaca-cot7b/checkpoint-2600/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..3d4b44e800ad850bda6c53083a83eb3aa263335c --- /dev/null +++ b/adapters/saved-alpaca-cot7b/checkpoint-2600/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bdf41e9e9f7f894a5e4d91a62a8aea5064002a6e9fbf0eb712bae32399e2c5ec +size 33629893 diff --git a/adapters/saved-alpaca-cot7b/checkpoint-2600/pytorch_model.bin b/adapters/saved-alpaca-cot7b/checkpoint-2600/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..3f6c150d49343788b04450038a7e0d4b5352db09 --- /dev/null +++ b/adapters/saved-alpaca-cot7b/checkpoint-2600/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e8e6766c00776ff0ffc8ff07abe5f259e6ba412184caf00a65140d132ea45784 +size 16822989 diff --git a/adapters/saved-alpaca-cot7b/checkpoint-2600/rng_state_0.pth b/adapters/saved-alpaca-cot7b/checkpoint-2600/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..323a251c8f5fd11a7aa11c9ef8b604c0e56af0f5 --- /dev/null +++ b/adapters/saved-alpaca-cot7b/checkpoint-2600/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:913a69aac792c620de44a2c6c128348b232d37a8db347b18f26f697a65090da5 +size 14583 diff --git a/adapters/saved-alpaca-cot7b/checkpoint-2600/rng_state_1.pth b/adapters/saved-alpaca-cot7b/checkpoint-2600/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..1237c072e43e89712549a02d5fb796b04d6d83e4 --- /dev/null +++ b/adapters/saved-alpaca-cot7b/checkpoint-2600/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:03e9959ef06cba5c0c73e6cc9455434e7b93b36a7b82f3c301c25be0b1a9e0fe +size 14583 diff --git a/adapters/saved-alpaca-cot7b/checkpoint-2600/rng_state_2.pth b/adapters/saved-alpaca-cot7b/checkpoint-2600/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..17258ede962ec9923b85daf271a297ce8638ace0 --- /dev/null +++ b/adapters/saved-alpaca-cot7b/checkpoint-2600/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:82e7600c81a71caf0f46373122d90a77baeacedc136c575d9c119287d09f2651 +size 14583 diff --git a/adapters/saved-alpaca-cot7b/checkpoint-2600/rng_state_3.pth b/adapters/saved-alpaca-cot7b/checkpoint-2600/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..f9dff002e2a6b5594ead62bad0bfa8c713e46f6c --- /dev/null +++ b/adapters/saved-alpaca-cot7b/checkpoint-2600/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5bada781465f880829c79d94706c1900a1f25cff930a0da6bec361acd40418ce +size 14583 diff --git a/adapters/saved-alpaca-cot7b/checkpoint-2600/rng_state_4.pth b/adapters/saved-alpaca-cot7b/checkpoint-2600/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..7a971d88c5ec7765f3fc61c196b294a1c5404f46 --- /dev/null +++ b/adapters/saved-alpaca-cot7b/checkpoint-2600/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:60c6fad04ade965b50f0fb28094eda2c3889410bb38eccb98f642880fa726f6c +size 14583 diff --git a/adapters/saved-alpaca-cot7b/checkpoint-2600/rng_state_5.pth b/adapters/saved-alpaca-cot7b/checkpoint-2600/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..d298130c8e0e901b6b12e92cfea8feec68b44f2a --- /dev/null +++ b/adapters/saved-alpaca-cot7b/checkpoint-2600/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:32043ff52d46b15be97b59317b5a350f4d8ca08e951944ef269a5d2c92763f8c +size 14583 diff --git a/adapters/saved-alpaca-cot7b/checkpoint-2600/rng_state_6.pth b/adapters/saved-alpaca-cot7b/checkpoint-2600/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..2d15b70742803268e2253c21433d76f7c56646ea --- /dev/null +++ b/adapters/saved-alpaca-cot7b/checkpoint-2600/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4c182e167509141841047f916e176eba1f6e6901351b9bebae1ece0edfd72c22 +size 14583 diff --git a/adapters/saved-alpaca-cot7b/checkpoint-2600/rng_state_7.pth b/adapters/saved-alpaca-cot7b/checkpoint-2600/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..f4e96c851ba4b87eae84f819f8d918f016ea4bf9 --- /dev/null +++ b/adapters/saved-alpaca-cot7b/checkpoint-2600/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f9f801ad53c54400c4f4f755dfa90e21e61cae7ccce0fd3dee9b75cd9b7b9518 +size 14583 diff --git a/adapters/saved-alpaca-cot7b/checkpoint-2600/scaler.pt b/adapters/saved-alpaca-cot7b/checkpoint-2600/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..65894f4d214482c93818b7f185d1810082ab9e0b --- /dev/null +++ b/adapters/saved-alpaca-cot7b/checkpoint-2600/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d5dc513ac70929e4303afe4c21d0bcbe3b91ca4fff6f6fae86bd776ec9758c08 +size 557 diff --git a/adapters/saved-alpaca-cot7b/checkpoint-2600/scheduler.pt b/adapters/saved-alpaca-cot7b/checkpoint-2600/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..58fac1653f7a2184e7e2542a7669d48723477f06 --- /dev/null +++ b/adapters/saved-alpaca-cot7b/checkpoint-2600/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b83666cda7a50df6110dc11db4e2def5794ca16d2bead42b96dc9f481eaba35f +size 627 diff --git a/adapters/saved-alpaca-cot7b/checkpoint-2600/trainer_state.json b/adapters/saved-alpaca-cot7b/checkpoint-2600/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..e86264d057c406fa66c4e56dc02cce6bb11d8dec --- /dev/null +++ b/adapters/saved-alpaca-cot7b/checkpoint-2600/trainer_state.json @@ -0,0 +1,900 @@ +{ + "best_metric": 0.7669394612312317, + "best_model_checkpoint": "/mnt/bn/qingyi-bn-lq/llama/saved-alpaca-cot7b/checkpoint-2600", + "epoch": 2.6680348896870187, + "global_step": 2600, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.02, + "learning_rate": 5.9999999999999995e-05, + "loss": 2.2274, + "step": 20 + }, + { + "epoch": 0.04, + "learning_rate": 0.00011999999999999999, + "loss": 1.8266, + "step": 40 + }, + { + "epoch": 0.06, + "learning_rate": 0.00017999999999999998, + "loss": 1.2659, + "step": 60 + }, + { + "epoch": 0.08, + "learning_rate": 0.00023999999999999998, + "loss": 1.0441, + "step": 80 + }, + { + "epoch": 0.1, + "learning_rate": 0.0003, + "loss": 0.9422, + "step": 100 + }, + { + "epoch": 0.12, + "learning_rate": 0.0002978738483345145, + "loss": 0.8938, + "step": 120 + }, + { + "epoch": 0.14, + "learning_rate": 0.000295747696669029, + "loss": 0.8698, + "step": 140 + }, + { + "epoch": 0.16, + "learning_rate": 0.00029362154500354353, + "loss": 0.8406, + "step": 160 + }, + { + "epoch": 0.18, + "learning_rate": 0.0002914953933380581, + "loss": 0.835, + "step": 180 + }, + { + "epoch": 0.21, + "learning_rate": 0.0002893692416725726, + "loss": 0.8199, + "step": 200 + }, + { + "epoch": 0.21, + "eval_loss": 0.8377243876457214, + "eval_runtime": 16.2911, + "eval_samples_per_second": 122.767, + "eval_steps_per_second": 1.964, + "step": 200 + }, + { + "epoch": 0.23, + "learning_rate": 0.00028724309000708714, + "loss": 0.8119, + "step": 220 + }, + { + "epoch": 0.25, + "learning_rate": 0.00028511693834160166, + "loss": 0.808, + "step": 240 + }, + { + "epoch": 0.27, + "learning_rate": 0.0002829907866761162, + "loss": 0.8067, + "step": 260 + }, + { + "epoch": 0.29, + "learning_rate": 0.00028086463501063075, + "loss": 0.8058, + "step": 280 + }, + { + "epoch": 0.31, + "learning_rate": 0.00027873848334514527, + "loss": 0.7993, + "step": 300 + }, + { + "epoch": 0.33, + "learning_rate": 0.0002766123316796598, + "loss": 0.802, + "step": 320 + }, + { + "epoch": 0.35, + "learning_rate": 0.0002744861800141743, + "loss": 0.7932, + "step": 340 + }, + { + "epoch": 0.37, + "learning_rate": 0.0002723600283486889, + "loss": 0.7873, + "step": 360 + }, + { + "epoch": 0.39, + "learning_rate": 0.0002702338766832034, + "loss": 0.7864, + "step": 380 + }, + { + "epoch": 0.41, + "learning_rate": 0.0002681077250177179, + "loss": 0.7925, + "step": 400 + }, + { + "epoch": 0.41, + "eval_loss": 0.8077966570854187, + "eval_runtime": 16.2968, + "eval_samples_per_second": 122.724, + "eval_steps_per_second": 1.964, + "step": 400 + }, + { + "epoch": 0.43, + "learning_rate": 0.00026598157335223243, + "loss": 0.7862, + "step": 420 + }, + { + "epoch": 0.45, + "learning_rate": 0.00026385542168674695, + "loss": 0.7825, + "step": 440 + }, + { + "epoch": 0.47, + "learning_rate": 0.0002617292700212615, + "loss": 0.7914, + "step": 460 + }, + { + "epoch": 0.49, + "learning_rate": 0.00025960311835577604, + "loss": 0.7951, + "step": 480 + }, + { + "epoch": 0.51, + "learning_rate": 0.00025747696669029056, + "loss": 0.7824, + "step": 500 + }, + { + "epoch": 0.53, + "learning_rate": 0.0002553508150248051, + "loss": 0.7808, + "step": 520 + }, + { + "epoch": 0.55, + "learning_rate": 0.0002532246633593196, + "loss": 0.7811, + "step": 540 + }, + { + "epoch": 0.57, + "learning_rate": 0.0002510985116938341, + "loss": 0.7826, + "step": 560 + }, + { + "epoch": 0.6, + "learning_rate": 0.0002489723600283487, + "loss": 0.7758, + "step": 580 + }, + { + "epoch": 0.62, + "learning_rate": 0.0002468462083628632, + "loss": 0.7827, + "step": 600 + }, + { + "epoch": 0.62, + "eval_loss": 0.796688973903656, + "eval_runtime": 16.4193, + "eval_samples_per_second": 121.808, + "eval_steps_per_second": 1.949, + "step": 600 + }, + { + "epoch": 0.64, + "learning_rate": 0.0002447200566973777, + "loss": 0.7732, + "step": 620 + }, + { + "epoch": 0.66, + "learning_rate": 0.00024259390503189224, + "loss": 0.7784, + "step": 640 + }, + { + "epoch": 0.68, + "learning_rate": 0.0002404677533664068, + "loss": 0.78, + "step": 660 + }, + { + "epoch": 0.7, + "learning_rate": 0.0002383416017009213, + "loss": 0.7789, + "step": 680 + }, + { + "epoch": 0.72, + "learning_rate": 0.00023621545003543583, + "loss": 0.7762, + "step": 700 + }, + { + "epoch": 0.74, + "learning_rate": 0.0002340892983699504, + "loss": 0.7802, + "step": 720 + }, + { + "epoch": 0.76, + "learning_rate": 0.00023196314670446492, + "loss": 0.7749, + "step": 740 + }, + { + "epoch": 0.78, + "learning_rate": 0.00022983699503897943, + "loss": 0.7643, + "step": 760 + }, + { + "epoch": 0.8, + "learning_rate": 0.00022771084337349395, + "loss": 0.7648, + "step": 780 + }, + { + "epoch": 0.82, + "learning_rate": 0.00022558469170800847, + "loss": 0.767, + "step": 800 + }, + { + "epoch": 0.82, + "eval_loss": 0.7896291613578796, + "eval_runtime": 16.2986, + "eval_samples_per_second": 122.71, + "eval_steps_per_second": 1.963, + "step": 800 + }, + { + "epoch": 0.84, + "learning_rate": 0.00022345854004252302, + "loss": 0.7707, + "step": 820 + }, + { + "epoch": 0.86, + "learning_rate": 0.00022133238837703754, + "loss": 0.76, + "step": 840 + }, + { + "epoch": 0.88, + "learning_rate": 0.00021920623671155208, + "loss": 0.7644, + "step": 860 + }, + { + "epoch": 0.9, + "learning_rate": 0.0002170800850460666, + "loss": 0.7741, + "step": 880 + }, + { + "epoch": 0.92, + "learning_rate": 0.00021495393338058114, + "loss": 0.7678, + "step": 900 + }, + { + "epoch": 0.94, + "learning_rate": 0.00021282778171509566, + "loss": 0.76, + "step": 920 + }, + { + "epoch": 0.96, + "learning_rate": 0.00021070163004961018, + "loss": 0.779, + "step": 940 + }, + { + "epoch": 0.99, + "learning_rate": 0.0002085754783841247, + "loss": 0.7605, + "step": 960 + }, + { + "epoch": 1.01, + "learning_rate": 0.00020644932671863922, + "loss": 0.7571, + "step": 980 + }, + { + "epoch": 1.03, + "learning_rate": 0.0002043231750531538, + "loss": 0.7595, + "step": 1000 + }, + { + "epoch": 1.03, + "eval_loss": 0.7847340106964111, + "eval_runtime": 16.2989, + "eval_samples_per_second": 122.707, + "eval_steps_per_second": 1.963, + "step": 1000 + }, + { + "epoch": 1.05, + "learning_rate": 0.0002021970233876683, + "loss": 0.7569, + "step": 1020 + }, + { + "epoch": 1.07, + "learning_rate": 0.00020007087172218283, + "loss": 0.7523, + "step": 1040 + }, + { + "epoch": 1.09, + "learning_rate": 0.00019794472005669735, + "loss": 0.7621, + "step": 1060 + }, + { + "epoch": 1.11, + "learning_rate": 0.0001958185683912119, + "loss": 0.7556, + "step": 1080 + }, + { + "epoch": 1.13, + "learning_rate": 0.0001936924167257264, + "loss": 0.7594, + "step": 1100 + }, + { + "epoch": 1.15, + "learning_rate": 0.00019156626506024093, + "loss": 0.7612, + "step": 1120 + }, + { + "epoch": 1.17, + "learning_rate": 0.0001894401133947555, + "loss": 0.7587, + "step": 1140 + }, + { + "epoch": 1.19, + "learning_rate": 0.00018731396172927002, + "loss": 0.7533, + "step": 1160 + }, + { + "epoch": 1.21, + "learning_rate": 0.00018518781006378454, + "loss": 0.7573, + "step": 1180 + }, + { + "epoch": 1.23, + "learning_rate": 0.00018306165839829906, + "loss": 0.761, + "step": 1200 + }, + { + "epoch": 1.23, + "eval_loss": 0.7821407318115234, + "eval_runtime": 16.2951, + "eval_samples_per_second": 122.736, + "eval_steps_per_second": 1.964, + "step": 1200 + }, + { + "epoch": 1.25, + "learning_rate": 0.00018093550673281358, + "loss": 0.7548, + "step": 1220 + }, + { + "epoch": 1.27, + "learning_rate": 0.00017880935506732812, + "loss": 0.7556, + "step": 1240 + }, + { + "epoch": 1.29, + "learning_rate": 0.00017668320340184267, + "loss": 0.7605, + "step": 1260 + }, + { + "epoch": 1.31, + "learning_rate": 0.00017455705173635719, + "loss": 0.7528, + "step": 1280 + }, + { + "epoch": 1.33, + "learning_rate": 0.0001724309000708717, + "loss": 0.7584, + "step": 1300 + }, + { + "epoch": 1.35, + "learning_rate": 0.00017030474840538625, + "loss": 0.7596, + "step": 1320 + }, + { + "epoch": 1.38, + "learning_rate": 0.00016817859673990077, + "loss": 0.7514, + "step": 1340 + }, + { + "epoch": 1.4, + "learning_rate": 0.00016605244507441529, + "loss": 0.746, + "step": 1360 + }, + { + "epoch": 1.42, + "learning_rate": 0.0001639262934089298, + "loss": 0.7581, + "step": 1380 + }, + { + "epoch": 1.44, + "learning_rate": 0.00016180014174344438, + "loss": 0.7619, + "step": 1400 + }, + { + "epoch": 1.44, + "eval_loss": 0.777686357498169, + "eval_runtime": 16.3364, + "eval_samples_per_second": 122.426, + "eval_steps_per_second": 1.959, + "step": 1400 + }, + { + "epoch": 1.46, + "learning_rate": 0.0001596739900779589, + "loss": 0.7421, + "step": 1420 + }, + { + "epoch": 1.48, + "learning_rate": 0.00015754783841247341, + "loss": 0.745, + "step": 1440 + }, + { + "epoch": 1.5, + "learning_rate": 0.00015542168674698793, + "loss": 0.7631, + "step": 1460 + }, + { + "epoch": 1.52, + "learning_rate": 0.00015329553508150245, + "loss": 0.7532, + "step": 1480 + }, + { + "epoch": 1.54, + "learning_rate": 0.000151169383416017, + "loss": 0.7491, + "step": 1500 + }, + { + "epoch": 1.56, + "learning_rate": 0.00014904323175053151, + "loss": 0.743, + "step": 1520 + }, + { + "epoch": 1.58, + "learning_rate": 0.00014691708008504606, + "loss": 0.7575, + "step": 1540 + }, + { + "epoch": 1.6, + "learning_rate": 0.00014479092841956058, + "loss": 0.7519, + "step": 1560 + }, + { + "epoch": 1.62, + "learning_rate": 0.00014266477675407512, + "loss": 0.7504, + "step": 1580 + }, + { + "epoch": 1.64, + "learning_rate": 0.00014053862508858964, + "loss": 0.748, + "step": 1600 + }, + { + "epoch": 1.64, + "eval_loss": 0.7752982378005981, + "eval_runtime": 16.3209, + "eval_samples_per_second": 122.542, + "eval_steps_per_second": 1.961, + "step": 1600 + }, + { + "epoch": 1.66, + "learning_rate": 0.00013841247342310416, + "loss": 0.7536, + "step": 1620 + }, + { + "epoch": 1.68, + "learning_rate": 0.0001362863217576187, + "loss": 0.7455, + "step": 1640 + }, + { + "epoch": 1.7, + "learning_rate": 0.00013416017009213323, + "loss": 0.7509, + "step": 1660 + }, + { + "epoch": 1.72, + "learning_rate": 0.00013203401842664774, + "loss": 0.7529, + "step": 1680 + }, + { + "epoch": 1.74, + "learning_rate": 0.0001299078667611623, + "loss": 0.7505, + "step": 1700 + }, + { + "epoch": 1.77, + "learning_rate": 0.0001277817150956768, + "loss": 0.7384, + "step": 1720 + }, + { + "epoch": 1.79, + "learning_rate": 0.00012565556343019135, + "loss": 0.7396, + "step": 1740 + }, + { + "epoch": 1.81, + "learning_rate": 0.00012352941176470587, + "loss": 0.7552, + "step": 1760 + }, + { + "epoch": 1.83, + "learning_rate": 0.0001214032600992204, + "loss": 0.7459, + "step": 1780 + }, + { + "epoch": 1.85, + "learning_rate": 0.00011927710843373494, + "loss": 0.7494, + "step": 1800 + }, + { + "epoch": 1.85, + "eval_loss": 0.7731354832649231, + "eval_runtime": 16.3191, + "eval_samples_per_second": 122.556, + "eval_steps_per_second": 1.961, + "step": 1800 + }, + { + "epoch": 1.87, + "learning_rate": 0.00011715095676824945, + "loss": 0.7498, + "step": 1820 + }, + { + "epoch": 1.89, + "learning_rate": 0.000115024805102764, + "loss": 0.7544, + "step": 1840 + }, + { + "epoch": 1.91, + "learning_rate": 0.00011289865343727852, + "loss": 0.751, + "step": 1860 + }, + { + "epoch": 1.93, + "learning_rate": 0.00011077250177179304, + "loss": 0.745, + "step": 1880 + }, + { + "epoch": 1.95, + "learning_rate": 0.00010864635010630757, + "loss": 0.742, + "step": 1900 + }, + { + "epoch": 1.97, + "learning_rate": 0.00010652019844082211, + "loss": 0.745, + "step": 1920 + }, + { + "epoch": 1.99, + "learning_rate": 0.00010439404677533663, + "loss": 0.7457, + "step": 1940 + }, + { + "epoch": 2.01, + "learning_rate": 0.00010226789510985115, + "loss": 0.7475, + "step": 1960 + }, + { + "epoch": 2.03, + "learning_rate": 0.0001001417434443657, + "loss": 0.742, + "step": 1980 + }, + { + "epoch": 2.05, + "learning_rate": 9.801559177888021e-05, + "loss": 0.7365, + "step": 2000 + }, + { + "epoch": 2.05, + "eval_loss": 0.7715820074081421, + "eval_runtime": 16.3224, + "eval_samples_per_second": 122.531, + "eval_steps_per_second": 1.96, + "step": 2000 + }, + { + "epoch": 2.07, + "learning_rate": 9.588944011339475e-05, + "loss": 0.7431, + "step": 2020 + }, + { + "epoch": 2.09, + "learning_rate": 9.376328844790927e-05, + "loss": 0.7515, + "step": 2040 + }, + { + "epoch": 2.11, + "learning_rate": 9.163713678242381e-05, + "loss": 0.7483, + "step": 2060 + }, + { + "epoch": 2.13, + "learning_rate": 8.951098511693833e-05, + "loss": 0.7355, + "step": 2080 + }, + { + "epoch": 2.15, + "learning_rate": 8.738483345145286e-05, + "loss": 0.7404, + "step": 2100 + }, + { + "epoch": 2.18, + "learning_rate": 8.525868178596739e-05, + "loss": 0.7403, + "step": 2120 + }, + { + "epoch": 2.2, + "learning_rate": 8.313253012048193e-05, + "loss": 0.7435, + "step": 2140 + }, + { + "epoch": 2.22, + "learning_rate": 8.100637845499644e-05, + "loss": 0.7503, + "step": 2160 + }, + { + "epoch": 2.24, + "learning_rate": 7.888022678951099e-05, + "loss": 0.7335, + "step": 2180 + }, + { + "epoch": 2.26, + "learning_rate": 7.675407512402551e-05, + "loss": 0.7481, + "step": 2200 + }, + { + "epoch": 2.26, + "eval_loss": 0.769282341003418, + "eval_runtime": 16.3527, + "eval_samples_per_second": 122.304, + "eval_steps_per_second": 1.957, + "step": 2200 + }, + { + "epoch": 2.28, + "learning_rate": 7.462792345854004e-05, + "loss": 0.7396, + "step": 2220 + }, + { + "epoch": 2.3, + "learning_rate": 7.250177179305457e-05, + "loss": 0.736, + "step": 2240 + }, + { + "epoch": 2.32, + "learning_rate": 7.037562012756909e-05, + "loss": 0.7443, + "step": 2260 + }, + { + "epoch": 2.34, + "learning_rate": 6.824946846208362e-05, + "loss": 0.7501, + "step": 2280 + }, + { + "epoch": 2.36, + "learning_rate": 6.612331679659815e-05, + "loss": 0.7335, + "step": 2300 + }, + { + "epoch": 2.38, + "learning_rate": 6.399716513111269e-05, + "loss": 0.7507, + "step": 2320 + }, + { + "epoch": 2.4, + "learning_rate": 6.187101346562722e-05, + "loss": 0.7449, + "step": 2340 + }, + { + "epoch": 2.42, + "learning_rate": 5.9744861800141736e-05, + "loss": 0.7397, + "step": 2360 + }, + { + "epoch": 2.44, + "learning_rate": 5.761871013465627e-05, + "loss": 0.743, + "step": 2380 + }, + { + "epoch": 2.46, + "learning_rate": 5.5492558469170794e-05, + "loss": 0.7416, + "step": 2400 + }, + { + "epoch": 2.46, + "eval_loss": 0.7678119540214539, + "eval_runtime": 16.3147, + "eval_samples_per_second": 122.589, + "eval_steps_per_second": 1.961, + "step": 2400 + }, + { + "epoch": 2.48, + "learning_rate": 5.3366406803685326e-05, + "loss": 0.7457, + "step": 2420 + }, + { + "epoch": 2.5, + "learning_rate": 5.124025513819986e-05, + "loss": 0.7405, + "step": 2440 + }, + { + "epoch": 2.52, + "learning_rate": 4.911410347271438e-05, + "loss": 0.7381, + "step": 2460 + }, + { + "epoch": 2.54, + "learning_rate": 4.6987951807228915e-05, + "loss": 0.736, + "step": 2480 + }, + { + "epoch": 2.57, + "learning_rate": 4.486180014174344e-05, + "loss": 0.7437, + "step": 2500 + }, + { + "epoch": 2.59, + "learning_rate": 4.273564847625797e-05, + "loss": 0.7485, + "step": 2520 + }, + { + "epoch": 2.61, + "learning_rate": 4.0609496810772504e-05, + "loss": 0.7383, + "step": 2540 + }, + { + "epoch": 2.63, + "learning_rate": 3.848334514528702e-05, + "loss": 0.7362, + "step": 2560 + }, + { + "epoch": 2.65, + "learning_rate": 3.635719347980156e-05, + "loss": 0.7387, + "step": 2580 + }, + { + "epoch": 2.67, + "learning_rate": 3.4231041814316086e-05, + "loss": 0.7419, + "step": 2600 + }, + { + "epoch": 2.67, + "eval_loss": 0.7669394612312317, + "eval_runtime": 16.3329, + "eval_samples_per_second": 122.452, + "eval_steps_per_second": 1.959, + "step": 2600 + } + ], + "max_steps": 2922, + "num_train_epochs": 3, + "total_flos": 6.758732816713777e+18, + "trial_name": null, + "trial_params": null +} diff --git a/adapters/saved-alpaca-cot7b/checkpoint-2600/training_args.bin b/adapters/saved-alpaca-cot7b/checkpoint-2600/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..5717c8a116166edb6610e6f7e767e232221a8890 --- /dev/null +++ b/adapters/saved-alpaca-cot7b/checkpoint-2600/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fb1b135b6618e07a419e53ff8d6d816f32941463416673f997f2bf9bbe4c42db +size 3643 diff --git a/adapters/saved-alpaca-cot7b/checkpoint-2800/optimizer.pt b/adapters/saved-alpaca-cot7b/checkpoint-2800/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..c1dea1312539b7a144f807d7fb385473ce1fee51 --- /dev/null +++ b/adapters/saved-alpaca-cot7b/checkpoint-2800/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:487b3e2130bf568fe6fc254ed7fdf44f46ca290e2380158b7888233a4f297ab5 +size 33629893 diff --git a/adapters/saved-alpaca-cot7b/checkpoint-2800/pytorch_model.bin b/adapters/saved-alpaca-cot7b/checkpoint-2800/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..924462da3e71ecf01a996ba6728f07d6a078a971 --- /dev/null +++ b/adapters/saved-alpaca-cot7b/checkpoint-2800/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:12d948747007bce4f007a625c24bc46319ab4abf787f65c827c47159d5265865 +size 16822989 diff --git a/adapters/saved-alpaca-cot7b/checkpoint-2800/rng_state_0.pth b/adapters/saved-alpaca-cot7b/checkpoint-2800/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..c1a24b6b864f15d4408b7af7a1dd1e711efb6221 --- /dev/null +++ b/adapters/saved-alpaca-cot7b/checkpoint-2800/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:366c1259a9bd573b002f5e9f900dcf590b8a237e42509dc46733ea86afa26d52 +size 14583 diff --git a/adapters/saved-alpaca-cot7b/checkpoint-2800/rng_state_1.pth b/adapters/saved-alpaca-cot7b/checkpoint-2800/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..a41d9ad0395709168b3f5ae66e0b14891481716b --- /dev/null +++ b/adapters/saved-alpaca-cot7b/checkpoint-2800/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c4de2f577b6ce660592fefbf29bec25f03adc18ca4343bc0e807c216876f21e +size 14583 diff --git a/adapters/saved-alpaca-cot7b/checkpoint-2800/rng_state_2.pth b/adapters/saved-alpaca-cot7b/checkpoint-2800/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..dd0f4f610e3b1cde00c17c0d0e331117a8443b01 --- /dev/null +++ b/adapters/saved-alpaca-cot7b/checkpoint-2800/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:29d25d04bedb28b5e965180bd6f4bdf56e6a3892912c043411759d451aea0211 +size 14583 diff --git a/adapters/saved-alpaca-cot7b/checkpoint-2800/rng_state_3.pth b/adapters/saved-alpaca-cot7b/checkpoint-2800/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..061f76e5c3595e1405c5271d908965068da98035 --- /dev/null +++ b/adapters/saved-alpaca-cot7b/checkpoint-2800/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:833742c96b68b42b555bd78a478094ea88ec535258a079d26d8ae42d8e07ab86 +size 14583 diff --git a/adapters/saved-alpaca-cot7b/checkpoint-2800/rng_state_4.pth b/adapters/saved-alpaca-cot7b/checkpoint-2800/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..2972d2bd0c79f388dea9a18d00c060827f6b4676 --- /dev/null +++ b/adapters/saved-alpaca-cot7b/checkpoint-2800/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3db2b169fce9195e20c96df5f7a14734805d90913dbdf769aee142bfaa3eb92d +size 14583 diff --git a/adapters/saved-alpaca-cot7b/checkpoint-2800/rng_state_5.pth b/adapters/saved-alpaca-cot7b/checkpoint-2800/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..035c75d6ad8d466d38820e3a57d3a4949cefda0e --- /dev/null +++ b/adapters/saved-alpaca-cot7b/checkpoint-2800/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:424445f2e002abf172124e6ce255891dab9e5d26e2c47342f30393e3415abfa5 +size 14583 diff --git a/adapters/saved-alpaca-cot7b/checkpoint-2800/rng_state_6.pth b/adapters/saved-alpaca-cot7b/checkpoint-2800/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..9a0247a6d3dd553a225a868971fae88a55c26823 --- /dev/null +++ b/adapters/saved-alpaca-cot7b/checkpoint-2800/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:42e75686c6516c713b04e66c79935288c1a2a3e6f5667bdb29fbccb4863b3a4e +size 14583 diff --git a/adapters/saved-alpaca-cot7b/checkpoint-2800/rng_state_7.pth b/adapters/saved-alpaca-cot7b/checkpoint-2800/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..b1ff6142c066efc032b44ae7bd4187e3c01856dd --- /dev/null +++ b/adapters/saved-alpaca-cot7b/checkpoint-2800/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:58000b0ac0e7988db4a8e41637883fa5143571333f32f6780a4706fd6b3e338f +size 14583 diff --git a/adapters/saved-alpaca-cot7b/checkpoint-2800/scaler.pt b/adapters/saved-alpaca-cot7b/checkpoint-2800/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..f7dd356e9b99ca534f1a188cab8067b6d7ce3b8f --- /dev/null +++ b/adapters/saved-alpaca-cot7b/checkpoint-2800/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dd6a3bb8c8d63cf885c60242a57958c538d7aa14ca7f199dd8ce9059bb1b68f8 +size 557 diff --git a/adapters/saved-alpaca-cot7b/checkpoint-2800/scheduler.pt b/adapters/saved-alpaca-cot7b/checkpoint-2800/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..79cfac3fb5ec6892d654c38e0bedc5a0d79cedca --- /dev/null +++ b/adapters/saved-alpaca-cot7b/checkpoint-2800/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a4095e4f3f07e2c29dc5d82b7d5c217691e90eda7037db035604e10afb78fb2 +size 627 diff --git a/adapters/saved-alpaca-cot7b/checkpoint-2800/trainer_state.json b/adapters/saved-alpaca-cot7b/checkpoint-2800/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..328e38d4c4045e5c7e67f1b917bb3a1618f4c452 --- /dev/null +++ b/adapters/saved-alpaca-cot7b/checkpoint-2800/trainer_state.json @@ -0,0 +1,968 @@ +{ + "best_metric": 0.7663606405258179, + "best_model_checkpoint": "/mnt/bn/qingyi-bn-lq/llama/saved-alpaca-cot7b/checkpoint-2800", + "epoch": 2.8732683427398666, + "global_step": 2800, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.02, + "learning_rate": 5.9999999999999995e-05, + "loss": 2.2274, + "step": 20 + }, + { + "epoch": 0.04, + "learning_rate": 0.00011999999999999999, + "loss": 1.8266, + "step": 40 + }, + { + "epoch": 0.06, + "learning_rate": 0.00017999999999999998, + "loss": 1.2659, + "step": 60 + }, + { + "epoch": 0.08, + "learning_rate": 0.00023999999999999998, + "loss": 1.0441, + "step": 80 + }, + { + "epoch": 0.1, + "learning_rate": 0.0003, + "loss": 0.9422, + "step": 100 + }, + { + "epoch": 0.12, + "learning_rate": 0.0002978738483345145, + "loss": 0.8938, + "step": 120 + }, + { + "epoch": 0.14, + "learning_rate": 0.000295747696669029, + "loss": 0.8698, + "step": 140 + }, + { + "epoch": 0.16, + "learning_rate": 0.00029362154500354353, + "loss": 0.8406, + "step": 160 + }, + { + "epoch": 0.18, + "learning_rate": 0.0002914953933380581, + "loss": 0.835, + "step": 180 + }, + { + "epoch": 0.21, + "learning_rate": 0.0002893692416725726, + "loss": 0.8199, + "step": 200 + }, + { + "epoch": 0.21, + "eval_loss": 0.8377243876457214, + "eval_runtime": 16.2911, + "eval_samples_per_second": 122.767, + "eval_steps_per_second": 1.964, + "step": 200 + }, + { + "epoch": 0.23, + "learning_rate": 0.00028724309000708714, + "loss": 0.8119, + "step": 220 + }, + { + "epoch": 0.25, + "learning_rate": 0.00028511693834160166, + "loss": 0.808, + "step": 240 + }, + { + "epoch": 0.27, + "learning_rate": 0.0002829907866761162, + "loss": 0.8067, + "step": 260 + }, + { + "epoch": 0.29, + "learning_rate": 0.00028086463501063075, + "loss": 0.8058, + "step": 280 + }, + { + "epoch": 0.31, + "learning_rate": 0.00027873848334514527, + "loss": 0.7993, + "step": 300 + }, + { + "epoch": 0.33, + "learning_rate": 0.0002766123316796598, + "loss": 0.802, + "step": 320 + }, + { + "epoch": 0.35, + "learning_rate": 0.0002744861800141743, + "loss": 0.7932, + "step": 340 + }, + { + "epoch": 0.37, + "learning_rate": 0.0002723600283486889, + "loss": 0.7873, + "step": 360 + }, + { + "epoch": 0.39, + "learning_rate": 0.0002702338766832034, + "loss": 0.7864, + "step": 380 + }, + { + "epoch": 0.41, + "learning_rate": 0.0002681077250177179, + "loss": 0.7925, + "step": 400 + }, + { + "epoch": 0.41, + "eval_loss": 0.8077966570854187, + "eval_runtime": 16.2968, + "eval_samples_per_second": 122.724, + "eval_steps_per_second": 1.964, + "step": 400 + }, + { + "epoch": 0.43, + "learning_rate": 0.00026598157335223243, + "loss": 0.7862, + "step": 420 + }, + { + "epoch": 0.45, + "learning_rate": 0.00026385542168674695, + "loss": 0.7825, + "step": 440 + }, + { + "epoch": 0.47, + "learning_rate": 0.0002617292700212615, + "loss": 0.7914, + "step": 460 + }, + { + "epoch": 0.49, + "learning_rate": 0.00025960311835577604, + "loss": 0.7951, + "step": 480 + }, + { + "epoch": 0.51, + "learning_rate": 0.00025747696669029056, + "loss": 0.7824, + "step": 500 + }, + { + "epoch": 0.53, + "learning_rate": 0.0002553508150248051, + "loss": 0.7808, + "step": 520 + }, + { + "epoch": 0.55, + "learning_rate": 0.0002532246633593196, + "loss": 0.7811, + "step": 540 + }, + { + "epoch": 0.57, + "learning_rate": 0.0002510985116938341, + "loss": 0.7826, + "step": 560 + }, + { + "epoch": 0.6, + "learning_rate": 0.0002489723600283487, + "loss": 0.7758, + "step": 580 + }, + { + "epoch": 0.62, + "learning_rate": 0.0002468462083628632, + "loss": 0.7827, + "step": 600 + }, + { + "epoch": 0.62, + "eval_loss": 0.796688973903656, + "eval_runtime": 16.4193, + "eval_samples_per_second": 121.808, + "eval_steps_per_second": 1.949, + "step": 600 + }, + { + "epoch": 0.64, + "learning_rate": 0.0002447200566973777, + "loss": 0.7732, + "step": 620 + }, + { + "epoch": 0.66, + "learning_rate": 0.00024259390503189224, + "loss": 0.7784, + "step": 640 + }, + { + "epoch": 0.68, + "learning_rate": 0.0002404677533664068, + "loss": 0.78, + "step": 660 + }, + { + "epoch": 0.7, + "learning_rate": 0.0002383416017009213, + "loss": 0.7789, + "step": 680 + }, + { + "epoch": 0.72, + "learning_rate": 0.00023621545003543583, + "loss": 0.7762, + "step": 700 + }, + { + "epoch": 0.74, + "learning_rate": 0.0002340892983699504, + "loss": 0.7802, + "step": 720 + }, + { + "epoch": 0.76, + "learning_rate": 0.00023196314670446492, + "loss": 0.7749, + "step": 740 + }, + { + "epoch": 0.78, + "learning_rate": 0.00022983699503897943, + "loss": 0.7643, + "step": 760 + }, + { + "epoch": 0.8, + "learning_rate": 0.00022771084337349395, + "loss": 0.7648, + "step": 780 + }, + { + "epoch": 0.82, + "learning_rate": 0.00022558469170800847, + "loss": 0.767, + "step": 800 + }, + { + "epoch": 0.82, + "eval_loss": 0.7896291613578796, + "eval_runtime": 16.2986, + "eval_samples_per_second": 122.71, + "eval_steps_per_second": 1.963, + "step": 800 + }, + { + "epoch": 0.84, + "learning_rate": 0.00022345854004252302, + "loss": 0.7707, + "step": 820 + }, + { + "epoch": 0.86, + "learning_rate": 0.00022133238837703754, + "loss": 0.76, + "step": 840 + }, + { + "epoch": 0.88, + "learning_rate": 0.00021920623671155208, + "loss": 0.7644, + "step": 860 + }, + { + "epoch": 0.9, + "learning_rate": 0.0002170800850460666, + "loss": 0.7741, + "step": 880 + }, + { + "epoch": 0.92, + "learning_rate": 0.00021495393338058114, + "loss": 0.7678, + "step": 900 + }, + { + "epoch": 0.94, + "learning_rate": 0.00021282778171509566, + "loss": 0.76, + "step": 920 + }, + { + "epoch": 0.96, + "learning_rate": 0.00021070163004961018, + "loss": 0.779, + "step": 940 + }, + { + "epoch": 0.99, + "learning_rate": 0.0002085754783841247, + "loss": 0.7605, + "step": 960 + }, + { + "epoch": 1.01, + "learning_rate": 0.00020644932671863922, + "loss": 0.7571, + "step": 980 + }, + { + "epoch": 1.03, + "learning_rate": 0.0002043231750531538, + "loss": 0.7595, + "step": 1000 + }, + { + "epoch": 1.03, + "eval_loss": 0.7847340106964111, + "eval_runtime": 16.2989, + "eval_samples_per_second": 122.707, + "eval_steps_per_second": 1.963, + "step": 1000 + }, + { + "epoch": 1.05, + "learning_rate": 0.0002021970233876683, + "loss": 0.7569, + "step": 1020 + }, + { + "epoch": 1.07, + "learning_rate": 0.00020007087172218283, + "loss": 0.7523, + "step": 1040 + }, + { + "epoch": 1.09, + "learning_rate": 0.00019794472005669735, + "loss": 0.7621, + "step": 1060 + }, + { + "epoch": 1.11, + "learning_rate": 0.0001958185683912119, + "loss": 0.7556, + "step": 1080 + }, + { + "epoch": 1.13, + "learning_rate": 0.0001936924167257264, + "loss": 0.7594, + "step": 1100 + }, + { + "epoch": 1.15, + "learning_rate": 0.00019156626506024093, + "loss": 0.7612, + "step": 1120 + }, + { + "epoch": 1.17, + "learning_rate": 0.0001894401133947555, + "loss": 0.7587, + "step": 1140 + }, + { + "epoch": 1.19, + "learning_rate": 0.00018731396172927002, + "loss": 0.7533, + "step": 1160 + }, + { + "epoch": 1.21, + "learning_rate": 0.00018518781006378454, + "loss": 0.7573, + "step": 1180 + }, + { + "epoch": 1.23, + "learning_rate": 0.00018306165839829906, + "loss": 0.761, + "step": 1200 + }, + { + "epoch": 1.23, + "eval_loss": 0.7821407318115234, + "eval_runtime": 16.2951, + "eval_samples_per_second": 122.736, + "eval_steps_per_second": 1.964, + "step": 1200 + }, + { + "epoch": 1.25, + "learning_rate": 0.00018093550673281358, + "loss": 0.7548, + "step": 1220 + }, + { + "epoch": 1.27, + "learning_rate": 0.00017880935506732812, + "loss": 0.7556, + "step": 1240 + }, + { + "epoch": 1.29, + "learning_rate": 0.00017668320340184267, + "loss": 0.7605, + "step": 1260 + }, + { + "epoch": 1.31, + "learning_rate": 0.00017455705173635719, + "loss": 0.7528, + "step": 1280 + }, + { + "epoch": 1.33, + "learning_rate": 0.0001724309000708717, + "loss": 0.7584, + "step": 1300 + }, + { + "epoch": 1.35, + "learning_rate": 0.00017030474840538625, + "loss": 0.7596, + "step": 1320 + }, + { + "epoch": 1.38, + "learning_rate": 0.00016817859673990077, + "loss": 0.7514, + "step": 1340 + }, + { + "epoch": 1.4, + "learning_rate": 0.00016605244507441529, + "loss": 0.746, + "step": 1360 + }, + { + "epoch": 1.42, + "learning_rate": 0.0001639262934089298, + "loss": 0.7581, + "step": 1380 + }, + { + "epoch": 1.44, + "learning_rate": 0.00016180014174344438, + "loss": 0.7619, + "step": 1400 + }, + { + "epoch": 1.44, + "eval_loss": 0.777686357498169, + "eval_runtime": 16.3364, + "eval_samples_per_second": 122.426, + "eval_steps_per_second": 1.959, + "step": 1400 + }, + { + "epoch": 1.46, + "learning_rate": 0.0001596739900779589, + "loss": 0.7421, + "step": 1420 + }, + { + "epoch": 1.48, + "learning_rate": 0.00015754783841247341, + "loss": 0.745, + "step": 1440 + }, + { + "epoch": 1.5, + "learning_rate": 0.00015542168674698793, + "loss": 0.7631, + "step": 1460 + }, + { + "epoch": 1.52, + "learning_rate": 0.00015329553508150245, + "loss": 0.7532, + "step": 1480 + }, + { + "epoch": 1.54, + "learning_rate": 0.000151169383416017, + "loss": 0.7491, + "step": 1500 + }, + { + "epoch": 1.56, + "learning_rate": 0.00014904323175053151, + "loss": 0.743, + "step": 1520 + }, + { + "epoch": 1.58, + "learning_rate": 0.00014691708008504606, + "loss": 0.7575, + "step": 1540 + }, + { + "epoch": 1.6, + "learning_rate": 0.00014479092841956058, + "loss": 0.7519, + "step": 1560 + }, + { + "epoch": 1.62, + "learning_rate": 0.00014266477675407512, + "loss": 0.7504, + "step": 1580 + }, + { + "epoch": 1.64, + "learning_rate": 0.00014053862508858964, + "loss": 0.748, + "step": 1600 + }, + { + "epoch": 1.64, + "eval_loss": 0.7752982378005981, + "eval_runtime": 16.3209, + "eval_samples_per_second": 122.542, + "eval_steps_per_second": 1.961, + "step": 1600 + }, + { + "epoch": 1.66, + "learning_rate": 0.00013841247342310416, + "loss": 0.7536, + "step": 1620 + }, + { + "epoch": 1.68, + "learning_rate": 0.0001362863217576187, + "loss": 0.7455, + "step": 1640 + }, + { + "epoch": 1.7, + "learning_rate": 0.00013416017009213323, + "loss": 0.7509, + "step": 1660 + }, + { + "epoch": 1.72, + "learning_rate": 0.00013203401842664774, + "loss": 0.7529, + "step": 1680 + }, + { + "epoch": 1.74, + "learning_rate": 0.0001299078667611623, + "loss": 0.7505, + "step": 1700 + }, + { + "epoch": 1.77, + "learning_rate": 0.0001277817150956768, + "loss": 0.7384, + "step": 1720 + }, + { + "epoch": 1.79, + "learning_rate": 0.00012565556343019135, + "loss": 0.7396, + "step": 1740 + }, + { + "epoch": 1.81, + "learning_rate": 0.00012352941176470587, + "loss": 0.7552, + "step": 1760 + }, + { + "epoch": 1.83, + "learning_rate": 0.0001214032600992204, + "loss": 0.7459, + "step": 1780 + }, + { + "epoch": 1.85, + "learning_rate": 0.00011927710843373494, + "loss": 0.7494, + "step": 1800 + }, + { + "epoch": 1.85, + "eval_loss": 0.7731354832649231, + "eval_runtime": 16.3191, + "eval_samples_per_second": 122.556, + "eval_steps_per_second": 1.961, + "step": 1800 + }, + { + "epoch": 1.87, + "learning_rate": 0.00011715095676824945, + "loss": 0.7498, + "step": 1820 + }, + { + "epoch": 1.89, + "learning_rate": 0.000115024805102764, + "loss": 0.7544, + "step": 1840 + }, + { + "epoch": 1.91, + "learning_rate": 0.00011289865343727852, + "loss": 0.751, + "step": 1860 + }, + { + "epoch": 1.93, + "learning_rate": 0.00011077250177179304, + "loss": 0.745, + "step": 1880 + }, + { + "epoch": 1.95, + "learning_rate": 0.00010864635010630757, + "loss": 0.742, + "step": 1900 + }, + { + "epoch": 1.97, + "learning_rate": 0.00010652019844082211, + "loss": 0.745, + "step": 1920 + }, + { + "epoch": 1.99, + "learning_rate": 0.00010439404677533663, + "loss": 0.7457, + "step": 1940 + }, + { + "epoch": 2.01, + "learning_rate": 0.00010226789510985115, + "loss": 0.7475, + "step": 1960 + }, + { + "epoch": 2.03, + "learning_rate": 0.0001001417434443657, + "loss": 0.742, + "step": 1980 + }, + { + "epoch": 2.05, + "learning_rate": 9.801559177888021e-05, + "loss": 0.7365, + "step": 2000 + }, + { + "epoch": 2.05, + "eval_loss": 0.7715820074081421, + "eval_runtime": 16.3224, + "eval_samples_per_second": 122.531, + "eval_steps_per_second": 1.96, + "step": 2000 + }, + { + "epoch": 2.07, + "learning_rate": 9.588944011339475e-05, + "loss": 0.7431, + "step": 2020 + }, + { + "epoch": 2.09, + "learning_rate": 9.376328844790927e-05, + "loss": 0.7515, + "step": 2040 + }, + { + "epoch": 2.11, + "learning_rate": 9.163713678242381e-05, + "loss": 0.7483, + "step": 2060 + }, + { + "epoch": 2.13, + "learning_rate": 8.951098511693833e-05, + "loss": 0.7355, + "step": 2080 + }, + { + "epoch": 2.15, + "learning_rate": 8.738483345145286e-05, + "loss": 0.7404, + "step": 2100 + }, + { + "epoch": 2.18, + "learning_rate": 8.525868178596739e-05, + "loss": 0.7403, + "step": 2120 + }, + { + "epoch": 2.2, + "learning_rate": 8.313253012048193e-05, + "loss": 0.7435, + "step": 2140 + }, + { + "epoch": 2.22, + "learning_rate": 8.100637845499644e-05, + "loss": 0.7503, + "step": 2160 + }, + { + "epoch": 2.24, + "learning_rate": 7.888022678951099e-05, + "loss": 0.7335, + "step": 2180 + }, + { + "epoch": 2.26, + "learning_rate": 7.675407512402551e-05, + "loss": 0.7481, + "step": 2200 + }, + { + "epoch": 2.26, + "eval_loss": 0.769282341003418, + "eval_runtime": 16.3527, + "eval_samples_per_second": 122.304, + "eval_steps_per_second": 1.957, + "step": 2200 + }, + { + "epoch": 2.28, + "learning_rate": 7.462792345854004e-05, + "loss": 0.7396, + "step": 2220 + }, + { + "epoch": 2.3, + "learning_rate": 7.250177179305457e-05, + "loss": 0.736, + "step": 2240 + }, + { + "epoch": 2.32, + "learning_rate": 7.037562012756909e-05, + "loss": 0.7443, + "step": 2260 + }, + { + "epoch": 2.34, + "learning_rate": 6.824946846208362e-05, + "loss": 0.7501, + "step": 2280 + }, + { + "epoch": 2.36, + "learning_rate": 6.612331679659815e-05, + "loss": 0.7335, + "step": 2300 + }, + { + "epoch": 2.38, + "learning_rate": 6.399716513111269e-05, + "loss": 0.7507, + "step": 2320 + }, + { + "epoch": 2.4, + "learning_rate": 6.187101346562722e-05, + "loss": 0.7449, + "step": 2340 + }, + { + "epoch": 2.42, + "learning_rate": 5.9744861800141736e-05, + "loss": 0.7397, + "step": 2360 + }, + { + "epoch": 2.44, + "learning_rate": 5.761871013465627e-05, + "loss": 0.743, + "step": 2380 + }, + { + "epoch": 2.46, + "learning_rate": 5.5492558469170794e-05, + "loss": 0.7416, + "step": 2400 + }, + { + "epoch": 2.46, + "eval_loss": 0.7678119540214539, + "eval_runtime": 16.3147, + "eval_samples_per_second": 122.589, + "eval_steps_per_second": 1.961, + "step": 2400 + }, + { + "epoch": 2.48, + "learning_rate": 5.3366406803685326e-05, + "loss": 0.7457, + "step": 2420 + }, + { + "epoch": 2.5, + "learning_rate": 5.124025513819986e-05, + "loss": 0.7405, + "step": 2440 + }, + { + "epoch": 2.52, + "learning_rate": 4.911410347271438e-05, + "loss": 0.7381, + "step": 2460 + }, + { + "epoch": 2.54, + "learning_rate": 4.6987951807228915e-05, + "loss": 0.736, + "step": 2480 + }, + { + "epoch": 2.57, + "learning_rate": 4.486180014174344e-05, + "loss": 0.7437, + "step": 2500 + }, + { + "epoch": 2.59, + "learning_rate": 4.273564847625797e-05, + "loss": 0.7485, + "step": 2520 + }, + { + "epoch": 2.61, + "learning_rate": 4.0609496810772504e-05, + "loss": 0.7383, + "step": 2540 + }, + { + "epoch": 2.63, + "learning_rate": 3.848334514528702e-05, + "loss": 0.7362, + "step": 2560 + }, + { + "epoch": 2.65, + "learning_rate": 3.635719347980156e-05, + "loss": 0.7387, + "step": 2580 + }, + { + "epoch": 2.67, + "learning_rate": 3.4231041814316086e-05, + "loss": 0.7419, + "step": 2600 + }, + { + "epoch": 2.67, + "eval_loss": 0.7669394612312317, + "eval_runtime": 16.3329, + "eval_samples_per_second": 122.452, + "eval_steps_per_second": 1.959, + "step": 2600 + }, + { + "epoch": 2.69, + "learning_rate": 3.210489014883061e-05, + "loss": 0.7401, + "step": 2620 + }, + { + "epoch": 2.71, + "learning_rate": 2.9978738483345144e-05, + "loss": 0.7505, + "step": 2640 + }, + { + "epoch": 2.73, + "learning_rate": 2.7852586817859672e-05, + "loss": 0.737, + "step": 2660 + }, + { + "epoch": 2.75, + "learning_rate": 2.5726435152374197e-05, + "loss": 0.733, + "step": 2680 + }, + { + "epoch": 2.77, + "learning_rate": 2.360028348688873e-05, + "loss": 0.7354, + "step": 2700 + }, + { + "epoch": 2.79, + "learning_rate": 2.1474131821403258e-05, + "loss": 0.7408, + "step": 2720 + }, + { + "epoch": 2.81, + "learning_rate": 1.9347980155917787e-05, + "loss": 0.7406, + "step": 2740 + }, + { + "epoch": 2.83, + "learning_rate": 1.7221828490432315e-05, + "loss": 0.7297, + "step": 2760 + }, + { + "epoch": 2.85, + "learning_rate": 1.5095676824946846e-05, + "loss": 0.7395, + "step": 2780 + }, + { + "epoch": 2.87, + "learning_rate": 1.2969525159461374e-05, + "loss": 0.7373, + "step": 2800 + }, + { + "epoch": 2.87, + "eval_loss": 0.7663606405258179, + "eval_runtime": 16.3261, + "eval_samples_per_second": 122.503, + "eval_steps_per_second": 1.96, + "step": 2800 + } + ], + "max_steps": 2922, + "num_train_epochs": 3, + "total_flos": 7.278685337138758e+18, + "trial_name": null, + "trial_params": null +} diff --git a/adapters/saved-alpaca-cot7b/checkpoint-2800/training_args.bin b/adapters/saved-alpaca-cot7b/checkpoint-2800/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..5717c8a116166edb6610e6f7e767e232221a8890 --- /dev/null +++ b/adapters/saved-alpaca-cot7b/checkpoint-2800/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fb1b135b6618e07a419e53ff8d6d816f32941463416673f997f2bf9bbe4c42db +size 3643 diff --git a/adapters/saved-belle-7b/adapter_config.json b/adapters/saved-belle-7b/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..6e56f2ae8f10fadfeec6c730ac6b119025824443 --- /dev/null +++ b/adapters/saved-belle-7b/adapter_config.json @@ -0,0 +1,18 @@ +{ + "base_model_name_or_path": "/mnt/bn/qingyi-bn-lq/llama/llama-7b-hf", + "bias": "none", + "enable_lora": null, + "fan_in_fan_out": false, + "inference_mode": true, + "lora_alpha": 16, + "lora_dropout": 0.05, + "merge_weights": false, + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/adapters/saved-belle-7b/adapter_model.bin b/adapters/saved-belle-7b/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..b8e61bc515cd92c045ac3cdca31f1d7b1faa5fb9 --- /dev/null +++ b/adapters/saved-belle-7b/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d27cd1d7d044de00ef678a2cdea05763d689b257ebb9e729ebc8f0ac78b8ccfd +size 16822989 diff --git a/adapters/saved-belle-7b/checkpoint-12200/optimizer.pt b/adapters/saved-belle-7b/checkpoint-12200/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..c8650995694460dae21195335867973af9eb22eb --- /dev/null +++ b/adapters/saved-belle-7b/checkpoint-12200/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d365f3765b9c52ee64bc0d0a5a732ff9d6337ca3504b5e7fd040c37882ba50a1 +size 33629893 diff --git a/adapters/saved-belle-7b/checkpoint-12200/pytorch_model.bin b/adapters/saved-belle-7b/checkpoint-12200/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..3a2b2307ceda49afc4df8a1913c2d406e66c232f --- /dev/null +++ b/adapters/saved-belle-7b/checkpoint-12200/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:886e23ec37465be5367b44bc13a21c565a55d404f61f3bfcc76996024e17b5bf +size 16822989 diff --git a/adapters/saved-belle-7b/checkpoint-12200/rng_state_0.pth b/adapters/saved-belle-7b/checkpoint-12200/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..36f7f000ca546b3aa3baf5a24743c660a6deb713 --- /dev/null +++ b/adapters/saved-belle-7b/checkpoint-12200/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:34461786290aa0ca336bf026956dac6ce959ad0db41a2b4651c07e1f5573c7b2 +size 14583 diff --git a/adapters/saved-belle-7b/checkpoint-12200/rng_state_1.pth b/adapters/saved-belle-7b/checkpoint-12200/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..fdec40ddf487ea07484b1155cc95144592b2495f --- /dev/null +++ b/adapters/saved-belle-7b/checkpoint-12200/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c84010b0b1473496b21b47685459371700a3e6310c5c639b8f849d49af7e90fa +size 14583 diff --git a/adapters/saved-belle-7b/checkpoint-12200/rng_state_2.pth b/adapters/saved-belle-7b/checkpoint-12200/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..bb32cb92f940eb7aa8e154d1768aada845f31cc5 --- /dev/null +++ b/adapters/saved-belle-7b/checkpoint-12200/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3e22b8a061fa407e05cbda29f578b9479529adeed90b58dabfa43232bb1c693f +size 14583 diff --git a/adapters/saved-belle-7b/checkpoint-12200/rng_state_3.pth b/adapters/saved-belle-7b/checkpoint-12200/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..6c1d94f4a5a96fc73cbd7e021ed7009f522671bf --- /dev/null +++ b/adapters/saved-belle-7b/checkpoint-12200/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a52fc370cacb6ed50ab35cf5c8e926aa06f63c2421c005b076861929d5244945 +size 14583 diff --git a/adapters/saved-belle-7b/checkpoint-12200/scaler.pt b/adapters/saved-belle-7b/checkpoint-12200/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..1e582126a33ecd4d1c5c8a36d98bc903bba61dcb --- /dev/null +++ b/adapters/saved-belle-7b/checkpoint-12200/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3db6aae409dfc7a1801f262d5a8f0ba2416ebaf4ef2e6cbb64296ce228b8efa2 +size 557 diff --git a/adapters/saved-belle-7b/checkpoint-12200/scheduler.pt b/adapters/saved-belle-7b/checkpoint-12200/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..72606bd96bfa631a00c04c305afb6de044cfee56 --- /dev/null +++ b/adapters/saved-belle-7b/checkpoint-12200/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:77921f51e1abfd1f754defa26b1df51f6b6921bccfafd22d003561dea6bb8f05 +size 627 diff --git a/adapters/saved-belle-7b/checkpoint-12200/trainer_state.json b/adapters/saved-belle-7b/checkpoint-12200/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..bd93bbc8dc8addba0b8b8c6af4b60d58f91d166b --- /dev/null +++ b/adapters/saved-belle-7b/checkpoint-12200/trainer_state.json @@ -0,0 +1,4164 @@ +{ + "best_metric": 0.7255927324295044, + "best_model_checkpoint": "/mnt/bn/qingyi-bn-lq/llama/saved-belle-7b/checkpoint-12200", + "epoch": 2.8847574852954216, + "global_step": 12200, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 5.9999999999999995e-05, + "loss": 1.8908, + "step": 20 + }, + { + "epoch": 0.01, + "learning_rate": 0.00011999999999999999, + "loss": 1.5545, + "step": 40 + }, + { + "epoch": 0.01, + "learning_rate": 0.00017999999999999998, + "loss": 1.1252, + "step": 60 + }, + { + "epoch": 0.02, + "learning_rate": 0.00023999999999999998, + "loss": 1.054, + "step": 80 + }, + { + "epoch": 0.02, + "learning_rate": 0.0003, + "loss": 1.0137, + "step": 100 + }, + { + "epoch": 0.03, + "learning_rate": 0.0002995233177087471, + "loss": 1.0046, + "step": 120 + }, + { + "epoch": 0.03, + "learning_rate": 0.0002990466354174942, + "loss": 0.9867, + "step": 140 + }, + { + "epoch": 0.04, + "learning_rate": 0.00029856995312624134, + "loss": 0.9612, + "step": 160 + }, + { + "epoch": 0.04, + "learning_rate": 0.0002980932708349884, + "loss": 0.9588, + "step": 180 + }, + { + "epoch": 0.05, + "learning_rate": 0.0002976165885437356, + "loss": 0.9551, + "step": 200 + }, + { + "epoch": 0.05, + "eval_loss": 0.9459459185600281, + "eval_runtime": 19.4211, + "eval_samples_per_second": 102.981, + "eval_steps_per_second": 3.244, + "step": 200 + }, + { + "epoch": 0.05, + "learning_rate": 0.0002971399062524827, + "loss": 0.9516, + "step": 220 + }, + { + "epoch": 0.06, + "learning_rate": 0.00029666322396122984, + "loss": 0.937, + "step": 240 + }, + { + "epoch": 0.06, + "learning_rate": 0.0002961865416699769, + "loss": 0.936, + "step": 260 + }, + { + "epoch": 0.07, + "learning_rate": 0.00029570985937872406, + "loss": 0.9305, + "step": 280 + }, + { + "epoch": 0.07, + "learning_rate": 0.00029523317708747115, + "loss": 0.9146, + "step": 300 + }, + { + "epoch": 0.08, + "learning_rate": 0.0002947564947962183, + "loss": 0.9226, + "step": 320 + }, + { + "epoch": 0.08, + "learning_rate": 0.00029427981250496543, + "loss": 0.9108, + "step": 340 + }, + { + "epoch": 0.09, + "learning_rate": 0.0002938031302137125, + "loss": 0.9129, + "step": 360 + }, + { + "epoch": 0.09, + "learning_rate": 0.00029332644792245965, + "loss": 0.9063, + "step": 380 + }, + { + "epoch": 0.09, + "learning_rate": 0.0002928497656312068, + "loss": 0.8996, + "step": 400 + }, + { + "epoch": 0.09, + "eval_loss": 0.9003962874412537, + "eval_runtime": 19.5716, + "eval_samples_per_second": 102.189, + "eval_steps_per_second": 3.219, + "step": 400 + }, + { + "epoch": 0.1, + "learning_rate": 0.00029237308333995393, + "loss": 0.898, + "step": 420 + }, + { + "epoch": 0.1, + "learning_rate": 0.000291896401048701, + "loss": 0.8936, + "step": 440 + }, + { + "epoch": 0.11, + "learning_rate": 0.00029141971875744815, + "loss": 0.8932, + "step": 460 + }, + { + "epoch": 0.11, + "learning_rate": 0.00029094303646619524, + "loss": 0.8779, + "step": 480 + }, + { + "epoch": 0.12, + "learning_rate": 0.0002904663541749424, + "loss": 0.8871, + "step": 500 + }, + { + "epoch": 0.12, + "learning_rate": 0.00028998967188368946, + "loss": 0.8929, + "step": 520 + }, + { + "epoch": 0.13, + "learning_rate": 0.0002895129895924366, + "loss": 0.8878, + "step": 540 + }, + { + "epoch": 0.13, + "learning_rate": 0.00028903630730118374, + "loss": 0.8818, + "step": 560 + }, + { + "epoch": 0.14, + "learning_rate": 0.00028855962500993083, + "loss": 0.8826, + "step": 580 + }, + { + "epoch": 0.14, + "learning_rate": 0.00028808294271867797, + "loss": 0.879, + "step": 600 + }, + { + "epoch": 0.14, + "eval_loss": 0.8738257884979248, + "eval_runtime": 19.3526, + "eval_samples_per_second": 103.345, + "eval_steps_per_second": 3.255, + "step": 600 + }, + { + "epoch": 0.15, + "learning_rate": 0.0002876062604274251, + "loss": 0.87, + "step": 620 + }, + { + "epoch": 0.15, + "learning_rate": 0.00028712957813617224, + "loss": 0.8715, + "step": 640 + }, + { + "epoch": 0.16, + "learning_rate": 0.00028665289584491933, + "loss": 0.8724, + "step": 660 + }, + { + "epoch": 0.16, + "learning_rate": 0.00028617621355366647, + "loss": 0.8741, + "step": 680 + }, + { + "epoch": 0.17, + "learning_rate": 0.00028569953126241355, + "loss": 0.8705, + "step": 700 + }, + { + "epoch": 0.17, + "learning_rate": 0.0002852228489711607, + "loss": 0.8702, + "step": 720 + }, + { + "epoch": 0.17, + "learning_rate": 0.00028474616667990783, + "loss": 0.8618, + "step": 740 + }, + { + "epoch": 0.18, + "learning_rate": 0.0002842694843886549, + "loss": 0.8617, + "step": 760 + }, + { + "epoch": 0.18, + "learning_rate": 0.00028379280209740206, + "loss": 0.8677, + "step": 780 + }, + { + "epoch": 0.19, + "learning_rate": 0.0002833161198061492, + "loss": 0.853, + "step": 800 + }, + { + "epoch": 0.19, + "eval_loss": 0.8541846871376038, + "eval_runtime": 19.49, + "eval_samples_per_second": 102.617, + "eval_steps_per_second": 3.232, + "step": 800 + }, + { + "epoch": 0.19, + "learning_rate": 0.0002828394375148963, + "loss": 0.8549, + "step": 820 + }, + { + "epoch": 0.2, + "learning_rate": 0.0002823627552236434, + "loss": 0.847, + "step": 840 + }, + { + "epoch": 0.2, + "learning_rate": 0.00028188607293239056, + "loss": 0.8585, + "step": 860 + }, + { + "epoch": 0.21, + "learning_rate": 0.00028140939064113764, + "loss": 0.8568, + "step": 880 + }, + { + "epoch": 0.21, + "learning_rate": 0.0002809327083498848, + "loss": 0.8482, + "step": 900 + }, + { + "epoch": 0.22, + "learning_rate": 0.00028045602605863187, + "loss": 0.845, + "step": 920 + }, + { + "epoch": 0.22, + "learning_rate": 0.000279979343767379, + "loss": 0.8548, + "step": 940 + }, + { + "epoch": 0.23, + "learning_rate": 0.00027950266147612615, + "loss": 0.8372, + "step": 960 + }, + { + "epoch": 0.23, + "learning_rate": 0.0002790259791848733, + "loss": 0.8423, + "step": 980 + }, + { + "epoch": 0.24, + "learning_rate": 0.00027854929689362037, + "loss": 0.8433, + "step": 1000 + }, + { + "epoch": 0.24, + "eval_loss": 0.8396860361099243, + "eval_runtime": 19.3459, + "eval_samples_per_second": 103.381, + "eval_steps_per_second": 3.257, + "step": 1000 + }, + { + "epoch": 0.24, + "learning_rate": 0.0002780726146023675, + "loss": 0.848, + "step": 1020 + }, + { + "epoch": 0.25, + "learning_rate": 0.0002775959323111146, + "loss": 0.8383, + "step": 1040 + }, + { + "epoch": 0.25, + "learning_rate": 0.00027711925001986173, + "loss": 0.8385, + "step": 1060 + }, + { + "epoch": 0.26, + "learning_rate": 0.0002766425677286089, + "loss": 0.8308, + "step": 1080 + }, + { + "epoch": 0.26, + "learning_rate": 0.00027616588543735596, + "loss": 0.8244, + "step": 1100 + }, + { + "epoch": 0.26, + "learning_rate": 0.0002756892031461031, + "loss": 0.835, + "step": 1120 + }, + { + "epoch": 0.27, + "learning_rate": 0.0002752125208548502, + "loss": 0.8337, + "step": 1140 + }, + { + "epoch": 0.27, + "learning_rate": 0.0002747358385635974, + "loss": 0.8348, + "step": 1160 + }, + { + "epoch": 0.28, + "learning_rate": 0.00027425915627234446, + "loss": 0.8353, + "step": 1180 + }, + { + "epoch": 0.28, + "learning_rate": 0.0002737824739810916, + "loss": 0.8294, + "step": 1200 + }, + { + "epoch": 0.28, + "eval_loss": 0.8274422287940979, + "eval_runtime": 19.4187, + "eval_samples_per_second": 102.993, + "eval_steps_per_second": 3.244, + "step": 1200 + }, + { + "epoch": 0.29, + "learning_rate": 0.0002733057916898387, + "loss": 0.8337, + "step": 1220 + }, + { + "epoch": 0.29, + "learning_rate": 0.0002728291093985858, + "loss": 0.8435, + "step": 1240 + }, + { + "epoch": 0.3, + "learning_rate": 0.00027235242710733296, + "loss": 0.8347, + "step": 1260 + }, + { + "epoch": 0.3, + "learning_rate": 0.00027187574481608005, + "loss": 0.8258, + "step": 1280 + }, + { + "epoch": 0.31, + "learning_rate": 0.0002713990625248272, + "loss": 0.8304, + "step": 1300 + }, + { + "epoch": 0.31, + "learning_rate": 0.0002709223802335743, + "loss": 0.8264, + "step": 1320 + }, + { + "epoch": 0.32, + "learning_rate": 0.0002704456979423214, + "loss": 0.8313, + "step": 1340 + }, + { + "epoch": 0.32, + "learning_rate": 0.00026996901565106855, + "loss": 0.814, + "step": 1360 + }, + { + "epoch": 0.33, + "learning_rate": 0.0002694923333598157, + "loss": 0.8223, + "step": 1380 + }, + { + "epoch": 0.33, + "learning_rate": 0.0002690156510685628, + "loss": 0.8159, + "step": 1400 + }, + { + "epoch": 0.33, + "eval_loss": 0.8179089426994324, + "eval_runtime": 19.4736, + "eval_samples_per_second": 102.703, + "eval_steps_per_second": 3.235, + "step": 1400 + }, + { + "epoch": 0.34, + "learning_rate": 0.0002685389687773099, + "loss": 0.8218, + "step": 1420 + }, + { + "epoch": 0.34, + "learning_rate": 0.000268062286486057, + "loss": 0.808, + "step": 1440 + }, + { + "epoch": 0.35, + "learning_rate": 0.00026758560419480414, + "loss": 0.8253, + "step": 1460 + }, + { + "epoch": 0.35, + "learning_rate": 0.0002671089219035513, + "loss": 0.8174, + "step": 1480 + }, + { + "epoch": 0.35, + "learning_rate": 0.00026663223961229836, + "loss": 0.8157, + "step": 1500 + }, + { + "epoch": 0.36, + "learning_rate": 0.0002661555573210455, + "loss": 0.8142, + "step": 1520 + }, + { + "epoch": 0.36, + "learning_rate": 0.0002656788750297926, + "loss": 0.8112, + "step": 1540 + }, + { + "epoch": 0.37, + "learning_rate": 0.00026520219273853973, + "loss": 0.8232, + "step": 1560 + }, + { + "epoch": 0.37, + "learning_rate": 0.00026472551044728687, + "loss": 0.8254, + "step": 1580 + }, + { + "epoch": 0.38, + "learning_rate": 0.000264248828156034, + "loss": 0.8059, + "step": 1600 + }, + { + "epoch": 0.38, + "eval_loss": 0.8101135492324829, + "eval_runtime": 19.5846, + "eval_samples_per_second": 102.121, + "eval_steps_per_second": 3.217, + "step": 1600 + }, + { + "epoch": 0.38, + "learning_rate": 0.0002637721458647811, + "loss": 0.8062, + "step": 1620 + }, + { + "epoch": 0.39, + "learning_rate": 0.00026329546357352823, + "loss": 0.805, + "step": 1640 + }, + { + "epoch": 0.39, + "learning_rate": 0.0002628187812822753, + "loss": 0.8109, + "step": 1660 + }, + { + "epoch": 0.4, + "learning_rate": 0.00026234209899102245, + "loss": 0.801, + "step": 1680 + }, + { + "epoch": 0.4, + "learning_rate": 0.0002618654166997696, + "loss": 0.8043, + "step": 1700 + }, + { + "epoch": 0.41, + "learning_rate": 0.0002613887344085167, + "loss": 0.8002, + "step": 1720 + }, + { + "epoch": 0.41, + "learning_rate": 0.0002609120521172638, + "loss": 0.8152, + "step": 1740 + }, + { + "epoch": 0.42, + "learning_rate": 0.00026043536982601096, + "loss": 0.8052, + "step": 1760 + }, + { + "epoch": 0.42, + "learning_rate": 0.0002599586875347581, + "loss": 0.8136, + "step": 1780 + }, + { + "epoch": 0.43, + "learning_rate": 0.0002594820052435052, + "loss": 0.8044, + "step": 1800 + }, + { + "epoch": 0.43, + "eval_loss": 0.8030326962471008, + "eval_runtime": 19.4835, + "eval_samples_per_second": 102.651, + "eval_steps_per_second": 3.234, + "step": 1800 + }, + { + "epoch": 0.43, + "learning_rate": 0.0002590053229522523, + "loss": 0.7995, + "step": 1820 + }, + { + "epoch": 0.44, + "learning_rate": 0.0002585286406609994, + "loss": 0.7958, + "step": 1840 + }, + { + "epoch": 0.44, + "learning_rate": 0.00025805195836974654, + "loss": 0.8034, + "step": 1860 + }, + { + "epoch": 0.44, + "learning_rate": 0.00025757527607849363, + "loss": 0.8016, + "step": 1880 + }, + { + "epoch": 0.45, + "learning_rate": 0.00025709859378724077, + "loss": 0.8048, + "step": 1900 + }, + { + "epoch": 0.45, + "learning_rate": 0.0002566219114959879, + "loss": 0.8004, + "step": 1920 + }, + { + "epoch": 0.46, + "learning_rate": 0.00025614522920473505, + "loss": 0.8041, + "step": 1940 + }, + { + "epoch": 0.46, + "learning_rate": 0.00025566854691348213, + "loss": 0.7908, + "step": 1960 + }, + { + "epoch": 0.47, + "learning_rate": 0.00025519186462222927, + "loss": 0.7958, + "step": 1980 + }, + { + "epoch": 0.47, + "learning_rate": 0.0002547151823309764, + "loss": 0.8013, + "step": 2000 + }, + { + "epoch": 0.47, + "eval_loss": 0.7965430021286011, + "eval_runtime": 19.4852, + "eval_samples_per_second": 102.642, + "eval_steps_per_second": 3.233, + "step": 2000 + }, + { + "epoch": 0.48, + "learning_rate": 0.0002542385000397235, + "loss": 0.803, + "step": 2020 + }, + { + "epoch": 0.48, + "learning_rate": 0.00025376181774847064, + "loss": 0.7966, + "step": 2040 + }, + { + "epoch": 0.49, + "learning_rate": 0.0002532851354572177, + "loss": 0.7946, + "step": 2060 + }, + { + "epoch": 0.49, + "learning_rate": 0.00025280845316596486, + "loss": 0.8023, + "step": 2080 + }, + { + "epoch": 0.5, + "learning_rate": 0.00025233177087471194, + "loss": 0.7953, + "step": 2100 + }, + { + "epoch": 0.5, + "learning_rate": 0.0002518550885834591, + "loss": 0.8053, + "step": 2120 + }, + { + "epoch": 0.51, + "learning_rate": 0.0002513784062922062, + "loss": 0.7883, + "step": 2140 + }, + { + "epoch": 0.51, + "learning_rate": 0.00025090172400095336, + "loss": 0.7984, + "step": 2160 + }, + { + "epoch": 0.52, + "learning_rate": 0.00025042504170970045, + "loss": 0.7962, + "step": 2180 + }, + { + "epoch": 0.52, + "learning_rate": 0.0002499483594184476, + "loss": 0.7847, + "step": 2200 + }, + { + "epoch": 0.52, + "eval_loss": 0.7915623784065247, + "eval_runtime": 19.5509, + "eval_samples_per_second": 102.297, + "eval_steps_per_second": 3.222, + "step": 2200 + }, + { + "epoch": 0.52, + "learning_rate": 0.0002494716771271947, + "loss": 0.7917, + "step": 2220 + }, + { + "epoch": 0.53, + "learning_rate": 0.0002489949948359418, + "loss": 0.7942, + "step": 2240 + }, + { + "epoch": 0.53, + "learning_rate": 0.00024851831254468895, + "loss": 0.7921, + "step": 2260 + }, + { + "epoch": 0.54, + "learning_rate": 0.00024804163025343603, + "loss": 0.7971, + "step": 2280 + }, + { + "epoch": 0.54, + "learning_rate": 0.0002475649479621832, + "loss": 0.7919, + "step": 2300 + }, + { + "epoch": 0.55, + "learning_rate": 0.0002470882656709303, + "loss": 0.7917, + "step": 2320 + }, + { + "epoch": 0.55, + "learning_rate": 0.00024661158337967745, + "loss": 0.8024, + "step": 2340 + }, + { + "epoch": 0.56, + "learning_rate": 0.00024613490108842454, + "loss": 0.7761, + "step": 2360 + }, + { + "epoch": 0.56, + "learning_rate": 0.0002456582187971717, + "loss": 0.7958, + "step": 2380 + }, + { + "epoch": 0.57, + "learning_rate": 0.00024518153650591876, + "loss": 0.7855, + "step": 2400 + }, + { + "epoch": 0.57, + "eval_loss": 0.7870249152183533, + "eval_runtime": 19.5953, + "eval_samples_per_second": 102.065, + "eval_steps_per_second": 3.215, + "step": 2400 + }, + { + "epoch": 0.57, + "learning_rate": 0.0002447048542146659, + "loss": 0.784, + "step": 2420 + }, + { + "epoch": 0.58, + "learning_rate": 0.00024422817192341304, + "loss": 0.7926, + "step": 2440 + }, + { + "epoch": 0.58, + "learning_rate": 0.00024375148963216013, + "loss": 0.7845, + "step": 2460 + }, + { + "epoch": 0.59, + "learning_rate": 0.00024327480734090726, + "loss": 0.782, + "step": 2480 + }, + { + "epoch": 0.59, + "learning_rate": 0.00024279812504965438, + "loss": 0.7808, + "step": 2500 + }, + { + "epoch": 0.6, + "learning_rate": 0.00024232144275840152, + "loss": 0.7926, + "step": 2520 + }, + { + "epoch": 0.6, + "learning_rate": 0.00024184476046714863, + "loss": 0.7795, + "step": 2540 + }, + { + "epoch": 0.61, + "learning_rate": 0.00024136807817589574, + "loss": 0.7888, + "step": 2560 + }, + { + "epoch": 0.61, + "learning_rate": 0.00024089139588464288, + "loss": 0.7888, + "step": 2580 + }, + { + "epoch": 0.61, + "learning_rate": 0.00024041471359339, + "loss": 0.7863, + "step": 2600 + }, + { + "epoch": 0.61, + "eval_loss": 0.7825512290000916, + "eval_runtime": 19.4274, + "eval_samples_per_second": 102.948, + "eval_steps_per_second": 3.243, + "step": 2600 + }, + { + "epoch": 0.62, + "learning_rate": 0.0002399380313021371, + "loss": 0.7881, + "step": 2620 + }, + { + "epoch": 0.62, + "learning_rate": 0.00023946134901088422, + "loss": 0.7841, + "step": 2640 + }, + { + "epoch": 0.63, + "learning_rate": 0.00023898466671963133, + "loss": 0.7849, + "step": 2660 + }, + { + "epoch": 0.63, + "learning_rate": 0.00023850798442837844, + "loss": 0.7809, + "step": 2680 + }, + { + "epoch": 0.64, + "learning_rate": 0.0002380313021371256, + "loss": 0.7757, + "step": 2700 + }, + { + "epoch": 0.64, + "learning_rate": 0.00023755461984587272, + "loss": 0.7787, + "step": 2720 + }, + { + "epoch": 0.65, + "learning_rate": 0.00023707793755461983, + "loss": 0.7766, + "step": 2740 + }, + { + "epoch": 0.65, + "learning_rate": 0.00023660125526336694, + "loss": 0.7867, + "step": 2760 + }, + { + "epoch": 0.66, + "learning_rate": 0.00023612457297211405, + "loss": 0.7767, + "step": 2780 + }, + { + "epoch": 0.66, + "learning_rate": 0.0002356478906808612, + "loss": 0.7806, + "step": 2800 + }, + { + "epoch": 0.66, + "eval_loss": 0.7781409621238708, + "eval_runtime": 20.131, + "eval_samples_per_second": 99.349, + "eval_steps_per_second": 3.13, + "step": 2800 + }, + { + "epoch": 0.67, + "learning_rate": 0.0002351712083896083, + "loss": 0.7774, + "step": 2820 + }, + { + "epoch": 0.67, + "learning_rate": 0.00023469452609835542, + "loss": 0.7782, + "step": 2840 + }, + { + "epoch": 0.68, + "learning_rate": 0.00023421784380710253, + "loss": 0.7773, + "step": 2860 + }, + { + "epoch": 0.68, + "learning_rate": 0.00023374116151584964, + "loss": 0.7845, + "step": 2880 + }, + { + "epoch": 0.69, + "learning_rate": 0.0002332644792245968, + "loss": 0.7879, + "step": 2900 + }, + { + "epoch": 0.69, + "learning_rate": 0.00023278779693334392, + "loss": 0.7801, + "step": 2920 + }, + { + "epoch": 0.7, + "learning_rate": 0.00023231111464209103, + "loss": 0.7713, + "step": 2940 + }, + { + "epoch": 0.7, + "learning_rate": 0.00023183443235083814, + "loss": 0.7742, + "step": 2960 + }, + { + "epoch": 0.7, + "learning_rate": 0.00023135775005958526, + "loss": 0.7783, + "step": 2980 + }, + { + "epoch": 0.71, + "learning_rate": 0.0002308810677683324, + "loss": 0.7698, + "step": 3000 + }, + { + "epoch": 0.71, + "eval_loss": 0.7747411131858826, + "eval_runtime": 20.0968, + "eval_samples_per_second": 99.519, + "eval_steps_per_second": 3.135, + "step": 3000 + }, + { + "epoch": 0.71, + "learning_rate": 0.0002304043854770795, + "loss": 0.7696, + "step": 3020 + }, + { + "epoch": 0.72, + "learning_rate": 0.00022992770318582662, + "loss": 0.7744, + "step": 3040 + }, + { + "epoch": 0.72, + "learning_rate": 0.00022945102089457373, + "loss": 0.7687, + "step": 3060 + }, + { + "epoch": 0.73, + "learning_rate": 0.00022897433860332084, + "loss": 0.7765, + "step": 3080 + }, + { + "epoch": 0.73, + "learning_rate": 0.000228497656312068, + "loss": 0.7709, + "step": 3100 + }, + { + "epoch": 0.74, + "learning_rate": 0.00022802097402081512, + "loss": 0.773, + "step": 3120 + }, + { + "epoch": 0.74, + "learning_rate": 0.00022754429172956224, + "loss": 0.7862, + "step": 3140 + }, + { + "epoch": 0.75, + "learning_rate": 0.00022706760943830935, + "loss": 0.7668, + "step": 3160 + }, + { + "epoch": 0.75, + "learning_rate": 0.00022659092714705646, + "loss": 0.7816, + "step": 3180 + }, + { + "epoch": 0.76, + "learning_rate": 0.00022611424485580357, + "loss": 0.7831, + "step": 3200 + }, + { + "epoch": 0.76, + "eval_loss": 0.7719215154647827, + "eval_runtime": 19.6387, + "eval_samples_per_second": 101.84, + "eval_steps_per_second": 3.208, + "step": 3200 + }, + { + "epoch": 0.76, + "learning_rate": 0.0002256375625645507, + "loss": 0.7723, + "step": 3220 + }, + { + "epoch": 0.77, + "learning_rate": 0.00022516088027329782, + "loss": 0.7727, + "step": 3240 + }, + { + "epoch": 0.77, + "learning_rate": 0.00022468419798204493, + "loss": 0.7719, + "step": 3260 + }, + { + "epoch": 0.78, + "learning_rate": 0.00022420751569079207, + "loss": 0.7796, + "step": 3280 + }, + { + "epoch": 0.78, + "learning_rate": 0.0002237308333995392, + "loss": 0.7685, + "step": 3300 + }, + { + "epoch": 0.79, + "learning_rate": 0.00022325415110828633, + "loss": 0.7725, + "step": 3320 + }, + { + "epoch": 0.79, + "learning_rate": 0.00022277746881703344, + "loss": 0.7638, + "step": 3340 + }, + { + "epoch": 0.79, + "learning_rate": 0.00022230078652578055, + "loss": 0.7771, + "step": 3360 + }, + { + "epoch": 0.8, + "learning_rate": 0.00022182410423452766, + "loss": 0.7689, + "step": 3380 + }, + { + "epoch": 0.8, + "learning_rate": 0.00022134742194327477, + "loss": 0.7797, + "step": 3400 + }, + { + "epoch": 0.8, + "eval_loss": 0.768983006477356, + "eval_runtime": 19.4428, + "eval_samples_per_second": 102.866, + "eval_steps_per_second": 3.24, + "step": 3400 + }, + { + "epoch": 0.81, + "learning_rate": 0.0002208707396520219, + "loss": 0.7734, + "step": 3420 + }, + { + "epoch": 0.81, + "learning_rate": 0.00022039405736076903, + "loss": 0.7719, + "step": 3440 + }, + { + "epoch": 0.82, + "learning_rate": 0.00021991737506951614, + "loss": 0.767, + "step": 3460 + }, + { + "epoch": 0.82, + "learning_rate": 0.00021944069277826328, + "loss": 0.7758, + "step": 3480 + }, + { + "epoch": 0.83, + "learning_rate": 0.0002189640104870104, + "loss": 0.7768, + "step": 3500 + }, + { + "epoch": 0.83, + "learning_rate": 0.00021848732819575753, + "loss": 0.7641, + "step": 3520 + }, + { + "epoch": 0.84, + "learning_rate": 0.00021801064590450464, + "loss": 0.7694, + "step": 3540 + }, + { + "epoch": 0.84, + "learning_rate": 0.00021753396361325175, + "loss": 0.7835, + "step": 3560 + }, + { + "epoch": 0.85, + "learning_rate": 0.00021705728132199886, + "loss": 0.7642, + "step": 3580 + }, + { + "epoch": 0.85, + "learning_rate": 0.00021658059903074598, + "loss": 0.7719, + "step": 3600 + }, + { + "epoch": 0.85, + "eval_loss": 0.7660636305809021, + "eval_runtime": 19.5996, + "eval_samples_per_second": 102.043, + "eval_steps_per_second": 3.214, + "step": 3600 + }, + { + "epoch": 0.86, + "learning_rate": 0.0002161039167394931, + "loss": 0.7723, + "step": 3620 + }, + { + "epoch": 0.86, + "learning_rate": 0.00021562723444824023, + "loss": 0.76, + "step": 3640 + }, + { + "epoch": 0.87, + "learning_rate": 0.00021515055215698734, + "loss": 0.7643, + "step": 3660 + }, + { + "epoch": 0.87, + "learning_rate": 0.00021467386986573448, + "loss": 0.7599, + "step": 3680 + }, + { + "epoch": 0.87, + "learning_rate": 0.0002141971875744816, + "loss": 0.7623, + "step": 3700 + }, + { + "epoch": 0.88, + "learning_rate": 0.0002137205052832287, + "loss": 0.7621, + "step": 3720 + }, + { + "epoch": 0.88, + "learning_rate": 0.00021324382299197584, + "loss": 0.7691, + "step": 3740 + }, + { + "epoch": 0.89, + "learning_rate": 0.00021276714070072295, + "loss": 0.7665, + "step": 3760 + }, + { + "epoch": 0.89, + "learning_rate": 0.00021229045840947007, + "loss": 0.7742, + "step": 3780 + }, + { + "epoch": 0.9, + "learning_rate": 0.00021181377611821718, + "loss": 0.7624, + "step": 3800 + }, + { + "epoch": 0.9, + "eval_loss": 0.7643172740936279, + "eval_runtime": 19.487, + "eval_samples_per_second": 102.633, + "eval_steps_per_second": 3.233, + "step": 3800 + }, + { + "epoch": 0.9, + "learning_rate": 0.0002113370938269643, + "loss": 0.7726, + "step": 3820 + }, + { + "epoch": 0.91, + "learning_rate": 0.0002108604115357114, + "loss": 0.7559, + "step": 3840 + }, + { + "epoch": 0.91, + "learning_rate": 0.00021038372924445857, + "loss": 0.7634, + "step": 3860 + }, + { + "epoch": 0.92, + "learning_rate": 0.00020990704695320568, + "loss": 0.765, + "step": 3880 + }, + { + "epoch": 0.92, + "learning_rate": 0.0002094303646619528, + "loss": 0.7649, + "step": 3900 + }, + { + "epoch": 0.93, + "learning_rate": 0.0002089536823706999, + "loss": 0.763, + "step": 3920 + }, + { + "epoch": 0.93, + "learning_rate": 0.00020847700007944705, + "loss": 0.7679, + "step": 3940 + }, + { + "epoch": 0.94, + "learning_rate": 0.00020800031778819416, + "loss": 0.7644, + "step": 3960 + }, + { + "epoch": 0.94, + "learning_rate": 0.00020752363549694127, + "loss": 0.7655, + "step": 3980 + }, + { + "epoch": 0.95, + "learning_rate": 0.00020704695320568838, + "loss": 0.7681, + "step": 4000 + }, + { + "epoch": 0.95, + "eval_loss": 0.7610963582992554, + "eval_runtime": 19.5269, + "eval_samples_per_second": 102.423, + "eval_steps_per_second": 3.226, + "step": 4000 + }, + { + "epoch": 0.95, + "learning_rate": 0.0002065702709144355, + "loss": 0.7623, + "step": 4020 + }, + { + "epoch": 0.96, + "learning_rate": 0.0002060935886231826, + "loss": 0.7625, + "step": 4040 + }, + { + "epoch": 0.96, + "learning_rate": 0.00020561690633192977, + "loss": 0.7524, + "step": 4060 + }, + { + "epoch": 0.96, + "learning_rate": 0.00020514022404067688, + "loss": 0.764, + "step": 4080 + }, + { + "epoch": 0.97, + "learning_rate": 0.000204663541749424, + "loss": 0.7513, + "step": 4100 + }, + { + "epoch": 0.97, + "learning_rate": 0.0002041868594581711, + "loss": 0.753, + "step": 4120 + }, + { + "epoch": 0.98, + "learning_rate": 0.00020371017716691822, + "loss": 0.7602, + "step": 4140 + }, + { + "epoch": 0.98, + "learning_rate": 0.00020323349487566536, + "loss": 0.7701, + "step": 4160 + }, + { + "epoch": 0.99, + "learning_rate": 0.00020275681258441247, + "loss": 0.7602, + "step": 4180 + }, + { + "epoch": 0.99, + "learning_rate": 0.00020228013029315958, + "loss": 0.7598, + "step": 4200 + }, + { + "epoch": 0.99, + "eval_loss": 0.760128915309906, + "eval_runtime": 19.4387, + "eval_samples_per_second": 102.888, + "eval_steps_per_second": 3.241, + "step": 4200 + }, + { + "epoch": 1.0, + "learning_rate": 0.0002018034480019067, + "loss": 0.7579, + "step": 4220 + }, + { + "epoch": 1.0, + "learning_rate": 0.00020132676571065384, + "loss": 0.7628, + "step": 4240 + }, + { + "epoch": 1.01, + "learning_rate": 0.00020085008341940097, + "loss": 0.7551, + "step": 4260 + }, + { + "epoch": 1.01, + "learning_rate": 0.0002003734011281481, + "loss": 0.7582, + "step": 4280 + }, + { + "epoch": 1.02, + "learning_rate": 0.0001998967188368952, + "loss": 0.7623, + "step": 4300 + }, + { + "epoch": 1.02, + "learning_rate": 0.0001994200365456423, + "loss": 0.7504, + "step": 4320 + }, + { + "epoch": 1.03, + "learning_rate": 0.00019894335425438942, + "loss": 0.7587, + "step": 4340 + }, + { + "epoch": 1.03, + "learning_rate": 0.00019846667196313654, + "loss": 0.7528, + "step": 4360 + }, + { + "epoch": 1.04, + "learning_rate": 0.00019798998967188367, + "loss": 0.754, + "step": 4380 + }, + { + "epoch": 1.04, + "learning_rate": 0.00019751330738063079, + "loss": 0.759, + "step": 4400 + }, + { + "epoch": 1.04, + "eval_loss": 0.7575392127037048, + "eval_runtime": 19.5275, + "eval_samples_per_second": 102.42, + "eval_steps_per_second": 3.226, + "step": 4400 + }, + { + "epoch": 1.05, + "learning_rate": 0.0001970366250893779, + "loss": 0.7592, + "step": 4420 + }, + { + "epoch": 1.05, + "learning_rate": 0.00019655994279812504, + "loss": 0.7548, + "step": 4440 + }, + { + "epoch": 1.05, + "learning_rate": 0.00019608326050687218, + "loss": 0.7632, + "step": 4460 + }, + { + "epoch": 1.06, + "learning_rate": 0.0001956065782156193, + "loss": 0.7472, + "step": 4480 + }, + { + "epoch": 1.06, + "learning_rate": 0.0001951298959243664, + "loss": 0.7496, + "step": 4500 + }, + { + "epoch": 1.07, + "learning_rate": 0.0001946532136331135, + "loss": 0.7549, + "step": 4520 + }, + { + "epoch": 1.07, + "learning_rate": 0.00019417653134186063, + "loss": 0.77, + "step": 4540 + }, + { + "epoch": 1.08, + "learning_rate": 0.00019369984905060774, + "loss": 0.759, + "step": 4560 + }, + { + "epoch": 1.08, + "learning_rate": 0.00019322316675935488, + "loss": 0.7554, + "step": 4580 + }, + { + "epoch": 1.09, + "learning_rate": 0.000192746484468102, + "loss": 0.7577, + "step": 4600 + }, + { + "epoch": 1.09, + "eval_loss": 0.7568497061729431, + "eval_runtime": 19.53, + "eval_samples_per_second": 102.406, + "eval_steps_per_second": 3.226, + "step": 4600 + }, + { + "epoch": 1.09, + "learning_rate": 0.0001922698021768491, + "loss": 0.7617, + "step": 4620 + }, + { + "epoch": 1.1, + "learning_rate": 0.00019179311988559624, + "loss": 0.7551, + "step": 4640 + }, + { + "epoch": 1.1, + "learning_rate": 0.00019131643759434335, + "loss": 0.7482, + "step": 4660 + }, + { + "epoch": 1.11, + "learning_rate": 0.0001908397553030905, + "loss": 0.7516, + "step": 4680 + }, + { + "epoch": 1.11, + "learning_rate": 0.0001903630730118376, + "loss": 0.7555, + "step": 4700 + }, + { + "epoch": 1.12, + "learning_rate": 0.00018988639072058472, + "loss": 0.7605, + "step": 4720 + }, + { + "epoch": 1.12, + "learning_rate": 0.00018940970842933183, + "loss": 0.7506, + "step": 4740 + }, + { + "epoch": 1.13, + "learning_rate": 0.00018893302613807894, + "loss": 0.7622, + "step": 4760 + }, + { + "epoch": 1.13, + "learning_rate": 0.00018845634384682605, + "loss": 0.75, + "step": 4780 + }, + { + "epoch": 1.13, + "learning_rate": 0.0001879796615555732, + "loss": 0.7572, + "step": 4800 + }, + { + "epoch": 1.13, + "eval_loss": 0.7548028826713562, + "eval_runtime": 19.5411, + "eval_samples_per_second": 102.349, + "eval_steps_per_second": 3.224, + "step": 4800 + }, + { + "epoch": 1.14, + "learning_rate": 0.00018750297926432033, + "loss": 0.7427, + "step": 4820 + }, + { + "epoch": 1.14, + "learning_rate": 0.00018702629697306744, + "loss": 0.7489, + "step": 4840 + }, + { + "epoch": 1.15, + "learning_rate": 0.00018654961468181455, + "loss": 0.755, + "step": 4860 + }, + { + "epoch": 1.15, + "learning_rate": 0.00018607293239056167, + "loss": 0.7517, + "step": 4880 + }, + { + "epoch": 1.16, + "learning_rate": 0.0001855962500993088, + "loss": 0.7529, + "step": 4900 + }, + { + "epoch": 1.16, + "learning_rate": 0.00018511956780805592, + "loss": 0.7498, + "step": 4920 + }, + { + "epoch": 1.17, + "learning_rate": 0.00018464288551680303, + "loss": 0.756, + "step": 4940 + }, + { + "epoch": 1.17, + "learning_rate": 0.00018416620322555014, + "loss": 0.7492, + "step": 4960 + }, + { + "epoch": 1.18, + "learning_rate": 0.00018368952093429725, + "loss": 0.7491, + "step": 4980 + }, + { + "epoch": 1.18, + "learning_rate": 0.00018321283864304437, + "loss": 0.7585, + "step": 5000 + }, + { + "epoch": 1.18, + "eval_loss": 0.7538104057312012, + "eval_runtime": 19.6106, + "eval_samples_per_second": 101.986, + "eval_steps_per_second": 3.213, + "step": 5000 + }, + { + "epoch": 1.19, + "learning_rate": 0.00018273615635179153, + "loss": 0.7531, + "step": 5020 + }, + { + "epoch": 1.19, + "learning_rate": 0.00018225947406053865, + "loss": 0.7511, + "step": 5040 + }, + { + "epoch": 1.2, + "learning_rate": 0.00018178279176928576, + "loss": 0.7541, + "step": 5060 + }, + { + "epoch": 1.2, + "learning_rate": 0.00018130610947803287, + "loss": 0.7465, + "step": 5080 + }, + { + "epoch": 1.21, + "learning_rate": 0.00018082942718678, + "loss": 0.7403, + "step": 5100 + }, + { + "epoch": 1.21, + "learning_rate": 0.00018035274489552712, + "loss": 0.749, + "step": 5120 + }, + { + "epoch": 1.22, + "learning_rate": 0.00017987606260427423, + "loss": 0.7548, + "step": 5140 + }, + { + "epoch": 1.22, + "learning_rate": 0.00017939938031302134, + "loss": 0.7443, + "step": 5160 + }, + { + "epoch": 1.22, + "learning_rate": 0.00017892269802176846, + "loss": 0.7461, + "step": 5180 + }, + { + "epoch": 1.23, + "learning_rate": 0.00017844601573051557, + "loss": 0.7511, + "step": 5200 + }, + { + "epoch": 1.23, + "eval_loss": 0.7509217262268066, + "eval_runtime": 19.5437, + "eval_samples_per_second": 102.335, + "eval_steps_per_second": 3.224, + "step": 5200 + }, + { + "epoch": 1.23, + "learning_rate": 0.00017796933343926274, + "loss": 0.7562, + "step": 5220 + }, + { + "epoch": 1.24, + "learning_rate": 0.00017749265114800985, + "loss": 0.7489, + "step": 5240 + }, + { + "epoch": 1.24, + "learning_rate": 0.00017701596885675696, + "loss": 0.7499, + "step": 5260 + }, + { + "epoch": 1.25, + "learning_rate": 0.00017653928656550407, + "loss": 0.7519, + "step": 5280 + }, + { + "epoch": 1.25, + "learning_rate": 0.00017606260427425118, + "loss": 0.7536, + "step": 5300 + }, + { + "epoch": 1.26, + "learning_rate": 0.00017558592198299832, + "loss": 0.7536, + "step": 5320 + }, + { + "epoch": 1.26, + "learning_rate": 0.00017510923969174544, + "loss": 0.7492, + "step": 5340 + }, + { + "epoch": 1.27, + "learning_rate": 0.00017463255740049255, + "loss": 0.7454, + "step": 5360 + }, + { + "epoch": 1.27, + "learning_rate": 0.00017415587510923966, + "loss": 0.7528, + "step": 5380 + }, + { + "epoch": 1.28, + "learning_rate": 0.0001736791928179868, + "loss": 0.7409, + "step": 5400 + }, + { + "epoch": 1.28, + "eval_loss": 0.7497395873069763, + "eval_runtime": 19.5671, + "eval_samples_per_second": 102.212, + "eval_steps_per_second": 3.22, + "step": 5400 + }, + { + "epoch": 1.28, + "learning_rate": 0.00017320251052673394, + "loss": 0.7434, + "step": 5420 + }, + { + "epoch": 1.29, + "learning_rate": 0.00017272582823548105, + "loss": 0.7543, + "step": 5440 + }, + { + "epoch": 1.29, + "learning_rate": 0.00017224914594422816, + "loss": 0.7457, + "step": 5460 + }, + { + "epoch": 1.3, + "learning_rate": 0.00017177246365297527, + "loss": 0.7439, + "step": 5480 + }, + { + "epoch": 1.3, + "learning_rate": 0.0001712957813617224, + "loss": 0.7412, + "step": 5500 + }, + { + "epoch": 1.31, + "learning_rate": 0.0001708190990704695, + "loss": 0.7409, + "step": 5520 + }, + { + "epoch": 1.31, + "learning_rate": 0.00017034241677921664, + "loss": 0.7473, + "step": 5540 + }, + { + "epoch": 1.31, + "learning_rate": 0.00016986573448796375, + "loss": 0.7486, + "step": 5560 + }, + { + "epoch": 1.32, + "learning_rate": 0.00016938905219671086, + "loss": 0.7439, + "step": 5580 + }, + { + "epoch": 1.32, + "learning_rate": 0.000168912369905458, + "loss": 0.7524, + "step": 5600 + }, + { + "epoch": 1.32, + "eval_loss": 0.7480019330978394, + "eval_runtime": 19.5018, + "eval_samples_per_second": 102.555, + "eval_steps_per_second": 3.23, + "step": 5600 + }, + { + "epoch": 1.33, + "learning_rate": 0.00016843568761420514, + "loss": 0.7464, + "step": 5620 + }, + { + "epoch": 1.33, + "learning_rate": 0.00016795900532295225, + "loss": 0.7511, + "step": 5640 + }, + { + "epoch": 1.34, + "learning_rate": 0.00016748232303169936, + "loss": 0.7423, + "step": 5660 + }, + { + "epoch": 1.34, + "learning_rate": 0.00016700564074044648, + "loss": 0.7422, + "step": 5680 + }, + { + "epoch": 1.35, + "learning_rate": 0.0001665289584491936, + "loss": 0.742, + "step": 5700 + }, + { + "epoch": 1.35, + "learning_rate": 0.0001660522761579407, + "loss": 0.7421, + "step": 5720 + }, + { + "epoch": 1.36, + "learning_rate": 0.00016557559386668784, + "loss": 0.749, + "step": 5740 + }, + { + "epoch": 1.36, + "learning_rate": 0.00016509891157543495, + "loss": 0.7432, + "step": 5760 + }, + { + "epoch": 1.37, + "learning_rate": 0.0001646222292841821, + "loss": 0.7426, + "step": 5780 + }, + { + "epoch": 1.37, + "learning_rate": 0.0001641455469929292, + "loss": 0.7543, + "step": 5800 + }, + { + "epoch": 1.37, + "eval_loss": 0.7470090389251709, + "eval_runtime": 19.5563, + "eval_samples_per_second": 102.269, + "eval_steps_per_second": 3.221, + "step": 5800 + }, + { + "epoch": 1.38, + "learning_rate": 0.00016366886470167632, + "loss": 0.7451, + "step": 5820 + }, + { + "epoch": 1.38, + "learning_rate": 0.00016319218241042346, + "loss": 0.7481, + "step": 5840 + }, + { + "epoch": 1.39, + "learning_rate": 0.00016271550011917057, + "loss": 0.7381, + "step": 5860 + }, + { + "epoch": 1.39, + "learning_rate": 0.00016223881782791768, + "loss": 0.7461, + "step": 5880 + }, + { + "epoch": 1.4, + "learning_rate": 0.0001617621355366648, + "loss": 0.7467, + "step": 5900 + }, + { + "epoch": 1.4, + "learning_rate": 0.0001612854532454119, + "loss": 0.745, + "step": 5920 + }, + { + "epoch": 1.4, + "learning_rate": 0.00016080877095415902, + "loss": 0.745, + "step": 5940 + }, + { + "epoch": 1.41, + "learning_rate": 0.00016033208866290615, + "loss": 0.7386, + "step": 5960 + }, + { + "epoch": 1.41, + "learning_rate": 0.0001598554063716533, + "loss": 0.7363, + "step": 5980 + }, + { + "epoch": 1.42, + "learning_rate": 0.0001593787240804004, + "loss": 0.7412, + "step": 6000 + }, + { + "epoch": 1.42, + "eval_loss": 0.7454522848129272, + "eval_runtime": 19.555, + "eval_samples_per_second": 102.276, + "eval_steps_per_second": 3.222, + "step": 6000 + }, + { + "epoch": 1.42, + "learning_rate": 0.00015890204178914752, + "loss": 0.7501, + "step": 6020 + }, + { + "epoch": 1.43, + "learning_rate": 0.00015842535949789463, + "loss": 0.7528, + "step": 6040 + }, + { + "epoch": 1.43, + "learning_rate": 0.00015794867720664177, + "loss": 0.7373, + "step": 6060 + }, + { + "epoch": 1.44, + "learning_rate": 0.00015747199491538888, + "loss": 0.7451, + "step": 6080 + }, + { + "epoch": 1.44, + "learning_rate": 0.000156995312624136, + "loss": 0.7384, + "step": 6100 + }, + { + "epoch": 1.45, + "learning_rate": 0.0001565186303328831, + "loss": 0.7471, + "step": 6120 + }, + { + "epoch": 1.45, + "learning_rate": 0.00015604194804163022, + "loss": 0.7454, + "step": 6140 + }, + { + "epoch": 1.46, + "learning_rate": 0.00015556526575037733, + "loss": 0.7415, + "step": 6160 + }, + { + "epoch": 1.46, + "learning_rate": 0.0001550885834591245, + "loss": 0.7514, + "step": 6180 + }, + { + "epoch": 1.47, + "learning_rate": 0.0001546119011678716, + "loss": 0.7343, + "step": 6200 + }, + { + "epoch": 1.47, + "eval_loss": 0.7457332611083984, + "eval_runtime": 19.5673, + "eval_samples_per_second": 102.212, + "eval_steps_per_second": 3.22, + "step": 6200 + }, + { + "epoch": 1.47, + "learning_rate": 0.00015413521887661872, + "loss": 0.7452, + "step": 6220 + }, + { + "epoch": 1.48, + "learning_rate": 0.00015365853658536583, + "loss": 0.7456, + "step": 6240 + }, + { + "epoch": 1.48, + "learning_rate": 0.00015318185429411297, + "loss": 0.7326, + "step": 6260 + }, + { + "epoch": 1.48, + "learning_rate": 0.00015270517200286008, + "loss": 0.7431, + "step": 6280 + }, + { + "epoch": 1.49, + "learning_rate": 0.0001522284897116072, + "loss": 0.7419, + "step": 6300 + }, + { + "epoch": 1.49, + "learning_rate": 0.0001517518074203543, + "loss": 0.7375, + "step": 6320 + }, + { + "epoch": 1.5, + "learning_rate": 0.00015127512512910142, + "loss": 0.7419, + "step": 6340 + }, + { + "epoch": 1.5, + "learning_rate": 0.0001507984428378486, + "loss": 0.7431, + "step": 6360 + }, + { + "epoch": 1.51, + "learning_rate": 0.0001503217605465957, + "loss": 0.7412, + "step": 6380 + }, + { + "epoch": 1.51, + "learning_rate": 0.00014984507825534278, + "loss": 0.7447, + "step": 6400 + }, + { + "epoch": 1.51, + "eval_loss": 0.7441338896751404, + "eval_runtime": 19.4509, + "eval_samples_per_second": 102.823, + "eval_steps_per_second": 3.239, + "step": 6400 + }, + { + "epoch": 1.52, + "learning_rate": 0.00014936839596408992, + "loss": 0.7436, + "step": 6420 + }, + { + "epoch": 1.52, + "learning_rate": 0.00014889171367283704, + "loss": 0.7402, + "step": 6440 + }, + { + "epoch": 1.53, + "learning_rate": 0.00014841503138158415, + "loss": 0.7454, + "step": 6460 + }, + { + "epoch": 1.53, + "learning_rate": 0.0001479383490903313, + "loss": 0.738, + "step": 6480 + }, + { + "epoch": 1.54, + "learning_rate": 0.0001474616667990784, + "loss": 0.7396, + "step": 6500 + }, + { + "epoch": 1.54, + "learning_rate": 0.00014698498450782554, + "loss": 0.7333, + "step": 6520 + }, + { + "epoch": 1.55, + "learning_rate": 0.00014650830221657265, + "loss": 0.7482, + "step": 6540 + }, + { + "epoch": 1.55, + "learning_rate": 0.00014603161992531976, + "loss": 0.7376, + "step": 6560 + }, + { + "epoch": 1.56, + "learning_rate": 0.00014555493763406687, + "loss": 0.7369, + "step": 6580 + }, + { + "epoch": 1.56, + "learning_rate": 0.00014507825534281401, + "loss": 0.7347, + "step": 6600 + }, + { + "epoch": 1.56, + "eval_loss": 0.7425362467765808, + "eval_runtime": 19.5248, + "eval_samples_per_second": 102.434, + "eval_steps_per_second": 3.227, + "step": 6600 + }, + { + "epoch": 1.57, + "learning_rate": 0.00014460157305156113, + "loss": 0.7446, + "step": 6620 + }, + { + "epoch": 1.57, + "learning_rate": 0.00014412489076030824, + "loss": 0.7343, + "step": 6640 + }, + { + "epoch": 1.57, + "learning_rate": 0.00014364820846905535, + "loss": 0.7468, + "step": 6660 + }, + { + "epoch": 1.58, + "learning_rate": 0.0001431715261778025, + "loss": 0.749, + "step": 6680 + }, + { + "epoch": 1.58, + "learning_rate": 0.0001426948438865496, + "loss": 0.7401, + "step": 6700 + }, + { + "epoch": 1.59, + "learning_rate": 0.0001422181615952967, + "loss": 0.7364, + "step": 6720 + }, + { + "epoch": 1.59, + "learning_rate": 0.00014174147930404385, + "loss": 0.7442, + "step": 6740 + }, + { + "epoch": 1.6, + "learning_rate": 0.00014126479701279096, + "loss": 0.7385, + "step": 6760 + }, + { + "epoch": 1.6, + "learning_rate": 0.00014078811472153808, + "loss": 0.7412, + "step": 6780 + }, + { + "epoch": 1.61, + "learning_rate": 0.00014031143243028522, + "loss": 0.7377, + "step": 6800 + }, + { + "epoch": 1.61, + "eval_loss": 0.7418386936187744, + "eval_runtime": 19.5679, + "eval_samples_per_second": 102.208, + "eval_steps_per_second": 3.22, + "step": 6800 + }, + { + "epoch": 1.61, + "learning_rate": 0.00013983475013903233, + "loss": 0.7432, + "step": 6820 + }, + { + "epoch": 1.62, + "learning_rate": 0.00013935806784777944, + "loss": 0.7379, + "step": 6840 + }, + { + "epoch": 1.62, + "learning_rate": 0.00013888138555652655, + "loss": 0.7346, + "step": 6860 + }, + { + "epoch": 1.63, + "learning_rate": 0.00013840470326527366, + "loss": 0.7373, + "step": 6880 + }, + { + "epoch": 1.63, + "learning_rate": 0.0001379280209740208, + "loss": 0.7403, + "step": 6900 + }, + { + "epoch": 1.64, + "learning_rate": 0.00013745133868276792, + "loss": 0.7477, + "step": 6920 + }, + { + "epoch": 1.64, + "learning_rate": 0.00013697465639151506, + "loss": 0.7343, + "step": 6940 + }, + { + "epoch": 1.65, + "learning_rate": 0.00013649797410026217, + "loss": 0.7419, + "step": 6960 + }, + { + "epoch": 1.65, + "learning_rate": 0.00013602129180900928, + "loss": 0.7327, + "step": 6980 + }, + { + "epoch": 1.66, + "learning_rate": 0.00013554460951775642, + "loss": 0.7398, + "step": 7000 + }, + { + "epoch": 1.66, + "eval_loss": 0.7402775883674622, + "eval_runtime": 19.5554, + "eval_samples_per_second": 102.274, + "eval_steps_per_second": 3.222, + "step": 7000 + }, + { + "epoch": 1.66, + "learning_rate": 0.00013506792722650353, + "loss": 0.7311, + "step": 7020 + }, + { + "epoch": 1.66, + "learning_rate": 0.00013459124493525064, + "loss": 0.7319, + "step": 7040 + }, + { + "epoch": 1.67, + "learning_rate": 0.00013411456264399775, + "loss": 0.7315, + "step": 7060 + }, + { + "epoch": 1.67, + "learning_rate": 0.0001336378803527449, + "loss": 0.7329, + "step": 7080 + }, + { + "epoch": 1.68, + "learning_rate": 0.000133161198061492, + "loss": 0.7471, + "step": 7100 + }, + { + "epoch": 1.68, + "learning_rate": 0.00013268451577023912, + "loss": 0.7446, + "step": 7120 + }, + { + "epoch": 1.69, + "learning_rate": 0.00013220783347898623, + "loss": 0.7359, + "step": 7140 + }, + { + "epoch": 1.69, + "learning_rate": 0.00013173115118773337, + "loss": 0.7348, + "step": 7160 + }, + { + "epoch": 1.7, + "learning_rate": 0.00013125446889648048, + "loss": 0.7331, + "step": 7180 + }, + { + "epoch": 1.7, + "learning_rate": 0.00013077778660522762, + "loss": 0.7385, + "step": 7200 + }, + { + "epoch": 1.7, + "eval_loss": 0.7401012182235718, + "eval_runtime": 19.7831, + "eval_samples_per_second": 101.096, + "eval_steps_per_second": 3.185, + "step": 7200 + }, + { + "epoch": 1.71, + "learning_rate": 0.00013030110431397473, + "loss": 0.744, + "step": 7220 + }, + { + "epoch": 1.71, + "learning_rate": 0.00012982442202272185, + "loss": 0.7327, + "step": 7240 + }, + { + "epoch": 1.72, + "learning_rate": 0.00012934773973146896, + "loss": 0.7384, + "step": 7260 + }, + { + "epoch": 1.72, + "learning_rate": 0.0001288710574402161, + "loss": 0.7399, + "step": 7280 + }, + { + "epoch": 1.73, + "learning_rate": 0.0001283943751489632, + "loss": 0.7376, + "step": 7300 + }, + { + "epoch": 1.73, + "learning_rate": 0.00012791769285771032, + "loss": 0.7416, + "step": 7320 + }, + { + "epoch": 1.74, + "learning_rate": 0.00012744101056645743, + "loss": 0.7299, + "step": 7340 + }, + { + "epoch": 1.74, + "learning_rate": 0.00012696432827520455, + "loss": 0.7389, + "step": 7360 + }, + { + "epoch": 1.75, + "learning_rate": 0.00012648764598395168, + "loss": 0.7295, + "step": 7380 + }, + { + "epoch": 1.75, + "learning_rate": 0.0001260109636926988, + "loss": 0.7389, + "step": 7400 + }, + { + "epoch": 1.75, + "eval_loss": 0.7385362386703491, + "eval_runtime": 19.6728, + "eval_samples_per_second": 101.663, + "eval_steps_per_second": 3.202, + "step": 7400 + }, + { + "epoch": 1.75, + "learning_rate": 0.00012553428140144594, + "loss": 0.7346, + "step": 7420 + }, + { + "epoch": 1.76, + "learning_rate": 0.00012505759911019305, + "loss": 0.7357, + "step": 7440 + }, + { + "epoch": 1.76, + "learning_rate": 0.00012458091681894016, + "loss": 0.7295, + "step": 7460 + }, + { + "epoch": 1.77, + "learning_rate": 0.0001241042345276873, + "loss": 0.7418, + "step": 7480 + }, + { + "epoch": 1.77, + "learning_rate": 0.0001236275522364344, + "loss": 0.7248, + "step": 7500 + }, + { + "epoch": 1.78, + "learning_rate": 0.00012315086994518152, + "loss": 0.7326, + "step": 7520 + }, + { + "epoch": 1.78, + "learning_rate": 0.00012267418765392864, + "loss": 0.7422, + "step": 7540 + }, + { + "epoch": 1.79, + "learning_rate": 0.00012219750536267577, + "loss": 0.7376, + "step": 7560 + }, + { + "epoch": 1.79, + "learning_rate": 0.00012172082307142289, + "loss": 0.7358, + "step": 7580 + }, + { + "epoch": 1.8, + "learning_rate": 0.00012124414078017001, + "loss": 0.7337, + "step": 7600 + }, + { + "epoch": 1.8, + "eval_loss": 0.737734854221344, + "eval_runtime": 19.8317, + "eval_samples_per_second": 100.849, + "eval_steps_per_second": 3.177, + "step": 7600 + }, + { + "epoch": 1.8, + "learning_rate": 0.00012076745848891712, + "loss": 0.7318, + "step": 7620 + }, + { + "epoch": 1.81, + "learning_rate": 0.00012029077619766424, + "loss": 0.7356, + "step": 7640 + }, + { + "epoch": 1.81, + "learning_rate": 0.00011981409390641138, + "loss": 0.7355, + "step": 7660 + }, + { + "epoch": 1.82, + "learning_rate": 0.00011933741161515849, + "loss": 0.74, + "step": 7680 + }, + { + "epoch": 1.82, + "learning_rate": 0.0001188607293239056, + "loss": 0.7342, + "step": 7700 + }, + { + "epoch": 1.83, + "learning_rate": 0.00011838404703265273, + "loss": 0.7368, + "step": 7720 + }, + { + "epoch": 1.83, + "learning_rate": 0.00011790736474139984, + "loss": 0.7337, + "step": 7740 + }, + { + "epoch": 1.83, + "learning_rate": 0.00011743068245014698, + "loss": 0.7317, + "step": 7760 + }, + { + "epoch": 1.84, + "learning_rate": 0.00011695400015889409, + "loss": 0.738, + "step": 7780 + }, + { + "epoch": 1.84, + "learning_rate": 0.0001164773178676412, + "loss": 0.7375, + "step": 7800 + }, + { + "epoch": 1.84, + "eval_loss": 0.7366506457328796, + "eval_runtime": 19.9586, + "eval_samples_per_second": 100.208, + "eval_steps_per_second": 3.157, + "step": 7800 + }, + { + "epoch": 1.85, + "learning_rate": 0.00011600063557638833, + "loss": 0.7349, + "step": 7820 + }, + { + "epoch": 1.85, + "learning_rate": 0.00011552395328513544, + "loss": 0.733, + "step": 7840 + }, + { + "epoch": 1.86, + "learning_rate": 0.00011504727099388258, + "loss": 0.7277, + "step": 7860 + }, + { + "epoch": 1.86, + "learning_rate": 0.00011457058870262969, + "loss": 0.7235, + "step": 7880 + }, + { + "epoch": 1.87, + "learning_rate": 0.0001140939064113768, + "loss": 0.7405, + "step": 7900 + }, + { + "epoch": 1.87, + "learning_rate": 0.00011361722412012393, + "loss": 0.7378, + "step": 7920 + }, + { + "epoch": 1.88, + "learning_rate": 0.00011314054182887104, + "loss": 0.7292, + "step": 7940 + }, + { + "epoch": 1.88, + "learning_rate": 0.00011266385953761818, + "loss": 0.7427, + "step": 7960 + }, + { + "epoch": 1.89, + "learning_rate": 0.00011218717724636529, + "loss": 0.7313, + "step": 7980 + }, + { + "epoch": 1.89, + "learning_rate": 0.0001117104949551124, + "loss": 0.7252, + "step": 8000 + }, + { + "epoch": 1.89, + "eval_loss": 0.736083984375, + "eval_runtime": 19.7958, + "eval_samples_per_second": 101.031, + "eval_steps_per_second": 3.182, + "step": 8000 + }, + { + "epoch": 1.9, + "learning_rate": 0.00011123381266385953, + "loss": 0.7268, + "step": 8020 + }, + { + "epoch": 1.9, + "learning_rate": 0.00011075713037260666, + "loss": 0.729, + "step": 8040 + }, + { + "epoch": 1.91, + "learning_rate": 0.00011028044808135377, + "loss": 0.7358, + "step": 8060 + }, + { + "epoch": 1.91, + "learning_rate": 0.00010980376579010089, + "loss": 0.7408, + "step": 8080 + }, + { + "epoch": 1.92, + "learning_rate": 0.000109327083498848, + "loss": 0.73, + "step": 8100 + }, + { + "epoch": 1.92, + "learning_rate": 0.00010887423532215777, + "loss": 0.7298, + "step": 8120 + }, + { + "epoch": 1.92, + "learning_rate": 0.0001083975530309049, + "loss": 0.7324, + "step": 8140 + }, + { + "epoch": 1.93, + "learning_rate": 0.00010792087073965201, + "loss": 0.7296, + "step": 8160 + }, + { + "epoch": 1.93, + "learning_rate": 0.00010744418844839912, + "loss": 0.7346, + "step": 8180 + }, + { + "epoch": 1.94, + "learning_rate": 0.00010696750615714626, + "loss": 0.7281, + "step": 8200 + }, + { + "epoch": 1.94, + "eval_loss": 0.7352190613746643, + "eval_runtime": 19.6635, + "eval_samples_per_second": 101.711, + "eval_steps_per_second": 3.204, + "step": 8200 + }, + { + "epoch": 1.94, + "learning_rate": 0.00010649082386589337, + "loss": 0.7377, + "step": 8220 + }, + { + "epoch": 1.95, + "learning_rate": 0.0001060141415746405, + "loss": 0.7281, + "step": 8240 + }, + { + "epoch": 1.95, + "learning_rate": 0.00010553745928338761, + "loss": 0.7251, + "step": 8260 + }, + { + "epoch": 1.96, + "learning_rate": 0.00010506077699213472, + "loss": 0.7331, + "step": 8280 + }, + { + "epoch": 1.96, + "learning_rate": 0.00010458409470088186, + "loss": 0.7432, + "step": 8300 + }, + { + "epoch": 1.97, + "learning_rate": 0.00010410741240962897, + "loss": 0.7366, + "step": 8320 + }, + { + "epoch": 1.97, + "learning_rate": 0.0001036307301183761, + "loss": 0.7334, + "step": 8340 + }, + { + "epoch": 1.98, + "learning_rate": 0.00010315404782712321, + "loss": 0.7351, + "step": 8360 + }, + { + "epoch": 1.98, + "learning_rate": 0.00010267736553587032, + "loss": 0.7355, + "step": 8380 + }, + { + "epoch": 1.99, + "learning_rate": 0.00010220068324461746, + "loss": 0.7228, + "step": 8400 + }, + { + "epoch": 1.99, + "eval_loss": 0.7341500520706177, + "eval_runtime": 19.6196, + "eval_samples_per_second": 101.939, + "eval_steps_per_second": 3.211, + "step": 8400 + }, + { + "epoch": 1.99, + "learning_rate": 0.00010172400095336457, + "loss": 0.7451, + "step": 8420 + }, + { + "epoch": 2.0, + "learning_rate": 0.00010124731866211169, + "loss": 0.7356, + "step": 8440 + }, + { + "epoch": 2.0, + "learning_rate": 0.00010077063637085881, + "loss": 0.7255, + "step": 8460 + }, + { + "epoch": 2.01, + "learning_rate": 0.00010029395407960592, + "loss": 0.7267, + "step": 8480 + }, + { + "epoch": 2.01, + "learning_rate": 9.981727178835306e-05, + "loss": 0.7291, + "step": 8500 + }, + { + "epoch": 2.01, + "learning_rate": 9.934058949710018e-05, + "loss": 0.7294, + "step": 8520 + }, + { + "epoch": 2.02, + "learning_rate": 9.886390720584729e-05, + "loss": 0.7377, + "step": 8540 + }, + { + "epoch": 2.02, + "learning_rate": 9.838722491459441e-05, + "loss": 0.7324, + "step": 8560 + }, + { + "epoch": 2.03, + "learning_rate": 9.791054262334154e-05, + "loss": 0.7286, + "step": 8580 + }, + { + "epoch": 2.03, + "learning_rate": 9.743386033208867e-05, + "loss": 0.7286, + "step": 8600 + }, + { + "epoch": 2.03, + "eval_loss": 0.734474241733551, + "eval_runtime": 19.5642, + "eval_samples_per_second": 102.228, + "eval_steps_per_second": 3.22, + "step": 8600 + }, + { + "epoch": 2.04, + "learning_rate": 9.695717804083578e-05, + "loss": 0.7304, + "step": 8620 + }, + { + "epoch": 2.04, + "learning_rate": 9.648049574958289e-05, + "loss": 0.7348, + "step": 8640 + }, + { + "epoch": 2.05, + "learning_rate": 9.600381345833002e-05, + "loss": 0.7261, + "step": 8660 + }, + { + "epoch": 2.05, + "learning_rate": 9.552713116707714e-05, + "loss": 0.7313, + "step": 8680 + }, + { + "epoch": 2.06, + "learning_rate": 9.505044887582425e-05, + "loss": 0.7379, + "step": 8700 + }, + { + "epoch": 2.06, + "learning_rate": 9.457376658457138e-05, + "loss": 0.7203, + "step": 8720 + }, + { + "epoch": 2.07, + "learning_rate": 9.409708429331849e-05, + "loss": 0.7306, + "step": 8740 + }, + { + "epoch": 2.07, + "learning_rate": 9.36204020020656e-05, + "loss": 0.7332, + "step": 8760 + }, + { + "epoch": 2.08, + "learning_rate": 9.314371971081274e-05, + "loss": 0.7228, + "step": 8780 + }, + { + "epoch": 2.08, + "learning_rate": 9.266703741955985e-05, + "loss": 0.731, + "step": 8800 + }, + { + "epoch": 2.08, + "eval_loss": 0.7332338690757751, + "eval_runtime": 19.7114, + "eval_samples_per_second": 101.464, + "eval_steps_per_second": 3.196, + "step": 8800 + }, + { + "epoch": 2.09, + "learning_rate": 9.219035512830698e-05, + "loss": 0.7267, + "step": 8820 + }, + { + "epoch": 2.09, + "learning_rate": 9.171367283705409e-05, + "loss": 0.7285, + "step": 8840 + }, + { + "epoch": 2.09, + "learning_rate": 9.12369905458012e-05, + "loss": 0.7214, + "step": 8860 + }, + { + "epoch": 2.1, + "learning_rate": 9.076030825454834e-05, + "loss": 0.7204, + "step": 8880 + }, + { + "epoch": 2.1, + "learning_rate": 9.028362596329546e-05, + "loss": 0.7253, + "step": 8900 + }, + { + "epoch": 2.11, + "learning_rate": 8.980694367204258e-05, + "loss": 0.7253, + "step": 8920 + }, + { + "epoch": 2.11, + "learning_rate": 8.933026138078969e-05, + "loss": 0.7238, + "step": 8940 + }, + { + "epoch": 2.12, + "learning_rate": 8.88535790895368e-05, + "loss": 0.7286, + "step": 8960 + }, + { + "epoch": 2.12, + "learning_rate": 8.837689679828394e-05, + "loss": 0.7385, + "step": 8980 + }, + { + "epoch": 2.13, + "learning_rate": 8.790021450703106e-05, + "loss": 0.7237, + "step": 9000 + }, + { + "epoch": 2.13, + "eval_loss": 0.7329864501953125, + "eval_runtime": 19.7024, + "eval_samples_per_second": 101.51, + "eval_steps_per_second": 3.198, + "step": 9000 + }, + { + "epoch": 2.13, + "learning_rate": 8.742353221577817e-05, + "loss": 0.7311, + "step": 9020 + }, + { + "epoch": 2.14, + "learning_rate": 8.69468499245253e-05, + "loss": 0.7374, + "step": 9040 + }, + { + "epoch": 2.14, + "learning_rate": 8.64701676332724e-05, + "loss": 0.7194, + "step": 9060 + }, + { + "epoch": 2.15, + "learning_rate": 8.599348534201955e-05, + "loss": 0.7237, + "step": 9080 + }, + { + "epoch": 2.15, + "learning_rate": 8.551680305076666e-05, + "loss": 0.7287, + "step": 9100 + }, + { + "epoch": 2.16, + "learning_rate": 8.504012075951377e-05, + "loss": 0.7385, + "step": 9120 + }, + { + "epoch": 2.16, + "learning_rate": 8.45634384682609e-05, + "loss": 0.7319, + "step": 9140 + }, + { + "epoch": 2.17, + "learning_rate": 8.408675617700802e-05, + "loss": 0.7278, + "step": 9160 + }, + { + "epoch": 2.17, + "learning_rate": 8.361007388575515e-05, + "loss": 0.7293, + "step": 9180 + }, + { + "epoch": 2.18, + "learning_rate": 8.313339159450226e-05, + "loss": 0.7232, + "step": 9200 + }, + { + "epoch": 2.18, + "eval_loss": 0.7326176762580872, + "eval_runtime": 20.1581, + "eval_samples_per_second": 99.215, + "eval_steps_per_second": 3.125, + "step": 9200 + }, + { + "epoch": 2.18, + "learning_rate": 8.265670930324937e-05, + "loss": 0.7281, + "step": 9220 + }, + { + "epoch": 2.18, + "learning_rate": 8.21800270119965e-05, + "loss": 0.728, + "step": 9240 + }, + { + "epoch": 2.19, + "learning_rate": 8.170334472074362e-05, + "loss": 0.728, + "step": 9260 + }, + { + "epoch": 2.19, + "learning_rate": 8.122666242949073e-05, + "loss": 0.7221, + "step": 9280 + }, + { + "epoch": 2.2, + "learning_rate": 8.074998013823786e-05, + "loss": 0.7242, + "step": 9300 + }, + { + "epoch": 2.2, + "learning_rate": 8.027329784698497e-05, + "loss": 0.7306, + "step": 9320 + }, + { + "epoch": 2.21, + "learning_rate": 7.979661555573208e-05, + "loss": 0.7218, + "step": 9340 + }, + { + "epoch": 2.21, + "learning_rate": 7.931993326447922e-05, + "loss": 0.7289, + "step": 9360 + }, + { + "epoch": 2.22, + "learning_rate": 7.884325097322634e-05, + "loss": 0.7177, + "step": 9380 + }, + { + "epoch": 2.22, + "learning_rate": 7.836656868197346e-05, + "loss": 0.7265, + "step": 9400 + }, + { + "epoch": 2.22, + "eval_loss": 0.7311453819274902, + "eval_runtime": 19.9076, + "eval_samples_per_second": 100.464, + "eval_steps_per_second": 3.165, + "step": 9400 + }, + { + "epoch": 2.23, + "learning_rate": 7.788988639072057e-05, + "loss": 0.7269, + "step": 9420 + }, + { + "epoch": 2.23, + "learning_rate": 7.741320409946769e-05, + "loss": 0.7275, + "step": 9440 + }, + { + "epoch": 2.24, + "learning_rate": 7.693652180821483e-05, + "loss": 0.7317, + "step": 9460 + }, + { + "epoch": 2.24, + "learning_rate": 7.645983951696194e-05, + "loss": 0.7344, + "step": 9480 + }, + { + "epoch": 2.25, + "learning_rate": 7.598315722570906e-05, + "loss": 0.7263, + "step": 9500 + }, + { + "epoch": 2.25, + "learning_rate": 7.550647493445617e-05, + "loss": 0.7299, + "step": 9520 + }, + { + "epoch": 2.26, + "learning_rate": 7.502979264320329e-05, + "loss": 0.724, + "step": 9540 + }, + { + "epoch": 2.26, + "learning_rate": 7.455311035195041e-05, + "loss": 0.7266, + "step": 9560 + }, + { + "epoch": 2.27, + "learning_rate": 7.407642806069754e-05, + "loss": 0.7299, + "step": 9580 + }, + { + "epoch": 2.27, + "learning_rate": 7.359974576944465e-05, + "loss": 0.7236, + "step": 9600 + }, + { + "epoch": 2.27, + "eval_loss": 0.7311366200447083, + "eval_runtime": 20.0053, + "eval_samples_per_second": 99.973, + "eval_steps_per_second": 3.149, + "step": 9600 + }, + { + "epoch": 2.27, + "learning_rate": 7.314689759275442e-05, + "loss": 0.7252, + "step": 9620 + }, + { + "epoch": 2.28, + "learning_rate": 7.267021530150154e-05, + "loss": 0.7252, + "step": 9640 + }, + { + "epoch": 2.28, + "learning_rate": 7.219353301024865e-05, + "loss": 0.7188, + "step": 9660 + }, + { + "epoch": 2.29, + "learning_rate": 7.171685071899578e-05, + "loss": 0.7243, + "step": 9680 + }, + { + "epoch": 2.29, + "learning_rate": 7.12401684277429e-05, + "loss": 0.7298, + "step": 9700 + }, + { + "epoch": 2.3, + "learning_rate": 7.076348613649002e-05, + "loss": 0.7325, + "step": 9720 + }, + { + "epoch": 2.3, + "learning_rate": 7.028680384523714e-05, + "loss": 0.7286, + "step": 9740 + }, + { + "epoch": 2.31, + "learning_rate": 6.981012155398426e-05, + "loss": 0.7201, + "step": 9760 + }, + { + "epoch": 2.31, + "learning_rate": 6.933343926273138e-05, + "loss": 0.7184, + "step": 9780 + }, + { + "epoch": 2.32, + "learning_rate": 6.885675697147851e-05, + "loss": 0.7291, + "step": 9800 + }, + { + "epoch": 2.32, + "eval_loss": 0.7308618426322937, + "eval_runtime": 19.7965, + "eval_samples_per_second": 101.028, + "eval_steps_per_second": 3.182, + "step": 9800 + }, + { + "epoch": 2.32, + "learning_rate": 6.838007468022563e-05, + "loss": 0.7318, + "step": 9820 + }, + { + "epoch": 2.33, + "learning_rate": 6.790339238897274e-05, + "loss": 0.7227, + "step": 9840 + }, + { + "epoch": 2.33, + "learning_rate": 6.742671009771986e-05, + "loss": 0.7377, + "step": 9860 + }, + { + "epoch": 2.34, + "learning_rate": 6.695002780646698e-05, + "loss": 0.7367, + "step": 9880 + }, + { + "epoch": 2.34, + "learning_rate": 6.647334551521411e-05, + "loss": 0.7218, + "step": 9900 + }, + { + "epoch": 2.35, + "learning_rate": 6.599666322396122e-05, + "loss": 0.7282, + "step": 9920 + }, + { + "epoch": 2.35, + "learning_rate": 6.551998093270835e-05, + "loss": 0.7231, + "step": 9940 + }, + { + "epoch": 2.36, + "learning_rate": 6.504329864145546e-05, + "loss": 0.7257, + "step": 9960 + }, + { + "epoch": 2.36, + "learning_rate": 6.456661635020258e-05, + "loss": 0.7275, + "step": 9980 + }, + { + "epoch": 2.36, + "learning_rate": 6.40899340589497e-05, + "loss": 0.725, + "step": 10000 + }, + { + "epoch": 2.36, + "eval_loss": 0.7301817536354065, + "eval_runtime": 19.7914, + "eval_samples_per_second": 101.054, + "eval_steps_per_second": 3.183, + "step": 10000 + }, + { + "epoch": 2.37, + "learning_rate": 6.361325176769682e-05, + "loss": 0.72, + "step": 10020 + }, + { + "epoch": 2.37, + "learning_rate": 6.313656947644395e-05, + "loss": 0.7267, + "step": 10040 + }, + { + "epoch": 2.38, + "learning_rate": 6.265988718519107e-05, + "loss": 0.7276, + "step": 10060 + }, + { + "epoch": 2.38, + "learning_rate": 6.218320489393818e-05, + "loss": 0.7262, + "step": 10080 + }, + { + "epoch": 2.39, + "learning_rate": 6.17065226026853e-05, + "loss": 0.7149, + "step": 10100 + }, + { + "epoch": 2.39, + "learning_rate": 6.122984031143242e-05, + "loss": 0.7305, + "step": 10120 + }, + { + "epoch": 2.4, + "learning_rate": 6.075315802017954e-05, + "loss": 0.7314, + "step": 10140 + }, + { + "epoch": 2.4, + "learning_rate": 6.027647572892667e-05, + "loss": 0.7154, + "step": 10160 + }, + { + "epoch": 2.41, + "learning_rate": 5.9799793437673786e-05, + "loss": 0.7263, + "step": 10180 + }, + { + "epoch": 2.41, + "learning_rate": 5.93231111464209e-05, + "loss": 0.7203, + "step": 10200 + }, + { + "epoch": 2.41, + "eval_loss": 0.7294782996177673, + "eval_runtime": 19.7824, + "eval_samples_per_second": 101.1, + "eval_steps_per_second": 3.185, + "step": 10200 + }, + { + "epoch": 2.42, + "learning_rate": 5.8846428855168024e-05, + "loss": 0.7208, + "step": 10220 + }, + { + "epoch": 2.42, + "learning_rate": 5.836974656391514e-05, + "loss": 0.7266, + "step": 10240 + }, + { + "epoch": 2.43, + "learning_rate": 5.789306427266227e-05, + "loss": 0.7285, + "step": 10260 + }, + { + "epoch": 2.43, + "learning_rate": 5.741638198140939e-05, + "loss": 0.7215, + "step": 10280 + }, + { + "epoch": 2.44, + "learning_rate": 5.6939699690156506e-05, + "loss": 0.7203, + "step": 10300 + }, + { + "epoch": 2.44, + "learning_rate": 5.6463017398903625e-05, + "loss": 0.7314, + "step": 10320 + }, + { + "epoch": 2.44, + "learning_rate": 5.5986335107650744e-05, + "loss": 0.7394, + "step": 10340 + }, + { + "epoch": 2.45, + "learning_rate": 5.550965281639787e-05, + "loss": 0.7138, + "step": 10360 + }, + { + "epoch": 2.45, + "learning_rate": 5.503297052514498e-05, + "loss": 0.721, + "step": 10380 + }, + { + "epoch": 2.46, + "learning_rate": 5.455628823389211e-05, + "loss": 0.7199, + "step": 10400 + }, + { + "epoch": 2.46, + "eval_loss": 0.728507936000824, + "eval_runtime": 19.7761, + "eval_samples_per_second": 101.132, + "eval_steps_per_second": 3.186, + "step": 10400 + }, + { + "epoch": 2.46, + "learning_rate": 5.4079605942639226e-05, + "loss": 0.7228, + "step": 10420 + }, + { + "epoch": 2.47, + "learning_rate": 5.3602923651386345e-05, + "loss": 0.7193, + "step": 10440 + }, + { + "epoch": 2.47, + "learning_rate": 5.3126241360133464e-05, + "loss": 0.7269, + "step": 10460 + }, + { + "epoch": 2.48, + "learning_rate": 5.264955906888058e-05, + "loss": 0.729, + "step": 10480 + }, + { + "epoch": 2.48, + "learning_rate": 5.217287677762771e-05, + "loss": 0.7193, + "step": 10500 + }, + { + "epoch": 2.49, + "learning_rate": 5.169619448637483e-05, + "loss": 0.7158, + "step": 10520 + }, + { + "epoch": 2.49, + "learning_rate": 5.121951219512195e-05, + "loss": 0.7158, + "step": 10540 + }, + { + "epoch": 2.5, + "learning_rate": 5.0742829903869065e-05, + "loss": 0.7177, + "step": 10560 + }, + { + "epoch": 2.5, + "learning_rate": 5.0266147612616184e-05, + "loss": 0.7187, + "step": 10580 + }, + { + "epoch": 2.51, + "learning_rate": 4.978946532136331e-05, + "loss": 0.7185, + "step": 10600 + }, + { + "epoch": 2.51, + "eval_loss": 0.7283052802085876, + "eval_runtime": 20.2682, + "eval_samples_per_second": 98.677, + "eval_steps_per_second": 3.108, + "step": 10600 + }, + { + "epoch": 2.51, + "learning_rate": 4.931278303011042e-05, + "loss": 0.7264, + "step": 10620 + }, + { + "epoch": 2.52, + "learning_rate": 4.883610073885755e-05, + "loss": 0.7208, + "step": 10640 + }, + { + "epoch": 2.52, + "learning_rate": 4.835941844760467e-05, + "loss": 0.7275, + "step": 10660 + }, + { + "epoch": 2.53, + "learning_rate": 4.7882736156351786e-05, + "loss": 0.7205, + "step": 10680 + }, + { + "epoch": 2.53, + "learning_rate": 4.740605386509891e-05, + "loss": 0.7213, + "step": 10700 + }, + { + "epoch": 2.53, + "learning_rate": 4.692937157384602e-05, + "loss": 0.7324, + "step": 10720 + }, + { + "epoch": 2.54, + "learning_rate": 4.645268928259315e-05, + "loss": 0.7197, + "step": 10740 + }, + { + "epoch": 2.54, + "learning_rate": 4.597600699134027e-05, + "loss": 0.7162, + "step": 10760 + }, + { + "epoch": 2.55, + "learning_rate": 4.5499324700087394e-05, + "loss": 0.7223, + "step": 10780 + }, + { + "epoch": 2.55, + "learning_rate": 4.5022642408834506e-05, + "loss": 0.7249, + "step": 10800 + }, + { + "epoch": 2.55, + "eval_loss": 0.7278863191604614, + "eval_runtime": 19.7684, + "eval_samples_per_second": 101.171, + "eval_steps_per_second": 3.187, + "step": 10800 + }, + { + "epoch": 2.56, + "learning_rate": 4.4545960117581625e-05, + "loss": 0.7245, + "step": 10820 + }, + { + "epoch": 2.56, + "learning_rate": 4.406927782632875e-05, + "loss": 0.7298, + "step": 10840 + }, + { + "epoch": 2.57, + "learning_rate": 4.359259553507587e-05, + "loss": 0.7172, + "step": 10860 + }, + { + "epoch": 2.57, + "learning_rate": 4.3115913243822995e-05, + "loss": 0.7183, + "step": 10880 + }, + { + "epoch": 2.58, + "learning_rate": 4.263923095257011e-05, + "loss": 0.7172, + "step": 10900 + }, + { + "epoch": 2.58, + "learning_rate": 4.2162548661317226e-05, + "loss": 0.7166, + "step": 10920 + }, + { + "epoch": 2.59, + "learning_rate": 4.168586637006435e-05, + "loss": 0.7303, + "step": 10940 + }, + { + "epoch": 2.59, + "learning_rate": 4.1209184078811464e-05, + "loss": 0.716, + "step": 10960 + }, + { + "epoch": 2.6, + "learning_rate": 4.073250178755859e-05, + "loss": 0.7199, + "step": 10980 + }, + { + "epoch": 2.6, + "learning_rate": 4.025581949630571e-05, + "loss": 0.7227, + "step": 11000 + }, + { + "epoch": 2.6, + "eval_loss": 0.7274474501609802, + "eval_runtime": 19.9546, + "eval_samples_per_second": 100.228, + "eval_steps_per_second": 3.157, + "step": 11000 + }, + { + "epoch": 2.61, + "learning_rate": 3.9779137205052834e-05, + "loss": 0.7134, + "step": 11020 + }, + { + "epoch": 2.61, + "learning_rate": 3.930245491379995e-05, + "loss": 0.7354, + "step": 11040 + }, + { + "epoch": 2.62, + "learning_rate": 3.8825772622547065e-05, + "loss": 0.7269, + "step": 11060 + }, + { + "epoch": 2.62, + "learning_rate": 3.834909033129419e-05, + "loss": 0.7261, + "step": 11080 + }, + { + "epoch": 2.62, + "learning_rate": 3.787240804004131e-05, + "loss": 0.735, + "step": 11100 + }, + { + "epoch": 2.63, + "learning_rate": 3.739572574878843e-05, + "loss": 0.716, + "step": 11120 + }, + { + "epoch": 2.63, + "learning_rate": 3.691904345753555e-05, + "loss": 0.721, + "step": 11140 + }, + { + "epoch": 2.64, + "learning_rate": 3.644236116628267e-05, + "loss": 0.7201, + "step": 11160 + }, + { + "epoch": 2.64, + "learning_rate": 3.596567887502979e-05, + "loss": 0.7231, + "step": 11180 + }, + { + "epoch": 2.65, + "learning_rate": 3.548899658377691e-05, + "loss": 0.7172, + "step": 11200 + }, + { + "epoch": 2.65, + "eval_loss": 0.7270590662956238, + "eval_runtime": 19.753, + "eval_samples_per_second": 101.251, + "eval_steps_per_second": 3.189, + "step": 11200 + }, + { + "epoch": 2.65, + "learning_rate": 3.501231429252403e-05, + "loss": 0.7296, + "step": 11220 + }, + { + "epoch": 2.66, + "learning_rate": 3.453563200127115e-05, + "loss": 0.7239, + "step": 11240 + }, + { + "epoch": 2.66, + "learning_rate": 3.405894971001827e-05, + "loss": 0.7215, + "step": 11260 + }, + { + "epoch": 2.67, + "learning_rate": 3.358226741876539e-05, + "loss": 0.7176, + "step": 11280 + }, + { + "epoch": 2.67, + "learning_rate": 3.310558512751251e-05, + "loss": 0.7277, + "step": 11300 + }, + { + "epoch": 2.68, + "learning_rate": 3.262890283625963e-05, + "loss": 0.7237, + "step": 11320 + }, + { + "epoch": 2.68, + "learning_rate": 3.215222054500675e-05, + "loss": 0.7167, + "step": 11340 + }, + { + "epoch": 2.69, + "learning_rate": 3.167553825375387e-05, + "loss": 0.7184, + "step": 11360 + }, + { + "epoch": 2.69, + "learning_rate": 3.119885596250099e-05, + "loss": 0.7238, + "step": 11380 + }, + { + "epoch": 2.7, + "learning_rate": 3.072217367124811e-05, + "loss": 0.7188, + "step": 11400 + }, + { + "epoch": 2.7, + "eval_loss": 0.7263159155845642, + "eval_runtime": 19.6317, + "eval_samples_per_second": 101.876, + "eval_steps_per_second": 3.209, + "step": 11400 + }, + { + "epoch": 2.7, + "learning_rate": 3.0245491379995232e-05, + "loss": 0.7146, + "step": 11420 + }, + { + "epoch": 2.71, + "learning_rate": 2.9768809088742348e-05, + "loss": 0.7307, + "step": 11440 + }, + { + "epoch": 2.71, + "learning_rate": 2.929212679748947e-05, + "loss": 0.721, + "step": 11460 + }, + { + "epoch": 2.71, + "learning_rate": 2.881544450623659e-05, + "loss": 0.7293, + "step": 11480 + }, + { + "epoch": 2.72, + "learning_rate": 2.833876221498371e-05, + "loss": 0.7245, + "step": 11500 + }, + { + "epoch": 2.72, + "learning_rate": 2.7862079923730833e-05, + "loss": 0.7264, + "step": 11520 + }, + { + "epoch": 2.73, + "learning_rate": 2.7385397632477952e-05, + "loss": 0.722, + "step": 11540 + }, + { + "epoch": 2.73, + "learning_rate": 2.6908715341225068e-05, + "loss": 0.7195, + "step": 11560 + }, + { + "epoch": 2.74, + "learning_rate": 2.643203304997219e-05, + "loss": 0.7181, + "step": 11580 + }, + { + "epoch": 2.74, + "learning_rate": 2.5955350758719312e-05, + "loss": 0.7225, + "step": 11600 + }, + { + "epoch": 2.74, + "eval_loss": 0.7265506386756897, + "eval_runtime": 19.5252, + "eval_samples_per_second": 102.432, + "eval_steps_per_second": 3.227, + "step": 11600 + }, + { + "epoch": 2.75, + "learning_rate": 2.547866846746643e-05, + "loss": 0.7151, + "step": 11620 + }, + { + "epoch": 2.75, + "learning_rate": 2.5001986176213553e-05, + "loss": 0.7211, + "step": 11640 + }, + { + "epoch": 2.76, + "learning_rate": 2.4525303884960672e-05, + "loss": 0.7231, + "step": 11660 + }, + { + "epoch": 2.76, + "learning_rate": 2.404862159370779e-05, + "loss": 0.7236, + "step": 11680 + }, + { + "epoch": 2.77, + "learning_rate": 2.357193930245491e-05, + "loss": 0.7161, + "step": 11700 + }, + { + "epoch": 2.77, + "learning_rate": 2.3095257011202032e-05, + "loss": 0.7248, + "step": 11720 + }, + { + "epoch": 2.78, + "learning_rate": 2.261857471994915e-05, + "loss": 0.7195, + "step": 11740 + }, + { + "epoch": 2.78, + "learning_rate": 2.2141892428696274e-05, + "loss": 0.718, + "step": 11760 + }, + { + "epoch": 2.79, + "learning_rate": 2.1665210137443392e-05, + "loss": 0.7161, + "step": 11780 + }, + { + "epoch": 2.79, + "learning_rate": 2.118852784619051e-05, + "loss": 0.7204, + "step": 11800 + }, + { + "epoch": 2.79, + "eval_loss": 0.7261104583740234, + "eval_runtime": 20.0617, + "eval_samples_per_second": 99.692, + "eval_steps_per_second": 3.14, + "step": 11800 + }, + { + "epoch": 2.79, + "learning_rate": 2.071184555493763e-05, + "loss": 0.716, + "step": 11820 + }, + { + "epoch": 2.8, + "learning_rate": 2.0235163263684753e-05, + "loss": 0.7211, + "step": 11840 + }, + { + "epoch": 2.8, + "learning_rate": 1.975848097243187e-05, + "loss": 0.7242, + "step": 11860 + }, + { + "epoch": 2.81, + "learning_rate": 1.9281798681178994e-05, + "loss": 0.7129, + "step": 11880 + }, + { + "epoch": 2.81, + "learning_rate": 1.8828950504488756e-05, + "loss": 0.7233, + "step": 11900 + }, + { + "epoch": 2.82, + "learning_rate": 1.8352268213235875e-05, + "loss": 0.7286, + "step": 11920 + }, + { + "epoch": 2.82, + "learning_rate": 1.7875585921982997e-05, + "loss": 0.7147, + "step": 11940 + }, + { + "epoch": 2.83, + "learning_rate": 1.7398903630730116e-05, + "loss": 0.7303, + "step": 11960 + }, + { + "epoch": 2.83, + "learning_rate": 1.692222133947724e-05, + "loss": 0.7126, + "step": 11980 + }, + { + "epoch": 2.84, + "learning_rate": 1.6445539048224358e-05, + "loss": 0.7174, + "step": 12000 + }, + { + "epoch": 2.84, + "eval_loss": 0.7259587645530701, + "eval_runtime": 20.6636, + "eval_samples_per_second": 96.788, + "eval_steps_per_second": 3.049, + "step": 12000 + }, + { + "epoch": 2.84, + "learning_rate": 1.5968856756971476e-05, + "loss": 0.7147, + "step": 12020 + }, + { + "epoch": 2.85, + "learning_rate": 1.54921744657186e-05, + "loss": 0.7184, + "step": 12040 + }, + { + "epoch": 2.85, + "learning_rate": 1.5015492174465718e-05, + "loss": 0.7218, + "step": 12060 + }, + { + "epoch": 2.86, + "learning_rate": 1.4538809883212837e-05, + "loss": 0.7172, + "step": 12080 + }, + { + "epoch": 2.86, + "learning_rate": 1.4062127591959957e-05, + "loss": 0.7326, + "step": 12100 + }, + { + "epoch": 2.87, + "learning_rate": 1.3585445300707078e-05, + "loss": 0.726, + "step": 12120 + }, + { + "epoch": 2.87, + "learning_rate": 1.3108763009454197e-05, + "loss": 0.711, + "step": 12140 + }, + { + "epoch": 2.88, + "learning_rate": 1.2632080718201317e-05, + "loss": 0.7199, + "step": 12160 + }, + { + "epoch": 2.88, + "learning_rate": 1.215539842694844e-05, + "loss": 0.7256, + "step": 12180 + }, + { + "epoch": 2.88, + "learning_rate": 1.1678716135695557e-05, + "loss": 0.7183, + "step": 12200 + }, + { + "epoch": 2.88, + "eval_loss": 0.7255927324295044, + "eval_runtime": 20.0566, + "eval_samples_per_second": 99.718, + "eval_steps_per_second": 3.141, + "step": 12200 + } + ], + "max_steps": 12687, + "num_train_epochs": 3, + "total_flos": 1.5858308142157791e+19, + "trial_name": null, + "trial_params": null +} diff --git a/adapters/saved-belle-7b/checkpoint-12200/training_args.bin b/adapters/saved-belle-7b/checkpoint-12200/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..e7508cbd7713243e73fe59f258eaa12f0deefce5 --- /dev/null +++ b/adapters/saved-belle-7b/checkpoint-12200/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9809a7383d594262bcb22ae9222e2580aba6862268d17b0cc4f7bd3fe5579126 +size 3579 diff --git a/adapters/saved-belle-7b/checkpoint-12400/optimizer.pt b/adapters/saved-belle-7b/checkpoint-12400/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..0704560bbe8d6f092f8fd51d100be0e1f8196485 --- /dev/null +++ b/adapters/saved-belle-7b/checkpoint-12400/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47a25d5c2dcb97c30ca7aa84fe22c2800eca794f29e2285c20a3f10fb2690f30 +size 33629893 diff --git a/adapters/saved-belle-7b/checkpoint-12400/pytorch_model.bin b/adapters/saved-belle-7b/checkpoint-12400/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..2b1aecb59788e5ba8d2b37dcea8ff168b7884e98 --- /dev/null +++ b/adapters/saved-belle-7b/checkpoint-12400/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ba76a76db4f647ade604a26e14f6adf41a1424cd504f31c505424bcfc9379d57 +size 16822989 diff --git a/adapters/saved-belle-7b/checkpoint-12400/rng_state_0.pth b/adapters/saved-belle-7b/checkpoint-12400/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..99e2d202082aa7a818e572f831cb0a997cc8a7e9 --- /dev/null +++ b/adapters/saved-belle-7b/checkpoint-12400/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:729a4a2ae3fab67a80c0f6a8b6b8179df9902df51229df6010c20af306e40c24 +size 14583 diff --git a/adapters/saved-belle-7b/checkpoint-12400/rng_state_1.pth b/adapters/saved-belle-7b/checkpoint-12400/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..3038b129f228ae0aa10b407cdaddd68d99613460 --- /dev/null +++ b/adapters/saved-belle-7b/checkpoint-12400/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0d820719a92192fe734ce1427cd693011f4fda470a0816265e760a333eab6840 +size 14583 diff --git a/adapters/saved-belle-7b/checkpoint-12400/rng_state_2.pth b/adapters/saved-belle-7b/checkpoint-12400/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..6dfbb9f89ab446fbb5c1444f886e8146e5a090fc --- /dev/null +++ b/adapters/saved-belle-7b/checkpoint-12400/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:db1b28158db2bcfc9dcdb1526f9a09218d72790eeb76395cc6bf9ea6960f92bf +size 14583 diff --git a/adapters/saved-belle-7b/checkpoint-12400/rng_state_3.pth b/adapters/saved-belle-7b/checkpoint-12400/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..97955a0ab8fdbd8a8de189d36f1118543319d8ac --- /dev/null +++ b/adapters/saved-belle-7b/checkpoint-12400/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6bc8bc0b2380d39c90fda139b9ed65356e07caf3dc769bfbbb5122e5fdc26432 +size 14583 diff --git a/adapters/saved-belle-7b/checkpoint-12400/scaler.pt b/adapters/saved-belle-7b/checkpoint-12400/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..6c7b22a407b04d8b81eff6d677950ac738efd839 --- /dev/null +++ b/adapters/saved-belle-7b/checkpoint-12400/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:575153d700c166f22361a24b65fd79b18ce5b87d5ddeb5cdacf5d85b6e8bc6f6 +size 557 diff --git a/adapters/saved-belle-7b/checkpoint-12400/scheduler.pt b/adapters/saved-belle-7b/checkpoint-12400/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..3a9e7490984f61bb6387806f7a25f1020ae3b086 --- /dev/null +++ b/adapters/saved-belle-7b/checkpoint-12400/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:94b8806669d94290e94a3a62e5300ab905b119059183ec49bc9e447da9b3bd1e +size 627 diff --git a/adapters/saved-belle-7b/checkpoint-12400/trainer_state.json b/adapters/saved-belle-7b/checkpoint-12400/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..d3df879671c81ea0c2bf2ea0766971a3d9d3ae1c --- /dev/null +++ b/adapters/saved-belle-7b/checkpoint-12400/trainer_state.json @@ -0,0 +1,4232 @@ +{ + "best_metric": 0.7255927324295044, + "best_model_checkpoint": "/mnt/bn/qingyi-bn-lq/llama/saved-belle-7b/checkpoint-12200", + "epoch": 2.93204859161174, + "global_step": 12400, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 5.9999999999999995e-05, + "loss": 1.8908, + "step": 20 + }, + { + "epoch": 0.01, + "learning_rate": 0.00011999999999999999, + "loss": 1.5545, + "step": 40 + }, + { + "epoch": 0.01, + "learning_rate": 0.00017999999999999998, + "loss": 1.1252, + "step": 60 + }, + { + "epoch": 0.02, + "learning_rate": 0.00023999999999999998, + "loss": 1.054, + "step": 80 + }, + { + "epoch": 0.02, + "learning_rate": 0.0003, + "loss": 1.0137, + "step": 100 + }, + { + "epoch": 0.03, + "learning_rate": 0.0002995233177087471, + "loss": 1.0046, + "step": 120 + }, + { + "epoch": 0.03, + "learning_rate": 0.0002990466354174942, + "loss": 0.9867, + "step": 140 + }, + { + "epoch": 0.04, + "learning_rate": 0.00029856995312624134, + "loss": 0.9612, + "step": 160 + }, + { + "epoch": 0.04, + "learning_rate": 0.0002980932708349884, + "loss": 0.9588, + "step": 180 + }, + { + "epoch": 0.05, + "learning_rate": 0.0002976165885437356, + "loss": 0.9551, + "step": 200 + }, + { + "epoch": 0.05, + "eval_loss": 0.9459459185600281, + "eval_runtime": 19.4211, + "eval_samples_per_second": 102.981, + "eval_steps_per_second": 3.244, + "step": 200 + }, + { + "epoch": 0.05, + "learning_rate": 0.0002971399062524827, + "loss": 0.9516, + "step": 220 + }, + { + "epoch": 0.06, + "learning_rate": 0.00029666322396122984, + "loss": 0.937, + "step": 240 + }, + { + "epoch": 0.06, + "learning_rate": 0.0002961865416699769, + "loss": 0.936, + "step": 260 + }, + { + "epoch": 0.07, + "learning_rate": 0.00029570985937872406, + "loss": 0.9305, + "step": 280 + }, + { + "epoch": 0.07, + "learning_rate": 0.00029523317708747115, + "loss": 0.9146, + "step": 300 + }, + { + "epoch": 0.08, + "learning_rate": 0.0002947564947962183, + "loss": 0.9226, + "step": 320 + }, + { + "epoch": 0.08, + "learning_rate": 0.00029427981250496543, + "loss": 0.9108, + "step": 340 + }, + { + "epoch": 0.09, + "learning_rate": 0.0002938031302137125, + "loss": 0.9129, + "step": 360 + }, + { + "epoch": 0.09, + "learning_rate": 0.00029332644792245965, + "loss": 0.9063, + "step": 380 + }, + { + "epoch": 0.09, + "learning_rate": 0.0002928497656312068, + "loss": 0.8996, + "step": 400 + }, + { + "epoch": 0.09, + "eval_loss": 0.9003962874412537, + "eval_runtime": 19.5716, + "eval_samples_per_second": 102.189, + "eval_steps_per_second": 3.219, + "step": 400 + }, + { + "epoch": 0.1, + "learning_rate": 0.00029237308333995393, + "loss": 0.898, + "step": 420 + }, + { + "epoch": 0.1, + "learning_rate": 0.000291896401048701, + "loss": 0.8936, + "step": 440 + }, + { + "epoch": 0.11, + "learning_rate": 0.00029141971875744815, + "loss": 0.8932, + "step": 460 + }, + { + "epoch": 0.11, + "learning_rate": 0.00029094303646619524, + "loss": 0.8779, + "step": 480 + }, + { + "epoch": 0.12, + "learning_rate": 0.0002904663541749424, + "loss": 0.8871, + "step": 500 + }, + { + "epoch": 0.12, + "learning_rate": 0.00028998967188368946, + "loss": 0.8929, + "step": 520 + }, + { + "epoch": 0.13, + "learning_rate": 0.0002895129895924366, + "loss": 0.8878, + "step": 540 + }, + { + "epoch": 0.13, + "learning_rate": 0.00028903630730118374, + "loss": 0.8818, + "step": 560 + }, + { + "epoch": 0.14, + "learning_rate": 0.00028855962500993083, + "loss": 0.8826, + "step": 580 + }, + { + "epoch": 0.14, + "learning_rate": 0.00028808294271867797, + "loss": 0.879, + "step": 600 + }, + { + "epoch": 0.14, + "eval_loss": 0.8738257884979248, + "eval_runtime": 19.3526, + "eval_samples_per_second": 103.345, + "eval_steps_per_second": 3.255, + "step": 600 + }, + { + "epoch": 0.15, + "learning_rate": 0.0002876062604274251, + "loss": 0.87, + "step": 620 + }, + { + "epoch": 0.15, + "learning_rate": 0.00028712957813617224, + "loss": 0.8715, + "step": 640 + }, + { + "epoch": 0.16, + "learning_rate": 0.00028665289584491933, + "loss": 0.8724, + "step": 660 + }, + { + "epoch": 0.16, + "learning_rate": 0.00028617621355366647, + "loss": 0.8741, + "step": 680 + }, + { + "epoch": 0.17, + "learning_rate": 0.00028569953126241355, + "loss": 0.8705, + "step": 700 + }, + { + "epoch": 0.17, + "learning_rate": 0.0002852228489711607, + "loss": 0.8702, + "step": 720 + }, + { + "epoch": 0.17, + "learning_rate": 0.00028474616667990783, + "loss": 0.8618, + "step": 740 + }, + { + "epoch": 0.18, + "learning_rate": 0.0002842694843886549, + "loss": 0.8617, + "step": 760 + }, + { + "epoch": 0.18, + "learning_rate": 0.00028379280209740206, + "loss": 0.8677, + "step": 780 + }, + { + "epoch": 0.19, + "learning_rate": 0.0002833161198061492, + "loss": 0.853, + "step": 800 + }, + { + "epoch": 0.19, + "eval_loss": 0.8541846871376038, + "eval_runtime": 19.49, + "eval_samples_per_second": 102.617, + "eval_steps_per_second": 3.232, + "step": 800 + }, + { + "epoch": 0.19, + "learning_rate": 0.0002828394375148963, + "loss": 0.8549, + "step": 820 + }, + { + "epoch": 0.2, + "learning_rate": 0.0002823627552236434, + "loss": 0.847, + "step": 840 + }, + { + "epoch": 0.2, + "learning_rate": 0.00028188607293239056, + "loss": 0.8585, + "step": 860 + }, + { + "epoch": 0.21, + "learning_rate": 0.00028140939064113764, + "loss": 0.8568, + "step": 880 + }, + { + "epoch": 0.21, + "learning_rate": 0.0002809327083498848, + "loss": 0.8482, + "step": 900 + }, + { + "epoch": 0.22, + "learning_rate": 0.00028045602605863187, + "loss": 0.845, + "step": 920 + }, + { + "epoch": 0.22, + "learning_rate": 0.000279979343767379, + "loss": 0.8548, + "step": 940 + }, + { + "epoch": 0.23, + "learning_rate": 0.00027950266147612615, + "loss": 0.8372, + "step": 960 + }, + { + "epoch": 0.23, + "learning_rate": 0.0002790259791848733, + "loss": 0.8423, + "step": 980 + }, + { + "epoch": 0.24, + "learning_rate": 0.00027854929689362037, + "loss": 0.8433, + "step": 1000 + }, + { + "epoch": 0.24, + "eval_loss": 0.8396860361099243, + "eval_runtime": 19.3459, + "eval_samples_per_second": 103.381, + "eval_steps_per_second": 3.257, + "step": 1000 + }, + { + "epoch": 0.24, + "learning_rate": 0.0002780726146023675, + "loss": 0.848, + "step": 1020 + }, + { + "epoch": 0.25, + "learning_rate": 0.0002775959323111146, + "loss": 0.8383, + "step": 1040 + }, + { + "epoch": 0.25, + "learning_rate": 0.00027711925001986173, + "loss": 0.8385, + "step": 1060 + }, + { + "epoch": 0.26, + "learning_rate": 0.0002766425677286089, + "loss": 0.8308, + "step": 1080 + }, + { + "epoch": 0.26, + "learning_rate": 0.00027616588543735596, + "loss": 0.8244, + "step": 1100 + }, + { + "epoch": 0.26, + "learning_rate": 0.0002756892031461031, + "loss": 0.835, + "step": 1120 + }, + { + "epoch": 0.27, + "learning_rate": 0.0002752125208548502, + "loss": 0.8337, + "step": 1140 + }, + { + "epoch": 0.27, + "learning_rate": 0.0002747358385635974, + "loss": 0.8348, + "step": 1160 + }, + { + "epoch": 0.28, + "learning_rate": 0.00027425915627234446, + "loss": 0.8353, + "step": 1180 + }, + { + "epoch": 0.28, + "learning_rate": 0.0002737824739810916, + "loss": 0.8294, + "step": 1200 + }, + { + "epoch": 0.28, + "eval_loss": 0.8274422287940979, + "eval_runtime": 19.4187, + "eval_samples_per_second": 102.993, + "eval_steps_per_second": 3.244, + "step": 1200 + }, + { + "epoch": 0.29, + "learning_rate": 0.0002733057916898387, + "loss": 0.8337, + "step": 1220 + }, + { + "epoch": 0.29, + "learning_rate": 0.0002728291093985858, + "loss": 0.8435, + "step": 1240 + }, + { + "epoch": 0.3, + "learning_rate": 0.00027235242710733296, + "loss": 0.8347, + "step": 1260 + }, + { + "epoch": 0.3, + "learning_rate": 0.00027187574481608005, + "loss": 0.8258, + "step": 1280 + }, + { + "epoch": 0.31, + "learning_rate": 0.0002713990625248272, + "loss": 0.8304, + "step": 1300 + }, + { + "epoch": 0.31, + "learning_rate": 0.0002709223802335743, + "loss": 0.8264, + "step": 1320 + }, + { + "epoch": 0.32, + "learning_rate": 0.0002704456979423214, + "loss": 0.8313, + "step": 1340 + }, + { + "epoch": 0.32, + "learning_rate": 0.00026996901565106855, + "loss": 0.814, + "step": 1360 + }, + { + "epoch": 0.33, + "learning_rate": 0.0002694923333598157, + "loss": 0.8223, + "step": 1380 + }, + { + "epoch": 0.33, + "learning_rate": 0.0002690156510685628, + "loss": 0.8159, + "step": 1400 + }, + { + "epoch": 0.33, + "eval_loss": 0.8179089426994324, + "eval_runtime": 19.4736, + "eval_samples_per_second": 102.703, + "eval_steps_per_second": 3.235, + "step": 1400 + }, + { + "epoch": 0.34, + "learning_rate": 0.0002685389687773099, + "loss": 0.8218, + "step": 1420 + }, + { + "epoch": 0.34, + "learning_rate": 0.000268062286486057, + "loss": 0.808, + "step": 1440 + }, + { + "epoch": 0.35, + "learning_rate": 0.00026758560419480414, + "loss": 0.8253, + "step": 1460 + }, + { + "epoch": 0.35, + "learning_rate": 0.0002671089219035513, + "loss": 0.8174, + "step": 1480 + }, + { + "epoch": 0.35, + "learning_rate": 0.00026663223961229836, + "loss": 0.8157, + "step": 1500 + }, + { + "epoch": 0.36, + "learning_rate": 0.0002661555573210455, + "loss": 0.8142, + "step": 1520 + }, + { + "epoch": 0.36, + "learning_rate": 0.0002656788750297926, + "loss": 0.8112, + "step": 1540 + }, + { + "epoch": 0.37, + "learning_rate": 0.00026520219273853973, + "loss": 0.8232, + "step": 1560 + }, + { + "epoch": 0.37, + "learning_rate": 0.00026472551044728687, + "loss": 0.8254, + "step": 1580 + }, + { + "epoch": 0.38, + "learning_rate": 0.000264248828156034, + "loss": 0.8059, + "step": 1600 + }, + { + "epoch": 0.38, + "eval_loss": 0.8101135492324829, + "eval_runtime": 19.5846, + "eval_samples_per_second": 102.121, + "eval_steps_per_second": 3.217, + "step": 1600 + }, + { + "epoch": 0.38, + "learning_rate": 0.0002637721458647811, + "loss": 0.8062, + "step": 1620 + }, + { + "epoch": 0.39, + "learning_rate": 0.00026329546357352823, + "loss": 0.805, + "step": 1640 + }, + { + "epoch": 0.39, + "learning_rate": 0.0002628187812822753, + "loss": 0.8109, + "step": 1660 + }, + { + "epoch": 0.4, + "learning_rate": 0.00026234209899102245, + "loss": 0.801, + "step": 1680 + }, + { + "epoch": 0.4, + "learning_rate": 0.0002618654166997696, + "loss": 0.8043, + "step": 1700 + }, + { + "epoch": 0.41, + "learning_rate": 0.0002613887344085167, + "loss": 0.8002, + "step": 1720 + }, + { + "epoch": 0.41, + "learning_rate": 0.0002609120521172638, + "loss": 0.8152, + "step": 1740 + }, + { + "epoch": 0.42, + "learning_rate": 0.00026043536982601096, + "loss": 0.8052, + "step": 1760 + }, + { + "epoch": 0.42, + "learning_rate": 0.0002599586875347581, + "loss": 0.8136, + "step": 1780 + }, + { + "epoch": 0.43, + "learning_rate": 0.0002594820052435052, + "loss": 0.8044, + "step": 1800 + }, + { + "epoch": 0.43, + "eval_loss": 0.8030326962471008, + "eval_runtime": 19.4835, + "eval_samples_per_second": 102.651, + "eval_steps_per_second": 3.234, + "step": 1800 + }, + { + "epoch": 0.43, + "learning_rate": 0.0002590053229522523, + "loss": 0.7995, + "step": 1820 + }, + { + "epoch": 0.44, + "learning_rate": 0.0002585286406609994, + "loss": 0.7958, + "step": 1840 + }, + { + "epoch": 0.44, + "learning_rate": 0.00025805195836974654, + "loss": 0.8034, + "step": 1860 + }, + { + "epoch": 0.44, + "learning_rate": 0.00025757527607849363, + "loss": 0.8016, + "step": 1880 + }, + { + "epoch": 0.45, + "learning_rate": 0.00025709859378724077, + "loss": 0.8048, + "step": 1900 + }, + { + "epoch": 0.45, + "learning_rate": 0.0002566219114959879, + "loss": 0.8004, + "step": 1920 + }, + { + "epoch": 0.46, + "learning_rate": 0.00025614522920473505, + "loss": 0.8041, + "step": 1940 + }, + { + "epoch": 0.46, + "learning_rate": 0.00025566854691348213, + "loss": 0.7908, + "step": 1960 + }, + { + "epoch": 0.47, + "learning_rate": 0.00025519186462222927, + "loss": 0.7958, + "step": 1980 + }, + { + "epoch": 0.47, + "learning_rate": 0.0002547151823309764, + "loss": 0.8013, + "step": 2000 + }, + { + "epoch": 0.47, + "eval_loss": 0.7965430021286011, + "eval_runtime": 19.4852, + "eval_samples_per_second": 102.642, + "eval_steps_per_second": 3.233, + "step": 2000 + }, + { + "epoch": 0.48, + "learning_rate": 0.0002542385000397235, + "loss": 0.803, + "step": 2020 + }, + { + "epoch": 0.48, + "learning_rate": 0.00025376181774847064, + "loss": 0.7966, + "step": 2040 + }, + { + "epoch": 0.49, + "learning_rate": 0.0002532851354572177, + "loss": 0.7946, + "step": 2060 + }, + { + "epoch": 0.49, + "learning_rate": 0.00025280845316596486, + "loss": 0.8023, + "step": 2080 + }, + { + "epoch": 0.5, + "learning_rate": 0.00025233177087471194, + "loss": 0.7953, + "step": 2100 + }, + { + "epoch": 0.5, + "learning_rate": 0.0002518550885834591, + "loss": 0.8053, + "step": 2120 + }, + { + "epoch": 0.51, + "learning_rate": 0.0002513784062922062, + "loss": 0.7883, + "step": 2140 + }, + { + "epoch": 0.51, + "learning_rate": 0.00025090172400095336, + "loss": 0.7984, + "step": 2160 + }, + { + "epoch": 0.52, + "learning_rate": 0.00025042504170970045, + "loss": 0.7962, + "step": 2180 + }, + { + "epoch": 0.52, + "learning_rate": 0.0002499483594184476, + "loss": 0.7847, + "step": 2200 + }, + { + "epoch": 0.52, + "eval_loss": 0.7915623784065247, + "eval_runtime": 19.5509, + "eval_samples_per_second": 102.297, + "eval_steps_per_second": 3.222, + "step": 2200 + }, + { + "epoch": 0.52, + "learning_rate": 0.0002494716771271947, + "loss": 0.7917, + "step": 2220 + }, + { + "epoch": 0.53, + "learning_rate": 0.0002489949948359418, + "loss": 0.7942, + "step": 2240 + }, + { + "epoch": 0.53, + "learning_rate": 0.00024851831254468895, + "loss": 0.7921, + "step": 2260 + }, + { + "epoch": 0.54, + "learning_rate": 0.00024804163025343603, + "loss": 0.7971, + "step": 2280 + }, + { + "epoch": 0.54, + "learning_rate": 0.0002475649479621832, + "loss": 0.7919, + "step": 2300 + }, + { + "epoch": 0.55, + "learning_rate": 0.0002470882656709303, + "loss": 0.7917, + "step": 2320 + }, + { + "epoch": 0.55, + "learning_rate": 0.00024661158337967745, + "loss": 0.8024, + "step": 2340 + }, + { + "epoch": 0.56, + "learning_rate": 0.00024613490108842454, + "loss": 0.7761, + "step": 2360 + }, + { + "epoch": 0.56, + "learning_rate": 0.0002456582187971717, + "loss": 0.7958, + "step": 2380 + }, + { + "epoch": 0.57, + "learning_rate": 0.00024518153650591876, + "loss": 0.7855, + "step": 2400 + }, + { + "epoch": 0.57, + "eval_loss": 0.7870249152183533, + "eval_runtime": 19.5953, + "eval_samples_per_second": 102.065, + "eval_steps_per_second": 3.215, + "step": 2400 + }, + { + "epoch": 0.57, + "learning_rate": 0.0002447048542146659, + "loss": 0.784, + "step": 2420 + }, + { + "epoch": 0.58, + "learning_rate": 0.00024422817192341304, + "loss": 0.7926, + "step": 2440 + }, + { + "epoch": 0.58, + "learning_rate": 0.00024375148963216013, + "loss": 0.7845, + "step": 2460 + }, + { + "epoch": 0.59, + "learning_rate": 0.00024327480734090726, + "loss": 0.782, + "step": 2480 + }, + { + "epoch": 0.59, + "learning_rate": 0.00024279812504965438, + "loss": 0.7808, + "step": 2500 + }, + { + "epoch": 0.6, + "learning_rate": 0.00024232144275840152, + "loss": 0.7926, + "step": 2520 + }, + { + "epoch": 0.6, + "learning_rate": 0.00024184476046714863, + "loss": 0.7795, + "step": 2540 + }, + { + "epoch": 0.61, + "learning_rate": 0.00024136807817589574, + "loss": 0.7888, + "step": 2560 + }, + { + "epoch": 0.61, + "learning_rate": 0.00024089139588464288, + "loss": 0.7888, + "step": 2580 + }, + { + "epoch": 0.61, + "learning_rate": 0.00024041471359339, + "loss": 0.7863, + "step": 2600 + }, + { + "epoch": 0.61, + "eval_loss": 0.7825512290000916, + "eval_runtime": 19.4274, + "eval_samples_per_second": 102.948, + "eval_steps_per_second": 3.243, + "step": 2600 + }, + { + "epoch": 0.62, + "learning_rate": 0.0002399380313021371, + "loss": 0.7881, + "step": 2620 + }, + { + "epoch": 0.62, + "learning_rate": 0.00023946134901088422, + "loss": 0.7841, + "step": 2640 + }, + { + "epoch": 0.63, + "learning_rate": 0.00023898466671963133, + "loss": 0.7849, + "step": 2660 + }, + { + "epoch": 0.63, + "learning_rate": 0.00023850798442837844, + "loss": 0.7809, + "step": 2680 + }, + { + "epoch": 0.64, + "learning_rate": 0.0002380313021371256, + "loss": 0.7757, + "step": 2700 + }, + { + "epoch": 0.64, + "learning_rate": 0.00023755461984587272, + "loss": 0.7787, + "step": 2720 + }, + { + "epoch": 0.65, + "learning_rate": 0.00023707793755461983, + "loss": 0.7766, + "step": 2740 + }, + { + "epoch": 0.65, + "learning_rate": 0.00023660125526336694, + "loss": 0.7867, + "step": 2760 + }, + { + "epoch": 0.66, + "learning_rate": 0.00023612457297211405, + "loss": 0.7767, + "step": 2780 + }, + { + "epoch": 0.66, + "learning_rate": 0.0002356478906808612, + "loss": 0.7806, + "step": 2800 + }, + { + "epoch": 0.66, + "eval_loss": 0.7781409621238708, + "eval_runtime": 20.131, + "eval_samples_per_second": 99.349, + "eval_steps_per_second": 3.13, + "step": 2800 + }, + { + "epoch": 0.67, + "learning_rate": 0.0002351712083896083, + "loss": 0.7774, + "step": 2820 + }, + { + "epoch": 0.67, + "learning_rate": 0.00023469452609835542, + "loss": 0.7782, + "step": 2840 + }, + { + "epoch": 0.68, + "learning_rate": 0.00023421784380710253, + "loss": 0.7773, + "step": 2860 + }, + { + "epoch": 0.68, + "learning_rate": 0.00023374116151584964, + "loss": 0.7845, + "step": 2880 + }, + { + "epoch": 0.69, + "learning_rate": 0.0002332644792245968, + "loss": 0.7879, + "step": 2900 + }, + { + "epoch": 0.69, + "learning_rate": 0.00023278779693334392, + "loss": 0.7801, + "step": 2920 + }, + { + "epoch": 0.7, + "learning_rate": 0.00023231111464209103, + "loss": 0.7713, + "step": 2940 + }, + { + "epoch": 0.7, + "learning_rate": 0.00023183443235083814, + "loss": 0.7742, + "step": 2960 + }, + { + "epoch": 0.7, + "learning_rate": 0.00023135775005958526, + "loss": 0.7783, + "step": 2980 + }, + { + "epoch": 0.71, + "learning_rate": 0.0002308810677683324, + "loss": 0.7698, + "step": 3000 + }, + { + "epoch": 0.71, + "eval_loss": 0.7747411131858826, + "eval_runtime": 20.0968, + "eval_samples_per_second": 99.519, + "eval_steps_per_second": 3.135, + "step": 3000 + }, + { + "epoch": 0.71, + "learning_rate": 0.0002304043854770795, + "loss": 0.7696, + "step": 3020 + }, + { + "epoch": 0.72, + "learning_rate": 0.00022992770318582662, + "loss": 0.7744, + "step": 3040 + }, + { + "epoch": 0.72, + "learning_rate": 0.00022945102089457373, + "loss": 0.7687, + "step": 3060 + }, + { + "epoch": 0.73, + "learning_rate": 0.00022897433860332084, + "loss": 0.7765, + "step": 3080 + }, + { + "epoch": 0.73, + "learning_rate": 0.000228497656312068, + "loss": 0.7709, + "step": 3100 + }, + { + "epoch": 0.74, + "learning_rate": 0.00022802097402081512, + "loss": 0.773, + "step": 3120 + }, + { + "epoch": 0.74, + "learning_rate": 0.00022754429172956224, + "loss": 0.7862, + "step": 3140 + }, + { + "epoch": 0.75, + "learning_rate": 0.00022706760943830935, + "loss": 0.7668, + "step": 3160 + }, + { + "epoch": 0.75, + "learning_rate": 0.00022659092714705646, + "loss": 0.7816, + "step": 3180 + }, + { + "epoch": 0.76, + "learning_rate": 0.00022611424485580357, + "loss": 0.7831, + "step": 3200 + }, + { + "epoch": 0.76, + "eval_loss": 0.7719215154647827, + "eval_runtime": 19.6387, + "eval_samples_per_second": 101.84, + "eval_steps_per_second": 3.208, + "step": 3200 + }, + { + "epoch": 0.76, + "learning_rate": 0.0002256375625645507, + "loss": 0.7723, + "step": 3220 + }, + { + "epoch": 0.77, + "learning_rate": 0.00022516088027329782, + "loss": 0.7727, + "step": 3240 + }, + { + "epoch": 0.77, + "learning_rate": 0.00022468419798204493, + "loss": 0.7719, + "step": 3260 + }, + { + "epoch": 0.78, + "learning_rate": 0.00022420751569079207, + "loss": 0.7796, + "step": 3280 + }, + { + "epoch": 0.78, + "learning_rate": 0.0002237308333995392, + "loss": 0.7685, + "step": 3300 + }, + { + "epoch": 0.79, + "learning_rate": 0.00022325415110828633, + "loss": 0.7725, + "step": 3320 + }, + { + "epoch": 0.79, + "learning_rate": 0.00022277746881703344, + "loss": 0.7638, + "step": 3340 + }, + { + "epoch": 0.79, + "learning_rate": 0.00022230078652578055, + "loss": 0.7771, + "step": 3360 + }, + { + "epoch": 0.8, + "learning_rate": 0.00022182410423452766, + "loss": 0.7689, + "step": 3380 + }, + { + "epoch": 0.8, + "learning_rate": 0.00022134742194327477, + "loss": 0.7797, + "step": 3400 + }, + { + "epoch": 0.8, + "eval_loss": 0.768983006477356, + "eval_runtime": 19.4428, + "eval_samples_per_second": 102.866, + "eval_steps_per_second": 3.24, + "step": 3400 + }, + { + "epoch": 0.81, + "learning_rate": 0.0002208707396520219, + "loss": 0.7734, + "step": 3420 + }, + { + "epoch": 0.81, + "learning_rate": 0.00022039405736076903, + "loss": 0.7719, + "step": 3440 + }, + { + "epoch": 0.82, + "learning_rate": 0.00021991737506951614, + "loss": 0.767, + "step": 3460 + }, + { + "epoch": 0.82, + "learning_rate": 0.00021944069277826328, + "loss": 0.7758, + "step": 3480 + }, + { + "epoch": 0.83, + "learning_rate": 0.0002189640104870104, + "loss": 0.7768, + "step": 3500 + }, + { + "epoch": 0.83, + "learning_rate": 0.00021848732819575753, + "loss": 0.7641, + "step": 3520 + }, + { + "epoch": 0.84, + "learning_rate": 0.00021801064590450464, + "loss": 0.7694, + "step": 3540 + }, + { + "epoch": 0.84, + "learning_rate": 0.00021753396361325175, + "loss": 0.7835, + "step": 3560 + }, + { + "epoch": 0.85, + "learning_rate": 0.00021705728132199886, + "loss": 0.7642, + "step": 3580 + }, + { + "epoch": 0.85, + "learning_rate": 0.00021658059903074598, + "loss": 0.7719, + "step": 3600 + }, + { + "epoch": 0.85, + "eval_loss": 0.7660636305809021, + "eval_runtime": 19.5996, + "eval_samples_per_second": 102.043, + "eval_steps_per_second": 3.214, + "step": 3600 + }, + { + "epoch": 0.86, + "learning_rate": 0.0002161039167394931, + "loss": 0.7723, + "step": 3620 + }, + { + "epoch": 0.86, + "learning_rate": 0.00021562723444824023, + "loss": 0.76, + "step": 3640 + }, + { + "epoch": 0.87, + "learning_rate": 0.00021515055215698734, + "loss": 0.7643, + "step": 3660 + }, + { + "epoch": 0.87, + "learning_rate": 0.00021467386986573448, + "loss": 0.7599, + "step": 3680 + }, + { + "epoch": 0.87, + "learning_rate": 0.0002141971875744816, + "loss": 0.7623, + "step": 3700 + }, + { + "epoch": 0.88, + "learning_rate": 0.0002137205052832287, + "loss": 0.7621, + "step": 3720 + }, + { + "epoch": 0.88, + "learning_rate": 0.00021324382299197584, + "loss": 0.7691, + "step": 3740 + }, + { + "epoch": 0.89, + "learning_rate": 0.00021276714070072295, + "loss": 0.7665, + "step": 3760 + }, + { + "epoch": 0.89, + "learning_rate": 0.00021229045840947007, + "loss": 0.7742, + "step": 3780 + }, + { + "epoch": 0.9, + "learning_rate": 0.00021181377611821718, + "loss": 0.7624, + "step": 3800 + }, + { + "epoch": 0.9, + "eval_loss": 0.7643172740936279, + "eval_runtime": 19.487, + "eval_samples_per_second": 102.633, + "eval_steps_per_second": 3.233, + "step": 3800 + }, + { + "epoch": 0.9, + "learning_rate": 0.0002113370938269643, + "loss": 0.7726, + "step": 3820 + }, + { + "epoch": 0.91, + "learning_rate": 0.0002108604115357114, + "loss": 0.7559, + "step": 3840 + }, + { + "epoch": 0.91, + "learning_rate": 0.00021038372924445857, + "loss": 0.7634, + "step": 3860 + }, + { + "epoch": 0.92, + "learning_rate": 0.00020990704695320568, + "loss": 0.765, + "step": 3880 + }, + { + "epoch": 0.92, + "learning_rate": 0.0002094303646619528, + "loss": 0.7649, + "step": 3900 + }, + { + "epoch": 0.93, + "learning_rate": 0.0002089536823706999, + "loss": 0.763, + "step": 3920 + }, + { + "epoch": 0.93, + "learning_rate": 0.00020847700007944705, + "loss": 0.7679, + "step": 3940 + }, + { + "epoch": 0.94, + "learning_rate": 0.00020800031778819416, + "loss": 0.7644, + "step": 3960 + }, + { + "epoch": 0.94, + "learning_rate": 0.00020752363549694127, + "loss": 0.7655, + "step": 3980 + }, + { + "epoch": 0.95, + "learning_rate": 0.00020704695320568838, + "loss": 0.7681, + "step": 4000 + }, + { + "epoch": 0.95, + "eval_loss": 0.7610963582992554, + "eval_runtime": 19.5269, + "eval_samples_per_second": 102.423, + "eval_steps_per_second": 3.226, + "step": 4000 + }, + { + "epoch": 0.95, + "learning_rate": 0.0002065702709144355, + "loss": 0.7623, + "step": 4020 + }, + { + "epoch": 0.96, + "learning_rate": 0.0002060935886231826, + "loss": 0.7625, + "step": 4040 + }, + { + "epoch": 0.96, + "learning_rate": 0.00020561690633192977, + "loss": 0.7524, + "step": 4060 + }, + { + "epoch": 0.96, + "learning_rate": 0.00020514022404067688, + "loss": 0.764, + "step": 4080 + }, + { + "epoch": 0.97, + "learning_rate": 0.000204663541749424, + "loss": 0.7513, + "step": 4100 + }, + { + "epoch": 0.97, + "learning_rate": 0.0002041868594581711, + "loss": 0.753, + "step": 4120 + }, + { + "epoch": 0.98, + "learning_rate": 0.00020371017716691822, + "loss": 0.7602, + "step": 4140 + }, + { + "epoch": 0.98, + "learning_rate": 0.00020323349487566536, + "loss": 0.7701, + "step": 4160 + }, + { + "epoch": 0.99, + "learning_rate": 0.00020275681258441247, + "loss": 0.7602, + "step": 4180 + }, + { + "epoch": 0.99, + "learning_rate": 0.00020228013029315958, + "loss": 0.7598, + "step": 4200 + }, + { + "epoch": 0.99, + "eval_loss": 0.760128915309906, + "eval_runtime": 19.4387, + "eval_samples_per_second": 102.888, + "eval_steps_per_second": 3.241, + "step": 4200 + }, + { + "epoch": 1.0, + "learning_rate": 0.0002018034480019067, + "loss": 0.7579, + "step": 4220 + }, + { + "epoch": 1.0, + "learning_rate": 0.00020132676571065384, + "loss": 0.7628, + "step": 4240 + }, + { + "epoch": 1.01, + "learning_rate": 0.00020085008341940097, + "loss": 0.7551, + "step": 4260 + }, + { + "epoch": 1.01, + "learning_rate": 0.0002003734011281481, + "loss": 0.7582, + "step": 4280 + }, + { + "epoch": 1.02, + "learning_rate": 0.0001998967188368952, + "loss": 0.7623, + "step": 4300 + }, + { + "epoch": 1.02, + "learning_rate": 0.0001994200365456423, + "loss": 0.7504, + "step": 4320 + }, + { + "epoch": 1.03, + "learning_rate": 0.00019894335425438942, + "loss": 0.7587, + "step": 4340 + }, + { + "epoch": 1.03, + "learning_rate": 0.00019846667196313654, + "loss": 0.7528, + "step": 4360 + }, + { + "epoch": 1.04, + "learning_rate": 0.00019798998967188367, + "loss": 0.754, + "step": 4380 + }, + { + "epoch": 1.04, + "learning_rate": 0.00019751330738063079, + "loss": 0.759, + "step": 4400 + }, + { + "epoch": 1.04, + "eval_loss": 0.7575392127037048, + "eval_runtime": 19.5275, + "eval_samples_per_second": 102.42, + "eval_steps_per_second": 3.226, + "step": 4400 + }, + { + "epoch": 1.05, + "learning_rate": 0.0001970366250893779, + "loss": 0.7592, + "step": 4420 + }, + { + "epoch": 1.05, + "learning_rate": 0.00019655994279812504, + "loss": 0.7548, + "step": 4440 + }, + { + "epoch": 1.05, + "learning_rate": 0.00019608326050687218, + "loss": 0.7632, + "step": 4460 + }, + { + "epoch": 1.06, + "learning_rate": 0.0001956065782156193, + "loss": 0.7472, + "step": 4480 + }, + { + "epoch": 1.06, + "learning_rate": 0.0001951298959243664, + "loss": 0.7496, + "step": 4500 + }, + { + "epoch": 1.07, + "learning_rate": 0.0001946532136331135, + "loss": 0.7549, + "step": 4520 + }, + { + "epoch": 1.07, + "learning_rate": 0.00019417653134186063, + "loss": 0.77, + "step": 4540 + }, + { + "epoch": 1.08, + "learning_rate": 0.00019369984905060774, + "loss": 0.759, + "step": 4560 + }, + { + "epoch": 1.08, + "learning_rate": 0.00019322316675935488, + "loss": 0.7554, + "step": 4580 + }, + { + "epoch": 1.09, + "learning_rate": 0.000192746484468102, + "loss": 0.7577, + "step": 4600 + }, + { + "epoch": 1.09, + "eval_loss": 0.7568497061729431, + "eval_runtime": 19.53, + "eval_samples_per_second": 102.406, + "eval_steps_per_second": 3.226, + "step": 4600 + }, + { + "epoch": 1.09, + "learning_rate": 0.0001922698021768491, + "loss": 0.7617, + "step": 4620 + }, + { + "epoch": 1.1, + "learning_rate": 0.00019179311988559624, + "loss": 0.7551, + "step": 4640 + }, + { + "epoch": 1.1, + "learning_rate": 0.00019131643759434335, + "loss": 0.7482, + "step": 4660 + }, + { + "epoch": 1.11, + "learning_rate": 0.0001908397553030905, + "loss": 0.7516, + "step": 4680 + }, + { + "epoch": 1.11, + "learning_rate": 0.0001903630730118376, + "loss": 0.7555, + "step": 4700 + }, + { + "epoch": 1.12, + "learning_rate": 0.00018988639072058472, + "loss": 0.7605, + "step": 4720 + }, + { + "epoch": 1.12, + "learning_rate": 0.00018940970842933183, + "loss": 0.7506, + "step": 4740 + }, + { + "epoch": 1.13, + "learning_rate": 0.00018893302613807894, + "loss": 0.7622, + "step": 4760 + }, + { + "epoch": 1.13, + "learning_rate": 0.00018845634384682605, + "loss": 0.75, + "step": 4780 + }, + { + "epoch": 1.13, + "learning_rate": 0.0001879796615555732, + "loss": 0.7572, + "step": 4800 + }, + { + "epoch": 1.13, + "eval_loss": 0.7548028826713562, + "eval_runtime": 19.5411, + "eval_samples_per_second": 102.349, + "eval_steps_per_second": 3.224, + "step": 4800 + }, + { + "epoch": 1.14, + "learning_rate": 0.00018750297926432033, + "loss": 0.7427, + "step": 4820 + }, + { + "epoch": 1.14, + "learning_rate": 0.00018702629697306744, + "loss": 0.7489, + "step": 4840 + }, + { + "epoch": 1.15, + "learning_rate": 0.00018654961468181455, + "loss": 0.755, + "step": 4860 + }, + { + "epoch": 1.15, + "learning_rate": 0.00018607293239056167, + "loss": 0.7517, + "step": 4880 + }, + { + "epoch": 1.16, + "learning_rate": 0.0001855962500993088, + "loss": 0.7529, + "step": 4900 + }, + { + "epoch": 1.16, + "learning_rate": 0.00018511956780805592, + "loss": 0.7498, + "step": 4920 + }, + { + "epoch": 1.17, + "learning_rate": 0.00018464288551680303, + "loss": 0.756, + "step": 4940 + }, + { + "epoch": 1.17, + "learning_rate": 0.00018416620322555014, + "loss": 0.7492, + "step": 4960 + }, + { + "epoch": 1.18, + "learning_rate": 0.00018368952093429725, + "loss": 0.7491, + "step": 4980 + }, + { + "epoch": 1.18, + "learning_rate": 0.00018321283864304437, + "loss": 0.7585, + "step": 5000 + }, + { + "epoch": 1.18, + "eval_loss": 0.7538104057312012, + "eval_runtime": 19.6106, + "eval_samples_per_second": 101.986, + "eval_steps_per_second": 3.213, + "step": 5000 + }, + { + "epoch": 1.19, + "learning_rate": 0.00018273615635179153, + "loss": 0.7531, + "step": 5020 + }, + { + "epoch": 1.19, + "learning_rate": 0.00018225947406053865, + "loss": 0.7511, + "step": 5040 + }, + { + "epoch": 1.2, + "learning_rate": 0.00018178279176928576, + "loss": 0.7541, + "step": 5060 + }, + { + "epoch": 1.2, + "learning_rate": 0.00018130610947803287, + "loss": 0.7465, + "step": 5080 + }, + { + "epoch": 1.21, + "learning_rate": 0.00018082942718678, + "loss": 0.7403, + "step": 5100 + }, + { + "epoch": 1.21, + "learning_rate": 0.00018035274489552712, + "loss": 0.749, + "step": 5120 + }, + { + "epoch": 1.22, + "learning_rate": 0.00017987606260427423, + "loss": 0.7548, + "step": 5140 + }, + { + "epoch": 1.22, + "learning_rate": 0.00017939938031302134, + "loss": 0.7443, + "step": 5160 + }, + { + "epoch": 1.22, + "learning_rate": 0.00017892269802176846, + "loss": 0.7461, + "step": 5180 + }, + { + "epoch": 1.23, + "learning_rate": 0.00017844601573051557, + "loss": 0.7511, + "step": 5200 + }, + { + "epoch": 1.23, + "eval_loss": 0.7509217262268066, + "eval_runtime": 19.5437, + "eval_samples_per_second": 102.335, + "eval_steps_per_second": 3.224, + "step": 5200 + }, + { + "epoch": 1.23, + "learning_rate": 0.00017796933343926274, + "loss": 0.7562, + "step": 5220 + }, + { + "epoch": 1.24, + "learning_rate": 0.00017749265114800985, + "loss": 0.7489, + "step": 5240 + }, + { + "epoch": 1.24, + "learning_rate": 0.00017701596885675696, + "loss": 0.7499, + "step": 5260 + }, + { + "epoch": 1.25, + "learning_rate": 0.00017653928656550407, + "loss": 0.7519, + "step": 5280 + }, + { + "epoch": 1.25, + "learning_rate": 0.00017606260427425118, + "loss": 0.7536, + "step": 5300 + }, + { + "epoch": 1.26, + "learning_rate": 0.00017558592198299832, + "loss": 0.7536, + "step": 5320 + }, + { + "epoch": 1.26, + "learning_rate": 0.00017510923969174544, + "loss": 0.7492, + "step": 5340 + }, + { + "epoch": 1.27, + "learning_rate": 0.00017463255740049255, + "loss": 0.7454, + "step": 5360 + }, + { + "epoch": 1.27, + "learning_rate": 0.00017415587510923966, + "loss": 0.7528, + "step": 5380 + }, + { + "epoch": 1.28, + "learning_rate": 0.0001736791928179868, + "loss": 0.7409, + "step": 5400 + }, + { + "epoch": 1.28, + "eval_loss": 0.7497395873069763, + "eval_runtime": 19.5671, + "eval_samples_per_second": 102.212, + "eval_steps_per_second": 3.22, + "step": 5400 + }, + { + "epoch": 1.28, + "learning_rate": 0.00017320251052673394, + "loss": 0.7434, + "step": 5420 + }, + { + "epoch": 1.29, + "learning_rate": 0.00017272582823548105, + "loss": 0.7543, + "step": 5440 + }, + { + "epoch": 1.29, + "learning_rate": 0.00017224914594422816, + "loss": 0.7457, + "step": 5460 + }, + { + "epoch": 1.3, + "learning_rate": 0.00017177246365297527, + "loss": 0.7439, + "step": 5480 + }, + { + "epoch": 1.3, + "learning_rate": 0.0001712957813617224, + "loss": 0.7412, + "step": 5500 + }, + { + "epoch": 1.31, + "learning_rate": 0.0001708190990704695, + "loss": 0.7409, + "step": 5520 + }, + { + "epoch": 1.31, + "learning_rate": 0.00017034241677921664, + "loss": 0.7473, + "step": 5540 + }, + { + "epoch": 1.31, + "learning_rate": 0.00016986573448796375, + "loss": 0.7486, + "step": 5560 + }, + { + "epoch": 1.32, + "learning_rate": 0.00016938905219671086, + "loss": 0.7439, + "step": 5580 + }, + { + "epoch": 1.32, + "learning_rate": 0.000168912369905458, + "loss": 0.7524, + "step": 5600 + }, + { + "epoch": 1.32, + "eval_loss": 0.7480019330978394, + "eval_runtime": 19.5018, + "eval_samples_per_second": 102.555, + "eval_steps_per_second": 3.23, + "step": 5600 + }, + { + "epoch": 1.33, + "learning_rate": 0.00016843568761420514, + "loss": 0.7464, + "step": 5620 + }, + { + "epoch": 1.33, + "learning_rate": 0.00016795900532295225, + "loss": 0.7511, + "step": 5640 + }, + { + "epoch": 1.34, + "learning_rate": 0.00016748232303169936, + "loss": 0.7423, + "step": 5660 + }, + { + "epoch": 1.34, + "learning_rate": 0.00016700564074044648, + "loss": 0.7422, + "step": 5680 + }, + { + "epoch": 1.35, + "learning_rate": 0.0001665289584491936, + "loss": 0.742, + "step": 5700 + }, + { + "epoch": 1.35, + "learning_rate": 0.0001660522761579407, + "loss": 0.7421, + "step": 5720 + }, + { + "epoch": 1.36, + "learning_rate": 0.00016557559386668784, + "loss": 0.749, + "step": 5740 + }, + { + "epoch": 1.36, + "learning_rate": 0.00016509891157543495, + "loss": 0.7432, + "step": 5760 + }, + { + "epoch": 1.37, + "learning_rate": 0.0001646222292841821, + "loss": 0.7426, + "step": 5780 + }, + { + "epoch": 1.37, + "learning_rate": 0.0001641455469929292, + "loss": 0.7543, + "step": 5800 + }, + { + "epoch": 1.37, + "eval_loss": 0.7470090389251709, + "eval_runtime": 19.5563, + "eval_samples_per_second": 102.269, + "eval_steps_per_second": 3.221, + "step": 5800 + }, + { + "epoch": 1.38, + "learning_rate": 0.00016366886470167632, + "loss": 0.7451, + "step": 5820 + }, + { + "epoch": 1.38, + "learning_rate": 0.00016319218241042346, + "loss": 0.7481, + "step": 5840 + }, + { + "epoch": 1.39, + "learning_rate": 0.00016271550011917057, + "loss": 0.7381, + "step": 5860 + }, + { + "epoch": 1.39, + "learning_rate": 0.00016223881782791768, + "loss": 0.7461, + "step": 5880 + }, + { + "epoch": 1.4, + "learning_rate": 0.0001617621355366648, + "loss": 0.7467, + "step": 5900 + }, + { + "epoch": 1.4, + "learning_rate": 0.0001612854532454119, + "loss": 0.745, + "step": 5920 + }, + { + "epoch": 1.4, + "learning_rate": 0.00016080877095415902, + "loss": 0.745, + "step": 5940 + }, + { + "epoch": 1.41, + "learning_rate": 0.00016033208866290615, + "loss": 0.7386, + "step": 5960 + }, + { + "epoch": 1.41, + "learning_rate": 0.0001598554063716533, + "loss": 0.7363, + "step": 5980 + }, + { + "epoch": 1.42, + "learning_rate": 0.0001593787240804004, + "loss": 0.7412, + "step": 6000 + }, + { + "epoch": 1.42, + "eval_loss": 0.7454522848129272, + "eval_runtime": 19.555, + "eval_samples_per_second": 102.276, + "eval_steps_per_second": 3.222, + "step": 6000 + }, + { + "epoch": 1.42, + "learning_rate": 0.00015890204178914752, + "loss": 0.7501, + "step": 6020 + }, + { + "epoch": 1.43, + "learning_rate": 0.00015842535949789463, + "loss": 0.7528, + "step": 6040 + }, + { + "epoch": 1.43, + "learning_rate": 0.00015794867720664177, + "loss": 0.7373, + "step": 6060 + }, + { + "epoch": 1.44, + "learning_rate": 0.00015747199491538888, + "loss": 0.7451, + "step": 6080 + }, + { + "epoch": 1.44, + "learning_rate": 0.000156995312624136, + "loss": 0.7384, + "step": 6100 + }, + { + "epoch": 1.45, + "learning_rate": 0.0001565186303328831, + "loss": 0.7471, + "step": 6120 + }, + { + "epoch": 1.45, + "learning_rate": 0.00015604194804163022, + "loss": 0.7454, + "step": 6140 + }, + { + "epoch": 1.46, + "learning_rate": 0.00015556526575037733, + "loss": 0.7415, + "step": 6160 + }, + { + "epoch": 1.46, + "learning_rate": 0.0001550885834591245, + "loss": 0.7514, + "step": 6180 + }, + { + "epoch": 1.47, + "learning_rate": 0.0001546119011678716, + "loss": 0.7343, + "step": 6200 + }, + { + "epoch": 1.47, + "eval_loss": 0.7457332611083984, + "eval_runtime": 19.5673, + "eval_samples_per_second": 102.212, + "eval_steps_per_second": 3.22, + "step": 6200 + }, + { + "epoch": 1.47, + "learning_rate": 0.00015413521887661872, + "loss": 0.7452, + "step": 6220 + }, + { + "epoch": 1.48, + "learning_rate": 0.00015365853658536583, + "loss": 0.7456, + "step": 6240 + }, + { + "epoch": 1.48, + "learning_rate": 0.00015318185429411297, + "loss": 0.7326, + "step": 6260 + }, + { + "epoch": 1.48, + "learning_rate": 0.00015270517200286008, + "loss": 0.7431, + "step": 6280 + }, + { + "epoch": 1.49, + "learning_rate": 0.0001522284897116072, + "loss": 0.7419, + "step": 6300 + }, + { + "epoch": 1.49, + "learning_rate": 0.0001517518074203543, + "loss": 0.7375, + "step": 6320 + }, + { + "epoch": 1.5, + "learning_rate": 0.00015127512512910142, + "loss": 0.7419, + "step": 6340 + }, + { + "epoch": 1.5, + "learning_rate": 0.0001507984428378486, + "loss": 0.7431, + "step": 6360 + }, + { + "epoch": 1.51, + "learning_rate": 0.0001503217605465957, + "loss": 0.7412, + "step": 6380 + }, + { + "epoch": 1.51, + "learning_rate": 0.00014984507825534278, + "loss": 0.7447, + "step": 6400 + }, + { + "epoch": 1.51, + "eval_loss": 0.7441338896751404, + "eval_runtime": 19.4509, + "eval_samples_per_second": 102.823, + "eval_steps_per_second": 3.239, + "step": 6400 + }, + { + "epoch": 1.52, + "learning_rate": 0.00014936839596408992, + "loss": 0.7436, + "step": 6420 + }, + { + "epoch": 1.52, + "learning_rate": 0.00014889171367283704, + "loss": 0.7402, + "step": 6440 + }, + { + "epoch": 1.53, + "learning_rate": 0.00014841503138158415, + "loss": 0.7454, + "step": 6460 + }, + { + "epoch": 1.53, + "learning_rate": 0.0001479383490903313, + "loss": 0.738, + "step": 6480 + }, + { + "epoch": 1.54, + "learning_rate": 0.0001474616667990784, + "loss": 0.7396, + "step": 6500 + }, + { + "epoch": 1.54, + "learning_rate": 0.00014698498450782554, + "loss": 0.7333, + "step": 6520 + }, + { + "epoch": 1.55, + "learning_rate": 0.00014650830221657265, + "loss": 0.7482, + "step": 6540 + }, + { + "epoch": 1.55, + "learning_rate": 0.00014603161992531976, + "loss": 0.7376, + "step": 6560 + }, + { + "epoch": 1.56, + "learning_rate": 0.00014555493763406687, + "loss": 0.7369, + "step": 6580 + }, + { + "epoch": 1.56, + "learning_rate": 0.00014507825534281401, + "loss": 0.7347, + "step": 6600 + }, + { + "epoch": 1.56, + "eval_loss": 0.7425362467765808, + "eval_runtime": 19.5248, + "eval_samples_per_second": 102.434, + "eval_steps_per_second": 3.227, + "step": 6600 + }, + { + "epoch": 1.57, + "learning_rate": 0.00014460157305156113, + "loss": 0.7446, + "step": 6620 + }, + { + "epoch": 1.57, + "learning_rate": 0.00014412489076030824, + "loss": 0.7343, + "step": 6640 + }, + { + "epoch": 1.57, + "learning_rate": 0.00014364820846905535, + "loss": 0.7468, + "step": 6660 + }, + { + "epoch": 1.58, + "learning_rate": 0.0001431715261778025, + "loss": 0.749, + "step": 6680 + }, + { + "epoch": 1.58, + "learning_rate": 0.0001426948438865496, + "loss": 0.7401, + "step": 6700 + }, + { + "epoch": 1.59, + "learning_rate": 0.0001422181615952967, + "loss": 0.7364, + "step": 6720 + }, + { + "epoch": 1.59, + "learning_rate": 0.00014174147930404385, + "loss": 0.7442, + "step": 6740 + }, + { + "epoch": 1.6, + "learning_rate": 0.00014126479701279096, + "loss": 0.7385, + "step": 6760 + }, + { + "epoch": 1.6, + "learning_rate": 0.00014078811472153808, + "loss": 0.7412, + "step": 6780 + }, + { + "epoch": 1.61, + "learning_rate": 0.00014031143243028522, + "loss": 0.7377, + "step": 6800 + }, + { + "epoch": 1.61, + "eval_loss": 0.7418386936187744, + "eval_runtime": 19.5679, + "eval_samples_per_second": 102.208, + "eval_steps_per_second": 3.22, + "step": 6800 + }, + { + "epoch": 1.61, + "learning_rate": 0.00013983475013903233, + "loss": 0.7432, + "step": 6820 + }, + { + "epoch": 1.62, + "learning_rate": 0.00013935806784777944, + "loss": 0.7379, + "step": 6840 + }, + { + "epoch": 1.62, + "learning_rate": 0.00013888138555652655, + "loss": 0.7346, + "step": 6860 + }, + { + "epoch": 1.63, + "learning_rate": 0.00013840470326527366, + "loss": 0.7373, + "step": 6880 + }, + { + "epoch": 1.63, + "learning_rate": 0.0001379280209740208, + "loss": 0.7403, + "step": 6900 + }, + { + "epoch": 1.64, + "learning_rate": 0.00013745133868276792, + "loss": 0.7477, + "step": 6920 + }, + { + "epoch": 1.64, + "learning_rate": 0.00013697465639151506, + "loss": 0.7343, + "step": 6940 + }, + { + "epoch": 1.65, + "learning_rate": 0.00013649797410026217, + "loss": 0.7419, + "step": 6960 + }, + { + "epoch": 1.65, + "learning_rate": 0.00013602129180900928, + "loss": 0.7327, + "step": 6980 + }, + { + "epoch": 1.66, + "learning_rate": 0.00013554460951775642, + "loss": 0.7398, + "step": 7000 + }, + { + "epoch": 1.66, + "eval_loss": 0.7402775883674622, + "eval_runtime": 19.5554, + "eval_samples_per_second": 102.274, + "eval_steps_per_second": 3.222, + "step": 7000 + }, + { + "epoch": 1.66, + "learning_rate": 0.00013506792722650353, + "loss": 0.7311, + "step": 7020 + }, + { + "epoch": 1.66, + "learning_rate": 0.00013459124493525064, + "loss": 0.7319, + "step": 7040 + }, + { + "epoch": 1.67, + "learning_rate": 0.00013411456264399775, + "loss": 0.7315, + "step": 7060 + }, + { + "epoch": 1.67, + "learning_rate": 0.0001336378803527449, + "loss": 0.7329, + "step": 7080 + }, + { + "epoch": 1.68, + "learning_rate": 0.000133161198061492, + "loss": 0.7471, + "step": 7100 + }, + { + "epoch": 1.68, + "learning_rate": 0.00013268451577023912, + "loss": 0.7446, + "step": 7120 + }, + { + "epoch": 1.69, + "learning_rate": 0.00013220783347898623, + "loss": 0.7359, + "step": 7140 + }, + { + "epoch": 1.69, + "learning_rate": 0.00013173115118773337, + "loss": 0.7348, + "step": 7160 + }, + { + "epoch": 1.7, + "learning_rate": 0.00013125446889648048, + "loss": 0.7331, + "step": 7180 + }, + { + "epoch": 1.7, + "learning_rate": 0.00013077778660522762, + "loss": 0.7385, + "step": 7200 + }, + { + "epoch": 1.7, + "eval_loss": 0.7401012182235718, + "eval_runtime": 19.7831, + "eval_samples_per_second": 101.096, + "eval_steps_per_second": 3.185, + "step": 7200 + }, + { + "epoch": 1.71, + "learning_rate": 0.00013030110431397473, + "loss": 0.744, + "step": 7220 + }, + { + "epoch": 1.71, + "learning_rate": 0.00012982442202272185, + "loss": 0.7327, + "step": 7240 + }, + { + "epoch": 1.72, + "learning_rate": 0.00012934773973146896, + "loss": 0.7384, + "step": 7260 + }, + { + "epoch": 1.72, + "learning_rate": 0.0001288710574402161, + "loss": 0.7399, + "step": 7280 + }, + { + "epoch": 1.73, + "learning_rate": 0.0001283943751489632, + "loss": 0.7376, + "step": 7300 + }, + { + "epoch": 1.73, + "learning_rate": 0.00012791769285771032, + "loss": 0.7416, + "step": 7320 + }, + { + "epoch": 1.74, + "learning_rate": 0.00012744101056645743, + "loss": 0.7299, + "step": 7340 + }, + { + "epoch": 1.74, + "learning_rate": 0.00012696432827520455, + "loss": 0.7389, + "step": 7360 + }, + { + "epoch": 1.75, + "learning_rate": 0.00012648764598395168, + "loss": 0.7295, + "step": 7380 + }, + { + "epoch": 1.75, + "learning_rate": 0.0001260109636926988, + "loss": 0.7389, + "step": 7400 + }, + { + "epoch": 1.75, + "eval_loss": 0.7385362386703491, + "eval_runtime": 19.6728, + "eval_samples_per_second": 101.663, + "eval_steps_per_second": 3.202, + "step": 7400 + }, + { + "epoch": 1.75, + "learning_rate": 0.00012553428140144594, + "loss": 0.7346, + "step": 7420 + }, + { + "epoch": 1.76, + "learning_rate": 0.00012505759911019305, + "loss": 0.7357, + "step": 7440 + }, + { + "epoch": 1.76, + "learning_rate": 0.00012458091681894016, + "loss": 0.7295, + "step": 7460 + }, + { + "epoch": 1.77, + "learning_rate": 0.0001241042345276873, + "loss": 0.7418, + "step": 7480 + }, + { + "epoch": 1.77, + "learning_rate": 0.0001236275522364344, + "loss": 0.7248, + "step": 7500 + }, + { + "epoch": 1.78, + "learning_rate": 0.00012315086994518152, + "loss": 0.7326, + "step": 7520 + }, + { + "epoch": 1.78, + "learning_rate": 0.00012267418765392864, + "loss": 0.7422, + "step": 7540 + }, + { + "epoch": 1.79, + "learning_rate": 0.00012219750536267577, + "loss": 0.7376, + "step": 7560 + }, + { + "epoch": 1.79, + "learning_rate": 0.00012172082307142289, + "loss": 0.7358, + "step": 7580 + }, + { + "epoch": 1.8, + "learning_rate": 0.00012124414078017001, + "loss": 0.7337, + "step": 7600 + }, + { + "epoch": 1.8, + "eval_loss": 0.737734854221344, + "eval_runtime": 19.8317, + "eval_samples_per_second": 100.849, + "eval_steps_per_second": 3.177, + "step": 7600 + }, + { + "epoch": 1.8, + "learning_rate": 0.00012076745848891712, + "loss": 0.7318, + "step": 7620 + }, + { + "epoch": 1.81, + "learning_rate": 0.00012029077619766424, + "loss": 0.7356, + "step": 7640 + }, + { + "epoch": 1.81, + "learning_rate": 0.00011981409390641138, + "loss": 0.7355, + "step": 7660 + }, + { + "epoch": 1.82, + "learning_rate": 0.00011933741161515849, + "loss": 0.74, + "step": 7680 + }, + { + "epoch": 1.82, + "learning_rate": 0.0001188607293239056, + "loss": 0.7342, + "step": 7700 + }, + { + "epoch": 1.83, + "learning_rate": 0.00011838404703265273, + "loss": 0.7368, + "step": 7720 + }, + { + "epoch": 1.83, + "learning_rate": 0.00011790736474139984, + "loss": 0.7337, + "step": 7740 + }, + { + "epoch": 1.83, + "learning_rate": 0.00011743068245014698, + "loss": 0.7317, + "step": 7760 + }, + { + "epoch": 1.84, + "learning_rate": 0.00011695400015889409, + "loss": 0.738, + "step": 7780 + }, + { + "epoch": 1.84, + "learning_rate": 0.0001164773178676412, + "loss": 0.7375, + "step": 7800 + }, + { + "epoch": 1.84, + "eval_loss": 0.7366506457328796, + "eval_runtime": 19.9586, + "eval_samples_per_second": 100.208, + "eval_steps_per_second": 3.157, + "step": 7800 + }, + { + "epoch": 1.85, + "learning_rate": 0.00011600063557638833, + "loss": 0.7349, + "step": 7820 + }, + { + "epoch": 1.85, + "learning_rate": 0.00011552395328513544, + "loss": 0.733, + "step": 7840 + }, + { + "epoch": 1.86, + "learning_rate": 0.00011504727099388258, + "loss": 0.7277, + "step": 7860 + }, + { + "epoch": 1.86, + "learning_rate": 0.00011457058870262969, + "loss": 0.7235, + "step": 7880 + }, + { + "epoch": 1.87, + "learning_rate": 0.0001140939064113768, + "loss": 0.7405, + "step": 7900 + }, + { + "epoch": 1.87, + "learning_rate": 0.00011361722412012393, + "loss": 0.7378, + "step": 7920 + }, + { + "epoch": 1.88, + "learning_rate": 0.00011314054182887104, + "loss": 0.7292, + "step": 7940 + }, + { + "epoch": 1.88, + "learning_rate": 0.00011266385953761818, + "loss": 0.7427, + "step": 7960 + }, + { + "epoch": 1.89, + "learning_rate": 0.00011218717724636529, + "loss": 0.7313, + "step": 7980 + }, + { + "epoch": 1.89, + "learning_rate": 0.0001117104949551124, + "loss": 0.7252, + "step": 8000 + }, + { + "epoch": 1.89, + "eval_loss": 0.736083984375, + "eval_runtime": 19.7958, + "eval_samples_per_second": 101.031, + "eval_steps_per_second": 3.182, + "step": 8000 + }, + { + "epoch": 1.9, + "learning_rate": 0.00011123381266385953, + "loss": 0.7268, + "step": 8020 + }, + { + "epoch": 1.9, + "learning_rate": 0.00011075713037260666, + "loss": 0.729, + "step": 8040 + }, + { + "epoch": 1.91, + "learning_rate": 0.00011028044808135377, + "loss": 0.7358, + "step": 8060 + }, + { + "epoch": 1.91, + "learning_rate": 0.00010980376579010089, + "loss": 0.7408, + "step": 8080 + }, + { + "epoch": 1.92, + "learning_rate": 0.000109327083498848, + "loss": 0.73, + "step": 8100 + }, + { + "epoch": 1.92, + "learning_rate": 0.00010887423532215777, + "loss": 0.7298, + "step": 8120 + }, + { + "epoch": 1.92, + "learning_rate": 0.0001083975530309049, + "loss": 0.7324, + "step": 8140 + }, + { + "epoch": 1.93, + "learning_rate": 0.00010792087073965201, + "loss": 0.7296, + "step": 8160 + }, + { + "epoch": 1.93, + "learning_rate": 0.00010744418844839912, + "loss": 0.7346, + "step": 8180 + }, + { + "epoch": 1.94, + "learning_rate": 0.00010696750615714626, + "loss": 0.7281, + "step": 8200 + }, + { + "epoch": 1.94, + "eval_loss": 0.7352190613746643, + "eval_runtime": 19.6635, + "eval_samples_per_second": 101.711, + "eval_steps_per_second": 3.204, + "step": 8200 + }, + { + "epoch": 1.94, + "learning_rate": 0.00010649082386589337, + "loss": 0.7377, + "step": 8220 + }, + { + "epoch": 1.95, + "learning_rate": 0.0001060141415746405, + "loss": 0.7281, + "step": 8240 + }, + { + "epoch": 1.95, + "learning_rate": 0.00010553745928338761, + "loss": 0.7251, + "step": 8260 + }, + { + "epoch": 1.96, + "learning_rate": 0.00010506077699213472, + "loss": 0.7331, + "step": 8280 + }, + { + "epoch": 1.96, + "learning_rate": 0.00010458409470088186, + "loss": 0.7432, + "step": 8300 + }, + { + "epoch": 1.97, + "learning_rate": 0.00010410741240962897, + "loss": 0.7366, + "step": 8320 + }, + { + "epoch": 1.97, + "learning_rate": 0.0001036307301183761, + "loss": 0.7334, + "step": 8340 + }, + { + "epoch": 1.98, + "learning_rate": 0.00010315404782712321, + "loss": 0.7351, + "step": 8360 + }, + { + "epoch": 1.98, + "learning_rate": 0.00010267736553587032, + "loss": 0.7355, + "step": 8380 + }, + { + "epoch": 1.99, + "learning_rate": 0.00010220068324461746, + "loss": 0.7228, + "step": 8400 + }, + { + "epoch": 1.99, + "eval_loss": 0.7341500520706177, + "eval_runtime": 19.6196, + "eval_samples_per_second": 101.939, + "eval_steps_per_second": 3.211, + "step": 8400 + }, + { + "epoch": 1.99, + "learning_rate": 0.00010172400095336457, + "loss": 0.7451, + "step": 8420 + }, + { + "epoch": 2.0, + "learning_rate": 0.00010124731866211169, + "loss": 0.7356, + "step": 8440 + }, + { + "epoch": 2.0, + "learning_rate": 0.00010077063637085881, + "loss": 0.7255, + "step": 8460 + }, + { + "epoch": 2.01, + "learning_rate": 0.00010029395407960592, + "loss": 0.7267, + "step": 8480 + }, + { + "epoch": 2.01, + "learning_rate": 9.981727178835306e-05, + "loss": 0.7291, + "step": 8500 + }, + { + "epoch": 2.01, + "learning_rate": 9.934058949710018e-05, + "loss": 0.7294, + "step": 8520 + }, + { + "epoch": 2.02, + "learning_rate": 9.886390720584729e-05, + "loss": 0.7377, + "step": 8540 + }, + { + "epoch": 2.02, + "learning_rate": 9.838722491459441e-05, + "loss": 0.7324, + "step": 8560 + }, + { + "epoch": 2.03, + "learning_rate": 9.791054262334154e-05, + "loss": 0.7286, + "step": 8580 + }, + { + "epoch": 2.03, + "learning_rate": 9.743386033208867e-05, + "loss": 0.7286, + "step": 8600 + }, + { + "epoch": 2.03, + "eval_loss": 0.734474241733551, + "eval_runtime": 19.5642, + "eval_samples_per_second": 102.228, + "eval_steps_per_second": 3.22, + "step": 8600 + }, + { + "epoch": 2.04, + "learning_rate": 9.695717804083578e-05, + "loss": 0.7304, + "step": 8620 + }, + { + "epoch": 2.04, + "learning_rate": 9.648049574958289e-05, + "loss": 0.7348, + "step": 8640 + }, + { + "epoch": 2.05, + "learning_rate": 9.600381345833002e-05, + "loss": 0.7261, + "step": 8660 + }, + { + "epoch": 2.05, + "learning_rate": 9.552713116707714e-05, + "loss": 0.7313, + "step": 8680 + }, + { + "epoch": 2.06, + "learning_rate": 9.505044887582425e-05, + "loss": 0.7379, + "step": 8700 + }, + { + "epoch": 2.06, + "learning_rate": 9.457376658457138e-05, + "loss": 0.7203, + "step": 8720 + }, + { + "epoch": 2.07, + "learning_rate": 9.409708429331849e-05, + "loss": 0.7306, + "step": 8740 + }, + { + "epoch": 2.07, + "learning_rate": 9.36204020020656e-05, + "loss": 0.7332, + "step": 8760 + }, + { + "epoch": 2.08, + "learning_rate": 9.314371971081274e-05, + "loss": 0.7228, + "step": 8780 + }, + { + "epoch": 2.08, + "learning_rate": 9.266703741955985e-05, + "loss": 0.731, + "step": 8800 + }, + { + "epoch": 2.08, + "eval_loss": 0.7332338690757751, + "eval_runtime": 19.7114, + "eval_samples_per_second": 101.464, + "eval_steps_per_second": 3.196, + "step": 8800 + }, + { + "epoch": 2.09, + "learning_rate": 9.219035512830698e-05, + "loss": 0.7267, + "step": 8820 + }, + { + "epoch": 2.09, + "learning_rate": 9.171367283705409e-05, + "loss": 0.7285, + "step": 8840 + }, + { + "epoch": 2.09, + "learning_rate": 9.12369905458012e-05, + "loss": 0.7214, + "step": 8860 + }, + { + "epoch": 2.1, + "learning_rate": 9.076030825454834e-05, + "loss": 0.7204, + "step": 8880 + }, + { + "epoch": 2.1, + "learning_rate": 9.028362596329546e-05, + "loss": 0.7253, + "step": 8900 + }, + { + "epoch": 2.11, + "learning_rate": 8.980694367204258e-05, + "loss": 0.7253, + "step": 8920 + }, + { + "epoch": 2.11, + "learning_rate": 8.933026138078969e-05, + "loss": 0.7238, + "step": 8940 + }, + { + "epoch": 2.12, + "learning_rate": 8.88535790895368e-05, + "loss": 0.7286, + "step": 8960 + }, + { + "epoch": 2.12, + "learning_rate": 8.837689679828394e-05, + "loss": 0.7385, + "step": 8980 + }, + { + "epoch": 2.13, + "learning_rate": 8.790021450703106e-05, + "loss": 0.7237, + "step": 9000 + }, + { + "epoch": 2.13, + "eval_loss": 0.7329864501953125, + "eval_runtime": 19.7024, + "eval_samples_per_second": 101.51, + "eval_steps_per_second": 3.198, + "step": 9000 + }, + { + "epoch": 2.13, + "learning_rate": 8.742353221577817e-05, + "loss": 0.7311, + "step": 9020 + }, + { + "epoch": 2.14, + "learning_rate": 8.69468499245253e-05, + "loss": 0.7374, + "step": 9040 + }, + { + "epoch": 2.14, + "learning_rate": 8.64701676332724e-05, + "loss": 0.7194, + "step": 9060 + }, + { + "epoch": 2.15, + "learning_rate": 8.599348534201955e-05, + "loss": 0.7237, + "step": 9080 + }, + { + "epoch": 2.15, + "learning_rate": 8.551680305076666e-05, + "loss": 0.7287, + "step": 9100 + }, + { + "epoch": 2.16, + "learning_rate": 8.504012075951377e-05, + "loss": 0.7385, + "step": 9120 + }, + { + "epoch": 2.16, + "learning_rate": 8.45634384682609e-05, + "loss": 0.7319, + "step": 9140 + }, + { + "epoch": 2.17, + "learning_rate": 8.408675617700802e-05, + "loss": 0.7278, + "step": 9160 + }, + { + "epoch": 2.17, + "learning_rate": 8.361007388575515e-05, + "loss": 0.7293, + "step": 9180 + }, + { + "epoch": 2.18, + "learning_rate": 8.313339159450226e-05, + "loss": 0.7232, + "step": 9200 + }, + { + "epoch": 2.18, + "eval_loss": 0.7326176762580872, + "eval_runtime": 20.1581, + "eval_samples_per_second": 99.215, + "eval_steps_per_second": 3.125, + "step": 9200 + }, + { + "epoch": 2.18, + "learning_rate": 8.265670930324937e-05, + "loss": 0.7281, + "step": 9220 + }, + { + "epoch": 2.18, + "learning_rate": 8.21800270119965e-05, + "loss": 0.728, + "step": 9240 + }, + { + "epoch": 2.19, + "learning_rate": 8.170334472074362e-05, + "loss": 0.728, + "step": 9260 + }, + { + "epoch": 2.19, + "learning_rate": 8.122666242949073e-05, + "loss": 0.7221, + "step": 9280 + }, + { + "epoch": 2.2, + "learning_rate": 8.074998013823786e-05, + "loss": 0.7242, + "step": 9300 + }, + { + "epoch": 2.2, + "learning_rate": 8.027329784698497e-05, + "loss": 0.7306, + "step": 9320 + }, + { + "epoch": 2.21, + "learning_rate": 7.979661555573208e-05, + "loss": 0.7218, + "step": 9340 + }, + { + "epoch": 2.21, + "learning_rate": 7.931993326447922e-05, + "loss": 0.7289, + "step": 9360 + }, + { + "epoch": 2.22, + "learning_rate": 7.884325097322634e-05, + "loss": 0.7177, + "step": 9380 + }, + { + "epoch": 2.22, + "learning_rate": 7.836656868197346e-05, + "loss": 0.7265, + "step": 9400 + }, + { + "epoch": 2.22, + "eval_loss": 0.7311453819274902, + "eval_runtime": 19.9076, + "eval_samples_per_second": 100.464, + "eval_steps_per_second": 3.165, + "step": 9400 + }, + { + "epoch": 2.23, + "learning_rate": 7.788988639072057e-05, + "loss": 0.7269, + "step": 9420 + }, + { + "epoch": 2.23, + "learning_rate": 7.741320409946769e-05, + "loss": 0.7275, + "step": 9440 + }, + { + "epoch": 2.24, + "learning_rate": 7.693652180821483e-05, + "loss": 0.7317, + "step": 9460 + }, + { + "epoch": 2.24, + "learning_rate": 7.645983951696194e-05, + "loss": 0.7344, + "step": 9480 + }, + { + "epoch": 2.25, + "learning_rate": 7.598315722570906e-05, + "loss": 0.7263, + "step": 9500 + }, + { + "epoch": 2.25, + "learning_rate": 7.550647493445617e-05, + "loss": 0.7299, + "step": 9520 + }, + { + "epoch": 2.26, + "learning_rate": 7.502979264320329e-05, + "loss": 0.724, + "step": 9540 + }, + { + "epoch": 2.26, + "learning_rate": 7.455311035195041e-05, + "loss": 0.7266, + "step": 9560 + }, + { + "epoch": 2.27, + "learning_rate": 7.407642806069754e-05, + "loss": 0.7299, + "step": 9580 + }, + { + "epoch": 2.27, + "learning_rate": 7.359974576944465e-05, + "loss": 0.7236, + "step": 9600 + }, + { + "epoch": 2.27, + "eval_loss": 0.7311366200447083, + "eval_runtime": 20.0053, + "eval_samples_per_second": 99.973, + "eval_steps_per_second": 3.149, + "step": 9600 + }, + { + "epoch": 2.27, + "learning_rate": 7.314689759275442e-05, + "loss": 0.7252, + "step": 9620 + }, + { + "epoch": 2.28, + "learning_rate": 7.267021530150154e-05, + "loss": 0.7252, + "step": 9640 + }, + { + "epoch": 2.28, + "learning_rate": 7.219353301024865e-05, + "loss": 0.7188, + "step": 9660 + }, + { + "epoch": 2.29, + "learning_rate": 7.171685071899578e-05, + "loss": 0.7243, + "step": 9680 + }, + { + "epoch": 2.29, + "learning_rate": 7.12401684277429e-05, + "loss": 0.7298, + "step": 9700 + }, + { + "epoch": 2.3, + "learning_rate": 7.076348613649002e-05, + "loss": 0.7325, + "step": 9720 + }, + { + "epoch": 2.3, + "learning_rate": 7.028680384523714e-05, + "loss": 0.7286, + "step": 9740 + }, + { + "epoch": 2.31, + "learning_rate": 6.981012155398426e-05, + "loss": 0.7201, + "step": 9760 + }, + { + "epoch": 2.31, + "learning_rate": 6.933343926273138e-05, + "loss": 0.7184, + "step": 9780 + }, + { + "epoch": 2.32, + "learning_rate": 6.885675697147851e-05, + "loss": 0.7291, + "step": 9800 + }, + { + "epoch": 2.32, + "eval_loss": 0.7308618426322937, + "eval_runtime": 19.7965, + "eval_samples_per_second": 101.028, + "eval_steps_per_second": 3.182, + "step": 9800 + }, + { + "epoch": 2.32, + "learning_rate": 6.838007468022563e-05, + "loss": 0.7318, + "step": 9820 + }, + { + "epoch": 2.33, + "learning_rate": 6.790339238897274e-05, + "loss": 0.7227, + "step": 9840 + }, + { + "epoch": 2.33, + "learning_rate": 6.742671009771986e-05, + "loss": 0.7377, + "step": 9860 + }, + { + "epoch": 2.34, + "learning_rate": 6.695002780646698e-05, + "loss": 0.7367, + "step": 9880 + }, + { + "epoch": 2.34, + "learning_rate": 6.647334551521411e-05, + "loss": 0.7218, + "step": 9900 + }, + { + "epoch": 2.35, + "learning_rate": 6.599666322396122e-05, + "loss": 0.7282, + "step": 9920 + }, + { + "epoch": 2.35, + "learning_rate": 6.551998093270835e-05, + "loss": 0.7231, + "step": 9940 + }, + { + "epoch": 2.36, + "learning_rate": 6.504329864145546e-05, + "loss": 0.7257, + "step": 9960 + }, + { + "epoch": 2.36, + "learning_rate": 6.456661635020258e-05, + "loss": 0.7275, + "step": 9980 + }, + { + "epoch": 2.36, + "learning_rate": 6.40899340589497e-05, + "loss": 0.725, + "step": 10000 + }, + { + "epoch": 2.36, + "eval_loss": 0.7301817536354065, + "eval_runtime": 19.7914, + "eval_samples_per_second": 101.054, + "eval_steps_per_second": 3.183, + "step": 10000 + }, + { + "epoch": 2.37, + "learning_rate": 6.361325176769682e-05, + "loss": 0.72, + "step": 10020 + }, + { + "epoch": 2.37, + "learning_rate": 6.313656947644395e-05, + "loss": 0.7267, + "step": 10040 + }, + { + "epoch": 2.38, + "learning_rate": 6.265988718519107e-05, + "loss": 0.7276, + "step": 10060 + }, + { + "epoch": 2.38, + "learning_rate": 6.218320489393818e-05, + "loss": 0.7262, + "step": 10080 + }, + { + "epoch": 2.39, + "learning_rate": 6.17065226026853e-05, + "loss": 0.7149, + "step": 10100 + }, + { + "epoch": 2.39, + "learning_rate": 6.122984031143242e-05, + "loss": 0.7305, + "step": 10120 + }, + { + "epoch": 2.4, + "learning_rate": 6.075315802017954e-05, + "loss": 0.7314, + "step": 10140 + }, + { + "epoch": 2.4, + "learning_rate": 6.027647572892667e-05, + "loss": 0.7154, + "step": 10160 + }, + { + "epoch": 2.41, + "learning_rate": 5.9799793437673786e-05, + "loss": 0.7263, + "step": 10180 + }, + { + "epoch": 2.41, + "learning_rate": 5.93231111464209e-05, + "loss": 0.7203, + "step": 10200 + }, + { + "epoch": 2.41, + "eval_loss": 0.7294782996177673, + "eval_runtime": 19.7824, + "eval_samples_per_second": 101.1, + "eval_steps_per_second": 3.185, + "step": 10200 + }, + { + "epoch": 2.42, + "learning_rate": 5.8846428855168024e-05, + "loss": 0.7208, + "step": 10220 + }, + { + "epoch": 2.42, + "learning_rate": 5.836974656391514e-05, + "loss": 0.7266, + "step": 10240 + }, + { + "epoch": 2.43, + "learning_rate": 5.789306427266227e-05, + "loss": 0.7285, + "step": 10260 + }, + { + "epoch": 2.43, + "learning_rate": 5.741638198140939e-05, + "loss": 0.7215, + "step": 10280 + }, + { + "epoch": 2.44, + "learning_rate": 5.6939699690156506e-05, + "loss": 0.7203, + "step": 10300 + }, + { + "epoch": 2.44, + "learning_rate": 5.6463017398903625e-05, + "loss": 0.7314, + "step": 10320 + }, + { + "epoch": 2.44, + "learning_rate": 5.5986335107650744e-05, + "loss": 0.7394, + "step": 10340 + }, + { + "epoch": 2.45, + "learning_rate": 5.550965281639787e-05, + "loss": 0.7138, + "step": 10360 + }, + { + "epoch": 2.45, + "learning_rate": 5.503297052514498e-05, + "loss": 0.721, + "step": 10380 + }, + { + "epoch": 2.46, + "learning_rate": 5.455628823389211e-05, + "loss": 0.7199, + "step": 10400 + }, + { + "epoch": 2.46, + "eval_loss": 0.728507936000824, + "eval_runtime": 19.7761, + "eval_samples_per_second": 101.132, + "eval_steps_per_second": 3.186, + "step": 10400 + }, + { + "epoch": 2.46, + "learning_rate": 5.4079605942639226e-05, + "loss": 0.7228, + "step": 10420 + }, + { + "epoch": 2.47, + "learning_rate": 5.3602923651386345e-05, + "loss": 0.7193, + "step": 10440 + }, + { + "epoch": 2.47, + "learning_rate": 5.3126241360133464e-05, + "loss": 0.7269, + "step": 10460 + }, + { + "epoch": 2.48, + "learning_rate": 5.264955906888058e-05, + "loss": 0.729, + "step": 10480 + }, + { + "epoch": 2.48, + "learning_rate": 5.217287677762771e-05, + "loss": 0.7193, + "step": 10500 + }, + { + "epoch": 2.49, + "learning_rate": 5.169619448637483e-05, + "loss": 0.7158, + "step": 10520 + }, + { + "epoch": 2.49, + "learning_rate": 5.121951219512195e-05, + "loss": 0.7158, + "step": 10540 + }, + { + "epoch": 2.5, + "learning_rate": 5.0742829903869065e-05, + "loss": 0.7177, + "step": 10560 + }, + { + "epoch": 2.5, + "learning_rate": 5.0266147612616184e-05, + "loss": 0.7187, + "step": 10580 + }, + { + "epoch": 2.51, + "learning_rate": 4.978946532136331e-05, + "loss": 0.7185, + "step": 10600 + }, + { + "epoch": 2.51, + "eval_loss": 0.7283052802085876, + "eval_runtime": 20.2682, + "eval_samples_per_second": 98.677, + "eval_steps_per_second": 3.108, + "step": 10600 + }, + { + "epoch": 2.51, + "learning_rate": 4.931278303011042e-05, + "loss": 0.7264, + "step": 10620 + }, + { + "epoch": 2.52, + "learning_rate": 4.883610073885755e-05, + "loss": 0.7208, + "step": 10640 + }, + { + "epoch": 2.52, + "learning_rate": 4.835941844760467e-05, + "loss": 0.7275, + "step": 10660 + }, + { + "epoch": 2.53, + "learning_rate": 4.7882736156351786e-05, + "loss": 0.7205, + "step": 10680 + }, + { + "epoch": 2.53, + "learning_rate": 4.740605386509891e-05, + "loss": 0.7213, + "step": 10700 + }, + { + "epoch": 2.53, + "learning_rate": 4.692937157384602e-05, + "loss": 0.7324, + "step": 10720 + }, + { + "epoch": 2.54, + "learning_rate": 4.645268928259315e-05, + "loss": 0.7197, + "step": 10740 + }, + { + "epoch": 2.54, + "learning_rate": 4.597600699134027e-05, + "loss": 0.7162, + "step": 10760 + }, + { + "epoch": 2.55, + "learning_rate": 4.5499324700087394e-05, + "loss": 0.7223, + "step": 10780 + }, + { + "epoch": 2.55, + "learning_rate": 4.5022642408834506e-05, + "loss": 0.7249, + "step": 10800 + }, + { + "epoch": 2.55, + "eval_loss": 0.7278863191604614, + "eval_runtime": 19.7684, + "eval_samples_per_second": 101.171, + "eval_steps_per_second": 3.187, + "step": 10800 + }, + { + "epoch": 2.56, + "learning_rate": 4.4545960117581625e-05, + "loss": 0.7245, + "step": 10820 + }, + { + "epoch": 2.56, + "learning_rate": 4.406927782632875e-05, + "loss": 0.7298, + "step": 10840 + }, + { + "epoch": 2.57, + "learning_rate": 4.359259553507587e-05, + "loss": 0.7172, + "step": 10860 + }, + { + "epoch": 2.57, + "learning_rate": 4.3115913243822995e-05, + "loss": 0.7183, + "step": 10880 + }, + { + "epoch": 2.58, + "learning_rate": 4.263923095257011e-05, + "loss": 0.7172, + "step": 10900 + }, + { + "epoch": 2.58, + "learning_rate": 4.2162548661317226e-05, + "loss": 0.7166, + "step": 10920 + }, + { + "epoch": 2.59, + "learning_rate": 4.168586637006435e-05, + "loss": 0.7303, + "step": 10940 + }, + { + "epoch": 2.59, + "learning_rate": 4.1209184078811464e-05, + "loss": 0.716, + "step": 10960 + }, + { + "epoch": 2.6, + "learning_rate": 4.073250178755859e-05, + "loss": 0.7199, + "step": 10980 + }, + { + "epoch": 2.6, + "learning_rate": 4.025581949630571e-05, + "loss": 0.7227, + "step": 11000 + }, + { + "epoch": 2.6, + "eval_loss": 0.7274474501609802, + "eval_runtime": 19.9546, + "eval_samples_per_second": 100.228, + "eval_steps_per_second": 3.157, + "step": 11000 + }, + { + "epoch": 2.61, + "learning_rate": 3.9779137205052834e-05, + "loss": 0.7134, + "step": 11020 + }, + { + "epoch": 2.61, + "learning_rate": 3.930245491379995e-05, + "loss": 0.7354, + "step": 11040 + }, + { + "epoch": 2.62, + "learning_rate": 3.8825772622547065e-05, + "loss": 0.7269, + "step": 11060 + }, + { + "epoch": 2.62, + "learning_rate": 3.834909033129419e-05, + "loss": 0.7261, + "step": 11080 + }, + { + "epoch": 2.62, + "learning_rate": 3.787240804004131e-05, + "loss": 0.735, + "step": 11100 + }, + { + "epoch": 2.63, + "learning_rate": 3.739572574878843e-05, + "loss": 0.716, + "step": 11120 + }, + { + "epoch": 2.63, + "learning_rate": 3.691904345753555e-05, + "loss": 0.721, + "step": 11140 + }, + { + "epoch": 2.64, + "learning_rate": 3.644236116628267e-05, + "loss": 0.7201, + "step": 11160 + }, + { + "epoch": 2.64, + "learning_rate": 3.596567887502979e-05, + "loss": 0.7231, + "step": 11180 + }, + { + "epoch": 2.65, + "learning_rate": 3.548899658377691e-05, + "loss": 0.7172, + "step": 11200 + }, + { + "epoch": 2.65, + "eval_loss": 0.7270590662956238, + "eval_runtime": 19.753, + "eval_samples_per_second": 101.251, + "eval_steps_per_second": 3.189, + "step": 11200 + }, + { + "epoch": 2.65, + "learning_rate": 3.501231429252403e-05, + "loss": 0.7296, + "step": 11220 + }, + { + "epoch": 2.66, + "learning_rate": 3.453563200127115e-05, + "loss": 0.7239, + "step": 11240 + }, + { + "epoch": 2.66, + "learning_rate": 3.405894971001827e-05, + "loss": 0.7215, + "step": 11260 + }, + { + "epoch": 2.67, + "learning_rate": 3.358226741876539e-05, + "loss": 0.7176, + "step": 11280 + }, + { + "epoch": 2.67, + "learning_rate": 3.310558512751251e-05, + "loss": 0.7277, + "step": 11300 + }, + { + "epoch": 2.68, + "learning_rate": 3.262890283625963e-05, + "loss": 0.7237, + "step": 11320 + }, + { + "epoch": 2.68, + "learning_rate": 3.215222054500675e-05, + "loss": 0.7167, + "step": 11340 + }, + { + "epoch": 2.69, + "learning_rate": 3.167553825375387e-05, + "loss": 0.7184, + "step": 11360 + }, + { + "epoch": 2.69, + "learning_rate": 3.119885596250099e-05, + "loss": 0.7238, + "step": 11380 + }, + { + "epoch": 2.7, + "learning_rate": 3.072217367124811e-05, + "loss": 0.7188, + "step": 11400 + }, + { + "epoch": 2.7, + "eval_loss": 0.7263159155845642, + "eval_runtime": 19.6317, + "eval_samples_per_second": 101.876, + "eval_steps_per_second": 3.209, + "step": 11400 + }, + { + "epoch": 2.7, + "learning_rate": 3.0245491379995232e-05, + "loss": 0.7146, + "step": 11420 + }, + { + "epoch": 2.71, + "learning_rate": 2.9768809088742348e-05, + "loss": 0.7307, + "step": 11440 + }, + { + "epoch": 2.71, + "learning_rate": 2.929212679748947e-05, + "loss": 0.721, + "step": 11460 + }, + { + "epoch": 2.71, + "learning_rate": 2.881544450623659e-05, + "loss": 0.7293, + "step": 11480 + }, + { + "epoch": 2.72, + "learning_rate": 2.833876221498371e-05, + "loss": 0.7245, + "step": 11500 + }, + { + "epoch": 2.72, + "learning_rate": 2.7862079923730833e-05, + "loss": 0.7264, + "step": 11520 + }, + { + "epoch": 2.73, + "learning_rate": 2.7385397632477952e-05, + "loss": 0.722, + "step": 11540 + }, + { + "epoch": 2.73, + "learning_rate": 2.6908715341225068e-05, + "loss": 0.7195, + "step": 11560 + }, + { + "epoch": 2.74, + "learning_rate": 2.643203304997219e-05, + "loss": 0.7181, + "step": 11580 + }, + { + "epoch": 2.74, + "learning_rate": 2.5955350758719312e-05, + "loss": 0.7225, + "step": 11600 + }, + { + "epoch": 2.74, + "eval_loss": 0.7265506386756897, + "eval_runtime": 19.5252, + "eval_samples_per_second": 102.432, + "eval_steps_per_second": 3.227, + "step": 11600 + }, + { + "epoch": 2.75, + "learning_rate": 2.547866846746643e-05, + "loss": 0.7151, + "step": 11620 + }, + { + "epoch": 2.75, + "learning_rate": 2.5001986176213553e-05, + "loss": 0.7211, + "step": 11640 + }, + { + "epoch": 2.76, + "learning_rate": 2.4525303884960672e-05, + "loss": 0.7231, + "step": 11660 + }, + { + "epoch": 2.76, + "learning_rate": 2.404862159370779e-05, + "loss": 0.7236, + "step": 11680 + }, + { + "epoch": 2.77, + "learning_rate": 2.357193930245491e-05, + "loss": 0.7161, + "step": 11700 + }, + { + "epoch": 2.77, + "learning_rate": 2.3095257011202032e-05, + "loss": 0.7248, + "step": 11720 + }, + { + "epoch": 2.78, + "learning_rate": 2.261857471994915e-05, + "loss": 0.7195, + "step": 11740 + }, + { + "epoch": 2.78, + "learning_rate": 2.2141892428696274e-05, + "loss": 0.718, + "step": 11760 + }, + { + "epoch": 2.79, + "learning_rate": 2.1665210137443392e-05, + "loss": 0.7161, + "step": 11780 + }, + { + "epoch": 2.79, + "learning_rate": 2.118852784619051e-05, + "loss": 0.7204, + "step": 11800 + }, + { + "epoch": 2.79, + "eval_loss": 0.7261104583740234, + "eval_runtime": 20.0617, + "eval_samples_per_second": 99.692, + "eval_steps_per_second": 3.14, + "step": 11800 + }, + { + "epoch": 2.79, + "learning_rate": 2.071184555493763e-05, + "loss": 0.716, + "step": 11820 + }, + { + "epoch": 2.8, + "learning_rate": 2.0235163263684753e-05, + "loss": 0.7211, + "step": 11840 + }, + { + "epoch": 2.8, + "learning_rate": 1.975848097243187e-05, + "loss": 0.7242, + "step": 11860 + }, + { + "epoch": 2.81, + "learning_rate": 1.9281798681178994e-05, + "loss": 0.7129, + "step": 11880 + }, + { + "epoch": 2.81, + "learning_rate": 1.8828950504488756e-05, + "loss": 0.7233, + "step": 11900 + }, + { + "epoch": 2.82, + "learning_rate": 1.8352268213235875e-05, + "loss": 0.7286, + "step": 11920 + }, + { + "epoch": 2.82, + "learning_rate": 1.7875585921982997e-05, + "loss": 0.7147, + "step": 11940 + }, + { + "epoch": 2.83, + "learning_rate": 1.7398903630730116e-05, + "loss": 0.7303, + "step": 11960 + }, + { + "epoch": 2.83, + "learning_rate": 1.692222133947724e-05, + "loss": 0.7126, + "step": 11980 + }, + { + "epoch": 2.84, + "learning_rate": 1.6445539048224358e-05, + "loss": 0.7174, + "step": 12000 + }, + { + "epoch": 2.84, + "eval_loss": 0.7259587645530701, + "eval_runtime": 20.6636, + "eval_samples_per_second": 96.788, + "eval_steps_per_second": 3.049, + "step": 12000 + }, + { + "epoch": 2.84, + "learning_rate": 1.5968856756971476e-05, + "loss": 0.7147, + "step": 12020 + }, + { + "epoch": 2.85, + "learning_rate": 1.54921744657186e-05, + "loss": 0.7184, + "step": 12040 + }, + { + "epoch": 2.85, + "learning_rate": 1.5015492174465718e-05, + "loss": 0.7218, + "step": 12060 + }, + { + "epoch": 2.86, + "learning_rate": 1.4538809883212837e-05, + "loss": 0.7172, + "step": 12080 + }, + { + "epoch": 2.86, + "learning_rate": 1.4062127591959957e-05, + "loss": 0.7326, + "step": 12100 + }, + { + "epoch": 2.87, + "learning_rate": 1.3585445300707078e-05, + "loss": 0.726, + "step": 12120 + }, + { + "epoch": 2.87, + "learning_rate": 1.3108763009454197e-05, + "loss": 0.711, + "step": 12140 + }, + { + "epoch": 2.88, + "learning_rate": 1.2632080718201317e-05, + "loss": 0.7199, + "step": 12160 + }, + { + "epoch": 2.88, + "learning_rate": 1.215539842694844e-05, + "loss": 0.7256, + "step": 12180 + }, + { + "epoch": 2.88, + "learning_rate": 1.1678716135695557e-05, + "loss": 0.7183, + "step": 12200 + }, + { + "epoch": 2.88, + "eval_loss": 0.7255927324295044, + "eval_runtime": 20.0566, + "eval_samples_per_second": 99.718, + "eval_steps_per_second": 3.141, + "step": 12200 + }, + { + "epoch": 2.89, + "learning_rate": 1.1202033844442679e-05, + "loss": 0.7244, + "step": 12220 + }, + { + "epoch": 2.89, + "learning_rate": 1.07253515531898e-05, + "loss": 0.717, + "step": 12240 + }, + { + "epoch": 2.9, + "learning_rate": 1.0248669261936918e-05, + "loss": 0.7224, + "step": 12260 + }, + { + "epoch": 2.9, + "learning_rate": 9.771986970684039e-06, + "loss": 0.7124, + "step": 12280 + }, + { + "epoch": 2.91, + "learning_rate": 9.295304679431158e-06, + "loss": 0.7285, + "step": 12300 + }, + { + "epoch": 2.91, + "learning_rate": 8.818622388178278e-06, + "loss": 0.7337, + "step": 12320 + }, + { + "epoch": 2.92, + "learning_rate": 8.341940096925399e-06, + "loss": 0.716, + "step": 12340 + }, + { + "epoch": 2.92, + "learning_rate": 7.865257805672518e-06, + "loss": 0.7212, + "step": 12360 + }, + { + "epoch": 2.93, + "learning_rate": 7.3885755144196385e-06, + "loss": 0.7262, + "step": 12380 + }, + { + "epoch": 2.93, + "learning_rate": 6.911893223166759e-06, + "loss": 0.7151, + "step": 12400 + }, + { + "epoch": 2.93, + "eval_loss": 0.7256051301956177, + "eval_runtime": 19.7012, + "eval_samples_per_second": 101.516, + "eval_steps_per_second": 3.198, + "step": 12400 + } + ], + "max_steps": 12687, + "num_train_epochs": 3, + "total_flos": 1.6118284402370281e+19, + "trial_name": null, + "trial_params": null +} diff --git a/adapters/saved-belle-7b/checkpoint-12400/training_args.bin b/adapters/saved-belle-7b/checkpoint-12400/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..e7508cbd7713243e73fe59f258eaa12f0deefce5 --- /dev/null +++ b/adapters/saved-belle-7b/checkpoint-12400/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9809a7383d594262bcb22ae9222e2580aba6862268d17b0cc4f7bd3fe5579126 +size 3579 diff --git a/adapters/saved-belle-7b/checkpoint-12600/optimizer.pt b/adapters/saved-belle-7b/checkpoint-12600/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..2e6b8b2476096a971f6de8bad6cf0473a7ec9a17 --- /dev/null +++ b/adapters/saved-belle-7b/checkpoint-12600/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:885eb4ec129d7c8069995ca164c6cd9ca1bdf62ee68f6d3431ac4dabe89514a1 +size 33629893 diff --git a/adapters/saved-belle-7b/checkpoint-12600/pytorch_model.bin b/adapters/saved-belle-7b/checkpoint-12600/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..c04f95489fbf95fe7f82cda6a45f0b2af3618c9e --- /dev/null +++ b/adapters/saved-belle-7b/checkpoint-12600/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aa6831f535a8d20dcd9cee918274b60c31a6769a4f930f8ab117221550ffc442 +size 16822989 diff --git a/adapters/saved-belle-7b/checkpoint-12600/rng_state_0.pth b/adapters/saved-belle-7b/checkpoint-12600/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..687528ab25210e68ee2bbd8d2d28579703b15312 --- /dev/null +++ b/adapters/saved-belle-7b/checkpoint-12600/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:94c0e36b412af8c09c7d8cd37e6450c3ad33493fb53983386aaff297b80875d7 +size 14583 diff --git a/adapters/saved-belle-7b/checkpoint-12600/rng_state_1.pth b/adapters/saved-belle-7b/checkpoint-12600/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..2ccc2d2104a65b2fb0814cfedc634fa0afa21717 --- /dev/null +++ b/adapters/saved-belle-7b/checkpoint-12600/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:14b4add9aff8b1c5fc12726232d798786dac018335569bed79667b5afd28ddad +size 14583 diff --git a/adapters/saved-belle-7b/checkpoint-12600/rng_state_2.pth b/adapters/saved-belle-7b/checkpoint-12600/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..77a8b96f7ca1ea610143051d564752901a1ef86a --- /dev/null +++ b/adapters/saved-belle-7b/checkpoint-12600/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b2f8af2ce3c2934e6afc10db200c806db0e3a9a8bd3703ea64e30b7c8299a14a +size 14583 diff --git a/adapters/saved-belle-7b/checkpoint-12600/rng_state_3.pth b/adapters/saved-belle-7b/checkpoint-12600/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..e7aa08744df9b9d3f7b06a952f51bd9e5a9ccd18 --- /dev/null +++ b/adapters/saved-belle-7b/checkpoint-12600/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:62407c450b62b5a32192f7d587ad97377befd66da7f5104ca85178f88957a623 +size 14583 diff --git a/adapters/saved-belle-7b/checkpoint-12600/scaler.pt b/adapters/saved-belle-7b/checkpoint-12600/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..988628f5fe973f56db482436f0169df197023d13 --- /dev/null +++ b/adapters/saved-belle-7b/checkpoint-12600/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1887e5e8f7d92b19cde0c7085df2807a08474a1ecb77d7def57ef45f486fda47 +size 557 diff --git a/adapters/saved-belle-7b/checkpoint-12600/scheduler.pt b/adapters/saved-belle-7b/checkpoint-12600/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..dfdd45464d66f55f98a5212b38e7ba3581e076ff --- /dev/null +++ b/adapters/saved-belle-7b/checkpoint-12600/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b42a1b85bf0a5f5b73ebc5e26f0693f3fb4068809f18d25ac288ece0a6b1c488 +size 627 diff --git a/adapters/saved-belle-7b/checkpoint-12600/trainer_state.json b/adapters/saved-belle-7b/checkpoint-12600/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..e60bc91567c490cedeb6a7bb5bb358bfa8bbacee --- /dev/null +++ b/adapters/saved-belle-7b/checkpoint-12600/trainer_state.json @@ -0,0 +1,4300 @@ +{ + "best_metric": 0.7255715727806091, + "best_model_checkpoint": "/mnt/bn/qingyi-bn-lq/llama/saved-belle-7b/checkpoint-12600", + "epoch": 2.9793396979280584, + "global_step": 12600, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 5.9999999999999995e-05, + "loss": 1.8908, + "step": 20 + }, + { + "epoch": 0.01, + "learning_rate": 0.00011999999999999999, + "loss": 1.5545, + "step": 40 + }, + { + "epoch": 0.01, + "learning_rate": 0.00017999999999999998, + "loss": 1.1252, + "step": 60 + }, + { + "epoch": 0.02, + "learning_rate": 0.00023999999999999998, + "loss": 1.054, + "step": 80 + }, + { + "epoch": 0.02, + "learning_rate": 0.0003, + "loss": 1.0137, + "step": 100 + }, + { + "epoch": 0.03, + "learning_rate": 0.0002995233177087471, + "loss": 1.0046, + "step": 120 + }, + { + "epoch": 0.03, + "learning_rate": 0.0002990466354174942, + "loss": 0.9867, + "step": 140 + }, + { + "epoch": 0.04, + "learning_rate": 0.00029856995312624134, + "loss": 0.9612, + "step": 160 + }, + { + "epoch": 0.04, + "learning_rate": 0.0002980932708349884, + "loss": 0.9588, + "step": 180 + }, + { + "epoch": 0.05, + "learning_rate": 0.0002976165885437356, + "loss": 0.9551, + "step": 200 + }, + { + "epoch": 0.05, + "eval_loss": 0.9459459185600281, + "eval_runtime": 19.4211, + "eval_samples_per_second": 102.981, + "eval_steps_per_second": 3.244, + "step": 200 + }, + { + "epoch": 0.05, + "learning_rate": 0.0002971399062524827, + "loss": 0.9516, + "step": 220 + }, + { + "epoch": 0.06, + "learning_rate": 0.00029666322396122984, + "loss": 0.937, + "step": 240 + }, + { + "epoch": 0.06, + "learning_rate": 0.0002961865416699769, + "loss": 0.936, + "step": 260 + }, + { + "epoch": 0.07, + "learning_rate": 0.00029570985937872406, + "loss": 0.9305, + "step": 280 + }, + { + "epoch": 0.07, + "learning_rate": 0.00029523317708747115, + "loss": 0.9146, + "step": 300 + }, + { + "epoch": 0.08, + "learning_rate": 0.0002947564947962183, + "loss": 0.9226, + "step": 320 + }, + { + "epoch": 0.08, + "learning_rate": 0.00029427981250496543, + "loss": 0.9108, + "step": 340 + }, + { + "epoch": 0.09, + "learning_rate": 0.0002938031302137125, + "loss": 0.9129, + "step": 360 + }, + { + "epoch": 0.09, + "learning_rate": 0.00029332644792245965, + "loss": 0.9063, + "step": 380 + }, + { + "epoch": 0.09, + "learning_rate": 0.0002928497656312068, + "loss": 0.8996, + "step": 400 + }, + { + "epoch": 0.09, + "eval_loss": 0.9003962874412537, + "eval_runtime": 19.5716, + "eval_samples_per_second": 102.189, + "eval_steps_per_second": 3.219, + "step": 400 + }, + { + "epoch": 0.1, + "learning_rate": 0.00029237308333995393, + "loss": 0.898, + "step": 420 + }, + { + "epoch": 0.1, + "learning_rate": 0.000291896401048701, + "loss": 0.8936, + "step": 440 + }, + { + "epoch": 0.11, + "learning_rate": 0.00029141971875744815, + "loss": 0.8932, + "step": 460 + }, + { + "epoch": 0.11, + "learning_rate": 0.00029094303646619524, + "loss": 0.8779, + "step": 480 + }, + { + "epoch": 0.12, + "learning_rate": 0.0002904663541749424, + "loss": 0.8871, + "step": 500 + }, + { + "epoch": 0.12, + "learning_rate": 0.00028998967188368946, + "loss": 0.8929, + "step": 520 + }, + { + "epoch": 0.13, + "learning_rate": 0.0002895129895924366, + "loss": 0.8878, + "step": 540 + }, + { + "epoch": 0.13, + "learning_rate": 0.00028903630730118374, + "loss": 0.8818, + "step": 560 + }, + { + "epoch": 0.14, + "learning_rate": 0.00028855962500993083, + "loss": 0.8826, + "step": 580 + }, + { + "epoch": 0.14, + "learning_rate": 0.00028808294271867797, + "loss": 0.879, + "step": 600 + }, + { + "epoch": 0.14, + "eval_loss": 0.8738257884979248, + "eval_runtime": 19.3526, + "eval_samples_per_second": 103.345, + "eval_steps_per_second": 3.255, + "step": 600 + }, + { + "epoch": 0.15, + "learning_rate": 0.0002876062604274251, + "loss": 0.87, + "step": 620 + }, + { + "epoch": 0.15, + "learning_rate": 0.00028712957813617224, + "loss": 0.8715, + "step": 640 + }, + { + "epoch": 0.16, + "learning_rate": 0.00028665289584491933, + "loss": 0.8724, + "step": 660 + }, + { + "epoch": 0.16, + "learning_rate": 0.00028617621355366647, + "loss": 0.8741, + "step": 680 + }, + { + "epoch": 0.17, + "learning_rate": 0.00028569953126241355, + "loss": 0.8705, + "step": 700 + }, + { + "epoch": 0.17, + "learning_rate": 0.0002852228489711607, + "loss": 0.8702, + "step": 720 + }, + { + "epoch": 0.17, + "learning_rate": 0.00028474616667990783, + "loss": 0.8618, + "step": 740 + }, + { + "epoch": 0.18, + "learning_rate": 0.0002842694843886549, + "loss": 0.8617, + "step": 760 + }, + { + "epoch": 0.18, + "learning_rate": 0.00028379280209740206, + "loss": 0.8677, + "step": 780 + }, + { + "epoch": 0.19, + "learning_rate": 0.0002833161198061492, + "loss": 0.853, + "step": 800 + }, + { + "epoch": 0.19, + "eval_loss": 0.8541846871376038, + "eval_runtime": 19.49, + "eval_samples_per_second": 102.617, + "eval_steps_per_second": 3.232, + "step": 800 + }, + { + "epoch": 0.19, + "learning_rate": 0.0002828394375148963, + "loss": 0.8549, + "step": 820 + }, + { + "epoch": 0.2, + "learning_rate": 0.0002823627552236434, + "loss": 0.847, + "step": 840 + }, + { + "epoch": 0.2, + "learning_rate": 0.00028188607293239056, + "loss": 0.8585, + "step": 860 + }, + { + "epoch": 0.21, + "learning_rate": 0.00028140939064113764, + "loss": 0.8568, + "step": 880 + }, + { + "epoch": 0.21, + "learning_rate": 0.0002809327083498848, + "loss": 0.8482, + "step": 900 + }, + { + "epoch": 0.22, + "learning_rate": 0.00028045602605863187, + "loss": 0.845, + "step": 920 + }, + { + "epoch": 0.22, + "learning_rate": 0.000279979343767379, + "loss": 0.8548, + "step": 940 + }, + { + "epoch": 0.23, + "learning_rate": 0.00027950266147612615, + "loss": 0.8372, + "step": 960 + }, + { + "epoch": 0.23, + "learning_rate": 0.0002790259791848733, + "loss": 0.8423, + "step": 980 + }, + { + "epoch": 0.24, + "learning_rate": 0.00027854929689362037, + "loss": 0.8433, + "step": 1000 + }, + { + "epoch": 0.24, + "eval_loss": 0.8396860361099243, + "eval_runtime": 19.3459, + "eval_samples_per_second": 103.381, + "eval_steps_per_second": 3.257, + "step": 1000 + }, + { + "epoch": 0.24, + "learning_rate": 0.0002780726146023675, + "loss": 0.848, + "step": 1020 + }, + { + "epoch": 0.25, + "learning_rate": 0.0002775959323111146, + "loss": 0.8383, + "step": 1040 + }, + { + "epoch": 0.25, + "learning_rate": 0.00027711925001986173, + "loss": 0.8385, + "step": 1060 + }, + { + "epoch": 0.26, + "learning_rate": 0.0002766425677286089, + "loss": 0.8308, + "step": 1080 + }, + { + "epoch": 0.26, + "learning_rate": 0.00027616588543735596, + "loss": 0.8244, + "step": 1100 + }, + { + "epoch": 0.26, + "learning_rate": 0.0002756892031461031, + "loss": 0.835, + "step": 1120 + }, + { + "epoch": 0.27, + "learning_rate": 0.0002752125208548502, + "loss": 0.8337, + "step": 1140 + }, + { + "epoch": 0.27, + "learning_rate": 0.0002747358385635974, + "loss": 0.8348, + "step": 1160 + }, + { + "epoch": 0.28, + "learning_rate": 0.00027425915627234446, + "loss": 0.8353, + "step": 1180 + }, + { + "epoch": 0.28, + "learning_rate": 0.0002737824739810916, + "loss": 0.8294, + "step": 1200 + }, + { + "epoch": 0.28, + "eval_loss": 0.8274422287940979, + "eval_runtime": 19.4187, + "eval_samples_per_second": 102.993, + "eval_steps_per_second": 3.244, + "step": 1200 + }, + { + "epoch": 0.29, + "learning_rate": 0.0002733057916898387, + "loss": 0.8337, + "step": 1220 + }, + { + "epoch": 0.29, + "learning_rate": 0.0002728291093985858, + "loss": 0.8435, + "step": 1240 + }, + { + "epoch": 0.3, + "learning_rate": 0.00027235242710733296, + "loss": 0.8347, + "step": 1260 + }, + { + "epoch": 0.3, + "learning_rate": 0.00027187574481608005, + "loss": 0.8258, + "step": 1280 + }, + { + "epoch": 0.31, + "learning_rate": 0.0002713990625248272, + "loss": 0.8304, + "step": 1300 + }, + { + "epoch": 0.31, + "learning_rate": 0.0002709223802335743, + "loss": 0.8264, + "step": 1320 + }, + { + "epoch": 0.32, + "learning_rate": 0.0002704456979423214, + "loss": 0.8313, + "step": 1340 + }, + { + "epoch": 0.32, + "learning_rate": 0.00026996901565106855, + "loss": 0.814, + "step": 1360 + }, + { + "epoch": 0.33, + "learning_rate": 0.0002694923333598157, + "loss": 0.8223, + "step": 1380 + }, + { + "epoch": 0.33, + "learning_rate": 0.0002690156510685628, + "loss": 0.8159, + "step": 1400 + }, + { + "epoch": 0.33, + "eval_loss": 0.8179089426994324, + "eval_runtime": 19.4736, + "eval_samples_per_second": 102.703, + "eval_steps_per_second": 3.235, + "step": 1400 + }, + { + "epoch": 0.34, + "learning_rate": 0.0002685389687773099, + "loss": 0.8218, + "step": 1420 + }, + { + "epoch": 0.34, + "learning_rate": 0.000268062286486057, + "loss": 0.808, + "step": 1440 + }, + { + "epoch": 0.35, + "learning_rate": 0.00026758560419480414, + "loss": 0.8253, + "step": 1460 + }, + { + "epoch": 0.35, + "learning_rate": 0.0002671089219035513, + "loss": 0.8174, + "step": 1480 + }, + { + "epoch": 0.35, + "learning_rate": 0.00026663223961229836, + "loss": 0.8157, + "step": 1500 + }, + { + "epoch": 0.36, + "learning_rate": 0.0002661555573210455, + "loss": 0.8142, + "step": 1520 + }, + { + "epoch": 0.36, + "learning_rate": 0.0002656788750297926, + "loss": 0.8112, + "step": 1540 + }, + { + "epoch": 0.37, + "learning_rate": 0.00026520219273853973, + "loss": 0.8232, + "step": 1560 + }, + { + "epoch": 0.37, + "learning_rate": 0.00026472551044728687, + "loss": 0.8254, + "step": 1580 + }, + { + "epoch": 0.38, + "learning_rate": 0.000264248828156034, + "loss": 0.8059, + "step": 1600 + }, + { + "epoch": 0.38, + "eval_loss": 0.8101135492324829, + "eval_runtime": 19.5846, + "eval_samples_per_second": 102.121, + "eval_steps_per_second": 3.217, + "step": 1600 + }, + { + "epoch": 0.38, + "learning_rate": 0.0002637721458647811, + "loss": 0.8062, + "step": 1620 + }, + { + "epoch": 0.39, + "learning_rate": 0.00026329546357352823, + "loss": 0.805, + "step": 1640 + }, + { + "epoch": 0.39, + "learning_rate": 0.0002628187812822753, + "loss": 0.8109, + "step": 1660 + }, + { + "epoch": 0.4, + "learning_rate": 0.00026234209899102245, + "loss": 0.801, + "step": 1680 + }, + { + "epoch": 0.4, + "learning_rate": 0.0002618654166997696, + "loss": 0.8043, + "step": 1700 + }, + { + "epoch": 0.41, + "learning_rate": 0.0002613887344085167, + "loss": 0.8002, + "step": 1720 + }, + { + "epoch": 0.41, + "learning_rate": 0.0002609120521172638, + "loss": 0.8152, + "step": 1740 + }, + { + "epoch": 0.42, + "learning_rate": 0.00026043536982601096, + "loss": 0.8052, + "step": 1760 + }, + { + "epoch": 0.42, + "learning_rate": 0.0002599586875347581, + "loss": 0.8136, + "step": 1780 + }, + { + "epoch": 0.43, + "learning_rate": 0.0002594820052435052, + "loss": 0.8044, + "step": 1800 + }, + { + "epoch": 0.43, + "eval_loss": 0.8030326962471008, + "eval_runtime": 19.4835, + "eval_samples_per_second": 102.651, + "eval_steps_per_second": 3.234, + "step": 1800 + }, + { + "epoch": 0.43, + "learning_rate": 0.0002590053229522523, + "loss": 0.7995, + "step": 1820 + }, + { + "epoch": 0.44, + "learning_rate": 0.0002585286406609994, + "loss": 0.7958, + "step": 1840 + }, + { + "epoch": 0.44, + "learning_rate": 0.00025805195836974654, + "loss": 0.8034, + "step": 1860 + }, + { + "epoch": 0.44, + "learning_rate": 0.00025757527607849363, + "loss": 0.8016, + "step": 1880 + }, + { + "epoch": 0.45, + "learning_rate": 0.00025709859378724077, + "loss": 0.8048, + "step": 1900 + }, + { + "epoch": 0.45, + "learning_rate": 0.0002566219114959879, + "loss": 0.8004, + "step": 1920 + }, + { + "epoch": 0.46, + "learning_rate": 0.00025614522920473505, + "loss": 0.8041, + "step": 1940 + }, + { + "epoch": 0.46, + "learning_rate": 0.00025566854691348213, + "loss": 0.7908, + "step": 1960 + }, + { + "epoch": 0.47, + "learning_rate": 0.00025519186462222927, + "loss": 0.7958, + "step": 1980 + }, + { + "epoch": 0.47, + "learning_rate": 0.0002547151823309764, + "loss": 0.8013, + "step": 2000 + }, + { + "epoch": 0.47, + "eval_loss": 0.7965430021286011, + "eval_runtime": 19.4852, + "eval_samples_per_second": 102.642, + "eval_steps_per_second": 3.233, + "step": 2000 + }, + { + "epoch": 0.48, + "learning_rate": 0.0002542385000397235, + "loss": 0.803, + "step": 2020 + }, + { + "epoch": 0.48, + "learning_rate": 0.00025376181774847064, + "loss": 0.7966, + "step": 2040 + }, + { + "epoch": 0.49, + "learning_rate": 0.0002532851354572177, + "loss": 0.7946, + "step": 2060 + }, + { + "epoch": 0.49, + "learning_rate": 0.00025280845316596486, + "loss": 0.8023, + "step": 2080 + }, + { + "epoch": 0.5, + "learning_rate": 0.00025233177087471194, + "loss": 0.7953, + "step": 2100 + }, + { + "epoch": 0.5, + "learning_rate": 0.0002518550885834591, + "loss": 0.8053, + "step": 2120 + }, + { + "epoch": 0.51, + "learning_rate": 0.0002513784062922062, + "loss": 0.7883, + "step": 2140 + }, + { + "epoch": 0.51, + "learning_rate": 0.00025090172400095336, + "loss": 0.7984, + "step": 2160 + }, + { + "epoch": 0.52, + "learning_rate": 0.00025042504170970045, + "loss": 0.7962, + "step": 2180 + }, + { + "epoch": 0.52, + "learning_rate": 0.0002499483594184476, + "loss": 0.7847, + "step": 2200 + }, + { + "epoch": 0.52, + "eval_loss": 0.7915623784065247, + "eval_runtime": 19.5509, + "eval_samples_per_second": 102.297, + "eval_steps_per_second": 3.222, + "step": 2200 + }, + { + "epoch": 0.52, + "learning_rate": 0.0002494716771271947, + "loss": 0.7917, + "step": 2220 + }, + { + "epoch": 0.53, + "learning_rate": 0.0002489949948359418, + "loss": 0.7942, + "step": 2240 + }, + { + "epoch": 0.53, + "learning_rate": 0.00024851831254468895, + "loss": 0.7921, + "step": 2260 + }, + { + "epoch": 0.54, + "learning_rate": 0.00024804163025343603, + "loss": 0.7971, + "step": 2280 + }, + { + "epoch": 0.54, + "learning_rate": 0.0002475649479621832, + "loss": 0.7919, + "step": 2300 + }, + { + "epoch": 0.55, + "learning_rate": 0.0002470882656709303, + "loss": 0.7917, + "step": 2320 + }, + { + "epoch": 0.55, + "learning_rate": 0.00024661158337967745, + "loss": 0.8024, + "step": 2340 + }, + { + "epoch": 0.56, + "learning_rate": 0.00024613490108842454, + "loss": 0.7761, + "step": 2360 + }, + { + "epoch": 0.56, + "learning_rate": 0.0002456582187971717, + "loss": 0.7958, + "step": 2380 + }, + { + "epoch": 0.57, + "learning_rate": 0.00024518153650591876, + "loss": 0.7855, + "step": 2400 + }, + { + "epoch": 0.57, + "eval_loss": 0.7870249152183533, + "eval_runtime": 19.5953, + "eval_samples_per_second": 102.065, + "eval_steps_per_second": 3.215, + "step": 2400 + }, + { + "epoch": 0.57, + "learning_rate": 0.0002447048542146659, + "loss": 0.784, + "step": 2420 + }, + { + "epoch": 0.58, + "learning_rate": 0.00024422817192341304, + "loss": 0.7926, + "step": 2440 + }, + { + "epoch": 0.58, + "learning_rate": 0.00024375148963216013, + "loss": 0.7845, + "step": 2460 + }, + { + "epoch": 0.59, + "learning_rate": 0.00024327480734090726, + "loss": 0.782, + "step": 2480 + }, + { + "epoch": 0.59, + "learning_rate": 0.00024279812504965438, + "loss": 0.7808, + "step": 2500 + }, + { + "epoch": 0.6, + "learning_rate": 0.00024232144275840152, + "loss": 0.7926, + "step": 2520 + }, + { + "epoch": 0.6, + "learning_rate": 0.00024184476046714863, + "loss": 0.7795, + "step": 2540 + }, + { + "epoch": 0.61, + "learning_rate": 0.00024136807817589574, + "loss": 0.7888, + "step": 2560 + }, + { + "epoch": 0.61, + "learning_rate": 0.00024089139588464288, + "loss": 0.7888, + "step": 2580 + }, + { + "epoch": 0.61, + "learning_rate": 0.00024041471359339, + "loss": 0.7863, + "step": 2600 + }, + { + "epoch": 0.61, + "eval_loss": 0.7825512290000916, + "eval_runtime": 19.4274, + "eval_samples_per_second": 102.948, + "eval_steps_per_second": 3.243, + "step": 2600 + }, + { + "epoch": 0.62, + "learning_rate": 0.0002399380313021371, + "loss": 0.7881, + "step": 2620 + }, + { + "epoch": 0.62, + "learning_rate": 0.00023946134901088422, + "loss": 0.7841, + "step": 2640 + }, + { + "epoch": 0.63, + "learning_rate": 0.00023898466671963133, + "loss": 0.7849, + "step": 2660 + }, + { + "epoch": 0.63, + "learning_rate": 0.00023850798442837844, + "loss": 0.7809, + "step": 2680 + }, + { + "epoch": 0.64, + "learning_rate": 0.0002380313021371256, + "loss": 0.7757, + "step": 2700 + }, + { + "epoch": 0.64, + "learning_rate": 0.00023755461984587272, + "loss": 0.7787, + "step": 2720 + }, + { + "epoch": 0.65, + "learning_rate": 0.00023707793755461983, + "loss": 0.7766, + "step": 2740 + }, + { + "epoch": 0.65, + "learning_rate": 0.00023660125526336694, + "loss": 0.7867, + "step": 2760 + }, + { + "epoch": 0.66, + "learning_rate": 0.00023612457297211405, + "loss": 0.7767, + "step": 2780 + }, + { + "epoch": 0.66, + "learning_rate": 0.0002356478906808612, + "loss": 0.7806, + "step": 2800 + }, + { + "epoch": 0.66, + "eval_loss": 0.7781409621238708, + "eval_runtime": 20.131, + "eval_samples_per_second": 99.349, + "eval_steps_per_second": 3.13, + "step": 2800 + }, + { + "epoch": 0.67, + "learning_rate": 0.0002351712083896083, + "loss": 0.7774, + "step": 2820 + }, + { + "epoch": 0.67, + "learning_rate": 0.00023469452609835542, + "loss": 0.7782, + "step": 2840 + }, + { + "epoch": 0.68, + "learning_rate": 0.00023421784380710253, + "loss": 0.7773, + "step": 2860 + }, + { + "epoch": 0.68, + "learning_rate": 0.00023374116151584964, + "loss": 0.7845, + "step": 2880 + }, + { + "epoch": 0.69, + "learning_rate": 0.0002332644792245968, + "loss": 0.7879, + "step": 2900 + }, + { + "epoch": 0.69, + "learning_rate": 0.00023278779693334392, + "loss": 0.7801, + "step": 2920 + }, + { + "epoch": 0.7, + "learning_rate": 0.00023231111464209103, + "loss": 0.7713, + "step": 2940 + }, + { + "epoch": 0.7, + "learning_rate": 0.00023183443235083814, + "loss": 0.7742, + "step": 2960 + }, + { + "epoch": 0.7, + "learning_rate": 0.00023135775005958526, + "loss": 0.7783, + "step": 2980 + }, + { + "epoch": 0.71, + "learning_rate": 0.0002308810677683324, + "loss": 0.7698, + "step": 3000 + }, + { + "epoch": 0.71, + "eval_loss": 0.7747411131858826, + "eval_runtime": 20.0968, + "eval_samples_per_second": 99.519, + "eval_steps_per_second": 3.135, + "step": 3000 + }, + { + "epoch": 0.71, + "learning_rate": 0.0002304043854770795, + "loss": 0.7696, + "step": 3020 + }, + { + "epoch": 0.72, + "learning_rate": 0.00022992770318582662, + "loss": 0.7744, + "step": 3040 + }, + { + "epoch": 0.72, + "learning_rate": 0.00022945102089457373, + "loss": 0.7687, + "step": 3060 + }, + { + "epoch": 0.73, + "learning_rate": 0.00022897433860332084, + "loss": 0.7765, + "step": 3080 + }, + { + "epoch": 0.73, + "learning_rate": 0.000228497656312068, + "loss": 0.7709, + "step": 3100 + }, + { + "epoch": 0.74, + "learning_rate": 0.00022802097402081512, + "loss": 0.773, + "step": 3120 + }, + { + "epoch": 0.74, + "learning_rate": 0.00022754429172956224, + "loss": 0.7862, + "step": 3140 + }, + { + "epoch": 0.75, + "learning_rate": 0.00022706760943830935, + "loss": 0.7668, + "step": 3160 + }, + { + "epoch": 0.75, + "learning_rate": 0.00022659092714705646, + "loss": 0.7816, + "step": 3180 + }, + { + "epoch": 0.76, + "learning_rate": 0.00022611424485580357, + "loss": 0.7831, + "step": 3200 + }, + { + "epoch": 0.76, + "eval_loss": 0.7719215154647827, + "eval_runtime": 19.6387, + "eval_samples_per_second": 101.84, + "eval_steps_per_second": 3.208, + "step": 3200 + }, + { + "epoch": 0.76, + "learning_rate": 0.0002256375625645507, + "loss": 0.7723, + "step": 3220 + }, + { + "epoch": 0.77, + "learning_rate": 0.00022516088027329782, + "loss": 0.7727, + "step": 3240 + }, + { + "epoch": 0.77, + "learning_rate": 0.00022468419798204493, + "loss": 0.7719, + "step": 3260 + }, + { + "epoch": 0.78, + "learning_rate": 0.00022420751569079207, + "loss": 0.7796, + "step": 3280 + }, + { + "epoch": 0.78, + "learning_rate": 0.0002237308333995392, + "loss": 0.7685, + "step": 3300 + }, + { + "epoch": 0.79, + "learning_rate": 0.00022325415110828633, + "loss": 0.7725, + "step": 3320 + }, + { + "epoch": 0.79, + "learning_rate": 0.00022277746881703344, + "loss": 0.7638, + "step": 3340 + }, + { + "epoch": 0.79, + "learning_rate": 0.00022230078652578055, + "loss": 0.7771, + "step": 3360 + }, + { + "epoch": 0.8, + "learning_rate": 0.00022182410423452766, + "loss": 0.7689, + "step": 3380 + }, + { + "epoch": 0.8, + "learning_rate": 0.00022134742194327477, + "loss": 0.7797, + "step": 3400 + }, + { + "epoch": 0.8, + "eval_loss": 0.768983006477356, + "eval_runtime": 19.4428, + "eval_samples_per_second": 102.866, + "eval_steps_per_second": 3.24, + "step": 3400 + }, + { + "epoch": 0.81, + "learning_rate": 0.0002208707396520219, + "loss": 0.7734, + "step": 3420 + }, + { + "epoch": 0.81, + "learning_rate": 0.00022039405736076903, + "loss": 0.7719, + "step": 3440 + }, + { + "epoch": 0.82, + "learning_rate": 0.00021991737506951614, + "loss": 0.767, + "step": 3460 + }, + { + "epoch": 0.82, + "learning_rate": 0.00021944069277826328, + "loss": 0.7758, + "step": 3480 + }, + { + "epoch": 0.83, + "learning_rate": 0.0002189640104870104, + "loss": 0.7768, + "step": 3500 + }, + { + "epoch": 0.83, + "learning_rate": 0.00021848732819575753, + "loss": 0.7641, + "step": 3520 + }, + { + "epoch": 0.84, + "learning_rate": 0.00021801064590450464, + "loss": 0.7694, + "step": 3540 + }, + { + "epoch": 0.84, + "learning_rate": 0.00021753396361325175, + "loss": 0.7835, + "step": 3560 + }, + { + "epoch": 0.85, + "learning_rate": 0.00021705728132199886, + "loss": 0.7642, + "step": 3580 + }, + { + "epoch": 0.85, + "learning_rate": 0.00021658059903074598, + "loss": 0.7719, + "step": 3600 + }, + { + "epoch": 0.85, + "eval_loss": 0.7660636305809021, + "eval_runtime": 19.5996, + "eval_samples_per_second": 102.043, + "eval_steps_per_second": 3.214, + "step": 3600 + }, + { + "epoch": 0.86, + "learning_rate": 0.0002161039167394931, + "loss": 0.7723, + "step": 3620 + }, + { + "epoch": 0.86, + "learning_rate": 0.00021562723444824023, + "loss": 0.76, + "step": 3640 + }, + { + "epoch": 0.87, + "learning_rate": 0.00021515055215698734, + "loss": 0.7643, + "step": 3660 + }, + { + "epoch": 0.87, + "learning_rate": 0.00021467386986573448, + "loss": 0.7599, + "step": 3680 + }, + { + "epoch": 0.87, + "learning_rate": 0.0002141971875744816, + "loss": 0.7623, + "step": 3700 + }, + { + "epoch": 0.88, + "learning_rate": 0.0002137205052832287, + "loss": 0.7621, + "step": 3720 + }, + { + "epoch": 0.88, + "learning_rate": 0.00021324382299197584, + "loss": 0.7691, + "step": 3740 + }, + { + "epoch": 0.89, + "learning_rate": 0.00021276714070072295, + "loss": 0.7665, + "step": 3760 + }, + { + "epoch": 0.89, + "learning_rate": 0.00021229045840947007, + "loss": 0.7742, + "step": 3780 + }, + { + "epoch": 0.9, + "learning_rate": 0.00021181377611821718, + "loss": 0.7624, + "step": 3800 + }, + { + "epoch": 0.9, + "eval_loss": 0.7643172740936279, + "eval_runtime": 19.487, + "eval_samples_per_second": 102.633, + "eval_steps_per_second": 3.233, + "step": 3800 + }, + { + "epoch": 0.9, + "learning_rate": 0.0002113370938269643, + "loss": 0.7726, + "step": 3820 + }, + { + "epoch": 0.91, + "learning_rate": 0.0002108604115357114, + "loss": 0.7559, + "step": 3840 + }, + { + "epoch": 0.91, + "learning_rate": 0.00021038372924445857, + "loss": 0.7634, + "step": 3860 + }, + { + "epoch": 0.92, + "learning_rate": 0.00020990704695320568, + "loss": 0.765, + "step": 3880 + }, + { + "epoch": 0.92, + "learning_rate": 0.0002094303646619528, + "loss": 0.7649, + "step": 3900 + }, + { + "epoch": 0.93, + "learning_rate": 0.0002089536823706999, + "loss": 0.763, + "step": 3920 + }, + { + "epoch": 0.93, + "learning_rate": 0.00020847700007944705, + "loss": 0.7679, + "step": 3940 + }, + { + "epoch": 0.94, + "learning_rate": 0.00020800031778819416, + "loss": 0.7644, + "step": 3960 + }, + { + "epoch": 0.94, + "learning_rate": 0.00020752363549694127, + "loss": 0.7655, + "step": 3980 + }, + { + "epoch": 0.95, + "learning_rate": 0.00020704695320568838, + "loss": 0.7681, + "step": 4000 + }, + { + "epoch": 0.95, + "eval_loss": 0.7610963582992554, + "eval_runtime": 19.5269, + "eval_samples_per_second": 102.423, + "eval_steps_per_second": 3.226, + "step": 4000 + }, + { + "epoch": 0.95, + "learning_rate": 0.0002065702709144355, + "loss": 0.7623, + "step": 4020 + }, + { + "epoch": 0.96, + "learning_rate": 0.0002060935886231826, + "loss": 0.7625, + "step": 4040 + }, + { + "epoch": 0.96, + "learning_rate": 0.00020561690633192977, + "loss": 0.7524, + "step": 4060 + }, + { + "epoch": 0.96, + "learning_rate": 0.00020514022404067688, + "loss": 0.764, + "step": 4080 + }, + { + "epoch": 0.97, + "learning_rate": 0.000204663541749424, + "loss": 0.7513, + "step": 4100 + }, + { + "epoch": 0.97, + "learning_rate": 0.0002041868594581711, + "loss": 0.753, + "step": 4120 + }, + { + "epoch": 0.98, + "learning_rate": 0.00020371017716691822, + "loss": 0.7602, + "step": 4140 + }, + { + "epoch": 0.98, + "learning_rate": 0.00020323349487566536, + "loss": 0.7701, + "step": 4160 + }, + { + "epoch": 0.99, + "learning_rate": 0.00020275681258441247, + "loss": 0.7602, + "step": 4180 + }, + { + "epoch": 0.99, + "learning_rate": 0.00020228013029315958, + "loss": 0.7598, + "step": 4200 + }, + { + "epoch": 0.99, + "eval_loss": 0.760128915309906, + "eval_runtime": 19.4387, + "eval_samples_per_second": 102.888, + "eval_steps_per_second": 3.241, + "step": 4200 + }, + { + "epoch": 1.0, + "learning_rate": 0.0002018034480019067, + "loss": 0.7579, + "step": 4220 + }, + { + "epoch": 1.0, + "learning_rate": 0.00020132676571065384, + "loss": 0.7628, + "step": 4240 + }, + { + "epoch": 1.01, + "learning_rate": 0.00020085008341940097, + "loss": 0.7551, + "step": 4260 + }, + { + "epoch": 1.01, + "learning_rate": 0.0002003734011281481, + "loss": 0.7582, + "step": 4280 + }, + { + "epoch": 1.02, + "learning_rate": 0.0001998967188368952, + "loss": 0.7623, + "step": 4300 + }, + { + "epoch": 1.02, + "learning_rate": 0.0001994200365456423, + "loss": 0.7504, + "step": 4320 + }, + { + "epoch": 1.03, + "learning_rate": 0.00019894335425438942, + "loss": 0.7587, + "step": 4340 + }, + { + "epoch": 1.03, + "learning_rate": 0.00019846667196313654, + "loss": 0.7528, + "step": 4360 + }, + { + "epoch": 1.04, + "learning_rate": 0.00019798998967188367, + "loss": 0.754, + "step": 4380 + }, + { + "epoch": 1.04, + "learning_rate": 0.00019751330738063079, + "loss": 0.759, + "step": 4400 + }, + { + "epoch": 1.04, + "eval_loss": 0.7575392127037048, + "eval_runtime": 19.5275, + "eval_samples_per_second": 102.42, + "eval_steps_per_second": 3.226, + "step": 4400 + }, + { + "epoch": 1.05, + "learning_rate": 0.0001970366250893779, + "loss": 0.7592, + "step": 4420 + }, + { + "epoch": 1.05, + "learning_rate": 0.00019655994279812504, + "loss": 0.7548, + "step": 4440 + }, + { + "epoch": 1.05, + "learning_rate": 0.00019608326050687218, + "loss": 0.7632, + "step": 4460 + }, + { + "epoch": 1.06, + "learning_rate": 0.0001956065782156193, + "loss": 0.7472, + "step": 4480 + }, + { + "epoch": 1.06, + "learning_rate": 0.0001951298959243664, + "loss": 0.7496, + "step": 4500 + }, + { + "epoch": 1.07, + "learning_rate": 0.0001946532136331135, + "loss": 0.7549, + "step": 4520 + }, + { + "epoch": 1.07, + "learning_rate": 0.00019417653134186063, + "loss": 0.77, + "step": 4540 + }, + { + "epoch": 1.08, + "learning_rate": 0.00019369984905060774, + "loss": 0.759, + "step": 4560 + }, + { + "epoch": 1.08, + "learning_rate": 0.00019322316675935488, + "loss": 0.7554, + "step": 4580 + }, + { + "epoch": 1.09, + "learning_rate": 0.000192746484468102, + "loss": 0.7577, + "step": 4600 + }, + { + "epoch": 1.09, + "eval_loss": 0.7568497061729431, + "eval_runtime": 19.53, + "eval_samples_per_second": 102.406, + "eval_steps_per_second": 3.226, + "step": 4600 + }, + { + "epoch": 1.09, + "learning_rate": 0.0001922698021768491, + "loss": 0.7617, + "step": 4620 + }, + { + "epoch": 1.1, + "learning_rate": 0.00019179311988559624, + "loss": 0.7551, + "step": 4640 + }, + { + "epoch": 1.1, + "learning_rate": 0.00019131643759434335, + "loss": 0.7482, + "step": 4660 + }, + { + "epoch": 1.11, + "learning_rate": 0.0001908397553030905, + "loss": 0.7516, + "step": 4680 + }, + { + "epoch": 1.11, + "learning_rate": 0.0001903630730118376, + "loss": 0.7555, + "step": 4700 + }, + { + "epoch": 1.12, + "learning_rate": 0.00018988639072058472, + "loss": 0.7605, + "step": 4720 + }, + { + "epoch": 1.12, + "learning_rate": 0.00018940970842933183, + "loss": 0.7506, + "step": 4740 + }, + { + "epoch": 1.13, + "learning_rate": 0.00018893302613807894, + "loss": 0.7622, + "step": 4760 + }, + { + "epoch": 1.13, + "learning_rate": 0.00018845634384682605, + "loss": 0.75, + "step": 4780 + }, + { + "epoch": 1.13, + "learning_rate": 0.0001879796615555732, + "loss": 0.7572, + "step": 4800 + }, + { + "epoch": 1.13, + "eval_loss": 0.7548028826713562, + "eval_runtime": 19.5411, + "eval_samples_per_second": 102.349, + "eval_steps_per_second": 3.224, + "step": 4800 + }, + { + "epoch": 1.14, + "learning_rate": 0.00018750297926432033, + "loss": 0.7427, + "step": 4820 + }, + { + "epoch": 1.14, + "learning_rate": 0.00018702629697306744, + "loss": 0.7489, + "step": 4840 + }, + { + "epoch": 1.15, + "learning_rate": 0.00018654961468181455, + "loss": 0.755, + "step": 4860 + }, + { + "epoch": 1.15, + "learning_rate": 0.00018607293239056167, + "loss": 0.7517, + "step": 4880 + }, + { + "epoch": 1.16, + "learning_rate": 0.0001855962500993088, + "loss": 0.7529, + "step": 4900 + }, + { + "epoch": 1.16, + "learning_rate": 0.00018511956780805592, + "loss": 0.7498, + "step": 4920 + }, + { + "epoch": 1.17, + "learning_rate": 0.00018464288551680303, + "loss": 0.756, + "step": 4940 + }, + { + "epoch": 1.17, + "learning_rate": 0.00018416620322555014, + "loss": 0.7492, + "step": 4960 + }, + { + "epoch": 1.18, + "learning_rate": 0.00018368952093429725, + "loss": 0.7491, + "step": 4980 + }, + { + "epoch": 1.18, + "learning_rate": 0.00018321283864304437, + "loss": 0.7585, + "step": 5000 + }, + { + "epoch": 1.18, + "eval_loss": 0.7538104057312012, + "eval_runtime": 19.6106, + "eval_samples_per_second": 101.986, + "eval_steps_per_second": 3.213, + "step": 5000 + }, + { + "epoch": 1.19, + "learning_rate": 0.00018273615635179153, + "loss": 0.7531, + "step": 5020 + }, + { + "epoch": 1.19, + "learning_rate": 0.00018225947406053865, + "loss": 0.7511, + "step": 5040 + }, + { + "epoch": 1.2, + "learning_rate": 0.00018178279176928576, + "loss": 0.7541, + "step": 5060 + }, + { + "epoch": 1.2, + "learning_rate": 0.00018130610947803287, + "loss": 0.7465, + "step": 5080 + }, + { + "epoch": 1.21, + "learning_rate": 0.00018082942718678, + "loss": 0.7403, + "step": 5100 + }, + { + "epoch": 1.21, + "learning_rate": 0.00018035274489552712, + "loss": 0.749, + "step": 5120 + }, + { + "epoch": 1.22, + "learning_rate": 0.00017987606260427423, + "loss": 0.7548, + "step": 5140 + }, + { + "epoch": 1.22, + "learning_rate": 0.00017939938031302134, + "loss": 0.7443, + "step": 5160 + }, + { + "epoch": 1.22, + "learning_rate": 0.00017892269802176846, + "loss": 0.7461, + "step": 5180 + }, + { + "epoch": 1.23, + "learning_rate": 0.00017844601573051557, + "loss": 0.7511, + "step": 5200 + }, + { + "epoch": 1.23, + "eval_loss": 0.7509217262268066, + "eval_runtime": 19.5437, + "eval_samples_per_second": 102.335, + "eval_steps_per_second": 3.224, + "step": 5200 + }, + { + "epoch": 1.23, + "learning_rate": 0.00017796933343926274, + "loss": 0.7562, + "step": 5220 + }, + { + "epoch": 1.24, + "learning_rate": 0.00017749265114800985, + "loss": 0.7489, + "step": 5240 + }, + { + "epoch": 1.24, + "learning_rate": 0.00017701596885675696, + "loss": 0.7499, + "step": 5260 + }, + { + "epoch": 1.25, + "learning_rate": 0.00017653928656550407, + "loss": 0.7519, + "step": 5280 + }, + { + "epoch": 1.25, + "learning_rate": 0.00017606260427425118, + "loss": 0.7536, + "step": 5300 + }, + { + "epoch": 1.26, + "learning_rate": 0.00017558592198299832, + "loss": 0.7536, + "step": 5320 + }, + { + "epoch": 1.26, + "learning_rate": 0.00017510923969174544, + "loss": 0.7492, + "step": 5340 + }, + { + "epoch": 1.27, + "learning_rate": 0.00017463255740049255, + "loss": 0.7454, + "step": 5360 + }, + { + "epoch": 1.27, + "learning_rate": 0.00017415587510923966, + "loss": 0.7528, + "step": 5380 + }, + { + "epoch": 1.28, + "learning_rate": 0.0001736791928179868, + "loss": 0.7409, + "step": 5400 + }, + { + "epoch": 1.28, + "eval_loss": 0.7497395873069763, + "eval_runtime": 19.5671, + "eval_samples_per_second": 102.212, + "eval_steps_per_second": 3.22, + "step": 5400 + }, + { + "epoch": 1.28, + "learning_rate": 0.00017320251052673394, + "loss": 0.7434, + "step": 5420 + }, + { + "epoch": 1.29, + "learning_rate": 0.00017272582823548105, + "loss": 0.7543, + "step": 5440 + }, + { + "epoch": 1.29, + "learning_rate": 0.00017224914594422816, + "loss": 0.7457, + "step": 5460 + }, + { + "epoch": 1.3, + "learning_rate": 0.00017177246365297527, + "loss": 0.7439, + "step": 5480 + }, + { + "epoch": 1.3, + "learning_rate": 0.0001712957813617224, + "loss": 0.7412, + "step": 5500 + }, + { + "epoch": 1.31, + "learning_rate": 0.0001708190990704695, + "loss": 0.7409, + "step": 5520 + }, + { + "epoch": 1.31, + "learning_rate": 0.00017034241677921664, + "loss": 0.7473, + "step": 5540 + }, + { + "epoch": 1.31, + "learning_rate": 0.00016986573448796375, + "loss": 0.7486, + "step": 5560 + }, + { + "epoch": 1.32, + "learning_rate": 0.00016938905219671086, + "loss": 0.7439, + "step": 5580 + }, + { + "epoch": 1.32, + "learning_rate": 0.000168912369905458, + "loss": 0.7524, + "step": 5600 + }, + { + "epoch": 1.32, + "eval_loss": 0.7480019330978394, + "eval_runtime": 19.5018, + "eval_samples_per_second": 102.555, + "eval_steps_per_second": 3.23, + "step": 5600 + }, + { + "epoch": 1.33, + "learning_rate": 0.00016843568761420514, + "loss": 0.7464, + "step": 5620 + }, + { + "epoch": 1.33, + "learning_rate": 0.00016795900532295225, + "loss": 0.7511, + "step": 5640 + }, + { + "epoch": 1.34, + "learning_rate": 0.00016748232303169936, + "loss": 0.7423, + "step": 5660 + }, + { + "epoch": 1.34, + "learning_rate": 0.00016700564074044648, + "loss": 0.7422, + "step": 5680 + }, + { + "epoch": 1.35, + "learning_rate": 0.0001665289584491936, + "loss": 0.742, + "step": 5700 + }, + { + "epoch": 1.35, + "learning_rate": 0.0001660522761579407, + "loss": 0.7421, + "step": 5720 + }, + { + "epoch": 1.36, + "learning_rate": 0.00016557559386668784, + "loss": 0.749, + "step": 5740 + }, + { + "epoch": 1.36, + "learning_rate": 0.00016509891157543495, + "loss": 0.7432, + "step": 5760 + }, + { + "epoch": 1.37, + "learning_rate": 0.0001646222292841821, + "loss": 0.7426, + "step": 5780 + }, + { + "epoch": 1.37, + "learning_rate": 0.0001641455469929292, + "loss": 0.7543, + "step": 5800 + }, + { + "epoch": 1.37, + "eval_loss": 0.7470090389251709, + "eval_runtime": 19.5563, + "eval_samples_per_second": 102.269, + "eval_steps_per_second": 3.221, + "step": 5800 + }, + { + "epoch": 1.38, + "learning_rate": 0.00016366886470167632, + "loss": 0.7451, + "step": 5820 + }, + { + "epoch": 1.38, + "learning_rate": 0.00016319218241042346, + "loss": 0.7481, + "step": 5840 + }, + { + "epoch": 1.39, + "learning_rate": 0.00016271550011917057, + "loss": 0.7381, + "step": 5860 + }, + { + "epoch": 1.39, + "learning_rate": 0.00016223881782791768, + "loss": 0.7461, + "step": 5880 + }, + { + "epoch": 1.4, + "learning_rate": 0.0001617621355366648, + "loss": 0.7467, + "step": 5900 + }, + { + "epoch": 1.4, + "learning_rate": 0.0001612854532454119, + "loss": 0.745, + "step": 5920 + }, + { + "epoch": 1.4, + "learning_rate": 0.00016080877095415902, + "loss": 0.745, + "step": 5940 + }, + { + "epoch": 1.41, + "learning_rate": 0.00016033208866290615, + "loss": 0.7386, + "step": 5960 + }, + { + "epoch": 1.41, + "learning_rate": 0.0001598554063716533, + "loss": 0.7363, + "step": 5980 + }, + { + "epoch": 1.42, + "learning_rate": 0.0001593787240804004, + "loss": 0.7412, + "step": 6000 + }, + { + "epoch": 1.42, + "eval_loss": 0.7454522848129272, + "eval_runtime": 19.555, + "eval_samples_per_second": 102.276, + "eval_steps_per_second": 3.222, + "step": 6000 + }, + { + "epoch": 1.42, + "learning_rate": 0.00015890204178914752, + "loss": 0.7501, + "step": 6020 + }, + { + "epoch": 1.43, + "learning_rate": 0.00015842535949789463, + "loss": 0.7528, + "step": 6040 + }, + { + "epoch": 1.43, + "learning_rate": 0.00015794867720664177, + "loss": 0.7373, + "step": 6060 + }, + { + "epoch": 1.44, + "learning_rate": 0.00015747199491538888, + "loss": 0.7451, + "step": 6080 + }, + { + "epoch": 1.44, + "learning_rate": 0.000156995312624136, + "loss": 0.7384, + "step": 6100 + }, + { + "epoch": 1.45, + "learning_rate": 0.0001565186303328831, + "loss": 0.7471, + "step": 6120 + }, + { + "epoch": 1.45, + "learning_rate": 0.00015604194804163022, + "loss": 0.7454, + "step": 6140 + }, + { + "epoch": 1.46, + "learning_rate": 0.00015556526575037733, + "loss": 0.7415, + "step": 6160 + }, + { + "epoch": 1.46, + "learning_rate": 0.0001550885834591245, + "loss": 0.7514, + "step": 6180 + }, + { + "epoch": 1.47, + "learning_rate": 0.0001546119011678716, + "loss": 0.7343, + "step": 6200 + }, + { + "epoch": 1.47, + "eval_loss": 0.7457332611083984, + "eval_runtime": 19.5673, + "eval_samples_per_second": 102.212, + "eval_steps_per_second": 3.22, + "step": 6200 + }, + { + "epoch": 1.47, + "learning_rate": 0.00015413521887661872, + "loss": 0.7452, + "step": 6220 + }, + { + "epoch": 1.48, + "learning_rate": 0.00015365853658536583, + "loss": 0.7456, + "step": 6240 + }, + { + "epoch": 1.48, + "learning_rate": 0.00015318185429411297, + "loss": 0.7326, + "step": 6260 + }, + { + "epoch": 1.48, + "learning_rate": 0.00015270517200286008, + "loss": 0.7431, + "step": 6280 + }, + { + "epoch": 1.49, + "learning_rate": 0.0001522284897116072, + "loss": 0.7419, + "step": 6300 + }, + { + "epoch": 1.49, + "learning_rate": 0.0001517518074203543, + "loss": 0.7375, + "step": 6320 + }, + { + "epoch": 1.5, + "learning_rate": 0.00015127512512910142, + "loss": 0.7419, + "step": 6340 + }, + { + "epoch": 1.5, + "learning_rate": 0.0001507984428378486, + "loss": 0.7431, + "step": 6360 + }, + { + "epoch": 1.51, + "learning_rate": 0.0001503217605465957, + "loss": 0.7412, + "step": 6380 + }, + { + "epoch": 1.51, + "learning_rate": 0.00014984507825534278, + "loss": 0.7447, + "step": 6400 + }, + { + "epoch": 1.51, + "eval_loss": 0.7441338896751404, + "eval_runtime": 19.4509, + "eval_samples_per_second": 102.823, + "eval_steps_per_second": 3.239, + "step": 6400 + }, + { + "epoch": 1.52, + "learning_rate": 0.00014936839596408992, + "loss": 0.7436, + "step": 6420 + }, + { + "epoch": 1.52, + "learning_rate": 0.00014889171367283704, + "loss": 0.7402, + "step": 6440 + }, + { + "epoch": 1.53, + "learning_rate": 0.00014841503138158415, + "loss": 0.7454, + "step": 6460 + }, + { + "epoch": 1.53, + "learning_rate": 0.0001479383490903313, + "loss": 0.738, + "step": 6480 + }, + { + "epoch": 1.54, + "learning_rate": 0.0001474616667990784, + "loss": 0.7396, + "step": 6500 + }, + { + "epoch": 1.54, + "learning_rate": 0.00014698498450782554, + "loss": 0.7333, + "step": 6520 + }, + { + "epoch": 1.55, + "learning_rate": 0.00014650830221657265, + "loss": 0.7482, + "step": 6540 + }, + { + "epoch": 1.55, + "learning_rate": 0.00014603161992531976, + "loss": 0.7376, + "step": 6560 + }, + { + "epoch": 1.56, + "learning_rate": 0.00014555493763406687, + "loss": 0.7369, + "step": 6580 + }, + { + "epoch": 1.56, + "learning_rate": 0.00014507825534281401, + "loss": 0.7347, + "step": 6600 + }, + { + "epoch": 1.56, + "eval_loss": 0.7425362467765808, + "eval_runtime": 19.5248, + "eval_samples_per_second": 102.434, + "eval_steps_per_second": 3.227, + "step": 6600 + }, + { + "epoch": 1.57, + "learning_rate": 0.00014460157305156113, + "loss": 0.7446, + "step": 6620 + }, + { + "epoch": 1.57, + "learning_rate": 0.00014412489076030824, + "loss": 0.7343, + "step": 6640 + }, + { + "epoch": 1.57, + "learning_rate": 0.00014364820846905535, + "loss": 0.7468, + "step": 6660 + }, + { + "epoch": 1.58, + "learning_rate": 0.0001431715261778025, + "loss": 0.749, + "step": 6680 + }, + { + "epoch": 1.58, + "learning_rate": 0.0001426948438865496, + "loss": 0.7401, + "step": 6700 + }, + { + "epoch": 1.59, + "learning_rate": 0.0001422181615952967, + "loss": 0.7364, + "step": 6720 + }, + { + "epoch": 1.59, + "learning_rate": 0.00014174147930404385, + "loss": 0.7442, + "step": 6740 + }, + { + "epoch": 1.6, + "learning_rate": 0.00014126479701279096, + "loss": 0.7385, + "step": 6760 + }, + { + "epoch": 1.6, + "learning_rate": 0.00014078811472153808, + "loss": 0.7412, + "step": 6780 + }, + { + "epoch": 1.61, + "learning_rate": 0.00014031143243028522, + "loss": 0.7377, + "step": 6800 + }, + { + "epoch": 1.61, + "eval_loss": 0.7418386936187744, + "eval_runtime": 19.5679, + "eval_samples_per_second": 102.208, + "eval_steps_per_second": 3.22, + "step": 6800 + }, + { + "epoch": 1.61, + "learning_rate": 0.00013983475013903233, + "loss": 0.7432, + "step": 6820 + }, + { + "epoch": 1.62, + "learning_rate": 0.00013935806784777944, + "loss": 0.7379, + "step": 6840 + }, + { + "epoch": 1.62, + "learning_rate": 0.00013888138555652655, + "loss": 0.7346, + "step": 6860 + }, + { + "epoch": 1.63, + "learning_rate": 0.00013840470326527366, + "loss": 0.7373, + "step": 6880 + }, + { + "epoch": 1.63, + "learning_rate": 0.0001379280209740208, + "loss": 0.7403, + "step": 6900 + }, + { + "epoch": 1.64, + "learning_rate": 0.00013745133868276792, + "loss": 0.7477, + "step": 6920 + }, + { + "epoch": 1.64, + "learning_rate": 0.00013697465639151506, + "loss": 0.7343, + "step": 6940 + }, + { + "epoch": 1.65, + "learning_rate": 0.00013649797410026217, + "loss": 0.7419, + "step": 6960 + }, + { + "epoch": 1.65, + "learning_rate": 0.00013602129180900928, + "loss": 0.7327, + "step": 6980 + }, + { + "epoch": 1.66, + "learning_rate": 0.00013554460951775642, + "loss": 0.7398, + "step": 7000 + }, + { + "epoch": 1.66, + "eval_loss": 0.7402775883674622, + "eval_runtime": 19.5554, + "eval_samples_per_second": 102.274, + "eval_steps_per_second": 3.222, + "step": 7000 + }, + { + "epoch": 1.66, + "learning_rate": 0.00013506792722650353, + "loss": 0.7311, + "step": 7020 + }, + { + "epoch": 1.66, + "learning_rate": 0.00013459124493525064, + "loss": 0.7319, + "step": 7040 + }, + { + "epoch": 1.67, + "learning_rate": 0.00013411456264399775, + "loss": 0.7315, + "step": 7060 + }, + { + "epoch": 1.67, + "learning_rate": 0.0001336378803527449, + "loss": 0.7329, + "step": 7080 + }, + { + "epoch": 1.68, + "learning_rate": 0.000133161198061492, + "loss": 0.7471, + "step": 7100 + }, + { + "epoch": 1.68, + "learning_rate": 0.00013268451577023912, + "loss": 0.7446, + "step": 7120 + }, + { + "epoch": 1.69, + "learning_rate": 0.00013220783347898623, + "loss": 0.7359, + "step": 7140 + }, + { + "epoch": 1.69, + "learning_rate": 0.00013173115118773337, + "loss": 0.7348, + "step": 7160 + }, + { + "epoch": 1.7, + "learning_rate": 0.00013125446889648048, + "loss": 0.7331, + "step": 7180 + }, + { + "epoch": 1.7, + "learning_rate": 0.00013077778660522762, + "loss": 0.7385, + "step": 7200 + }, + { + "epoch": 1.7, + "eval_loss": 0.7401012182235718, + "eval_runtime": 19.7831, + "eval_samples_per_second": 101.096, + "eval_steps_per_second": 3.185, + "step": 7200 + }, + { + "epoch": 1.71, + "learning_rate": 0.00013030110431397473, + "loss": 0.744, + "step": 7220 + }, + { + "epoch": 1.71, + "learning_rate": 0.00012982442202272185, + "loss": 0.7327, + "step": 7240 + }, + { + "epoch": 1.72, + "learning_rate": 0.00012934773973146896, + "loss": 0.7384, + "step": 7260 + }, + { + "epoch": 1.72, + "learning_rate": 0.0001288710574402161, + "loss": 0.7399, + "step": 7280 + }, + { + "epoch": 1.73, + "learning_rate": 0.0001283943751489632, + "loss": 0.7376, + "step": 7300 + }, + { + "epoch": 1.73, + "learning_rate": 0.00012791769285771032, + "loss": 0.7416, + "step": 7320 + }, + { + "epoch": 1.74, + "learning_rate": 0.00012744101056645743, + "loss": 0.7299, + "step": 7340 + }, + { + "epoch": 1.74, + "learning_rate": 0.00012696432827520455, + "loss": 0.7389, + "step": 7360 + }, + { + "epoch": 1.75, + "learning_rate": 0.00012648764598395168, + "loss": 0.7295, + "step": 7380 + }, + { + "epoch": 1.75, + "learning_rate": 0.0001260109636926988, + "loss": 0.7389, + "step": 7400 + }, + { + "epoch": 1.75, + "eval_loss": 0.7385362386703491, + "eval_runtime": 19.6728, + "eval_samples_per_second": 101.663, + "eval_steps_per_second": 3.202, + "step": 7400 + }, + { + "epoch": 1.75, + "learning_rate": 0.00012553428140144594, + "loss": 0.7346, + "step": 7420 + }, + { + "epoch": 1.76, + "learning_rate": 0.00012505759911019305, + "loss": 0.7357, + "step": 7440 + }, + { + "epoch": 1.76, + "learning_rate": 0.00012458091681894016, + "loss": 0.7295, + "step": 7460 + }, + { + "epoch": 1.77, + "learning_rate": 0.0001241042345276873, + "loss": 0.7418, + "step": 7480 + }, + { + "epoch": 1.77, + "learning_rate": 0.0001236275522364344, + "loss": 0.7248, + "step": 7500 + }, + { + "epoch": 1.78, + "learning_rate": 0.00012315086994518152, + "loss": 0.7326, + "step": 7520 + }, + { + "epoch": 1.78, + "learning_rate": 0.00012267418765392864, + "loss": 0.7422, + "step": 7540 + }, + { + "epoch": 1.79, + "learning_rate": 0.00012219750536267577, + "loss": 0.7376, + "step": 7560 + }, + { + "epoch": 1.79, + "learning_rate": 0.00012172082307142289, + "loss": 0.7358, + "step": 7580 + }, + { + "epoch": 1.8, + "learning_rate": 0.00012124414078017001, + "loss": 0.7337, + "step": 7600 + }, + { + "epoch": 1.8, + "eval_loss": 0.737734854221344, + "eval_runtime": 19.8317, + "eval_samples_per_second": 100.849, + "eval_steps_per_second": 3.177, + "step": 7600 + }, + { + "epoch": 1.8, + "learning_rate": 0.00012076745848891712, + "loss": 0.7318, + "step": 7620 + }, + { + "epoch": 1.81, + "learning_rate": 0.00012029077619766424, + "loss": 0.7356, + "step": 7640 + }, + { + "epoch": 1.81, + "learning_rate": 0.00011981409390641138, + "loss": 0.7355, + "step": 7660 + }, + { + "epoch": 1.82, + "learning_rate": 0.00011933741161515849, + "loss": 0.74, + "step": 7680 + }, + { + "epoch": 1.82, + "learning_rate": 0.0001188607293239056, + "loss": 0.7342, + "step": 7700 + }, + { + "epoch": 1.83, + "learning_rate": 0.00011838404703265273, + "loss": 0.7368, + "step": 7720 + }, + { + "epoch": 1.83, + "learning_rate": 0.00011790736474139984, + "loss": 0.7337, + "step": 7740 + }, + { + "epoch": 1.83, + "learning_rate": 0.00011743068245014698, + "loss": 0.7317, + "step": 7760 + }, + { + "epoch": 1.84, + "learning_rate": 0.00011695400015889409, + "loss": 0.738, + "step": 7780 + }, + { + "epoch": 1.84, + "learning_rate": 0.0001164773178676412, + "loss": 0.7375, + "step": 7800 + }, + { + "epoch": 1.84, + "eval_loss": 0.7366506457328796, + "eval_runtime": 19.9586, + "eval_samples_per_second": 100.208, + "eval_steps_per_second": 3.157, + "step": 7800 + }, + { + "epoch": 1.85, + "learning_rate": 0.00011600063557638833, + "loss": 0.7349, + "step": 7820 + }, + { + "epoch": 1.85, + "learning_rate": 0.00011552395328513544, + "loss": 0.733, + "step": 7840 + }, + { + "epoch": 1.86, + "learning_rate": 0.00011504727099388258, + "loss": 0.7277, + "step": 7860 + }, + { + "epoch": 1.86, + "learning_rate": 0.00011457058870262969, + "loss": 0.7235, + "step": 7880 + }, + { + "epoch": 1.87, + "learning_rate": 0.0001140939064113768, + "loss": 0.7405, + "step": 7900 + }, + { + "epoch": 1.87, + "learning_rate": 0.00011361722412012393, + "loss": 0.7378, + "step": 7920 + }, + { + "epoch": 1.88, + "learning_rate": 0.00011314054182887104, + "loss": 0.7292, + "step": 7940 + }, + { + "epoch": 1.88, + "learning_rate": 0.00011266385953761818, + "loss": 0.7427, + "step": 7960 + }, + { + "epoch": 1.89, + "learning_rate": 0.00011218717724636529, + "loss": 0.7313, + "step": 7980 + }, + { + "epoch": 1.89, + "learning_rate": 0.0001117104949551124, + "loss": 0.7252, + "step": 8000 + }, + { + "epoch": 1.89, + "eval_loss": 0.736083984375, + "eval_runtime": 19.7958, + "eval_samples_per_second": 101.031, + "eval_steps_per_second": 3.182, + "step": 8000 + }, + { + "epoch": 1.9, + "learning_rate": 0.00011123381266385953, + "loss": 0.7268, + "step": 8020 + }, + { + "epoch": 1.9, + "learning_rate": 0.00011075713037260666, + "loss": 0.729, + "step": 8040 + }, + { + "epoch": 1.91, + "learning_rate": 0.00011028044808135377, + "loss": 0.7358, + "step": 8060 + }, + { + "epoch": 1.91, + "learning_rate": 0.00010980376579010089, + "loss": 0.7408, + "step": 8080 + }, + { + "epoch": 1.92, + "learning_rate": 0.000109327083498848, + "loss": 0.73, + "step": 8100 + }, + { + "epoch": 1.92, + "learning_rate": 0.00010887423532215777, + "loss": 0.7298, + "step": 8120 + }, + { + "epoch": 1.92, + "learning_rate": 0.0001083975530309049, + "loss": 0.7324, + "step": 8140 + }, + { + "epoch": 1.93, + "learning_rate": 0.00010792087073965201, + "loss": 0.7296, + "step": 8160 + }, + { + "epoch": 1.93, + "learning_rate": 0.00010744418844839912, + "loss": 0.7346, + "step": 8180 + }, + { + "epoch": 1.94, + "learning_rate": 0.00010696750615714626, + "loss": 0.7281, + "step": 8200 + }, + { + "epoch": 1.94, + "eval_loss": 0.7352190613746643, + "eval_runtime": 19.6635, + "eval_samples_per_second": 101.711, + "eval_steps_per_second": 3.204, + "step": 8200 + }, + { + "epoch": 1.94, + "learning_rate": 0.00010649082386589337, + "loss": 0.7377, + "step": 8220 + }, + { + "epoch": 1.95, + "learning_rate": 0.0001060141415746405, + "loss": 0.7281, + "step": 8240 + }, + { + "epoch": 1.95, + "learning_rate": 0.00010553745928338761, + "loss": 0.7251, + "step": 8260 + }, + { + "epoch": 1.96, + "learning_rate": 0.00010506077699213472, + "loss": 0.7331, + "step": 8280 + }, + { + "epoch": 1.96, + "learning_rate": 0.00010458409470088186, + "loss": 0.7432, + "step": 8300 + }, + { + "epoch": 1.97, + "learning_rate": 0.00010410741240962897, + "loss": 0.7366, + "step": 8320 + }, + { + "epoch": 1.97, + "learning_rate": 0.0001036307301183761, + "loss": 0.7334, + "step": 8340 + }, + { + "epoch": 1.98, + "learning_rate": 0.00010315404782712321, + "loss": 0.7351, + "step": 8360 + }, + { + "epoch": 1.98, + "learning_rate": 0.00010267736553587032, + "loss": 0.7355, + "step": 8380 + }, + { + "epoch": 1.99, + "learning_rate": 0.00010220068324461746, + "loss": 0.7228, + "step": 8400 + }, + { + "epoch": 1.99, + "eval_loss": 0.7341500520706177, + "eval_runtime": 19.6196, + "eval_samples_per_second": 101.939, + "eval_steps_per_second": 3.211, + "step": 8400 + }, + { + "epoch": 1.99, + "learning_rate": 0.00010172400095336457, + "loss": 0.7451, + "step": 8420 + }, + { + "epoch": 2.0, + "learning_rate": 0.00010124731866211169, + "loss": 0.7356, + "step": 8440 + }, + { + "epoch": 2.0, + "learning_rate": 0.00010077063637085881, + "loss": 0.7255, + "step": 8460 + }, + { + "epoch": 2.01, + "learning_rate": 0.00010029395407960592, + "loss": 0.7267, + "step": 8480 + }, + { + "epoch": 2.01, + "learning_rate": 9.981727178835306e-05, + "loss": 0.7291, + "step": 8500 + }, + { + "epoch": 2.01, + "learning_rate": 9.934058949710018e-05, + "loss": 0.7294, + "step": 8520 + }, + { + "epoch": 2.02, + "learning_rate": 9.886390720584729e-05, + "loss": 0.7377, + "step": 8540 + }, + { + "epoch": 2.02, + "learning_rate": 9.838722491459441e-05, + "loss": 0.7324, + "step": 8560 + }, + { + "epoch": 2.03, + "learning_rate": 9.791054262334154e-05, + "loss": 0.7286, + "step": 8580 + }, + { + "epoch": 2.03, + "learning_rate": 9.743386033208867e-05, + "loss": 0.7286, + "step": 8600 + }, + { + "epoch": 2.03, + "eval_loss": 0.734474241733551, + "eval_runtime": 19.5642, + "eval_samples_per_second": 102.228, + "eval_steps_per_second": 3.22, + "step": 8600 + }, + { + "epoch": 2.04, + "learning_rate": 9.695717804083578e-05, + "loss": 0.7304, + "step": 8620 + }, + { + "epoch": 2.04, + "learning_rate": 9.648049574958289e-05, + "loss": 0.7348, + "step": 8640 + }, + { + "epoch": 2.05, + "learning_rate": 9.600381345833002e-05, + "loss": 0.7261, + "step": 8660 + }, + { + "epoch": 2.05, + "learning_rate": 9.552713116707714e-05, + "loss": 0.7313, + "step": 8680 + }, + { + "epoch": 2.06, + "learning_rate": 9.505044887582425e-05, + "loss": 0.7379, + "step": 8700 + }, + { + "epoch": 2.06, + "learning_rate": 9.457376658457138e-05, + "loss": 0.7203, + "step": 8720 + }, + { + "epoch": 2.07, + "learning_rate": 9.409708429331849e-05, + "loss": 0.7306, + "step": 8740 + }, + { + "epoch": 2.07, + "learning_rate": 9.36204020020656e-05, + "loss": 0.7332, + "step": 8760 + }, + { + "epoch": 2.08, + "learning_rate": 9.314371971081274e-05, + "loss": 0.7228, + "step": 8780 + }, + { + "epoch": 2.08, + "learning_rate": 9.266703741955985e-05, + "loss": 0.731, + "step": 8800 + }, + { + "epoch": 2.08, + "eval_loss": 0.7332338690757751, + "eval_runtime": 19.7114, + "eval_samples_per_second": 101.464, + "eval_steps_per_second": 3.196, + "step": 8800 + }, + { + "epoch": 2.09, + "learning_rate": 9.219035512830698e-05, + "loss": 0.7267, + "step": 8820 + }, + { + "epoch": 2.09, + "learning_rate": 9.171367283705409e-05, + "loss": 0.7285, + "step": 8840 + }, + { + "epoch": 2.09, + "learning_rate": 9.12369905458012e-05, + "loss": 0.7214, + "step": 8860 + }, + { + "epoch": 2.1, + "learning_rate": 9.076030825454834e-05, + "loss": 0.7204, + "step": 8880 + }, + { + "epoch": 2.1, + "learning_rate": 9.028362596329546e-05, + "loss": 0.7253, + "step": 8900 + }, + { + "epoch": 2.11, + "learning_rate": 8.980694367204258e-05, + "loss": 0.7253, + "step": 8920 + }, + { + "epoch": 2.11, + "learning_rate": 8.933026138078969e-05, + "loss": 0.7238, + "step": 8940 + }, + { + "epoch": 2.12, + "learning_rate": 8.88535790895368e-05, + "loss": 0.7286, + "step": 8960 + }, + { + "epoch": 2.12, + "learning_rate": 8.837689679828394e-05, + "loss": 0.7385, + "step": 8980 + }, + { + "epoch": 2.13, + "learning_rate": 8.790021450703106e-05, + "loss": 0.7237, + "step": 9000 + }, + { + "epoch": 2.13, + "eval_loss": 0.7329864501953125, + "eval_runtime": 19.7024, + "eval_samples_per_second": 101.51, + "eval_steps_per_second": 3.198, + "step": 9000 + }, + { + "epoch": 2.13, + "learning_rate": 8.742353221577817e-05, + "loss": 0.7311, + "step": 9020 + }, + { + "epoch": 2.14, + "learning_rate": 8.69468499245253e-05, + "loss": 0.7374, + "step": 9040 + }, + { + "epoch": 2.14, + "learning_rate": 8.64701676332724e-05, + "loss": 0.7194, + "step": 9060 + }, + { + "epoch": 2.15, + "learning_rate": 8.599348534201955e-05, + "loss": 0.7237, + "step": 9080 + }, + { + "epoch": 2.15, + "learning_rate": 8.551680305076666e-05, + "loss": 0.7287, + "step": 9100 + }, + { + "epoch": 2.16, + "learning_rate": 8.504012075951377e-05, + "loss": 0.7385, + "step": 9120 + }, + { + "epoch": 2.16, + "learning_rate": 8.45634384682609e-05, + "loss": 0.7319, + "step": 9140 + }, + { + "epoch": 2.17, + "learning_rate": 8.408675617700802e-05, + "loss": 0.7278, + "step": 9160 + }, + { + "epoch": 2.17, + "learning_rate": 8.361007388575515e-05, + "loss": 0.7293, + "step": 9180 + }, + { + "epoch": 2.18, + "learning_rate": 8.313339159450226e-05, + "loss": 0.7232, + "step": 9200 + }, + { + "epoch": 2.18, + "eval_loss": 0.7326176762580872, + "eval_runtime": 20.1581, + "eval_samples_per_second": 99.215, + "eval_steps_per_second": 3.125, + "step": 9200 + }, + { + "epoch": 2.18, + "learning_rate": 8.265670930324937e-05, + "loss": 0.7281, + "step": 9220 + }, + { + "epoch": 2.18, + "learning_rate": 8.21800270119965e-05, + "loss": 0.728, + "step": 9240 + }, + { + "epoch": 2.19, + "learning_rate": 8.170334472074362e-05, + "loss": 0.728, + "step": 9260 + }, + { + "epoch": 2.19, + "learning_rate": 8.122666242949073e-05, + "loss": 0.7221, + "step": 9280 + }, + { + "epoch": 2.2, + "learning_rate": 8.074998013823786e-05, + "loss": 0.7242, + "step": 9300 + }, + { + "epoch": 2.2, + "learning_rate": 8.027329784698497e-05, + "loss": 0.7306, + "step": 9320 + }, + { + "epoch": 2.21, + "learning_rate": 7.979661555573208e-05, + "loss": 0.7218, + "step": 9340 + }, + { + "epoch": 2.21, + "learning_rate": 7.931993326447922e-05, + "loss": 0.7289, + "step": 9360 + }, + { + "epoch": 2.22, + "learning_rate": 7.884325097322634e-05, + "loss": 0.7177, + "step": 9380 + }, + { + "epoch": 2.22, + "learning_rate": 7.836656868197346e-05, + "loss": 0.7265, + "step": 9400 + }, + { + "epoch": 2.22, + "eval_loss": 0.7311453819274902, + "eval_runtime": 19.9076, + "eval_samples_per_second": 100.464, + "eval_steps_per_second": 3.165, + "step": 9400 + }, + { + "epoch": 2.23, + "learning_rate": 7.788988639072057e-05, + "loss": 0.7269, + "step": 9420 + }, + { + "epoch": 2.23, + "learning_rate": 7.741320409946769e-05, + "loss": 0.7275, + "step": 9440 + }, + { + "epoch": 2.24, + "learning_rate": 7.693652180821483e-05, + "loss": 0.7317, + "step": 9460 + }, + { + "epoch": 2.24, + "learning_rate": 7.645983951696194e-05, + "loss": 0.7344, + "step": 9480 + }, + { + "epoch": 2.25, + "learning_rate": 7.598315722570906e-05, + "loss": 0.7263, + "step": 9500 + }, + { + "epoch": 2.25, + "learning_rate": 7.550647493445617e-05, + "loss": 0.7299, + "step": 9520 + }, + { + "epoch": 2.26, + "learning_rate": 7.502979264320329e-05, + "loss": 0.724, + "step": 9540 + }, + { + "epoch": 2.26, + "learning_rate": 7.455311035195041e-05, + "loss": 0.7266, + "step": 9560 + }, + { + "epoch": 2.27, + "learning_rate": 7.407642806069754e-05, + "loss": 0.7299, + "step": 9580 + }, + { + "epoch": 2.27, + "learning_rate": 7.359974576944465e-05, + "loss": 0.7236, + "step": 9600 + }, + { + "epoch": 2.27, + "eval_loss": 0.7311366200447083, + "eval_runtime": 20.0053, + "eval_samples_per_second": 99.973, + "eval_steps_per_second": 3.149, + "step": 9600 + }, + { + "epoch": 2.27, + "learning_rate": 7.314689759275442e-05, + "loss": 0.7252, + "step": 9620 + }, + { + "epoch": 2.28, + "learning_rate": 7.267021530150154e-05, + "loss": 0.7252, + "step": 9640 + }, + { + "epoch": 2.28, + "learning_rate": 7.219353301024865e-05, + "loss": 0.7188, + "step": 9660 + }, + { + "epoch": 2.29, + "learning_rate": 7.171685071899578e-05, + "loss": 0.7243, + "step": 9680 + }, + { + "epoch": 2.29, + "learning_rate": 7.12401684277429e-05, + "loss": 0.7298, + "step": 9700 + }, + { + "epoch": 2.3, + "learning_rate": 7.076348613649002e-05, + "loss": 0.7325, + "step": 9720 + }, + { + "epoch": 2.3, + "learning_rate": 7.028680384523714e-05, + "loss": 0.7286, + "step": 9740 + }, + { + "epoch": 2.31, + "learning_rate": 6.981012155398426e-05, + "loss": 0.7201, + "step": 9760 + }, + { + "epoch": 2.31, + "learning_rate": 6.933343926273138e-05, + "loss": 0.7184, + "step": 9780 + }, + { + "epoch": 2.32, + "learning_rate": 6.885675697147851e-05, + "loss": 0.7291, + "step": 9800 + }, + { + "epoch": 2.32, + "eval_loss": 0.7308618426322937, + "eval_runtime": 19.7965, + "eval_samples_per_second": 101.028, + "eval_steps_per_second": 3.182, + "step": 9800 + }, + { + "epoch": 2.32, + "learning_rate": 6.838007468022563e-05, + "loss": 0.7318, + "step": 9820 + }, + { + "epoch": 2.33, + "learning_rate": 6.790339238897274e-05, + "loss": 0.7227, + "step": 9840 + }, + { + "epoch": 2.33, + "learning_rate": 6.742671009771986e-05, + "loss": 0.7377, + "step": 9860 + }, + { + "epoch": 2.34, + "learning_rate": 6.695002780646698e-05, + "loss": 0.7367, + "step": 9880 + }, + { + "epoch": 2.34, + "learning_rate": 6.647334551521411e-05, + "loss": 0.7218, + "step": 9900 + }, + { + "epoch": 2.35, + "learning_rate": 6.599666322396122e-05, + "loss": 0.7282, + "step": 9920 + }, + { + "epoch": 2.35, + "learning_rate": 6.551998093270835e-05, + "loss": 0.7231, + "step": 9940 + }, + { + "epoch": 2.36, + "learning_rate": 6.504329864145546e-05, + "loss": 0.7257, + "step": 9960 + }, + { + "epoch": 2.36, + "learning_rate": 6.456661635020258e-05, + "loss": 0.7275, + "step": 9980 + }, + { + "epoch": 2.36, + "learning_rate": 6.40899340589497e-05, + "loss": 0.725, + "step": 10000 + }, + { + "epoch": 2.36, + "eval_loss": 0.7301817536354065, + "eval_runtime": 19.7914, + "eval_samples_per_second": 101.054, + "eval_steps_per_second": 3.183, + "step": 10000 + }, + { + "epoch": 2.37, + "learning_rate": 6.361325176769682e-05, + "loss": 0.72, + "step": 10020 + }, + { + "epoch": 2.37, + "learning_rate": 6.313656947644395e-05, + "loss": 0.7267, + "step": 10040 + }, + { + "epoch": 2.38, + "learning_rate": 6.265988718519107e-05, + "loss": 0.7276, + "step": 10060 + }, + { + "epoch": 2.38, + "learning_rate": 6.218320489393818e-05, + "loss": 0.7262, + "step": 10080 + }, + { + "epoch": 2.39, + "learning_rate": 6.17065226026853e-05, + "loss": 0.7149, + "step": 10100 + }, + { + "epoch": 2.39, + "learning_rate": 6.122984031143242e-05, + "loss": 0.7305, + "step": 10120 + }, + { + "epoch": 2.4, + "learning_rate": 6.075315802017954e-05, + "loss": 0.7314, + "step": 10140 + }, + { + "epoch": 2.4, + "learning_rate": 6.027647572892667e-05, + "loss": 0.7154, + "step": 10160 + }, + { + "epoch": 2.41, + "learning_rate": 5.9799793437673786e-05, + "loss": 0.7263, + "step": 10180 + }, + { + "epoch": 2.41, + "learning_rate": 5.93231111464209e-05, + "loss": 0.7203, + "step": 10200 + }, + { + "epoch": 2.41, + "eval_loss": 0.7294782996177673, + "eval_runtime": 19.7824, + "eval_samples_per_second": 101.1, + "eval_steps_per_second": 3.185, + "step": 10200 + }, + { + "epoch": 2.42, + "learning_rate": 5.8846428855168024e-05, + "loss": 0.7208, + "step": 10220 + }, + { + "epoch": 2.42, + "learning_rate": 5.836974656391514e-05, + "loss": 0.7266, + "step": 10240 + }, + { + "epoch": 2.43, + "learning_rate": 5.789306427266227e-05, + "loss": 0.7285, + "step": 10260 + }, + { + "epoch": 2.43, + "learning_rate": 5.741638198140939e-05, + "loss": 0.7215, + "step": 10280 + }, + { + "epoch": 2.44, + "learning_rate": 5.6939699690156506e-05, + "loss": 0.7203, + "step": 10300 + }, + { + "epoch": 2.44, + "learning_rate": 5.6463017398903625e-05, + "loss": 0.7314, + "step": 10320 + }, + { + "epoch": 2.44, + "learning_rate": 5.5986335107650744e-05, + "loss": 0.7394, + "step": 10340 + }, + { + "epoch": 2.45, + "learning_rate": 5.550965281639787e-05, + "loss": 0.7138, + "step": 10360 + }, + { + "epoch": 2.45, + "learning_rate": 5.503297052514498e-05, + "loss": 0.721, + "step": 10380 + }, + { + "epoch": 2.46, + "learning_rate": 5.455628823389211e-05, + "loss": 0.7199, + "step": 10400 + }, + { + "epoch": 2.46, + "eval_loss": 0.728507936000824, + "eval_runtime": 19.7761, + "eval_samples_per_second": 101.132, + "eval_steps_per_second": 3.186, + "step": 10400 + }, + { + "epoch": 2.46, + "learning_rate": 5.4079605942639226e-05, + "loss": 0.7228, + "step": 10420 + }, + { + "epoch": 2.47, + "learning_rate": 5.3602923651386345e-05, + "loss": 0.7193, + "step": 10440 + }, + { + "epoch": 2.47, + "learning_rate": 5.3126241360133464e-05, + "loss": 0.7269, + "step": 10460 + }, + { + "epoch": 2.48, + "learning_rate": 5.264955906888058e-05, + "loss": 0.729, + "step": 10480 + }, + { + "epoch": 2.48, + "learning_rate": 5.217287677762771e-05, + "loss": 0.7193, + "step": 10500 + }, + { + "epoch": 2.49, + "learning_rate": 5.169619448637483e-05, + "loss": 0.7158, + "step": 10520 + }, + { + "epoch": 2.49, + "learning_rate": 5.121951219512195e-05, + "loss": 0.7158, + "step": 10540 + }, + { + "epoch": 2.5, + "learning_rate": 5.0742829903869065e-05, + "loss": 0.7177, + "step": 10560 + }, + { + "epoch": 2.5, + "learning_rate": 5.0266147612616184e-05, + "loss": 0.7187, + "step": 10580 + }, + { + "epoch": 2.51, + "learning_rate": 4.978946532136331e-05, + "loss": 0.7185, + "step": 10600 + }, + { + "epoch": 2.51, + "eval_loss": 0.7283052802085876, + "eval_runtime": 20.2682, + "eval_samples_per_second": 98.677, + "eval_steps_per_second": 3.108, + "step": 10600 + }, + { + "epoch": 2.51, + "learning_rate": 4.931278303011042e-05, + "loss": 0.7264, + "step": 10620 + }, + { + "epoch": 2.52, + "learning_rate": 4.883610073885755e-05, + "loss": 0.7208, + "step": 10640 + }, + { + "epoch": 2.52, + "learning_rate": 4.835941844760467e-05, + "loss": 0.7275, + "step": 10660 + }, + { + "epoch": 2.53, + "learning_rate": 4.7882736156351786e-05, + "loss": 0.7205, + "step": 10680 + }, + { + "epoch": 2.53, + "learning_rate": 4.740605386509891e-05, + "loss": 0.7213, + "step": 10700 + }, + { + "epoch": 2.53, + "learning_rate": 4.692937157384602e-05, + "loss": 0.7324, + "step": 10720 + }, + { + "epoch": 2.54, + "learning_rate": 4.645268928259315e-05, + "loss": 0.7197, + "step": 10740 + }, + { + "epoch": 2.54, + "learning_rate": 4.597600699134027e-05, + "loss": 0.7162, + "step": 10760 + }, + { + "epoch": 2.55, + "learning_rate": 4.5499324700087394e-05, + "loss": 0.7223, + "step": 10780 + }, + { + "epoch": 2.55, + "learning_rate": 4.5022642408834506e-05, + "loss": 0.7249, + "step": 10800 + }, + { + "epoch": 2.55, + "eval_loss": 0.7278863191604614, + "eval_runtime": 19.7684, + "eval_samples_per_second": 101.171, + "eval_steps_per_second": 3.187, + "step": 10800 + }, + { + "epoch": 2.56, + "learning_rate": 4.4545960117581625e-05, + "loss": 0.7245, + "step": 10820 + }, + { + "epoch": 2.56, + "learning_rate": 4.406927782632875e-05, + "loss": 0.7298, + "step": 10840 + }, + { + "epoch": 2.57, + "learning_rate": 4.359259553507587e-05, + "loss": 0.7172, + "step": 10860 + }, + { + "epoch": 2.57, + "learning_rate": 4.3115913243822995e-05, + "loss": 0.7183, + "step": 10880 + }, + { + "epoch": 2.58, + "learning_rate": 4.263923095257011e-05, + "loss": 0.7172, + "step": 10900 + }, + { + "epoch": 2.58, + "learning_rate": 4.2162548661317226e-05, + "loss": 0.7166, + "step": 10920 + }, + { + "epoch": 2.59, + "learning_rate": 4.168586637006435e-05, + "loss": 0.7303, + "step": 10940 + }, + { + "epoch": 2.59, + "learning_rate": 4.1209184078811464e-05, + "loss": 0.716, + "step": 10960 + }, + { + "epoch": 2.6, + "learning_rate": 4.073250178755859e-05, + "loss": 0.7199, + "step": 10980 + }, + { + "epoch": 2.6, + "learning_rate": 4.025581949630571e-05, + "loss": 0.7227, + "step": 11000 + }, + { + "epoch": 2.6, + "eval_loss": 0.7274474501609802, + "eval_runtime": 19.9546, + "eval_samples_per_second": 100.228, + "eval_steps_per_second": 3.157, + "step": 11000 + }, + { + "epoch": 2.61, + "learning_rate": 3.9779137205052834e-05, + "loss": 0.7134, + "step": 11020 + }, + { + "epoch": 2.61, + "learning_rate": 3.930245491379995e-05, + "loss": 0.7354, + "step": 11040 + }, + { + "epoch": 2.62, + "learning_rate": 3.8825772622547065e-05, + "loss": 0.7269, + "step": 11060 + }, + { + "epoch": 2.62, + "learning_rate": 3.834909033129419e-05, + "loss": 0.7261, + "step": 11080 + }, + { + "epoch": 2.62, + "learning_rate": 3.787240804004131e-05, + "loss": 0.735, + "step": 11100 + }, + { + "epoch": 2.63, + "learning_rate": 3.739572574878843e-05, + "loss": 0.716, + "step": 11120 + }, + { + "epoch": 2.63, + "learning_rate": 3.691904345753555e-05, + "loss": 0.721, + "step": 11140 + }, + { + "epoch": 2.64, + "learning_rate": 3.644236116628267e-05, + "loss": 0.7201, + "step": 11160 + }, + { + "epoch": 2.64, + "learning_rate": 3.596567887502979e-05, + "loss": 0.7231, + "step": 11180 + }, + { + "epoch": 2.65, + "learning_rate": 3.548899658377691e-05, + "loss": 0.7172, + "step": 11200 + }, + { + "epoch": 2.65, + "eval_loss": 0.7270590662956238, + "eval_runtime": 19.753, + "eval_samples_per_second": 101.251, + "eval_steps_per_second": 3.189, + "step": 11200 + }, + { + "epoch": 2.65, + "learning_rate": 3.501231429252403e-05, + "loss": 0.7296, + "step": 11220 + }, + { + "epoch": 2.66, + "learning_rate": 3.453563200127115e-05, + "loss": 0.7239, + "step": 11240 + }, + { + "epoch": 2.66, + "learning_rate": 3.405894971001827e-05, + "loss": 0.7215, + "step": 11260 + }, + { + "epoch": 2.67, + "learning_rate": 3.358226741876539e-05, + "loss": 0.7176, + "step": 11280 + }, + { + "epoch": 2.67, + "learning_rate": 3.310558512751251e-05, + "loss": 0.7277, + "step": 11300 + }, + { + "epoch": 2.68, + "learning_rate": 3.262890283625963e-05, + "loss": 0.7237, + "step": 11320 + }, + { + "epoch": 2.68, + "learning_rate": 3.215222054500675e-05, + "loss": 0.7167, + "step": 11340 + }, + { + "epoch": 2.69, + "learning_rate": 3.167553825375387e-05, + "loss": 0.7184, + "step": 11360 + }, + { + "epoch": 2.69, + "learning_rate": 3.119885596250099e-05, + "loss": 0.7238, + "step": 11380 + }, + { + "epoch": 2.7, + "learning_rate": 3.072217367124811e-05, + "loss": 0.7188, + "step": 11400 + }, + { + "epoch": 2.7, + "eval_loss": 0.7263159155845642, + "eval_runtime": 19.6317, + "eval_samples_per_second": 101.876, + "eval_steps_per_second": 3.209, + "step": 11400 + }, + { + "epoch": 2.7, + "learning_rate": 3.0245491379995232e-05, + "loss": 0.7146, + "step": 11420 + }, + { + "epoch": 2.71, + "learning_rate": 2.9768809088742348e-05, + "loss": 0.7307, + "step": 11440 + }, + { + "epoch": 2.71, + "learning_rate": 2.929212679748947e-05, + "loss": 0.721, + "step": 11460 + }, + { + "epoch": 2.71, + "learning_rate": 2.881544450623659e-05, + "loss": 0.7293, + "step": 11480 + }, + { + "epoch": 2.72, + "learning_rate": 2.833876221498371e-05, + "loss": 0.7245, + "step": 11500 + }, + { + "epoch": 2.72, + "learning_rate": 2.7862079923730833e-05, + "loss": 0.7264, + "step": 11520 + }, + { + "epoch": 2.73, + "learning_rate": 2.7385397632477952e-05, + "loss": 0.722, + "step": 11540 + }, + { + "epoch": 2.73, + "learning_rate": 2.6908715341225068e-05, + "loss": 0.7195, + "step": 11560 + }, + { + "epoch": 2.74, + "learning_rate": 2.643203304997219e-05, + "loss": 0.7181, + "step": 11580 + }, + { + "epoch": 2.74, + "learning_rate": 2.5955350758719312e-05, + "loss": 0.7225, + "step": 11600 + }, + { + "epoch": 2.74, + "eval_loss": 0.7265506386756897, + "eval_runtime": 19.5252, + "eval_samples_per_second": 102.432, + "eval_steps_per_second": 3.227, + "step": 11600 + }, + { + "epoch": 2.75, + "learning_rate": 2.547866846746643e-05, + "loss": 0.7151, + "step": 11620 + }, + { + "epoch": 2.75, + "learning_rate": 2.5001986176213553e-05, + "loss": 0.7211, + "step": 11640 + }, + { + "epoch": 2.76, + "learning_rate": 2.4525303884960672e-05, + "loss": 0.7231, + "step": 11660 + }, + { + "epoch": 2.76, + "learning_rate": 2.404862159370779e-05, + "loss": 0.7236, + "step": 11680 + }, + { + "epoch": 2.77, + "learning_rate": 2.357193930245491e-05, + "loss": 0.7161, + "step": 11700 + }, + { + "epoch": 2.77, + "learning_rate": 2.3095257011202032e-05, + "loss": 0.7248, + "step": 11720 + }, + { + "epoch": 2.78, + "learning_rate": 2.261857471994915e-05, + "loss": 0.7195, + "step": 11740 + }, + { + "epoch": 2.78, + "learning_rate": 2.2141892428696274e-05, + "loss": 0.718, + "step": 11760 + }, + { + "epoch": 2.79, + "learning_rate": 2.1665210137443392e-05, + "loss": 0.7161, + "step": 11780 + }, + { + "epoch": 2.79, + "learning_rate": 2.118852784619051e-05, + "loss": 0.7204, + "step": 11800 + }, + { + "epoch": 2.79, + "eval_loss": 0.7261104583740234, + "eval_runtime": 20.0617, + "eval_samples_per_second": 99.692, + "eval_steps_per_second": 3.14, + "step": 11800 + }, + { + "epoch": 2.79, + "learning_rate": 2.071184555493763e-05, + "loss": 0.716, + "step": 11820 + }, + { + "epoch": 2.8, + "learning_rate": 2.0235163263684753e-05, + "loss": 0.7211, + "step": 11840 + }, + { + "epoch": 2.8, + "learning_rate": 1.975848097243187e-05, + "loss": 0.7242, + "step": 11860 + }, + { + "epoch": 2.81, + "learning_rate": 1.9281798681178994e-05, + "loss": 0.7129, + "step": 11880 + }, + { + "epoch": 2.81, + "learning_rate": 1.8828950504488756e-05, + "loss": 0.7233, + "step": 11900 + }, + { + "epoch": 2.82, + "learning_rate": 1.8352268213235875e-05, + "loss": 0.7286, + "step": 11920 + }, + { + "epoch": 2.82, + "learning_rate": 1.7875585921982997e-05, + "loss": 0.7147, + "step": 11940 + }, + { + "epoch": 2.83, + "learning_rate": 1.7398903630730116e-05, + "loss": 0.7303, + "step": 11960 + }, + { + "epoch": 2.83, + "learning_rate": 1.692222133947724e-05, + "loss": 0.7126, + "step": 11980 + }, + { + "epoch": 2.84, + "learning_rate": 1.6445539048224358e-05, + "loss": 0.7174, + "step": 12000 + }, + { + "epoch": 2.84, + "eval_loss": 0.7259587645530701, + "eval_runtime": 20.6636, + "eval_samples_per_second": 96.788, + "eval_steps_per_second": 3.049, + "step": 12000 + }, + { + "epoch": 2.84, + "learning_rate": 1.5968856756971476e-05, + "loss": 0.7147, + "step": 12020 + }, + { + "epoch": 2.85, + "learning_rate": 1.54921744657186e-05, + "loss": 0.7184, + "step": 12040 + }, + { + "epoch": 2.85, + "learning_rate": 1.5015492174465718e-05, + "loss": 0.7218, + "step": 12060 + }, + { + "epoch": 2.86, + "learning_rate": 1.4538809883212837e-05, + "loss": 0.7172, + "step": 12080 + }, + { + "epoch": 2.86, + "learning_rate": 1.4062127591959957e-05, + "loss": 0.7326, + "step": 12100 + }, + { + "epoch": 2.87, + "learning_rate": 1.3585445300707078e-05, + "loss": 0.726, + "step": 12120 + }, + { + "epoch": 2.87, + "learning_rate": 1.3108763009454197e-05, + "loss": 0.711, + "step": 12140 + }, + { + "epoch": 2.88, + "learning_rate": 1.2632080718201317e-05, + "loss": 0.7199, + "step": 12160 + }, + { + "epoch": 2.88, + "learning_rate": 1.215539842694844e-05, + "loss": 0.7256, + "step": 12180 + }, + { + "epoch": 2.88, + "learning_rate": 1.1678716135695557e-05, + "loss": 0.7183, + "step": 12200 + }, + { + "epoch": 2.88, + "eval_loss": 0.7255927324295044, + "eval_runtime": 20.0566, + "eval_samples_per_second": 99.718, + "eval_steps_per_second": 3.141, + "step": 12200 + }, + { + "epoch": 2.89, + "learning_rate": 1.1202033844442679e-05, + "loss": 0.7244, + "step": 12220 + }, + { + "epoch": 2.89, + "learning_rate": 1.07253515531898e-05, + "loss": 0.717, + "step": 12240 + }, + { + "epoch": 2.9, + "learning_rate": 1.0248669261936918e-05, + "loss": 0.7224, + "step": 12260 + }, + { + "epoch": 2.9, + "learning_rate": 9.771986970684039e-06, + "loss": 0.7124, + "step": 12280 + }, + { + "epoch": 2.91, + "learning_rate": 9.295304679431158e-06, + "loss": 0.7285, + "step": 12300 + }, + { + "epoch": 2.91, + "learning_rate": 8.818622388178278e-06, + "loss": 0.7337, + "step": 12320 + }, + { + "epoch": 2.92, + "learning_rate": 8.341940096925399e-06, + "loss": 0.716, + "step": 12340 + }, + { + "epoch": 2.92, + "learning_rate": 7.865257805672518e-06, + "loss": 0.7212, + "step": 12360 + }, + { + "epoch": 2.93, + "learning_rate": 7.3885755144196385e-06, + "loss": 0.7262, + "step": 12380 + }, + { + "epoch": 2.93, + "learning_rate": 6.911893223166759e-06, + "loss": 0.7151, + "step": 12400 + }, + { + "epoch": 2.93, + "eval_loss": 0.7256051301956177, + "eval_runtime": 19.7012, + "eval_samples_per_second": 101.516, + "eval_steps_per_second": 3.198, + "step": 12400 + }, + { + "epoch": 2.94, + "learning_rate": 6.435210931913879e-06, + "loss": 0.7112, + "step": 12420 + }, + { + "epoch": 2.94, + "learning_rate": 5.958528640660999e-06, + "loss": 0.7162, + "step": 12440 + }, + { + "epoch": 2.95, + "learning_rate": 5.481846349408119e-06, + "loss": 0.724, + "step": 12460 + }, + { + "epoch": 2.95, + "learning_rate": 5.005164058155239e-06, + "loss": 0.7217, + "step": 12480 + }, + { + "epoch": 2.96, + "learning_rate": 4.5284817669023595e-06, + "loss": 0.7226, + "step": 12500 + }, + { + "epoch": 2.96, + "learning_rate": 4.051799475649479e-06, + "loss": 0.7256, + "step": 12520 + }, + { + "epoch": 2.97, + "learning_rate": 3.5751171843965994e-06, + "loss": 0.733, + "step": 12540 + }, + { + "epoch": 2.97, + "learning_rate": 3.0984348931437196e-06, + "loss": 0.727, + "step": 12560 + }, + { + "epoch": 2.97, + "learning_rate": 2.6217526018908393e-06, + "loss": 0.7177, + "step": 12580 + }, + { + "epoch": 2.98, + "learning_rate": 2.1450703106379595e-06, + "loss": 0.72, + "step": 12600 + }, + { + "epoch": 2.98, + "eval_loss": 0.7255715727806091, + "eval_runtime": 19.5863, + "eval_samples_per_second": 102.112, + "eval_steps_per_second": 3.217, + "step": 12600 + } + ], + "max_steps": 12687, + "num_train_epochs": 3, + "total_flos": 1.6378260662582772e+19, + "trial_name": null, + "trial_params": null +} diff --git a/adapters/saved-belle-7b/checkpoint-12600/training_args.bin b/adapters/saved-belle-7b/checkpoint-12600/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..e7508cbd7713243e73fe59f258eaa12f0deefce5 --- /dev/null +++ b/adapters/saved-belle-7b/checkpoint-12600/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9809a7383d594262bcb22ae9222e2580aba6862268d17b0cc4f7bd3fe5579126 +size 3579 diff --git a/adapters/saved-belle1.5m7b/adapter_config.json b/adapters/saved-belle1.5m7b/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..6e56f2ae8f10fadfeec6c730ac6b119025824443 --- /dev/null +++ b/adapters/saved-belle1.5m7b/adapter_config.json @@ -0,0 +1,18 @@ +{ + "base_model_name_or_path": "/mnt/bn/qingyi-bn-lq/llama/llama-7b-hf", + "bias": "none", + "enable_lora": null, + "fan_in_fan_out": false, + "inference_mode": true, + "lora_alpha": 16, + "lora_dropout": 0.05, + "merge_weights": false, + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/adapters/saved-belle1.5m7b/adapter_model.bin b/adapters/saved-belle1.5m7b/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..7125559070160d62bea36ea075d2c18d35f03062 --- /dev/null +++ b/adapters/saved-belle1.5m7b/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cfdee8241ea4195daa206148eab2b6b2db6a68c9365dd85d715e957f6560bea8 +size 16822989 diff --git a/adapters/saved-belle1.5m7b/checkpoint-11600/optimizer.pt b/adapters/saved-belle1.5m7b/checkpoint-11600/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..cef769d148e6ed2dc58599b0ae289af1595170f7 --- /dev/null +++ b/adapters/saved-belle1.5m7b/checkpoint-11600/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:13daca4b68b59900b0760c5d2831559adbaea8ab8fd5dab87f1070ef95bb7130 +size 33629893 diff --git a/adapters/saved-belle1.5m7b/checkpoint-11600/pytorch_model.bin b/adapters/saved-belle1.5m7b/checkpoint-11600/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..eee4012e67968f398435b1bdb7d807f0cf295333 --- /dev/null +++ b/adapters/saved-belle1.5m7b/checkpoint-11600/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:468d4558755ecba97eadcf4eefb4d330b2079b80f28800d38503e6a7e9611672 +size 16822989 diff --git a/adapters/saved-belle1.5m7b/checkpoint-11600/rng_state_0.pth b/adapters/saved-belle1.5m7b/checkpoint-11600/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..3dae6cea366b554c559a96451079be5e3c7b0ffc --- /dev/null +++ b/adapters/saved-belle1.5m7b/checkpoint-11600/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:562a09f170190d4d6d4bfa4027461b66e687acc30bf9340db9878db0a1ec73b7 +size 14583 diff --git a/adapters/saved-belle1.5m7b/checkpoint-11600/rng_state_1.pth b/adapters/saved-belle1.5m7b/checkpoint-11600/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..69b7e95832d5e0c037aeff5d770bddc0c3563189 --- /dev/null +++ b/adapters/saved-belle1.5m7b/checkpoint-11600/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:330cf4ba3692f3328d87f505d4d2ec9c21f28c19e9c0da369d92304b82573aea +size 14583 diff --git a/adapters/saved-belle1.5m7b/checkpoint-11600/rng_state_10.pth b/adapters/saved-belle1.5m7b/checkpoint-11600/rng_state_10.pth new file mode 100644 index 0000000000000000000000000000000000000000..c7a33a987dc434d23ff71ee7fa481efb5635abd7 --- /dev/null +++ b/adapters/saved-belle1.5m7b/checkpoint-11600/rng_state_10.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a92fc52040e6785f462c2c24d46a906fae2007875aa0c28de36b64b6d9c1adba +size 14587 diff --git a/adapters/saved-belle1.5m7b/checkpoint-11600/rng_state_11.pth b/adapters/saved-belle1.5m7b/checkpoint-11600/rng_state_11.pth new file mode 100644 index 0000000000000000000000000000000000000000..de9b0ffe0287b5a9564259aaf38ac15eda61c967 --- /dev/null +++ b/adapters/saved-belle1.5m7b/checkpoint-11600/rng_state_11.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1ab8cdc75558afa3f7ed6a8e8d4363139e786b79fbfa22543c6b9a510944eca3 +size 14587 diff --git a/adapters/saved-belle1.5m7b/checkpoint-11600/rng_state_12.pth b/adapters/saved-belle1.5m7b/checkpoint-11600/rng_state_12.pth new file mode 100644 index 0000000000000000000000000000000000000000..64d0f4df88ee87aca99a5764cebc7d2ccde802a3 --- /dev/null +++ b/adapters/saved-belle1.5m7b/checkpoint-11600/rng_state_12.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:14527ff6066340c8c72cf7739874cbbcccd89df49d4ed95430cf90f8be949502 +size 14587 diff --git a/adapters/saved-belle1.5m7b/checkpoint-11600/rng_state_13.pth b/adapters/saved-belle1.5m7b/checkpoint-11600/rng_state_13.pth new file mode 100644 index 0000000000000000000000000000000000000000..f35261793d1fa1de424c237307cdebd78ab6b697 --- /dev/null +++ b/adapters/saved-belle1.5m7b/checkpoint-11600/rng_state_13.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:da1f291d3cae284e371846db29623e36f6ea5778b32d860bc40c5b12889ae771 +size 14587 diff --git a/adapters/saved-belle1.5m7b/checkpoint-11600/rng_state_14.pth b/adapters/saved-belle1.5m7b/checkpoint-11600/rng_state_14.pth new file mode 100644 index 0000000000000000000000000000000000000000..49a75d67696019680ad84edf35fe51ae9c0ba92c --- /dev/null +++ b/adapters/saved-belle1.5m7b/checkpoint-11600/rng_state_14.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b3c771906d29c075c8af0322d439a2cd5e06da4ebf3d05ba83b031987f0d3ffd +size 14587 diff --git a/adapters/saved-belle1.5m7b/checkpoint-11600/rng_state_15.pth b/adapters/saved-belle1.5m7b/checkpoint-11600/rng_state_15.pth new file mode 100644 index 0000000000000000000000000000000000000000..afe06ac209e10428193eba7d5e75f6201664db23 --- /dev/null +++ b/adapters/saved-belle1.5m7b/checkpoint-11600/rng_state_15.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:032c1a43cbf4798bd283ed830d37d3357c66dbb91350c52eb8cba238b33f1f97 +size 14587 diff --git a/adapters/saved-belle1.5m7b/checkpoint-11600/rng_state_2.pth b/adapters/saved-belle1.5m7b/checkpoint-11600/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..65cabbed9dde476b33f94b81ae30c6265a29e4e0 --- /dev/null +++ b/adapters/saved-belle1.5m7b/checkpoint-11600/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f77cf77cbe6b5200fa05c52ddc292842703c0495b3fc5431a2591408c03b9f93 +size 14583 diff --git a/adapters/saved-belle1.5m7b/checkpoint-11600/rng_state_3.pth b/adapters/saved-belle1.5m7b/checkpoint-11600/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..1a150e5a55c7779bf60173171ddfd58f1c8dd237 --- /dev/null +++ b/adapters/saved-belle1.5m7b/checkpoint-11600/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:275705712b787f8a29149cb6b102cfdc144da6930227855c14f6dedd39f2fa33 +size 14583 diff --git a/adapters/saved-belle1.5m7b/checkpoint-11600/rng_state_4.pth b/adapters/saved-belle1.5m7b/checkpoint-11600/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..04d27d57717cb887674a7a63917398f389210b6d --- /dev/null +++ b/adapters/saved-belle1.5m7b/checkpoint-11600/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f82da09612c4778cfd00c081ead066981298ea92551e4733c940967b64b53efb +size 14583 diff --git a/adapters/saved-belle1.5m7b/checkpoint-11600/rng_state_5.pth b/adapters/saved-belle1.5m7b/checkpoint-11600/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..72bda1b38ad958c359508eebbe39aaefb1d8aff2 --- /dev/null +++ b/adapters/saved-belle1.5m7b/checkpoint-11600/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:93a171833e46fec0dde30adeeae83441d50972ce2f9ba718d606f4c1e52bca3c +size 14583 diff --git a/adapters/saved-belle1.5m7b/checkpoint-11600/rng_state_6.pth b/adapters/saved-belle1.5m7b/checkpoint-11600/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..99b63b8747e871f941184bc044e8d232762e3597 --- /dev/null +++ b/adapters/saved-belle1.5m7b/checkpoint-11600/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a04a9cbce340f56c1d8e990ca7cbf1736a4eb7eeb57d1d3fa0aefdd230cf29a0 +size 14583 diff --git a/adapters/saved-belle1.5m7b/checkpoint-11600/rng_state_7.pth b/adapters/saved-belle1.5m7b/checkpoint-11600/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..d12c874d9637746086b691f4c5eb2f6272c58e02 --- /dev/null +++ b/adapters/saved-belle1.5m7b/checkpoint-11600/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c43896a13f1d5f2b708f5f7ba51142fd2bcd7451d2176bd9b0ed889e2da5defa +size 14583 diff --git a/adapters/saved-belle1.5m7b/checkpoint-11600/rng_state_8.pth b/adapters/saved-belle1.5m7b/checkpoint-11600/rng_state_8.pth new file mode 100644 index 0000000000000000000000000000000000000000..66c065b293e51ac0f34990bf11f1011be5afe9b1 --- /dev/null +++ b/adapters/saved-belle1.5m7b/checkpoint-11600/rng_state_8.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:90dbaf19e822e018c3980e34edc17e7475e70f9a49b8ba6cc66e5a08f37dc449 +size 14583 diff --git a/adapters/saved-belle1.5m7b/checkpoint-11600/rng_state_9.pth b/adapters/saved-belle1.5m7b/checkpoint-11600/rng_state_9.pth new file mode 100644 index 0000000000000000000000000000000000000000..3842e8ab449e1d04f831a2c0af961b21bb54550f --- /dev/null +++ b/adapters/saved-belle1.5m7b/checkpoint-11600/rng_state_9.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:167995fdd2a92ef89db6ac4aeaf2f2b19d87b25464ca9f76b8a2a1635608ede5 +size 14583 diff --git a/adapters/saved-belle1.5m7b/checkpoint-11600/scaler.pt b/adapters/saved-belle1.5m7b/checkpoint-11600/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..acb35b3eef7bfa42568fa8bc5541bda1462816d3 --- /dev/null +++ b/adapters/saved-belle1.5m7b/checkpoint-11600/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:84fe9b56c0ce1fcb77005043ef7d136ac4611dbca39a2f003836392f71e07df8 +size 557 diff --git a/adapters/saved-belle1.5m7b/checkpoint-11600/scheduler.pt b/adapters/saved-belle1.5m7b/checkpoint-11600/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..dab0ffe7f049e214b200fa8ccd249387dfdaad5e --- /dev/null +++ b/adapters/saved-belle1.5m7b/checkpoint-11600/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:676fddd3ba5032ed7d214830f1218e10dc08f58b1c4cc29305972e3eb57bc014 +size 627 diff --git a/adapters/saved-belle1.5m7b/checkpoint-11600/trainer_state.json b/adapters/saved-belle1.5m7b/checkpoint-11600/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..16c80bdb8ed1e7e265755372fca9c29d2c365b74 --- /dev/null +++ b/adapters/saved-belle1.5m7b/checkpoint-11600/trainer_state.json @@ -0,0 +1,3960 @@ +{ + "best_metric": 0.7637657523155212, + "best_model_checkpoint": "/mnt/bn/qingyi-bn-lq/llama/saved-belle1.5m7b/checkpoint-11600", + "epoch": 0.9632951336987211, + "global_step": 11600, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 5.9999999999999995e-05, + "loss": 1.7631, + "step": 20 + }, + { + "epoch": 0.0, + "learning_rate": 0.00011999999999999999, + "loss": 1.4784, + "step": 40 + }, + { + "epoch": 0.0, + "learning_rate": 0.00017999999999999998, + "loss": 1.1332, + "step": 60 + }, + { + "epoch": 0.01, + "learning_rate": 0.00023999999999999998, + "loss": 1.0665, + "step": 80 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003, + "loss": 1.0418, + "step": 100 + }, + { + "epoch": 0.01, + "learning_rate": 0.00029949757159604753, + "loss": 1.0246, + "step": 120 + }, + { + "epoch": 0.01, + "learning_rate": 0.0002989951431920951, + "loss": 1.0054, + "step": 140 + }, + { + "epoch": 0.01, + "learning_rate": 0.00029849271478814266, + "loss": 1.002, + "step": 160 + }, + { + "epoch": 0.01, + "learning_rate": 0.0002979902863841902, + "loss": 0.9908, + "step": 180 + }, + { + "epoch": 0.02, + "learning_rate": 0.0002974878579802378, + "loss": 0.9784, + "step": 200 + }, + { + "epoch": 0.02, + "eval_loss": 0.9920349717140198, + "eval_runtime": 8.2147, + "eval_samples_per_second": 243.465, + "eval_steps_per_second": 1.948, + "step": 200 + }, + { + "epoch": 0.02, + "learning_rate": 0.00029698542957628534, + "loss": 0.9817, + "step": 220 + }, + { + "epoch": 0.02, + "learning_rate": 0.0002964830011723329, + "loss": 0.983, + "step": 240 + }, + { + "epoch": 0.02, + "learning_rate": 0.00029598057276838046, + "loss": 0.9637, + "step": 260 + }, + { + "epoch": 0.02, + "learning_rate": 0.000295478144364428, + "loss": 0.9534, + "step": 280 + }, + { + "epoch": 0.02, + "learning_rate": 0.0002949757159604756, + "loss": 0.9475, + "step": 300 + }, + { + "epoch": 0.03, + "learning_rate": 0.00029447328755652315, + "loss": 0.9502, + "step": 320 + }, + { + "epoch": 0.03, + "learning_rate": 0.0002939708591525707, + "loss": 0.946, + "step": 340 + }, + { + "epoch": 0.03, + "learning_rate": 0.00029346843074861827, + "loss": 0.9545, + "step": 360 + }, + { + "epoch": 0.03, + "learning_rate": 0.00029296600234466583, + "loss": 0.946, + "step": 380 + }, + { + "epoch": 0.03, + "learning_rate": 0.0002924635739407134, + "loss": 0.9295, + "step": 400 + }, + { + "epoch": 0.03, + "eval_loss": 0.9477736949920654, + "eval_runtime": 8.229, + "eval_samples_per_second": 243.042, + "eval_steps_per_second": 1.944, + "step": 400 + }, + { + "epoch": 0.03, + "learning_rate": 0.00029196114553676095, + "loss": 0.9232, + "step": 420 + }, + { + "epoch": 0.04, + "learning_rate": 0.0002914587171328085, + "loss": 0.9201, + "step": 440 + }, + { + "epoch": 0.04, + "learning_rate": 0.00029095628872885613, + "loss": 0.9212, + "step": 460 + }, + { + "epoch": 0.04, + "learning_rate": 0.0002904538603249037, + "loss": 0.9131, + "step": 480 + }, + { + "epoch": 0.04, + "learning_rate": 0.00028995143192095125, + "loss": 0.922, + "step": 500 + }, + { + "epoch": 0.04, + "learning_rate": 0.0002894490035169988, + "loss": 0.9176, + "step": 520 + }, + { + "epoch": 0.04, + "learning_rate": 0.0002889465751130464, + "loss": 0.9177, + "step": 540 + }, + { + "epoch": 0.05, + "learning_rate": 0.00028844414670909393, + "loss": 0.9132, + "step": 560 + }, + { + "epoch": 0.05, + "learning_rate": 0.0002879417183051415, + "loss": 0.921, + "step": 580 + }, + { + "epoch": 0.05, + "learning_rate": 0.00028743928990118906, + "loss": 0.9013, + "step": 600 + }, + { + "epoch": 0.05, + "eval_loss": 0.9194319248199463, + "eval_runtime": 8.23, + "eval_samples_per_second": 243.013, + "eval_steps_per_second": 1.944, + "step": 600 + }, + { + "epoch": 0.05, + "learning_rate": 0.0002869368614972366, + "loss": 0.904, + "step": 620 + }, + { + "epoch": 0.05, + "learning_rate": 0.0002864344330932842, + "loss": 0.9016, + "step": 640 + }, + { + "epoch": 0.05, + "learning_rate": 0.00028593200468933174, + "loss": 0.8931, + "step": 660 + }, + { + "epoch": 0.06, + "learning_rate": 0.0002854295762853793, + "loss": 0.9, + "step": 680 + }, + { + "epoch": 0.06, + "learning_rate": 0.00028492714788142686, + "loss": 0.896, + "step": 700 + }, + { + "epoch": 0.06, + "learning_rate": 0.0002844247194774744, + "loss": 0.8982, + "step": 720 + }, + { + "epoch": 0.06, + "learning_rate": 0.00028392229107352204, + "loss": 0.9, + "step": 740 + }, + { + "epoch": 0.06, + "learning_rate": 0.0002834198626695696, + "loss": 0.8861, + "step": 760 + }, + { + "epoch": 0.06, + "learning_rate": 0.00028291743426561716, + "loss": 0.8926, + "step": 780 + }, + { + "epoch": 0.07, + "learning_rate": 0.0002824150058616647, + "loss": 0.8853, + "step": 800 + }, + { + "epoch": 0.07, + "eval_loss": 0.8987648487091064, + "eval_runtime": 8.2248, + "eval_samples_per_second": 243.167, + "eval_steps_per_second": 1.945, + "step": 800 + }, + { + "epoch": 0.07, + "learning_rate": 0.0002819125774577123, + "loss": 0.8922, + "step": 820 + }, + { + "epoch": 0.07, + "learning_rate": 0.00028141014905375984, + "loss": 0.8813, + "step": 840 + }, + { + "epoch": 0.07, + "learning_rate": 0.0002809077206498074, + "loss": 0.8825, + "step": 860 + }, + { + "epoch": 0.07, + "learning_rate": 0.00028040529224585497, + "loss": 0.8821, + "step": 880 + }, + { + "epoch": 0.07, + "learning_rate": 0.00027990286384190253, + "loss": 0.8853, + "step": 900 + }, + { + "epoch": 0.08, + "learning_rate": 0.0002794004354379501, + "loss": 0.8806, + "step": 920 + }, + { + "epoch": 0.08, + "learning_rate": 0.00027889800703399765, + "loss": 0.8774, + "step": 940 + }, + { + "epoch": 0.08, + "learning_rate": 0.0002783955786300452, + "loss": 0.8783, + "step": 960 + }, + { + "epoch": 0.08, + "learning_rate": 0.00027789315022609277, + "loss": 0.876, + "step": 980 + }, + { + "epoch": 0.08, + "learning_rate": 0.00027739072182214033, + "loss": 0.8656, + "step": 1000 + }, + { + "epoch": 0.08, + "eval_loss": 0.8842049241065979, + "eval_runtime": 8.2372, + "eval_samples_per_second": 242.8, + "eval_steps_per_second": 1.942, + "step": 1000 + }, + { + "epoch": 0.08, + "learning_rate": 0.0002768882934181879, + "loss": 0.8776, + "step": 1020 + }, + { + "epoch": 0.09, + "learning_rate": 0.00027638586501423546, + "loss": 0.8751, + "step": 1040 + }, + { + "epoch": 0.09, + "learning_rate": 0.000275883436610283, + "loss": 0.8701, + "step": 1060 + }, + { + "epoch": 0.09, + "learning_rate": 0.0002753810082063306, + "loss": 0.8618, + "step": 1080 + }, + { + "epoch": 0.09, + "learning_rate": 0.00027487857980237814, + "loss": 0.8604, + "step": 1100 + }, + { + "epoch": 0.09, + "learning_rate": 0.0002743761513984257, + "loss": 0.8516, + "step": 1120 + }, + { + "epoch": 0.09, + "learning_rate": 0.00027387372299447326, + "loss": 0.8553, + "step": 1140 + }, + { + "epoch": 0.1, + "learning_rate": 0.0002733712945905208, + "loss": 0.8638, + "step": 1160 + }, + { + "epoch": 0.1, + "learning_rate": 0.0002728688661865684, + "loss": 0.8528, + "step": 1180 + }, + { + "epoch": 0.1, + "learning_rate": 0.00027236643778261595, + "loss": 0.8641, + "step": 1200 + }, + { + "epoch": 0.1, + "eval_loss": 0.871113121509552, + "eval_runtime": 8.2308, + "eval_samples_per_second": 242.989, + "eval_steps_per_second": 1.944, + "step": 1200 + }, + { + "epoch": 0.1, + "learning_rate": 0.0002718640093786635, + "loss": 0.8656, + "step": 1220 + }, + { + "epoch": 0.1, + "learning_rate": 0.00027136158097471107, + "loss": 0.8534, + "step": 1240 + }, + { + "epoch": 0.1, + "learning_rate": 0.00027085915257075863, + "loss": 0.8512, + "step": 1260 + }, + { + "epoch": 0.11, + "learning_rate": 0.0002703567241668062, + "loss": 0.8472, + "step": 1280 + }, + { + "epoch": 0.11, + "learning_rate": 0.00026985429576285375, + "loss": 0.8584, + "step": 1300 + }, + { + "epoch": 0.11, + "learning_rate": 0.0002693518673589013, + "loss": 0.8557, + "step": 1320 + }, + { + "epoch": 0.11, + "learning_rate": 0.0002688494389549489, + "loss": 0.8547, + "step": 1340 + }, + { + "epoch": 0.11, + "learning_rate": 0.00026834701055099643, + "loss": 0.8576, + "step": 1360 + }, + { + "epoch": 0.11, + "learning_rate": 0.000267844582147044, + "loss": 0.8499, + "step": 1380 + }, + { + "epoch": 0.12, + "learning_rate": 0.00026734215374309156, + "loss": 0.839, + "step": 1400 + }, + { + "epoch": 0.12, + "eval_loss": 0.8613501191139221, + "eval_runtime": 8.2712, + "eval_samples_per_second": 241.802, + "eval_steps_per_second": 1.934, + "step": 1400 + }, + { + "epoch": 0.12, + "learning_rate": 0.0002668397253391391, + "loss": 0.8538, + "step": 1420 + }, + { + "epoch": 0.12, + "learning_rate": 0.0002663372969351867, + "loss": 0.8541, + "step": 1440 + }, + { + "epoch": 0.12, + "learning_rate": 0.0002658348685312343, + "loss": 0.8381, + "step": 1460 + }, + { + "epoch": 0.12, + "learning_rate": 0.00026533244012728186, + "loss": 0.8527, + "step": 1480 + }, + { + "epoch": 0.12, + "learning_rate": 0.0002648300117233294, + "loss": 0.8493, + "step": 1500 + }, + { + "epoch": 0.13, + "learning_rate": 0.000264327583319377, + "loss": 0.846, + "step": 1520 + }, + { + "epoch": 0.13, + "learning_rate": 0.00026382515491542454, + "loss": 0.8435, + "step": 1540 + }, + { + "epoch": 0.13, + "learning_rate": 0.0002633227265114721, + "loss": 0.8517, + "step": 1560 + }, + { + "epoch": 0.13, + "learning_rate": 0.00026282029810751966, + "loss": 0.8403, + "step": 1580 + }, + { + "epoch": 0.13, + "learning_rate": 0.0002623178697035672, + "loss": 0.8419, + "step": 1600 + }, + { + "epoch": 0.13, + "eval_loss": 0.8518173098564148, + "eval_runtime": 8.2505, + "eval_samples_per_second": 242.411, + "eval_steps_per_second": 1.939, + "step": 1600 + }, + { + "epoch": 0.13, + "learning_rate": 0.0002618154412996148, + "loss": 0.8393, + "step": 1620 + }, + { + "epoch": 0.14, + "learning_rate": 0.00026131301289566234, + "loss": 0.8471, + "step": 1640 + }, + { + "epoch": 0.14, + "learning_rate": 0.0002608105844917099, + "loss": 0.85, + "step": 1660 + }, + { + "epoch": 0.14, + "learning_rate": 0.00026030815608775747, + "loss": 0.8388, + "step": 1680 + }, + { + "epoch": 0.14, + "learning_rate": 0.00025980572768380503, + "loss": 0.8394, + "step": 1700 + }, + { + "epoch": 0.14, + "learning_rate": 0.0002593032992798526, + "loss": 0.8333, + "step": 1720 + }, + { + "epoch": 0.14, + "learning_rate": 0.00025880087087590015, + "loss": 0.8343, + "step": 1740 + }, + { + "epoch": 0.15, + "learning_rate": 0.0002582984424719477, + "loss": 0.8303, + "step": 1760 + }, + { + "epoch": 0.15, + "learning_rate": 0.00025779601406799533, + "loss": 0.8247, + "step": 1780 + }, + { + "epoch": 0.15, + "learning_rate": 0.0002572935856640429, + "loss": 0.8282, + "step": 1800 + }, + { + "epoch": 0.15, + "eval_loss": 0.8434953689575195, + "eval_runtime": 8.2633, + "eval_samples_per_second": 242.034, + "eval_steps_per_second": 1.936, + "step": 1800 + }, + { + "epoch": 0.15, + "learning_rate": 0.00025679115726009045, + "loss": 0.8348, + "step": 1820 + }, + { + "epoch": 0.15, + "learning_rate": 0.000256288728856138, + "loss": 0.8393, + "step": 1840 + }, + { + "epoch": 0.15, + "learning_rate": 0.00025578630045218557, + "loss": 0.8304, + "step": 1860 + }, + { + "epoch": 0.16, + "learning_rate": 0.00025528387204823313, + "loss": 0.8309, + "step": 1880 + }, + { + "epoch": 0.16, + "learning_rate": 0.0002547814436442807, + "loss": 0.8382, + "step": 1900 + }, + { + "epoch": 0.16, + "learning_rate": 0.00025427901524032826, + "loss": 0.841, + "step": 1920 + }, + { + "epoch": 0.16, + "learning_rate": 0.0002537765868363758, + "loss": 0.8354, + "step": 1940 + }, + { + "epoch": 0.16, + "learning_rate": 0.0002532741584324234, + "loss": 0.8334, + "step": 1960 + }, + { + "epoch": 0.16, + "learning_rate": 0.00025277173002847094, + "loss": 0.8243, + "step": 1980 + }, + { + "epoch": 0.17, + "learning_rate": 0.0002522693016245185, + "loss": 0.8337, + "step": 2000 + }, + { + "epoch": 0.17, + "eval_loss": 0.8386329412460327, + "eval_runtime": 8.2418, + "eval_samples_per_second": 242.666, + "eval_steps_per_second": 1.941, + "step": 2000 + }, + { + "epoch": 0.17, + "learning_rate": 0.00025176687322056606, + "loss": 0.8237, + "step": 2020 + }, + { + "epoch": 0.17, + "learning_rate": 0.0002512644448166136, + "loss": 0.8206, + "step": 2040 + }, + { + "epoch": 0.17, + "learning_rate": 0.0002507620164126612, + "loss": 0.8277, + "step": 2060 + }, + { + "epoch": 0.17, + "learning_rate": 0.00025025958800870874, + "loss": 0.8271, + "step": 2080 + }, + { + "epoch": 0.17, + "learning_rate": 0.0002497571596047563, + "loss": 0.8353, + "step": 2100 + }, + { + "epoch": 0.18, + "learning_rate": 0.00024925473120080387, + "loss": 0.8253, + "step": 2120 + }, + { + "epoch": 0.18, + "learning_rate": 0.00024875230279685143, + "loss": 0.8212, + "step": 2140 + }, + { + "epoch": 0.18, + "learning_rate": 0.000248249874392899, + "loss": 0.8192, + "step": 2160 + }, + { + "epoch": 0.18, + "learning_rate": 0.00024774744598894655, + "loss": 0.8292, + "step": 2180 + }, + { + "epoch": 0.18, + "learning_rate": 0.0002472450175849941, + "loss": 0.825, + "step": 2200 + }, + { + "epoch": 0.18, + "eval_loss": 0.8326684236526489, + "eval_runtime": 8.2602, + "eval_samples_per_second": 242.126, + "eval_steps_per_second": 1.937, + "step": 2200 + }, + { + "epoch": 0.18, + "learning_rate": 0.00024674258918104167, + "loss": 0.8308, + "step": 2220 + }, + { + "epoch": 0.19, + "learning_rate": 0.00024624016077708923, + "loss": 0.8324, + "step": 2240 + }, + { + "epoch": 0.19, + "learning_rate": 0.0002457377323731368, + "loss": 0.8257, + "step": 2260 + }, + { + "epoch": 0.19, + "learning_rate": 0.00024523530396918436, + "loss": 0.8181, + "step": 2280 + }, + { + "epoch": 0.19, + "learning_rate": 0.0002447328755652319, + "loss": 0.8126, + "step": 2300 + }, + { + "epoch": 0.19, + "learning_rate": 0.0002442304471612795, + "loss": 0.819, + "step": 2320 + }, + { + "epoch": 0.19, + "learning_rate": 0.00024372801875732707, + "loss": 0.8198, + "step": 2340 + }, + { + "epoch": 0.2, + "learning_rate": 0.00024322559035337463, + "loss": 0.8188, + "step": 2360 + }, + { + "epoch": 0.2, + "learning_rate": 0.0002427231619494222, + "loss": 0.8145, + "step": 2380 + }, + { + "epoch": 0.2, + "learning_rate": 0.00024222073354546975, + "loss": 0.81, + "step": 2400 + }, + { + "epoch": 0.2, + "eval_loss": 0.8278167843818665, + "eval_runtime": 8.2577, + "eval_samples_per_second": 242.198, + "eval_steps_per_second": 1.938, + "step": 2400 + }, + { + "epoch": 0.2, + "learning_rate": 0.0002417183051415173, + "loss": 0.8142, + "step": 2420 + }, + { + "epoch": 0.2, + "learning_rate": 0.00024121587673756487, + "loss": 0.8059, + "step": 2440 + }, + { + "epoch": 0.2, + "learning_rate": 0.00024071344833361243, + "loss": 0.826, + "step": 2460 + }, + { + "epoch": 0.21, + "learning_rate": 0.00024021101992966, + "loss": 0.8173, + "step": 2480 + }, + { + "epoch": 0.21, + "learning_rate": 0.00023970859152570756, + "loss": 0.8063, + "step": 2500 + }, + { + "epoch": 0.21, + "learning_rate": 0.00023920616312175512, + "loss": 0.812, + "step": 2520 + }, + { + "epoch": 0.21, + "learning_rate": 0.00023870373471780268, + "loss": 0.82, + "step": 2540 + }, + { + "epoch": 0.21, + "learning_rate": 0.00023820130631385024, + "loss": 0.8207, + "step": 2560 + }, + { + "epoch": 0.21, + "learning_rate": 0.0002376988779098978, + "loss": 0.8113, + "step": 2580 + }, + { + "epoch": 0.22, + "learning_rate": 0.00023719644950594536, + "loss": 0.8175, + "step": 2600 + }, + { + "epoch": 0.22, + "eval_loss": 0.8231886029243469, + "eval_runtime": 8.2627, + "eval_samples_per_second": 242.052, + "eval_steps_per_second": 1.936, + "step": 2600 + }, + { + "epoch": 0.22, + "learning_rate": 0.00023669402110199295, + "loss": 0.8105, + "step": 2620 + }, + { + "epoch": 0.22, + "learning_rate": 0.0002361915926980405, + "loss": 0.8106, + "step": 2640 + }, + { + "epoch": 0.22, + "learning_rate": 0.00023568916429408807, + "loss": 0.8114, + "step": 2660 + }, + { + "epoch": 0.22, + "learning_rate": 0.00023518673589013563, + "loss": 0.8152, + "step": 2680 + }, + { + "epoch": 0.22, + "learning_rate": 0.0002346843074861832, + "loss": 0.8059, + "step": 2700 + }, + { + "epoch": 0.23, + "learning_rate": 0.00023418187908223076, + "loss": 0.8065, + "step": 2720 + }, + { + "epoch": 0.23, + "learning_rate": 0.00023367945067827832, + "loss": 0.8147, + "step": 2740 + }, + { + "epoch": 0.23, + "learning_rate": 0.00023317702227432588, + "loss": 0.8006, + "step": 2760 + }, + { + "epoch": 0.23, + "learning_rate": 0.00023267459387037344, + "loss": 0.805, + "step": 2780 + }, + { + "epoch": 0.23, + "learning_rate": 0.000232172165466421, + "loss": 0.8142, + "step": 2800 + }, + { + "epoch": 0.23, + "eval_loss": 0.8191845417022705, + "eval_runtime": 8.28, + "eval_samples_per_second": 241.547, + "eval_steps_per_second": 1.932, + "step": 2800 + }, + { + "epoch": 0.23, + "learning_rate": 0.00023166973706246856, + "loss": 0.8134, + "step": 2820 + }, + { + "epoch": 0.24, + "learning_rate": 0.00023116730865851612, + "loss": 0.8066, + "step": 2840 + }, + { + "epoch": 0.24, + "learning_rate": 0.00023066488025456368, + "loss": 0.8049, + "step": 2860 + }, + { + "epoch": 0.24, + "learning_rate": 0.00023016245185061124, + "loss": 0.7987, + "step": 2880 + }, + { + "epoch": 0.24, + "learning_rate": 0.00022966002344665886, + "loss": 0.8113, + "step": 2900 + }, + { + "epoch": 0.24, + "learning_rate": 0.00022915759504270642, + "loss": 0.8086, + "step": 2920 + }, + { + "epoch": 0.24, + "learning_rate": 0.00022865516663875398, + "loss": 0.8019, + "step": 2940 + }, + { + "epoch": 0.25, + "learning_rate": 0.00022815273823480154, + "loss": 0.8051, + "step": 2960 + }, + { + "epoch": 0.25, + "learning_rate": 0.0002276503098308491, + "loss": 0.8032, + "step": 2980 + }, + { + "epoch": 0.25, + "learning_rate": 0.00022714788142689667, + "loss": 0.8058, + "step": 3000 + }, + { + "epoch": 0.25, + "eval_loss": 0.8153129816055298, + "eval_runtime": 8.2727, + "eval_samples_per_second": 241.759, + "eval_steps_per_second": 1.934, + "step": 3000 + }, + { + "epoch": 0.25, + "learning_rate": 0.00022664545302294423, + "loss": 0.801, + "step": 3020 + }, + { + "epoch": 0.25, + "learning_rate": 0.0002261430246189918, + "loss": 0.801, + "step": 3040 + }, + { + "epoch": 0.25, + "learning_rate": 0.00022564059621503935, + "loss": 0.8037, + "step": 3060 + }, + { + "epoch": 0.26, + "learning_rate": 0.0002251381678110869, + "loss": 0.7992, + "step": 3080 + }, + { + "epoch": 0.26, + "learning_rate": 0.00022463573940713447, + "loss": 0.8036, + "step": 3100 + }, + { + "epoch": 0.26, + "learning_rate": 0.00022413331100318203, + "loss": 0.8064, + "step": 3120 + }, + { + "epoch": 0.26, + "learning_rate": 0.0002236308825992296, + "loss": 0.7966, + "step": 3140 + }, + { + "epoch": 0.26, + "learning_rate": 0.00022312845419527716, + "loss": 0.8047, + "step": 3160 + }, + { + "epoch": 0.26, + "learning_rate": 0.00022262602579132472, + "loss": 0.7986, + "step": 3180 + }, + { + "epoch": 0.27, + "learning_rate": 0.00022212359738737228, + "loss": 0.8026, + "step": 3200 + }, + { + "epoch": 0.27, + "eval_loss": 0.8119255304336548, + "eval_runtime": 8.288, + "eval_samples_per_second": 241.313, + "eval_steps_per_second": 1.931, + "step": 3200 + }, + { + "epoch": 0.27, + "learning_rate": 0.00022162116898341984, + "loss": 0.8028, + "step": 3220 + }, + { + "epoch": 0.27, + "learning_rate": 0.0002211187405794674, + "loss": 0.7987, + "step": 3240 + }, + { + "epoch": 0.27, + "learning_rate": 0.000220616312175515, + "loss": 0.8148, + "step": 3260 + }, + { + "epoch": 0.27, + "learning_rate": 0.00022011388377156255, + "loss": 0.7932, + "step": 3280 + }, + { + "epoch": 0.27, + "learning_rate": 0.0002196114553676101, + "loss": 0.8027, + "step": 3300 + }, + { + "epoch": 0.28, + "learning_rate": 0.00021910902696365767, + "loss": 0.7989, + "step": 3320 + }, + { + "epoch": 0.28, + "learning_rate": 0.00021860659855970523, + "loss": 0.7983, + "step": 3340 + }, + { + "epoch": 0.28, + "learning_rate": 0.0002181041701557528, + "loss": 0.7977, + "step": 3360 + }, + { + "epoch": 0.28, + "learning_rate": 0.00021760174175180035, + "loss": 0.7972, + "step": 3380 + }, + { + "epoch": 0.28, + "learning_rate": 0.00021709931334784792, + "loss": 0.8025, + "step": 3400 + }, + { + "epoch": 0.28, + "eval_loss": 0.8084473609924316, + "eval_runtime": 8.2772, + "eval_samples_per_second": 241.627, + "eval_steps_per_second": 1.933, + "step": 3400 + }, + { + "epoch": 0.28, + "learning_rate": 0.00021659688494389548, + "loss": 0.7993, + "step": 3420 + }, + { + "epoch": 0.29, + "learning_rate": 0.00021609445653994304, + "loss": 0.7919, + "step": 3440 + }, + { + "epoch": 0.29, + "learning_rate": 0.0002155920281359906, + "loss": 0.8062, + "step": 3460 + }, + { + "epoch": 0.29, + "learning_rate": 0.00021508959973203816, + "loss": 0.7937, + "step": 3480 + }, + { + "epoch": 0.29, + "learning_rate": 0.00021458717132808572, + "loss": 0.7947, + "step": 3500 + }, + { + "epoch": 0.29, + "learning_rate": 0.00021408474292413328, + "loss": 0.7973, + "step": 3520 + }, + { + "epoch": 0.29, + "learning_rate": 0.00021358231452018084, + "loss": 0.7964, + "step": 3540 + }, + { + "epoch": 0.3, + "learning_rate": 0.0002130798861162284, + "loss": 0.7994, + "step": 3560 + }, + { + "epoch": 0.3, + "learning_rate": 0.00021257745771227597, + "loss": 0.7911, + "step": 3580 + }, + { + "epoch": 0.3, + "learning_rate": 0.00021207502930832353, + "loss": 0.805, + "step": 3600 + }, + { + "epoch": 0.3, + "eval_loss": 0.8064665794372559, + "eval_runtime": 8.2739, + "eval_samples_per_second": 241.725, + "eval_steps_per_second": 1.934, + "step": 3600 + }, + { + "epoch": 0.3, + "learning_rate": 0.00021157260090437112, + "loss": 0.8014, + "step": 3620 + }, + { + "epoch": 0.3, + "learning_rate": 0.00021107017250041868, + "loss": 0.7941, + "step": 3640 + }, + { + "epoch": 0.3, + "learning_rate": 0.00021056774409646624, + "loss": 0.793, + "step": 3660 + }, + { + "epoch": 0.31, + "learning_rate": 0.0002100653156925138, + "loss": 0.7918, + "step": 3680 + }, + { + "epoch": 0.31, + "learning_rate": 0.00020956288728856136, + "loss": 0.7926, + "step": 3700 + }, + { + "epoch": 0.31, + "learning_rate": 0.00020906045888460892, + "loss": 0.797, + "step": 3720 + }, + { + "epoch": 0.31, + "learning_rate": 0.00020855803048065648, + "loss": 0.7965, + "step": 3740 + }, + { + "epoch": 0.31, + "learning_rate": 0.00020805560207670404, + "loss": 0.7963, + "step": 3760 + }, + { + "epoch": 0.31, + "learning_rate": 0.0002075531736727516, + "loss": 0.7924, + "step": 3780 + }, + { + "epoch": 0.32, + "learning_rate": 0.00020705074526879917, + "loss": 0.788, + "step": 3800 + }, + { + "epoch": 0.32, + "eval_loss": 0.8037804961204529, + "eval_runtime": 8.3133, + "eval_samples_per_second": 240.579, + "eval_steps_per_second": 1.925, + "step": 3800 + }, + { + "epoch": 0.32, + "learning_rate": 0.00020654831686484673, + "loss": 0.794, + "step": 3820 + }, + { + "epoch": 0.32, + "learning_rate": 0.0002060458884608943, + "loss": 0.7946, + "step": 3840 + }, + { + "epoch": 0.32, + "learning_rate": 0.00020554346005694185, + "loss": 0.7934, + "step": 3860 + }, + { + "epoch": 0.32, + "learning_rate": 0.0002050410316529894, + "loss": 0.7935, + "step": 3880 + }, + { + "epoch": 0.32, + "learning_rate": 0.00020453860324903697, + "loss": 0.7864, + "step": 3900 + }, + { + "epoch": 0.33, + "learning_rate": 0.00020403617484508453, + "loss": 0.8026, + "step": 3920 + }, + { + "epoch": 0.33, + "learning_rate": 0.0002035337464411321, + "loss": 0.7902, + "step": 3940 + }, + { + "epoch": 0.33, + "learning_rate": 0.00020303131803717966, + "loss": 0.7915, + "step": 3960 + }, + { + "epoch": 0.33, + "learning_rate": 0.00020252888963322724, + "loss": 0.798, + "step": 3980 + }, + { + "epoch": 0.33, + "learning_rate": 0.0002020264612292748, + "loss": 0.7989, + "step": 4000 + }, + { + "epoch": 0.33, + "eval_loss": 0.8007607460021973, + "eval_runtime": 8.2882, + "eval_samples_per_second": 241.308, + "eval_steps_per_second": 1.93, + "step": 4000 + }, + { + "epoch": 0.33, + "learning_rate": 0.0002015240328253224, + "loss": 0.7831, + "step": 4020 + }, + { + "epoch": 0.34, + "learning_rate": 0.00020102160442136995, + "loss": 0.7877, + "step": 4040 + }, + { + "epoch": 0.34, + "learning_rate": 0.00020051917601741752, + "loss": 0.7855, + "step": 4060 + }, + { + "epoch": 0.34, + "learning_rate": 0.00020001674761346508, + "loss": 0.7889, + "step": 4080 + }, + { + "epoch": 0.34, + "learning_rate": 0.00019951431920951264, + "loss": 0.7958, + "step": 4100 + }, + { + "epoch": 0.34, + "learning_rate": 0.0001990118908055602, + "loss": 0.7934, + "step": 4120 + }, + { + "epoch": 0.34, + "learning_rate": 0.00019850946240160776, + "loss": 0.785, + "step": 4140 + }, + { + "epoch": 0.35, + "learning_rate": 0.00019800703399765532, + "loss": 0.7987, + "step": 4160 + }, + { + "epoch": 0.35, + "learning_rate": 0.00019750460559370288, + "loss": 0.7886, + "step": 4180 + }, + { + "epoch": 0.35, + "learning_rate": 0.00019702729860994808, + "loss": 0.79, + "step": 4200 + }, + { + "epoch": 0.35, + "eval_loss": 0.7986196279525757, + "eval_runtime": 8.2739, + "eval_samples_per_second": 241.724, + "eval_steps_per_second": 1.934, + "step": 4200 + }, + { + "epoch": 0.35, + "learning_rate": 0.00019652487020599565, + "loss": 0.7889, + "step": 4220 + }, + { + "epoch": 0.35, + "learning_rate": 0.0001960224418020432, + "loss": 0.7883, + "step": 4240 + }, + { + "epoch": 0.35, + "learning_rate": 0.00019552001339809077, + "loss": 0.7895, + "step": 4260 + }, + { + "epoch": 0.36, + "learning_rate": 0.00019501758499413833, + "loss": 0.7838, + "step": 4280 + }, + { + "epoch": 0.36, + "learning_rate": 0.0001945151565901859, + "loss": 0.7957, + "step": 4300 + }, + { + "epoch": 0.36, + "learning_rate": 0.00019401272818623345, + "loss": 0.7915, + "step": 4320 + }, + { + "epoch": 0.36, + "learning_rate": 0.000193510299782281, + "loss": 0.7987, + "step": 4340 + }, + { + "epoch": 0.36, + "learning_rate": 0.00019300787137832857, + "loss": 0.7811, + "step": 4360 + }, + { + "epoch": 0.36, + "learning_rate": 0.00019250544297437613, + "loss": 0.7921, + "step": 4380 + }, + { + "epoch": 0.37, + "learning_rate": 0.0001920030145704237, + "loss": 0.7928, + "step": 4400 + }, + { + "epoch": 0.37, + "eval_loss": 0.7964197993278503, + "eval_runtime": 8.2761, + "eval_samples_per_second": 241.66, + "eval_steps_per_second": 1.933, + "step": 4400 + }, + { + "epoch": 0.37, + "learning_rate": 0.00019150058616647126, + "loss": 0.7924, + "step": 4420 + }, + { + "epoch": 0.37, + "learning_rate": 0.00019099815776251885, + "loss": 0.7898, + "step": 4440 + }, + { + "epoch": 0.37, + "learning_rate": 0.0001904957293585664, + "loss": 0.7893, + "step": 4460 + }, + { + "epoch": 0.37, + "learning_rate": 0.00018999330095461397, + "loss": 0.795, + "step": 4480 + }, + { + "epoch": 0.37, + "learning_rate": 0.00018949087255066153, + "loss": 0.7832, + "step": 4500 + }, + { + "epoch": 0.38, + "learning_rate": 0.0001889884441467091, + "loss": 0.7835, + "step": 4520 + }, + { + "epoch": 0.38, + "learning_rate": 0.00018848601574275665, + "loss": 0.7809, + "step": 4540 + }, + { + "epoch": 0.38, + "learning_rate": 0.0001879835873388042, + "loss": 0.7792, + "step": 4560 + }, + { + "epoch": 0.38, + "learning_rate": 0.00018748115893485177, + "loss": 0.7884, + "step": 4580 + }, + { + "epoch": 0.38, + "learning_rate": 0.00018697873053089933, + "loss": 0.7859, + "step": 4600 + }, + { + "epoch": 0.38, + "eval_loss": 0.794753909111023, + "eval_runtime": 8.2787, + "eval_samples_per_second": 241.584, + "eval_steps_per_second": 1.933, + "step": 4600 + }, + { + "epoch": 0.38, + "learning_rate": 0.0001864763021269469, + "loss": 0.7709, + "step": 4620 + }, + { + "epoch": 0.39, + "learning_rate": 0.00018597387372299446, + "loss": 0.7786, + "step": 4640 + }, + { + "epoch": 0.39, + "learning_rate": 0.00018547144531904202, + "loss": 0.7786, + "step": 4660 + }, + { + "epoch": 0.39, + "learning_rate": 0.00018496901691508958, + "loss": 0.7824, + "step": 4680 + }, + { + "epoch": 0.39, + "learning_rate": 0.00018446658851113714, + "loss": 0.7741, + "step": 4700 + }, + { + "epoch": 0.39, + "learning_rate": 0.0001839641601071847, + "loss": 0.78, + "step": 4720 + }, + { + "epoch": 0.39, + "learning_rate": 0.00018346173170323226, + "loss": 0.782, + "step": 4740 + }, + { + "epoch": 0.4, + "learning_rate": 0.00018295930329927982, + "loss": 0.7808, + "step": 4760 + }, + { + "epoch": 0.4, + "learning_rate": 0.0001824568748953274, + "loss": 0.7776, + "step": 4780 + }, + { + "epoch": 0.4, + "learning_rate": 0.00018195444649137497, + "loss": 0.7824, + "step": 4800 + }, + { + "epoch": 0.4, + "eval_loss": 0.7927345633506775, + "eval_runtime": 8.2706, + "eval_samples_per_second": 241.82, + "eval_steps_per_second": 1.935, + "step": 4800 + }, + { + "epoch": 0.4, + "learning_rate": 0.00018145201808742253, + "loss": 0.7843, + "step": 4820 + }, + { + "epoch": 0.4, + "learning_rate": 0.0001809495896834701, + "loss": 0.7908, + "step": 4840 + }, + { + "epoch": 0.4, + "learning_rate": 0.00018044716127951766, + "loss": 0.7872, + "step": 4860 + }, + { + "epoch": 0.41, + "learning_rate": 0.00017994473287556522, + "loss": 0.7763, + "step": 4880 + }, + { + "epoch": 0.41, + "learning_rate": 0.00017944230447161278, + "loss": 0.7846, + "step": 4900 + }, + { + "epoch": 0.41, + "learning_rate": 0.00017893987606766034, + "loss": 0.7775, + "step": 4920 + }, + { + "epoch": 0.41, + "learning_rate": 0.0001784374476637079, + "loss": 0.793, + "step": 4940 + }, + { + "epoch": 0.41, + "learning_rate": 0.00017793501925975546, + "loss": 0.7814, + "step": 4960 + }, + { + "epoch": 0.41, + "learning_rate": 0.00017743259085580302, + "loss": 0.7913, + "step": 4980 + }, + { + "epoch": 0.42, + "learning_rate": 0.00017693016245185058, + "loss": 0.7755, + "step": 5000 + }, + { + "epoch": 0.42, + "eval_loss": 0.7912722826004028, + "eval_runtime": 8.285, + "eval_samples_per_second": 241.401, + "eval_steps_per_second": 1.931, + "step": 5000 + }, + { + "epoch": 0.42, + "learning_rate": 0.00017642773404789815, + "loss": 0.7848, + "step": 5020 + }, + { + "epoch": 0.42, + "learning_rate": 0.0001759253056439457, + "loss": 0.7826, + "step": 5040 + }, + { + "epoch": 0.42, + "learning_rate": 0.00017542287723999327, + "loss": 0.7753, + "step": 5060 + }, + { + "epoch": 0.42, + "learning_rate": 0.00017492044883604083, + "loss": 0.7786, + "step": 5080 + }, + { + "epoch": 0.42, + "learning_rate": 0.0001744180204320884, + "loss": 0.7929, + "step": 5100 + }, + { + "epoch": 0.43, + "learning_rate": 0.00017391559202813595, + "loss": 0.7836, + "step": 5120 + }, + { + "epoch": 0.43, + "learning_rate": 0.00017341316362418354, + "loss": 0.7802, + "step": 5140 + }, + { + "epoch": 0.43, + "learning_rate": 0.0001729107352202311, + "loss": 0.7808, + "step": 5160 + }, + { + "epoch": 0.43, + "learning_rate": 0.00017240830681627866, + "loss": 0.7784, + "step": 5180 + }, + { + "epoch": 0.43, + "learning_rate": 0.00017190587841232622, + "loss": 0.7803, + "step": 5200 + }, + { + "epoch": 0.43, + "eval_loss": 0.7892646193504333, + "eval_runtime": 8.297, + "eval_samples_per_second": 241.05, + "eval_steps_per_second": 1.928, + "step": 5200 + }, + { + "epoch": 0.43, + "learning_rate": 0.00017140345000837378, + "loss": 0.7872, + "step": 5220 + }, + { + "epoch": 0.44, + "learning_rate": 0.00017090102160442135, + "loss": 0.7795, + "step": 5240 + }, + { + "epoch": 0.44, + "learning_rate": 0.0001703985932004689, + "loss": 0.7777, + "step": 5260 + }, + { + "epoch": 0.44, + "learning_rate": 0.00016989616479651647, + "loss": 0.7775, + "step": 5280 + }, + { + "epoch": 0.44, + "learning_rate": 0.00016939373639256406, + "loss": 0.7789, + "step": 5300 + }, + { + "epoch": 0.44, + "learning_rate": 0.00016889130798861162, + "loss": 0.7863, + "step": 5320 + }, + { + "epoch": 0.44, + "learning_rate": 0.00016838887958465918, + "loss": 0.7774, + "step": 5340 + }, + { + "epoch": 0.45, + "learning_rate": 0.00016788645118070674, + "loss": 0.7856, + "step": 5360 + }, + { + "epoch": 0.45, + "learning_rate": 0.0001673840227767543, + "loss": 0.78, + "step": 5380 + }, + { + "epoch": 0.45, + "learning_rate": 0.00016688159437280186, + "loss": 0.7929, + "step": 5400 + }, + { + "epoch": 0.45, + "eval_loss": 0.7884517908096313, + "eval_runtime": 8.3482, + "eval_samples_per_second": 239.571, + "eval_steps_per_second": 1.917, + "step": 5400 + }, + { + "epoch": 0.45, + "learning_rate": 0.00016637916596884945, + "loss": 0.7728, + "step": 5420 + }, + { + "epoch": 0.45, + "learning_rate": 0.000165876737564897, + "loss": 0.7827, + "step": 5440 + }, + { + "epoch": 0.45, + "learning_rate": 0.00016537430916094457, + "loss": 0.767, + "step": 5460 + }, + { + "epoch": 0.46, + "learning_rate": 0.00016487188075699213, + "loss": 0.7768, + "step": 5480 + }, + { + "epoch": 0.46, + "learning_rate": 0.0001643694523530397, + "loss": 0.776, + "step": 5500 + }, + { + "epoch": 0.46, + "learning_rate": 0.00016386702394908726, + "loss": 0.77, + "step": 5520 + }, + { + "epoch": 0.46, + "learning_rate": 0.00016336459554513482, + "loss": 0.7825, + "step": 5540 + }, + { + "epoch": 0.46, + "learning_rate": 0.00016286216714118238, + "loss": 0.7829, + "step": 5560 + }, + { + "epoch": 0.46, + "learning_rate": 0.00016235973873722994, + "loss": 0.791, + "step": 5580 + }, + { + "epoch": 0.47, + "learning_rate": 0.0001618573103332775, + "loss": 0.7807, + "step": 5600 + }, + { + "epoch": 0.47, + "eval_loss": 0.7869579792022705, + "eval_runtime": 8.297, + "eval_samples_per_second": 241.051, + "eval_steps_per_second": 1.928, + "step": 5600 + }, + { + "epoch": 0.47, + "learning_rate": 0.00016135488192932506, + "loss": 0.7762, + "step": 5620 + }, + { + "epoch": 0.47, + "learning_rate": 0.00016085245352537262, + "loss": 0.7805, + "step": 5640 + }, + { + "epoch": 0.47, + "learning_rate": 0.00016035002512142018, + "loss": 0.7797, + "step": 5660 + }, + { + "epoch": 0.47, + "learning_rate": 0.00015984759671746774, + "loss": 0.7832, + "step": 5680 + }, + { + "epoch": 0.47, + "learning_rate": 0.0001593451683135153, + "loss": 0.7739, + "step": 5700 + }, + { + "epoch": 0.48, + "learning_rate": 0.00015884273990956287, + "loss": 0.7703, + "step": 5720 + }, + { + "epoch": 0.48, + "learning_rate": 0.00015834031150561043, + "loss": 0.7707, + "step": 5740 + }, + { + "epoch": 0.48, + "learning_rate": 0.000157837883101658, + "loss": 0.7649, + "step": 5760 + }, + { + "epoch": 0.48, + "learning_rate": 0.00015733545469770558, + "loss": 0.7713, + "step": 5780 + }, + { + "epoch": 0.48, + "learning_rate": 0.00015683302629375314, + "loss": 0.7753, + "step": 5800 + }, + { + "epoch": 0.48, + "eval_loss": 0.7855839729309082, + "eval_runtime": 8.3158, + "eval_samples_per_second": 240.507, + "eval_steps_per_second": 1.924, + "step": 5800 + }, + { + "epoch": 0.48, + "learning_rate": 0.0001563305978898007, + "loss": 0.7795, + "step": 5820 + }, + { + "epoch": 0.48, + "learning_rate": 0.00015582816948584826, + "loss": 0.7714, + "step": 5840 + }, + { + "epoch": 0.49, + "learning_rate": 0.00015532574108189582, + "loss": 0.7772, + "step": 5860 + }, + { + "epoch": 0.49, + "learning_rate": 0.00015482331267794338, + "loss": 0.7784, + "step": 5880 + }, + { + "epoch": 0.49, + "learning_rate": 0.00015432088427399094, + "loss": 0.7628, + "step": 5900 + }, + { + "epoch": 0.49, + "learning_rate": 0.0001538184558700385, + "loss": 0.7848, + "step": 5920 + }, + { + "epoch": 0.49, + "learning_rate": 0.00015331602746608607, + "loss": 0.78, + "step": 5940 + }, + { + "epoch": 0.49, + "learning_rate": 0.00015281359906213363, + "loss": 0.7856, + "step": 5960 + }, + { + "epoch": 0.5, + "learning_rate": 0.0001523111706581812, + "loss": 0.776, + "step": 5980 + }, + { + "epoch": 0.5, + "learning_rate": 0.00015180874225422875, + "loss": 0.7752, + "step": 6000 + }, + { + "epoch": 0.5, + "eval_loss": 0.7837858200073242, + "eval_runtime": 8.2979, + "eval_samples_per_second": 241.025, + "eval_steps_per_second": 1.928, + "step": 6000 + }, + { + "epoch": 0.5, + "learning_rate": 0.0001513063138502763, + "loss": 0.7719, + "step": 6020 + }, + { + "epoch": 0.5, + "learning_rate": 0.00015080388544632387, + "loss": 0.7841, + "step": 6040 + }, + { + "epoch": 0.5, + "learning_rate": 0.00015030145704237143, + "loss": 0.7779, + "step": 6060 + }, + { + "epoch": 0.5, + "learning_rate": 0.000149799028638419, + "loss": 0.7706, + "step": 6080 + }, + { + "epoch": 0.51, + "learning_rate": 0.00014929660023446656, + "loss": 0.762, + "step": 6100 + }, + { + "epoch": 0.51, + "learning_rate": 0.00014879417183051412, + "loss": 0.7854, + "step": 6120 + }, + { + "epoch": 0.51, + "learning_rate": 0.0001482917434265617, + "loss": 0.7803, + "step": 6140 + }, + { + "epoch": 0.51, + "learning_rate": 0.00014778931502260927, + "loss": 0.7769, + "step": 6160 + }, + { + "epoch": 0.51, + "learning_rate": 0.00014728688661865683, + "loss": 0.7773, + "step": 6180 + }, + { + "epoch": 0.51, + "learning_rate": 0.0001467844582147044, + "loss": 0.7725, + "step": 6200 + }, + { + "epoch": 0.51, + "eval_loss": 0.7822731137275696, + "eval_runtime": 8.3078, + "eval_samples_per_second": 240.738, + "eval_steps_per_second": 1.926, + "step": 6200 + }, + { + "epoch": 0.52, + "learning_rate": 0.00014630715123094956, + "loss": 0.7797, + "step": 6220 + }, + { + "epoch": 0.52, + "learning_rate": 0.00014580472282699713, + "loss": 0.77, + "step": 6240 + }, + { + "epoch": 0.52, + "learning_rate": 0.00014530229442304469, + "loss": 0.7699, + "step": 6260 + }, + { + "epoch": 0.52, + "learning_rate": 0.00014479986601909225, + "loss": 0.7769, + "step": 6280 + }, + { + "epoch": 0.52, + "learning_rate": 0.00014429743761513984, + "loss": 0.7755, + "step": 6300 + }, + { + "epoch": 0.52, + "learning_rate": 0.0001437950092111874, + "loss": 0.7752, + "step": 6320 + }, + { + "epoch": 0.53, + "learning_rate": 0.00014329258080723496, + "loss": 0.7791, + "step": 6340 + }, + { + "epoch": 0.53, + "learning_rate": 0.00014279015240328252, + "loss": 0.7606, + "step": 6360 + }, + { + "epoch": 0.53, + "learning_rate": 0.00014228772399933008, + "loss": 0.7733, + "step": 6380 + }, + { + "epoch": 0.53, + "learning_rate": 0.00014178529559537764, + "loss": 0.7855, + "step": 6400 + }, + { + "epoch": 0.53, + "eval_loss": 0.7812179923057556, + "eval_runtime": 8.2896, + "eval_samples_per_second": 241.265, + "eval_steps_per_second": 1.93, + "step": 6400 + }, + { + "epoch": 0.53, + "learning_rate": 0.0001412828671914252, + "loss": 0.7718, + "step": 6420 + }, + { + "epoch": 0.53, + "learning_rate": 0.00014078043878747276, + "loss": 0.7676, + "step": 6440 + }, + { + "epoch": 0.54, + "learning_rate": 0.00014027801038352035, + "loss": 0.7732, + "step": 6460 + }, + { + "epoch": 0.54, + "learning_rate": 0.0001397755819795679, + "loss": 0.7745, + "step": 6480 + }, + { + "epoch": 0.54, + "learning_rate": 0.00013927315357561547, + "loss": 0.7747, + "step": 6500 + }, + { + "epoch": 0.54, + "learning_rate": 0.00013877072517166304, + "loss": 0.7648, + "step": 6520 + }, + { + "epoch": 0.54, + "learning_rate": 0.0001382682967677106, + "loss": 0.7767, + "step": 6540 + }, + { + "epoch": 0.54, + "learning_rate": 0.00013776586836375816, + "loss": 0.7735, + "step": 6560 + }, + { + "epoch": 0.55, + "learning_rate": 0.00013726343995980572, + "loss": 0.7684, + "step": 6580 + }, + { + "epoch": 0.55, + "learning_rate": 0.00013676101155585328, + "loss": 0.7783, + "step": 6600 + }, + { + "epoch": 0.55, + "eval_loss": 0.7797773480415344, + "eval_runtime": 8.2853, + "eval_samples_per_second": 241.391, + "eval_steps_per_second": 1.931, + "step": 6600 + }, + { + "epoch": 0.55, + "learning_rate": 0.00013625858315190084, + "loss": 0.7755, + "step": 6620 + }, + { + "epoch": 0.55, + "learning_rate": 0.0001357561547479484, + "loss": 0.7813, + "step": 6640 + }, + { + "epoch": 0.55, + "learning_rate": 0.00013525372634399596, + "loss": 0.7721, + "step": 6660 + }, + { + "epoch": 0.55, + "learning_rate": 0.00013475129794004352, + "loss": 0.7621, + "step": 6680 + }, + { + "epoch": 0.56, + "learning_rate": 0.00013424886953609109, + "loss": 0.7703, + "step": 6700 + }, + { + "epoch": 0.56, + "learning_rate": 0.00013374644113213865, + "loss": 0.7795, + "step": 6720 + }, + { + "epoch": 0.56, + "learning_rate": 0.0001332440127281862, + "loss": 0.779, + "step": 6740 + }, + { + "epoch": 0.56, + "learning_rate": 0.00013274158432423377, + "loss": 0.7762, + "step": 6760 + }, + { + "epoch": 0.56, + "learning_rate": 0.00013223915592028133, + "loss": 0.7769, + "step": 6780 + }, + { + "epoch": 0.56, + "learning_rate": 0.0001317367275163289, + "loss": 0.7762, + "step": 6800 + }, + { + "epoch": 0.56, + "eval_loss": 0.7790360450744629, + "eval_runtime": 8.301, + "eval_samples_per_second": 240.935, + "eval_steps_per_second": 1.927, + "step": 6800 + }, + { + "epoch": 0.57, + "learning_rate": 0.00013123429911237648, + "loss": 0.7587, + "step": 6820 + }, + { + "epoch": 0.57, + "learning_rate": 0.00013073187070842404, + "loss": 0.7667, + "step": 6840 + }, + { + "epoch": 0.57, + "learning_rate": 0.0001302294423044716, + "loss": 0.7575, + "step": 6860 + }, + { + "epoch": 0.57, + "learning_rate": 0.00012972701390051916, + "loss": 0.7755, + "step": 6880 + }, + { + "epoch": 0.57, + "learning_rate": 0.00012922458549656672, + "loss": 0.7718, + "step": 6900 + }, + { + "epoch": 0.57, + "learning_rate": 0.00012872215709261429, + "loss": 0.7715, + "step": 6920 + }, + { + "epoch": 0.58, + "learning_rate": 0.00012821972868866185, + "loss": 0.7679, + "step": 6940 + }, + { + "epoch": 0.58, + "learning_rate": 0.00012771730028470944, + "loss": 0.7691, + "step": 6960 + }, + { + "epoch": 0.58, + "learning_rate": 0.000127214871880757, + "loss": 0.7766, + "step": 6980 + }, + { + "epoch": 0.58, + "learning_rate": 0.00012671244347680456, + "loss": 0.7708, + "step": 7000 + }, + { + "epoch": 0.58, + "eval_loss": 0.7774990200996399, + "eval_runtime": 8.2721, + "eval_samples_per_second": 241.778, + "eval_steps_per_second": 1.934, + "step": 7000 + }, + { + "epoch": 0.58, + "learning_rate": 0.00012621001507285212, + "loss": 0.7687, + "step": 7020 + }, + { + "epoch": 0.58, + "learning_rate": 0.00012570758666889968, + "loss": 0.7675, + "step": 7040 + }, + { + "epoch": 0.59, + "learning_rate": 0.00012520515826494724, + "loss": 0.7746, + "step": 7060 + }, + { + "epoch": 0.59, + "learning_rate": 0.0001247027298609948, + "loss": 0.7626, + "step": 7080 + }, + { + "epoch": 0.59, + "learning_rate": 0.00012420030145704236, + "loss": 0.764, + "step": 7100 + }, + { + "epoch": 0.59, + "learning_rate": 0.00012369787305308992, + "loss": 0.7734, + "step": 7120 + }, + { + "epoch": 0.59, + "learning_rate": 0.00012319544464913749, + "loss": 0.7642, + "step": 7140 + }, + { + "epoch": 0.59, + "learning_rate": 0.00012269301624518505, + "loss": 0.7715, + "step": 7160 + }, + { + "epoch": 0.6, + "learning_rate": 0.0001221905878412326, + "loss": 0.7619, + "step": 7180 + }, + { + "epoch": 0.6, + "learning_rate": 0.00012168815943728017, + "loss": 0.7674, + "step": 7200 + }, + { + "epoch": 0.6, + "eval_loss": 0.777417004108429, + "eval_runtime": 8.2947, + "eval_samples_per_second": 241.117, + "eval_steps_per_second": 1.929, + "step": 7200 + }, + { + "epoch": 0.6, + "learning_rate": 0.00012118573103332774, + "loss": 0.7709, + "step": 7220 + }, + { + "epoch": 0.6, + "learning_rate": 0.0001206833026293753, + "loss": 0.7682, + "step": 7240 + }, + { + "epoch": 0.6, + "learning_rate": 0.00012018087422542287, + "loss": 0.7604, + "step": 7260 + }, + { + "epoch": 0.6, + "learning_rate": 0.00011967844582147043, + "loss": 0.767, + "step": 7280 + }, + { + "epoch": 0.61, + "learning_rate": 0.00011917601741751799, + "loss": 0.7707, + "step": 7300 + }, + { + "epoch": 0.61, + "learning_rate": 0.00011867358901356555, + "loss": 0.7602, + "step": 7320 + }, + { + "epoch": 0.61, + "learning_rate": 0.00011817116060961311, + "loss": 0.7584, + "step": 7340 + }, + { + "epoch": 0.61, + "learning_rate": 0.00011766873220566067, + "loss": 0.7696, + "step": 7360 + }, + { + "epoch": 0.61, + "learning_rate": 0.00011716630380170823, + "loss": 0.7675, + "step": 7380 + }, + { + "epoch": 0.61, + "learning_rate": 0.00011666387539775581, + "loss": 0.7716, + "step": 7400 + }, + { + "epoch": 0.61, + "eval_loss": 0.7760618329048157, + "eval_runtime": 8.2995, + "eval_samples_per_second": 240.98, + "eval_steps_per_second": 1.928, + "step": 7400 + }, + { + "epoch": 0.62, + "learning_rate": 0.00011616144699380338, + "loss": 0.7648, + "step": 7420 + }, + { + "epoch": 0.62, + "learning_rate": 0.00011565901858985094, + "loss": 0.7643, + "step": 7440 + }, + { + "epoch": 0.62, + "learning_rate": 0.0001151565901858985, + "loss": 0.759, + "step": 7460 + }, + { + "epoch": 0.62, + "learning_rate": 0.00011465416178194607, + "loss": 0.7694, + "step": 7480 + }, + { + "epoch": 0.62, + "learning_rate": 0.00011415173337799363, + "loss": 0.7633, + "step": 7500 + }, + { + "epoch": 0.62, + "learning_rate": 0.00011364930497404119, + "loss": 0.773, + "step": 7520 + }, + { + "epoch": 0.63, + "learning_rate": 0.00011314687657008876, + "loss": 0.76, + "step": 7540 + }, + { + "epoch": 0.63, + "learning_rate": 0.00011264444816613632, + "loss": 0.7708, + "step": 7560 + }, + { + "epoch": 0.63, + "learning_rate": 0.00011214201976218388, + "loss": 0.7757, + "step": 7580 + }, + { + "epoch": 0.63, + "learning_rate": 0.00011163959135823145, + "loss": 0.7655, + "step": 7600 + }, + { + "epoch": 0.63, + "eval_loss": 0.7751550078392029, + "eval_runtime": 8.2715, + "eval_samples_per_second": 241.793, + "eval_steps_per_second": 1.934, + "step": 7600 + }, + { + "epoch": 0.63, + "learning_rate": 0.00011113716295427901, + "loss": 0.7607, + "step": 7620 + }, + { + "epoch": 0.63, + "learning_rate": 0.00011063473455032657, + "loss": 0.7703, + "step": 7640 + }, + { + "epoch": 0.64, + "learning_rate": 0.00011013230614637413, + "loss": 0.7653, + "step": 7660 + }, + { + "epoch": 0.64, + "learning_rate": 0.00010962987774242169, + "loss": 0.7753, + "step": 7680 + }, + { + "epoch": 0.64, + "learning_rate": 0.00010912744933846925, + "loss": 0.7639, + "step": 7700 + }, + { + "epoch": 0.64, + "learning_rate": 0.00010862502093451683, + "loss": 0.7701, + "step": 7720 + }, + { + "epoch": 0.64, + "learning_rate": 0.00010812259253056439, + "loss": 0.7614, + "step": 7740 + }, + { + "epoch": 0.64, + "learning_rate": 0.00010762016412661195, + "loss": 0.7612, + "step": 7760 + }, + { + "epoch": 0.65, + "learning_rate": 0.00010711773572265951, + "loss": 0.7597, + "step": 7780 + }, + { + "epoch": 0.65, + "learning_rate": 0.00010661530731870707, + "loss": 0.7621, + "step": 7800 + }, + { + "epoch": 0.65, + "eval_loss": 0.7740359902381897, + "eval_runtime": 8.2962, + "eval_samples_per_second": 241.075, + "eval_steps_per_second": 1.929, + "step": 7800 + }, + { + "epoch": 0.65, + "learning_rate": 0.00010611287891475463, + "loss": 0.7592, + "step": 7820 + }, + { + "epoch": 0.65, + "learning_rate": 0.0001056104505108022, + "loss": 0.7665, + "step": 7840 + }, + { + "epoch": 0.65, + "learning_rate": 0.00010510802210684975, + "loss": 0.7646, + "step": 7860 + }, + { + "epoch": 0.65, + "learning_rate": 0.00010460559370289732, + "loss": 0.7668, + "step": 7880 + }, + { + "epoch": 0.66, + "learning_rate": 0.00010410316529894489, + "loss": 0.7756, + "step": 7900 + }, + { + "epoch": 0.66, + "learning_rate": 0.00010360073689499245, + "loss": 0.7684, + "step": 7920 + }, + { + "epoch": 0.66, + "learning_rate": 0.00010309830849104001, + "loss": 0.7566, + "step": 7940 + }, + { + "epoch": 0.66, + "learning_rate": 0.00010259588008708757, + "loss": 0.7581, + "step": 7960 + }, + { + "epoch": 0.66, + "learning_rate": 0.00010209345168313515, + "loss": 0.7624, + "step": 7980 + }, + { + "epoch": 0.66, + "learning_rate": 0.00010159102327918271, + "loss": 0.7594, + "step": 8000 + }, + { + "epoch": 0.66, + "eval_loss": 0.7729437351226807, + "eval_runtime": 8.2949, + "eval_samples_per_second": 241.112, + "eval_steps_per_second": 1.929, + "step": 8000 + }, + { + "epoch": 0.67, + "learning_rate": 0.00010108859487523028, + "loss": 0.7657, + "step": 8020 + }, + { + "epoch": 0.67, + "learning_rate": 0.00010058616647127785, + "loss": 0.7604, + "step": 8040 + }, + { + "epoch": 0.67, + "learning_rate": 0.0001000837380673254, + "loss": 0.7659, + "step": 8060 + }, + { + "epoch": 0.67, + "learning_rate": 9.958130966337297e-05, + "loss": 0.7688, + "step": 8080 + }, + { + "epoch": 0.67, + "learning_rate": 9.907888125942053e-05, + "loss": 0.7702, + "step": 8100 + }, + { + "epoch": 0.67, + "learning_rate": 9.857645285546809e-05, + "loss": 0.7572, + "step": 8120 + }, + { + "epoch": 0.68, + "learning_rate": 9.807402445151565e-05, + "loss": 0.7603, + "step": 8140 + }, + { + "epoch": 0.68, + "learning_rate": 9.757159604756321e-05, + "loss": 0.761, + "step": 8160 + }, + { + "epoch": 0.68, + "learning_rate": 9.706916764361077e-05, + "loss": 0.7649, + "step": 8180 + }, + { + "epoch": 0.68, + "learning_rate": 9.656673923965835e-05, + "loss": 0.7554, + "step": 8200 + }, + { + "epoch": 0.68, + "eval_loss": 0.7727349996566772, + "eval_runtime": 8.2811, + "eval_samples_per_second": 241.515, + "eval_steps_per_second": 1.932, + "step": 8200 + }, + { + "epoch": 0.68, + "learning_rate": 9.606431083570591e-05, + "loss": 0.7634, + "step": 8220 + }, + { + "epoch": 0.68, + "learning_rate": 9.556188243175347e-05, + "loss": 0.7611, + "step": 8240 + }, + { + "epoch": 0.69, + "learning_rate": 9.505945402780103e-05, + "loss": 0.7606, + "step": 8260 + }, + { + "epoch": 0.69, + "learning_rate": 9.455702562384859e-05, + "loss": 0.7703, + "step": 8280 + }, + { + "epoch": 0.69, + "learning_rate": 9.405459721989615e-05, + "loss": 0.7616, + "step": 8300 + }, + { + "epoch": 0.69, + "learning_rate": 9.355216881594372e-05, + "loss": 0.7638, + "step": 8320 + }, + { + "epoch": 0.69, + "learning_rate": 9.30748618321889e-05, + "loss": 0.7461, + "step": 8340 + }, + { + "epoch": 0.69, + "learning_rate": 9.257243342823646e-05, + "loss": 0.7577, + "step": 8360 + }, + { + "epoch": 0.7, + "learning_rate": 9.207000502428403e-05, + "loss": 0.7553, + "step": 8380 + }, + { + "epoch": 0.7, + "learning_rate": 9.15675766203316e-05, + "loss": 0.7564, + "step": 8400 + }, + { + "epoch": 0.7, + "eval_loss": 0.771743893623352, + "eval_runtime": 8.2767, + "eval_samples_per_second": 241.642, + "eval_steps_per_second": 1.933, + "step": 8400 + }, + { + "epoch": 0.7, + "learning_rate": 9.106514821637916e-05, + "loss": 0.7605, + "step": 8420 + }, + { + "epoch": 0.7, + "learning_rate": 9.056271981242672e-05, + "loss": 0.773, + "step": 8440 + }, + { + "epoch": 0.7, + "learning_rate": 9.006029140847428e-05, + "loss": 0.7657, + "step": 8460 + }, + { + "epoch": 0.7, + "learning_rate": 8.955786300452185e-05, + "loss": 0.7597, + "step": 8480 + }, + { + "epoch": 0.71, + "learning_rate": 8.90554346005694e-05, + "loss": 0.7687, + "step": 8500 + }, + { + "epoch": 0.71, + "learning_rate": 8.855300619661697e-05, + "loss": 0.7545, + "step": 8520 + }, + { + "epoch": 0.71, + "learning_rate": 8.805057779266453e-05, + "loss": 0.7575, + "step": 8540 + }, + { + "epoch": 0.71, + "learning_rate": 8.754814938871209e-05, + "loss": 0.771, + "step": 8560 + }, + { + "epoch": 0.71, + "learning_rate": 8.704572098475966e-05, + "loss": 0.7622, + "step": 8580 + }, + { + "epoch": 0.71, + "learning_rate": 8.654329258080723e-05, + "loss": 0.7621, + "step": 8600 + }, + { + "epoch": 0.71, + "eval_loss": 0.7706981897354126, + "eval_runtime": 8.2852, + "eval_samples_per_second": 241.394, + "eval_steps_per_second": 1.931, + "step": 8600 + }, + { + "epoch": 0.72, + "learning_rate": 8.604086417685479e-05, + "loss": 0.7516, + "step": 8620 + }, + { + "epoch": 0.72, + "learning_rate": 8.553843577290235e-05, + "loss": 0.7526, + "step": 8640 + }, + { + "epoch": 0.72, + "learning_rate": 8.503600736894991e-05, + "loss": 0.7639, + "step": 8660 + }, + { + "epoch": 0.72, + "learning_rate": 8.453357896499747e-05, + "loss": 0.7561, + "step": 8680 + }, + { + "epoch": 0.72, + "learning_rate": 8.403115056104506e-05, + "loss": 0.7643, + "step": 8700 + }, + { + "epoch": 0.72, + "learning_rate": 8.352872215709262e-05, + "loss": 0.7665, + "step": 8720 + }, + { + "epoch": 0.73, + "learning_rate": 8.302629375314018e-05, + "loss": 0.7681, + "step": 8740 + }, + { + "epoch": 0.73, + "learning_rate": 8.252386534918774e-05, + "loss": 0.7655, + "step": 8760 + }, + { + "epoch": 0.73, + "learning_rate": 8.20214369452353e-05, + "loss": 0.7603, + "step": 8780 + }, + { + "epoch": 0.73, + "learning_rate": 8.151900854128286e-05, + "loss": 0.7624, + "step": 8800 + }, + { + "epoch": 0.73, + "eval_loss": 0.7699927687644958, + "eval_runtime": 8.2872, + "eval_samples_per_second": 241.335, + "eval_steps_per_second": 1.931, + "step": 8800 + }, + { + "epoch": 0.73, + "learning_rate": 8.101658013733043e-05, + "loss": 0.7666, + "step": 8820 + }, + { + "epoch": 0.73, + "learning_rate": 8.051415173337799e-05, + "loss": 0.752, + "step": 8840 + }, + { + "epoch": 0.74, + "learning_rate": 8.001172332942555e-05, + "loss": 0.7654, + "step": 8860 + }, + { + "epoch": 0.74, + "learning_rate": 7.950929492547312e-05, + "loss": 0.7546, + "step": 8880 + }, + { + "epoch": 0.74, + "learning_rate": 7.900686652152068e-05, + "loss": 0.755, + "step": 8900 + }, + { + "epoch": 0.74, + "learning_rate": 7.850443811756824e-05, + "loss": 0.7578, + "step": 8920 + }, + { + "epoch": 0.74, + "learning_rate": 7.80020097136158e-05, + "loss": 0.7527, + "step": 8940 + }, + { + "epoch": 0.74, + "learning_rate": 7.749958130966337e-05, + "loss": 0.7622, + "step": 8960 + }, + { + "epoch": 0.75, + "learning_rate": 7.699715290571093e-05, + "loss": 0.7681, + "step": 8980 + }, + { + "epoch": 0.75, + "learning_rate": 7.649472450175849e-05, + "loss": 0.7641, + "step": 9000 + }, + { + "epoch": 0.75, + "eval_loss": 0.7696471214294434, + "eval_runtime": 8.269, + "eval_samples_per_second": 241.867, + "eval_steps_per_second": 1.935, + "step": 9000 + }, + { + "epoch": 0.75, + "learning_rate": 7.599229609780605e-05, + "loss": 0.7622, + "step": 9020 + }, + { + "epoch": 0.75, + "learning_rate": 7.548986769385361e-05, + "loss": 0.7608, + "step": 9040 + }, + { + "epoch": 0.75, + "learning_rate": 7.498743928990119e-05, + "loss": 0.7613, + "step": 9060 + }, + { + "epoch": 0.75, + "learning_rate": 7.448501088594875e-05, + "loss": 0.759, + "step": 9080 + }, + { + "epoch": 0.76, + "learning_rate": 7.398258248199631e-05, + "loss": 0.7585, + "step": 9100 + }, + { + "epoch": 0.76, + "learning_rate": 7.348015407804387e-05, + "loss": 0.7609, + "step": 9120 + }, + { + "epoch": 0.76, + "learning_rate": 7.297772567409144e-05, + "loss": 0.76, + "step": 9140 + }, + { + "epoch": 0.76, + "learning_rate": 7.2475297270139e-05, + "loss": 0.7621, + "step": 9160 + }, + { + "epoch": 0.76, + "learning_rate": 7.197286886618657e-05, + "loss": 0.7587, + "step": 9180 + }, + { + "epoch": 0.76, + "learning_rate": 7.147044046223413e-05, + "loss": 0.7589, + "step": 9200 + }, + { + "epoch": 0.76, + "eval_loss": 0.7685341835021973, + "eval_runtime": 8.3378, + "eval_samples_per_second": 239.872, + "eval_steps_per_second": 1.919, + "step": 9200 + }, + { + "epoch": 0.77, + "learning_rate": 7.096801205828169e-05, + "loss": 0.7708, + "step": 9220 + }, + { + "epoch": 0.77, + "learning_rate": 7.046558365432925e-05, + "loss": 0.7468, + "step": 9240 + }, + { + "epoch": 0.77, + "learning_rate": 6.996315525037681e-05, + "loss": 0.7669, + "step": 9260 + }, + { + "epoch": 0.77, + "learning_rate": 6.946072684642437e-05, + "loss": 0.7639, + "step": 9280 + }, + { + "epoch": 0.77, + "learning_rate": 6.895829844247193e-05, + "loss": 0.764, + "step": 9300 + }, + { + "epoch": 0.77, + "learning_rate": 6.845587003851951e-05, + "loss": 0.7605, + "step": 9320 + }, + { + "epoch": 0.78, + "learning_rate": 6.795344163456707e-05, + "loss": 0.7638, + "step": 9340 + }, + { + "epoch": 0.78, + "learning_rate": 6.745101323061463e-05, + "loss": 0.753, + "step": 9360 + }, + { + "epoch": 0.78, + "learning_rate": 6.694858482666219e-05, + "loss": 0.7567, + "step": 9380 + }, + { + "epoch": 0.78, + "learning_rate": 6.644615642270977e-05, + "loss": 0.7604, + "step": 9400 + }, + { + "epoch": 0.78, + "eval_loss": 0.7682663798332214, + "eval_runtime": 8.2956, + "eval_samples_per_second": 241.092, + "eval_steps_per_second": 1.929, + "step": 9400 + }, + { + "epoch": 0.78, + "learning_rate": 6.594372801875733e-05, + "loss": 0.7603, + "step": 9420 + }, + { + "epoch": 0.78, + "learning_rate": 6.544129961480489e-05, + "loss": 0.7677, + "step": 9440 + }, + { + "epoch": 0.79, + "learning_rate": 6.493887121085245e-05, + "loss": 0.7692, + "step": 9460 + }, + { + "epoch": 0.79, + "learning_rate": 6.443644280690001e-05, + "loss": 0.7637, + "step": 9480 + }, + { + "epoch": 0.79, + "learning_rate": 6.393401440294757e-05, + "loss": 0.756, + "step": 9500 + }, + { + "epoch": 0.79, + "learning_rate": 6.343158599899513e-05, + "loss": 0.7572, + "step": 9520 + }, + { + "epoch": 0.79, + "learning_rate": 6.29291575950427e-05, + "loss": 0.7696, + "step": 9540 + }, + { + "epoch": 0.79, + "learning_rate": 6.242672919109027e-05, + "loss": 0.753, + "step": 9560 + }, + { + "epoch": 0.8, + "learning_rate": 6.192430078713783e-05, + "loss": 0.7619, + "step": 9580 + }, + { + "epoch": 0.8, + "learning_rate": 6.142187238318539e-05, + "loss": 0.7574, + "step": 9600 + }, + { + "epoch": 0.8, + "eval_loss": 0.7677283883094788, + "eval_runtime": 8.2754, + "eval_samples_per_second": 241.68, + "eval_steps_per_second": 1.933, + "step": 9600 + }, + { + "epoch": 0.8, + "learning_rate": 6.091944397923295e-05, + "loss": 0.7624, + "step": 9620 + }, + { + "epoch": 0.8, + "learning_rate": 6.0417015575280514e-05, + "loss": 0.7554, + "step": 9640 + }, + { + "epoch": 0.8, + "learning_rate": 5.9914587171328075e-05, + "loss": 0.7635, + "step": 9660 + }, + { + "epoch": 0.8, + "learning_rate": 5.941215876737565e-05, + "loss": 0.7586, + "step": 9680 + }, + { + "epoch": 0.81, + "learning_rate": 5.890973036342321e-05, + "loss": 0.755, + "step": 9700 + }, + { + "epoch": 0.81, + "learning_rate": 5.840730195947077e-05, + "loss": 0.7584, + "step": 9720 + }, + { + "epoch": 0.81, + "learning_rate": 5.7904873555518333e-05, + "loss": 0.7528, + "step": 9740 + }, + { + "epoch": 0.81, + "learning_rate": 5.74024451515659e-05, + "loss": 0.7597, + "step": 9760 + }, + { + "epoch": 0.81, + "learning_rate": 5.690001674761346e-05, + "loss": 0.7527, + "step": 9780 + }, + { + "epoch": 0.81, + "learning_rate": 5.6397588343661024e-05, + "loss": 0.7617, + "step": 9800 + }, + { + "epoch": 0.81, + "eval_loss": 0.7668038010597229, + "eval_runtime": 8.3597, + "eval_samples_per_second": 239.242, + "eval_steps_per_second": 1.914, + "step": 9800 + }, + { + "epoch": 0.82, + "learning_rate": 5.5895159939708585e-05, + "loss": 0.7584, + "step": 9820 + }, + { + "epoch": 0.82, + "learning_rate": 5.5392731535756146e-05, + "loss": 0.762, + "step": 9840 + }, + { + "epoch": 0.82, + "learning_rate": 5.4890303131803714e-05, + "loss": 0.7586, + "step": 9860 + }, + { + "epoch": 0.82, + "learning_rate": 5.4387874727851275e-05, + "loss": 0.7569, + "step": 9880 + }, + { + "epoch": 0.82, + "learning_rate": 5.3885446323898836e-05, + "loss": 0.7611, + "step": 9900 + }, + { + "epoch": 0.82, + "learning_rate": 5.33830179199464e-05, + "loss": 0.7581, + "step": 9920 + }, + { + "epoch": 0.83, + "learning_rate": 5.2880589515993965e-05, + "loss": 0.7657, + "step": 9940 + }, + { + "epoch": 0.83, + "learning_rate": 5.237816111204153e-05, + "loss": 0.7504, + "step": 9960 + }, + { + "epoch": 0.83, + "learning_rate": 5.1875732708089094e-05, + "loss": 0.7547, + "step": 9980 + }, + { + "epoch": 0.83, + "learning_rate": 5.1373304304136655e-05, + "loss": 0.7588, + "step": 10000 + }, + { + "epoch": 0.83, + "eval_loss": 0.766875147819519, + "eval_runtime": 8.3125, + "eval_samples_per_second": 240.603, + "eval_steps_per_second": 1.925, + "step": 10000 + }, + { + "epoch": 0.83, + "learning_rate": 5.0870875900184223e-05, + "loss": 0.7588, + "step": 10020 + }, + { + "epoch": 0.83, + "learning_rate": 5.0368447496231785e-05, + "loss": 0.7544, + "step": 10040 + }, + { + "epoch": 0.84, + "learning_rate": 4.9866019092279346e-05, + "loss": 0.7588, + "step": 10060 + }, + { + "epoch": 0.84, + "learning_rate": 4.936359068832691e-05, + "loss": 0.7547, + "step": 10080 + }, + { + "epoch": 0.84, + "learning_rate": 4.8861162284374475e-05, + "loss": 0.7608, + "step": 10100 + }, + { + "epoch": 0.84, + "learning_rate": 4.8358733880422036e-05, + "loss": 0.7539, + "step": 10120 + }, + { + "epoch": 0.84, + "learning_rate": 4.78563054764696e-05, + "loss": 0.7639, + "step": 10140 + }, + { + "epoch": 0.84, + "learning_rate": 4.735387707251716e-05, + "loss": 0.7622, + "step": 10160 + }, + { + "epoch": 0.85, + "learning_rate": 4.6851448668564726e-05, + "loss": 0.7528, + "step": 10180 + }, + { + "epoch": 0.85, + "learning_rate": 4.634902026461229e-05, + "loss": 0.7566, + "step": 10200 + }, + { + "epoch": 0.85, + "eval_loss": 0.7660693526268005, + "eval_runtime": 8.2951, + "eval_samples_per_second": 241.107, + "eval_steps_per_second": 1.929, + "step": 10200 + }, + { + "epoch": 0.85, + "learning_rate": 4.584659186065985e-05, + "loss": 0.7545, + "step": 10220 + }, + { + "epoch": 0.85, + "learning_rate": 4.5344163456707416e-05, + "loss": 0.7505, + "step": 10240 + }, + { + "epoch": 0.85, + "learning_rate": 4.4841735052754984e-05, + "loss": 0.7645, + "step": 10260 + }, + { + "epoch": 0.85, + "learning_rate": 4.4339306648802545e-05, + "loss": 0.7566, + "step": 10280 + }, + { + "epoch": 0.86, + "learning_rate": 4.383687824485011e-05, + "loss": 0.7509, + "step": 10300 + }, + { + "epoch": 0.86, + "learning_rate": 4.333444984089767e-05, + "loss": 0.7546, + "step": 10320 + }, + { + "epoch": 0.86, + "learning_rate": 4.285714285714285e-05, + "loss": 0.7533, + "step": 10340 + }, + { + "epoch": 0.86, + "learning_rate": 4.235471445319041e-05, + "loss": 0.7509, + "step": 10360 + }, + { + "epoch": 0.86, + "learning_rate": 4.185228604923798e-05, + "loss": 0.7558, + "step": 10380 + }, + { + "epoch": 0.86, + "learning_rate": 4.1349857645285547e-05, + "loss": 0.7624, + "step": 10400 + }, + { + "epoch": 0.86, + "eval_loss": 0.7657083868980408, + "eval_runtime": 8.2828, + "eval_samples_per_second": 241.464, + "eval_steps_per_second": 1.932, + "step": 10400 + }, + { + "epoch": 0.87, + "learning_rate": 4.084742924133311e-05, + "loss": 0.7509, + "step": 10420 + }, + { + "epoch": 0.87, + "learning_rate": 4.0345000837380676e-05, + "loss": 0.7509, + "step": 10440 + }, + { + "epoch": 0.87, + "learning_rate": 3.984257243342824e-05, + "loss": 0.761, + "step": 10460 + }, + { + "epoch": 0.87, + "learning_rate": 3.93401440294758e-05, + "loss": 0.7546, + "step": 10480 + }, + { + "epoch": 0.87, + "learning_rate": 3.883771562552336e-05, + "loss": 0.764, + "step": 10500 + }, + { + "epoch": 0.87, + "learning_rate": 3.833528722157092e-05, + "loss": 0.765, + "step": 10520 + }, + { + "epoch": 0.88, + "learning_rate": 3.783285881761849e-05, + "loss": 0.7551, + "step": 10540 + }, + { + "epoch": 0.88, + "learning_rate": 3.733043041366605e-05, + "loss": 0.7554, + "step": 10560 + }, + { + "epoch": 0.88, + "learning_rate": 3.682800200971361e-05, + "loss": 0.7574, + "step": 10580 + }, + { + "epoch": 0.88, + "learning_rate": 3.632557360576117e-05, + "loss": 0.7647, + "step": 10600 + }, + { + "epoch": 0.88, + "eval_loss": 0.7651572227478027, + "eval_runtime": 8.2868, + "eval_samples_per_second": 241.347, + "eval_steps_per_second": 1.931, + "step": 10600 + }, + { + "epoch": 0.88, + "learning_rate": 3.584826662200636e-05, + "loss": 0.7508, + "step": 10620 + }, + { + "epoch": 0.88, + "learning_rate": 3.534583821805393e-05, + "loss": 0.7636, + "step": 10640 + }, + { + "epoch": 0.89, + "learning_rate": 3.484340981410149e-05, + "loss": 0.7584, + "step": 10660 + }, + { + "epoch": 0.89, + "learning_rate": 3.434098141014905e-05, + "loss": 0.7677, + "step": 10680 + }, + { + "epoch": 0.89, + "learning_rate": 3.383855300619661e-05, + "loss": 0.7493, + "step": 10700 + }, + { + "epoch": 0.89, + "learning_rate": 3.333612460224418e-05, + "loss": 0.7557, + "step": 10720 + }, + { + "epoch": 0.89, + "learning_rate": 3.283369619829174e-05, + "loss": 0.7528, + "step": 10740 + }, + { + "epoch": 0.89, + "learning_rate": 3.23312677943393e-05, + "loss": 0.7573, + "step": 10760 + }, + { + "epoch": 0.9, + "learning_rate": 3.182883939038687e-05, + "loss": 0.7471, + "step": 10780 + }, + { + "epoch": 0.9, + "learning_rate": 3.132641098643443e-05, + "loss": 0.7537, + "step": 10800 + }, + { + "epoch": 0.9, + "eval_loss": 0.7651455402374268, + "eval_runtime": 8.2847, + "eval_samples_per_second": 241.409, + "eval_steps_per_second": 1.931, + "step": 10800 + }, + { + "epoch": 0.9, + "learning_rate": 3.082398258248199e-05, + "loss": 0.7538, + "step": 10820 + }, + { + "epoch": 0.9, + "learning_rate": 3.0321554178529556e-05, + "loss": 0.7585, + "step": 10840 + }, + { + "epoch": 0.9, + "learning_rate": 2.981912577457712e-05, + "loss": 0.7533, + "step": 10860 + }, + { + "epoch": 0.9, + "learning_rate": 2.9316697370624682e-05, + "loss": 0.7607, + "step": 10880 + }, + { + "epoch": 0.91, + "learning_rate": 2.8814268966672247e-05, + "loss": 0.7522, + "step": 10900 + }, + { + "epoch": 0.91, + "learning_rate": 2.831184056271981e-05, + "loss": 0.7581, + "step": 10920 + }, + { + "epoch": 0.91, + "learning_rate": 2.7809412158767376e-05, + "loss": 0.7597, + "step": 10940 + }, + { + "epoch": 0.91, + "learning_rate": 2.7306983754814937e-05, + "loss": 0.7649, + "step": 10960 + }, + { + "epoch": 0.91, + "learning_rate": 2.68045553508625e-05, + "loss": 0.7645, + "step": 10980 + }, + { + "epoch": 0.91, + "learning_rate": 2.6302126946910063e-05, + "loss": 0.743, + "step": 11000 + }, + { + "epoch": 0.91, + "eval_loss": 0.7645469903945923, + "eval_runtime": 8.314, + "eval_samples_per_second": 240.558, + "eval_steps_per_second": 1.924, + "step": 11000 + }, + { + "epoch": 0.92, + "learning_rate": 2.5799698542957624e-05, + "loss": 0.7488, + "step": 11020 + }, + { + "epoch": 0.92, + "learning_rate": 2.529727013900519e-05, + "loss": 0.7515, + "step": 11040 + }, + { + "epoch": 0.92, + "learning_rate": 2.4794841735052756e-05, + "loss": 0.7582, + "step": 11060 + }, + { + "epoch": 0.92, + "learning_rate": 2.4292413331100317e-05, + "loss": 0.7564, + "step": 11080 + }, + { + "epoch": 0.92, + "learning_rate": 2.378998492714788e-05, + "loss": 0.7486, + "step": 11100 + }, + { + "epoch": 0.92, + "learning_rate": 2.3287556523195443e-05, + "loss": 0.7537, + "step": 11120 + }, + { + "epoch": 0.93, + "learning_rate": 2.2785128119243004e-05, + "loss": 0.7593, + "step": 11140 + }, + { + "epoch": 0.93, + "learning_rate": 2.228269971529057e-05, + "loss": 0.7435, + "step": 11160 + }, + { + "epoch": 0.93, + "learning_rate": 2.178027131133813e-05, + "loss": 0.7646, + "step": 11180 + }, + { + "epoch": 0.93, + "learning_rate": 2.1277842907385698e-05, + "loss": 0.7445, + "step": 11200 + }, + { + "epoch": 0.93, + "eval_loss": 0.7643282413482666, + "eval_runtime": 8.2655, + "eval_samples_per_second": 241.97, + "eval_steps_per_second": 1.936, + "step": 11200 + }, + { + "epoch": 0.93, + "learning_rate": 2.077541450343326e-05, + "loss": 0.7576, + "step": 11220 + }, + { + "epoch": 0.93, + "learning_rate": 2.0272986099480824e-05, + "loss": 0.7553, + "step": 11240 + }, + { + "epoch": 0.94, + "learning_rate": 1.9770557695528385e-05, + "loss": 0.7567, + "step": 11260 + }, + { + "epoch": 0.94, + "learning_rate": 1.926812929157595e-05, + "loss": 0.7501, + "step": 11280 + }, + { + "epoch": 0.94, + "learning_rate": 1.876570088762351e-05, + "loss": 0.7502, + "step": 11300 + }, + { + "epoch": 0.94, + "learning_rate": 1.8263272483671075e-05, + "loss": 0.756, + "step": 11320 + }, + { + "epoch": 0.94, + "learning_rate": 1.776084407971864e-05, + "loss": 0.7528, + "step": 11340 + }, + { + "epoch": 0.94, + "learning_rate": 1.72584156757662e-05, + "loss": 0.7578, + "step": 11360 + }, + { + "epoch": 0.95, + "learning_rate": 1.6755987271813765e-05, + "loss": 0.7588, + "step": 11380 + }, + { + "epoch": 0.95, + "learning_rate": 1.625355886786133e-05, + "loss": 0.7486, + "step": 11400 + }, + { + "epoch": 0.95, + "eval_loss": 0.7640262842178345, + "eval_runtime": 8.2822, + "eval_samples_per_second": 241.482, + "eval_steps_per_second": 1.932, + "step": 11400 + }, + { + "epoch": 0.95, + "learning_rate": 1.575113046390889e-05, + "loss": 0.7527, + "step": 11420 + }, + { + "epoch": 0.95, + "learning_rate": 1.5248702059956454e-05, + "loss": 0.7472, + "step": 11440 + }, + { + "epoch": 0.95, + "learning_rate": 1.4746273656004018e-05, + "loss": 0.7476, + "step": 11460 + }, + { + "epoch": 0.95, + "learning_rate": 1.4243845252051581e-05, + "loss": 0.7551, + "step": 11480 + }, + { + "epoch": 0.95, + "learning_rate": 1.3741416848099144e-05, + "loss": 0.7609, + "step": 11500 + }, + { + "epoch": 0.96, + "learning_rate": 1.3238988444146708e-05, + "loss": 0.7496, + "step": 11520 + }, + { + "epoch": 0.96, + "learning_rate": 1.2736560040194271e-05, + "loss": 0.7528, + "step": 11540 + }, + { + "epoch": 0.96, + "learning_rate": 1.2234131636241834e-05, + "loss": 0.7541, + "step": 11560 + }, + { + "epoch": 0.96, + "learning_rate": 1.1731703232289399e-05, + "loss": 0.7492, + "step": 11580 + }, + { + "epoch": 0.96, + "learning_rate": 1.1229274828336962e-05, + "loss": 0.7464, + "step": 11600 + }, + { + "epoch": 0.96, + "eval_loss": 0.7637657523155212, + "eval_runtime": 8.2732, + "eval_samples_per_second": 241.743, + "eval_steps_per_second": 1.934, + "step": 11600 + } + ], + "max_steps": 12042, + "num_train_epochs": 1, + "total_flos": 3.0157246184648868e+19, + "trial_name": null, + "trial_params": null +} diff --git a/adapters/saved-belle1.5m7b/checkpoint-11600/training_args.bin b/adapters/saved-belle1.5m7b/checkpoint-11600/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..dfe19878cfd0a3df620d94aa6be1508290c31ed9 --- /dev/null +++ b/adapters/saved-belle1.5m7b/checkpoint-11600/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bfb1ef6ab82f6cd0561137910099627267bd4099d7d83919869b14210b7e77c3 +size 3643 diff --git a/adapters/saved-belle1.5m7b/checkpoint-11800/optimizer.pt b/adapters/saved-belle1.5m7b/checkpoint-11800/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..773c95e54585aee232a2ea13ef842cf21eade35a --- /dev/null +++ b/adapters/saved-belle1.5m7b/checkpoint-11800/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:db84d358db351e54f0a1cd9596fe10463394b2bd17bf83de72d948a0b9f223fe +size 33629893 diff --git a/adapters/saved-belle1.5m7b/checkpoint-11800/pytorch_model.bin b/adapters/saved-belle1.5m7b/checkpoint-11800/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..e23292c801418b4812fd4fa5991af8dd49882d2a --- /dev/null +++ b/adapters/saved-belle1.5m7b/checkpoint-11800/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d9ae9596c4eff30dfe18306c6a799e2d56e57879f14da062637b9a64724613ad +size 16822989 diff --git a/adapters/saved-belle1.5m7b/checkpoint-11800/rng_state_0.pth b/adapters/saved-belle1.5m7b/checkpoint-11800/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..85c0705f39127beb67dc3a78d3309a35cdb2bb54 --- /dev/null +++ b/adapters/saved-belle1.5m7b/checkpoint-11800/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bc91551d8a126892aabb23dd177b8365cb0eb80fb974fea3f704726cf0c6dcc1 +size 14583 diff --git a/adapters/saved-belle1.5m7b/checkpoint-11800/rng_state_1.pth b/adapters/saved-belle1.5m7b/checkpoint-11800/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..4e5da190663eb93b4d593ffeeefe4fd4490146d6 --- /dev/null +++ b/adapters/saved-belle1.5m7b/checkpoint-11800/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:642d0de8a51e0eae5bb35396a305979383c835b149554ea4b27bfc895126d485 +size 14583 diff --git a/adapters/saved-belle1.5m7b/checkpoint-11800/rng_state_10.pth b/adapters/saved-belle1.5m7b/checkpoint-11800/rng_state_10.pth new file mode 100644 index 0000000000000000000000000000000000000000..c58964b35d11e2dd580bd5828c70b2a5a6590319 --- /dev/null +++ b/adapters/saved-belle1.5m7b/checkpoint-11800/rng_state_10.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:163a6d6e330fab2b282e4bb570164137ef3418092fd8b2f90d5787725895e4e6 +size 14587 diff --git a/adapters/saved-belle1.5m7b/checkpoint-11800/rng_state_11.pth b/adapters/saved-belle1.5m7b/checkpoint-11800/rng_state_11.pth new file mode 100644 index 0000000000000000000000000000000000000000..c01088fe98776be03840a11f1faedc63dd73c4db --- /dev/null +++ b/adapters/saved-belle1.5m7b/checkpoint-11800/rng_state_11.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8c085bde867d4a76ee9d6e43eebda896c22da15ab8abd240d9c266e8ea542a7a +size 14587 diff --git a/adapters/saved-belle1.5m7b/checkpoint-11800/rng_state_12.pth b/adapters/saved-belle1.5m7b/checkpoint-11800/rng_state_12.pth new file mode 100644 index 0000000000000000000000000000000000000000..8a427d02c6ed622ee25e99ba04715e10839b677d --- /dev/null +++ b/adapters/saved-belle1.5m7b/checkpoint-11800/rng_state_12.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:71d672bc60349fa5be98e5d84ae8bac76cf9bdefdbe167083d297eba7a8874a3 +size 14587 diff --git a/adapters/saved-belle1.5m7b/checkpoint-11800/rng_state_13.pth b/adapters/saved-belle1.5m7b/checkpoint-11800/rng_state_13.pth new file mode 100644 index 0000000000000000000000000000000000000000..436a61d6e476e9c06623c1dd51d5a434dcc8bdd5 --- /dev/null +++ b/adapters/saved-belle1.5m7b/checkpoint-11800/rng_state_13.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a08e66d072a85a4ac686d6cb0917cb17d2486d9e500b4fe8b09642fdbc1716ae +size 14587 diff --git a/adapters/saved-belle1.5m7b/checkpoint-11800/rng_state_14.pth b/adapters/saved-belle1.5m7b/checkpoint-11800/rng_state_14.pth new file mode 100644 index 0000000000000000000000000000000000000000..04e9ee43fdcff6f208b2a4bf89feb6aabd5ef052 --- /dev/null +++ b/adapters/saved-belle1.5m7b/checkpoint-11800/rng_state_14.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7ef1fc0e184e2ee16dbf0ba826acdb4877d8961c6a11eb479f7a9309889a93b7 +size 14587 diff --git a/adapters/saved-belle1.5m7b/checkpoint-11800/rng_state_15.pth b/adapters/saved-belle1.5m7b/checkpoint-11800/rng_state_15.pth new file mode 100644 index 0000000000000000000000000000000000000000..69270216c2afcb30635fec4e95d5f3f431e0bf45 --- /dev/null +++ b/adapters/saved-belle1.5m7b/checkpoint-11800/rng_state_15.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6d15a1a3fcfe0a62355188cf78ce57917d18144ee19368894903288b000baa5e +size 14587 diff --git a/adapters/saved-belle1.5m7b/checkpoint-11800/rng_state_2.pth b/adapters/saved-belle1.5m7b/checkpoint-11800/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..c7bf77d3ab9622791c05421aa6db2879ad7f5570 --- /dev/null +++ b/adapters/saved-belle1.5m7b/checkpoint-11800/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f4cdb73baabdb05e5504b6f67af78c11c53cab0a43b0bf77bfc77e9a9f812db1 +size 14583 diff --git a/adapters/saved-belle1.5m7b/checkpoint-11800/rng_state_3.pth b/adapters/saved-belle1.5m7b/checkpoint-11800/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..a10b19344e0c5f2609287d6ed72c6eea254240ef --- /dev/null +++ b/adapters/saved-belle1.5m7b/checkpoint-11800/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1b8a7faeb5920074a6724cbb55eb6939b7b0250ede4da03ce87130a880888fa7 +size 14583 diff --git a/adapters/saved-belle1.5m7b/checkpoint-11800/rng_state_4.pth b/adapters/saved-belle1.5m7b/checkpoint-11800/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..7df07ed4e6fb2ab431ea634fd6a826c6c71512fa --- /dev/null +++ b/adapters/saved-belle1.5m7b/checkpoint-11800/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a5c501c18dc63b535a4d8ac8881c8f5768e78a36cd745da1cd5908beacd05d3e +size 14583 diff --git a/adapters/saved-belle1.5m7b/checkpoint-11800/rng_state_5.pth b/adapters/saved-belle1.5m7b/checkpoint-11800/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..781d1e898b213863734321556214aec6f78a681a --- /dev/null +++ b/adapters/saved-belle1.5m7b/checkpoint-11800/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5afe12b3c03fde7fd32faa296eb14f7578f2d3e59b7f4bbb9de1f54d37098de4 +size 14583 diff --git a/adapters/saved-belle1.5m7b/checkpoint-11800/rng_state_6.pth b/adapters/saved-belle1.5m7b/checkpoint-11800/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..6dc17a35e0397b83c3ac82ff39485f12aa14767d --- /dev/null +++ b/adapters/saved-belle1.5m7b/checkpoint-11800/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7f5cd3b6b6d417e32c8a9f4645a949cc74dbbd80e45bb0c9ca49c17cb4867e9d +size 14583 diff --git a/adapters/saved-belle1.5m7b/checkpoint-11800/rng_state_7.pth b/adapters/saved-belle1.5m7b/checkpoint-11800/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..9db44ada5d38c5802ae8a0ed55eba3275ca0311c --- /dev/null +++ b/adapters/saved-belle1.5m7b/checkpoint-11800/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a41a4062aa9c96472816a352f6e21928ea83b0d6cab2e9635ca5e711da07490d +size 14583 diff --git a/adapters/saved-belle1.5m7b/checkpoint-11800/rng_state_8.pth b/adapters/saved-belle1.5m7b/checkpoint-11800/rng_state_8.pth new file mode 100644 index 0000000000000000000000000000000000000000..17f43e37458aa5c248d5ffd512053ae79816ae43 --- /dev/null +++ b/adapters/saved-belle1.5m7b/checkpoint-11800/rng_state_8.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:909adce2a207fcf45e402759c5f8df87b760824185fa35ae9983c10a26d23424 +size 14583 diff --git a/adapters/saved-belle1.5m7b/checkpoint-11800/rng_state_9.pth b/adapters/saved-belle1.5m7b/checkpoint-11800/rng_state_9.pth new file mode 100644 index 0000000000000000000000000000000000000000..e7169ba46631f552201cb80823d7d9bbaff4b2d1 --- /dev/null +++ b/adapters/saved-belle1.5m7b/checkpoint-11800/rng_state_9.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6d5421e8bcd5a5f00fd8dd38d62f00968f7793e9aa5e91973c541aab69d8e9f8 +size 14583 diff --git a/adapters/saved-belle1.5m7b/checkpoint-11800/scaler.pt b/adapters/saved-belle1.5m7b/checkpoint-11800/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..27d8a26e294cf34b385e48612b9b8dba3e50b0d9 --- /dev/null +++ b/adapters/saved-belle1.5m7b/checkpoint-11800/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d96296368084d90f719f85b8992eeef8e77d32acf215e70ba9c8ca86f91d5cf7 +size 557 diff --git a/adapters/saved-belle1.5m7b/checkpoint-11800/scheduler.pt b/adapters/saved-belle1.5m7b/checkpoint-11800/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..13c8f5594a7d06e569ca18ea9a99ebe6264a31b4 --- /dev/null +++ b/adapters/saved-belle1.5m7b/checkpoint-11800/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:34d6e7b232d73363caa9223ab6a428651696466348cb45d1e5adbb6c99bcca87 +size 627 diff --git a/adapters/saved-belle1.5m7b/checkpoint-11800/trainer_state.json b/adapters/saved-belle1.5m7b/checkpoint-11800/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..f0d8b051041e0fcd44afe0847db03a781ab43ad0 --- /dev/null +++ b/adapters/saved-belle1.5m7b/checkpoint-11800/trainer_state.json @@ -0,0 +1,4028 @@ +{ + "best_metric": 0.7635765671730042, + "best_model_checkpoint": "/mnt/bn/qingyi-bn-lq/llama/saved-belle1.5m7b/checkpoint-11800", + "epoch": 0.9799036704866301, + "global_step": 11800, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 5.9999999999999995e-05, + "loss": 1.7631, + "step": 20 + }, + { + "epoch": 0.0, + "learning_rate": 0.00011999999999999999, + "loss": 1.4784, + "step": 40 + }, + { + "epoch": 0.0, + "learning_rate": 0.00017999999999999998, + "loss": 1.1332, + "step": 60 + }, + { + "epoch": 0.01, + "learning_rate": 0.00023999999999999998, + "loss": 1.0665, + "step": 80 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003, + "loss": 1.0418, + "step": 100 + }, + { + "epoch": 0.01, + "learning_rate": 0.00029949757159604753, + "loss": 1.0246, + "step": 120 + }, + { + "epoch": 0.01, + "learning_rate": 0.0002989951431920951, + "loss": 1.0054, + "step": 140 + }, + { + "epoch": 0.01, + "learning_rate": 0.00029849271478814266, + "loss": 1.002, + "step": 160 + }, + { + "epoch": 0.01, + "learning_rate": 0.0002979902863841902, + "loss": 0.9908, + "step": 180 + }, + { + "epoch": 0.02, + "learning_rate": 0.0002974878579802378, + "loss": 0.9784, + "step": 200 + }, + { + "epoch": 0.02, + "eval_loss": 0.9920349717140198, + "eval_runtime": 8.2147, + "eval_samples_per_second": 243.465, + "eval_steps_per_second": 1.948, + "step": 200 + }, + { + "epoch": 0.02, + "learning_rate": 0.00029698542957628534, + "loss": 0.9817, + "step": 220 + }, + { + "epoch": 0.02, + "learning_rate": 0.0002964830011723329, + "loss": 0.983, + "step": 240 + }, + { + "epoch": 0.02, + "learning_rate": 0.00029598057276838046, + "loss": 0.9637, + "step": 260 + }, + { + "epoch": 0.02, + "learning_rate": 0.000295478144364428, + "loss": 0.9534, + "step": 280 + }, + { + "epoch": 0.02, + "learning_rate": 0.0002949757159604756, + "loss": 0.9475, + "step": 300 + }, + { + "epoch": 0.03, + "learning_rate": 0.00029447328755652315, + "loss": 0.9502, + "step": 320 + }, + { + "epoch": 0.03, + "learning_rate": 0.0002939708591525707, + "loss": 0.946, + "step": 340 + }, + { + "epoch": 0.03, + "learning_rate": 0.00029346843074861827, + "loss": 0.9545, + "step": 360 + }, + { + "epoch": 0.03, + "learning_rate": 0.00029296600234466583, + "loss": 0.946, + "step": 380 + }, + { + "epoch": 0.03, + "learning_rate": 0.0002924635739407134, + "loss": 0.9295, + "step": 400 + }, + { + "epoch": 0.03, + "eval_loss": 0.9477736949920654, + "eval_runtime": 8.229, + "eval_samples_per_second": 243.042, + "eval_steps_per_second": 1.944, + "step": 400 + }, + { + "epoch": 0.03, + "learning_rate": 0.00029196114553676095, + "loss": 0.9232, + "step": 420 + }, + { + "epoch": 0.04, + "learning_rate": 0.0002914587171328085, + "loss": 0.9201, + "step": 440 + }, + { + "epoch": 0.04, + "learning_rate": 0.00029095628872885613, + "loss": 0.9212, + "step": 460 + }, + { + "epoch": 0.04, + "learning_rate": 0.0002904538603249037, + "loss": 0.9131, + "step": 480 + }, + { + "epoch": 0.04, + "learning_rate": 0.00028995143192095125, + "loss": 0.922, + "step": 500 + }, + { + "epoch": 0.04, + "learning_rate": 0.0002894490035169988, + "loss": 0.9176, + "step": 520 + }, + { + "epoch": 0.04, + "learning_rate": 0.0002889465751130464, + "loss": 0.9177, + "step": 540 + }, + { + "epoch": 0.05, + "learning_rate": 0.00028844414670909393, + "loss": 0.9132, + "step": 560 + }, + { + "epoch": 0.05, + "learning_rate": 0.0002879417183051415, + "loss": 0.921, + "step": 580 + }, + { + "epoch": 0.05, + "learning_rate": 0.00028743928990118906, + "loss": 0.9013, + "step": 600 + }, + { + "epoch": 0.05, + "eval_loss": 0.9194319248199463, + "eval_runtime": 8.23, + "eval_samples_per_second": 243.013, + "eval_steps_per_second": 1.944, + "step": 600 + }, + { + "epoch": 0.05, + "learning_rate": 0.0002869368614972366, + "loss": 0.904, + "step": 620 + }, + { + "epoch": 0.05, + "learning_rate": 0.0002864344330932842, + "loss": 0.9016, + "step": 640 + }, + { + "epoch": 0.05, + "learning_rate": 0.00028593200468933174, + "loss": 0.8931, + "step": 660 + }, + { + "epoch": 0.06, + "learning_rate": 0.0002854295762853793, + "loss": 0.9, + "step": 680 + }, + { + "epoch": 0.06, + "learning_rate": 0.00028492714788142686, + "loss": 0.896, + "step": 700 + }, + { + "epoch": 0.06, + "learning_rate": 0.0002844247194774744, + "loss": 0.8982, + "step": 720 + }, + { + "epoch": 0.06, + "learning_rate": 0.00028392229107352204, + "loss": 0.9, + "step": 740 + }, + { + "epoch": 0.06, + "learning_rate": 0.0002834198626695696, + "loss": 0.8861, + "step": 760 + }, + { + "epoch": 0.06, + "learning_rate": 0.00028291743426561716, + "loss": 0.8926, + "step": 780 + }, + { + "epoch": 0.07, + "learning_rate": 0.0002824150058616647, + "loss": 0.8853, + "step": 800 + }, + { + "epoch": 0.07, + "eval_loss": 0.8987648487091064, + "eval_runtime": 8.2248, + "eval_samples_per_second": 243.167, + "eval_steps_per_second": 1.945, + "step": 800 + }, + { + "epoch": 0.07, + "learning_rate": 0.0002819125774577123, + "loss": 0.8922, + "step": 820 + }, + { + "epoch": 0.07, + "learning_rate": 0.00028141014905375984, + "loss": 0.8813, + "step": 840 + }, + { + "epoch": 0.07, + "learning_rate": 0.0002809077206498074, + "loss": 0.8825, + "step": 860 + }, + { + "epoch": 0.07, + "learning_rate": 0.00028040529224585497, + "loss": 0.8821, + "step": 880 + }, + { + "epoch": 0.07, + "learning_rate": 0.00027990286384190253, + "loss": 0.8853, + "step": 900 + }, + { + "epoch": 0.08, + "learning_rate": 0.0002794004354379501, + "loss": 0.8806, + "step": 920 + }, + { + "epoch": 0.08, + "learning_rate": 0.00027889800703399765, + "loss": 0.8774, + "step": 940 + }, + { + "epoch": 0.08, + "learning_rate": 0.0002783955786300452, + "loss": 0.8783, + "step": 960 + }, + { + "epoch": 0.08, + "learning_rate": 0.00027789315022609277, + "loss": 0.876, + "step": 980 + }, + { + "epoch": 0.08, + "learning_rate": 0.00027739072182214033, + "loss": 0.8656, + "step": 1000 + }, + { + "epoch": 0.08, + "eval_loss": 0.8842049241065979, + "eval_runtime": 8.2372, + "eval_samples_per_second": 242.8, + "eval_steps_per_second": 1.942, + "step": 1000 + }, + { + "epoch": 0.08, + "learning_rate": 0.0002768882934181879, + "loss": 0.8776, + "step": 1020 + }, + { + "epoch": 0.09, + "learning_rate": 0.00027638586501423546, + "loss": 0.8751, + "step": 1040 + }, + { + "epoch": 0.09, + "learning_rate": 0.000275883436610283, + "loss": 0.8701, + "step": 1060 + }, + { + "epoch": 0.09, + "learning_rate": 0.0002753810082063306, + "loss": 0.8618, + "step": 1080 + }, + { + "epoch": 0.09, + "learning_rate": 0.00027487857980237814, + "loss": 0.8604, + "step": 1100 + }, + { + "epoch": 0.09, + "learning_rate": 0.0002743761513984257, + "loss": 0.8516, + "step": 1120 + }, + { + "epoch": 0.09, + "learning_rate": 0.00027387372299447326, + "loss": 0.8553, + "step": 1140 + }, + { + "epoch": 0.1, + "learning_rate": 0.0002733712945905208, + "loss": 0.8638, + "step": 1160 + }, + { + "epoch": 0.1, + "learning_rate": 0.0002728688661865684, + "loss": 0.8528, + "step": 1180 + }, + { + "epoch": 0.1, + "learning_rate": 0.00027236643778261595, + "loss": 0.8641, + "step": 1200 + }, + { + "epoch": 0.1, + "eval_loss": 0.871113121509552, + "eval_runtime": 8.2308, + "eval_samples_per_second": 242.989, + "eval_steps_per_second": 1.944, + "step": 1200 + }, + { + "epoch": 0.1, + "learning_rate": 0.0002718640093786635, + "loss": 0.8656, + "step": 1220 + }, + { + "epoch": 0.1, + "learning_rate": 0.00027136158097471107, + "loss": 0.8534, + "step": 1240 + }, + { + "epoch": 0.1, + "learning_rate": 0.00027085915257075863, + "loss": 0.8512, + "step": 1260 + }, + { + "epoch": 0.11, + "learning_rate": 0.0002703567241668062, + "loss": 0.8472, + "step": 1280 + }, + { + "epoch": 0.11, + "learning_rate": 0.00026985429576285375, + "loss": 0.8584, + "step": 1300 + }, + { + "epoch": 0.11, + "learning_rate": 0.0002693518673589013, + "loss": 0.8557, + "step": 1320 + }, + { + "epoch": 0.11, + "learning_rate": 0.0002688494389549489, + "loss": 0.8547, + "step": 1340 + }, + { + "epoch": 0.11, + "learning_rate": 0.00026834701055099643, + "loss": 0.8576, + "step": 1360 + }, + { + "epoch": 0.11, + "learning_rate": 0.000267844582147044, + "loss": 0.8499, + "step": 1380 + }, + { + "epoch": 0.12, + "learning_rate": 0.00026734215374309156, + "loss": 0.839, + "step": 1400 + }, + { + "epoch": 0.12, + "eval_loss": 0.8613501191139221, + "eval_runtime": 8.2712, + "eval_samples_per_second": 241.802, + "eval_steps_per_second": 1.934, + "step": 1400 + }, + { + "epoch": 0.12, + "learning_rate": 0.0002668397253391391, + "loss": 0.8538, + "step": 1420 + }, + { + "epoch": 0.12, + "learning_rate": 0.0002663372969351867, + "loss": 0.8541, + "step": 1440 + }, + { + "epoch": 0.12, + "learning_rate": 0.0002658348685312343, + "loss": 0.8381, + "step": 1460 + }, + { + "epoch": 0.12, + "learning_rate": 0.00026533244012728186, + "loss": 0.8527, + "step": 1480 + }, + { + "epoch": 0.12, + "learning_rate": 0.0002648300117233294, + "loss": 0.8493, + "step": 1500 + }, + { + "epoch": 0.13, + "learning_rate": 0.000264327583319377, + "loss": 0.846, + "step": 1520 + }, + { + "epoch": 0.13, + "learning_rate": 0.00026382515491542454, + "loss": 0.8435, + "step": 1540 + }, + { + "epoch": 0.13, + "learning_rate": 0.0002633227265114721, + "loss": 0.8517, + "step": 1560 + }, + { + "epoch": 0.13, + "learning_rate": 0.00026282029810751966, + "loss": 0.8403, + "step": 1580 + }, + { + "epoch": 0.13, + "learning_rate": 0.0002623178697035672, + "loss": 0.8419, + "step": 1600 + }, + { + "epoch": 0.13, + "eval_loss": 0.8518173098564148, + "eval_runtime": 8.2505, + "eval_samples_per_second": 242.411, + "eval_steps_per_second": 1.939, + "step": 1600 + }, + { + "epoch": 0.13, + "learning_rate": 0.0002618154412996148, + "loss": 0.8393, + "step": 1620 + }, + { + "epoch": 0.14, + "learning_rate": 0.00026131301289566234, + "loss": 0.8471, + "step": 1640 + }, + { + "epoch": 0.14, + "learning_rate": 0.0002608105844917099, + "loss": 0.85, + "step": 1660 + }, + { + "epoch": 0.14, + "learning_rate": 0.00026030815608775747, + "loss": 0.8388, + "step": 1680 + }, + { + "epoch": 0.14, + "learning_rate": 0.00025980572768380503, + "loss": 0.8394, + "step": 1700 + }, + { + "epoch": 0.14, + "learning_rate": 0.0002593032992798526, + "loss": 0.8333, + "step": 1720 + }, + { + "epoch": 0.14, + "learning_rate": 0.00025880087087590015, + "loss": 0.8343, + "step": 1740 + }, + { + "epoch": 0.15, + "learning_rate": 0.0002582984424719477, + "loss": 0.8303, + "step": 1760 + }, + { + "epoch": 0.15, + "learning_rate": 0.00025779601406799533, + "loss": 0.8247, + "step": 1780 + }, + { + "epoch": 0.15, + "learning_rate": 0.0002572935856640429, + "loss": 0.8282, + "step": 1800 + }, + { + "epoch": 0.15, + "eval_loss": 0.8434953689575195, + "eval_runtime": 8.2633, + "eval_samples_per_second": 242.034, + "eval_steps_per_second": 1.936, + "step": 1800 + }, + { + "epoch": 0.15, + "learning_rate": 0.00025679115726009045, + "loss": 0.8348, + "step": 1820 + }, + { + "epoch": 0.15, + "learning_rate": 0.000256288728856138, + "loss": 0.8393, + "step": 1840 + }, + { + "epoch": 0.15, + "learning_rate": 0.00025578630045218557, + "loss": 0.8304, + "step": 1860 + }, + { + "epoch": 0.16, + "learning_rate": 0.00025528387204823313, + "loss": 0.8309, + "step": 1880 + }, + { + "epoch": 0.16, + "learning_rate": 0.0002547814436442807, + "loss": 0.8382, + "step": 1900 + }, + { + "epoch": 0.16, + "learning_rate": 0.00025427901524032826, + "loss": 0.841, + "step": 1920 + }, + { + "epoch": 0.16, + "learning_rate": 0.0002537765868363758, + "loss": 0.8354, + "step": 1940 + }, + { + "epoch": 0.16, + "learning_rate": 0.0002532741584324234, + "loss": 0.8334, + "step": 1960 + }, + { + "epoch": 0.16, + "learning_rate": 0.00025277173002847094, + "loss": 0.8243, + "step": 1980 + }, + { + "epoch": 0.17, + "learning_rate": 0.0002522693016245185, + "loss": 0.8337, + "step": 2000 + }, + { + "epoch": 0.17, + "eval_loss": 0.8386329412460327, + "eval_runtime": 8.2418, + "eval_samples_per_second": 242.666, + "eval_steps_per_second": 1.941, + "step": 2000 + }, + { + "epoch": 0.17, + "learning_rate": 0.00025176687322056606, + "loss": 0.8237, + "step": 2020 + }, + { + "epoch": 0.17, + "learning_rate": 0.0002512644448166136, + "loss": 0.8206, + "step": 2040 + }, + { + "epoch": 0.17, + "learning_rate": 0.0002507620164126612, + "loss": 0.8277, + "step": 2060 + }, + { + "epoch": 0.17, + "learning_rate": 0.00025025958800870874, + "loss": 0.8271, + "step": 2080 + }, + { + "epoch": 0.17, + "learning_rate": 0.0002497571596047563, + "loss": 0.8353, + "step": 2100 + }, + { + "epoch": 0.18, + "learning_rate": 0.00024925473120080387, + "loss": 0.8253, + "step": 2120 + }, + { + "epoch": 0.18, + "learning_rate": 0.00024875230279685143, + "loss": 0.8212, + "step": 2140 + }, + { + "epoch": 0.18, + "learning_rate": 0.000248249874392899, + "loss": 0.8192, + "step": 2160 + }, + { + "epoch": 0.18, + "learning_rate": 0.00024774744598894655, + "loss": 0.8292, + "step": 2180 + }, + { + "epoch": 0.18, + "learning_rate": 0.0002472450175849941, + "loss": 0.825, + "step": 2200 + }, + { + "epoch": 0.18, + "eval_loss": 0.8326684236526489, + "eval_runtime": 8.2602, + "eval_samples_per_second": 242.126, + "eval_steps_per_second": 1.937, + "step": 2200 + }, + { + "epoch": 0.18, + "learning_rate": 0.00024674258918104167, + "loss": 0.8308, + "step": 2220 + }, + { + "epoch": 0.19, + "learning_rate": 0.00024624016077708923, + "loss": 0.8324, + "step": 2240 + }, + { + "epoch": 0.19, + "learning_rate": 0.0002457377323731368, + "loss": 0.8257, + "step": 2260 + }, + { + "epoch": 0.19, + "learning_rate": 0.00024523530396918436, + "loss": 0.8181, + "step": 2280 + }, + { + "epoch": 0.19, + "learning_rate": 0.0002447328755652319, + "loss": 0.8126, + "step": 2300 + }, + { + "epoch": 0.19, + "learning_rate": 0.0002442304471612795, + "loss": 0.819, + "step": 2320 + }, + { + "epoch": 0.19, + "learning_rate": 0.00024372801875732707, + "loss": 0.8198, + "step": 2340 + }, + { + "epoch": 0.2, + "learning_rate": 0.00024322559035337463, + "loss": 0.8188, + "step": 2360 + }, + { + "epoch": 0.2, + "learning_rate": 0.0002427231619494222, + "loss": 0.8145, + "step": 2380 + }, + { + "epoch": 0.2, + "learning_rate": 0.00024222073354546975, + "loss": 0.81, + "step": 2400 + }, + { + "epoch": 0.2, + "eval_loss": 0.8278167843818665, + "eval_runtime": 8.2577, + "eval_samples_per_second": 242.198, + "eval_steps_per_second": 1.938, + "step": 2400 + }, + { + "epoch": 0.2, + "learning_rate": 0.0002417183051415173, + "loss": 0.8142, + "step": 2420 + }, + { + "epoch": 0.2, + "learning_rate": 0.00024121587673756487, + "loss": 0.8059, + "step": 2440 + }, + { + "epoch": 0.2, + "learning_rate": 0.00024071344833361243, + "loss": 0.826, + "step": 2460 + }, + { + "epoch": 0.21, + "learning_rate": 0.00024021101992966, + "loss": 0.8173, + "step": 2480 + }, + { + "epoch": 0.21, + "learning_rate": 0.00023970859152570756, + "loss": 0.8063, + "step": 2500 + }, + { + "epoch": 0.21, + "learning_rate": 0.00023920616312175512, + "loss": 0.812, + "step": 2520 + }, + { + "epoch": 0.21, + "learning_rate": 0.00023870373471780268, + "loss": 0.82, + "step": 2540 + }, + { + "epoch": 0.21, + "learning_rate": 0.00023820130631385024, + "loss": 0.8207, + "step": 2560 + }, + { + "epoch": 0.21, + "learning_rate": 0.0002376988779098978, + "loss": 0.8113, + "step": 2580 + }, + { + "epoch": 0.22, + "learning_rate": 0.00023719644950594536, + "loss": 0.8175, + "step": 2600 + }, + { + "epoch": 0.22, + "eval_loss": 0.8231886029243469, + "eval_runtime": 8.2627, + "eval_samples_per_second": 242.052, + "eval_steps_per_second": 1.936, + "step": 2600 + }, + { + "epoch": 0.22, + "learning_rate": 0.00023669402110199295, + "loss": 0.8105, + "step": 2620 + }, + { + "epoch": 0.22, + "learning_rate": 0.0002361915926980405, + "loss": 0.8106, + "step": 2640 + }, + { + "epoch": 0.22, + "learning_rate": 0.00023568916429408807, + "loss": 0.8114, + "step": 2660 + }, + { + "epoch": 0.22, + "learning_rate": 0.00023518673589013563, + "loss": 0.8152, + "step": 2680 + }, + { + "epoch": 0.22, + "learning_rate": 0.0002346843074861832, + "loss": 0.8059, + "step": 2700 + }, + { + "epoch": 0.23, + "learning_rate": 0.00023418187908223076, + "loss": 0.8065, + "step": 2720 + }, + { + "epoch": 0.23, + "learning_rate": 0.00023367945067827832, + "loss": 0.8147, + "step": 2740 + }, + { + "epoch": 0.23, + "learning_rate": 0.00023317702227432588, + "loss": 0.8006, + "step": 2760 + }, + { + "epoch": 0.23, + "learning_rate": 0.00023267459387037344, + "loss": 0.805, + "step": 2780 + }, + { + "epoch": 0.23, + "learning_rate": 0.000232172165466421, + "loss": 0.8142, + "step": 2800 + }, + { + "epoch": 0.23, + "eval_loss": 0.8191845417022705, + "eval_runtime": 8.28, + "eval_samples_per_second": 241.547, + "eval_steps_per_second": 1.932, + "step": 2800 + }, + { + "epoch": 0.23, + "learning_rate": 0.00023166973706246856, + "loss": 0.8134, + "step": 2820 + }, + { + "epoch": 0.24, + "learning_rate": 0.00023116730865851612, + "loss": 0.8066, + "step": 2840 + }, + { + "epoch": 0.24, + "learning_rate": 0.00023066488025456368, + "loss": 0.8049, + "step": 2860 + }, + { + "epoch": 0.24, + "learning_rate": 0.00023016245185061124, + "loss": 0.7987, + "step": 2880 + }, + { + "epoch": 0.24, + "learning_rate": 0.00022966002344665886, + "loss": 0.8113, + "step": 2900 + }, + { + "epoch": 0.24, + "learning_rate": 0.00022915759504270642, + "loss": 0.8086, + "step": 2920 + }, + { + "epoch": 0.24, + "learning_rate": 0.00022865516663875398, + "loss": 0.8019, + "step": 2940 + }, + { + "epoch": 0.25, + "learning_rate": 0.00022815273823480154, + "loss": 0.8051, + "step": 2960 + }, + { + "epoch": 0.25, + "learning_rate": 0.0002276503098308491, + "loss": 0.8032, + "step": 2980 + }, + { + "epoch": 0.25, + "learning_rate": 0.00022714788142689667, + "loss": 0.8058, + "step": 3000 + }, + { + "epoch": 0.25, + "eval_loss": 0.8153129816055298, + "eval_runtime": 8.2727, + "eval_samples_per_second": 241.759, + "eval_steps_per_second": 1.934, + "step": 3000 + }, + { + "epoch": 0.25, + "learning_rate": 0.00022664545302294423, + "loss": 0.801, + "step": 3020 + }, + { + "epoch": 0.25, + "learning_rate": 0.0002261430246189918, + "loss": 0.801, + "step": 3040 + }, + { + "epoch": 0.25, + "learning_rate": 0.00022564059621503935, + "loss": 0.8037, + "step": 3060 + }, + { + "epoch": 0.26, + "learning_rate": 0.0002251381678110869, + "loss": 0.7992, + "step": 3080 + }, + { + "epoch": 0.26, + "learning_rate": 0.00022463573940713447, + "loss": 0.8036, + "step": 3100 + }, + { + "epoch": 0.26, + "learning_rate": 0.00022413331100318203, + "loss": 0.8064, + "step": 3120 + }, + { + "epoch": 0.26, + "learning_rate": 0.0002236308825992296, + "loss": 0.7966, + "step": 3140 + }, + { + "epoch": 0.26, + "learning_rate": 0.00022312845419527716, + "loss": 0.8047, + "step": 3160 + }, + { + "epoch": 0.26, + "learning_rate": 0.00022262602579132472, + "loss": 0.7986, + "step": 3180 + }, + { + "epoch": 0.27, + "learning_rate": 0.00022212359738737228, + "loss": 0.8026, + "step": 3200 + }, + { + "epoch": 0.27, + "eval_loss": 0.8119255304336548, + "eval_runtime": 8.288, + "eval_samples_per_second": 241.313, + "eval_steps_per_second": 1.931, + "step": 3200 + }, + { + "epoch": 0.27, + "learning_rate": 0.00022162116898341984, + "loss": 0.8028, + "step": 3220 + }, + { + "epoch": 0.27, + "learning_rate": 0.0002211187405794674, + "loss": 0.7987, + "step": 3240 + }, + { + "epoch": 0.27, + "learning_rate": 0.000220616312175515, + "loss": 0.8148, + "step": 3260 + }, + { + "epoch": 0.27, + "learning_rate": 0.00022011388377156255, + "loss": 0.7932, + "step": 3280 + }, + { + "epoch": 0.27, + "learning_rate": 0.0002196114553676101, + "loss": 0.8027, + "step": 3300 + }, + { + "epoch": 0.28, + "learning_rate": 0.00021910902696365767, + "loss": 0.7989, + "step": 3320 + }, + { + "epoch": 0.28, + "learning_rate": 0.00021860659855970523, + "loss": 0.7983, + "step": 3340 + }, + { + "epoch": 0.28, + "learning_rate": 0.0002181041701557528, + "loss": 0.7977, + "step": 3360 + }, + { + "epoch": 0.28, + "learning_rate": 0.00021760174175180035, + "loss": 0.7972, + "step": 3380 + }, + { + "epoch": 0.28, + "learning_rate": 0.00021709931334784792, + "loss": 0.8025, + "step": 3400 + }, + { + "epoch": 0.28, + "eval_loss": 0.8084473609924316, + "eval_runtime": 8.2772, + "eval_samples_per_second": 241.627, + "eval_steps_per_second": 1.933, + "step": 3400 + }, + { + "epoch": 0.28, + "learning_rate": 0.00021659688494389548, + "loss": 0.7993, + "step": 3420 + }, + { + "epoch": 0.29, + "learning_rate": 0.00021609445653994304, + "loss": 0.7919, + "step": 3440 + }, + { + "epoch": 0.29, + "learning_rate": 0.0002155920281359906, + "loss": 0.8062, + "step": 3460 + }, + { + "epoch": 0.29, + "learning_rate": 0.00021508959973203816, + "loss": 0.7937, + "step": 3480 + }, + { + "epoch": 0.29, + "learning_rate": 0.00021458717132808572, + "loss": 0.7947, + "step": 3500 + }, + { + "epoch": 0.29, + "learning_rate": 0.00021408474292413328, + "loss": 0.7973, + "step": 3520 + }, + { + "epoch": 0.29, + "learning_rate": 0.00021358231452018084, + "loss": 0.7964, + "step": 3540 + }, + { + "epoch": 0.3, + "learning_rate": 0.0002130798861162284, + "loss": 0.7994, + "step": 3560 + }, + { + "epoch": 0.3, + "learning_rate": 0.00021257745771227597, + "loss": 0.7911, + "step": 3580 + }, + { + "epoch": 0.3, + "learning_rate": 0.00021207502930832353, + "loss": 0.805, + "step": 3600 + }, + { + "epoch": 0.3, + "eval_loss": 0.8064665794372559, + "eval_runtime": 8.2739, + "eval_samples_per_second": 241.725, + "eval_steps_per_second": 1.934, + "step": 3600 + }, + { + "epoch": 0.3, + "learning_rate": 0.00021157260090437112, + "loss": 0.8014, + "step": 3620 + }, + { + "epoch": 0.3, + "learning_rate": 0.00021107017250041868, + "loss": 0.7941, + "step": 3640 + }, + { + "epoch": 0.3, + "learning_rate": 0.00021056774409646624, + "loss": 0.793, + "step": 3660 + }, + { + "epoch": 0.31, + "learning_rate": 0.0002100653156925138, + "loss": 0.7918, + "step": 3680 + }, + { + "epoch": 0.31, + "learning_rate": 0.00020956288728856136, + "loss": 0.7926, + "step": 3700 + }, + { + "epoch": 0.31, + "learning_rate": 0.00020906045888460892, + "loss": 0.797, + "step": 3720 + }, + { + "epoch": 0.31, + "learning_rate": 0.00020855803048065648, + "loss": 0.7965, + "step": 3740 + }, + { + "epoch": 0.31, + "learning_rate": 0.00020805560207670404, + "loss": 0.7963, + "step": 3760 + }, + { + "epoch": 0.31, + "learning_rate": 0.0002075531736727516, + "loss": 0.7924, + "step": 3780 + }, + { + "epoch": 0.32, + "learning_rate": 0.00020705074526879917, + "loss": 0.788, + "step": 3800 + }, + { + "epoch": 0.32, + "eval_loss": 0.8037804961204529, + "eval_runtime": 8.3133, + "eval_samples_per_second": 240.579, + "eval_steps_per_second": 1.925, + "step": 3800 + }, + { + "epoch": 0.32, + "learning_rate": 0.00020654831686484673, + "loss": 0.794, + "step": 3820 + }, + { + "epoch": 0.32, + "learning_rate": 0.0002060458884608943, + "loss": 0.7946, + "step": 3840 + }, + { + "epoch": 0.32, + "learning_rate": 0.00020554346005694185, + "loss": 0.7934, + "step": 3860 + }, + { + "epoch": 0.32, + "learning_rate": 0.0002050410316529894, + "loss": 0.7935, + "step": 3880 + }, + { + "epoch": 0.32, + "learning_rate": 0.00020453860324903697, + "loss": 0.7864, + "step": 3900 + }, + { + "epoch": 0.33, + "learning_rate": 0.00020403617484508453, + "loss": 0.8026, + "step": 3920 + }, + { + "epoch": 0.33, + "learning_rate": 0.0002035337464411321, + "loss": 0.7902, + "step": 3940 + }, + { + "epoch": 0.33, + "learning_rate": 0.00020303131803717966, + "loss": 0.7915, + "step": 3960 + }, + { + "epoch": 0.33, + "learning_rate": 0.00020252888963322724, + "loss": 0.798, + "step": 3980 + }, + { + "epoch": 0.33, + "learning_rate": 0.0002020264612292748, + "loss": 0.7989, + "step": 4000 + }, + { + "epoch": 0.33, + "eval_loss": 0.8007607460021973, + "eval_runtime": 8.2882, + "eval_samples_per_second": 241.308, + "eval_steps_per_second": 1.93, + "step": 4000 + }, + { + "epoch": 0.33, + "learning_rate": 0.0002015240328253224, + "loss": 0.7831, + "step": 4020 + }, + { + "epoch": 0.34, + "learning_rate": 0.00020102160442136995, + "loss": 0.7877, + "step": 4040 + }, + { + "epoch": 0.34, + "learning_rate": 0.00020051917601741752, + "loss": 0.7855, + "step": 4060 + }, + { + "epoch": 0.34, + "learning_rate": 0.00020001674761346508, + "loss": 0.7889, + "step": 4080 + }, + { + "epoch": 0.34, + "learning_rate": 0.00019951431920951264, + "loss": 0.7958, + "step": 4100 + }, + { + "epoch": 0.34, + "learning_rate": 0.0001990118908055602, + "loss": 0.7934, + "step": 4120 + }, + { + "epoch": 0.34, + "learning_rate": 0.00019850946240160776, + "loss": 0.785, + "step": 4140 + }, + { + "epoch": 0.35, + "learning_rate": 0.00019800703399765532, + "loss": 0.7987, + "step": 4160 + }, + { + "epoch": 0.35, + "learning_rate": 0.00019750460559370288, + "loss": 0.7886, + "step": 4180 + }, + { + "epoch": 0.35, + "learning_rate": 0.00019702729860994808, + "loss": 0.79, + "step": 4200 + }, + { + "epoch": 0.35, + "eval_loss": 0.7986196279525757, + "eval_runtime": 8.2739, + "eval_samples_per_second": 241.724, + "eval_steps_per_second": 1.934, + "step": 4200 + }, + { + "epoch": 0.35, + "learning_rate": 0.00019652487020599565, + "loss": 0.7889, + "step": 4220 + }, + { + "epoch": 0.35, + "learning_rate": 0.0001960224418020432, + "loss": 0.7883, + "step": 4240 + }, + { + "epoch": 0.35, + "learning_rate": 0.00019552001339809077, + "loss": 0.7895, + "step": 4260 + }, + { + "epoch": 0.36, + "learning_rate": 0.00019501758499413833, + "loss": 0.7838, + "step": 4280 + }, + { + "epoch": 0.36, + "learning_rate": 0.0001945151565901859, + "loss": 0.7957, + "step": 4300 + }, + { + "epoch": 0.36, + "learning_rate": 0.00019401272818623345, + "loss": 0.7915, + "step": 4320 + }, + { + "epoch": 0.36, + "learning_rate": 0.000193510299782281, + "loss": 0.7987, + "step": 4340 + }, + { + "epoch": 0.36, + "learning_rate": 0.00019300787137832857, + "loss": 0.7811, + "step": 4360 + }, + { + "epoch": 0.36, + "learning_rate": 0.00019250544297437613, + "loss": 0.7921, + "step": 4380 + }, + { + "epoch": 0.37, + "learning_rate": 0.0001920030145704237, + "loss": 0.7928, + "step": 4400 + }, + { + "epoch": 0.37, + "eval_loss": 0.7964197993278503, + "eval_runtime": 8.2761, + "eval_samples_per_second": 241.66, + "eval_steps_per_second": 1.933, + "step": 4400 + }, + { + "epoch": 0.37, + "learning_rate": 0.00019150058616647126, + "loss": 0.7924, + "step": 4420 + }, + { + "epoch": 0.37, + "learning_rate": 0.00019099815776251885, + "loss": 0.7898, + "step": 4440 + }, + { + "epoch": 0.37, + "learning_rate": 0.0001904957293585664, + "loss": 0.7893, + "step": 4460 + }, + { + "epoch": 0.37, + "learning_rate": 0.00018999330095461397, + "loss": 0.795, + "step": 4480 + }, + { + "epoch": 0.37, + "learning_rate": 0.00018949087255066153, + "loss": 0.7832, + "step": 4500 + }, + { + "epoch": 0.38, + "learning_rate": 0.0001889884441467091, + "loss": 0.7835, + "step": 4520 + }, + { + "epoch": 0.38, + "learning_rate": 0.00018848601574275665, + "loss": 0.7809, + "step": 4540 + }, + { + "epoch": 0.38, + "learning_rate": 0.0001879835873388042, + "loss": 0.7792, + "step": 4560 + }, + { + "epoch": 0.38, + "learning_rate": 0.00018748115893485177, + "loss": 0.7884, + "step": 4580 + }, + { + "epoch": 0.38, + "learning_rate": 0.00018697873053089933, + "loss": 0.7859, + "step": 4600 + }, + { + "epoch": 0.38, + "eval_loss": 0.794753909111023, + "eval_runtime": 8.2787, + "eval_samples_per_second": 241.584, + "eval_steps_per_second": 1.933, + "step": 4600 + }, + { + "epoch": 0.38, + "learning_rate": 0.0001864763021269469, + "loss": 0.7709, + "step": 4620 + }, + { + "epoch": 0.39, + "learning_rate": 0.00018597387372299446, + "loss": 0.7786, + "step": 4640 + }, + { + "epoch": 0.39, + "learning_rate": 0.00018547144531904202, + "loss": 0.7786, + "step": 4660 + }, + { + "epoch": 0.39, + "learning_rate": 0.00018496901691508958, + "loss": 0.7824, + "step": 4680 + }, + { + "epoch": 0.39, + "learning_rate": 0.00018446658851113714, + "loss": 0.7741, + "step": 4700 + }, + { + "epoch": 0.39, + "learning_rate": 0.0001839641601071847, + "loss": 0.78, + "step": 4720 + }, + { + "epoch": 0.39, + "learning_rate": 0.00018346173170323226, + "loss": 0.782, + "step": 4740 + }, + { + "epoch": 0.4, + "learning_rate": 0.00018295930329927982, + "loss": 0.7808, + "step": 4760 + }, + { + "epoch": 0.4, + "learning_rate": 0.0001824568748953274, + "loss": 0.7776, + "step": 4780 + }, + { + "epoch": 0.4, + "learning_rate": 0.00018195444649137497, + "loss": 0.7824, + "step": 4800 + }, + { + "epoch": 0.4, + "eval_loss": 0.7927345633506775, + "eval_runtime": 8.2706, + "eval_samples_per_second": 241.82, + "eval_steps_per_second": 1.935, + "step": 4800 + }, + { + "epoch": 0.4, + "learning_rate": 0.00018145201808742253, + "loss": 0.7843, + "step": 4820 + }, + { + "epoch": 0.4, + "learning_rate": 0.0001809495896834701, + "loss": 0.7908, + "step": 4840 + }, + { + "epoch": 0.4, + "learning_rate": 0.00018044716127951766, + "loss": 0.7872, + "step": 4860 + }, + { + "epoch": 0.41, + "learning_rate": 0.00017994473287556522, + "loss": 0.7763, + "step": 4880 + }, + { + "epoch": 0.41, + "learning_rate": 0.00017944230447161278, + "loss": 0.7846, + "step": 4900 + }, + { + "epoch": 0.41, + "learning_rate": 0.00017893987606766034, + "loss": 0.7775, + "step": 4920 + }, + { + "epoch": 0.41, + "learning_rate": 0.0001784374476637079, + "loss": 0.793, + "step": 4940 + }, + { + "epoch": 0.41, + "learning_rate": 0.00017793501925975546, + "loss": 0.7814, + "step": 4960 + }, + { + "epoch": 0.41, + "learning_rate": 0.00017743259085580302, + "loss": 0.7913, + "step": 4980 + }, + { + "epoch": 0.42, + "learning_rate": 0.00017693016245185058, + "loss": 0.7755, + "step": 5000 + }, + { + "epoch": 0.42, + "eval_loss": 0.7912722826004028, + "eval_runtime": 8.285, + "eval_samples_per_second": 241.401, + "eval_steps_per_second": 1.931, + "step": 5000 + }, + { + "epoch": 0.42, + "learning_rate": 0.00017642773404789815, + "loss": 0.7848, + "step": 5020 + }, + { + "epoch": 0.42, + "learning_rate": 0.0001759253056439457, + "loss": 0.7826, + "step": 5040 + }, + { + "epoch": 0.42, + "learning_rate": 0.00017542287723999327, + "loss": 0.7753, + "step": 5060 + }, + { + "epoch": 0.42, + "learning_rate": 0.00017492044883604083, + "loss": 0.7786, + "step": 5080 + }, + { + "epoch": 0.42, + "learning_rate": 0.0001744180204320884, + "loss": 0.7929, + "step": 5100 + }, + { + "epoch": 0.43, + "learning_rate": 0.00017391559202813595, + "loss": 0.7836, + "step": 5120 + }, + { + "epoch": 0.43, + "learning_rate": 0.00017341316362418354, + "loss": 0.7802, + "step": 5140 + }, + { + "epoch": 0.43, + "learning_rate": 0.0001729107352202311, + "loss": 0.7808, + "step": 5160 + }, + { + "epoch": 0.43, + "learning_rate": 0.00017240830681627866, + "loss": 0.7784, + "step": 5180 + }, + { + "epoch": 0.43, + "learning_rate": 0.00017190587841232622, + "loss": 0.7803, + "step": 5200 + }, + { + "epoch": 0.43, + "eval_loss": 0.7892646193504333, + "eval_runtime": 8.297, + "eval_samples_per_second": 241.05, + "eval_steps_per_second": 1.928, + "step": 5200 + }, + { + "epoch": 0.43, + "learning_rate": 0.00017140345000837378, + "loss": 0.7872, + "step": 5220 + }, + { + "epoch": 0.44, + "learning_rate": 0.00017090102160442135, + "loss": 0.7795, + "step": 5240 + }, + { + "epoch": 0.44, + "learning_rate": 0.0001703985932004689, + "loss": 0.7777, + "step": 5260 + }, + { + "epoch": 0.44, + "learning_rate": 0.00016989616479651647, + "loss": 0.7775, + "step": 5280 + }, + { + "epoch": 0.44, + "learning_rate": 0.00016939373639256406, + "loss": 0.7789, + "step": 5300 + }, + { + "epoch": 0.44, + "learning_rate": 0.00016889130798861162, + "loss": 0.7863, + "step": 5320 + }, + { + "epoch": 0.44, + "learning_rate": 0.00016838887958465918, + "loss": 0.7774, + "step": 5340 + }, + { + "epoch": 0.45, + "learning_rate": 0.00016788645118070674, + "loss": 0.7856, + "step": 5360 + }, + { + "epoch": 0.45, + "learning_rate": 0.0001673840227767543, + "loss": 0.78, + "step": 5380 + }, + { + "epoch": 0.45, + "learning_rate": 0.00016688159437280186, + "loss": 0.7929, + "step": 5400 + }, + { + "epoch": 0.45, + "eval_loss": 0.7884517908096313, + "eval_runtime": 8.3482, + "eval_samples_per_second": 239.571, + "eval_steps_per_second": 1.917, + "step": 5400 + }, + { + "epoch": 0.45, + "learning_rate": 0.00016637916596884945, + "loss": 0.7728, + "step": 5420 + }, + { + "epoch": 0.45, + "learning_rate": 0.000165876737564897, + "loss": 0.7827, + "step": 5440 + }, + { + "epoch": 0.45, + "learning_rate": 0.00016537430916094457, + "loss": 0.767, + "step": 5460 + }, + { + "epoch": 0.46, + "learning_rate": 0.00016487188075699213, + "loss": 0.7768, + "step": 5480 + }, + { + "epoch": 0.46, + "learning_rate": 0.0001643694523530397, + "loss": 0.776, + "step": 5500 + }, + { + "epoch": 0.46, + "learning_rate": 0.00016386702394908726, + "loss": 0.77, + "step": 5520 + }, + { + "epoch": 0.46, + "learning_rate": 0.00016336459554513482, + "loss": 0.7825, + "step": 5540 + }, + { + "epoch": 0.46, + "learning_rate": 0.00016286216714118238, + "loss": 0.7829, + "step": 5560 + }, + { + "epoch": 0.46, + "learning_rate": 0.00016235973873722994, + "loss": 0.791, + "step": 5580 + }, + { + "epoch": 0.47, + "learning_rate": 0.0001618573103332775, + "loss": 0.7807, + "step": 5600 + }, + { + "epoch": 0.47, + "eval_loss": 0.7869579792022705, + "eval_runtime": 8.297, + "eval_samples_per_second": 241.051, + "eval_steps_per_second": 1.928, + "step": 5600 + }, + { + "epoch": 0.47, + "learning_rate": 0.00016135488192932506, + "loss": 0.7762, + "step": 5620 + }, + { + "epoch": 0.47, + "learning_rate": 0.00016085245352537262, + "loss": 0.7805, + "step": 5640 + }, + { + "epoch": 0.47, + "learning_rate": 0.00016035002512142018, + "loss": 0.7797, + "step": 5660 + }, + { + "epoch": 0.47, + "learning_rate": 0.00015984759671746774, + "loss": 0.7832, + "step": 5680 + }, + { + "epoch": 0.47, + "learning_rate": 0.0001593451683135153, + "loss": 0.7739, + "step": 5700 + }, + { + "epoch": 0.48, + "learning_rate": 0.00015884273990956287, + "loss": 0.7703, + "step": 5720 + }, + { + "epoch": 0.48, + "learning_rate": 0.00015834031150561043, + "loss": 0.7707, + "step": 5740 + }, + { + "epoch": 0.48, + "learning_rate": 0.000157837883101658, + "loss": 0.7649, + "step": 5760 + }, + { + "epoch": 0.48, + "learning_rate": 0.00015733545469770558, + "loss": 0.7713, + "step": 5780 + }, + { + "epoch": 0.48, + "learning_rate": 0.00015683302629375314, + "loss": 0.7753, + "step": 5800 + }, + { + "epoch": 0.48, + "eval_loss": 0.7855839729309082, + "eval_runtime": 8.3158, + "eval_samples_per_second": 240.507, + "eval_steps_per_second": 1.924, + "step": 5800 + }, + { + "epoch": 0.48, + "learning_rate": 0.0001563305978898007, + "loss": 0.7795, + "step": 5820 + }, + { + "epoch": 0.48, + "learning_rate": 0.00015582816948584826, + "loss": 0.7714, + "step": 5840 + }, + { + "epoch": 0.49, + "learning_rate": 0.00015532574108189582, + "loss": 0.7772, + "step": 5860 + }, + { + "epoch": 0.49, + "learning_rate": 0.00015482331267794338, + "loss": 0.7784, + "step": 5880 + }, + { + "epoch": 0.49, + "learning_rate": 0.00015432088427399094, + "loss": 0.7628, + "step": 5900 + }, + { + "epoch": 0.49, + "learning_rate": 0.0001538184558700385, + "loss": 0.7848, + "step": 5920 + }, + { + "epoch": 0.49, + "learning_rate": 0.00015331602746608607, + "loss": 0.78, + "step": 5940 + }, + { + "epoch": 0.49, + "learning_rate": 0.00015281359906213363, + "loss": 0.7856, + "step": 5960 + }, + { + "epoch": 0.5, + "learning_rate": 0.0001523111706581812, + "loss": 0.776, + "step": 5980 + }, + { + "epoch": 0.5, + "learning_rate": 0.00015180874225422875, + "loss": 0.7752, + "step": 6000 + }, + { + "epoch": 0.5, + "eval_loss": 0.7837858200073242, + "eval_runtime": 8.2979, + "eval_samples_per_second": 241.025, + "eval_steps_per_second": 1.928, + "step": 6000 + }, + { + "epoch": 0.5, + "learning_rate": 0.0001513063138502763, + "loss": 0.7719, + "step": 6020 + }, + { + "epoch": 0.5, + "learning_rate": 0.00015080388544632387, + "loss": 0.7841, + "step": 6040 + }, + { + "epoch": 0.5, + "learning_rate": 0.00015030145704237143, + "loss": 0.7779, + "step": 6060 + }, + { + "epoch": 0.5, + "learning_rate": 0.000149799028638419, + "loss": 0.7706, + "step": 6080 + }, + { + "epoch": 0.51, + "learning_rate": 0.00014929660023446656, + "loss": 0.762, + "step": 6100 + }, + { + "epoch": 0.51, + "learning_rate": 0.00014879417183051412, + "loss": 0.7854, + "step": 6120 + }, + { + "epoch": 0.51, + "learning_rate": 0.0001482917434265617, + "loss": 0.7803, + "step": 6140 + }, + { + "epoch": 0.51, + "learning_rate": 0.00014778931502260927, + "loss": 0.7769, + "step": 6160 + }, + { + "epoch": 0.51, + "learning_rate": 0.00014728688661865683, + "loss": 0.7773, + "step": 6180 + }, + { + "epoch": 0.51, + "learning_rate": 0.0001467844582147044, + "loss": 0.7725, + "step": 6200 + }, + { + "epoch": 0.51, + "eval_loss": 0.7822731137275696, + "eval_runtime": 8.3078, + "eval_samples_per_second": 240.738, + "eval_steps_per_second": 1.926, + "step": 6200 + }, + { + "epoch": 0.52, + "learning_rate": 0.00014630715123094956, + "loss": 0.7797, + "step": 6220 + }, + { + "epoch": 0.52, + "learning_rate": 0.00014580472282699713, + "loss": 0.77, + "step": 6240 + }, + { + "epoch": 0.52, + "learning_rate": 0.00014530229442304469, + "loss": 0.7699, + "step": 6260 + }, + { + "epoch": 0.52, + "learning_rate": 0.00014479986601909225, + "loss": 0.7769, + "step": 6280 + }, + { + "epoch": 0.52, + "learning_rate": 0.00014429743761513984, + "loss": 0.7755, + "step": 6300 + }, + { + "epoch": 0.52, + "learning_rate": 0.0001437950092111874, + "loss": 0.7752, + "step": 6320 + }, + { + "epoch": 0.53, + "learning_rate": 0.00014329258080723496, + "loss": 0.7791, + "step": 6340 + }, + { + "epoch": 0.53, + "learning_rate": 0.00014279015240328252, + "loss": 0.7606, + "step": 6360 + }, + { + "epoch": 0.53, + "learning_rate": 0.00014228772399933008, + "loss": 0.7733, + "step": 6380 + }, + { + "epoch": 0.53, + "learning_rate": 0.00014178529559537764, + "loss": 0.7855, + "step": 6400 + }, + { + "epoch": 0.53, + "eval_loss": 0.7812179923057556, + "eval_runtime": 8.2896, + "eval_samples_per_second": 241.265, + "eval_steps_per_second": 1.93, + "step": 6400 + }, + { + "epoch": 0.53, + "learning_rate": 0.0001412828671914252, + "loss": 0.7718, + "step": 6420 + }, + { + "epoch": 0.53, + "learning_rate": 0.00014078043878747276, + "loss": 0.7676, + "step": 6440 + }, + { + "epoch": 0.54, + "learning_rate": 0.00014027801038352035, + "loss": 0.7732, + "step": 6460 + }, + { + "epoch": 0.54, + "learning_rate": 0.0001397755819795679, + "loss": 0.7745, + "step": 6480 + }, + { + "epoch": 0.54, + "learning_rate": 0.00013927315357561547, + "loss": 0.7747, + "step": 6500 + }, + { + "epoch": 0.54, + "learning_rate": 0.00013877072517166304, + "loss": 0.7648, + "step": 6520 + }, + { + "epoch": 0.54, + "learning_rate": 0.0001382682967677106, + "loss": 0.7767, + "step": 6540 + }, + { + "epoch": 0.54, + "learning_rate": 0.00013776586836375816, + "loss": 0.7735, + "step": 6560 + }, + { + "epoch": 0.55, + "learning_rate": 0.00013726343995980572, + "loss": 0.7684, + "step": 6580 + }, + { + "epoch": 0.55, + "learning_rate": 0.00013676101155585328, + "loss": 0.7783, + "step": 6600 + }, + { + "epoch": 0.55, + "eval_loss": 0.7797773480415344, + "eval_runtime": 8.2853, + "eval_samples_per_second": 241.391, + "eval_steps_per_second": 1.931, + "step": 6600 + }, + { + "epoch": 0.55, + "learning_rate": 0.00013625858315190084, + "loss": 0.7755, + "step": 6620 + }, + { + "epoch": 0.55, + "learning_rate": 0.0001357561547479484, + "loss": 0.7813, + "step": 6640 + }, + { + "epoch": 0.55, + "learning_rate": 0.00013525372634399596, + "loss": 0.7721, + "step": 6660 + }, + { + "epoch": 0.55, + "learning_rate": 0.00013475129794004352, + "loss": 0.7621, + "step": 6680 + }, + { + "epoch": 0.56, + "learning_rate": 0.00013424886953609109, + "loss": 0.7703, + "step": 6700 + }, + { + "epoch": 0.56, + "learning_rate": 0.00013374644113213865, + "loss": 0.7795, + "step": 6720 + }, + { + "epoch": 0.56, + "learning_rate": 0.0001332440127281862, + "loss": 0.779, + "step": 6740 + }, + { + "epoch": 0.56, + "learning_rate": 0.00013274158432423377, + "loss": 0.7762, + "step": 6760 + }, + { + "epoch": 0.56, + "learning_rate": 0.00013223915592028133, + "loss": 0.7769, + "step": 6780 + }, + { + "epoch": 0.56, + "learning_rate": 0.0001317367275163289, + "loss": 0.7762, + "step": 6800 + }, + { + "epoch": 0.56, + "eval_loss": 0.7790360450744629, + "eval_runtime": 8.301, + "eval_samples_per_second": 240.935, + "eval_steps_per_second": 1.927, + "step": 6800 + }, + { + "epoch": 0.57, + "learning_rate": 0.00013123429911237648, + "loss": 0.7587, + "step": 6820 + }, + { + "epoch": 0.57, + "learning_rate": 0.00013073187070842404, + "loss": 0.7667, + "step": 6840 + }, + { + "epoch": 0.57, + "learning_rate": 0.0001302294423044716, + "loss": 0.7575, + "step": 6860 + }, + { + "epoch": 0.57, + "learning_rate": 0.00012972701390051916, + "loss": 0.7755, + "step": 6880 + }, + { + "epoch": 0.57, + "learning_rate": 0.00012922458549656672, + "loss": 0.7718, + "step": 6900 + }, + { + "epoch": 0.57, + "learning_rate": 0.00012872215709261429, + "loss": 0.7715, + "step": 6920 + }, + { + "epoch": 0.58, + "learning_rate": 0.00012821972868866185, + "loss": 0.7679, + "step": 6940 + }, + { + "epoch": 0.58, + "learning_rate": 0.00012771730028470944, + "loss": 0.7691, + "step": 6960 + }, + { + "epoch": 0.58, + "learning_rate": 0.000127214871880757, + "loss": 0.7766, + "step": 6980 + }, + { + "epoch": 0.58, + "learning_rate": 0.00012671244347680456, + "loss": 0.7708, + "step": 7000 + }, + { + "epoch": 0.58, + "eval_loss": 0.7774990200996399, + "eval_runtime": 8.2721, + "eval_samples_per_second": 241.778, + "eval_steps_per_second": 1.934, + "step": 7000 + }, + { + "epoch": 0.58, + "learning_rate": 0.00012621001507285212, + "loss": 0.7687, + "step": 7020 + }, + { + "epoch": 0.58, + "learning_rate": 0.00012570758666889968, + "loss": 0.7675, + "step": 7040 + }, + { + "epoch": 0.59, + "learning_rate": 0.00012520515826494724, + "loss": 0.7746, + "step": 7060 + }, + { + "epoch": 0.59, + "learning_rate": 0.0001247027298609948, + "loss": 0.7626, + "step": 7080 + }, + { + "epoch": 0.59, + "learning_rate": 0.00012420030145704236, + "loss": 0.764, + "step": 7100 + }, + { + "epoch": 0.59, + "learning_rate": 0.00012369787305308992, + "loss": 0.7734, + "step": 7120 + }, + { + "epoch": 0.59, + "learning_rate": 0.00012319544464913749, + "loss": 0.7642, + "step": 7140 + }, + { + "epoch": 0.59, + "learning_rate": 0.00012269301624518505, + "loss": 0.7715, + "step": 7160 + }, + { + "epoch": 0.6, + "learning_rate": 0.0001221905878412326, + "loss": 0.7619, + "step": 7180 + }, + { + "epoch": 0.6, + "learning_rate": 0.00012168815943728017, + "loss": 0.7674, + "step": 7200 + }, + { + "epoch": 0.6, + "eval_loss": 0.777417004108429, + "eval_runtime": 8.2947, + "eval_samples_per_second": 241.117, + "eval_steps_per_second": 1.929, + "step": 7200 + }, + { + "epoch": 0.6, + "learning_rate": 0.00012118573103332774, + "loss": 0.7709, + "step": 7220 + }, + { + "epoch": 0.6, + "learning_rate": 0.0001206833026293753, + "loss": 0.7682, + "step": 7240 + }, + { + "epoch": 0.6, + "learning_rate": 0.00012018087422542287, + "loss": 0.7604, + "step": 7260 + }, + { + "epoch": 0.6, + "learning_rate": 0.00011967844582147043, + "loss": 0.767, + "step": 7280 + }, + { + "epoch": 0.61, + "learning_rate": 0.00011917601741751799, + "loss": 0.7707, + "step": 7300 + }, + { + "epoch": 0.61, + "learning_rate": 0.00011867358901356555, + "loss": 0.7602, + "step": 7320 + }, + { + "epoch": 0.61, + "learning_rate": 0.00011817116060961311, + "loss": 0.7584, + "step": 7340 + }, + { + "epoch": 0.61, + "learning_rate": 0.00011766873220566067, + "loss": 0.7696, + "step": 7360 + }, + { + "epoch": 0.61, + "learning_rate": 0.00011716630380170823, + "loss": 0.7675, + "step": 7380 + }, + { + "epoch": 0.61, + "learning_rate": 0.00011666387539775581, + "loss": 0.7716, + "step": 7400 + }, + { + "epoch": 0.61, + "eval_loss": 0.7760618329048157, + "eval_runtime": 8.2995, + "eval_samples_per_second": 240.98, + "eval_steps_per_second": 1.928, + "step": 7400 + }, + { + "epoch": 0.62, + "learning_rate": 0.00011616144699380338, + "loss": 0.7648, + "step": 7420 + }, + { + "epoch": 0.62, + "learning_rate": 0.00011565901858985094, + "loss": 0.7643, + "step": 7440 + }, + { + "epoch": 0.62, + "learning_rate": 0.0001151565901858985, + "loss": 0.759, + "step": 7460 + }, + { + "epoch": 0.62, + "learning_rate": 0.00011465416178194607, + "loss": 0.7694, + "step": 7480 + }, + { + "epoch": 0.62, + "learning_rate": 0.00011415173337799363, + "loss": 0.7633, + "step": 7500 + }, + { + "epoch": 0.62, + "learning_rate": 0.00011364930497404119, + "loss": 0.773, + "step": 7520 + }, + { + "epoch": 0.63, + "learning_rate": 0.00011314687657008876, + "loss": 0.76, + "step": 7540 + }, + { + "epoch": 0.63, + "learning_rate": 0.00011264444816613632, + "loss": 0.7708, + "step": 7560 + }, + { + "epoch": 0.63, + "learning_rate": 0.00011214201976218388, + "loss": 0.7757, + "step": 7580 + }, + { + "epoch": 0.63, + "learning_rate": 0.00011163959135823145, + "loss": 0.7655, + "step": 7600 + }, + { + "epoch": 0.63, + "eval_loss": 0.7751550078392029, + "eval_runtime": 8.2715, + "eval_samples_per_second": 241.793, + "eval_steps_per_second": 1.934, + "step": 7600 + }, + { + "epoch": 0.63, + "learning_rate": 0.00011113716295427901, + "loss": 0.7607, + "step": 7620 + }, + { + "epoch": 0.63, + "learning_rate": 0.00011063473455032657, + "loss": 0.7703, + "step": 7640 + }, + { + "epoch": 0.64, + "learning_rate": 0.00011013230614637413, + "loss": 0.7653, + "step": 7660 + }, + { + "epoch": 0.64, + "learning_rate": 0.00010962987774242169, + "loss": 0.7753, + "step": 7680 + }, + { + "epoch": 0.64, + "learning_rate": 0.00010912744933846925, + "loss": 0.7639, + "step": 7700 + }, + { + "epoch": 0.64, + "learning_rate": 0.00010862502093451683, + "loss": 0.7701, + "step": 7720 + }, + { + "epoch": 0.64, + "learning_rate": 0.00010812259253056439, + "loss": 0.7614, + "step": 7740 + }, + { + "epoch": 0.64, + "learning_rate": 0.00010762016412661195, + "loss": 0.7612, + "step": 7760 + }, + { + "epoch": 0.65, + "learning_rate": 0.00010711773572265951, + "loss": 0.7597, + "step": 7780 + }, + { + "epoch": 0.65, + "learning_rate": 0.00010661530731870707, + "loss": 0.7621, + "step": 7800 + }, + { + "epoch": 0.65, + "eval_loss": 0.7740359902381897, + "eval_runtime": 8.2962, + "eval_samples_per_second": 241.075, + "eval_steps_per_second": 1.929, + "step": 7800 + }, + { + "epoch": 0.65, + "learning_rate": 0.00010611287891475463, + "loss": 0.7592, + "step": 7820 + }, + { + "epoch": 0.65, + "learning_rate": 0.0001056104505108022, + "loss": 0.7665, + "step": 7840 + }, + { + "epoch": 0.65, + "learning_rate": 0.00010510802210684975, + "loss": 0.7646, + "step": 7860 + }, + { + "epoch": 0.65, + "learning_rate": 0.00010460559370289732, + "loss": 0.7668, + "step": 7880 + }, + { + "epoch": 0.66, + "learning_rate": 0.00010410316529894489, + "loss": 0.7756, + "step": 7900 + }, + { + "epoch": 0.66, + "learning_rate": 0.00010360073689499245, + "loss": 0.7684, + "step": 7920 + }, + { + "epoch": 0.66, + "learning_rate": 0.00010309830849104001, + "loss": 0.7566, + "step": 7940 + }, + { + "epoch": 0.66, + "learning_rate": 0.00010259588008708757, + "loss": 0.7581, + "step": 7960 + }, + { + "epoch": 0.66, + "learning_rate": 0.00010209345168313515, + "loss": 0.7624, + "step": 7980 + }, + { + "epoch": 0.66, + "learning_rate": 0.00010159102327918271, + "loss": 0.7594, + "step": 8000 + }, + { + "epoch": 0.66, + "eval_loss": 0.7729437351226807, + "eval_runtime": 8.2949, + "eval_samples_per_second": 241.112, + "eval_steps_per_second": 1.929, + "step": 8000 + }, + { + "epoch": 0.67, + "learning_rate": 0.00010108859487523028, + "loss": 0.7657, + "step": 8020 + }, + { + "epoch": 0.67, + "learning_rate": 0.00010058616647127785, + "loss": 0.7604, + "step": 8040 + }, + { + "epoch": 0.67, + "learning_rate": 0.0001000837380673254, + "loss": 0.7659, + "step": 8060 + }, + { + "epoch": 0.67, + "learning_rate": 9.958130966337297e-05, + "loss": 0.7688, + "step": 8080 + }, + { + "epoch": 0.67, + "learning_rate": 9.907888125942053e-05, + "loss": 0.7702, + "step": 8100 + }, + { + "epoch": 0.67, + "learning_rate": 9.857645285546809e-05, + "loss": 0.7572, + "step": 8120 + }, + { + "epoch": 0.68, + "learning_rate": 9.807402445151565e-05, + "loss": 0.7603, + "step": 8140 + }, + { + "epoch": 0.68, + "learning_rate": 9.757159604756321e-05, + "loss": 0.761, + "step": 8160 + }, + { + "epoch": 0.68, + "learning_rate": 9.706916764361077e-05, + "loss": 0.7649, + "step": 8180 + }, + { + "epoch": 0.68, + "learning_rate": 9.656673923965835e-05, + "loss": 0.7554, + "step": 8200 + }, + { + "epoch": 0.68, + "eval_loss": 0.7727349996566772, + "eval_runtime": 8.2811, + "eval_samples_per_second": 241.515, + "eval_steps_per_second": 1.932, + "step": 8200 + }, + { + "epoch": 0.68, + "learning_rate": 9.606431083570591e-05, + "loss": 0.7634, + "step": 8220 + }, + { + "epoch": 0.68, + "learning_rate": 9.556188243175347e-05, + "loss": 0.7611, + "step": 8240 + }, + { + "epoch": 0.69, + "learning_rate": 9.505945402780103e-05, + "loss": 0.7606, + "step": 8260 + }, + { + "epoch": 0.69, + "learning_rate": 9.455702562384859e-05, + "loss": 0.7703, + "step": 8280 + }, + { + "epoch": 0.69, + "learning_rate": 9.405459721989615e-05, + "loss": 0.7616, + "step": 8300 + }, + { + "epoch": 0.69, + "learning_rate": 9.355216881594372e-05, + "loss": 0.7638, + "step": 8320 + }, + { + "epoch": 0.69, + "learning_rate": 9.30748618321889e-05, + "loss": 0.7461, + "step": 8340 + }, + { + "epoch": 0.69, + "learning_rate": 9.257243342823646e-05, + "loss": 0.7577, + "step": 8360 + }, + { + "epoch": 0.7, + "learning_rate": 9.207000502428403e-05, + "loss": 0.7553, + "step": 8380 + }, + { + "epoch": 0.7, + "learning_rate": 9.15675766203316e-05, + "loss": 0.7564, + "step": 8400 + }, + { + "epoch": 0.7, + "eval_loss": 0.771743893623352, + "eval_runtime": 8.2767, + "eval_samples_per_second": 241.642, + "eval_steps_per_second": 1.933, + "step": 8400 + }, + { + "epoch": 0.7, + "learning_rate": 9.106514821637916e-05, + "loss": 0.7605, + "step": 8420 + }, + { + "epoch": 0.7, + "learning_rate": 9.056271981242672e-05, + "loss": 0.773, + "step": 8440 + }, + { + "epoch": 0.7, + "learning_rate": 9.006029140847428e-05, + "loss": 0.7657, + "step": 8460 + }, + { + "epoch": 0.7, + "learning_rate": 8.955786300452185e-05, + "loss": 0.7597, + "step": 8480 + }, + { + "epoch": 0.71, + "learning_rate": 8.90554346005694e-05, + "loss": 0.7687, + "step": 8500 + }, + { + "epoch": 0.71, + "learning_rate": 8.855300619661697e-05, + "loss": 0.7545, + "step": 8520 + }, + { + "epoch": 0.71, + "learning_rate": 8.805057779266453e-05, + "loss": 0.7575, + "step": 8540 + }, + { + "epoch": 0.71, + "learning_rate": 8.754814938871209e-05, + "loss": 0.771, + "step": 8560 + }, + { + "epoch": 0.71, + "learning_rate": 8.704572098475966e-05, + "loss": 0.7622, + "step": 8580 + }, + { + "epoch": 0.71, + "learning_rate": 8.654329258080723e-05, + "loss": 0.7621, + "step": 8600 + }, + { + "epoch": 0.71, + "eval_loss": 0.7706981897354126, + "eval_runtime": 8.2852, + "eval_samples_per_second": 241.394, + "eval_steps_per_second": 1.931, + "step": 8600 + }, + { + "epoch": 0.72, + "learning_rate": 8.604086417685479e-05, + "loss": 0.7516, + "step": 8620 + }, + { + "epoch": 0.72, + "learning_rate": 8.553843577290235e-05, + "loss": 0.7526, + "step": 8640 + }, + { + "epoch": 0.72, + "learning_rate": 8.503600736894991e-05, + "loss": 0.7639, + "step": 8660 + }, + { + "epoch": 0.72, + "learning_rate": 8.453357896499747e-05, + "loss": 0.7561, + "step": 8680 + }, + { + "epoch": 0.72, + "learning_rate": 8.403115056104506e-05, + "loss": 0.7643, + "step": 8700 + }, + { + "epoch": 0.72, + "learning_rate": 8.352872215709262e-05, + "loss": 0.7665, + "step": 8720 + }, + { + "epoch": 0.73, + "learning_rate": 8.302629375314018e-05, + "loss": 0.7681, + "step": 8740 + }, + { + "epoch": 0.73, + "learning_rate": 8.252386534918774e-05, + "loss": 0.7655, + "step": 8760 + }, + { + "epoch": 0.73, + "learning_rate": 8.20214369452353e-05, + "loss": 0.7603, + "step": 8780 + }, + { + "epoch": 0.73, + "learning_rate": 8.151900854128286e-05, + "loss": 0.7624, + "step": 8800 + }, + { + "epoch": 0.73, + "eval_loss": 0.7699927687644958, + "eval_runtime": 8.2872, + "eval_samples_per_second": 241.335, + "eval_steps_per_second": 1.931, + "step": 8800 + }, + { + "epoch": 0.73, + "learning_rate": 8.101658013733043e-05, + "loss": 0.7666, + "step": 8820 + }, + { + "epoch": 0.73, + "learning_rate": 8.051415173337799e-05, + "loss": 0.752, + "step": 8840 + }, + { + "epoch": 0.74, + "learning_rate": 8.001172332942555e-05, + "loss": 0.7654, + "step": 8860 + }, + { + "epoch": 0.74, + "learning_rate": 7.950929492547312e-05, + "loss": 0.7546, + "step": 8880 + }, + { + "epoch": 0.74, + "learning_rate": 7.900686652152068e-05, + "loss": 0.755, + "step": 8900 + }, + { + "epoch": 0.74, + "learning_rate": 7.850443811756824e-05, + "loss": 0.7578, + "step": 8920 + }, + { + "epoch": 0.74, + "learning_rate": 7.80020097136158e-05, + "loss": 0.7527, + "step": 8940 + }, + { + "epoch": 0.74, + "learning_rate": 7.749958130966337e-05, + "loss": 0.7622, + "step": 8960 + }, + { + "epoch": 0.75, + "learning_rate": 7.699715290571093e-05, + "loss": 0.7681, + "step": 8980 + }, + { + "epoch": 0.75, + "learning_rate": 7.649472450175849e-05, + "loss": 0.7641, + "step": 9000 + }, + { + "epoch": 0.75, + "eval_loss": 0.7696471214294434, + "eval_runtime": 8.269, + "eval_samples_per_second": 241.867, + "eval_steps_per_second": 1.935, + "step": 9000 + }, + { + "epoch": 0.75, + "learning_rate": 7.599229609780605e-05, + "loss": 0.7622, + "step": 9020 + }, + { + "epoch": 0.75, + "learning_rate": 7.548986769385361e-05, + "loss": 0.7608, + "step": 9040 + }, + { + "epoch": 0.75, + "learning_rate": 7.498743928990119e-05, + "loss": 0.7613, + "step": 9060 + }, + { + "epoch": 0.75, + "learning_rate": 7.448501088594875e-05, + "loss": 0.759, + "step": 9080 + }, + { + "epoch": 0.76, + "learning_rate": 7.398258248199631e-05, + "loss": 0.7585, + "step": 9100 + }, + { + "epoch": 0.76, + "learning_rate": 7.348015407804387e-05, + "loss": 0.7609, + "step": 9120 + }, + { + "epoch": 0.76, + "learning_rate": 7.297772567409144e-05, + "loss": 0.76, + "step": 9140 + }, + { + "epoch": 0.76, + "learning_rate": 7.2475297270139e-05, + "loss": 0.7621, + "step": 9160 + }, + { + "epoch": 0.76, + "learning_rate": 7.197286886618657e-05, + "loss": 0.7587, + "step": 9180 + }, + { + "epoch": 0.76, + "learning_rate": 7.147044046223413e-05, + "loss": 0.7589, + "step": 9200 + }, + { + "epoch": 0.76, + "eval_loss": 0.7685341835021973, + "eval_runtime": 8.3378, + "eval_samples_per_second": 239.872, + "eval_steps_per_second": 1.919, + "step": 9200 + }, + { + "epoch": 0.77, + "learning_rate": 7.096801205828169e-05, + "loss": 0.7708, + "step": 9220 + }, + { + "epoch": 0.77, + "learning_rate": 7.046558365432925e-05, + "loss": 0.7468, + "step": 9240 + }, + { + "epoch": 0.77, + "learning_rate": 6.996315525037681e-05, + "loss": 0.7669, + "step": 9260 + }, + { + "epoch": 0.77, + "learning_rate": 6.946072684642437e-05, + "loss": 0.7639, + "step": 9280 + }, + { + "epoch": 0.77, + "learning_rate": 6.895829844247193e-05, + "loss": 0.764, + "step": 9300 + }, + { + "epoch": 0.77, + "learning_rate": 6.845587003851951e-05, + "loss": 0.7605, + "step": 9320 + }, + { + "epoch": 0.78, + "learning_rate": 6.795344163456707e-05, + "loss": 0.7638, + "step": 9340 + }, + { + "epoch": 0.78, + "learning_rate": 6.745101323061463e-05, + "loss": 0.753, + "step": 9360 + }, + { + "epoch": 0.78, + "learning_rate": 6.694858482666219e-05, + "loss": 0.7567, + "step": 9380 + }, + { + "epoch": 0.78, + "learning_rate": 6.644615642270977e-05, + "loss": 0.7604, + "step": 9400 + }, + { + "epoch": 0.78, + "eval_loss": 0.7682663798332214, + "eval_runtime": 8.2956, + "eval_samples_per_second": 241.092, + "eval_steps_per_second": 1.929, + "step": 9400 + }, + { + "epoch": 0.78, + "learning_rate": 6.594372801875733e-05, + "loss": 0.7603, + "step": 9420 + }, + { + "epoch": 0.78, + "learning_rate": 6.544129961480489e-05, + "loss": 0.7677, + "step": 9440 + }, + { + "epoch": 0.79, + "learning_rate": 6.493887121085245e-05, + "loss": 0.7692, + "step": 9460 + }, + { + "epoch": 0.79, + "learning_rate": 6.443644280690001e-05, + "loss": 0.7637, + "step": 9480 + }, + { + "epoch": 0.79, + "learning_rate": 6.393401440294757e-05, + "loss": 0.756, + "step": 9500 + }, + { + "epoch": 0.79, + "learning_rate": 6.343158599899513e-05, + "loss": 0.7572, + "step": 9520 + }, + { + "epoch": 0.79, + "learning_rate": 6.29291575950427e-05, + "loss": 0.7696, + "step": 9540 + }, + { + "epoch": 0.79, + "learning_rate": 6.242672919109027e-05, + "loss": 0.753, + "step": 9560 + }, + { + "epoch": 0.8, + "learning_rate": 6.192430078713783e-05, + "loss": 0.7619, + "step": 9580 + }, + { + "epoch": 0.8, + "learning_rate": 6.142187238318539e-05, + "loss": 0.7574, + "step": 9600 + }, + { + "epoch": 0.8, + "eval_loss": 0.7677283883094788, + "eval_runtime": 8.2754, + "eval_samples_per_second": 241.68, + "eval_steps_per_second": 1.933, + "step": 9600 + }, + { + "epoch": 0.8, + "learning_rate": 6.091944397923295e-05, + "loss": 0.7624, + "step": 9620 + }, + { + "epoch": 0.8, + "learning_rate": 6.0417015575280514e-05, + "loss": 0.7554, + "step": 9640 + }, + { + "epoch": 0.8, + "learning_rate": 5.9914587171328075e-05, + "loss": 0.7635, + "step": 9660 + }, + { + "epoch": 0.8, + "learning_rate": 5.941215876737565e-05, + "loss": 0.7586, + "step": 9680 + }, + { + "epoch": 0.81, + "learning_rate": 5.890973036342321e-05, + "loss": 0.755, + "step": 9700 + }, + { + "epoch": 0.81, + "learning_rate": 5.840730195947077e-05, + "loss": 0.7584, + "step": 9720 + }, + { + "epoch": 0.81, + "learning_rate": 5.7904873555518333e-05, + "loss": 0.7528, + "step": 9740 + }, + { + "epoch": 0.81, + "learning_rate": 5.74024451515659e-05, + "loss": 0.7597, + "step": 9760 + }, + { + "epoch": 0.81, + "learning_rate": 5.690001674761346e-05, + "loss": 0.7527, + "step": 9780 + }, + { + "epoch": 0.81, + "learning_rate": 5.6397588343661024e-05, + "loss": 0.7617, + "step": 9800 + }, + { + "epoch": 0.81, + "eval_loss": 0.7668038010597229, + "eval_runtime": 8.3597, + "eval_samples_per_second": 239.242, + "eval_steps_per_second": 1.914, + "step": 9800 + }, + { + "epoch": 0.82, + "learning_rate": 5.5895159939708585e-05, + "loss": 0.7584, + "step": 9820 + }, + { + "epoch": 0.82, + "learning_rate": 5.5392731535756146e-05, + "loss": 0.762, + "step": 9840 + }, + { + "epoch": 0.82, + "learning_rate": 5.4890303131803714e-05, + "loss": 0.7586, + "step": 9860 + }, + { + "epoch": 0.82, + "learning_rate": 5.4387874727851275e-05, + "loss": 0.7569, + "step": 9880 + }, + { + "epoch": 0.82, + "learning_rate": 5.3885446323898836e-05, + "loss": 0.7611, + "step": 9900 + }, + { + "epoch": 0.82, + "learning_rate": 5.33830179199464e-05, + "loss": 0.7581, + "step": 9920 + }, + { + "epoch": 0.83, + "learning_rate": 5.2880589515993965e-05, + "loss": 0.7657, + "step": 9940 + }, + { + "epoch": 0.83, + "learning_rate": 5.237816111204153e-05, + "loss": 0.7504, + "step": 9960 + }, + { + "epoch": 0.83, + "learning_rate": 5.1875732708089094e-05, + "loss": 0.7547, + "step": 9980 + }, + { + "epoch": 0.83, + "learning_rate": 5.1373304304136655e-05, + "loss": 0.7588, + "step": 10000 + }, + { + "epoch": 0.83, + "eval_loss": 0.766875147819519, + "eval_runtime": 8.3125, + "eval_samples_per_second": 240.603, + "eval_steps_per_second": 1.925, + "step": 10000 + }, + { + "epoch": 0.83, + "learning_rate": 5.0870875900184223e-05, + "loss": 0.7588, + "step": 10020 + }, + { + "epoch": 0.83, + "learning_rate": 5.0368447496231785e-05, + "loss": 0.7544, + "step": 10040 + }, + { + "epoch": 0.84, + "learning_rate": 4.9866019092279346e-05, + "loss": 0.7588, + "step": 10060 + }, + { + "epoch": 0.84, + "learning_rate": 4.936359068832691e-05, + "loss": 0.7547, + "step": 10080 + }, + { + "epoch": 0.84, + "learning_rate": 4.8861162284374475e-05, + "loss": 0.7608, + "step": 10100 + }, + { + "epoch": 0.84, + "learning_rate": 4.8358733880422036e-05, + "loss": 0.7539, + "step": 10120 + }, + { + "epoch": 0.84, + "learning_rate": 4.78563054764696e-05, + "loss": 0.7639, + "step": 10140 + }, + { + "epoch": 0.84, + "learning_rate": 4.735387707251716e-05, + "loss": 0.7622, + "step": 10160 + }, + { + "epoch": 0.85, + "learning_rate": 4.6851448668564726e-05, + "loss": 0.7528, + "step": 10180 + }, + { + "epoch": 0.85, + "learning_rate": 4.634902026461229e-05, + "loss": 0.7566, + "step": 10200 + }, + { + "epoch": 0.85, + "eval_loss": 0.7660693526268005, + "eval_runtime": 8.2951, + "eval_samples_per_second": 241.107, + "eval_steps_per_second": 1.929, + "step": 10200 + }, + { + "epoch": 0.85, + "learning_rate": 4.584659186065985e-05, + "loss": 0.7545, + "step": 10220 + }, + { + "epoch": 0.85, + "learning_rate": 4.5344163456707416e-05, + "loss": 0.7505, + "step": 10240 + }, + { + "epoch": 0.85, + "learning_rate": 4.4841735052754984e-05, + "loss": 0.7645, + "step": 10260 + }, + { + "epoch": 0.85, + "learning_rate": 4.4339306648802545e-05, + "loss": 0.7566, + "step": 10280 + }, + { + "epoch": 0.86, + "learning_rate": 4.383687824485011e-05, + "loss": 0.7509, + "step": 10300 + }, + { + "epoch": 0.86, + "learning_rate": 4.333444984089767e-05, + "loss": 0.7546, + "step": 10320 + }, + { + "epoch": 0.86, + "learning_rate": 4.285714285714285e-05, + "loss": 0.7533, + "step": 10340 + }, + { + "epoch": 0.86, + "learning_rate": 4.235471445319041e-05, + "loss": 0.7509, + "step": 10360 + }, + { + "epoch": 0.86, + "learning_rate": 4.185228604923798e-05, + "loss": 0.7558, + "step": 10380 + }, + { + "epoch": 0.86, + "learning_rate": 4.1349857645285547e-05, + "loss": 0.7624, + "step": 10400 + }, + { + "epoch": 0.86, + "eval_loss": 0.7657083868980408, + "eval_runtime": 8.2828, + "eval_samples_per_second": 241.464, + "eval_steps_per_second": 1.932, + "step": 10400 + }, + { + "epoch": 0.87, + "learning_rate": 4.084742924133311e-05, + "loss": 0.7509, + "step": 10420 + }, + { + "epoch": 0.87, + "learning_rate": 4.0345000837380676e-05, + "loss": 0.7509, + "step": 10440 + }, + { + "epoch": 0.87, + "learning_rate": 3.984257243342824e-05, + "loss": 0.761, + "step": 10460 + }, + { + "epoch": 0.87, + "learning_rate": 3.93401440294758e-05, + "loss": 0.7546, + "step": 10480 + }, + { + "epoch": 0.87, + "learning_rate": 3.883771562552336e-05, + "loss": 0.764, + "step": 10500 + }, + { + "epoch": 0.87, + "learning_rate": 3.833528722157092e-05, + "loss": 0.765, + "step": 10520 + }, + { + "epoch": 0.88, + "learning_rate": 3.783285881761849e-05, + "loss": 0.7551, + "step": 10540 + }, + { + "epoch": 0.88, + "learning_rate": 3.733043041366605e-05, + "loss": 0.7554, + "step": 10560 + }, + { + "epoch": 0.88, + "learning_rate": 3.682800200971361e-05, + "loss": 0.7574, + "step": 10580 + }, + { + "epoch": 0.88, + "learning_rate": 3.632557360576117e-05, + "loss": 0.7647, + "step": 10600 + }, + { + "epoch": 0.88, + "eval_loss": 0.7651572227478027, + "eval_runtime": 8.2868, + "eval_samples_per_second": 241.347, + "eval_steps_per_second": 1.931, + "step": 10600 + }, + { + "epoch": 0.88, + "learning_rate": 3.584826662200636e-05, + "loss": 0.7508, + "step": 10620 + }, + { + "epoch": 0.88, + "learning_rate": 3.534583821805393e-05, + "loss": 0.7636, + "step": 10640 + }, + { + "epoch": 0.89, + "learning_rate": 3.484340981410149e-05, + "loss": 0.7584, + "step": 10660 + }, + { + "epoch": 0.89, + "learning_rate": 3.434098141014905e-05, + "loss": 0.7677, + "step": 10680 + }, + { + "epoch": 0.89, + "learning_rate": 3.383855300619661e-05, + "loss": 0.7493, + "step": 10700 + }, + { + "epoch": 0.89, + "learning_rate": 3.333612460224418e-05, + "loss": 0.7557, + "step": 10720 + }, + { + "epoch": 0.89, + "learning_rate": 3.283369619829174e-05, + "loss": 0.7528, + "step": 10740 + }, + { + "epoch": 0.89, + "learning_rate": 3.23312677943393e-05, + "loss": 0.7573, + "step": 10760 + }, + { + "epoch": 0.9, + "learning_rate": 3.182883939038687e-05, + "loss": 0.7471, + "step": 10780 + }, + { + "epoch": 0.9, + "learning_rate": 3.132641098643443e-05, + "loss": 0.7537, + "step": 10800 + }, + { + "epoch": 0.9, + "eval_loss": 0.7651455402374268, + "eval_runtime": 8.2847, + "eval_samples_per_second": 241.409, + "eval_steps_per_second": 1.931, + "step": 10800 + }, + { + "epoch": 0.9, + "learning_rate": 3.082398258248199e-05, + "loss": 0.7538, + "step": 10820 + }, + { + "epoch": 0.9, + "learning_rate": 3.0321554178529556e-05, + "loss": 0.7585, + "step": 10840 + }, + { + "epoch": 0.9, + "learning_rate": 2.981912577457712e-05, + "loss": 0.7533, + "step": 10860 + }, + { + "epoch": 0.9, + "learning_rate": 2.9316697370624682e-05, + "loss": 0.7607, + "step": 10880 + }, + { + "epoch": 0.91, + "learning_rate": 2.8814268966672247e-05, + "loss": 0.7522, + "step": 10900 + }, + { + "epoch": 0.91, + "learning_rate": 2.831184056271981e-05, + "loss": 0.7581, + "step": 10920 + }, + { + "epoch": 0.91, + "learning_rate": 2.7809412158767376e-05, + "loss": 0.7597, + "step": 10940 + }, + { + "epoch": 0.91, + "learning_rate": 2.7306983754814937e-05, + "loss": 0.7649, + "step": 10960 + }, + { + "epoch": 0.91, + "learning_rate": 2.68045553508625e-05, + "loss": 0.7645, + "step": 10980 + }, + { + "epoch": 0.91, + "learning_rate": 2.6302126946910063e-05, + "loss": 0.743, + "step": 11000 + }, + { + "epoch": 0.91, + "eval_loss": 0.7645469903945923, + "eval_runtime": 8.314, + "eval_samples_per_second": 240.558, + "eval_steps_per_second": 1.924, + "step": 11000 + }, + { + "epoch": 0.92, + "learning_rate": 2.5799698542957624e-05, + "loss": 0.7488, + "step": 11020 + }, + { + "epoch": 0.92, + "learning_rate": 2.529727013900519e-05, + "loss": 0.7515, + "step": 11040 + }, + { + "epoch": 0.92, + "learning_rate": 2.4794841735052756e-05, + "loss": 0.7582, + "step": 11060 + }, + { + "epoch": 0.92, + "learning_rate": 2.4292413331100317e-05, + "loss": 0.7564, + "step": 11080 + }, + { + "epoch": 0.92, + "learning_rate": 2.378998492714788e-05, + "loss": 0.7486, + "step": 11100 + }, + { + "epoch": 0.92, + "learning_rate": 2.3287556523195443e-05, + "loss": 0.7537, + "step": 11120 + }, + { + "epoch": 0.93, + "learning_rate": 2.2785128119243004e-05, + "loss": 0.7593, + "step": 11140 + }, + { + "epoch": 0.93, + "learning_rate": 2.228269971529057e-05, + "loss": 0.7435, + "step": 11160 + }, + { + "epoch": 0.93, + "learning_rate": 2.178027131133813e-05, + "loss": 0.7646, + "step": 11180 + }, + { + "epoch": 0.93, + "learning_rate": 2.1277842907385698e-05, + "loss": 0.7445, + "step": 11200 + }, + { + "epoch": 0.93, + "eval_loss": 0.7643282413482666, + "eval_runtime": 8.2655, + "eval_samples_per_second": 241.97, + "eval_steps_per_second": 1.936, + "step": 11200 + }, + { + "epoch": 0.93, + "learning_rate": 2.077541450343326e-05, + "loss": 0.7576, + "step": 11220 + }, + { + "epoch": 0.93, + "learning_rate": 2.0272986099480824e-05, + "loss": 0.7553, + "step": 11240 + }, + { + "epoch": 0.94, + "learning_rate": 1.9770557695528385e-05, + "loss": 0.7567, + "step": 11260 + }, + { + "epoch": 0.94, + "learning_rate": 1.926812929157595e-05, + "loss": 0.7501, + "step": 11280 + }, + { + "epoch": 0.94, + "learning_rate": 1.876570088762351e-05, + "loss": 0.7502, + "step": 11300 + }, + { + "epoch": 0.94, + "learning_rate": 1.8263272483671075e-05, + "loss": 0.756, + "step": 11320 + }, + { + "epoch": 0.94, + "learning_rate": 1.776084407971864e-05, + "loss": 0.7528, + "step": 11340 + }, + { + "epoch": 0.94, + "learning_rate": 1.72584156757662e-05, + "loss": 0.7578, + "step": 11360 + }, + { + "epoch": 0.95, + "learning_rate": 1.6755987271813765e-05, + "loss": 0.7588, + "step": 11380 + }, + { + "epoch": 0.95, + "learning_rate": 1.625355886786133e-05, + "loss": 0.7486, + "step": 11400 + }, + { + "epoch": 0.95, + "eval_loss": 0.7640262842178345, + "eval_runtime": 8.2822, + "eval_samples_per_second": 241.482, + "eval_steps_per_second": 1.932, + "step": 11400 + }, + { + "epoch": 0.95, + "learning_rate": 1.575113046390889e-05, + "loss": 0.7527, + "step": 11420 + }, + { + "epoch": 0.95, + "learning_rate": 1.5248702059956454e-05, + "loss": 0.7472, + "step": 11440 + }, + { + "epoch": 0.95, + "learning_rate": 1.4746273656004018e-05, + "loss": 0.7476, + "step": 11460 + }, + { + "epoch": 0.95, + "learning_rate": 1.4243845252051581e-05, + "loss": 0.7551, + "step": 11480 + }, + { + "epoch": 0.95, + "learning_rate": 1.3741416848099144e-05, + "loss": 0.7609, + "step": 11500 + }, + { + "epoch": 0.96, + "learning_rate": 1.3238988444146708e-05, + "loss": 0.7496, + "step": 11520 + }, + { + "epoch": 0.96, + "learning_rate": 1.2736560040194271e-05, + "loss": 0.7528, + "step": 11540 + }, + { + "epoch": 0.96, + "learning_rate": 1.2234131636241834e-05, + "loss": 0.7541, + "step": 11560 + }, + { + "epoch": 0.96, + "learning_rate": 1.1731703232289399e-05, + "loss": 0.7492, + "step": 11580 + }, + { + "epoch": 0.96, + "learning_rate": 1.1229274828336962e-05, + "loss": 0.7464, + "step": 11600 + }, + { + "epoch": 0.96, + "eval_loss": 0.7637657523155212, + "eval_runtime": 8.2732, + "eval_samples_per_second": 241.743, + "eval_steps_per_second": 1.934, + "step": 11600 + }, + { + "epoch": 0.96, + "learning_rate": 1.0726846424384524e-05, + "loss": 0.7549, + "step": 11620 + }, + { + "epoch": 0.97, + "learning_rate": 1.0224418020432087e-05, + "loss": 0.7532, + "step": 11640 + }, + { + "epoch": 0.97, + "learning_rate": 9.721989616479652e-06, + "loss": 0.7513, + "step": 11660 + }, + { + "epoch": 0.97, + "learning_rate": 9.219561212527215e-06, + "loss": 0.7657, + "step": 11680 + }, + { + "epoch": 0.97, + "learning_rate": 8.717132808574777e-06, + "loss": 0.7445, + "step": 11700 + }, + { + "epoch": 0.97, + "learning_rate": 8.21470440462234e-06, + "loss": 0.7574, + "step": 11720 + }, + { + "epoch": 0.97, + "learning_rate": 7.712276000669905e-06, + "loss": 0.7576, + "step": 11740 + }, + { + "epoch": 0.98, + "learning_rate": 7.209847596717467e-06, + "loss": 0.7599, + "step": 11760 + }, + { + "epoch": 0.98, + "learning_rate": 6.7074191927650305e-06, + "loss": 0.7558, + "step": 11780 + }, + { + "epoch": 0.98, + "learning_rate": 6.204990788812593e-06, + "loss": 0.7541, + "step": 11800 + }, + { + "epoch": 0.98, + "eval_loss": 0.7635765671730042, + "eval_runtime": 8.2847, + "eval_samples_per_second": 241.409, + "eval_steps_per_second": 1.931, + "step": 11800 + } + ], + "max_steps": 12042, + "num_train_epochs": 1, + "total_flos": 3.067719870507385e+19, + "trial_name": null, + "trial_params": null +} diff --git a/adapters/saved-belle1.5m7b/checkpoint-11800/training_args.bin b/adapters/saved-belle1.5m7b/checkpoint-11800/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..dfe19878cfd0a3df620d94aa6be1508290c31ed9 --- /dev/null +++ b/adapters/saved-belle1.5m7b/checkpoint-11800/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bfb1ef6ab82f6cd0561137910099627267bd4099d7d83919869b14210b7e77c3 +size 3643 diff --git a/adapters/saved-belle1.5m7b/checkpoint-12000/optimizer.pt b/adapters/saved-belle1.5m7b/checkpoint-12000/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..df36ef5d056c2dd3ed87d336e2baaf9e692e6ca4 --- /dev/null +++ b/adapters/saved-belle1.5m7b/checkpoint-12000/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9230a8759b1c621965033bcdb2d9131e4c294fe4f3109414d1c55f629cb8eb64 +size 33629893 diff --git a/adapters/saved-belle1.5m7b/checkpoint-12000/pytorch_model.bin b/adapters/saved-belle1.5m7b/checkpoint-12000/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..188709b1ca420e466bbaa8f68437364a1579b6b0 --- /dev/null +++ b/adapters/saved-belle1.5m7b/checkpoint-12000/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a671ea20ca7ecc8663f84ad19a09374a1fdfa7caa477f828d5b3b63c1f341e46 +size 16822989 diff --git a/adapters/saved-belle1.5m7b/checkpoint-12000/rng_state_0.pth b/adapters/saved-belle1.5m7b/checkpoint-12000/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..448074168833d48180154fcfa4c048dfedab9768 --- /dev/null +++ b/adapters/saved-belle1.5m7b/checkpoint-12000/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:13c585c16977223d63038d2bbf7c401b9797101de2cfae390ba9f3b39ffcd77a +size 14583 diff --git a/adapters/saved-belle1.5m7b/checkpoint-12000/rng_state_1.pth b/adapters/saved-belle1.5m7b/checkpoint-12000/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..9ebff0e509f466fcee171f201a1e5fd23ffd71f7 --- /dev/null +++ b/adapters/saved-belle1.5m7b/checkpoint-12000/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aaf79a0a12296be170d2c43eecec9db71872683ec2e7676bf71abe6daf95f9ee +size 14583 diff --git a/adapters/saved-belle1.5m7b/checkpoint-12000/rng_state_10.pth b/adapters/saved-belle1.5m7b/checkpoint-12000/rng_state_10.pth new file mode 100644 index 0000000000000000000000000000000000000000..cfe1c8791f06a6cabc723cb9e8c04268b50587e4 --- /dev/null +++ b/adapters/saved-belle1.5m7b/checkpoint-12000/rng_state_10.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a2b4a688ee0ec9e81769e26cef1892360eacaf64b66380d4c29948e27fd568d2 +size 14587 diff --git a/adapters/saved-belle1.5m7b/checkpoint-12000/rng_state_11.pth b/adapters/saved-belle1.5m7b/checkpoint-12000/rng_state_11.pth new file mode 100644 index 0000000000000000000000000000000000000000..2484d82b622598f54e793194f05755958655cb59 --- /dev/null +++ b/adapters/saved-belle1.5m7b/checkpoint-12000/rng_state_11.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8c1d708929756eb1a86bb94658ee72d15363f62c4d7e9c85b631bafd28933209 +size 14587 diff --git a/adapters/saved-belle1.5m7b/checkpoint-12000/rng_state_12.pth b/adapters/saved-belle1.5m7b/checkpoint-12000/rng_state_12.pth new file mode 100644 index 0000000000000000000000000000000000000000..ac55369a3c5a40902fbbbbf9dbc505e6fd23be01 --- /dev/null +++ b/adapters/saved-belle1.5m7b/checkpoint-12000/rng_state_12.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:07b45b24459bee2137710811a8def3be20db102014b352816cfe8945070ae4f1 +size 14587 diff --git a/adapters/saved-belle1.5m7b/checkpoint-12000/rng_state_13.pth b/adapters/saved-belle1.5m7b/checkpoint-12000/rng_state_13.pth new file mode 100644 index 0000000000000000000000000000000000000000..a3939cdbe850ace8f418bb1e81885c9e27c3ae1d --- /dev/null +++ b/adapters/saved-belle1.5m7b/checkpoint-12000/rng_state_13.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:004543c092359c480cca9d5b8a6fc874fa8c2a0f6fdaee185ae6b9a571312d9e +size 14587 diff --git a/adapters/saved-belle1.5m7b/checkpoint-12000/rng_state_14.pth b/adapters/saved-belle1.5m7b/checkpoint-12000/rng_state_14.pth new file mode 100644 index 0000000000000000000000000000000000000000..e78ffdfd47ed8cea2931cd14a82cc07dfe1c063e --- /dev/null +++ b/adapters/saved-belle1.5m7b/checkpoint-12000/rng_state_14.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:90a2f3aa54e7201f943e4c42bdcf29165c92b99bbf4fe70f891b4d85fd7f3108 +size 14587 diff --git a/adapters/saved-belle1.5m7b/checkpoint-12000/rng_state_15.pth b/adapters/saved-belle1.5m7b/checkpoint-12000/rng_state_15.pth new file mode 100644 index 0000000000000000000000000000000000000000..35efe3113e343ebab043794b32f0ec29424e0575 --- /dev/null +++ b/adapters/saved-belle1.5m7b/checkpoint-12000/rng_state_15.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:03b68ef7ef18123d79323caecfd3810bfcc0c8fe08179845eb5c506434546971 +size 14587 diff --git a/adapters/saved-belle1.5m7b/checkpoint-12000/rng_state_2.pth b/adapters/saved-belle1.5m7b/checkpoint-12000/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..0ef2952845fcc554e3b49bc9b6357de30e0aa675 --- /dev/null +++ b/adapters/saved-belle1.5m7b/checkpoint-12000/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bc92d7bc0d92d7384ae9f3555ef2366316abe063c34ce6d0fa2cd7a792a7dcb7 +size 14583 diff --git a/adapters/saved-belle1.5m7b/checkpoint-12000/rng_state_3.pth b/adapters/saved-belle1.5m7b/checkpoint-12000/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..9509b14700d6d4ac804209cc99082fa13b845b61 --- /dev/null +++ b/adapters/saved-belle1.5m7b/checkpoint-12000/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:055cab7447e26963e1fda5570f20745cd0602742ecda2024524d6436dd2846e9 +size 14583 diff --git a/adapters/saved-belle1.5m7b/checkpoint-12000/rng_state_4.pth b/adapters/saved-belle1.5m7b/checkpoint-12000/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..27ed3556dbf65dc902c82628bf98e53c384ede71 --- /dev/null +++ b/adapters/saved-belle1.5m7b/checkpoint-12000/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8993e2540c52a44df18ef0917b7315b7e3c8abe71278f388787b81b093f0c9a1 +size 14583 diff --git a/adapters/saved-belle1.5m7b/checkpoint-12000/rng_state_5.pth b/adapters/saved-belle1.5m7b/checkpoint-12000/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..1146d362f08efa095f32c9a4a356e8ea5a91451e --- /dev/null +++ b/adapters/saved-belle1.5m7b/checkpoint-12000/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bc999bad90919ef86264316536863f2b508c01039822e78615470c6f5245ec10 +size 14583 diff --git a/adapters/saved-belle1.5m7b/checkpoint-12000/rng_state_6.pth b/adapters/saved-belle1.5m7b/checkpoint-12000/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..ad78b9ea6e8afbda292fa7c370ab2fda489f8230 --- /dev/null +++ b/adapters/saved-belle1.5m7b/checkpoint-12000/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:358d1b9f69ef36850c3b9ca76bff0cab4d329836222bba6f6a04be26753914e2 +size 14583 diff --git a/adapters/saved-belle1.5m7b/checkpoint-12000/rng_state_7.pth b/adapters/saved-belle1.5m7b/checkpoint-12000/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..2b9506937750d6bba24009b7edb81a372044d6ca --- /dev/null +++ b/adapters/saved-belle1.5m7b/checkpoint-12000/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:21ea13098c11f61a949fea89d55dded1469156f3a05face154661b4662bca666 +size 14583 diff --git a/adapters/saved-belle1.5m7b/checkpoint-12000/rng_state_8.pth b/adapters/saved-belle1.5m7b/checkpoint-12000/rng_state_8.pth new file mode 100644 index 0000000000000000000000000000000000000000..db5b9f2b4722939e3dc00c7720e11d9cf4a90ad5 --- /dev/null +++ b/adapters/saved-belle1.5m7b/checkpoint-12000/rng_state_8.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a95b3aaf0c836a91d68d376c873d3f0e606ee32277b95849addb7e425a7515aa +size 14583 diff --git a/adapters/saved-belle1.5m7b/checkpoint-12000/rng_state_9.pth b/adapters/saved-belle1.5m7b/checkpoint-12000/rng_state_9.pth new file mode 100644 index 0000000000000000000000000000000000000000..67390971068c4de2e2f885956d12b7f440358627 --- /dev/null +++ b/adapters/saved-belle1.5m7b/checkpoint-12000/rng_state_9.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dbdb20c6ccba519e01affd1fef6e792da4963ddeda54288d4fdb7386e7bdb23a +size 14583 diff --git a/adapters/saved-belle1.5m7b/checkpoint-12000/scaler.pt b/adapters/saved-belle1.5m7b/checkpoint-12000/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..5a21fd198a6e0482bea54c22ad129bb8d48ae758 --- /dev/null +++ b/adapters/saved-belle1.5m7b/checkpoint-12000/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:faee8c00f1996b039529187e23aee629736db36f2a9250882d8e84865073ef00 +size 557 diff --git a/adapters/saved-belle1.5m7b/checkpoint-12000/scheduler.pt b/adapters/saved-belle1.5m7b/checkpoint-12000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..dca9d54c345317ef5d98e384b6f8915b9020420c --- /dev/null +++ b/adapters/saved-belle1.5m7b/checkpoint-12000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5cbd23beb08227ca4063af6115898de745f0aefcadce8792dfec19af2c67f702 +size 627 diff --git a/adapters/saved-belle1.5m7b/checkpoint-12000/trainer_state.json b/adapters/saved-belle1.5m7b/checkpoint-12000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..0b9126cb878baccbc47bb9c5c4972c749598095b --- /dev/null +++ b/adapters/saved-belle1.5m7b/checkpoint-12000/trainer_state.json @@ -0,0 +1,4096 @@ +{ + "best_metric": 0.763276994228363, + "best_model_checkpoint": "/mnt/bn/qingyi-bn-lq/llama/saved-belle1.5m7b/checkpoint-12000", + "epoch": 0.9965122072745392, + "global_step": 12000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 5.9999999999999995e-05, + "loss": 1.7631, + "step": 20 + }, + { + "epoch": 0.0, + "learning_rate": 0.00011999999999999999, + "loss": 1.4784, + "step": 40 + }, + { + "epoch": 0.0, + "learning_rate": 0.00017999999999999998, + "loss": 1.1332, + "step": 60 + }, + { + "epoch": 0.01, + "learning_rate": 0.00023999999999999998, + "loss": 1.0665, + "step": 80 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003, + "loss": 1.0418, + "step": 100 + }, + { + "epoch": 0.01, + "learning_rate": 0.00029949757159604753, + "loss": 1.0246, + "step": 120 + }, + { + "epoch": 0.01, + "learning_rate": 0.0002989951431920951, + "loss": 1.0054, + "step": 140 + }, + { + "epoch": 0.01, + "learning_rate": 0.00029849271478814266, + "loss": 1.002, + "step": 160 + }, + { + "epoch": 0.01, + "learning_rate": 0.0002979902863841902, + "loss": 0.9908, + "step": 180 + }, + { + "epoch": 0.02, + "learning_rate": 0.0002974878579802378, + "loss": 0.9784, + "step": 200 + }, + { + "epoch": 0.02, + "eval_loss": 0.9920349717140198, + "eval_runtime": 8.2147, + "eval_samples_per_second": 243.465, + "eval_steps_per_second": 1.948, + "step": 200 + }, + { + "epoch": 0.02, + "learning_rate": 0.00029698542957628534, + "loss": 0.9817, + "step": 220 + }, + { + "epoch": 0.02, + "learning_rate": 0.0002964830011723329, + "loss": 0.983, + "step": 240 + }, + { + "epoch": 0.02, + "learning_rate": 0.00029598057276838046, + "loss": 0.9637, + "step": 260 + }, + { + "epoch": 0.02, + "learning_rate": 0.000295478144364428, + "loss": 0.9534, + "step": 280 + }, + { + "epoch": 0.02, + "learning_rate": 0.0002949757159604756, + "loss": 0.9475, + "step": 300 + }, + { + "epoch": 0.03, + "learning_rate": 0.00029447328755652315, + "loss": 0.9502, + "step": 320 + }, + { + "epoch": 0.03, + "learning_rate": 0.0002939708591525707, + "loss": 0.946, + "step": 340 + }, + { + "epoch": 0.03, + "learning_rate": 0.00029346843074861827, + "loss": 0.9545, + "step": 360 + }, + { + "epoch": 0.03, + "learning_rate": 0.00029296600234466583, + "loss": 0.946, + "step": 380 + }, + { + "epoch": 0.03, + "learning_rate": 0.0002924635739407134, + "loss": 0.9295, + "step": 400 + }, + { + "epoch": 0.03, + "eval_loss": 0.9477736949920654, + "eval_runtime": 8.229, + "eval_samples_per_second": 243.042, + "eval_steps_per_second": 1.944, + "step": 400 + }, + { + "epoch": 0.03, + "learning_rate": 0.00029196114553676095, + "loss": 0.9232, + "step": 420 + }, + { + "epoch": 0.04, + "learning_rate": 0.0002914587171328085, + "loss": 0.9201, + "step": 440 + }, + { + "epoch": 0.04, + "learning_rate": 0.00029095628872885613, + "loss": 0.9212, + "step": 460 + }, + { + "epoch": 0.04, + "learning_rate": 0.0002904538603249037, + "loss": 0.9131, + "step": 480 + }, + { + "epoch": 0.04, + "learning_rate": 0.00028995143192095125, + "loss": 0.922, + "step": 500 + }, + { + "epoch": 0.04, + "learning_rate": 0.0002894490035169988, + "loss": 0.9176, + "step": 520 + }, + { + "epoch": 0.04, + "learning_rate": 0.0002889465751130464, + "loss": 0.9177, + "step": 540 + }, + { + "epoch": 0.05, + "learning_rate": 0.00028844414670909393, + "loss": 0.9132, + "step": 560 + }, + { + "epoch": 0.05, + "learning_rate": 0.0002879417183051415, + "loss": 0.921, + "step": 580 + }, + { + "epoch": 0.05, + "learning_rate": 0.00028743928990118906, + "loss": 0.9013, + "step": 600 + }, + { + "epoch": 0.05, + "eval_loss": 0.9194319248199463, + "eval_runtime": 8.23, + "eval_samples_per_second": 243.013, + "eval_steps_per_second": 1.944, + "step": 600 + }, + { + "epoch": 0.05, + "learning_rate": 0.0002869368614972366, + "loss": 0.904, + "step": 620 + }, + { + "epoch": 0.05, + "learning_rate": 0.0002864344330932842, + "loss": 0.9016, + "step": 640 + }, + { + "epoch": 0.05, + "learning_rate": 0.00028593200468933174, + "loss": 0.8931, + "step": 660 + }, + { + "epoch": 0.06, + "learning_rate": 0.0002854295762853793, + "loss": 0.9, + "step": 680 + }, + { + "epoch": 0.06, + "learning_rate": 0.00028492714788142686, + "loss": 0.896, + "step": 700 + }, + { + "epoch": 0.06, + "learning_rate": 0.0002844247194774744, + "loss": 0.8982, + "step": 720 + }, + { + "epoch": 0.06, + "learning_rate": 0.00028392229107352204, + "loss": 0.9, + "step": 740 + }, + { + "epoch": 0.06, + "learning_rate": 0.0002834198626695696, + "loss": 0.8861, + "step": 760 + }, + { + "epoch": 0.06, + "learning_rate": 0.00028291743426561716, + "loss": 0.8926, + "step": 780 + }, + { + "epoch": 0.07, + "learning_rate": 0.0002824150058616647, + "loss": 0.8853, + "step": 800 + }, + { + "epoch": 0.07, + "eval_loss": 0.8987648487091064, + "eval_runtime": 8.2248, + "eval_samples_per_second": 243.167, + "eval_steps_per_second": 1.945, + "step": 800 + }, + { + "epoch": 0.07, + "learning_rate": 0.0002819125774577123, + "loss": 0.8922, + "step": 820 + }, + { + "epoch": 0.07, + "learning_rate": 0.00028141014905375984, + "loss": 0.8813, + "step": 840 + }, + { + "epoch": 0.07, + "learning_rate": 0.0002809077206498074, + "loss": 0.8825, + "step": 860 + }, + { + "epoch": 0.07, + "learning_rate": 0.00028040529224585497, + "loss": 0.8821, + "step": 880 + }, + { + "epoch": 0.07, + "learning_rate": 0.00027990286384190253, + "loss": 0.8853, + "step": 900 + }, + { + "epoch": 0.08, + "learning_rate": 0.0002794004354379501, + "loss": 0.8806, + "step": 920 + }, + { + "epoch": 0.08, + "learning_rate": 0.00027889800703399765, + "loss": 0.8774, + "step": 940 + }, + { + "epoch": 0.08, + "learning_rate": 0.0002783955786300452, + "loss": 0.8783, + "step": 960 + }, + { + "epoch": 0.08, + "learning_rate": 0.00027789315022609277, + "loss": 0.876, + "step": 980 + }, + { + "epoch": 0.08, + "learning_rate": 0.00027739072182214033, + "loss": 0.8656, + "step": 1000 + }, + { + "epoch": 0.08, + "eval_loss": 0.8842049241065979, + "eval_runtime": 8.2372, + "eval_samples_per_second": 242.8, + "eval_steps_per_second": 1.942, + "step": 1000 + }, + { + "epoch": 0.08, + "learning_rate": 0.0002768882934181879, + "loss": 0.8776, + "step": 1020 + }, + { + "epoch": 0.09, + "learning_rate": 0.00027638586501423546, + "loss": 0.8751, + "step": 1040 + }, + { + "epoch": 0.09, + "learning_rate": 0.000275883436610283, + "loss": 0.8701, + "step": 1060 + }, + { + "epoch": 0.09, + "learning_rate": 0.0002753810082063306, + "loss": 0.8618, + "step": 1080 + }, + { + "epoch": 0.09, + "learning_rate": 0.00027487857980237814, + "loss": 0.8604, + "step": 1100 + }, + { + "epoch": 0.09, + "learning_rate": 0.0002743761513984257, + "loss": 0.8516, + "step": 1120 + }, + { + "epoch": 0.09, + "learning_rate": 0.00027387372299447326, + "loss": 0.8553, + "step": 1140 + }, + { + "epoch": 0.1, + "learning_rate": 0.0002733712945905208, + "loss": 0.8638, + "step": 1160 + }, + { + "epoch": 0.1, + "learning_rate": 0.0002728688661865684, + "loss": 0.8528, + "step": 1180 + }, + { + "epoch": 0.1, + "learning_rate": 0.00027236643778261595, + "loss": 0.8641, + "step": 1200 + }, + { + "epoch": 0.1, + "eval_loss": 0.871113121509552, + "eval_runtime": 8.2308, + "eval_samples_per_second": 242.989, + "eval_steps_per_second": 1.944, + "step": 1200 + }, + { + "epoch": 0.1, + "learning_rate": 0.0002718640093786635, + "loss": 0.8656, + "step": 1220 + }, + { + "epoch": 0.1, + "learning_rate": 0.00027136158097471107, + "loss": 0.8534, + "step": 1240 + }, + { + "epoch": 0.1, + "learning_rate": 0.00027085915257075863, + "loss": 0.8512, + "step": 1260 + }, + { + "epoch": 0.11, + "learning_rate": 0.0002703567241668062, + "loss": 0.8472, + "step": 1280 + }, + { + "epoch": 0.11, + "learning_rate": 0.00026985429576285375, + "loss": 0.8584, + "step": 1300 + }, + { + "epoch": 0.11, + "learning_rate": 0.0002693518673589013, + "loss": 0.8557, + "step": 1320 + }, + { + "epoch": 0.11, + "learning_rate": 0.0002688494389549489, + "loss": 0.8547, + "step": 1340 + }, + { + "epoch": 0.11, + "learning_rate": 0.00026834701055099643, + "loss": 0.8576, + "step": 1360 + }, + { + "epoch": 0.11, + "learning_rate": 0.000267844582147044, + "loss": 0.8499, + "step": 1380 + }, + { + "epoch": 0.12, + "learning_rate": 0.00026734215374309156, + "loss": 0.839, + "step": 1400 + }, + { + "epoch": 0.12, + "eval_loss": 0.8613501191139221, + "eval_runtime": 8.2712, + "eval_samples_per_second": 241.802, + "eval_steps_per_second": 1.934, + "step": 1400 + }, + { + "epoch": 0.12, + "learning_rate": 0.0002668397253391391, + "loss": 0.8538, + "step": 1420 + }, + { + "epoch": 0.12, + "learning_rate": 0.0002663372969351867, + "loss": 0.8541, + "step": 1440 + }, + { + "epoch": 0.12, + "learning_rate": 0.0002658348685312343, + "loss": 0.8381, + "step": 1460 + }, + { + "epoch": 0.12, + "learning_rate": 0.00026533244012728186, + "loss": 0.8527, + "step": 1480 + }, + { + "epoch": 0.12, + "learning_rate": 0.0002648300117233294, + "loss": 0.8493, + "step": 1500 + }, + { + "epoch": 0.13, + "learning_rate": 0.000264327583319377, + "loss": 0.846, + "step": 1520 + }, + { + "epoch": 0.13, + "learning_rate": 0.00026382515491542454, + "loss": 0.8435, + "step": 1540 + }, + { + "epoch": 0.13, + "learning_rate": 0.0002633227265114721, + "loss": 0.8517, + "step": 1560 + }, + { + "epoch": 0.13, + "learning_rate": 0.00026282029810751966, + "loss": 0.8403, + "step": 1580 + }, + { + "epoch": 0.13, + "learning_rate": 0.0002623178697035672, + "loss": 0.8419, + "step": 1600 + }, + { + "epoch": 0.13, + "eval_loss": 0.8518173098564148, + "eval_runtime": 8.2505, + "eval_samples_per_second": 242.411, + "eval_steps_per_second": 1.939, + "step": 1600 + }, + { + "epoch": 0.13, + "learning_rate": 0.0002618154412996148, + "loss": 0.8393, + "step": 1620 + }, + { + "epoch": 0.14, + "learning_rate": 0.00026131301289566234, + "loss": 0.8471, + "step": 1640 + }, + { + "epoch": 0.14, + "learning_rate": 0.0002608105844917099, + "loss": 0.85, + "step": 1660 + }, + { + "epoch": 0.14, + "learning_rate": 0.00026030815608775747, + "loss": 0.8388, + "step": 1680 + }, + { + "epoch": 0.14, + "learning_rate": 0.00025980572768380503, + "loss": 0.8394, + "step": 1700 + }, + { + "epoch": 0.14, + "learning_rate": 0.0002593032992798526, + "loss": 0.8333, + "step": 1720 + }, + { + "epoch": 0.14, + "learning_rate": 0.00025880087087590015, + "loss": 0.8343, + "step": 1740 + }, + { + "epoch": 0.15, + "learning_rate": 0.0002582984424719477, + "loss": 0.8303, + "step": 1760 + }, + { + "epoch": 0.15, + "learning_rate": 0.00025779601406799533, + "loss": 0.8247, + "step": 1780 + }, + { + "epoch": 0.15, + "learning_rate": 0.0002572935856640429, + "loss": 0.8282, + "step": 1800 + }, + { + "epoch": 0.15, + "eval_loss": 0.8434953689575195, + "eval_runtime": 8.2633, + "eval_samples_per_second": 242.034, + "eval_steps_per_second": 1.936, + "step": 1800 + }, + { + "epoch": 0.15, + "learning_rate": 0.00025679115726009045, + "loss": 0.8348, + "step": 1820 + }, + { + "epoch": 0.15, + "learning_rate": 0.000256288728856138, + "loss": 0.8393, + "step": 1840 + }, + { + "epoch": 0.15, + "learning_rate": 0.00025578630045218557, + "loss": 0.8304, + "step": 1860 + }, + { + "epoch": 0.16, + "learning_rate": 0.00025528387204823313, + "loss": 0.8309, + "step": 1880 + }, + { + "epoch": 0.16, + "learning_rate": 0.0002547814436442807, + "loss": 0.8382, + "step": 1900 + }, + { + "epoch": 0.16, + "learning_rate": 0.00025427901524032826, + "loss": 0.841, + "step": 1920 + }, + { + "epoch": 0.16, + "learning_rate": 0.0002537765868363758, + "loss": 0.8354, + "step": 1940 + }, + { + "epoch": 0.16, + "learning_rate": 0.0002532741584324234, + "loss": 0.8334, + "step": 1960 + }, + { + "epoch": 0.16, + "learning_rate": 0.00025277173002847094, + "loss": 0.8243, + "step": 1980 + }, + { + "epoch": 0.17, + "learning_rate": 0.0002522693016245185, + "loss": 0.8337, + "step": 2000 + }, + { + "epoch": 0.17, + "eval_loss": 0.8386329412460327, + "eval_runtime": 8.2418, + "eval_samples_per_second": 242.666, + "eval_steps_per_second": 1.941, + "step": 2000 + }, + { + "epoch": 0.17, + "learning_rate": 0.00025176687322056606, + "loss": 0.8237, + "step": 2020 + }, + { + "epoch": 0.17, + "learning_rate": 0.0002512644448166136, + "loss": 0.8206, + "step": 2040 + }, + { + "epoch": 0.17, + "learning_rate": 0.0002507620164126612, + "loss": 0.8277, + "step": 2060 + }, + { + "epoch": 0.17, + "learning_rate": 0.00025025958800870874, + "loss": 0.8271, + "step": 2080 + }, + { + "epoch": 0.17, + "learning_rate": 0.0002497571596047563, + "loss": 0.8353, + "step": 2100 + }, + { + "epoch": 0.18, + "learning_rate": 0.00024925473120080387, + "loss": 0.8253, + "step": 2120 + }, + { + "epoch": 0.18, + "learning_rate": 0.00024875230279685143, + "loss": 0.8212, + "step": 2140 + }, + { + "epoch": 0.18, + "learning_rate": 0.000248249874392899, + "loss": 0.8192, + "step": 2160 + }, + { + "epoch": 0.18, + "learning_rate": 0.00024774744598894655, + "loss": 0.8292, + "step": 2180 + }, + { + "epoch": 0.18, + "learning_rate": 0.0002472450175849941, + "loss": 0.825, + "step": 2200 + }, + { + "epoch": 0.18, + "eval_loss": 0.8326684236526489, + "eval_runtime": 8.2602, + "eval_samples_per_second": 242.126, + "eval_steps_per_second": 1.937, + "step": 2200 + }, + { + "epoch": 0.18, + "learning_rate": 0.00024674258918104167, + "loss": 0.8308, + "step": 2220 + }, + { + "epoch": 0.19, + "learning_rate": 0.00024624016077708923, + "loss": 0.8324, + "step": 2240 + }, + { + "epoch": 0.19, + "learning_rate": 0.0002457377323731368, + "loss": 0.8257, + "step": 2260 + }, + { + "epoch": 0.19, + "learning_rate": 0.00024523530396918436, + "loss": 0.8181, + "step": 2280 + }, + { + "epoch": 0.19, + "learning_rate": 0.0002447328755652319, + "loss": 0.8126, + "step": 2300 + }, + { + "epoch": 0.19, + "learning_rate": 0.0002442304471612795, + "loss": 0.819, + "step": 2320 + }, + { + "epoch": 0.19, + "learning_rate": 0.00024372801875732707, + "loss": 0.8198, + "step": 2340 + }, + { + "epoch": 0.2, + "learning_rate": 0.00024322559035337463, + "loss": 0.8188, + "step": 2360 + }, + { + "epoch": 0.2, + "learning_rate": 0.0002427231619494222, + "loss": 0.8145, + "step": 2380 + }, + { + "epoch": 0.2, + "learning_rate": 0.00024222073354546975, + "loss": 0.81, + "step": 2400 + }, + { + "epoch": 0.2, + "eval_loss": 0.8278167843818665, + "eval_runtime": 8.2577, + "eval_samples_per_second": 242.198, + "eval_steps_per_second": 1.938, + "step": 2400 + }, + { + "epoch": 0.2, + "learning_rate": 0.0002417183051415173, + "loss": 0.8142, + "step": 2420 + }, + { + "epoch": 0.2, + "learning_rate": 0.00024121587673756487, + "loss": 0.8059, + "step": 2440 + }, + { + "epoch": 0.2, + "learning_rate": 0.00024071344833361243, + "loss": 0.826, + "step": 2460 + }, + { + "epoch": 0.21, + "learning_rate": 0.00024021101992966, + "loss": 0.8173, + "step": 2480 + }, + { + "epoch": 0.21, + "learning_rate": 0.00023970859152570756, + "loss": 0.8063, + "step": 2500 + }, + { + "epoch": 0.21, + "learning_rate": 0.00023920616312175512, + "loss": 0.812, + "step": 2520 + }, + { + "epoch": 0.21, + "learning_rate": 0.00023870373471780268, + "loss": 0.82, + "step": 2540 + }, + { + "epoch": 0.21, + "learning_rate": 0.00023820130631385024, + "loss": 0.8207, + "step": 2560 + }, + { + "epoch": 0.21, + "learning_rate": 0.0002376988779098978, + "loss": 0.8113, + "step": 2580 + }, + { + "epoch": 0.22, + "learning_rate": 0.00023719644950594536, + "loss": 0.8175, + "step": 2600 + }, + { + "epoch": 0.22, + "eval_loss": 0.8231886029243469, + "eval_runtime": 8.2627, + "eval_samples_per_second": 242.052, + "eval_steps_per_second": 1.936, + "step": 2600 + }, + { + "epoch": 0.22, + "learning_rate": 0.00023669402110199295, + "loss": 0.8105, + "step": 2620 + }, + { + "epoch": 0.22, + "learning_rate": 0.0002361915926980405, + "loss": 0.8106, + "step": 2640 + }, + { + "epoch": 0.22, + "learning_rate": 0.00023568916429408807, + "loss": 0.8114, + "step": 2660 + }, + { + "epoch": 0.22, + "learning_rate": 0.00023518673589013563, + "loss": 0.8152, + "step": 2680 + }, + { + "epoch": 0.22, + "learning_rate": 0.0002346843074861832, + "loss": 0.8059, + "step": 2700 + }, + { + "epoch": 0.23, + "learning_rate": 0.00023418187908223076, + "loss": 0.8065, + "step": 2720 + }, + { + "epoch": 0.23, + "learning_rate": 0.00023367945067827832, + "loss": 0.8147, + "step": 2740 + }, + { + "epoch": 0.23, + "learning_rate": 0.00023317702227432588, + "loss": 0.8006, + "step": 2760 + }, + { + "epoch": 0.23, + "learning_rate": 0.00023267459387037344, + "loss": 0.805, + "step": 2780 + }, + { + "epoch": 0.23, + "learning_rate": 0.000232172165466421, + "loss": 0.8142, + "step": 2800 + }, + { + "epoch": 0.23, + "eval_loss": 0.8191845417022705, + "eval_runtime": 8.28, + "eval_samples_per_second": 241.547, + "eval_steps_per_second": 1.932, + "step": 2800 + }, + { + "epoch": 0.23, + "learning_rate": 0.00023166973706246856, + "loss": 0.8134, + "step": 2820 + }, + { + "epoch": 0.24, + "learning_rate": 0.00023116730865851612, + "loss": 0.8066, + "step": 2840 + }, + { + "epoch": 0.24, + "learning_rate": 0.00023066488025456368, + "loss": 0.8049, + "step": 2860 + }, + { + "epoch": 0.24, + "learning_rate": 0.00023016245185061124, + "loss": 0.7987, + "step": 2880 + }, + { + "epoch": 0.24, + "learning_rate": 0.00022966002344665886, + "loss": 0.8113, + "step": 2900 + }, + { + "epoch": 0.24, + "learning_rate": 0.00022915759504270642, + "loss": 0.8086, + "step": 2920 + }, + { + "epoch": 0.24, + "learning_rate": 0.00022865516663875398, + "loss": 0.8019, + "step": 2940 + }, + { + "epoch": 0.25, + "learning_rate": 0.00022815273823480154, + "loss": 0.8051, + "step": 2960 + }, + { + "epoch": 0.25, + "learning_rate": 0.0002276503098308491, + "loss": 0.8032, + "step": 2980 + }, + { + "epoch": 0.25, + "learning_rate": 0.00022714788142689667, + "loss": 0.8058, + "step": 3000 + }, + { + "epoch": 0.25, + "eval_loss": 0.8153129816055298, + "eval_runtime": 8.2727, + "eval_samples_per_second": 241.759, + "eval_steps_per_second": 1.934, + "step": 3000 + }, + { + "epoch": 0.25, + "learning_rate": 0.00022664545302294423, + "loss": 0.801, + "step": 3020 + }, + { + "epoch": 0.25, + "learning_rate": 0.0002261430246189918, + "loss": 0.801, + "step": 3040 + }, + { + "epoch": 0.25, + "learning_rate": 0.00022564059621503935, + "loss": 0.8037, + "step": 3060 + }, + { + "epoch": 0.26, + "learning_rate": 0.0002251381678110869, + "loss": 0.7992, + "step": 3080 + }, + { + "epoch": 0.26, + "learning_rate": 0.00022463573940713447, + "loss": 0.8036, + "step": 3100 + }, + { + "epoch": 0.26, + "learning_rate": 0.00022413331100318203, + "loss": 0.8064, + "step": 3120 + }, + { + "epoch": 0.26, + "learning_rate": 0.0002236308825992296, + "loss": 0.7966, + "step": 3140 + }, + { + "epoch": 0.26, + "learning_rate": 0.00022312845419527716, + "loss": 0.8047, + "step": 3160 + }, + { + "epoch": 0.26, + "learning_rate": 0.00022262602579132472, + "loss": 0.7986, + "step": 3180 + }, + { + "epoch": 0.27, + "learning_rate": 0.00022212359738737228, + "loss": 0.8026, + "step": 3200 + }, + { + "epoch": 0.27, + "eval_loss": 0.8119255304336548, + "eval_runtime": 8.288, + "eval_samples_per_second": 241.313, + "eval_steps_per_second": 1.931, + "step": 3200 + }, + { + "epoch": 0.27, + "learning_rate": 0.00022162116898341984, + "loss": 0.8028, + "step": 3220 + }, + { + "epoch": 0.27, + "learning_rate": 0.0002211187405794674, + "loss": 0.7987, + "step": 3240 + }, + { + "epoch": 0.27, + "learning_rate": 0.000220616312175515, + "loss": 0.8148, + "step": 3260 + }, + { + "epoch": 0.27, + "learning_rate": 0.00022011388377156255, + "loss": 0.7932, + "step": 3280 + }, + { + "epoch": 0.27, + "learning_rate": 0.0002196114553676101, + "loss": 0.8027, + "step": 3300 + }, + { + "epoch": 0.28, + "learning_rate": 0.00021910902696365767, + "loss": 0.7989, + "step": 3320 + }, + { + "epoch": 0.28, + "learning_rate": 0.00021860659855970523, + "loss": 0.7983, + "step": 3340 + }, + { + "epoch": 0.28, + "learning_rate": 0.0002181041701557528, + "loss": 0.7977, + "step": 3360 + }, + { + "epoch": 0.28, + "learning_rate": 0.00021760174175180035, + "loss": 0.7972, + "step": 3380 + }, + { + "epoch": 0.28, + "learning_rate": 0.00021709931334784792, + "loss": 0.8025, + "step": 3400 + }, + { + "epoch": 0.28, + "eval_loss": 0.8084473609924316, + "eval_runtime": 8.2772, + "eval_samples_per_second": 241.627, + "eval_steps_per_second": 1.933, + "step": 3400 + }, + { + "epoch": 0.28, + "learning_rate": 0.00021659688494389548, + "loss": 0.7993, + "step": 3420 + }, + { + "epoch": 0.29, + "learning_rate": 0.00021609445653994304, + "loss": 0.7919, + "step": 3440 + }, + { + "epoch": 0.29, + "learning_rate": 0.0002155920281359906, + "loss": 0.8062, + "step": 3460 + }, + { + "epoch": 0.29, + "learning_rate": 0.00021508959973203816, + "loss": 0.7937, + "step": 3480 + }, + { + "epoch": 0.29, + "learning_rate": 0.00021458717132808572, + "loss": 0.7947, + "step": 3500 + }, + { + "epoch": 0.29, + "learning_rate": 0.00021408474292413328, + "loss": 0.7973, + "step": 3520 + }, + { + "epoch": 0.29, + "learning_rate": 0.00021358231452018084, + "loss": 0.7964, + "step": 3540 + }, + { + "epoch": 0.3, + "learning_rate": 0.0002130798861162284, + "loss": 0.7994, + "step": 3560 + }, + { + "epoch": 0.3, + "learning_rate": 0.00021257745771227597, + "loss": 0.7911, + "step": 3580 + }, + { + "epoch": 0.3, + "learning_rate": 0.00021207502930832353, + "loss": 0.805, + "step": 3600 + }, + { + "epoch": 0.3, + "eval_loss": 0.8064665794372559, + "eval_runtime": 8.2739, + "eval_samples_per_second": 241.725, + "eval_steps_per_second": 1.934, + "step": 3600 + }, + { + "epoch": 0.3, + "learning_rate": 0.00021157260090437112, + "loss": 0.8014, + "step": 3620 + }, + { + "epoch": 0.3, + "learning_rate": 0.00021107017250041868, + "loss": 0.7941, + "step": 3640 + }, + { + "epoch": 0.3, + "learning_rate": 0.00021056774409646624, + "loss": 0.793, + "step": 3660 + }, + { + "epoch": 0.31, + "learning_rate": 0.0002100653156925138, + "loss": 0.7918, + "step": 3680 + }, + { + "epoch": 0.31, + "learning_rate": 0.00020956288728856136, + "loss": 0.7926, + "step": 3700 + }, + { + "epoch": 0.31, + "learning_rate": 0.00020906045888460892, + "loss": 0.797, + "step": 3720 + }, + { + "epoch": 0.31, + "learning_rate": 0.00020855803048065648, + "loss": 0.7965, + "step": 3740 + }, + { + "epoch": 0.31, + "learning_rate": 0.00020805560207670404, + "loss": 0.7963, + "step": 3760 + }, + { + "epoch": 0.31, + "learning_rate": 0.0002075531736727516, + "loss": 0.7924, + "step": 3780 + }, + { + "epoch": 0.32, + "learning_rate": 0.00020705074526879917, + "loss": 0.788, + "step": 3800 + }, + { + "epoch": 0.32, + "eval_loss": 0.8037804961204529, + "eval_runtime": 8.3133, + "eval_samples_per_second": 240.579, + "eval_steps_per_second": 1.925, + "step": 3800 + }, + { + "epoch": 0.32, + "learning_rate": 0.00020654831686484673, + "loss": 0.794, + "step": 3820 + }, + { + "epoch": 0.32, + "learning_rate": 0.0002060458884608943, + "loss": 0.7946, + "step": 3840 + }, + { + "epoch": 0.32, + "learning_rate": 0.00020554346005694185, + "loss": 0.7934, + "step": 3860 + }, + { + "epoch": 0.32, + "learning_rate": 0.0002050410316529894, + "loss": 0.7935, + "step": 3880 + }, + { + "epoch": 0.32, + "learning_rate": 0.00020453860324903697, + "loss": 0.7864, + "step": 3900 + }, + { + "epoch": 0.33, + "learning_rate": 0.00020403617484508453, + "loss": 0.8026, + "step": 3920 + }, + { + "epoch": 0.33, + "learning_rate": 0.0002035337464411321, + "loss": 0.7902, + "step": 3940 + }, + { + "epoch": 0.33, + "learning_rate": 0.00020303131803717966, + "loss": 0.7915, + "step": 3960 + }, + { + "epoch": 0.33, + "learning_rate": 0.00020252888963322724, + "loss": 0.798, + "step": 3980 + }, + { + "epoch": 0.33, + "learning_rate": 0.0002020264612292748, + "loss": 0.7989, + "step": 4000 + }, + { + "epoch": 0.33, + "eval_loss": 0.8007607460021973, + "eval_runtime": 8.2882, + "eval_samples_per_second": 241.308, + "eval_steps_per_second": 1.93, + "step": 4000 + }, + { + "epoch": 0.33, + "learning_rate": 0.0002015240328253224, + "loss": 0.7831, + "step": 4020 + }, + { + "epoch": 0.34, + "learning_rate": 0.00020102160442136995, + "loss": 0.7877, + "step": 4040 + }, + { + "epoch": 0.34, + "learning_rate": 0.00020051917601741752, + "loss": 0.7855, + "step": 4060 + }, + { + "epoch": 0.34, + "learning_rate": 0.00020001674761346508, + "loss": 0.7889, + "step": 4080 + }, + { + "epoch": 0.34, + "learning_rate": 0.00019951431920951264, + "loss": 0.7958, + "step": 4100 + }, + { + "epoch": 0.34, + "learning_rate": 0.0001990118908055602, + "loss": 0.7934, + "step": 4120 + }, + { + "epoch": 0.34, + "learning_rate": 0.00019850946240160776, + "loss": 0.785, + "step": 4140 + }, + { + "epoch": 0.35, + "learning_rate": 0.00019800703399765532, + "loss": 0.7987, + "step": 4160 + }, + { + "epoch": 0.35, + "learning_rate": 0.00019750460559370288, + "loss": 0.7886, + "step": 4180 + }, + { + "epoch": 0.35, + "learning_rate": 0.00019702729860994808, + "loss": 0.79, + "step": 4200 + }, + { + "epoch": 0.35, + "eval_loss": 0.7986196279525757, + "eval_runtime": 8.2739, + "eval_samples_per_second": 241.724, + "eval_steps_per_second": 1.934, + "step": 4200 + }, + { + "epoch": 0.35, + "learning_rate": 0.00019652487020599565, + "loss": 0.7889, + "step": 4220 + }, + { + "epoch": 0.35, + "learning_rate": 0.0001960224418020432, + "loss": 0.7883, + "step": 4240 + }, + { + "epoch": 0.35, + "learning_rate": 0.00019552001339809077, + "loss": 0.7895, + "step": 4260 + }, + { + "epoch": 0.36, + "learning_rate": 0.00019501758499413833, + "loss": 0.7838, + "step": 4280 + }, + { + "epoch": 0.36, + "learning_rate": 0.0001945151565901859, + "loss": 0.7957, + "step": 4300 + }, + { + "epoch": 0.36, + "learning_rate": 0.00019401272818623345, + "loss": 0.7915, + "step": 4320 + }, + { + "epoch": 0.36, + "learning_rate": 0.000193510299782281, + "loss": 0.7987, + "step": 4340 + }, + { + "epoch": 0.36, + "learning_rate": 0.00019300787137832857, + "loss": 0.7811, + "step": 4360 + }, + { + "epoch": 0.36, + "learning_rate": 0.00019250544297437613, + "loss": 0.7921, + "step": 4380 + }, + { + "epoch": 0.37, + "learning_rate": 0.0001920030145704237, + "loss": 0.7928, + "step": 4400 + }, + { + "epoch": 0.37, + "eval_loss": 0.7964197993278503, + "eval_runtime": 8.2761, + "eval_samples_per_second": 241.66, + "eval_steps_per_second": 1.933, + "step": 4400 + }, + { + "epoch": 0.37, + "learning_rate": 0.00019150058616647126, + "loss": 0.7924, + "step": 4420 + }, + { + "epoch": 0.37, + "learning_rate": 0.00019099815776251885, + "loss": 0.7898, + "step": 4440 + }, + { + "epoch": 0.37, + "learning_rate": 0.0001904957293585664, + "loss": 0.7893, + "step": 4460 + }, + { + "epoch": 0.37, + "learning_rate": 0.00018999330095461397, + "loss": 0.795, + "step": 4480 + }, + { + "epoch": 0.37, + "learning_rate": 0.00018949087255066153, + "loss": 0.7832, + "step": 4500 + }, + { + "epoch": 0.38, + "learning_rate": 0.0001889884441467091, + "loss": 0.7835, + "step": 4520 + }, + { + "epoch": 0.38, + "learning_rate": 0.00018848601574275665, + "loss": 0.7809, + "step": 4540 + }, + { + "epoch": 0.38, + "learning_rate": 0.0001879835873388042, + "loss": 0.7792, + "step": 4560 + }, + { + "epoch": 0.38, + "learning_rate": 0.00018748115893485177, + "loss": 0.7884, + "step": 4580 + }, + { + "epoch": 0.38, + "learning_rate": 0.00018697873053089933, + "loss": 0.7859, + "step": 4600 + }, + { + "epoch": 0.38, + "eval_loss": 0.794753909111023, + "eval_runtime": 8.2787, + "eval_samples_per_second": 241.584, + "eval_steps_per_second": 1.933, + "step": 4600 + }, + { + "epoch": 0.38, + "learning_rate": 0.0001864763021269469, + "loss": 0.7709, + "step": 4620 + }, + { + "epoch": 0.39, + "learning_rate": 0.00018597387372299446, + "loss": 0.7786, + "step": 4640 + }, + { + "epoch": 0.39, + "learning_rate": 0.00018547144531904202, + "loss": 0.7786, + "step": 4660 + }, + { + "epoch": 0.39, + "learning_rate": 0.00018496901691508958, + "loss": 0.7824, + "step": 4680 + }, + { + "epoch": 0.39, + "learning_rate": 0.00018446658851113714, + "loss": 0.7741, + "step": 4700 + }, + { + "epoch": 0.39, + "learning_rate": 0.0001839641601071847, + "loss": 0.78, + "step": 4720 + }, + { + "epoch": 0.39, + "learning_rate": 0.00018346173170323226, + "loss": 0.782, + "step": 4740 + }, + { + "epoch": 0.4, + "learning_rate": 0.00018295930329927982, + "loss": 0.7808, + "step": 4760 + }, + { + "epoch": 0.4, + "learning_rate": 0.0001824568748953274, + "loss": 0.7776, + "step": 4780 + }, + { + "epoch": 0.4, + "learning_rate": 0.00018195444649137497, + "loss": 0.7824, + "step": 4800 + }, + { + "epoch": 0.4, + "eval_loss": 0.7927345633506775, + "eval_runtime": 8.2706, + "eval_samples_per_second": 241.82, + "eval_steps_per_second": 1.935, + "step": 4800 + }, + { + "epoch": 0.4, + "learning_rate": 0.00018145201808742253, + "loss": 0.7843, + "step": 4820 + }, + { + "epoch": 0.4, + "learning_rate": 0.0001809495896834701, + "loss": 0.7908, + "step": 4840 + }, + { + "epoch": 0.4, + "learning_rate": 0.00018044716127951766, + "loss": 0.7872, + "step": 4860 + }, + { + "epoch": 0.41, + "learning_rate": 0.00017994473287556522, + "loss": 0.7763, + "step": 4880 + }, + { + "epoch": 0.41, + "learning_rate": 0.00017944230447161278, + "loss": 0.7846, + "step": 4900 + }, + { + "epoch": 0.41, + "learning_rate": 0.00017893987606766034, + "loss": 0.7775, + "step": 4920 + }, + { + "epoch": 0.41, + "learning_rate": 0.0001784374476637079, + "loss": 0.793, + "step": 4940 + }, + { + "epoch": 0.41, + "learning_rate": 0.00017793501925975546, + "loss": 0.7814, + "step": 4960 + }, + { + "epoch": 0.41, + "learning_rate": 0.00017743259085580302, + "loss": 0.7913, + "step": 4980 + }, + { + "epoch": 0.42, + "learning_rate": 0.00017693016245185058, + "loss": 0.7755, + "step": 5000 + }, + { + "epoch": 0.42, + "eval_loss": 0.7912722826004028, + "eval_runtime": 8.285, + "eval_samples_per_second": 241.401, + "eval_steps_per_second": 1.931, + "step": 5000 + }, + { + "epoch": 0.42, + "learning_rate": 0.00017642773404789815, + "loss": 0.7848, + "step": 5020 + }, + { + "epoch": 0.42, + "learning_rate": 0.0001759253056439457, + "loss": 0.7826, + "step": 5040 + }, + { + "epoch": 0.42, + "learning_rate": 0.00017542287723999327, + "loss": 0.7753, + "step": 5060 + }, + { + "epoch": 0.42, + "learning_rate": 0.00017492044883604083, + "loss": 0.7786, + "step": 5080 + }, + { + "epoch": 0.42, + "learning_rate": 0.0001744180204320884, + "loss": 0.7929, + "step": 5100 + }, + { + "epoch": 0.43, + "learning_rate": 0.00017391559202813595, + "loss": 0.7836, + "step": 5120 + }, + { + "epoch": 0.43, + "learning_rate": 0.00017341316362418354, + "loss": 0.7802, + "step": 5140 + }, + { + "epoch": 0.43, + "learning_rate": 0.0001729107352202311, + "loss": 0.7808, + "step": 5160 + }, + { + "epoch": 0.43, + "learning_rate": 0.00017240830681627866, + "loss": 0.7784, + "step": 5180 + }, + { + "epoch": 0.43, + "learning_rate": 0.00017190587841232622, + "loss": 0.7803, + "step": 5200 + }, + { + "epoch": 0.43, + "eval_loss": 0.7892646193504333, + "eval_runtime": 8.297, + "eval_samples_per_second": 241.05, + "eval_steps_per_second": 1.928, + "step": 5200 + }, + { + "epoch": 0.43, + "learning_rate": 0.00017140345000837378, + "loss": 0.7872, + "step": 5220 + }, + { + "epoch": 0.44, + "learning_rate": 0.00017090102160442135, + "loss": 0.7795, + "step": 5240 + }, + { + "epoch": 0.44, + "learning_rate": 0.0001703985932004689, + "loss": 0.7777, + "step": 5260 + }, + { + "epoch": 0.44, + "learning_rate": 0.00016989616479651647, + "loss": 0.7775, + "step": 5280 + }, + { + "epoch": 0.44, + "learning_rate": 0.00016939373639256406, + "loss": 0.7789, + "step": 5300 + }, + { + "epoch": 0.44, + "learning_rate": 0.00016889130798861162, + "loss": 0.7863, + "step": 5320 + }, + { + "epoch": 0.44, + "learning_rate": 0.00016838887958465918, + "loss": 0.7774, + "step": 5340 + }, + { + "epoch": 0.45, + "learning_rate": 0.00016788645118070674, + "loss": 0.7856, + "step": 5360 + }, + { + "epoch": 0.45, + "learning_rate": 0.0001673840227767543, + "loss": 0.78, + "step": 5380 + }, + { + "epoch": 0.45, + "learning_rate": 0.00016688159437280186, + "loss": 0.7929, + "step": 5400 + }, + { + "epoch": 0.45, + "eval_loss": 0.7884517908096313, + "eval_runtime": 8.3482, + "eval_samples_per_second": 239.571, + "eval_steps_per_second": 1.917, + "step": 5400 + }, + { + "epoch": 0.45, + "learning_rate": 0.00016637916596884945, + "loss": 0.7728, + "step": 5420 + }, + { + "epoch": 0.45, + "learning_rate": 0.000165876737564897, + "loss": 0.7827, + "step": 5440 + }, + { + "epoch": 0.45, + "learning_rate": 0.00016537430916094457, + "loss": 0.767, + "step": 5460 + }, + { + "epoch": 0.46, + "learning_rate": 0.00016487188075699213, + "loss": 0.7768, + "step": 5480 + }, + { + "epoch": 0.46, + "learning_rate": 0.0001643694523530397, + "loss": 0.776, + "step": 5500 + }, + { + "epoch": 0.46, + "learning_rate": 0.00016386702394908726, + "loss": 0.77, + "step": 5520 + }, + { + "epoch": 0.46, + "learning_rate": 0.00016336459554513482, + "loss": 0.7825, + "step": 5540 + }, + { + "epoch": 0.46, + "learning_rate": 0.00016286216714118238, + "loss": 0.7829, + "step": 5560 + }, + { + "epoch": 0.46, + "learning_rate": 0.00016235973873722994, + "loss": 0.791, + "step": 5580 + }, + { + "epoch": 0.47, + "learning_rate": 0.0001618573103332775, + "loss": 0.7807, + "step": 5600 + }, + { + "epoch": 0.47, + "eval_loss": 0.7869579792022705, + "eval_runtime": 8.297, + "eval_samples_per_second": 241.051, + "eval_steps_per_second": 1.928, + "step": 5600 + }, + { + "epoch": 0.47, + "learning_rate": 0.00016135488192932506, + "loss": 0.7762, + "step": 5620 + }, + { + "epoch": 0.47, + "learning_rate": 0.00016085245352537262, + "loss": 0.7805, + "step": 5640 + }, + { + "epoch": 0.47, + "learning_rate": 0.00016035002512142018, + "loss": 0.7797, + "step": 5660 + }, + { + "epoch": 0.47, + "learning_rate": 0.00015984759671746774, + "loss": 0.7832, + "step": 5680 + }, + { + "epoch": 0.47, + "learning_rate": 0.0001593451683135153, + "loss": 0.7739, + "step": 5700 + }, + { + "epoch": 0.48, + "learning_rate": 0.00015884273990956287, + "loss": 0.7703, + "step": 5720 + }, + { + "epoch": 0.48, + "learning_rate": 0.00015834031150561043, + "loss": 0.7707, + "step": 5740 + }, + { + "epoch": 0.48, + "learning_rate": 0.000157837883101658, + "loss": 0.7649, + "step": 5760 + }, + { + "epoch": 0.48, + "learning_rate": 0.00015733545469770558, + "loss": 0.7713, + "step": 5780 + }, + { + "epoch": 0.48, + "learning_rate": 0.00015683302629375314, + "loss": 0.7753, + "step": 5800 + }, + { + "epoch": 0.48, + "eval_loss": 0.7855839729309082, + "eval_runtime": 8.3158, + "eval_samples_per_second": 240.507, + "eval_steps_per_second": 1.924, + "step": 5800 + }, + { + "epoch": 0.48, + "learning_rate": 0.0001563305978898007, + "loss": 0.7795, + "step": 5820 + }, + { + "epoch": 0.48, + "learning_rate": 0.00015582816948584826, + "loss": 0.7714, + "step": 5840 + }, + { + "epoch": 0.49, + "learning_rate": 0.00015532574108189582, + "loss": 0.7772, + "step": 5860 + }, + { + "epoch": 0.49, + "learning_rate": 0.00015482331267794338, + "loss": 0.7784, + "step": 5880 + }, + { + "epoch": 0.49, + "learning_rate": 0.00015432088427399094, + "loss": 0.7628, + "step": 5900 + }, + { + "epoch": 0.49, + "learning_rate": 0.0001538184558700385, + "loss": 0.7848, + "step": 5920 + }, + { + "epoch": 0.49, + "learning_rate": 0.00015331602746608607, + "loss": 0.78, + "step": 5940 + }, + { + "epoch": 0.49, + "learning_rate": 0.00015281359906213363, + "loss": 0.7856, + "step": 5960 + }, + { + "epoch": 0.5, + "learning_rate": 0.0001523111706581812, + "loss": 0.776, + "step": 5980 + }, + { + "epoch": 0.5, + "learning_rate": 0.00015180874225422875, + "loss": 0.7752, + "step": 6000 + }, + { + "epoch": 0.5, + "eval_loss": 0.7837858200073242, + "eval_runtime": 8.2979, + "eval_samples_per_second": 241.025, + "eval_steps_per_second": 1.928, + "step": 6000 + }, + { + "epoch": 0.5, + "learning_rate": 0.0001513063138502763, + "loss": 0.7719, + "step": 6020 + }, + { + "epoch": 0.5, + "learning_rate": 0.00015080388544632387, + "loss": 0.7841, + "step": 6040 + }, + { + "epoch": 0.5, + "learning_rate": 0.00015030145704237143, + "loss": 0.7779, + "step": 6060 + }, + { + "epoch": 0.5, + "learning_rate": 0.000149799028638419, + "loss": 0.7706, + "step": 6080 + }, + { + "epoch": 0.51, + "learning_rate": 0.00014929660023446656, + "loss": 0.762, + "step": 6100 + }, + { + "epoch": 0.51, + "learning_rate": 0.00014879417183051412, + "loss": 0.7854, + "step": 6120 + }, + { + "epoch": 0.51, + "learning_rate": 0.0001482917434265617, + "loss": 0.7803, + "step": 6140 + }, + { + "epoch": 0.51, + "learning_rate": 0.00014778931502260927, + "loss": 0.7769, + "step": 6160 + }, + { + "epoch": 0.51, + "learning_rate": 0.00014728688661865683, + "loss": 0.7773, + "step": 6180 + }, + { + "epoch": 0.51, + "learning_rate": 0.0001467844582147044, + "loss": 0.7725, + "step": 6200 + }, + { + "epoch": 0.51, + "eval_loss": 0.7822731137275696, + "eval_runtime": 8.3078, + "eval_samples_per_second": 240.738, + "eval_steps_per_second": 1.926, + "step": 6200 + }, + { + "epoch": 0.52, + "learning_rate": 0.00014630715123094956, + "loss": 0.7797, + "step": 6220 + }, + { + "epoch": 0.52, + "learning_rate": 0.00014580472282699713, + "loss": 0.77, + "step": 6240 + }, + { + "epoch": 0.52, + "learning_rate": 0.00014530229442304469, + "loss": 0.7699, + "step": 6260 + }, + { + "epoch": 0.52, + "learning_rate": 0.00014479986601909225, + "loss": 0.7769, + "step": 6280 + }, + { + "epoch": 0.52, + "learning_rate": 0.00014429743761513984, + "loss": 0.7755, + "step": 6300 + }, + { + "epoch": 0.52, + "learning_rate": 0.0001437950092111874, + "loss": 0.7752, + "step": 6320 + }, + { + "epoch": 0.53, + "learning_rate": 0.00014329258080723496, + "loss": 0.7791, + "step": 6340 + }, + { + "epoch": 0.53, + "learning_rate": 0.00014279015240328252, + "loss": 0.7606, + "step": 6360 + }, + { + "epoch": 0.53, + "learning_rate": 0.00014228772399933008, + "loss": 0.7733, + "step": 6380 + }, + { + "epoch": 0.53, + "learning_rate": 0.00014178529559537764, + "loss": 0.7855, + "step": 6400 + }, + { + "epoch": 0.53, + "eval_loss": 0.7812179923057556, + "eval_runtime": 8.2896, + "eval_samples_per_second": 241.265, + "eval_steps_per_second": 1.93, + "step": 6400 + }, + { + "epoch": 0.53, + "learning_rate": 0.0001412828671914252, + "loss": 0.7718, + "step": 6420 + }, + { + "epoch": 0.53, + "learning_rate": 0.00014078043878747276, + "loss": 0.7676, + "step": 6440 + }, + { + "epoch": 0.54, + "learning_rate": 0.00014027801038352035, + "loss": 0.7732, + "step": 6460 + }, + { + "epoch": 0.54, + "learning_rate": 0.0001397755819795679, + "loss": 0.7745, + "step": 6480 + }, + { + "epoch": 0.54, + "learning_rate": 0.00013927315357561547, + "loss": 0.7747, + "step": 6500 + }, + { + "epoch": 0.54, + "learning_rate": 0.00013877072517166304, + "loss": 0.7648, + "step": 6520 + }, + { + "epoch": 0.54, + "learning_rate": 0.0001382682967677106, + "loss": 0.7767, + "step": 6540 + }, + { + "epoch": 0.54, + "learning_rate": 0.00013776586836375816, + "loss": 0.7735, + "step": 6560 + }, + { + "epoch": 0.55, + "learning_rate": 0.00013726343995980572, + "loss": 0.7684, + "step": 6580 + }, + { + "epoch": 0.55, + "learning_rate": 0.00013676101155585328, + "loss": 0.7783, + "step": 6600 + }, + { + "epoch": 0.55, + "eval_loss": 0.7797773480415344, + "eval_runtime": 8.2853, + "eval_samples_per_second": 241.391, + "eval_steps_per_second": 1.931, + "step": 6600 + }, + { + "epoch": 0.55, + "learning_rate": 0.00013625858315190084, + "loss": 0.7755, + "step": 6620 + }, + { + "epoch": 0.55, + "learning_rate": 0.0001357561547479484, + "loss": 0.7813, + "step": 6640 + }, + { + "epoch": 0.55, + "learning_rate": 0.00013525372634399596, + "loss": 0.7721, + "step": 6660 + }, + { + "epoch": 0.55, + "learning_rate": 0.00013475129794004352, + "loss": 0.7621, + "step": 6680 + }, + { + "epoch": 0.56, + "learning_rate": 0.00013424886953609109, + "loss": 0.7703, + "step": 6700 + }, + { + "epoch": 0.56, + "learning_rate": 0.00013374644113213865, + "loss": 0.7795, + "step": 6720 + }, + { + "epoch": 0.56, + "learning_rate": 0.0001332440127281862, + "loss": 0.779, + "step": 6740 + }, + { + "epoch": 0.56, + "learning_rate": 0.00013274158432423377, + "loss": 0.7762, + "step": 6760 + }, + { + "epoch": 0.56, + "learning_rate": 0.00013223915592028133, + "loss": 0.7769, + "step": 6780 + }, + { + "epoch": 0.56, + "learning_rate": 0.0001317367275163289, + "loss": 0.7762, + "step": 6800 + }, + { + "epoch": 0.56, + "eval_loss": 0.7790360450744629, + "eval_runtime": 8.301, + "eval_samples_per_second": 240.935, + "eval_steps_per_second": 1.927, + "step": 6800 + }, + { + "epoch": 0.57, + "learning_rate": 0.00013123429911237648, + "loss": 0.7587, + "step": 6820 + }, + { + "epoch": 0.57, + "learning_rate": 0.00013073187070842404, + "loss": 0.7667, + "step": 6840 + }, + { + "epoch": 0.57, + "learning_rate": 0.0001302294423044716, + "loss": 0.7575, + "step": 6860 + }, + { + "epoch": 0.57, + "learning_rate": 0.00012972701390051916, + "loss": 0.7755, + "step": 6880 + }, + { + "epoch": 0.57, + "learning_rate": 0.00012922458549656672, + "loss": 0.7718, + "step": 6900 + }, + { + "epoch": 0.57, + "learning_rate": 0.00012872215709261429, + "loss": 0.7715, + "step": 6920 + }, + { + "epoch": 0.58, + "learning_rate": 0.00012821972868866185, + "loss": 0.7679, + "step": 6940 + }, + { + "epoch": 0.58, + "learning_rate": 0.00012771730028470944, + "loss": 0.7691, + "step": 6960 + }, + { + "epoch": 0.58, + "learning_rate": 0.000127214871880757, + "loss": 0.7766, + "step": 6980 + }, + { + "epoch": 0.58, + "learning_rate": 0.00012671244347680456, + "loss": 0.7708, + "step": 7000 + }, + { + "epoch": 0.58, + "eval_loss": 0.7774990200996399, + "eval_runtime": 8.2721, + "eval_samples_per_second": 241.778, + "eval_steps_per_second": 1.934, + "step": 7000 + }, + { + "epoch": 0.58, + "learning_rate": 0.00012621001507285212, + "loss": 0.7687, + "step": 7020 + }, + { + "epoch": 0.58, + "learning_rate": 0.00012570758666889968, + "loss": 0.7675, + "step": 7040 + }, + { + "epoch": 0.59, + "learning_rate": 0.00012520515826494724, + "loss": 0.7746, + "step": 7060 + }, + { + "epoch": 0.59, + "learning_rate": 0.0001247027298609948, + "loss": 0.7626, + "step": 7080 + }, + { + "epoch": 0.59, + "learning_rate": 0.00012420030145704236, + "loss": 0.764, + "step": 7100 + }, + { + "epoch": 0.59, + "learning_rate": 0.00012369787305308992, + "loss": 0.7734, + "step": 7120 + }, + { + "epoch": 0.59, + "learning_rate": 0.00012319544464913749, + "loss": 0.7642, + "step": 7140 + }, + { + "epoch": 0.59, + "learning_rate": 0.00012269301624518505, + "loss": 0.7715, + "step": 7160 + }, + { + "epoch": 0.6, + "learning_rate": 0.0001221905878412326, + "loss": 0.7619, + "step": 7180 + }, + { + "epoch": 0.6, + "learning_rate": 0.00012168815943728017, + "loss": 0.7674, + "step": 7200 + }, + { + "epoch": 0.6, + "eval_loss": 0.777417004108429, + "eval_runtime": 8.2947, + "eval_samples_per_second": 241.117, + "eval_steps_per_second": 1.929, + "step": 7200 + }, + { + "epoch": 0.6, + "learning_rate": 0.00012118573103332774, + "loss": 0.7709, + "step": 7220 + }, + { + "epoch": 0.6, + "learning_rate": 0.0001206833026293753, + "loss": 0.7682, + "step": 7240 + }, + { + "epoch": 0.6, + "learning_rate": 0.00012018087422542287, + "loss": 0.7604, + "step": 7260 + }, + { + "epoch": 0.6, + "learning_rate": 0.00011967844582147043, + "loss": 0.767, + "step": 7280 + }, + { + "epoch": 0.61, + "learning_rate": 0.00011917601741751799, + "loss": 0.7707, + "step": 7300 + }, + { + "epoch": 0.61, + "learning_rate": 0.00011867358901356555, + "loss": 0.7602, + "step": 7320 + }, + { + "epoch": 0.61, + "learning_rate": 0.00011817116060961311, + "loss": 0.7584, + "step": 7340 + }, + { + "epoch": 0.61, + "learning_rate": 0.00011766873220566067, + "loss": 0.7696, + "step": 7360 + }, + { + "epoch": 0.61, + "learning_rate": 0.00011716630380170823, + "loss": 0.7675, + "step": 7380 + }, + { + "epoch": 0.61, + "learning_rate": 0.00011666387539775581, + "loss": 0.7716, + "step": 7400 + }, + { + "epoch": 0.61, + "eval_loss": 0.7760618329048157, + "eval_runtime": 8.2995, + "eval_samples_per_second": 240.98, + "eval_steps_per_second": 1.928, + "step": 7400 + }, + { + "epoch": 0.62, + "learning_rate": 0.00011616144699380338, + "loss": 0.7648, + "step": 7420 + }, + { + "epoch": 0.62, + "learning_rate": 0.00011565901858985094, + "loss": 0.7643, + "step": 7440 + }, + { + "epoch": 0.62, + "learning_rate": 0.0001151565901858985, + "loss": 0.759, + "step": 7460 + }, + { + "epoch": 0.62, + "learning_rate": 0.00011465416178194607, + "loss": 0.7694, + "step": 7480 + }, + { + "epoch": 0.62, + "learning_rate": 0.00011415173337799363, + "loss": 0.7633, + "step": 7500 + }, + { + "epoch": 0.62, + "learning_rate": 0.00011364930497404119, + "loss": 0.773, + "step": 7520 + }, + { + "epoch": 0.63, + "learning_rate": 0.00011314687657008876, + "loss": 0.76, + "step": 7540 + }, + { + "epoch": 0.63, + "learning_rate": 0.00011264444816613632, + "loss": 0.7708, + "step": 7560 + }, + { + "epoch": 0.63, + "learning_rate": 0.00011214201976218388, + "loss": 0.7757, + "step": 7580 + }, + { + "epoch": 0.63, + "learning_rate": 0.00011163959135823145, + "loss": 0.7655, + "step": 7600 + }, + { + "epoch": 0.63, + "eval_loss": 0.7751550078392029, + "eval_runtime": 8.2715, + "eval_samples_per_second": 241.793, + "eval_steps_per_second": 1.934, + "step": 7600 + }, + { + "epoch": 0.63, + "learning_rate": 0.00011113716295427901, + "loss": 0.7607, + "step": 7620 + }, + { + "epoch": 0.63, + "learning_rate": 0.00011063473455032657, + "loss": 0.7703, + "step": 7640 + }, + { + "epoch": 0.64, + "learning_rate": 0.00011013230614637413, + "loss": 0.7653, + "step": 7660 + }, + { + "epoch": 0.64, + "learning_rate": 0.00010962987774242169, + "loss": 0.7753, + "step": 7680 + }, + { + "epoch": 0.64, + "learning_rate": 0.00010912744933846925, + "loss": 0.7639, + "step": 7700 + }, + { + "epoch": 0.64, + "learning_rate": 0.00010862502093451683, + "loss": 0.7701, + "step": 7720 + }, + { + "epoch": 0.64, + "learning_rate": 0.00010812259253056439, + "loss": 0.7614, + "step": 7740 + }, + { + "epoch": 0.64, + "learning_rate": 0.00010762016412661195, + "loss": 0.7612, + "step": 7760 + }, + { + "epoch": 0.65, + "learning_rate": 0.00010711773572265951, + "loss": 0.7597, + "step": 7780 + }, + { + "epoch": 0.65, + "learning_rate": 0.00010661530731870707, + "loss": 0.7621, + "step": 7800 + }, + { + "epoch": 0.65, + "eval_loss": 0.7740359902381897, + "eval_runtime": 8.2962, + "eval_samples_per_second": 241.075, + "eval_steps_per_second": 1.929, + "step": 7800 + }, + { + "epoch": 0.65, + "learning_rate": 0.00010611287891475463, + "loss": 0.7592, + "step": 7820 + }, + { + "epoch": 0.65, + "learning_rate": 0.0001056104505108022, + "loss": 0.7665, + "step": 7840 + }, + { + "epoch": 0.65, + "learning_rate": 0.00010510802210684975, + "loss": 0.7646, + "step": 7860 + }, + { + "epoch": 0.65, + "learning_rate": 0.00010460559370289732, + "loss": 0.7668, + "step": 7880 + }, + { + "epoch": 0.66, + "learning_rate": 0.00010410316529894489, + "loss": 0.7756, + "step": 7900 + }, + { + "epoch": 0.66, + "learning_rate": 0.00010360073689499245, + "loss": 0.7684, + "step": 7920 + }, + { + "epoch": 0.66, + "learning_rate": 0.00010309830849104001, + "loss": 0.7566, + "step": 7940 + }, + { + "epoch": 0.66, + "learning_rate": 0.00010259588008708757, + "loss": 0.7581, + "step": 7960 + }, + { + "epoch": 0.66, + "learning_rate": 0.00010209345168313515, + "loss": 0.7624, + "step": 7980 + }, + { + "epoch": 0.66, + "learning_rate": 0.00010159102327918271, + "loss": 0.7594, + "step": 8000 + }, + { + "epoch": 0.66, + "eval_loss": 0.7729437351226807, + "eval_runtime": 8.2949, + "eval_samples_per_second": 241.112, + "eval_steps_per_second": 1.929, + "step": 8000 + }, + { + "epoch": 0.67, + "learning_rate": 0.00010108859487523028, + "loss": 0.7657, + "step": 8020 + }, + { + "epoch": 0.67, + "learning_rate": 0.00010058616647127785, + "loss": 0.7604, + "step": 8040 + }, + { + "epoch": 0.67, + "learning_rate": 0.0001000837380673254, + "loss": 0.7659, + "step": 8060 + }, + { + "epoch": 0.67, + "learning_rate": 9.958130966337297e-05, + "loss": 0.7688, + "step": 8080 + }, + { + "epoch": 0.67, + "learning_rate": 9.907888125942053e-05, + "loss": 0.7702, + "step": 8100 + }, + { + "epoch": 0.67, + "learning_rate": 9.857645285546809e-05, + "loss": 0.7572, + "step": 8120 + }, + { + "epoch": 0.68, + "learning_rate": 9.807402445151565e-05, + "loss": 0.7603, + "step": 8140 + }, + { + "epoch": 0.68, + "learning_rate": 9.757159604756321e-05, + "loss": 0.761, + "step": 8160 + }, + { + "epoch": 0.68, + "learning_rate": 9.706916764361077e-05, + "loss": 0.7649, + "step": 8180 + }, + { + "epoch": 0.68, + "learning_rate": 9.656673923965835e-05, + "loss": 0.7554, + "step": 8200 + }, + { + "epoch": 0.68, + "eval_loss": 0.7727349996566772, + "eval_runtime": 8.2811, + "eval_samples_per_second": 241.515, + "eval_steps_per_second": 1.932, + "step": 8200 + }, + { + "epoch": 0.68, + "learning_rate": 9.606431083570591e-05, + "loss": 0.7634, + "step": 8220 + }, + { + "epoch": 0.68, + "learning_rate": 9.556188243175347e-05, + "loss": 0.7611, + "step": 8240 + }, + { + "epoch": 0.69, + "learning_rate": 9.505945402780103e-05, + "loss": 0.7606, + "step": 8260 + }, + { + "epoch": 0.69, + "learning_rate": 9.455702562384859e-05, + "loss": 0.7703, + "step": 8280 + }, + { + "epoch": 0.69, + "learning_rate": 9.405459721989615e-05, + "loss": 0.7616, + "step": 8300 + }, + { + "epoch": 0.69, + "learning_rate": 9.355216881594372e-05, + "loss": 0.7638, + "step": 8320 + }, + { + "epoch": 0.69, + "learning_rate": 9.30748618321889e-05, + "loss": 0.7461, + "step": 8340 + }, + { + "epoch": 0.69, + "learning_rate": 9.257243342823646e-05, + "loss": 0.7577, + "step": 8360 + }, + { + "epoch": 0.7, + "learning_rate": 9.207000502428403e-05, + "loss": 0.7553, + "step": 8380 + }, + { + "epoch": 0.7, + "learning_rate": 9.15675766203316e-05, + "loss": 0.7564, + "step": 8400 + }, + { + "epoch": 0.7, + "eval_loss": 0.771743893623352, + "eval_runtime": 8.2767, + "eval_samples_per_second": 241.642, + "eval_steps_per_second": 1.933, + "step": 8400 + }, + { + "epoch": 0.7, + "learning_rate": 9.106514821637916e-05, + "loss": 0.7605, + "step": 8420 + }, + { + "epoch": 0.7, + "learning_rate": 9.056271981242672e-05, + "loss": 0.773, + "step": 8440 + }, + { + "epoch": 0.7, + "learning_rate": 9.006029140847428e-05, + "loss": 0.7657, + "step": 8460 + }, + { + "epoch": 0.7, + "learning_rate": 8.955786300452185e-05, + "loss": 0.7597, + "step": 8480 + }, + { + "epoch": 0.71, + "learning_rate": 8.90554346005694e-05, + "loss": 0.7687, + "step": 8500 + }, + { + "epoch": 0.71, + "learning_rate": 8.855300619661697e-05, + "loss": 0.7545, + "step": 8520 + }, + { + "epoch": 0.71, + "learning_rate": 8.805057779266453e-05, + "loss": 0.7575, + "step": 8540 + }, + { + "epoch": 0.71, + "learning_rate": 8.754814938871209e-05, + "loss": 0.771, + "step": 8560 + }, + { + "epoch": 0.71, + "learning_rate": 8.704572098475966e-05, + "loss": 0.7622, + "step": 8580 + }, + { + "epoch": 0.71, + "learning_rate": 8.654329258080723e-05, + "loss": 0.7621, + "step": 8600 + }, + { + "epoch": 0.71, + "eval_loss": 0.7706981897354126, + "eval_runtime": 8.2852, + "eval_samples_per_second": 241.394, + "eval_steps_per_second": 1.931, + "step": 8600 + }, + { + "epoch": 0.72, + "learning_rate": 8.604086417685479e-05, + "loss": 0.7516, + "step": 8620 + }, + { + "epoch": 0.72, + "learning_rate": 8.553843577290235e-05, + "loss": 0.7526, + "step": 8640 + }, + { + "epoch": 0.72, + "learning_rate": 8.503600736894991e-05, + "loss": 0.7639, + "step": 8660 + }, + { + "epoch": 0.72, + "learning_rate": 8.453357896499747e-05, + "loss": 0.7561, + "step": 8680 + }, + { + "epoch": 0.72, + "learning_rate": 8.403115056104506e-05, + "loss": 0.7643, + "step": 8700 + }, + { + "epoch": 0.72, + "learning_rate": 8.352872215709262e-05, + "loss": 0.7665, + "step": 8720 + }, + { + "epoch": 0.73, + "learning_rate": 8.302629375314018e-05, + "loss": 0.7681, + "step": 8740 + }, + { + "epoch": 0.73, + "learning_rate": 8.252386534918774e-05, + "loss": 0.7655, + "step": 8760 + }, + { + "epoch": 0.73, + "learning_rate": 8.20214369452353e-05, + "loss": 0.7603, + "step": 8780 + }, + { + "epoch": 0.73, + "learning_rate": 8.151900854128286e-05, + "loss": 0.7624, + "step": 8800 + }, + { + "epoch": 0.73, + "eval_loss": 0.7699927687644958, + "eval_runtime": 8.2872, + "eval_samples_per_second": 241.335, + "eval_steps_per_second": 1.931, + "step": 8800 + }, + { + "epoch": 0.73, + "learning_rate": 8.101658013733043e-05, + "loss": 0.7666, + "step": 8820 + }, + { + "epoch": 0.73, + "learning_rate": 8.051415173337799e-05, + "loss": 0.752, + "step": 8840 + }, + { + "epoch": 0.74, + "learning_rate": 8.001172332942555e-05, + "loss": 0.7654, + "step": 8860 + }, + { + "epoch": 0.74, + "learning_rate": 7.950929492547312e-05, + "loss": 0.7546, + "step": 8880 + }, + { + "epoch": 0.74, + "learning_rate": 7.900686652152068e-05, + "loss": 0.755, + "step": 8900 + }, + { + "epoch": 0.74, + "learning_rate": 7.850443811756824e-05, + "loss": 0.7578, + "step": 8920 + }, + { + "epoch": 0.74, + "learning_rate": 7.80020097136158e-05, + "loss": 0.7527, + "step": 8940 + }, + { + "epoch": 0.74, + "learning_rate": 7.749958130966337e-05, + "loss": 0.7622, + "step": 8960 + }, + { + "epoch": 0.75, + "learning_rate": 7.699715290571093e-05, + "loss": 0.7681, + "step": 8980 + }, + { + "epoch": 0.75, + "learning_rate": 7.649472450175849e-05, + "loss": 0.7641, + "step": 9000 + }, + { + "epoch": 0.75, + "eval_loss": 0.7696471214294434, + "eval_runtime": 8.269, + "eval_samples_per_second": 241.867, + "eval_steps_per_second": 1.935, + "step": 9000 + }, + { + "epoch": 0.75, + "learning_rate": 7.599229609780605e-05, + "loss": 0.7622, + "step": 9020 + }, + { + "epoch": 0.75, + "learning_rate": 7.548986769385361e-05, + "loss": 0.7608, + "step": 9040 + }, + { + "epoch": 0.75, + "learning_rate": 7.498743928990119e-05, + "loss": 0.7613, + "step": 9060 + }, + { + "epoch": 0.75, + "learning_rate": 7.448501088594875e-05, + "loss": 0.759, + "step": 9080 + }, + { + "epoch": 0.76, + "learning_rate": 7.398258248199631e-05, + "loss": 0.7585, + "step": 9100 + }, + { + "epoch": 0.76, + "learning_rate": 7.348015407804387e-05, + "loss": 0.7609, + "step": 9120 + }, + { + "epoch": 0.76, + "learning_rate": 7.297772567409144e-05, + "loss": 0.76, + "step": 9140 + }, + { + "epoch": 0.76, + "learning_rate": 7.2475297270139e-05, + "loss": 0.7621, + "step": 9160 + }, + { + "epoch": 0.76, + "learning_rate": 7.197286886618657e-05, + "loss": 0.7587, + "step": 9180 + }, + { + "epoch": 0.76, + "learning_rate": 7.147044046223413e-05, + "loss": 0.7589, + "step": 9200 + }, + { + "epoch": 0.76, + "eval_loss": 0.7685341835021973, + "eval_runtime": 8.3378, + "eval_samples_per_second": 239.872, + "eval_steps_per_second": 1.919, + "step": 9200 + }, + { + "epoch": 0.77, + "learning_rate": 7.096801205828169e-05, + "loss": 0.7708, + "step": 9220 + }, + { + "epoch": 0.77, + "learning_rate": 7.046558365432925e-05, + "loss": 0.7468, + "step": 9240 + }, + { + "epoch": 0.77, + "learning_rate": 6.996315525037681e-05, + "loss": 0.7669, + "step": 9260 + }, + { + "epoch": 0.77, + "learning_rate": 6.946072684642437e-05, + "loss": 0.7639, + "step": 9280 + }, + { + "epoch": 0.77, + "learning_rate": 6.895829844247193e-05, + "loss": 0.764, + "step": 9300 + }, + { + "epoch": 0.77, + "learning_rate": 6.845587003851951e-05, + "loss": 0.7605, + "step": 9320 + }, + { + "epoch": 0.78, + "learning_rate": 6.795344163456707e-05, + "loss": 0.7638, + "step": 9340 + }, + { + "epoch": 0.78, + "learning_rate": 6.745101323061463e-05, + "loss": 0.753, + "step": 9360 + }, + { + "epoch": 0.78, + "learning_rate": 6.694858482666219e-05, + "loss": 0.7567, + "step": 9380 + }, + { + "epoch": 0.78, + "learning_rate": 6.644615642270977e-05, + "loss": 0.7604, + "step": 9400 + }, + { + "epoch": 0.78, + "eval_loss": 0.7682663798332214, + "eval_runtime": 8.2956, + "eval_samples_per_second": 241.092, + "eval_steps_per_second": 1.929, + "step": 9400 + }, + { + "epoch": 0.78, + "learning_rate": 6.594372801875733e-05, + "loss": 0.7603, + "step": 9420 + }, + { + "epoch": 0.78, + "learning_rate": 6.544129961480489e-05, + "loss": 0.7677, + "step": 9440 + }, + { + "epoch": 0.79, + "learning_rate": 6.493887121085245e-05, + "loss": 0.7692, + "step": 9460 + }, + { + "epoch": 0.79, + "learning_rate": 6.443644280690001e-05, + "loss": 0.7637, + "step": 9480 + }, + { + "epoch": 0.79, + "learning_rate": 6.393401440294757e-05, + "loss": 0.756, + "step": 9500 + }, + { + "epoch": 0.79, + "learning_rate": 6.343158599899513e-05, + "loss": 0.7572, + "step": 9520 + }, + { + "epoch": 0.79, + "learning_rate": 6.29291575950427e-05, + "loss": 0.7696, + "step": 9540 + }, + { + "epoch": 0.79, + "learning_rate": 6.242672919109027e-05, + "loss": 0.753, + "step": 9560 + }, + { + "epoch": 0.8, + "learning_rate": 6.192430078713783e-05, + "loss": 0.7619, + "step": 9580 + }, + { + "epoch": 0.8, + "learning_rate": 6.142187238318539e-05, + "loss": 0.7574, + "step": 9600 + }, + { + "epoch": 0.8, + "eval_loss": 0.7677283883094788, + "eval_runtime": 8.2754, + "eval_samples_per_second": 241.68, + "eval_steps_per_second": 1.933, + "step": 9600 + }, + { + "epoch": 0.8, + "learning_rate": 6.091944397923295e-05, + "loss": 0.7624, + "step": 9620 + }, + { + "epoch": 0.8, + "learning_rate": 6.0417015575280514e-05, + "loss": 0.7554, + "step": 9640 + }, + { + "epoch": 0.8, + "learning_rate": 5.9914587171328075e-05, + "loss": 0.7635, + "step": 9660 + }, + { + "epoch": 0.8, + "learning_rate": 5.941215876737565e-05, + "loss": 0.7586, + "step": 9680 + }, + { + "epoch": 0.81, + "learning_rate": 5.890973036342321e-05, + "loss": 0.755, + "step": 9700 + }, + { + "epoch": 0.81, + "learning_rate": 5.840730195947077e-05, + "loss": 0.7584, + "step": 9720 + }, + { + "epoch": 0.81, + "learning_rate": 5.7904873555518333e-05, + "loss": 0.7528, + "step": 9740 + }, + { + "epoch": 0.81, + "learning_rate": 5.74024451515659e-05, + "loss": 0.7597, + "step": 9760 + }, + { + "epoch": 0.81, + "learning_rate": 5.690001674761346e-05, + "loss": 0.7527, + "step": 9780 + }, + { + "epoch": 0.81, + "learning_rate": 5.6397588343661024e-05, + "loss": 0.7617, + "step": 9800 + }, + { + "epoch": 0.81, + "eval_loss": 0.7668038010597229, + "eval_runtime": 8.3597, + "eval_samples_per_second": 239.242, + "eval_steps_per_second": 1.914, + "step": 9800 + }, + { + "epoch": 0.82, + "learning_rate": 5.5895159939708585e-05, + "loss": 0.7584, + "step": 9820 + }, + { + "epoch": 0.82, + "learning_rate": 5.5392731535756146e-05, + "loss": 0.762, + "step": 9840 + }, + { + "epoch": 0.82, + "learning_rate": 5.4890303131803714e-05, + "loss": 0.7586, + "step": 9860 + }, + { + "epoch": 0.82, + "learning_rate": 5.4387874727851275e-05, + "loss": 0.7569, + "step": 9880 + }, + { + "epoch": 0.82, + "learning_rate": 5.3885446323898836e-05, + "loss": 0.7611, + "step": 9900 + }, + { + "epoch": 0.82, + "learning_rate": 5.33830179199464e-05, + "loss": 0.7581, + "step": 9920 + }, + { + "epoch": 0.83, + "learning_rate": 5.2880589515993965e-05, + "loss": 0.7657, + "step": 9940 + }, + { + "epoch": 0.83, + "learning_rate": 5.237816111204153e-05, + "loss": 0.7504, + "step": 9960 + }, + { + "epoch": 0.83, + "learning_rate": 5.1875732708089094e-05, + "loss": 0.7547, + "step": 9980 + }, + { + "epoch": 0.83, + "learning_rate": 5.1373304304136655e-05, + "loss": 0.7588, + "step": 10000 + }, + { + "epoch": 0.83, + "eval_loss": 0.766875147819519, + "eval_runtime": 8.3125, + "eval_samples_per_second": 240.603, + "eval_steps_per_second": 1.925, + "step": 10000 + }, + { + "epoch": 0.83, + "learning_rate": 5.0870875900184223e-05, + "loss": 0.7588, + "step": 10020 + }, + { + "epoch": 0.83, + "learning_rate": 5.0368447496231785e-05, + "loss": 0.7544, + "step": 10040 + }, + { + "epoch": 0.84, + "learning_rate": 4.9866019092279346e-05, + "loss": 0.7588, + "step": 10060 + }, + { + "epoch": 0.84, + "learning_rate": 4.936359068832691e-05, + "loss": 0.7547, + "step": 10080 + }, + { + "epoch": 0.84, + "learning_rate": 4.8861162284374475e-05, + "loss": 0.7608, + "step": 10100 + }, + { + "epoch": 0.84, + "learning_rate": 4.8358733880422036e-05, + "loss": 0.7539, + "step": 10120 + }, + { + "epoch": 0.84, + "learning_rate": 4.78563054764696e-05, + "loss": 0.7639, + "step": 10140 + }, + { + "epoch": 0.84, + "learning_rate": 4.735387707251716e-05, + "loss": 0.7622, + "step": 10160 + }, + { + "epoch": 0.85, + "learning_rate": 4.6851448668564726e-05, + "loss": 0.7528, + "step": 10180 + }, + { + "epoch": 0.85, + "learning_rate": 4.634902026461229e-05, + "loss": 0.7566, + "step": 10200 + }, + { + "epoch": 0.85, + "eval_loss": 0.7660693526268005, + "eval_runtime": 8.2951, + "eval_samples_per_second": 241.107, + "eval_steps_per_second": 1.929, + "step": 10200 + }, + { + "epoch": 0.85, + "learning_rate": 4.584659186065985e-05, + "loss": 0.7545, + "step": 10220 + }, + { + "epoch": 0.85, + "learning_rate": 4.5344163456707416e-05, + "loss": 0.7505, + "step": 10240 + }, + { + "epoch": 0.85, + "learning_rate": 4.4841735052754984e-05, + "loss": 0.7645, + "step": 10260 + }, + { + "epoch": 0.85, + "learning_rate": 4.4339306648802545e-05, + "loss": 0.7566, + "step": 10280 + }, + { + "epoch": 0.86, + "learning_rate": 4.383687824485011e-05, + "loss": 0.7509, + "step": 10300 + }, + { + "epoch": 0.86, + "learning_rate": 4.333444984089767e-05, + "loss": 0.7546, + "step": 10320 + }, + { + "epoch": 0.86, + "learning_rate": 4.285714285714285e-05, + "loss": 0.7533, + "step": 10340 + }, + { + "epoch": 0.86, + "learning_rate": 4.235471445319041e-05, + "loss": 0.7509, + "step": 10360 + }, + { + "epoch": 0.86, + "learning_rate": 4.185228604923798e-05, + "loss": 0.7558, + "step": 10380 + }, + { + "epoch": 0.86, + "learning_rate": 4.1349857645285547e-05, + "loss": 0.7624, + "step": 10400 + }, + { + "epoch": 0.86, + "eval_loss": 0.7657083868980408, + "eval_runtime": 8.2828, + "eval_samples_per_second": 241.464, + "eval_steps_per_second": 1.932, + "step": 10400 + }, + { + "epoch": 0.87, + "learning_rate": 4.084742924133311e-05, + "loss": 0.7509, + "step": 10420 + }, + { + "epoch": 0.87, + "learning_rate": 4.0345000837380676e-05, + "loss": 0.7509, + "step": 10440 + }, + { + "epoch": 0.87, + "learning_rate": 3.984257243342824e-05, + "loss": 0.761, + "step": 10460 + }, + { + "epoch": 0.87, + "learning_rate": 3.93401440294758e-05, + "loss": 0.7546, + "step": 10480 + }, + { + "epoch": 0.87, + "learning_rate": 3.883771562552336e-05, + "loss": 0.764, + "step": 10500 + }, + { + "epoch": 0.87, + "learning_rate": 3.833528722157092e-05, + "loss": 0.765, + "step": 10520 + }, + { + "epoch": 0.88, + "learning_rate": 3.783285881761849e-05, + "loss": 0.7551, + "step": 10540 + }, + { + "epoch": 0.88, + "learning_rate": 3.733043041366605e-05, + "loss": 0.7554, + "step": 10560 + }, + { + "epoch": 0.88, + "learning_rate": 3.682800200971361e-05, + "loss": 0.7574, + "step": 10580 + }, + { + "epoch": 0.88, + "learning_rate": 3.632557360576117e-05, + "loss": 0.7647, + "step": 10600 + }, + { + "epoch": 0.88, + "eval_loss": 0.7651572227478027, + "eval_runtime": 8.2868, + "eval_samples_per_second": 241.347, + "eval_steps_per_second": 1.931, + "step": 10600 + }, + { + "epoch": 0.88, + "learning_rate": 3.584826662200636e-05, + "loss": 0.7508, + "step": 10620 + }, + { + "epoch": 0.88, + "learning_rate": 3.534583821805393e-05, + "loss": 0.7636, + "step": 10640 + }, + { + "epoch": 0.89, + "learning_rate": 3.484340981410149e-05, + "loss": 0.7584, + "step": 10660 + }, + { + "epoch": 0.89, + "learning_rate": 3.434098141014905e-05, + "loss": 0.7677, + "step": 10680 + }, + { + "epoch": 0.89, + "learning_rate": 3.383855300619661e-05, + "loss": 0.7493, + "step": 10700 + }, + { + "epoch": 0.89, + "learning_rate": 3.333612460224418e-05, + "loss": 0.7557, + "step": 10720 + }, + { + "epoch": 0.89, + "learning_rate": 3.283369619829174e-05, + "loss": 0.7528, + "step": 10740 + }, + { + "epoch": 0.89, + "learning_rate": 3.23312677943393e-05, + "loss": 0.7573, + "step": 10760 + }, + { + "epoch": 0.9, + "learning_rate": 3.182883939038687e-05, + "loss": 0.7471, + "step": 10780 + }, + { + "epoch": 0.9, + "learning_rate": 3.132641098643443e-05, + "loss": 0.7537, + "step": 10800 + }, + { + "epoch": 0.9, + "eval_loss": 0.7651455402374268, + "eval_runtime": 8.2847, + "eval_samples_per_second": 241.409, + "eval_steps_per_second": 1.931, + "step": 10800 + }, + { + "epoch": 0.9, + "learning_rate": 3.082398258248199e-05, + "loss": 0.7538, + "step": 10820 + }, + { + "epoch": 0.9, + "learning_rate": 3.0321554178529556e-05, + "loss": 0.7585, + "step": 10840 + }, + { + "epoch": 0.9, + "learning_rate": 2.981912577457712e-05, + "loss": 0.7533, + "step": 10860 + }, + { + "epoch": 0.9, + "learning_rate": 2.9316697370624682e-05, + "loss": 0.7607, + "step": 10880 + }, + { + "epoch": 0.91, + "learning_rate": 2.8814268966672247e-05, + "loss": 0.7522, + "step": 10900 + }, + { + "epoch": 0.91, + "learning_rate": 2.831184056271981e-05, + "loss": 0.7581, + "step": 10920 + }, + { + "epoch": 0.91, + "learning_rate": 2.7809412158767376e-05, + "loss": 0.7597, + "step": 10940 + }, + { + "epoch": 0.91, + "learning_rate": 2.7306983754814937e-05, + "loss": 0.7649, + "step": 10960 + }, + { + "epoch": 0.91, + "learning_rate": 2.68045553508625e-05, + "loss": 0.7645, + "step": 10980 + }, + { + "epoch": 0.91, + "learning_rate": 2.6302126946910063e-05, + "loss": 0.743, + "step": 11000 + }, + { + "epoch": 0.91, + "eval_loss": 0.7645469903945923, + "eval_runtime": 8.314, + "eval_samples_per_second": 240.558, + "eval_steps_per_second": 1.924, + "step": 11000 + }, + { + "epoch": 0.92, + "learning_rate": 2.5799698542957624e-05, + "loss": 0.7488, + "step": 11020 + }, + { + "epoch": 0.92, + "learning_rate": 2.529727013900519e-05, + "loss": 0.7515, + "step": 11040 + }, + { + "epoch": 0.92, + "learning_rate": 2.4794841735052756e-05, + "loss": 0.7582, + "step": 11060 + }, + { + "epoch": 0.92, + "learning_rate": 2.4292413331100317e-05, + "loss": 0.7564, + "step": 11080 + }, + { + "epoch": 0.92, + "learning_rate": 2.378998492714788e-05, + "loss": 0.7486, + "step": 11100 + }, + { + "epoch": 0.92, + "learning_rate": 2.3287556523195443e-05, + "loss": 0.7537, + "step": 11120 + }, + { + "epoch": 0.93, + "learning_rate": 2.2785128119243004e-05, + "loss": 0.7593, + "step": 11140 + }, + { + "epoch": 0.93, + "learning_rate": 2.228269971529057e-05, + "loss": 0.7435, + "step": 11160 + }, + { + "epoch": 0.93, + "learning_rate": 2.178027131133813e-05, + "loss": 0.7646, + "step": 11180 + }, + { + "epoch": 0.93, + "learning_rate": 2.1277842907385698e-05, + "loss": 0.7445, + "step": 11200 + }, + { + "epoch": 0.93, + "eval_loss": 0.7643282413482666, + "eval_runtime": 8.2655, + "eval_samples_per_second": 241.97, + "eval_steps_per_second": 1.936, + "step": 11200 + }, + { + "epoch": 0.93, + "learning_rate": 2.077541450343326e-05, + "loss": 0.7576, + "step": 11220 + }, + { + "epoch": 0.93, + "learning_rate": 2.0272986099480824e-05, + "loss": 0.7553, + "step": 11240 + }, + { + "epoch": 0.94, + "learning_rate": 1.9770557695528385e-05, + "loss": 0.7567, + "step": 11260 + }, + { + "epoch": 0.94, + "learning_rate": 1.926812929157595e-05, + "loss": 0.7501, + "step": 11280 + }, + { + "epoch": 0.94, + "learning_rate": 1.876570088762351e-05, + "loss": 0.7502, + "step": 11300 + }, + { + "epoch": 0.94, + "learning_rate": 1.8263272483671075e-05, + "loss": 0.756, + "step": 11320 + }, + { + "epoch": 0.94, + "learning_rate": 1.776084407971864e-05, + "loss": 0.7528, + "step": 11340 + }, + { + "epoch": 0.94, + "learning_rate": 1.72584156757662e-05, + "loss": 0.7578, + "step": 11360 + }, + { + "epoch": 0.95, + "learning_rate": 1.6755987271813765e-05, + "loss": 0.7588, + "step": 11380 + }, + { + "epoch": 0.95, + "learning_rate": 1.625355886786133e-05, + "loss": 0.7486, + "step": 11400 + }, + { + "epoch": 0.95, + "eval_loss": 0.7640262842178345, + "eval_runtime": 8.2822, + "eval_samples_per_second": 241.482, + "eval_steps_per_second": 1.932, + "step": 11400 + }, + { + "epoch": 0.95, + "learning_rate": 1.575113046390889e-05, + "loss": 0.7527, + "step": 11420 + }, + { + "epoch": 0.95, + "learning_rate": 1.5248702059956454e-05, + "loss": 0.7472, + "step": 11440 + }, + { + "epoch": 0.95, + "learning_rate": 1.4746273656004018e-05, + "loss": 0.7476, + "step": 11460 + }, + { + "epoch": 0.95, + "learning_rate": 1.4243845252051581e-05, + "loss": 0.7551, + "step": 11480 + }, + { + "epoch": 0.95, + "learning_rate": 1.3741416848099144e-05, + "loss": 0.7609, + "step": 11500 + }, + { + "epoch": 0.96, + "learning_rate": 1.3238988444146708e-05, + "loss": 0.7496, + "step": 11520 + }, + { + "epoch": 0.96, + "learning_rate": 1.2736560040194271e-05, + "loss": 0.7528, + "step": 11540 + }, + { + "epoch": 0.96, + "learning_rate": 1.2234131636241834e-05, + "loss": 0.7541, + "step": 11560 + }, + { + "epoch": 0.96, + "learning_rate": 1.1731703232289399e-05, + "loss": 0.7492, + "step": 11580 + }, + { + "epoch": 0.96, + "learning_rate": 1.1229274828336962e-05, + "loss": 0.7464, + "step": 11600 + }, + { + "epoch": 0.96, + "eval_loss": 0.7637657523155212, + "eval_runtime": 8.2732, + "eval_samples_per_second": 241.743, + "eval_steps_per_second": 1.934, + "step": 11600 + }, + { + "epoch": 0.96, + "learning_rate": 1.0726846424384524e-05, + "loss": 0.7549, + "step": 11620 + }, + { + "epoch": 0.97, + "learning_rate": 1.0224418020432087e-05, + "loss": 0.7532, + "step": 11640 + }, + { + "epoch": 0.97, + "learning_rate": 9.721989616479652e-06, + "loss": 0.7513, + "step": 11660 + }, + { + "epoch": 0.97, + "learning_rate": 9.219561212527215e-06, + "loss": 0.7657, + "step": 11680 + }, + { + "epoch": 0.97, + "learning_rate": 8.717132808574777e-06, + "loss": 0.7445, + "step": 11700 + }, + { + "epoch": 0.97, + "learning_rate": 8.21470440462234e-06, + "loss": 0.7574, + "step": 11720 + }, + { + "epoch": 0.97, + "learning_rate": 7.712276000669905e-06, + "loss": 0.7576, + "step": 11740 + }, + { + "epoch": 0.98, + "learning_rate": 7.209847596717467e-06, + "loss": 0.7599, + "step": 11760 + }, + { + "epoch": 0.98, + "learning_rate": 6.7074191927650305e-06, + "loss": 0.7558, + "step": 11780 + }, + { + "epoch": 0.98, + "learning_rate": 6.204990788812593e-06, + "loss": 0.7541, + "step": 11800 + }, + { + "epoch": 0.98, + "eval_loss": 0.7635765671730042, + "eval_runtime": 8.2847, + "eval_samples_per_second": 241.409, + "eval_steps_per_second": 1.931, + "step": 11800 + }, + { + "epoch": 0.98, + "learning_rate": 5.702562384860157e-06, + "loss": 0.758, + "step": 11820 + }, + { + "epoch": 0.98, + "learning_rate": 5.20013398090772e-06, + "loss": 0.7381, + "step": 11840 + }, + { + "epoch": 0.98, + "learning_rate": 4.697705576955284e-06, + "loss": 0.7636, + "step": 11860 + }, + { + "epoch": 0.99, + "learning_rate": 4.1952771730028464e-06, + "loss": 0.7548, + "step": 11880 + }, + { + "epoch": 0.99, + "learning_rate": 3.6928487690504097e-06, + "loss": 0.7537, + "step": 11900 + }, + { + "epoch": 0.99, + "learning_rate": 3.1904203650979734e-06, + "loss": 0.7548, + "step": 11920 + }, + { + "epoch": 0.99, + "learning_rate": 2.6879919611455367e-06, + "loss": 0.7515, + "step": 11940 + }, + { + "epoch": 0.99, + "learning_rate": 2.1855635571931e-06, + "loss": 0.7458, + "step": 11960 + }, + { + "epoch": 0.99, + "learning_rate": 1.683135153240663e-06, + "loss": 0.7466, + "step": 11980 + }, + { + "epoch": 1.0, + "learning_rate": 1.1807067492882263e-06, + "loss": 0.7565, + "step": 12000 + }, + { + "epoch": 1.0, + "eval_loss": 0.763276994228363, + "eval_runtime": 8.2818, + "eval_samples_per_second": 241.494, + "eval_steps_per_second": 1.932, + "step": 12000 + } + ], + "max_steps": 12042, + "num_train_epochs": 1, + "total_flos": 3.119715122549883e+19, + "trial_name": null, + "trial_params": null +} diff --git a/adapters/saved-belle1.5m7b/checkpoint-12000/training_args.bin b/adapters/saved-belle1.5m7b/checkpoint-12000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..dfe19878cfd0a3df620d94aa6be1508290c31ed9 --- /dev/null +++ b/adapters/saved-belle1.5m7b/checkpoint-12000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bfb1ef6ab82f6cd0561137910099627267bd4099d7d83919869b14210b7e77c3 +size 3643 diff --git a/adapters/saved-cot7b/adapter_config.json b/adapters/saved-cot7b/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..6e56f2ae8f10fadfeec6c730ac6b119025824443 --- /dev/null +++ b/adapters/saved-cot7b/adapter_config.json @@ -0,0 +1,18 @@ +{ + "base_model_name_or_path": "/mnt/bn/qingyi-bn-lq/llama/llama-7b-hf", + "bias": "none", + "enable_lora": null, + "fan_in_fan_out": false, + "inference_mode": true, + "lora_alpha": 16, + "lora_dropout": 0.05, + "merge_weights": false, + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/adapters/saved-cot7b/adapter_model.bin b/adapters/saved-cot7b/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..3d5be01b07e4dabe8a97b97ddf49763be31db7be --- /dev/null +++ b/adapters/saved-cot7b/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c55254ba55f677b3737f212099124215178491e43b65dddfec5e0b719a686a56 +size 16822989 diff --git a/adapters/saved-cot7b/checkpoint-1200/optimizer.pt b/adapters/saved-cot7b/checkpoint-1200/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..1f523e868a02ef3e51a9f10d1fdd30fadd504040 --- /dev/null +++ b/adapters/saved-cot7b/checkpoint-1200/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:28691c888d0137fab30caf190be987404b2bcabd52e8177052ee78c63c88f73c +size 33629893 diff --git a/adapters/saved-cot7b/checkpoint-1200/pytorch_model.bin b/adapters/saved-cot7b/checkpoint-1200/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..74c09e5e959f5024f524aec9ec5855c11380b8f0 --- /dev/null +++ b/adapters/saved-cot7b/checkpoint-1200/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c4b1e88d9f3500df034d3cce176bc4b1a277a54868ff7a774483fa6016f0d287 +size 16822989 diff --git a/adapters/saved-cot7b/checkpoint-1200/rng_state_0.pth b/adapters/saved-cot7b/checkpoint-1200/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..69779d7a7ba9559210e33eec241afd18bbb88f16 --- /dev/null +++ b/adapters/saved-cot7b/checkpoint-1200/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fcd6179354d0b09f025b90899fd2bd19d2e1e6127baea0402763b19a118b8038 +size 14583 diff --git a/adapters/saved-cot7b/checkpoint-1200/rng_state_1.pth b/adapters/saved-cot7b/checkpoint-1200/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..fec91afe26c24f7f939efc533e933cea6210ce12 --- /dev/null +++ b/adapters/saved-cot7b/checkpoint-1200/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d79e7b97b49326ecf55ab62fc73b99e5581a46a48adb12208e113bca29d6d42 +size 14583 diff --git a/adapters/saved-cot7b/checkpoint-1200/rng_state_2.pth b/adapters/saved-cot7b/checkpoint-1200/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..f854bea4dfd20b0eeafcf30bb074d33a3502e7a5 --- /dev/null +++ b/adapters/saved-cot7b/checkpoint-1200/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e8a7b722c18cbf8db95227efe07d8f20001705cbe45641c28f4f4283abb062be +size 14583 diff --git a/adapters/saved-cot7b/checkpoint-1200/rng_state_3.pth b/adapters/saved-cot7b/checkpoint-1200/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..a51b70748f370c7f33cd1ad3e709c2fdebb223a5 --- /dev/null +++ b/adapters/saved-cot7b/checkpoint-1200/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fec7faf7e59c79cbed8df30ee3b65640efef9f17b76691ef9bc9ffb339b28dd4 +size 14583 diff --git a/adapters/saved-cot7b/checkpoint-1200/scaler.pt b/adapters/saved-cot7b/checkpoint-1200/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..c73b6e7148d8ae7026711173634e0a11b1b94e2d --- /dev/null +++ b/adapters/saved-cot7b/checkpoint-1200/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:741cefeca9ef427f92406d2d10b81996655e2a9d50eb7aaa9614e6fdd1c9f529 +size 557 diff --git a/adapters/saved-cot7b/checkpoint-1200/scheduler.pt b/adapters/saved-cot7b/checkpoint-1200/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..519d4e02f3dcd25b65dd54bb4fd0cd2b90fc8682 --- /dev/null +++ b/adapters/saved-cot7b/checkpoint-1200/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:014cabc9e3f7058868d5918b198fed0d86f4a17a2635b7cad784fd1b2564b2ad +size 627 diff --git a/adapters/saved-cot7b/checkpoint-1200/trainer_state.json b/adapters/saved-cot7b/checkpoint-1200/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..dce8777aeb29f315e720eecdba8c139cbbf60d3b --- /dev/null +++ b/adapters/saved-cot7b/checkpoint-1200/trainer_state.json @@ -0,0 +1,424 @@ +{ + "best_metric": 0.7033773064613342, + "best_model_checkpoint": "/mnt/bn/qingyi-bn-lq/llama/saved-cot7b/checkpoint-1200", + "epoch": 2.110353923939327, + "global_step": 1200, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.04, + "learning_rate": 5.9999999999999995e-05, + "loss": 2.3604, + "step": 20 + }, + { + "epoch": 0.07, + "learning_rate": 0.00011999999999999999, + "loss": 1.944, + "step": 40 + }, + { + "epoch": 0.11, + "learning_rate": 0.00017999999999999998, + "loss": 1.2746, + "step": 60 + }, + { + "epoch": 0.14, + "learning_rate": 0.00023999999999999998, + "loss": 0.9818, + "step": 80 + }, + { + "epoch": 0.18, + "learning_rate": 0.0003, + "loss": 0.863, + "step": 100 + }, + { + "epoch": 0.21, + "learning_rate": 0.0002962593516209476, + "loss": 0.8083, + "step": 120 + }, + { + "epoch": 0.25, + "learning_rate": 0.00029251870324189524, + "loss": 0.7923, + "step": 140 + }, + { + "epoch": 0.28, + "learning_rate": 0.00028877805486284284, + "loss": 0.7623, + "step": 160 + }, + { + "epoch": 0.32, + "learning_rate": 0.0002850374064837905, + "loss": 0.7665, + "step": 180 + }, + { + "epoch": 0.35, + "learning_rate": 0.0002812967581047381, + "loss": 0.7554, + "step": 200 + }, + { + "epoch": 0.35, + "eval_loss": 0.7609586715698242, + "eval_runtime": 32.1749, + "eval_samples_per_second": 62.16, + "eval_steps_per_second": 1.958, + "step": 200 + }, + { + "epoch": 0.39, + "learning_rate": 0.00027755610972568577, + "loss": 0.7353, + "step": 220 + }, + { + "epoch": 0.42, + "learning_rate": 0.0002738154613466334, + "loss": 0.7354, + "step": 240 + }, + { + "epoch": 0.46, + "learning_rate": 0.00027007481296758103, + "loss": 0.7435, + "step": 260 + }, + { + "epoch": 0.49, + "learning_rate": 0.00026633416458852864, + "loss": 0.7373, + "step": 280 + }, + { + "epoch": 0.53, + "learning_rate": 0.0002625935162094763, + "loss": 0.7332, + "step": 300 + }, + { + "epoch": 0.56, + "learning_rate": 0.0002588528678304239, + "loss": 0.7265, + "step": 320 + }, + { + "epoch": 0.6, + "learning_rate": 0.00025511221945137156, + "loss": 0.7274, + "step": 340 + }, + { + "epoch": 0.63, + "learning_rate": 0.00025137157107231917, + "loss": 0.7279, + "step": 360 + }, + { + "epoch": 0.67, + "learning_rate": 0.00024763092269326683, + "loss": 0.7231, + "step": 380 + }, + { + "epoch": 0.7, + "learning_rate": 0.00024389027431421443, + "loss": 0.7238, + "step": 400 + }, + { + "epoch": 0.7, + "eval_loss": 0.7300755381584167, + "eval_runtime": 32.1181, + "eval_samples_per_second": 62.27, + "eval_steps_per_second": 1.962, + "step": 400 + }, + { + "epoch": 0.74, + "learning_rate": 0.00024014962593516207, + "loss": 0.7161, + "step": 420 + }, + { + "epoch": 0.77, + "learning_rate": 0.0002364089775561097, + "loss": 0.7146, + "step": 440 + }, + { + "epoch": 0.81, + "learning_rate": 0.00023266832917705733, + "loss": 0.7152, + "step": 460 + }, + { + "epoch": 0.84, + "learning_rate": 0.00022892768079800496, + "loss": 0.7202, + "step": 480 + }, + { + "epoch": 0.88, + "learning_rate": 0.0002251870324189526, + "loss": 0.7142, + "step": 500 + }, + { + "epoch": 0.91, + "learning_rate": 0.00022144638403990023, + "loss": 0.7146, + "step": 520 + }, + { + "epoch": 0.95, + "learning_rate": 0.00021770573566084786, + "loss": 0.7104, + "step": 540 + }, + { + "epoch": 0.98, + "learning_rate": 0.0002139650872817955, + "loss": 0.7034, + "step": 560 + }, + { + "epoch": 1.02, + "learning_rate": 0.00021022443890274313, + "loss": 0.7153, + "step": 580 + }, + { + "epoch": 1.06, + "learning_rate": 0.00020648379052369076, + "loss": 0.7052, + "step": 600 + }, + { + "epoch": 1.06, + "eval_loss": 0.7185753583908081, + "eval_runtime": 32.4703, + "eval_samples_per_second": 61.595, + "eval_steps_per_second": 1.94, + "step": 600 + }, + { + "epoch": 1.09, + "learning_rate": 0.0002027431421446384, + "loss": 0.7061, + "step": 620 + }, + { + "epoch": 1.13, + "learning_rate": 0.00019900249376558603, + "loss": 0.7096, + "step": 640 + }, + { + "epoch": 1.16, + "learning_rate": 0.00019526184538653366, + "loss": 0.7065, + "step": 660 + }, + { + "epoch": 1.2, + "learning_rate": 0.00019152119700748126, + "loss": 0.7046, + "step": 680 + }, + { + "epoch": 1.23, + "learning_rate": 0.0001877805486284289, + "loss": 0.701, + "step": 700 + }, + { + "epoch": 1.27, + "learning_rate": 0.00018403990024937653, + "loss": 0.6922, + "step": 720 + }, + { + "epoch": 1.3, + "learning_rate": 0.00018029925187032416, + "loss": 0.6982, + "step": 740 + }, + { + "epoch": 1.34, + "learning_rate": 0.0001765586034912718, + "loss": 0.6993, + "step": 760 + }, + { + "epoch": 1.37, + "learning_rate": 0.00017281795511221943, + "loss": 0.6922, + "step": 780 + }, + { + "epoch": 1.41, + "learning_rate": 0.00016907730673316706, + "loss": 0.6989, + "step": 800 + }, + { + "epoch": 1.41, + "eval_loss": 0.7114558219909668, + "eval_runtime": 32.0158, + "eval_samples_per_second": 62.469, + "eval_steps_per_second": 1.968, + "step": 800 + }, + { + "epoch": 1.44, + "learning_rate": 0.0001653366583541147, + "loss": 0.6964, + "step": 820 + }, + { + "epoch": 1.48, + "learning_rate": 0.00016159600997506232, + "loss": 0.6969, + "step": 840 + }, + { + "epoch": 1.51, + "learning_rate": 0.00015785536159600996, + "loss": 0.6982, + "step": 860 + }, + { + "epoch": 1.55, + "learning_rate": 0.0001541147132169576, + "loss": 0.6977, + "step": 880 + }, + { + "epoch": 1.58, + "learning_rate": 0.00015037406483790522, + "loss": 0.7019, + "step": 900 + }, + { + "epoch": 1.62, + "learning_rate": 0.00014663341645885285, + "loss": 0.6963, + "step": 920 + }, + { + "epoch": 1.65, + "learning_rate": 0.00014289276807980049, + "loss": 0.7006, + "step": 940 + }, + { + "epoch": 1.69, + "learning_rate": 0.00013915211970074812, + "loss": 0.6935, + "step": 960 + }, + { + "epoch": 1.72, + "learning_rate": 0.00013541147132169575, + "loss": 0.6846, + "step": 980 + }, + { + "epoch": 1.76, + "learning_rate": 0.00013167082294264338, + "loss": 0.701, + "step": 1000 + }, + { + "epoch": 1.76, + "eval_loss": 0.7069133520126343, + "eval_runtime": 31.9943, + "eval_samples_per_second": 62.511, + "eval_steps_per_second": 1.969, + "step": 1000 + }, + { + "epoch": 1.79, + "learning_rate": 0.00012793017456359102, + "loss": 0.6896, + "step": 1020 + }, + { + "epoch": 1.83, + "learning_rate": 0.00012418952618453862, + "loss": 0.702, + "step": 1040 + }, + { + "epoch": 1.86, + "learning_rate": 0.00012044887780548627, + "loss": 0.6952, + "step": 1060 + }, + { + "epoch": 1.9, + "learning_rate": 0.0001167082294264339, + "loss": 0.6902, + "step": 1080 + }, + { + "epoch": 1.93, + "learning_rate": 0.00011296758104738153, + "loss": 0.6866, + "step": 1100 + }, + { + "epoch": 1.97, + "learning_rate": 0.00010922693266832918, + "loss": 0.6929, + "step": 1120 + }, + { + "epoch": 2.0, + "learning_rate": 0.00010548628428927681, + "loss": 0.6846, + "step": 1140 + }, + { + "epoch": 2.04, + "learning_rate": 0.00010174563591022444, + "loss": 0.6944, + "step": 1160 + }, + { + "epoch": 2.08, + "learning_rate": 9.800498753117206e-05, + "loss": 0.6868, + "step": 1180 + }, + { + "epoch": 2.11, + "learning_rate": 9.42643391521197e-05, + "loss": 0.6938, + "step": 1200 + }, + { + "epoch": 2.11, + "eval_loss": 0.7033773064613342, + "eval_runtime": 32.1557, + "eval_samples_per_second": 62.197, + "eval_steps_per_second": 1.959, + "step": 1200 + } + ], + "max_steps": 1704, + "num_train_epochs": 3, + "total_flos": 3.119227660941656e+18, + "trial_name": null, + "trial_params": null +} diff --git a/adapters/saved-cot7b/checkpoint-1200/training_args.bin b/adapters/saved-cot7b/checkpoint-1200/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..378e9372689af61e38c0f92507c56ae97d993c2c --- /dev/null +++ b/adapters/saved-cot7b/checkpoint-1200/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:86cc090bd15d462669757ec33f893d72d88487a15309964597325653a9952413 +size 3579 diff --git a/adapters/saved-cot7b/checkpoint-1400/optimizer.pt b/adapters/saved-cot7b/checkpoint-1400/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..af896523359992f92b28ada8d1e2a78486cb57ec --- /dev/null +++ b/adapters/saved-cot7b/checkpoint-1400/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1fedbb3c802191965937864bcbec90bb9eab9807130cb9a4f1e97423ec10ae26 +size 33629893 diff --git a/adapters/saved-cot7b/checkpoint-1400/pytorch_model.bin b/adapters/saved-cot7b/checkpoint-1400/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..667fe56ca7031c88a548dca0fea02c0bd83c4c41 --- /dev/null +++ b/adapters/saved-cot7b/checkpoint-1400/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:65d98b35f8a94e27838b618a7b2ad1bfde043111259bb969e657b3dc28cd7d40 +size 16822989 diff --git a/adapters/saved-cot7b/checkpoint-1400/rng_state_0.pth b/adapters/saved-cot7b/checkpoint-1400/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..848d4b6ff4a2ea00d1178f4ccba2f38195ac8eb4 --- /dev/null +++ b/adapters/saved-cot7b/checkpoint-1400/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6af7aa78902126543c8b674e5fa695029920a231237a475af45ed897780a61cb +size 14583 diff --git a/adapters/saved-cot7b/checkpoint-1400/rng_state_1.pth b/adapters/saved-cot7b/checkpoint-1400/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..6890000baaace28be32e421ba39a1756552a8b46 --- /dev/null +++ b/adapters/saved-cot7b/checkpoint-1400/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:46a9ad141212156403a5796d6ebdee3ec01688323487bc3145d8e88124891a7d +size 14583 diff --git a/adapters/saved-cot7b/checkpoint-1400/rng_state_2.pth b/adapters/saved-cot7b/checkpoint-1400/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..2facaa09e608f946af80d37d5017d876aafc556c --- /dev/null +++ b/adapters/saved-cot7b/checkpoint-1400/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1aef822a30c5993670321aa4fd04e45c402442ce8fb53340d1113ee2e5b1e0ff +size 14583 diff --git a/adapters/saved-cot7b/checkpoint-1400/rng_state_3.pth b/adapters/saved-cot7b/checkpoint-1400/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..2edc08212066d2c78b30b987ce1dba8b6c4e7367 --- /dev/null +++ b/adapters/saved-cot7b/checkpoint-1400/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d84e3ee36a8a0b3b689504d5a159ecdac4ad49358e3d53a84d1a0cdc703ba639 +size 14583 diff --git a/adapters/saved-cot7b/checkpoint-1400/scaler.pt b/adapters/saved-cot7b/checkpoint-1400/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..a8cb12388904652e2007207f80582007b39a2051 --- /dev/null +++ b/adapters/saved-cot7b/checkpoint-1400/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:16fdfc03b58220402968eacaac23fb5471cdb9061302380bd3c8d4d326c02ade +size 557 diff --git a/adapters/saved-cot7b/checkpoint-1400/scheduler.pt b/adapters/saved-cot7b/checkpoint-1400/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..6b007b536a9b59bc187993003b868075d6d17c1c --- /dev/null +++ b/adapters/saved-cot7b/checkpoint-1400/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4a279ad10b61c25dda4225bc276631f1eb7d13ce5eeb3f91bab74dbfcedbc840 +size 627 diff --git a/adapters/saved-cot7b/checkpoint-1400/trainer_state.json b/adapters/saved-cot7b/checkpoint-1400/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..dbd640d1a6573ccdd0c850ee6a3ca7046de5a58e --- /dev/null +++ b/adapters/saved-cot7b/checkpoint-1400/trainer_state.json @@ -0,0 +1,492 @@ +{ + "best_metric": 0.7009322643280029, + "best_model_checkpoint": "/mnt/bn/qingyi-bn-lq/llama/saved-cot7b/checkpoint-1400", + "epoch": 2.4620795779292153, + "global_step": 1400, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.04, + "learning_rate": 5.9999999999999995e-05, + "loss": 2.3604, + "step": 20 + }, + { + "epoch": 0.07, + "learning_rate": 0.00011999999999999999, + "loss": 1.944, + "step": 40 + }, + { + "epoch": 0.11, + "learning_rate": 0.00017999999999999998, + "loss": 1.2746, + "step": 60 + }, + { + "epoch": 0.14, + "learning_rate": 0.00023999999999999998, + "loss": 0.9818, + "step": 80 + }, + { + "epoch": 0.18, + "learning_rate": 0.0003, + "loss": 0.863, + "step": 100 + }, + { + "epoch": 0.21, + "learning_rate": 0.0002962593516209476, + "loss": 0.8083, + "step": 120 + }, + { + "epoch": 0.25, + "learning_rate": 0.00029251870324189524, + "loss": 0.7923, + "step": 140 + }, + { + "epoch": 0.28, + "learning_rate": 0.00028877805486284284, + "loss": 0.7623, + "step": 160 + }, + { + "epoch": 0.32, + "learning_rate": 0.0002850374064837905, + "loss": 0.7665, + "step": 180 + }, + { + "epoch": 0.35, + "learning_rate": 0.0002812967581047381, + "loss": 0.7554, + "step": 200 + }, + { + "epoch": 0.35, + "eval_loss": 0.7609586715698242, + "eval_runtime": 32.1749, + "eval_samples_per_second": 62.16, + "eval_steps_per_second": 1.958, + "step": 200 + }, + { + "epoch": 0.39, + "learning_rate": 0.00027755610972568577, + "loss": 0.7353, + "step": 220 + }, + { + "epoch": 0.42, + "learning_rate": 0.0002738154613466334, + "loss": 0.7354, + "step": 240 + }, + { + "epoch": 0.46, + "learning_rate": 0.00027007481296758103, + "loss": 0.7435, + "step": 260 + }, + { + "epoch": 0.49, + "learning_rate": 0.00026633416458852864, + "loss": 0.7373, + "step": 280 + }, + { + "epoch": 0.53, + "learning_rate": 0.0002625935162094763, + "loss": 0.7332, + "step": 300 + }, + { + "epoch": 0.56, + "learning_rate": 0.0002588528678304239, + "loss": 0.7265, + "step": 320 + }, + { + "epoch": 0.6, + "learning_rate": 0.00025511221945137156, + "loss": 0.7274, + "step": 340 + }, + { + "epoch": 0.63, + "learning_rate": 0.00025137157107231917, + "loss": 0.7279, + "step": 360 + }, + { + "epoch": 0.67, + "learning_rate": 0.00024763092269326683, + "loss": 0.7231, + "step": 380 + }, + { + "epoch": 0.7, + "learning_rate": 0.00024389027431421443, + "loss": 0.7238, + "step": 400 + }, + { + "epoch": 0.7, + "eval_loss": 0.7300755381584167, + "eval_runtime": 32.1181, + "eval_samples_per_second": 62.27, + "eval_steps_per_second": 1.962, + "step": 400 + }, + { + "epoch": 0.74, + "learning_rate": 0.00024014962593516207, + "loss": 0.7161, + "step": 420 + }, + { + "epoch": 0.77, + "learning_rate": 0.0002364089775561097, + "loss": 0.7146, + "step": 440 + }, + { + "epoch": 0.81, + "learning_rate": 0.00023266832917705733, + "loss": 0.7152, + "step": 460 + }, + { + "epoch": 0.84, + "learning_rate": 0.00022892768079800496, + "loss": 0.7202, + "step": 480 + }, + { + "epoch": 0.88, + "learning_rate": 0.0002251870324189526, + "loss": 0.7142, + "step": 500 + }, + { + "epoch": 0.91, + "learning_rate": 0.00022144638403990023, + "loss": 0.7146, + "step": 520 + }, + { + "epoch": 0.95, + "learning_rate": 0.00021770573566084786, + "loss": 0.7104, + "step": 540 + }, + { + "epoch": 0.98, + "learning_rate": 0.0002139650872817955, + "loss": 0.7034, + "step": 560 + }, + { + "epoch": 1.02, + "learning_rate": 0.00021022443890274313, + "loss": 0.7153, + "step": 580 + }, + { + "epoch": 1.06, + "learning_rate": 0.00020648379052369076, + "loss": 0.7052, + "step": 600 + }, + { + "epoch": 1.06, + "eval_loss": 0.7185753583908081, + "eval_runtime": 32.4703, + "eval_samples_per_second": 61.595, + "eval_steps_per_second": 1.94, + "step": 600 + }, + { + "epoch": 1.09, + "learning_rate": 0.0002027431421446384, + "loss": 0.7061, + "step": 620 + }, + { + "epoch": 1.13, + "learning_rate": 0.00019900249376558603, + "loss": 0.7096, + "step": 640 + }, + { + "epoch": 1.16, + "learning_rate": 0.00019526184538653366, + "loss": 0.7065, + "step": 660 + }, + { + "epoch": 1.2, + "learning_rate": 0.00019152119700748126, + "loss": 0.7046, + "step": 680 + }, + { + "epoch": 1.23, + "learning_rate": 0.0001877805486284289, + "loss": 0.701, + "step": 700 + }, + { + "epoch": 1.27, + "learning_rate": 0.00018403990024937653, + "loss": 0.6922, + "step": 720 + }, + { + "epoch": 1.3, + "learning_rate": 0.00018029925187032416, + "loss": 0.6982, + "step": 740 + }, + { + "epoch": 1.34, + "learning_rate": 0.0001765586034912718, + "loss": 0.6993, + "step": 760 + }, + { + "epoch": 1.37, + "learning_rate": 0.00017281795511221943, + "loss": 0.6922, + "step": 780 + }, + { + "epoch": 1.41, + "learning_rate": 0.00016907730673316706, + "loss": 0.6989, + "step": 800 + }, + { + "epoch": 1.41, + "eval_loss": 0.7114558219909668, + "eval_runtime": 32.0158, + "eval_samples_per_second": 62.469, + "eval_steps_per_second": 1.968, + "step": 800 + }, + { + "epoch": 1.44, + "learning_rate": 0.0001653366583541147, + "loss": 0.6964, + "step": 820 + }, + { + "epoch": 1.48, + "learning_rate": 0.00016159600997506232, + "loss": 0.6969, + "step": 840 + }, + { + "epoch": 1.51, + "learning_rate": 0.00015785536159600996, + "loss": 0.6982, + "step": 860 + }, + { + "epoch": 1.55, + "learning_rate": 0.0001541147132169576, + "loss": 0.6977, + "step": 880 + }, + { + "epoch": 1.58, + "learning_rate": 0.00015037406483790522, + "loss": 0.7019, + "step": 900 + }, + { + "epoch": 1.62, + "learning_rate": 0.00014663341645885285, + "loss": 0.6963, + "step": 920 + }, + { + "epoch": 1.65, + "learning_rate": 0.00014289276807980049, + "loss": 0.7006, + "step": 940 + }, + { + "epoch": 1.69, + "learning_rate": 0.00013915211970074812, + "loss": 0.6935, + "step": 960 + }, + { + "epoch": 1.72, + "learning_rate": 0.00013541147132169575, + "loss": 0.6846, + "step": 980 + }, + { + "epoch": 1.76, + "learning_rate": 0.00013167082294264338, + "loss": 0.701, + "step": 1000 + }, + { + "epoch": 1.76, + "eval_loss": 0.7069133520126343, + "eval_runtime": 31.9943, + "eval_samples_per_second": 62.511, + "eval_steps_per_second": 1.969, + "step": 1000 + }, + { + "epoch": 1.79, + "learning_rate": 0.00012793017456359102, + "loss": 0.6896, + "step": 1020 + }, + { + "epoch": 1.83, + "learning_rate": 0.00012418952618453862, + "loss": 0.702, + "step": 1040 + }, + { + "epoch": 1.86, + "learning_rate": 0.00012044887780548627, + "loss": 0.6952, + "step": 1060 + }, + { + "epoch": 1.9, + "learning_rate": 0.0001167082294264339, + "loss": 0.6902, + "step": 1080 + }, + { + "epoch": 1.93, + "learning_rate": 0.00011296758104738153, + "loss": 0.6866, + "step": 1100 + }, + { + "epoch": 1.97, + "learning_rate": 0.00010922693266832918, + "loss": 0.6929, + "step": 1120 + }, + { + "epoch": 2.0, + "learning_rate": 0.00010548628428927681, + "loss": 0.6846, + "step": 1140 + }, + { + "epoch": 2.04, + "learning_rate": 0.00010174563591022444, + "loss": 0.6944, + "step": 1160 + }, + { + "epoch": 2.08, + "learning_rate": 9.800498753117206e-05, + "loss": 0.6868, + "step": 1180 + }, + { + "epoch": 2.11, + "learning_rate": 9.42643391521197e-05, + "loss": 0.6938, + "step": 1200 + }, + { + "epoch": 2.11, + "eval_loss": 0.7033773064613342, + "eval_runtime": 32.1557, + "eval_samples_per_second": 62.197, + "eval_steps_per_second": 1.959, + "step": 1200 + }, + { + "epoch": 2.15, + "learning_rate": 9.052369077306733e-05, + "loss": 0.6868, + "step": 1220 + }, + { + "epoch": 2.18, + "learning_rate": 8.678304239401496e-05, + "loss": 0.6795, + "step": 1240 + }, + { + "epoch": 2.22, + "learning_rate": 8.304239401496259e-05, + "loss": 0.6887, + "step": 1260 + }, + { + "epoch": 2.25, + "learning_rate": 7.930174563591023e-05, + "loss": 0.6795, + "step": 1280 + }, + { + "epoch": 2.29, + "learning_rate": 7.556109725685786e-05, + "loss": 0.6934, + "step": 1300 + }, + { + "epoch": 2.32, + "learning_rate": 7.182044887780548e-05, + "loss": 0.6905, + "step": 1320 + }, + { + "epoch": 2.36, + "learning_rate": 6.807980049875311e-05, + "loss": 0.685, + "step": 1340 + }, + { + "epoch": 2.39, + "learning_rate": 6.433915211970074e-05, + "loss": 0.6887, + "step": 1360 + }, + { + "epoch": 2.43, + "learning_rate": 6.0598503740648375e-05, + "loss": 0.6875, + "step": 1380 + }, + { + "epoch": 2.46, + "learning_rate": 5.6857855361596e-05, + "loss": 0.6807, + "step": 1400 + }, + { + "epoch": 2.46, + "eval_loss": 0.7009322643280029, + "eval_runtime": 32.0972, + "eval_samples_per_second": 62.311, + "eval_steps_per_second": 1.963, + "step": 1400 + } + ], + "max_steps": 1704, + "num_train_epochs": 3, + "total_flos": 3.6391801813666365e+18, + "trial_name": null, + "trial_params": null +} diff --git a/adapters/saved-cot7b/checkpoint-1400/training_args.bin b/adapters/saved-cot7b/checkpoint-1400/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..378e9372689af61e38c0f92507c56ae97d993c2c --- /dev/null +++ b/adapters/saved-cot7b/checkpoint-1400/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:86cc090bd15d462669757ec33f893d72d88487a15309964597325653a9952413 +size 3579 diff --git a/adapters/saved-cot7b/checkpoint-1600/optimizer.pt b/adapters/saved-cot7b/checkpoint-1600/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..1a5812dff5cc859dc51ac6aa1db5fc33540fae45 --- /dev/null +++ b/adapters/saved-cot7b/checkpoint-1600/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ab091692d68abec89288f769109cea7c420f93402ed0b8492bae7d160166aed4 +size 33629893 diff --git a/adapters/saved-cot7b/checkpoint-1600/pytorch_model.bin b/adapters/saved-cot7b/checkpoint-1600/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..05e74f8f320d6dc204319ecfe3dba6da875d2953 --- /dev/null +++ b/adapters/saved-cot7b/checkpoint-1600/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6d12b2e3dfa0c7c6bc11c3f49deec72efb614c3ba13560982968c23846b9d211 +size 16822989 diff --git a/adapters/saved-cot7b/checkpoint-1600/rng_state_0.pth b/adapters/saved-cot7b/checkpoint-1600/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..a7fc1e1cca86710bf2aa781cc84c1032324a1c7b --- /dev/null +++ b/adapters/saved-cot7b/checkpoint-1600/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4a639c04ca33263949db8d2fe5b757fff593090aa7946aff72281e3aa30cba99 +size 14583 diff --git a/adapters/saved-cot7b/checkpoint-1600/rng_state_1.pth b/adapters/saved-cot7b/checkpoint-1600/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..d5d0e6ac85b8163a41da7bae67602767778cfa55 --- /dev/null +++ b/adapters/saved-cot7b/checkpoint-1600/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2458696a94fe9d9303b7a31ed3dbd91dedf4dcceb6f9f86d50c0966818fea051 +size 14583 diff --git a/adapters/saved-cot7b/checkpoint-1600/rng_state_2.pth b/adapters/saved-cot7b/checkpoint-1600/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..f176eb0e888a9fd61bdb95aade2945cc1383bbd2 --- /dev/null +++ b/adapters/saved-cot7b/checkpoint-1600/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7c69b0f03d4454a0a26186fd36cfda693f4b63c0703cd6141bc35262cdfa7599 +size 14583 diff --git a/adapters/saved-cot7b/checkpoint-1600/rng_state_3.pth b/adapters/saved-cot7b/checkpoint-1600/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..2bb819f70d16c15775244c6455cd84bac7fa51cf --- /dev/null +++ b/adapters/saved-cot7b/checkpoint-1600/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d790337b66ef5386d77b69a45b6cd084d1511fb13ed374e8d59f28f07630dfa4 +size 14583 diff --git a/adapters/saved-cot7b/checkpoint-1600/scaler.pt b/adapters/saved-cot7b/checkpoint-1600/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..2f03c4ce6fd4620d95ae66b6787fdbcf34c6622a --- /dev/null +++ b/adapters/saved-cot7b/checkpoint-1600/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cbbb3b0bb9c64d37bf898d0431b3ed1f0a6f5c9d8c2b563e0f884424fb8bd92a +size 557 diff --git a/adapters/saved-cot7b/checkpoint-1600/scheduler.pt b/adapters/saved-cot7b/checkpoint-1600/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..3b3b0de2ba6a25f0b515e1e982ad296ec913f20a --- /dev/null +++ b/adapters/saved-cot7b/checkpoint-1600/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d00552de5f1ce2fb49ddc5a66e1fdc291d32e57777eb935e72943e293537bc85 +size 627 diff --git a/adapters/saved-cot7b/checkpoint-1600/trainer_state.json b/adapters/saved-cot7b/checkpoint-1600/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..506d6a6506356438396e183bc05d013304cae32e --- /dev/null +++ b/adapters/saved-cot7b/checkpoint-1600/trainer_state.json @@ -0,0 +1,560 @@ +{ + "best_metric": 0.6989061236381531, + "best_model_checkpoint": "/mnt/bn/qingyi-bn-lq/llama/saved-cot7b/checkpoint-1600", + "epoch": 2.813805231919103, + "global_step": 1600, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.04, + "learning_rate": 5.9999999999999995e-05, + "loss": 2.3604, + "step": 20 + }, + { + "epoch": 0.07, + "learning_rate": 0.00011999999999999999, + "loss": 1.944, + "step": 40 + }, + { + "epoch": 0.11, + "learning_rate": 0.00017999999999999998, + "loss": 1.2746, + "step": 60 + }, + { + "epoch": 0.14, + "learning_rate": 0.00023999999999999998, + "loss": 0.9818, + "step": 80 + }, + { + "epoch": 0.18, + "learning_rate": 0.0003, + "loss": 0.863, + "step": 100 + }, + { + "epoch": 0.21, + "learning_rate": 0.0002962593516209476, + "loss": 0.8083, + "step": 120 + }, + { + "epoch": 0.25, + "learning_rate": 0.00029251870324189524, + "loss": 0.7923, + "step": 140 + }, + { + "epoch": 0.28, + "learning_rate": 0.00028877805486284284, + "loss": 0.7623, + "step": 160 + }, + { + "epoch": 0.32, + "learning_rate": 0.0002850374064837905, + "loss": 0.7665, + "step": 180 + }, + { + "epoch": 0.35, + "learning_rate": 0.0002812967581047381, + "loss": 0.7554, + "step": 200 + }, + { + "epoch": 0.35, + "eval_loss": 0.7609586715698242, + "eval_runtime": 32.1749, + "eval_samples_per_second": 62.16, + "eval_steps_per_second": 1.958, + "step": 200 + }, + { + "epoch": 0.39, + "learning_rate": 0.00027755610972568577, + "loss": 0.7353, + "step": 220 + }, + { + "epoch": 0.42, + "learning_rate": 0.0002738154613466334, + "loss": 0.7354, + "step": 240 + }, + { + "epoch": 0.46, + "learning_rate": 0.00027007481296758103, + "loss": 0.7435, + "step": 260 + }, + { + "epoch": 0.49, + "learning_rate": 0.00026633416458852864, + "loss": 0.7373, + "step": 280 + }, + { + "epoch": 0.53, + "learning_rate": 0.0002625935162094763, + "loss": 0.7332, + "step": 300 + }, + { + "epoch": 0.56, + "learning_rate": 0.0002588528678304239, + "loss": 0.7265, + "step": 320 + }, + { + "epoch": 0.6, + "learning_rate": 0.00025511221945137156, + "loss": 0.7274, + "step": 340 + }, + { + "epoch": 0.63, + "learning_rate": 0.00025137157107231917, + "loss": 0.7279, + "step": 360 + }, + { + "epoch": 0.67, + "learning_rate": 0.00024763092269326683, + "loss": 0.7231, + "step": 380 + }, + { + "epoch": 0.7, + "learning_rate": 0.00024389027431421443, + "loss": 0.7238, + "step": 400 + }, + { + "epoch": 0.7, + "eval_loss": 0.7300755381584167, + "eval_runtime": 32.1181, + "eval_samples_per_second": 62.27, + "eval_steps_per_second": 1.962, + "step": 400 + }, + { + "epoch": 0.74, + "learning_rate": 0.00024014962593516207, + "loss": 0.7161, + "step": 420 + }, + { + "epoch": 0.77, + "learning_rate": 0.0002364089775561097, + "loss": 0.7146, + "step": 440 + }, + { + "epoch": 0.81, + "learning_rate": 0.00023266832917705733, + "loss": 0.7152, + "step": 460 + }, + { + "epoch": 0.84, + "learning_rate": 0.00022892768079800496, + "loss": 0.7202, + "step": 480 + }, + { + "epoch": 0.88, + "learning_rate": 0.0002251870324189526, + "loss": 0.7142, + "step": 500 + }, + { + "epoch": 0.91, + "learning_rate": 0.00022144638403990023, + "loss": 0.7146, + "step": 520 + }, + { + "epoch": 0.95, + "learning_rate": 0.00021770573566084786, + "loss": 0.7104, + "step": 540 + }, + { + "epoch": 0.98, + "learning_rate": 0.0002139650872817955, + "loss": 0.7034, + "step": 560 + }, + { + "epoch": 1.02, + "learning_rate": 0.00021022443890274313, + "loss": 0.7153, + "step": 580 + }, + { + "epoch": 1.06, + "learning_rate": 0.00020648379052369076, + "loss": 0.7052, + "step": 600 + }, + { + "epoch": 1.06, + "eval_loss": 0.7185753583908081, + "eval_runtime": 32.4703, + "eval_samples_per_second": 61.595, + "eval_steps_per_second": 1.94, + "step": 600 + }, + { + "epoch": 1.09, + "learning_rate": 0.0002027431421446384, + "loss": 0.7061, + "step": 620 + }, + { + "epoch": 1.13, + "learning_rate": 0.00019900249376558603, + "loss": 0.7096, + "step": 640 + }, + { + "epoch": 1.16, + "learning_rate": 0.00019526184538653366, + "loss": 0.7065, + "step": 660 + }, + { + "epoch": 1.2, + "learning_rate": 0.00019152119700748126, + "loss": 0.7046, + "step": 680 + }, + { + "epoch": 1.23, + "learning_rate": 0.0001877805486284289, + "loss": 0.701, + "step": 700 + }, + { + "epoch": 1.27, + "learning_rate": 0.00018403990024937653, + "loss": 0.6922, + "step": 720 + }, + { + "epoch": 1.3, + "learning_rate": 0.00018029925187032416, + "loss": 0.6982, + "step": 740 + }, + { + "epoch": 1.34, + "learning_rate": 0.0001765586034912718, + "loss": 0.6993, + "step": 760 + }, + { + "epoch": 1.37, + "learning_rate": 0.00017281795511221943, + "loss": 0.6922, + "step": 780 + }, + { + "epoch": 1.41, + "learning_rate": 0.00016907730673316706, + "loss": 0.6989, + "step": 800 + }, + { + "epoch": 1.41, + "eval_loss": 0.7114558219909668, + "eval_runtime": 32.0158, + "eval_samples_per_second": 62.469, + "eval_steps_per_second": 1.968, + "step": 800 + }, + { + "epoch": 1.44, + "learning_rate": 0.0001653366583541147, + "loss": 0.6964, + "step": 820 + }, + { + "epoch": 1.48, + "learning_rate": 0.00016159600997506232, + "loss": 0.6969, + "step": 840 + }, + { + "epoch": 1.51, + "learning_rate": 0.00015785536159600996, + "loss": 0.6982, + "step": 860 + }, + { + "epoch": 1.55, + "learning_rate": 0.0001541147132169576, + "loss": 0.6977, + "step": 880 + }, + { + "epoch": 1.58, + "learning_rate": 0.00015037406483790522, + "loss": 0.7019, + "step": 900 + }, + { + "epoch": 1.62, + "learning_rate": 0.00014663341645885285, + "loss": 0.6963, + "step": 920 + }, + { + "epoch": 1.65, + "learning_rate": 0.00014289276807980049, + "loss": 0.7006, + "step": 940 + }, + { + "epoch": 1.69, + "learning_rate": 0.00013915211970074812, + "loss": 0.6935, + "step": 960 + }, + { + "epoch": 1.72, + "learning_rate": 0.00013541147132169575, + "loss": 0.6846, + "step": 980 + }, + { + "epoch": 1.76, + "learning_rate": 0.00013167082294264338, + "loss": 0.701, + "step": 1000 + }, + { + "epoch": 1.76, + "eval_loss": 0.7069133520126343, + "eval_runtime": 31.9943, + "eval_samples_per_second": 62.511, + "eval_steps_per_second": 1.969, + "step": 1000 + }, + { + "epoch": 1.79, + "learning_rate": 0.00012793017456359102, + "loss": 0.6896, + "step": 1020 + }, + { + "epoch": 1.83, + "learning_rate": 0.00012418952618453862, + "loss": 0.702, + "step": 1040 + }, + { + "epoch": 1.86, + "learning_rate": 0.00012044887780548627, + "loss": 0.6952, + "step": 1060 + }, + { + "epoch": 1.9, + "learning_rate": 0.0001167082294264339, + "loss": 0.6902, + "step": 1080 + }, + { + "epoch": 1.93, + "learning_rate": 0.00011296758104738153, + "loss": 0.6866, + "step": 1100 + }, + { + "epoch": 1.97, + "learning_rate": 0.00010922693266832918, + "loss": 0.6929, + "step": 1120 + }, + { + "epoch": 2.0, + "learning_rate": 0.00010548628428927681, + "loss": 0.6846, + "step": 1140 + }, + { + "epoch": 2.04, + "learning_rate": 0.00010174563591022444, + "loss": 0.6944, + "step": 1160 + }, + { + "epoch": 2.08, + "learning_rate": 9.800498753117206e-05, + "loss": 0.6868, + "step": 1180 + }, + { + "epoch": 2.11, + "learning_rate": 9.42643391521197e-05, + "loss": 0.6938, + "step": 1200 + }, + { + "epoch": 2.11, + "eval_loss": 0.7033773064613342, + "eval_runtime": 32.1557, + "eval_samples_per_second": 62.197, + "eval_steps_per_second": 1.959, + "step": 1200 + }, + { + "epoch": 2.15, + "learning_rate": 9.052369077306733e-05, + "loss": 0.6868, + "step": 1220 + }, + { + "epoch": 2.18, + "learning_rate": 8.678304239401496e-05, + "loss": 0.6795, + "step": 1240 + }, + { + "epoch": 2.22, + "learning_rate": 8.304239401496259e-05, + "loss": 0.6887, + "step": 1260 + }, + { + "epoch": 2.25, + "learning_rate": 7.930174563591023e-05, + "loss": 0.6795, + "step": 1280 + }, + { + "epoch": 2.29, + "learning_rate": 7.556109725685786e-05, + "loss": 0.6934, + "step": 1300 + }, + { + "epoch": 2.32, + "learning_rate": 7.182044887780548e-05, + "loss": 0.6905, + "step": 1320 + }, + { + "epoch": 2.36, + "learning_rate": 6.807980049875311e-05, + "loss": 0.685, + "step": 1340 + }, + { + "epoch": 2.39, + "learning_rate": 6.433915211970074e-05, + "loss": 0.6887, + "step": 1360 + }, + { + "epoch": 2.43, + "learning_rate": 6.0598503740648375e-05, + "loss": 0.6875, + "step": 1380 + }, + { + "epoch": 2.46, + "learning_rate": 5.6857855361596e-05, + "loss": 0.6807, + "step": 1400 + }, + { + "epoch": 2.46, + "eval_loss": 0.7009322643280029, + "eval_runtime": 32.0972, + "eval_samples_per_second": 62.311, + "eval_steps_per_second": 1.963, + "step": 1400 + }, + { + "epoch": 2.5, + "learning_rate": 5.311720698254363e-05, + "loss": 0.6743, + "step": 1420 + }, + { + "epoch": 2.53, + "learning_rate": 4.9376558603491265e-05, + "loss": 0.6872, + "step": 1440 + }, + { + "epoch": 2.57, + "learning_rate": 4.56359102244389e-05, + "loss": 0.6776, + "step": 1460 + }, + { + "epoch": 2.6, + "learning_rate": 4.1895261845386524e-05, + "loss": 0.6769, + "step": 1480 + }, + { + "epoch": 2.64, + "learning_rate": 3.8154613466334156e-05, + "loss": 0.68, + "step": 1500 + }, + { + "epoch": 2.67, + "learning_rate": 3.4413965087281796e-05, + "loss": 0.6804, + "step": 1520 + }, + { + "epoch": 2.71, + "learning_rate": 3.067331670822942e-05, + "loss": 0.6848, + "step": 1540 + }, + { + "epoch": 2.74, + "learning_rate": 2.6932668329177054e-05, + "loss": 0.6825, + "step": 1560 + }, + { + "epoch": 2.78, + "learning_rate": 2.3192019950124686e-05, + "loss": 0.6899, + "step": 1580 + }, + { + "epoch": 2.81, + "learning_rate": 1.945137157107232e-05, + "loss": 0.6775, + "step": 1600 + }, + { + "epoch": 2.81, + "eval_loss": 0.6989061236381531, + "eval_runtime": 32.1279, + "eval_samples_per_second": 62.251, + "eval_steps_per_second": 1.961, + "step": 1600 + } + ], + "max_steps": 1704, + "num_train_epochs": 3, + "total_flos": 4.159132701791617e+18, + "trial_name": null, + "trial_params": null +} diff --git a/adapters/saved-cot7b/checkpoint-1600/training_args.bin b/adapters/saved-cot7b/checkpoint-1600/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..378e9372689af61e38c0f92507c56ae97d993c2c --- /dev/null +++ b/adapters/saved-cot7b/checkpoint-1600/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:86cc090bd15d462669757ec33f893d72d88487a15309964597325653a9952413 +size 3579 diff --git a/adapters/saved_bloom100lines/adapter_config.json b/adapters/saved_bloom100lines/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..896c4cf962709fe6c09e7461f3422660563f6ea4 --- /dev/null +++ b/adapters/saved_bloom100lines/adapter_config.json @@ -0,0 +1,22 @@ +{ + "base_model_name_or_path": "bigscience/bloomz-7b1-mt", + "bias": "none", + "enable_lora": [ + true, + false, + true + ], + "fan_in_fan_out": true, + "inference_mode": true, + "init_lora_weights": true, + "lora_alpha": 16, + "lora_dropout": 0.05, + "merge_weights": false, + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "target_modules": [ + "query_key_value" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/adapters/saved_bloom100lines/adapter_model.bin b/adapters/saved_bloom100lines/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..d683f65c23923077052dda2516d53a19df936176 --- /dev/null +++ b/adapters/saved_bloom100lines/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3626ccfddd5a457dcea73628930225ff7b6e3fa8f49b092373e6bf695951b600 +size 15751077 diff --git a/adapters/saved_bloom7b_all_chinese_noxP3/adapter_config.json b/adapters/saved_bloom7b_all_chinese_noxP3/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..6d8b850b4ee64259ab7f5c0408ccbe0aadb878de --- /dev/null +++ b/adapters/saved_bloom7b_all_chinese_noxP3/adapter_config.json @@ -0,0 +1,16 @@ +{ + "base_model_name_or_path": "/mnt/bn/qingyi-bn-lq/llama/bloomz-7b1-mt", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "lora_alpha": 16, + "lora_dropout": 0.05, + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "target_modules": [ + "query_key_value" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/adapters/saved_bloom7b_all_chinese_noxP3/adapter_model.bin b/adapters/saved_bloom7b_all_chinese_noxP3/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..e2e600e40636e4f6451036edded33061ebb0980b --- /dev/null +++ b/adapters/saved_bloom7b_all_chinese_noxP3/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b86e66ab04e9251479e46581ccef7633f49dd4ea0e3bb1bbc0265c486ae32998 +size 15750885 diff --git a/adapters/saved_bloom_instinwild_cn-belle1.5m/adapter_config.json b/adapters/saved_bloom_instinwild_cn-belle1.5m/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..896c4cf962709fe6c09e7461f3422660563f6ea4 --- /dev/null +++ b/adapters/saved_bloom_instinwild_cn-belle1.5m/adapter_config.json @@ -0,0 +1,22 @@ +{ + "base_model_name_or_path": "bigscience/bloomz-7b1-mt", + "bias": "none", + "enable_lora": [ + true, + false, + true + ], + "fan_in_fan_out": true, + "inference_mode": true, + "init_lora_weights": true, + "lora_alpha": 16, + "lora_dropout": 0.05, + "merge_weights": false, + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "target_modules": [ + "query_key_value" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/adapters/saved_bloom_instinwild_cn-belle1.5m/adapter_model.bin b/adapters/saved_bloom_instinwild_cn-belle1.5m/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..a8cc594f483e07a5694f77054d35027183a28f37 --- /dev/null +++ b/adapters/saved_bloom_instinwild_cn-belle1.5m/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0edd13152d5b395595cfcae1afb7e0a435aeb05f7e18db05a803737bbf6cc942 +size 15751077 diff --git a/adapters/saved_bloom_instinwild_cn-belle1.5m/checkpoint-1000/optimizer.pt b/adapters/saved_bloom_instinwild_cn-belle1.5m/checkpoint-1000/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..2d1ab00275d6b76e3af22a38a06945d4677a5c32 --- /dev/null +++ b/adapters/saved_bloom_instinwild_cn-belle1.5m/checkpoint-1000/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c0e7695fd0c28fc4fde59922e6ecf16a72ea4970be64b4b1eef7c3d41450369 +size 31493061 diff --git a/adapters/saved_bloom_instinwild_cn-belle1.5m/checkpoint-1000/pytorch_model.bin b/adapters/saved_bloom_instinwild_cn-belle1.5m/checkpoint-1000/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..e3e3bf7a5f423adbe1263772435e4e2b95fb2e21 --- /dev/null +++ b/adapters/saved_bloom_instinwild_cn-belle1.5m/checkpoint-1000/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7e9a63f91c792ac483495039a469a90f0f7b86ff742f95c2b08179a6073b5bdd +size 15751077 diff --git a/adapters/saved_bloom_instinwild_cn-belle1.5m/checkpoint-1000/rng_state_0.pth b/adapters/saved_bloom_instinwild_cn-belle1.5m/checkpoint-1000/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..ef29ed49ae99bdf3a84a9f1b20b1a711ec7c6897 --- /dev/null +++ b/adapters/saved_bloom_instinwild_cn-belle1.5m/checkpoint-1000/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:094e3e5512f5f1b86da86aede1c31aea6de85e0f0111f5a9d5addac277e6d818 +size 14583 diff --git a/adapters/saved_bloom_instinwild_cn-belle1.5m/checkpoint-1000/rng_state_1.pth b/adapters/saved_bloom_instinwild_cn-belle1.5m/checkpoint-1000/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..5e6edfe7329a97f9d7e18bd40d684a6b702bfc34 --- /dev/null +++ b/adapters/saved_bloom_instinwild_cn-belle1.5m/checkpoint-1000/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:185e3d7b25db051ab5979637143199360d3f9acfd8ebabe849f760cf4a84b544 +size 14583 diff --git a/adapters/saved_bloom_instinwild_cn-belle1.5m/checkpoint-1000/rng_state_2.pth b/adapters/saved_bloom_instinwild_cn-belle1.5m/checkpoint-1000/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..bf496205b84937cc8decedf933ed61755dcf330a --- /dev/null +++ b/adapters/saved_bloom_instinwild_cn-belle1.5m/checkpoint-1000/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d3c0cef4a79ddcec81aa8bdbc6a26759f6813c9534f0598725274a2e5cc7e92d +size 14583 diff --git a/adapters/saved_bloom_instinwild_cn-belle1.5m/checkpoint-1000/rng_state_3.pth b/adapters/saved_bloom_instinwild_cn-belle1.5m/checkpoint-1000/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..8c8b0583aa48a1a0cf26ada8e7ec15b456458118 --- /dev/null +++ b/adapters/saved_bloom_instinwild_cn-belle1.5m/checkpoint-1000/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fec9045f49c0d8ca9d9e08d06b06a2dd7c6356f92ba1b8bfeea52bb455dcadcb +size 14583 diff --git a/adapters/saved_bloom_instinwild_cn-belle1.5m/checkpoint-1000/rng_state_4.pth b/adapters/saved_bloom_instinwild_cn-belle1.5m/checkpoint-1000/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..34cc372808ee77be6903602e770bac939f7b5436 --- /dev/null +++ b/adapters/saved_bloom_instinwild_cn-belle1.5m/checkpoint-1000/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:94c8c90734065bf8f46a34fa70910b738147e6b60ffed80435361a44789c54de +size 14583 diff --git a/adapters/saved_bloom_instinwild_cn-belle1.5m/checkpoint-1000/rng_state_5.pth b/adapters/saved_bloom_instinwild_cn-belle1.5m/checkpoint-1000/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..ff7484ac0ad18793e5039296dde22f21b61bf591 --- /dev/null +++ b/adapters/saved_bloom_instinwild_cn-belle1.5m/checkpoint-1000/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8ab2aebd09862fe30303d7a3cc1325eba2f4ace809bf925e01c79c989f7c2001 +size 14583 diff --git a/adapters/saved_bloom_instinwild_cn-belle1.5m/checkpoint-1000/rng_state_6.pth b/adapters/saved_bloom_instinwild_cn-belle1.5m/checkpoint-1000/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..97931a1e80b0af8865d7095e3d423c19fc898551 --- /dev/null +++ b/adapters/saved_bloom_instinwild_cn-belle1.5m/checkpoint-1000/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bd3493ca1b614961f8c87cae19b6bf02f1a49ab5ecdf60baea725c43c912d826 +size 14583 diff --git a/adapters/saved_bloom_instinwild_cn-belle1.5m/checkpoint-1000/rng_state_7.pth b/adapters/saved_bloom_instinwild_cn-belle1.5m/checkpoint-1000/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..f42c6dbd4f3764c4983fa41e4423c38696512154 --- /dev/null +++ b/adapters/saved_bloom_instinwild_cn-belle1.5m/checkpoint-1000/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1c3af9501fd6b94ddb0ff61b9cf2deeec7ec758905cfce92625dd99efb4495fe +size 14583 diff --git a/adapters/saved_bloom_instinwild_cn-belle1.5m/checkpoint-1000/scaler.pt b/adapters/saved_bloom_instinwild_cn-belle1.5m/checkpoint-1000/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..efdbd3c795f6b0d4144e68355e99c220ccdedd09 --- /dev/null +++ b/adapters/saved_bloom_instinwild_cn-belle1.5m/checkpoint-1000/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:68cff80b680ddf6e7abbef98b5f336b97f9b5963e2209307f639383870e8cc71 +size 557 diff --git a/adapters/saved_bloom_instinwild_cn-belle1.5m/checkpoint-1000/scheduler.pt b/adapters/saved_bloom_instinwild_cn-belle1.5m/checkpoint-1000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..e25fb18fc0c5ea3b4957fcb4f487b7029e7f5ac8 --- /dev/null +++ b/adapters/saved_bloom_instinwild_cn-belle1.5m/checkpoint-1000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:34b2aa5e11900f68efa398098e735923f7de6d9ebf32fe96d553bbcfb561151a +size 627 diff --git a/adapters/saved_bloom_instinwild_cn-belle1.5m/checkpoint-1000/trainer_state.json b/adapters/saved_bloom_instinwild_cn-belle1.5m/checkpoint-1000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..21730bd3f208b9b7c9f157b4520bd26b81bff32f --- /dev/null +++ b/adapters/saved_bloom_instinwild_cn-belle1.5m/checkpoint-1000/trainer_state.json @@ -0,0 +1,356 @@ +{ + "best_metric": 1.6601147651672363, + "best_model_checkpoint": "/mnt/bn/qingyi-bn-lq/llama/saved_bloominstinwild-belle1.5m/checkpoint-1000", + "epoch": 0.6428801028608164, + "global_step": 1000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.01, + "learning_rate": 5.9999999999999995e-05, + "loss": 2.3902, + "step": 20 + }, + { + "epoch": 0.03, + "learning_rate": 0.00011999999999999999, + "loss": 2.1808, + "step": 40 + }, + { + "epoch": 0.04, + "learning_rate": 0.00017999999999999998, + "loss": 1.9937, + "step": 60 + }, + { + "epoch": 0.05, + "learning_rate": 0.00023999999999999998, + "loss": 1.8967, + "step": 80 + }, + { + "epoch": 0.06, + "learning_rate": 0.0003, + "loss": 1.8529, + "step": 100 + }, + { + "epoch": 0.08, + "learning_rate": 0.0002958762886597938, + "loss": 1.8004, + "step": 120 + }, + { + "epoch": 0.09, + "learning_rate": 0.0002917525773195876, + "loss": 1.785, + "step": 140 + }, + { + "epoch": 0.1, + "learning_rate": 0.0002876288659793814, + "loss": 1.7755, + "step": 160 + }, + { + "epoch": 0.12, + "learning_rate": 0.00028350515463917525, + "loss": 1.7554, + "step": 180 + }, + { + "epoch": 0.13, + "learning_rate": 0.0002793814432989691, + "loss": 1.7534, + "step": 200 + }, + { + "epoch": 0.13, + "eval_loss": 1.7528996467590332, + "eval_runtime": 17.6175, + "eval_samples_per_second": 113.524, + "eval_steps_per_second": 1.816, + "step": 200 + }, + { + "epoch": 0.14, + "learning_rate": 0.00027525773195876286, + "loss": 1.7375, + "step": 220 + }, + { + "epoch": 0.15, + "learning_rate": 0.0002711340206185567, + "loss": 1.7331, + "step": 240 + }, + { + "epoch": 0.17, + "learning_rate": 0.00026701030927835047, + "loss": 1.7248, + "step": 260 + }, + { + "epoch": 0.18, + "learning_rate": 0.0002628865979381443, + "loss": 1.7282, + "step": 280 + }, + { + "epoch": 0.19, + "learning_rate": 0.00025876288659793813, + "loss": 1.7167, + "step": 300 + }, + { + "epoch": 0.21, + "learning_rate": 0.0002546391752577319, + "loss": 1.7215, + "step": 320 + }, + { + "epoch": 0.22, + "learning_rate": 0.00025051546391752574, + "loss": 1.7157, + "step": 340 + }, + { + "epoch": 0.23, + "learning_rate": 0.0002463917525773196, + "loss": 1.7146, + "step": 360 + }, + { + "epoch": 0.24, + "learning_rate": 0.00024226804123711338, + "loss": 1.6953, + "step": 380 + }, + { + "epoch": 0.26, + "learning_rate": 0.00023814432989690718, + "loss": 1.6951, + "step": 400 + }, + { + "epoch": 0.26, + "eval_loss": 1.7059786319732666, + "eval_runtime": 17.3801, + "eval_samples_per_second": 115.074, + "eval_steps_per_second": 1.841, + "step": 400 + }, + { + "epoch": 0.27, + "learning_rate": 0.00023402061855670102, + "loss": 1.6916, + "step": 420 + }, + { + "epoch": 0.28, + "learning_rate": 0.00022989690721649485, + "loss": 1.6893, + "step": 440 + }, + { + "epoch": 0.3, + "learning_rate": 0.00022577319587628863, + "loss": 1.6932, + "step": 460 + }, + { + "epoch": 0.31, + "learning_rate": 0.00022164948453608246, + "loss": 1.6913, + "step": 480 + }, + { + "epoch": 0.32, + "learning_rate": 0.00021752577319587626, + "loss": 1.6823, + "step": 500 + }, + { + "epoch": 0.33, + "learning_rate": 0.0002134020618556701, + "loss": 1.6921, + "step": 520 + }, + { + "epoch": 0.35, + "learning_rate": 0.0002092783505154639, + "loss": 1.6759, + "step": 540 + }, + { + "epoch": 0.36, + "learning_rate": 0.0002051546391752577, + "loss": 1.6758, + "step": 560 + }, + { + "epoch": 0.37, + "learning_rate": 0.0002010309278350515, + "loss": 1.6732, + "step": 580 + }, + { + "epoch": 0.39, + "learning_rate": 0.00019690721649484534, + "loss": 1.673, + "step": 600 + }, + { + "epoch": 0.39, + "eval_loss": 1.6831790208816528, + "eval_runtime": 17.4032, + "eval_samples_per_second": 114.921, + "eval_steps_per_second": 1.839, + "step": 600 + }, + { + "epoch": 0.4, + "learning_rate": 0.00019278350515463918, + "loss": 1.6625, + "step": 620 + }, + { + "epoch": 0.41, + "learning_rate": 0.00018865979381443298, + "loss": 1.666, + "step": 640 + }, + { + "epoch": 0.42, + "learning_rate": 0.00018453608247422679, + "loss": 1.6662, + "step": 660 + }, + { + "epoch": 0.44, + "learning_rate": 0.0001804123711340206, + "loss": 1.6692, + "step": 680 + }, + { + "epoch": 0.45, + "learning_rate": 0.00017628865979381442, + "loss": 1.6635, + "step": 700 + }, + { + "epoch": 0.46, + "learning_rate": 0.00017216494845360823, + "loss": 1.6685, + "step": 720 + }, + { + "epoch": 0.48, + "learning_rate": 0.00016804123711340206, + "loss": 1.6534, + "step": 740 + }, + { + "epoch": 0.49, + "learning_rate": 0.00016391752577319584, + "loss": 1.6627, + "step": 760 + }, + { + "epoch": 0.5, + "learning_rate": 0.00015979381443298967, + "loss": 1.6596, + "step": 780 + }, + { + "epoch": 0.51, + "learning_rate": 0.0001556701030927835, + "loss": 1.6659, + "step": 800 + }, + { + "epoch": 0.51, + "eval_loss": 1.6697583198547363, + "eval_runtime": 17.3944, + "eval_samples_per_second": 114.979, + "eval_steps_per_second": 1.84, + "step": 800 + }, + { + "epoch": 0.53, + "learning_rate": 0.0001515463917525773, + "loss": 1.6599, + "step": 820 + }, + { + "epoch": 0.54, + "learning_rate": 0.0001474226804123711, + "loss": 1.6587, + "step": 840 + }, + { + "epoch": 0.55, + "learning_rate": 0.00014329896907216494, + "loss": 1.6586, + "step": 860 + }, + { + "epoch": 0.57, + "learning_rate": 0.00013917525773195875, + "loss": 1.6602, + "step": 880 + }, + { + "epoch": 0.58, + "learning_rate": 0.00013505154639175258, + "loss": 1.6513, + "step": 900 + }, + { + "epoch": 0.59, + "learning_rate": 0.00013092783505154639, + "loss": 1.6477, + "step": 920 + }, + { + "epoch": 0.6, + "learning_rate": 0.0001268041237113402, + "loss": 1.6455, + "step": 940 + }, + { + "epoch": 0.62, + "learning_rate": 0.000122680412371134, + "loss": 1.6543, + "step": 960 + }, + { + "epoch": 0.63, + "learning_rate": 0.00011855670103092781, + "loss": 1.6499, + "step": 980 + }, + { + "epoch": 0.64, + "learning_rate": 0.00011443298969072163, + "loss": 1.647, + "step": 1000 + }, + { + "epoch": 0.64, + "eval_loss": 1.6601147651672363, + "eval_runtime": 17.3892, + "eval_samples_per_second": 115.014, + "eval_steps_per_second": 1.84, + "step": 1000 + } + ], + "max_steps": 1555, + "num_train_epochs": 1, + "total_flos": 8.598717456522936e+18, + "trial_name": null, + "trial_params": null +} diff --git a/adapters/saved_bloom_instinwild_cn-belle1.5m/checkpoint-1000/training_args.bin b/adapters/saved_bloom_instinwild_cn-belle1.5m/checkpoint-1000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..d16d7b78034c42abf83c055eed5422d528d80c3d --- /dev/null +++ b/adapters/saved_bloom_instinwild_cn-belle1.5m/checkpoint-1000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:70e976eac0670bb3a2fdafdedfc8005e8501f5017958892dc4d0b1661f81f2e3 +size 3643 diff --git a/adapters/saved_bloom_instinwild_cn-belle1.5m/checkpoint-1200/optimizer.pt b/adapters/saved_bloom_instinwild_cn-belle1.5m/checkpoint-1200/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..9eaaffde6ecc27ef82973741de803cd69fa77c24 --- /dev/null +++ b/adapters/saved_bloom_instinwild_cn-belle1.5m/checkpoint-1200/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7c7ef4e3734b50b347cd023829f2bb90303e24534444d5850c49afbc50ebad59 +size 31493061 diff --git a/adapters/saved_bloom_instinwild_cn-belle1.5m/checkpoint-1200/pytorch_model.bin b/adapters/saved_bloom_instinwild_cn-belle1.5m/checkpoint-1200/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..aa82203424164aaffc34ce8eef45220d7cb72e07 --- /dev/null +++ b/adapters/saved_bloom_instinwild_cn-belle1.5m/checkpoint-1200/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f6a238630e5e46a71e125d0ae1df82b44b4acb2a02de8ba25609043d0bc6dbbb +size 15751077 diff --git a/adapters/saved_bloom_instinwild_cn-belle1.5m/checkpoint-1200/rng_state_0.pth b/adapters/saved_bloom_instinwild_cn-belle1.5m/checkpoint-1200/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..04a944b268fe8b4c4ea91b3f07dbe15bad28db8c --- /dev/null +++ b/adapters/saved_bloom_instinwild_cn-belle1.5m/checkpoint-1200/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9a5bf2f1b14c29f7cbc30d55780a0cd3daf75c51243789863e902c84a246e5fc +size 14583 diff --git a/adapters/saved_bloom_instinwild_cn-belle1.5m/checkpoint-1200/rng_state_1.pth b/adapters/saved_bloom_instinwild_cn-belle1.5m/checkpoint-1200/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..c832549d6d0b8f885a75a2afc9f0c1a44c41851b --- /dev/null +++ b/adapters/saved_bloom_instinwild_cn-belle1.5m/checkpoint-1200/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:72e37281a2312b74bdc796e5217e98801b4fe5b2aae5f8b07ef4612263c2de67 +size 14583 diff --git a/adapters/saved_bloom_instinwild_cn-belle1.5m/checkpoint-1200/rng_state_2.pth b/adapters/saved_bloom_instinwild_cn-belle1.5m/checkpoint-1200/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..0eb88d982545483faddceb4e0d79ab19668e5070 --- /dev/null +++ b/adapters/saved_bloom_instinwild_cn-belle1.5m/checkpoint-1200/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4f44efe8840ca03e23a7391b17342bbb2636858968a899a900a56802993d22f2 +size 14583 diff --git a/adapters/saved_bloom_instinwild_cn-belle1.5m/checkpoint-1200/rng_state_3.pth b/adapters/saved_bloom_instinwild_cn-belle1.5m/checkpoint-1200/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..ac2eed370102020209092d39bc22a51257f9d02b --- /dev/null +++ b/adapters/saved_bloom_instinwild_cn-belle1.5m/checkpoint-1200/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4defade2d3499ebcee865a8b6ebf761779fa82759e33ebefb5a7e6a55c33564d +size 14583 diff --git a/adapters/saved_bloom_instinwild_cn-belle1.5m/checkpoint-1200/rng_state_4.pth b/adapters/saved_bloom_instinwild_cn-belle1.5m/checkpoint-1200/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..a01bd00599ed6dd16c921ddc1ebdb4fb10a101fc --- /dev/null +++ b/adapters/saved_bloom_instinwild_cn-belle1.5m/checkpoint-1200/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:21272b6c71afbb7e49d278383eb281b5f34db7c913f4d07b9bb26eccd3d669de +size 14583 diff --git a/adapters/saved_bloom_instinwild_cn-belle1.5m/checkpoint-1200/rng_state_5.pth b/adapters/saved_bloom_instinwild_cn-belle1.5m/checkpoint-1200/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..635e9aec00f0aefb1b2fc25082419e23a3707c0a --- /dev/null +++ b/adapters/saved_bloom_instinwild_cn-belle1.5m/checkpoint-1200/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:91b4349fcff57c4425d405cb8c719b481da5d8a89a33b911e1098413b175e12d +size 14583 diff --git a/adapters/saved_bloom_instinwild_cn-belle1.5m/checkpoint-1200/rng_state_6.pth b/adapters/saved_bloom_instinwild_cn-belle1.5m/checkpoint-1200/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..7fa32774666b4ef5d95038e044041a0b99ca7bae --- /dev/null +++ b/adapters/saved_bloom_instinwild_cn-belle1.5m/checkpoint-1200/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:84c870ecb8634dec7cab65f6f68096d412bd4d40ad8790ed125fc0021d8534a8 +size 14583 diff --git a/adapters/saved_bloom_instinwild_cn-belle1.5m/checkpoint-1200/rng_state_7.pth b/adapters/saved_bloom_instinwild_cn-belle1.5m/checkpoint-1200/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..48018348e8234ec331002e83c1d2456d997b38d8 --- /dev/null +++ b/adapters/saved_bloom_instinwild_cn-belle1.5m/checkpoint-1200/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:43e5f8346125bd2d75164e4cc1e10e8fb3b33d8c7d3a8bffeaebeb1427cbbe50 +size 14583 diff --git a/adapters/saved_bloom_instinwild_cn-belle1.5m/checkpoint-1200/scaler.pt b/adapters/saved_bloom_instinwild_cn-belle1.5m/checkpoint-1200/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..c73b6e7148d8ae7026711173634e0a11b1b94e2d --- /dev/null +++ b/adapters/saved_bloom_instinwild_cn-belle1.5m/checkpoint-1200/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:741cefeca9ef427f92406d2d10b81996655e2a9d50eb7aaa9614e6fdd1c9f529 +size 557 diff --git a/adapters/saved_bloom_instinwild_cn-belle1.5m/checkpoint-1200/scheduler.pt b/adapters/saved_bloom_instinwild_cn-belle1.5m/checkpoint-1200/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..c25aaba153f31126725e100a9c3a7f23e7523783 --- /dev/null +++ b/adapters/saved_bloom_instinwild_cn-belle1.5m/checkpoint-1200/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8635236f16da2ba7cd0ec71eb304506ec07d953705bf882b84cd4c5e7338f8d7 +size 627 diff --git a/adapters/saved_bloom_instinwild_cn-belle1.5m/checkpoint-1200/trainer_state.json b/adapters/saved_bloom_instinwild_cn-belle1.5m/checkpoint-1200/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..9b60f6a09e9cfee70fcef28c3f1aa40e52a3eb33 --- /dev/null +++ b/adapters/saved_bloom_instinwild_cn-belle1.5m/checkpoint-1200/trainer_state.json @@ -0,0 +1,424 @@ +{ + "best_metric": 1.6533111333847046, + "best_model_checkpoint": "/mnt/bn/qingyi-bn-lq/llama/saved_bloominstinwild-belle1.5m/checkpoint-1200", + "epoch": 0.7714561234329798, + "global_step": 1200, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.01, + "learning_rate": 5.9999999999999995e-05, + "loss": 2.3902, + "step": 20 + }, + { + "epoch": 0.03, + "learning_rate": 0.00011999999999999999, + "loss": 2.1808, + "step": 40 + }, + { + "epoch": 0.04, + "learning_rate": 0.00017999999999999998, + "loss": 1.9937, + "step": 60 + }, + { + "epoch": 0.05, + "learning_rate": 0.00023999999999999998, + "loss": 1.8967, + "step": 80 + }, + { + "epoch": 0.06, + "learning_rate": 0.0003, + "loss": 1.8529, + "step": 100 + }, + { + "epoch": 0.08, + "learning_rate": 0.0002958762886597938, + "loss": 1.8004, + "step": 120 + }, + { + "epoch": 0.09, + "learning_rate": 0.0002917525773195876, + "loss": 1.785, + "step": 140 + }, + { + "epoch": 0.1, + "learning_rate": 0.0002876288659793814, + "loss": 1.7755, + "step": 160 + }, + { + "epoch": 0.12, + "learning_rate": 0.00028350515463917525, + "loss": 1.7554, + "step": 180 + }, + { + "epoch": 0.13, + "learning_rate": 0.0002793814432989691, + "loss": 1.7534, + "step": 200 + }, + { + "epoch": 0.13, + "eval_loss": 1.7528996467590332, + "eval_runtime": 17.6175, + "eval_samples_per_second": 113.524, + "eval_steps_per_second": 1.816, + "step": 200 + }, + { + "epoch": 0.14, + "learning_rate": 0.00027525773195876286, + "loss": 1.7375, + "step": 220 + }, + { + "epoch": 0.15, + "learning_rate": 0.0002711340206185567, + "loss": 1.7331, + "step": 240 + }, + { + "epoch": 0.17, + "learning_rate": 0.00026701030927835047, + "loss": 1.7248, + "step": 260 + }, + { + "epoch": 0.18, + "learning_rate": 0.0002628865979381443, + "loss": 1.7282, + "step": 280 + }, + { + "epoch": 0.19, + "learning_rate": 0.00025876288659793813, + "loss": 1.7167, + "step": 300 + }, + { + "epoch": 0.21, + "learning_rate": 0.0002546391752577319, + "loss": 1.7215, + "step": 320 + }, + { + "epoch": 0.22, + "learning_rate": 0.00025051546391752574, + "loss": 1.7157, + "step": 340 + }, + { + "epoch": 0.23, + "learning_rate": 0.0002463917525773196, + "loss": 1.7146, + "step": 360 + }, + { + "epoch": 0.24, + "learning_rate": 0.00024226804123711338, + "loss": 1.6953, + "step": 380 + }, + { + "epoch": 0.26, + "learning_rate": 0.00023814432989690718, + "loss": 1.6951, + "step": 400 + }, + { + "epoch": 0.26, + "eval_loss": 1.7059786319732666, + "eval_runtime": 17.3801, + "eval_samples_per_second": 115.074, + "eval_steps_per_second": 1.841, + "step": 400 + }, + { + "epoch": 0.27, + "learning_rate": 0.00023402061855670102, + "loss": 1.6916, + "step": 420 + }, + { + "epoch": 0.28, + "learning_rate": 0.00022989690721649485, + "loss": 1.6893, + "step": 440 + }, + { + "epoch": 0.3, + "learning_rate": 0.00022577319587628863, + "loss": 1.6932, + "step": 460 + }, + { + "epoch": 0.31, + "learning_rate": 0.00022164948453608246, + "loss": 1.6913, + "step": 480 + }, + { + "epoch": 0.32, + "learning_rate": 0.00021752577319587626, + "loss": 1.6823, + "step": 500 + }, + { + "epoch": 0.33, + "learning_rate": 0.0002134020618556701, + "loss": 1.6921, + "step": 520 + }, + { + "epoch": 0.35, + "learning_rate": 0.0002092783505154639, + "loss": 1.6759, + "step": 540 + }, + { + "epoch": 0.36, + "learning_rate": 0.0002051546391752577, + "loss": 1.6758, + "step": 560 + }, + { + "epoch": 0.37, + "learning_rate": 0.0002010309278350515, + "loss": 1.6732, + "step": 580 + }, + { + "epoch": 0.39, + "learning_rate": 0.00019690721649484534, + "loss": 1.673, + "step": 600 + }, + { + "epoch": 0.39, + "eval_loss": 1.6831790208816528, + "eval_runtime": 17.4032, + "eval_samples_per_second": 114.921, + "eval_steps_per_second": 1.839, + "step": 600 + }, + { + "epoch": 0.4, + "learning_rate": 0.00019278350515463918, + "loss": 1.6625, + "step": 620 + }, + { + "epoch": 0.41, + "learning_rate": 0.00018865979381443298, + "loss": 1.666, + "step": 640 + }, + { + "epoch": 0.42, + "learning_rate": 0.00018453608247422679, + "loss": 1.6662, + "step": 660 + }, + { + "epoch": 0.44, + "learning_rate": 0.0001804123711340206, + "loss": 1.6692, + "step": 680 + }, + { + "epoch": 0.45, + "learning_rate": 0.00017628865979381442, + "loss": 1.6635, + "step": 700 + }, + { + "epoch": 0.46, + "learning_rate": 0.00017216494845360823, + "loss": 1.6685, + "step": 720 + }, + { + "epoch": 0.48, + "learning_rate": 0.00016804123711340206, + "loss": 1.6534, + "step": 740 + }, + { + "epoch": 0.49, + "learning_rate": 0.00016391752577319584, + "loss": 1.6627, + "step": 760 + }, + { + "epoch": 0.5, + "learning_rate": 0.00015979381443298967, + "loss": 1.6596, + "step": 780 + }, + { + "epoch": 0.51, + "learning_rate": 0.0001556701030927835, + "loss": 1.6659, + "step": 800 + }, + { + "epoch": 0.51, + "eval_loss": 1.6697583198547363, + "eval_runtime": 17.3944, + "eval_samples_per_second": 114.979, + "eval_steps_per_second": 1.84, + "step": 800 + }, + { + "epoch": 0.53, + "learning_rate": 0.0001515463917525773, + "loss": 1.6599, + "step": 820 + }, + { + "epoch": 0.54, + "learning_rate": 0.0001474226804123711, + "loss": 1.6587, + "step": 840 + }, + { + "epoch": 0.55, + "learning_rate": 0.00014329896907216494, + "loss": 1.6586, + "step": 860 + }, + { + "epoch": 0.57, + "learning_rate": 0.00013917525773195875, + "loss": 1.6602, + "step": 880 + }, + { + "epoch": 0.58, + "learning_rate": 0.00013505154639175258, + "loss": 1.6513, + "step": 900 + }, + { + "epoch": 0.59, + "learning_rate": 0.00013092783505154639, + "loss": 1.6477, + "step": 920 + }, + { + "epoch": 0.6, + "learning_rate": 0.0001268041237113402, + "loss": 1.6455, + "step": 940 + }, + { + "epoch": 0.62, + "learning_rate": 0.000122680412371134, + "loss": 1.6543, + "step": 960 + }, + { + "epoch": 0.63, + "learning_rate": 0.00011855670103092781, + "loss": 1.6499, + "step": 980 + }, + { + "epoch": 0.64, + "learning_rate": 0.00011443298969072163, + "loss": 1.647, + "step": 1000 + }, + { + "epoch": 0.64, + "eval_loss": 1.6601147651672363, + "eval_runtime": 17.3892, + "eval_samples_per_second": 115.014, + "eval_steps_per_second": 1.84, + "step": 1000 + }, + { + "epoch": 0.66, + "learning_rate": 0.00011030927835051547, + "loss": 1.649, + "step": 1020 + }, + { + "epoch": 0.67, + "learning_rate": 0.00010618556701030927, + "loss": 1.6562, + "step": 1040 + }, + { + "epoch": 0.68, + "learning_rate": 0.00010206185567010309, + "loss": 1.6488, + "step": 1060 + }, + { + "epoch": 0.69, + "learning_rate": 9.79381443298969e-05, + "loss": 1.6453, + "step": 1080 + }, + { + "epoch": 0.71, + "learning_rate": 9.381443298969071e-05, + "loss": 1.6348, + "step": 1100 + }, + { + "epoch": 0.72, + "learning_rate": 8.969072164948453e-05, + "loss": 1.6453, + "step": 1120 + }, + { + "epoch": 0.73, + "learning_rate": 8.556701030927834e-05, + "loss": 1.6362, + "step": 1140 + }, + { + "epoch": 0.75, + "learning_rate": 8.144329896907215e-05, + "loss": 1.6417, + "step": 1160 + }, + { + "epoch": 0.76, + "learning_rate": 7.731958762886596e-05, + "loss": 1.6382, + "step": 1180 + }, + { + "epoch": 0.77, + "learning_rate": 7.319587628865979e-05, + "loss": 1.6433, + "step": 1200 + }, + { + "epoch": 0.77, + "eval_loss": 1.6533111333847046, + "eval_runtime": 17.3863, + "eval_samples_per_second": 115.033, + "eval_steps_per_second": 1.841, + "step": 1200 + } + ], + "max_steps": 1555, + "num_train_epochs": 1, + "total_flos": 1.0319485572405527e+19, + "trial_name": null, + "trial_params": null +} diff --git a/adapters/saved_bloom_instinwild_cn-belle1.5m/checkpoint-1200/training_args.bin b/adapters/saved_bloom_instinwild_cn-belle1.5m/checkpoint-1200/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..d16d7b78034c42abf83c055eed5422d528d80c3d --- /dev/null +++ b/adapters/saved_bloom_instinwild_cn-belle1.5m/checkpoint-1200/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:70e976eac0670bb3a2fdafdedfc8005e8501f5017958892dc4d0b1661f81f2e3 +size 3643 diff --git a/adapters/saved_bloom_instinwild_cn-belle1.5m/checkpoint-1400/optimizer.pt b/adapters/saved_bloom_instinwild_cn-belle1.5m/checkpoint-1400/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..fdd973c2ba12657e5f77ee68eac82cf4c66e7969 --- /dev/null +++ b/adapters/saved_bloom_instinwild_cn-belle1.5m/checkpoint-1400/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:55b16a35942cfdcb12d784284b23f051d88ca6471181f8cf01767b785c98a19e +size 31493061 diff --git a/adapters/saved_bloom_instinwild_cn-belle1.5m/checkpoint-1400/pytorch_model.bin b/adapters/saved_bloom_instinwild_cn-belle1.5m/checkpoint-1400/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..15b9b3f837c240c2451dc54bfb365bc065eb8031 --- /dev/null +++ b/adapters/saved_bloom_instinwild_cn-belle1.5m/checkpoint-1400/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1e74a7caff1adb8db493171d57726e003279e9c9b975690c22ef5c9592e73960 +size 15751077 diff --git a/adapters/saved_bloom_instinwild_cn-belle1.5m/checkpoint-1400/rng_state_0.pth b/adapters/saved_bloom_instinwild_cn-belle1.5m/checkpoint-1400/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..4c06883961b755ed6c45a0b1100c75273eb91e48 --- /dev/null +++ b/adapters/saved_bloom_instinwild_cn-belle1.5m/checkpoint-1400/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a8cf4bdbc94d3e4f454c35476e14c6c91fb3bc76546ba39a245a6fc55f401d83 +size 14583 diff --git a/adapters/saved_bloom_instinwild_cn-belle1.5m/checkpoint-1400/rng_state_1.pth b/adapters/saved_bloom_instinwild_cn-belle1.5m/checkpoint-1400/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..bbbde68fb2d0ca412d0c528dbc3253d95e64c010 --- /dev/null +++ b/adapters/saved_bloom_instinwild_cn-belle1.5m/checkpoint-1400/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:27585411a31cd8bb949ffa68c2c4573e8e8150fcb0adb6b3305a766db86f83a6 +size 14583 diff --git a/adapters/saved_bloom_instinwild_cn-belle1.5m/checkpoint-1400/rng_state_2.pth b/adapters/saved_bloom_instinwild_cn-belle1.5m/checkpoint-1400/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..4ad7e379540049e6234d54a0a78f4dda5cadc14d --- /dev/null +++ b/adapters/saved_bloom_instinwild_cn-belle1.5m/checkpoint-1400/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fa7ad9fa88325c1a850cd89a1795eb58b584cd73f1e707cb976618aa306d40ed +size 14583 diff --git a/adapters/saved_bloom_instinwild_cn-belle1.5m/checkpoint-1400/rng_state_3.pth b/adapters/saved_bloom_instinwild_cn-belle1.5m/checkpoint-1400/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..8112931a8ab588e3c0200315c37348bd71d330f9 --- /dev/null +++ b/adapters/saved_bloom_instinwild_cn-belle1.5m/checkpoint-1400/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:64506315f92c401028afdeec4ef772b3bd7415e2709e61d40c13b68c2ac5e52e +size 14583 diff --git a/adapters/saved_bloom_instinwild_cn-belle1.5m/checkpoint-1400/rng_state_4.pth b/adapters/saved_bloom_instinwild_cn-belle1.5m/checkpoint-1400/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..7a2cf6f824920daab2f8087e84141ba36e70beb1 --- /dev/null +++ b/adapters/saved_bloom_instinwild_cn-belle1.5m/checkpoint-1400/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:45e242c13acc198015bdb36a022b038c60fa5e13a93d1fb59121edffcda46031 +size 14583 diff --git a/adapters/saved_bloom_instinwild_cn-belle1.5m/checkpoint-1400/rng_state_5.pth b/adapters/saved_bloom_instinwild_cn-belle1.5m/checkpoint-1400/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..f9e4430066aacaa352eb9e49739fb1c7b3557113 --- /dev/null +++ b/adapters/saved_bloom_instinwild_cn-belle1.5m/checkpoint-1400/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bdbde408517bf3bbf49bbc0931e17a1857e670355b90f5cfd7fc7e3886e2e50f +size 14583 diff --git a/adapters/saved_bloom_instinwild_cn-belle1.5m/checkpoint-1400/rng_state_6.pth b/adapters/saved_bloom_instinwild_cn-belle1.5m/checkpoint-1400/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..76f6003190f54691d3fa591754dcee80ac73c082 --- /dev/null +++ b/adapters/saved_bloom_instinwild_cn-belle1.5m/checkpoint-1400/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f852d41ee2383e66fd03fb11e08aea904377583b0ef58baf745cd6e03a47ccf6 +size 14583 diff --git a/adapters/saved_bloom_instinwild_cn-belle1.5m/checkpoint-1400/rng_state_7.pth b/adapters/saved_bloom_instinwild_cn-belle1.5m/checkpoint-1400/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..dbe05bc68154531595807fbb81c5d71575dac8a7 --- /dev/null +++ b/adapters/saved_bloom_instinwild_cn-belle1.5m/checkpoint-1400/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e2f80e1cc0a0d06e5dd4633aaa38072113dbe36255b0bb56b63b6bdd7c8626da +size 14583 diff --git a/adapters/saved_bloom_instinwild_cn-belle1.5m/checkpoint-1400/scaler.pt b/adapters/saved_bloom_instinwild_cn-belle1.5m/checkpoint-1400/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..a8cb12388904652e2007207f80582007b39a2051 --- /dev/null +++ b/adapters/saved_bloom_instinwild_cn-belle1.5m/checkpoint-1400/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:16fdfc03b58220402968eacaac23fb5471cdb9061302380bd3c8d4d326c02ade +size 557 diff --git a/adapters/saved_bloom_instinwild_cn-belle1.5m/checkpoint-1400/scheduler.pt b/adapters/saved_bloom_instinwild_cn-belle1.5m/checkpoint-1400/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..9a05523bd82b07a47cabd30402cc088d52391a62 --- /dev/null +++ b/adapters/saved_bloom_instinwild_cn-belle1.5m/checkpoint-1400/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f311554cca86f0324fa3c3461e4aa4917118bbf144c27ddece5156dff7bb5ee9 +size 627 diff --git a/adapters/saved_bloom_instinwild_cn-belle1.5m/checkpoint-1400/trainer_state.json b/adapters/saved_bloom_instinwild_cn-belle1.5m/checkpoint-1400/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..02887caf4b3993e4e271954838f1b633f3faa081 --- /dev/null +++ b/adapters/saved_bloom_instinwild_cn-belle1.5m/checkpoint-1400/trainer_state.json @@ -0,0 +1,492 @@ +{ + "best_metric": 1.6501820087432861, + "best_model_checkpoint": "/mnt/bn/qingyi-bn-lq/llama/saved_bloominstinwild-belle1.5m/checkpoint-1400", + "epoch": 0.9000321440051431, + "global_step": 1400, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.01, + "learning_rate": 5.9999999999999995e-05, + "loss": 2.3902, + "step": 20 + }, + { + "epoch": 0.03, + "learning_rate": 0.00011999999999999999, + "loss": 2.1808, + "step": 40 + }, + { + "epoch": 0.04, + "learning_rate": 0.00017999999999999998, + "loss": 1.9937, + "step": 60 + }, + { + "epoch": 0.05, + "learning_rate": 0.00023999999999999998, + "loss": 1.8967, + "step": 80 + }, + { + "epoch": 0.06, + "learning_rate": 0.0003, + "loss": 1.8529, + "step": 100 + }, + { + "epoch": 0.08, + "learning_rate": 0.0002958762886597938, + "loss": 1.8004, + "step": 120 + }, + { + "epoch": 0.09, + "learning_rate": 0.0002917525773195876, + "loss": 1.785, + "step": 140 + }, + { + "epoch": 0.1, + "learning_rate": 0.0002876288659793814, + "loss": 1.7755, + "step": 160 + }, + { + "epoch": 0.12, + "learning_rate": 0.00028350515463917525, + "loss": 1.7554, + "step": 180 + }, + { + "epoch": 0.13, + "learning_rate": 0.0002793814432989691, + "loss": 1.7534, + "step": 200 + }, + { + "epoch": 0.13, + "eval_loss": 1.7528996467590332, + "eval_runtime": 17.6175, + "eval_samples_per_second": 113.524, + "eval_steps_per_second": 1.816, + "step": 200 + }, + { + "epoch": 0.14, + "learning_rate": 0.00027525773195876286, + "loss": 1.7375, + "step": 220 + }, + { + "epoch": 0.15, + "learning_rate": 0.0002711340206185567, + "loss": 1.7331, + "step": 240 + }, + { + "epoch": 0.17, + "learning_rate": 0.00026701030927835047, + "loss": 1.7248, + "step": 260 + }, + { + "epoch": 0.18, + "learning_rate": 0.0002628865979381443, + "loss": 1.7282, + "step": 280 + }, + { + "epoch": 0.19, + "learning_rate": 0.00025876288659793813, + "loss": 1.7167, + "step": 300 + }, + { + "epoch": 0.21, + "learning_rate": 0.0002546391752577319, + "loss": 1.7215, + "step": 320 + }, + { + "epoch": 0.22, + "learning_rate": 0.00025051546391752574, + "loss": 1.7157, + "step": 340 + }, + { + "epoch": 0.23, + "learning_rate": 0.0002463917525773196, + "loss": 1.7146, + "step": 360 + }, + { + "epoch": 0.24, + "learning_rate": 0.00024226804123711338, + "loss": 1.6953, + "step": 380 + }, + { + "epoch": 0.26, + "learning_rate": 0.00023814432989690718, + "loss": 1.6951, + "step": 400 + }, + { + "epoch": 0.26, + "eval_loss": 1.7059786319732666, + "eval_runtime": 17.3801, + "eval_samples_per_second": 115.074, + "eval_steps_per_second": 1.841, + "step": 400 + }, + { + "epoch": 0.27, + "learning_rate": 0.00023402061855670102, + "loss": 1.6916, + "step": 420 + }, + { + "epoch": 0.28, + "learning_rate": 0.00022989690721649485, + "loss": 1.6893, + "step": 440 + }, + { + "epoch": 0.3, + "learning_rate": 0.00022577319587628863, + "loss": 1.6932, + "step": 460 + }, + { + "epoch": 0.31, + "learning_rate": 0.00022164948453608246, + "loss": 1.6913, + "step": 480 + }, + { + "epoch": 0.32, + "learning_rate": 0.00021752577319587626, + "loss": 1.6823, + "step": 500 + }, + { + "epoch": 0.33, + "learning_rate": 0.0002134020618556701, + "loss": 1.6921, + "step": 520 + }, + { + "epoch": 0.35, + "learning_rate": 0.0002092783505154639, + "loss": 1.6759, + "step": 540 + }, + { + "epoch": 0.36, + "learning_rate": 0.0002051546391752577, + "loss": 1.6758, + "step": 560 + }, + { + "epoch": 0.37, + "learning_rate": 0.0002010309278350515, + "loss": 1.6732, + "step": 580 + }, + { + "epoch": 0.39, + "learning_rate": 0.00019690721649484534, + "loss": 1.673, + "step": 600 + }, + { + "epoch": 0.39, + "eval_loss": 1.6831790208816528, + "eval_runtime": 17.4032, + "eval_samples_per_second": 114.921, + "eval_steps_per_second": 1.839, + "step": 600 + }, + { + "epoch": 0.4, + "learning_rate": 0.00019278350515463918, + "loss": 1.6625, + "step": 620 + }, + { + "epoch": 0.41, + "learning_rate": 0.00018865979381443298, + "loss": 1.666, + "step": 640 + }, + { + "epoch": 0.42, + "learning_rate": 0.00018453608247422679, + "loss": 1.6662, + "step": 660 + }, + { + "epoch": 0.44, + "learning_rate": 0.0001804123711340206, + "loss": 1.6692, + "step": 680 + }, + { + "epoch": 0.45, + "learning_rate": 0.00017628865979381442, + "loss": 1.6635, + "step": 700 + }, + { + "epoch": 0.46, + "learning_rate": 0.00017216494845360823, + "loss": 1.6685, + "step": 720 + }, + { + "epoch": 0.48, + "learning_rate": 0.00016804123711340206, + "loss": 1.6534, + "step": 740 + }, + { + "epoch": 0.49, + "learning_rate": 0.00016391752577319584, + "loss": 1.6627, + "step": 760 + }, + { + "epoch": 0.5, + "learning_rate": 0.00015979381443298967, + "loss": 1.6596, + "step": 780 + }, + { + "epoch": 0.51, + "learning_rate": 0.0001556701030927835, + "loss": 1.6659, + "step": 800 + }, + { + "epoch": 0.51, + "eval_loss": 1.6697583198547363, + "eval_runtime": 17.3944, + "eval_samples_per_second": 114.979, + "eval_steps_per_second": 1.84, + "step": 800 + }, + { + "epoch": 0.53, + "learning_rate": 0.0001515463917525773, + "loss": 1.6599, + "step": 820 + }, + { + "epoch": 0.54, + "learning_rate": 0.0001474226804123711, + "loss": 1.6587, + "step": 840 + }, + { + "epoch": 0.55, + "learning_rate": 0.00014329896907216494, + "loss": 1.6586, + "step": 860 + }, + { + "epoch": 0.57, + "learning_rate": 0.00013917525773195875, + "loss": 1.6602, + "step": 880 + }, + { + "epoch": 0.58, + "learning_rate": 0.00013505154639175258, + "loss": 1.6513, + "step": 900 + }, + { + "epoch": 0.59, + "learning_rate": 0.00013092783505154639, + "loss": 1.6477, + "step": 920 + }, + { + "epoch": 0.6, + "learning_rate": 0.0001268041237113402, + "loss": 1.6455, + "step": 940 + }, + { + "epoch": 0.62, + "learning_rate": 0.000122680412371134, + "loss": 1.6543, + "step": 960 + }, + { + "epoch": 0.63, + "learning_rate": 0.00011855670103092781, + "loss": 1.6499, + "step": 980 + }, + { + "epoch": 0.64, + "learning_rate": 0.00011443298969072163, + "loss": 1.647, + "step": 1000 + }, + { + "epoch": 0.64, + "eval_loss": 1.6601147651672363, + "eval_runtime": 17.3892, + "eval_samples_per_second": 115.014, + "eval_steps_per_second": 1.84, + "step": 1000 + }, + { + "epoch": 0.66, + "learning_rate": 0.00011030927835051547, + "loss": 1.649, + "step": 1020 + }, + { + "epoch": 0.67, + "learning_rate": 0.00010618556701030927, + "loss": 1.6562, + "step": 1040 + }, + { + "epoch": 0.68, + "learning_rate": 0.00010206185567010309, + "loss": 1.6488, + "step": 1060 + }, + { + "epoch": 0.69, + "learning_rate": 9.79381443298969e-05, + "loss": 1.6453, + "step": 1080 + }, + { + "epoch": 0.71, + "learning_rate": 9.381443298969071e-05, + "loss": 1.6348, + "step": 1100 + }, + { + "epoch": 0.72, + "learning_rate": 8.969072164948453e-05, + "loss": 1.6453, + "step": 1120 + }, + { + "epoch": 0.73, + "learning_rate": 8.556701030927834e-05, + "loss": 1.6362, + "step": 1140 + }, + { + "epoch": 0.75, + "learning_rate": 8.144329896907215e-05, + "loss": 1.6417, + "step": 1160 + }, + { + "epoch": 0.76, + "learning_rate": 7.731958762886596e-05, + "loss": 1.6382, + "step": 1180 + }, + { + "epoch": 0.77, + "learning_rate": 7.319587628865979e-05, + "loss": 1.6433, + "step": 1200 + }, + { + "epoch": 0.77, + "eval_loss": 1.6533111333847046, + "eval_runtime": 17.3863, + "eval_samples_per_second": 115.033, + "eval_steps_per_second": 1.841, + "step": 1200 + }, + { + "epoch": 0.78, + "learning_rate": 6.90721649484536e-05, + "loss": 1.6415, + "step": 1220 + }, + { + "epoch": 0.8, + "learning_rate": 6.494845360824742e-05, + "loss": 1.6459, + "step": 1240 + }, + { + "epoch": 0.81, + "learning_rate": 6.0824742268041234e-05, + "loss": 1.644, + "step": 1260 + }, + { + "epoch": 0.82, + "learning_rate": 5.6701030927835046e-05, + "loss": 1.6472, + "step": 1280 + }, + { + "epoch": 0.84, + "learning_rate": 5.257731958762886e-05, + "loss": 1.6368, + "step": 1300 + }, + { + "epoch": 0.85, + "learning_rate": 4.8453608247422676e-05, + "loss": 1.6325, + "step": 1320 + }, + { + "epoch": 0.86, + "learning_rate": 4.4329896907216494e-05, + "loss": 1.6354, + "step": 1340 + }, + { + "epoch": 0.87, + "learning_rate": 4.0206185567010306e-05, + "loss": 1.6381, + "step": 1360 + }, + { + "epoch": 0.89, + "learning_rate": 3.608247422680412e-05, + "loss": 1.6519, + "step": 1380 + }, + { + "epoch": 0.9, + "learning_rate": 3.1958762886597937e-05, + "loss": 1.6492, + "step": 1400 + }, + { + "epoch": 0.9, + "eval_loss": 1.6501820087432861, + "eval_runtime": 17.3757, + "eval_samples_per_second": 115.103, + "eval_steps_per_second": 1.842, + "step": 1400 + } + ], + "max_steps": 1555, + "num_train_epochs": 1, + "total_flos": 1.203777052717613e+19, + "trial_name": null, + "trial_params": null +} diff --git a/adapters/saved_bloom_instinwild_cn-belle1.5m/checkpoint-1400/training_args.bin b/adapters/saved_bloom_instinwild_cn-belle1.5m/checkpoint-1400/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..d16d7b78034c42abf83c055eed5422d528d80c3d --- /dev/null +++ b/adapters/saved_bloom_instinwild_cn-belle1.5m/checkpoint-1400/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:70e976eac0670bb3a2fdafdedfc8005e8501f5017958892dc4d0b1661f81f2e3 +size 3643 diff --git a/adapters/saved_bloom_instinwild_cn-belle1.5m/middle/adapter_config.json b/adapters/saved_bloom_instinwild_cn-belle1.5m/middle/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..896c4cf962709fe6c09e7461f3422660563f6ea4 --- /dev/null +++ b/adapters/saved_bloom_instinwild_cn-belle1.5m/middle/adapter_config.json @@ -0,0 +1,22 @@ +{ + "base_model_name_or_path": "bigscience/bloomz-7b1-mt", + "bias": "none", + "enable_lora": [ + true, + false, + true + ], + "fan_in_fan_out": true, + "inference_mode": true, + "init_lora_weights": true, + "lora_alpha": 16, + "lora_dropout": 0.05, + "merge_weights": false, + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "target_modules": [ + "query_key_value" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/adapters/saved_bloom_instinwild_cn-belle1.5m/middle/adapter_model.bin b/adapters/saved_bloom_instinwild_cn-belle1.5m/middle/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..aa82203424164aaffc34ce8eef45220d7cb72e07 --- /dev/null +++ b/adapters/saved_bloom_instinwild_cn-belle1.5m/middle/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f6a238630e5e46a71e125d0ae1df82b44b4acb2a02de8ba25609043d0bc6dbbb +size 15751077 diff --git a/adapters/saved_bloomfirefly/adapter_config.json b/adapters/saved_bloomfirefly/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..854d12e5e9502d2be965ee9c13133dbf4c923d67 --- /dev/null +++ b/adapters/saved_bloomfirefly/adapter_config.json @@ -0,0 +1,16 @@ +{ + "base_model_name_or_path": "bigscience/bloomz-7b1-mt", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "lora_alpha": 16, + "lora_dropout": 0.05, + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "target_modules": [ + "query_key_value" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/adapters/saved_bloomfirefly/adapter_model.bin b/adapters/saved_bloomfirefly/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..4325014ef41d714ef0f7d34711582d856d9bbde5 --- /dev/null +++ b/adapters/saved_bloomfirefly/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9811ba709c8fbf328aed339d629c33c4363f9c811d3ede0467d1126f2eb183d6 +size 15750885 diff --git a/adapters/saved_bloomfirefly/checkpoint-18800/optimizer.pt b/adapters/saved_bloomfirefly/checkpoint-18800/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..df7fa44a1dfa11c6bc775df69c12c57c976f2394 --- /dev/null +++ b/adapters/saved_bloomfirefly/checkpoint-18800/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:48793a05958d630a9d20b115c1f0ce85f118d7639c605a22ac581afbe05525c0 +size 31492741 diff --git a/adapters/saved_bloomfirefly/checkpoint-18800/pytorch_model.bin b/adapters/saved_bloomfirefly/checkpoint-18800/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..4275a5101facb2d936426464649cfca6443daa9e --- /dev/null +++ b/adapters/saved_bloomfirefly/checkpoint-18800/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:34b6caf2ccf55b3202b532bc40512ab132887dd64a54adb0b9363a0360cd8f58 +size 15750885 diff --git a/adapters/saved_bloomfirefly/checkpoint-18800/rng_state_0.pth b/adapters/saved_bloomfirefly/checkpoint-18800/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..cca42e7cc02b87423d587dfb891f33f734c31c1c --- /dev/null +++ b/adapters/saved_bloomfirefly/checkpoint-18800/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:afab8dc4098f4bff3b7b98cce4588b39dbacfee5b0404c9e7aff2b0f0e222eb6 +size 14583 diff --git a/adapters/saved_bloomfirefly/checkpoint-18800/rng_state_1.pth b/adapters/saved_bloomfirefly/checkpoint-18800/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..f799afd4c68726f5f1ddf69748a3bfb5abf2c2f9 --- /dev/null +++ b/adapters/saved_bloomfirefly/checkpoint-18800/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8def2ef25b1f281c9381fa7d9e55bbae968d644bc01f751bc733254ea14295cd +size 14583 diff --git a/adapters/saved_bloomfirefly/checkpoint-18800/scaler.pt b/adapters/saved_bloomfirefly/checkpoint-18800/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..bad7b18ea83553e043bacfb5abdb036d80e46576 --- /dev/null +++ b/adapters/saved_bloomfirefly/checkpoint-18800/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7741b7c9883d8d3b50d5337188affe12f52f92811bb463907dc67788317599e8 +size 557 diff --git a/adapters/saved_bloomfirefly/checkpoint-18800/scheduler.pt b/adapters/saved_bloomfirefly/checkpoint-18800/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..376568ad6a74d84d3fd87120763c9a5e208083b2 --- /dev/null +++ b/adapters/saved_bloomfirefly/checkpoint-18800/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a07d8d48fab79adc1dcca65956a82ff6393741eb03bbfe9e47912339c32b04cb +size 627 diff --git a/adapters/saved_bloomfirefly/checkpoint-18800/trainer_state.json b/adapters/saved_bloomfirefly/checkpoint-18800/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..bd28141596144ddca55b076a72bb04a634fcc51f --- /dev/null +++ b/adapters/saved_bloomfirefly/checkpoint-18800/trainer_state.json @@ -0,0 +1,6408 @@ +{ + "best_metric": 2.3294034004211426, + "best_model_checkpoint": "/mnt/bn/qingyi-bn-lq/llama/saved_bloomfirefly/checkpoint-18800", + "epoch": 2.9214519849459757, + "global_step": 18800, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 5.9999999999999995e-05, + "loss": 2.9733, + "step": 20 + }, + { + "epoch": 0.01, + "learning_rate": 0.00011999999999999999, + "loss": 2.7809, + "step": 40 + }, + { + "epoch": 0.01, + "learning_rate": 0.00017999999999999998, + "loss": 2.6052, + "step": 60 + }, + { + "epoch": 0.01, + "learning_rate": 0.00023999999999999998, + "loss": 2.4925, + "step": 80 + }, + { + "epoch": 0.02, + "learning_rate": 0.0003, + "loss": 2.458, + "step": 100 + }, + { + "epoch": 0.02, + "learning_rate": 0.00029968758135902107, + "loss": 2.4281, + "step": 120 + }, + { + "epoch": 0.02, + "learning_rate": 0.00029937516271804216, + "loss": 2.4178, + "step": 140 + }, + { + "epoch": 0.02, + "learning_rate": 0.00029906274407706326, + "loss": 2.3839, + "step": 160 + }, + { + "epoch": 0.03, + "learning_rate": 0.0002987503254360843, + "loss": 2.3521, + "step": 180 + }, + { + "epoch": 0.03, + "learning_rate": 0.00029843790679510545, + "loss": 2.338, + "step": 200 + }, + { + "epoch": 0.03, + "eval_loss": 2.510117292404175, + "eval_runtime": 69.1765, + "eval_samples_per_second": 28.912, + "eval_steps_per_second": 1.807, + "step": 200 + }, + { + "epoch": 0.03, + "learning_rate": 0.0002981254881541265, + "loss": 2.3401, + "step": 220 + }, + { + "epoch": 0.04, + "learning_rate": 0.0002978130695131476, + "loss": 2.3665, + "step": 240 + }, + { + "epoch": 0.04, + "learning_rate": 0.0002975006508721687, + "loss": 2.3691, + "step": 260 + }, + { + "epoch": 0.04, + "learning_rate": 0.0002971882322311898, + "loss": 2.3514, + "step": 280 + }, + { + "epoch": 0.05, + "learning_rate": 0.0002968758135902109, + "loss": 2.3203, + "step": 300 + }, + { + "epoch": 0.05, + "learning_rate": 0.00029656339494923197, + "loss": 2.3393, + "step": 320 + }, + { + "epoch": 0.05, + "learning_rate": 0.000296250976308253, + "loss": 2.3289, + "step": 340 + }, + { + "epoch": 0.06, + "learning_rate": 0.00029593855766727416, + "loss": 2.3407, + "step": 360 + }, + { + "epoch": 0.06, + "learning_rate": 0.0002956261390262952, + "loss": 2.3163, + "step": 380 + }, + { + "epoch": 0.06, + "learning_rate": 0.0002953137203853163, + "loss": 2.3212, + "step": 400 + }, + { + "epoch": 0.06, + "eval_loss": 2.473245620727539, + "eval_runtime": 69.0219, + "eval_samples_per_second": 28.976, + "eval_steps_per_second": 1.811, + "step": 400 + }, + { + "epoch": 0.07, + "learning_rate": 0.0002950013017443374, + "loss": 2.2927, + "step": 420 + }, + { + "epoch": 0.07, + "learning_rate": 0.0002946888831033585, + "loss": 2.2927, + "step": 440 + }, + { + "epoch": 0.07, + "learning_rate": 0.0002943764644623796, + "loss": 2.29, + "step": 460 + }, + { + "epoch": 0.07, + "learning_rate": 0.0002940640458214007, + "loss": 2.3099, + "step": 480 + }, + { + "epoch": 0.08, + "learning_rate": 0.0002937516271804217, + "loss": 2.3286, + "step": 500 + }, + { + "epoch": 0.08, + "learning_rate": 0.0002934392085394428, + "loss": 2.2928, + "step": 520 + }, + { + "epoch": 0.08, + "learning_rate": 0.0002931267898984639, + "loss": 2.2956, + "step": 540 + }, + { + "epoch": 0.09, + "learning_rate": 0.000292814371257485, + "loss": 2.2627, + "step": 560 + }, + { + "epoch": 0.09, + "learning_rate": 0.0002925019526165061, + "loss": 2.2897, + "step": 580 + }, + { + "epoch": 0.09, + "learning_rate": 0.0002921895339755272, + "loss": 2.2994, + "step": 600 + }, + { + "epoch": 0.09, + "eval_loss": 2.455402374267578, + "eval_runtime": 69.1315, + "eval_samples_per_second": 28.93, + "eval_steps_per_second": 1.808, + "step": 600 + }, + { + "epoch": 0.1, + "learning_rate": 0.00029187711533454824, + "loss": 2.3232, + "step": 620 + }, + { + "epoch": 0.1, + "learning_rate": 0.0002915646966935694, + "loss": 2.2515, + "step": 640 + }, + { + "epoch": 0.1, + "learning_rate": 0.00029125227805259043, + "loss": 2.2856, + "step": 660 + }, + { + "epoch": 0.11, + "learning_rate": 0.0002909398594116115, + "loss": 2.252, + "step": 680 + }, + { + "epoch": 0.11, + "learning_rate": 0.0002906274407706326, + "loss": 2.2891, + "step": 700 + }, + { + "epoch": 0.11, + "learning_rate": 0.0002903150221296537, + "loss": 2.2769, + "step": 720 + }, + { + "epoch": 0.11, + "learning_rate": 0.0002900026034886748, + "loss": 2.2763, + "step": 740 + }, + { + "epoch": 0.12, + "learning_rate": 0.0002896901848476959, + "loss": 2.278, + "step": 760 + }, + { + "epoch": 0.12, + "learning_rate": 0.00028937776620671695, + "loss": 2.3126, + "step": 780 + }, + { + "epoch": 0.12, + "learning_rate": 0.0002890653475657381, + "loss": 2.2698, + "step": 800 + }, + { + "epoch": 0.12, + "eval_loss": 2.4434444904327393, + "eval_runtime": 69.7211, + "eval_samples_per_second": 28.686, + "eval_steps_per_second": 1.793, + "step": 800 + }, + { + "epoch": 0.13, + "learning_rate": 0.00028875292892475914, + "loss": 2.2587, + "step": 820 + }, + { + "epoch": 0.13, + "learning_rate": 0.00028844051028378023, + "loss": 2.2954, + "step": 840 + }, + { + "epoch": 0.13, + "learning_rate": 0.00028812809164280133, + "loss": 2.3102, + "step": 860 + }, + { + "epoch": 0.14, + "learning_rate": 0.0002878156730018224, + "loss": 2.2918, + "step": 880 + }, + { + "epoch": 0.14, + "learning_rate": 0.0002875032543608435, + "loss": 2.2698, + "step": 900 + }, + { + "epoch": 0.14, + "learning_rate": 0.0002871908357198646, + "loss": 2.2514, + "step": 920 + }, + { + "epoch": 0.15, + "learning_rate": 0.00028687841707888566, + "loss": 2.2684, + "step": 940 + }, + { + "epoch": 0.15, + "learning_rate": 0.00028656599843790675, + "loss": 2.2833, + "step": 960 + }, + { + "epoch": 0.15, + "learning_rate": 0.00028625357979692785, + "loss": 2.2709, + "step": 980 + }, + { + "epoch": 0.16, + "learning_rate": 0.00028594116115594894, + "loss": 2.2596, + "step": 1000 + }, + { + "epoch": 0.16, + "eval_loss": 2.436037302017212, + "eval_runtime": 69.727, + "eval_samples_per_second": 28.683, + "eval_steps_per_second": 1.793, + "step": 1000 + }, + { + "epoch": 0.16, + "learning_rate": 0.00028562874251497004, + "loss": 2.2743, + "step": 1020 + }, + { + "epoch": 0.16, + "learning_rate": 0.00028531632387399113, + "loss": 2.23, + "step": 1040 + }, + { + "epoch": 0.16, + "learning_rate": 0.0002850039052330122, + "loss": 2.2723, + "step": 1060 + }, + { + "epoch": 0.17, + "learning_rate": 0.0002846914865920333, + "loss": 2.2585, + "step": 1080 + }, + { + "epoch": 0.17, + "learning_rate": 0.00028437906795105437, + "loss": 2.2463, + "step": 1100 + }, + { + "epoch": 0.17, + "learning_rate": 0.00028406664931007546, + "loss": 2.2264, + "step": 1120 + }, + { + "epoch": 0.18, + "learning_rate": 0.00028375423066909656, + "loss": 2.223, + "step": 1140 + }, + { + "epoch": 0.18, + "learning_rate": 0.00028344181202811765, + "loss": 2.2412, + "step": 1160 + }, + { + "epoch": 0.18, + "learning_rate": 0.00028312939338713875, + "loss": 2.2714, + "step": 1180 + }, + { + "epoch": 0.19, + "learning_rate": 0.00028281697474615984, + "loss": 2.2638, + "step": 1200 + }, + { + "epoch": 0.19, + "eval_loss": 2.4272871017456055, + "eval_runtime": 69.3748, + "eval_samples_per_second": 28.829, + "eval_steps_per_second": 1.802, + "step": 1200 + }, + { + "epoch": 0.19, + "learning_rate": 0.0002825045561051809, + "loss": 2.2303, + "step": 1220 + }, + { + "epoch": 0.19, + "learning_rate": 0.000282192137464202, + "loss": 2.2491, + "step": 1240 + }, + { + "epoch": 0.2, + "learning_rate": 0.00028187971882322313, + "loss": 2.2598, + "step": 1260 + }, + { + "epoch": 0.2, + "learning_rate": 0.00028156730018224417, + "loss": 2.2566, + "step": 1280 + }, + { + "epoch": 0.2, + "learning_rate": 0.00028125488154126527, + "loss": 2.2642, + "step": 1300 + }, + { + "epoch": 0.21, + "learning_rate": 0.00028094246290028636, + "loss": 2.2976, + "step": 1320 + }, + { + "epoch": 0.21, + "learning_rate": 0.00028063004425930746, + "loss": 2.2144, + "step": 1340 + }, + { + "epoch": 0.21, + "learning_rate": 0.00028031762561832855, + "loss": 2.2618, + "step": 1360 + }, + { + "epoch": 0.21, + "learning_rate": 0.00028000520697734965, + "loss": 2.2232, + "step": 1380 + }, + { + "epoch": 0.22, + "learning_rate": 0.0002796927883363707, + "loss": 2.2349, + "step": 1400 + }, + { + "epoch": 0.22, + "eval_loss": 2.422177314758301, + "eval_runtime": 69.7796, + "eval_samples_per_second": 28.662, + "eval_steps_per_second": 1.791, + "step": 1400 + }, + { + "epoch": 0.22, + "learning_rate": 0.00027938036969539184, + "loss": 2.2655, + "step": 1420 + }, + { + "epoch": 0.22, + "learning_rate": 0.0002790679510544129, + "loss": 2.265, + "step": 1440 + }, + { + "epoch": 0.23, + "learning_rate": 0.000278755532413434, + "loss": 2.2552, + "step": 1460 + }, + { + "epoch": 0.23, + "learning_rate": 0.00027844311377245507, + "loss": 2.252, + "step": 1480 + }, + { + "epoch": 0.23, + "learning_rate": 0.00027813069513147617, + "loss": 2.255, + "step": 1500 + }, + { + "epoch": 0.24, + "learning_rate": 0.00027781827649049726, + "loss": 2.1869, + "step": 1520 + }, + { + "epoch": 0.24, + "learning_rate": 0.00027750585784951836, + "loss": 2.2601, + "step": 1540 + }, + { + "epoch": 0.24, + "learning_rate": 0.0002771934392085394, + "loss": 2.2607, + "step": 1560 + }, + { + "epoch": 0.25, + "learning_rate": 0.0002768810205675605, + "loss": 2.2245, + "step": 1580 + }, + { + "epoch": 0.25, + "learning_rate": 0.0002765686019265816, + "loss": 2.2561, + "step": 1600 + }, + { + "epoch": 0.25, + "eval_loss": 2.4173202514648438, + "eval_runtime": 69.7813, + "eval_samples_per_second": 28.661, + "eval_steps_per_second": 1.791, + "step": 1600 + }, + { + "epoch": 0.25, + "learning_rate": 0.0002762561832856027, + "loss": 2.2472, + "step": 1620 + }, + { + "epoch": 0.25, + "learning_rate": 0.0002759437646446238, + "loss": 2.2952, + "step": 1640 + }, + { + "epoch": 0.26, + "learning_rate": 0.0002756313460036449, + "loss": 2.1941, + "step": 1660 + }, + { + "epoch": 0.26, + "learning_rate": 0.0002753189273626659, + "loss": 2.2396, + "step": 1680 + }, + { + "epoch": 0.26, + "learning_rate": 0.00027500650872168707, + "loss": 2.2325, + "step": 1700 + }, + { + "epoch": 0.27, + "learning_rate": 0.0002746940900807081, + "loss": 2.2458, + "step": 1720 + }, + { + "epoch": 0.27, + "learning_rate": 0.0002743816714397292, + "loss": 2.2464, + "step": 1740 + }, + { + "epoch": 0.27, + "learning_rate": 0.0002740692527987503, + "loss": 2.2487, + "step": 1760 + }, + { + "epoch": 0.28, + "learning_rate": 0.0002737568341577714, + "loss": 2.2609, + "step": 1780 + }, + { + "epoch": 0.28, + "learning_rate": 0.0002734444155167925, + "loss": 2.3016, + "step": 1800 + }, + { + "epoch": 0.28, + "eval_loss": 2.4146716594696045, + "eval_runtime": 69.513, + "eval_samples_per_second": 28.772, + "eval_steps_per_second": 1.798, + "step": 1800 + }, + { + "epoch": 0.28, + "learning_rate": 0.0002731319968758136, + "loss": 2.2415, + "step": 1820 + }, + { + "epoch": 0.29, + "learning_rate": 0.0002728195782348346, + "loss": 2.2512, + "step": 1840 + }, + { + "epoch": 0.29, + "learning_rate": 0.0002725071595938558, + "loss": 2.2186, + "step": 1860 + }, + { + "epoch": 0.29, + "learning_rate": 0.0002721947409528768, + "loss": 2.1982, + "step": 1880 + }, + { + "epoch": 0.3, + "learning_rate": 0.0002718823223118979, + "loss": 2.2358, + "step": 1900 + }, + { + "epoch": 0.3, + "learning_rate": 0.000271569903670919, + "loss": 2.2359, + "step": 1920 + }, + { + "epoch": 0.3, + "learning_rate": 0.0002712574850299401, + "loss": 2.2367, + "step": 1940 + }, + { + "epoch": 0.3, + "learning_rate": 0.0002709450663889612, + "loss": 2.2209, + "step": 1960 + }, + { + "epoch": 0.31, + "learning_rate": 0.0002706326477479823, + "loss": 2.2026, + "step": 1980 + }, + { + "epoch": 0.31, + "learning_rate": 0.00027032022910700333, + "loss": 2.2302, + "step": 2000 + }, + { + "epoch": 0.31, + "eval_loss": 2.4096806049346924, + "eval_runtime": 69.8744, + "eval_samples_per_second": 28.623, + "eval_steps_per_second": 1.789, + "step": 2000 + }, + { + "epoch": 0.31, + "learning_rate": 0.00027000781046602443, + "loss": 2.2516, + "step": 2020 + }, + { + "epoch": 0.32, + "learning_rate": 0.0002696953918250455, + "loss": 2.2173, + "step": 2040 + }, + { + "epoch": 0.32, + "learning_rate": 0.0002693829731840666, + "loss": 2.2414, + "step": 2060 + }, + { + "epoch": 0.32, + "learning_rate": 0.0002690705545430877, + "loss": 2.1922, + "step": 2080 + }, + { + "epoch": 0.33, + "learning_rate": 0.0002687581359021088, + "loss": 2.2396, + "step": 2100 + }, + { + "epoch": 0.33, + "learning_rate": 0.00026844571726112985, + "loss": 2.2602, + "step": 2120 + }, + { + "epoch": 0.33, + "learning_rate": 0.000268133298620151, + "loss": 2.2263, + "step": 2140 + }, + { + "epoch": 0.34, + "learning_rate": 0.00026782087997917204, + "loss": 2.2082, + "step": 2160 + }, + { + "epoch": 0.34, + "learning_rate": 0.00026750846133819314, + "loss": 2.2144, + "step": 2180 + }, + { + "epoch": 0.34, + "learning_rate": 0.00026719604269721423, + "loss": 2.2066, + "step": 2200 + }, + { + "epoch": 0.34, + "eval_loss": 2.4065375328063965, + "eval_runtime": 69.933, + "eval_samples_per_second": 28.599, + "eval_steps_per_second": 1.787, + "step": 2200 + }, + { + "epoch": 0.34, + "learning_rate": 0.00026688362405623533, + "loss": 2.2494, + "step": 2220 + }, + { + "epoch": 0.35, + "learning_rate": 0.0002665712054152564, + "loss": 2.2471, + "step": 2240 + }, + { + "epoch": 0.35, + "learning_rate": 0.0002662587867742775, + "loss": 2.2512, + "step": 2260 + }, + { + "epoch": 0.35, + "learning_rate": 0.00026594636813329856, + "loss": 2.2249, + "step": 2280 + }, + { + "epoch": 0.36, + "learning_rate": 0.0002656339494923197, + "loss": 2.2526, + "step": 2300 + }, + { + "epoch": 0.36, + "learning_rate": 0.00026532153085134075, + "loss": 2.2375, + "step": 2320 + }, + { + "epoch": 0.36, + "learning_rate": 0.00026500911221036185, + "loss": 2.169, + "step": 2340 + }, + { + "epoch": 0.37, + "learning_rate": 0.00026469669356938294, + "loss": 2.2206, + "step": 2360 + }, + { + "epoch": 0.37, + "learning_rate": 0.00026438427492840404, + "loss": 2.2284, + "step": 2380 + }, + { + "epoch": 0.37, + "learning_rate": 0.00026407185628742513, + "loss": 2.2116, + "step": 2400 + }, + { + "epoch": 0.37, + "eval_loss": 2.402400255203247, + "eval_runtime": 70.6508, + "eval_samples_per_second": 28.308, + "eval_steps_per_second": 1.769, + "step": 2400 + }, + { + "epoch": 0.38, + "learning_rate": 0.00026375943764644623, + "loss": 2.2228, + "step": 2420 + }, + { + "epoch": 0.38, + "learning_rate": 0.0002634470190054673, + "loss": 2.2264, + "step": 2440 + }, + { + "epoch": 0.38, + "learning_rate": 0.00026313460036448837, + "loss": 2.2212, + "step": 2460 + }, + { + "epoch": 0.39, + "learning_rate": 0.0002628221817235095, + "loss": 2.2164, + "step": 2480 + }, + { + "epoch": 0.39, + "learning_rate": 0.00026250976308253056, + "loss": 2.2523, + "step": 2500 + }, + { + "epoch": 0.39, + "learning_rate": 0.00026219734444155165, + "loss": 2.2272, + "step": 2520 + }, + { + "epoch": 0.39, + "learning_rate": 0.00026188492580057275, + "loss": 2.2381, + "step": 2540 + }, + { + "epoch": 0.4, + "learning_rate": 0.00026157250715959384, + "loss": 2.2149, + "step": 2560 + }, + { + "epoch": 0.4, + "learning_rate": 0.00026126008851861494, + "loss": 2.228, + "step": 2580 + }, + { + "epoch": 0.4, + "learning_rate": 0.00026094766987763603, + "loss": 2.2145, + "step": 2600 + }, + { + "epoch": 0.4, + "eval_loss": 2.399576425552368, + "eval_runtime": 69.9194, + "eval_samples_per_second": 28.604, + "eval_steps_per_second": 1.788, + "step": 2600 + }, + { + "epoch": 0.41, + "learning_rate": 0.0002606352512366571, + "loss": 2.18, + "step": 2620 + }, + { + "epoch": 0.41, + "learning_rate": 0.0002603228325956782, + "loss": 2.1965, + "step": 2640 + }, + { + "epoch": 0.41, + "learning_rate": 0.00026001041395469927, + "loss": 2.178, + "step": 2660 + }, + { + "epoch": 0.42, + "learning_rate": 0.00025969799531372036, + "loss": 2.194, + "step": 2680 + }, + { + "epoch": 0.42, + "learning_rate": 0.00025938557667274146, + "loss": 2.2024, + "step": 2700 + }, + { + "epoch": 0.42, + "learning_rate": 0.00025907315803176255, + "loss": 2.2427, + "step": 2720 + }, + { + "epoch": 0.43, + "learning_rate": 0.00025876073939078365, + "loss": 2.2246, + "step": 2740 + }, + { + "epoch": 0.43, + "learning_rate": 0.00025844832074980474, + "loss": 2.2169, + "step": 2760 + }, + { + "epoch": 0.43, + "learning_rate": 0.0002581359021088258, + "loss": 2.2154, + "step": 2780 + }, + { + "epoch": 0.44, + "learning_rate": 0.0002578234834678469, + "loss": 2.1732, + "step": 2800 + }, + { + "epoch": 0.44, + "eval_loss": 2.3982491493225098, + "eval_runtime": 70.2191, + "eval_samples_per_second": 28.482, + "eval_steps_per_second": 1.78, + "step": 2800 + }, + { + "epoch": 0.44, + "learning_rate": 0.000257511064826868, + "loss": 2.1951, + "step": 2820 + }, + { + "epoch": 0.44, + "learning_rate": 0.00025719864618588907, + "loss": 2.2139, + "step": 2840 + }, + { + "epoch": 0.44, + "learning_rate": 0.00025688622754491017, + "loss": 2.197, + "step": 2860 + }, + { + "epoch": 0.45, + "learning_rate": 0.00025657380890393126, + "loss": 2.2317, + "step": 2880 + }, + { + "epoch": 0.45, + "learning_rate": 0.0002562613902629523, + "loss": 2.2107, + "step": 2900 + }, + { + "epoch": 0.45, + "learning_rate": 0.00025594897162197345, + "loss": 2.2087, + "step": 2920 + }, + { + "epoch": 0.46, + "learning_rate": 0.0002556365529809945, + "loss": 2.2124, + "step": 2940 + }, + { + "epoch": 0.46, + "learning_rate": 0.0002553241343400156, + "loss": 2.1762, + "step": 2960 + }, + { + "epoch": 0.46, + "learning_rate": 0.0002550117156990367, + "loss": 2.2488, + "step": 2980 + }, + { + "epoch": 0.47, + "learning_rate": 0.0002546992970580578, + "loss": 2.2316, + "step": 3000 + }, + { + "epoch": 0.47, + "eval_loss": 2.394296646118164, + "eval_runtime": 70.2494, + "eval_samples_per_second": 28.47, + "eval_steps_per_second": 1.779, + "step": 3000 + }, + { + "epoch": 0.47, + "learning_rate": 0.0002543868784170789, + "loss": 2.2386, + "step": 3020 + }, + { + "epoch": 0.47, + "learning_rate": 0.00025407445977609997, + "loss": 2.224, + "step": 3040 + }, + { + "epoch": 0.48, + "learning_rate": 0.000253762041135121, + "loss": 2.2479, + "step": 3060 + }, + { + "epoch": 0.48, + "learning_rate": 0.0002534496224941421, + "loss": 2.2396, + "step": 3080 + }, + { + "epoch": 0.48, + "learning_rate": 0.0002531372038531632, + "loss": 2.2405, + "step": 3100 + }, + { + "epoch": 0.48, + "learning_rate": 0.0002528247852121843, + "loss": 2.1969, + "step": 3120 + }, + { + "epoch": 0.49, + "learning_rate": 0.0002525123665712054, + "loss": 2.2095, + "step": 3140 + }, + { + "epoch": 0.49, + "learning_rate": 0.0002521999479302265, + "loss": 2.2202, + "step": 3160 + }, + { + "epoch": 0.49, + "learning_rate": 0.0002518875292892476, + "loss": 2.2088, + "step": 3180 + }, + { + "epoch": 0.5, + "learning_rate": 0.0002515751106482687, + "loss": 2.2075, + "step": 3200 + }, + { + "epoch": 0.5, + "eval_loss": 2.3918581008911133, + "eval_runtime": 69.2896, + "eval_samples_per_second": 28.864, + "eval_steps_per_second": 1.804, + "step": 3200 + }, + { + "epoch": 0.5, + "learning_rate": 0.0002512626920072897, + "loss": 2.1993, + "step": 3220 + }, + { + "epoch": 0.5, + "learning_rate": 0.0002509502733663108, + "loss": 2.2406, + "step": 3240 + }, + { + "epoch": 0.51, + "learning_rate": 0.0002506378547253319, + "loss": 2.2352, + "step": 3260 + }, + { + "epoch": 0.51, + "learning_rate": 0.000250325436084353, + "loss": 2.236, + "step": 3280 + }, + { + "epoch": 0.51, + "learning_rate": 0.0002500130174433741, + "loss": 2.1805, + "step": 3300 + }, + { + "epoch": 0.52, + "learning_rate": 0.0002497005988023952, + "loss": 2.2249, + "step": 3320 + }, + { + "epoch": 0.52, + "learning_rate": 0.00024938818016141624, + "loss": 2.2153, + "step": 3340 + }, + { + "epoch": 0.52, + "learning_rate": 0.0002490757615204374, + "loss": 2.2115, + "step": 3360 + }, + { + "epoch": 0.53, + "learning_rate": 0.00024876334287945843, + "loss": 2.2284, + "step": 3380 + }, + { + "epoch": 0.53, + "learning_rate": 0.0002484509242384795, + "loss": 2.184, + "step": 3400 + }, + { + "epoch": 0.53, + "eval_loss": 2.3887791633605957, + "eval_runtime": 69.2387, + "eval_samples_per_second": 28.886, + "eval_steps_per_second": 1.805, + "step": 3400 + }, + { + "epoch": 0.53, + "learning_rate": 0.0002481385055975006, + "loss": 2.2172, + "step": 3420 + }, + { + "epoch": 0.53, + "learning_rate": 0.0002478260869565217, + "loss": 2.2347, + "step": 3440 + }, + { + "epoch": 0.54, + "learning_rate": 0.0002475136683155428, + "loss": 2.2213, + "step": 3460 + }, + { + "epoch": 0.54, + "learning_rate": 0.0002472012496745639, + "loss": 2.2215, + "step": 3480 + }, + { + "epoch": 0.54, + "learning_rate": 0.00024688883103358495, + "loss": 2.2058, + "step": 3500 + }, + { + "epoch": 0.55, + "learning_rate": 0.00024657641239260604, + "loss": 2.1918, + "step": 3520 + }, + { + "epoch": 0.55, + "learning_rate": 0.0002462639937516272, + "loss": 2.2021, + "step": 3540 + }, + { + "epoch": 0.55, + "learning_rate": 0.00024595157511064824, + "loss": 2.1832, + "step": 3560 + }, + { + "epoch": 0.56, + "learning_rate": 0.00024563915646966933, + "loss": 2.2199, + "step": 3580 + }, + { + "epoch": 0.56, + "learning_rate": 0.0002453267378286904, + "loss": 2.1997, + "step": 3600 + }, + { + "epoch": 0.56, + "eval_loss": 2.386540412902832, + "eval_runtime": 69.2123, + "eval_samples_per_second": 28.897, + "eval_steps_per_second": 1.806, + "step": 3600 + }, + { + "epoch": 0.56, + "learning_rate": 0.0002450143191877115, + "loss": 2.2009, + "step": 3620 + }, + { + "epoch": 0.57, + "learning_rate": 0.0002447019005467326, + "loss": 2.2045, + "step": 3640 + }, + { + "epoch": 0.57, + "learning_rate": 0.0002443894819057537, + "loss": 2.2231, + "step": 3660 + }, + { + "epoch": 0.57, + "learning_rate": 0.00024407706326477478, + "loss": 2.211, + "step": 3680 + }, + { + "epoch": 0.57, + "learning_rate": 0.00024376464462379588, + "loss": 2.1904, + "step": 3700 + }, + { + "epoch": 0.58, + "learning_rate": 0.00024345222598281694, + "loss": 2.1492, + "step": 3720 + }, + { + "epoch": 0.58, + "learning_rate": 0.00024313980734183807, + "loss": 2.2368, + "step": 3740 + }, + { + "epoch": 0.58, + "learning_rate": 0.00024282738870085914, + "loss": 2.1753, + "step": 3760 + }, + { + "epoch": 0.59, + "learning_rate": 0.00024251497005988023, + "loss": 2.179, + "step": 3780 + }, + { + "epoch": 0.59, + "learning_rate": 0.0002422025514189013, + "loss": 2.1811, + "step": 3800 + }, + { + "epoch": 0.59, + "eval_loss": 2.3864212036132812, + "eval_runtime": 69.2951, + "eval_samples_per_second": 28.862, + "eval_steps_per_second": 1.804, + "step": 3800 + }, + { + "epoch": 0.59, + "learning_rate": 0.0002418901327779224, + "loss": 2.1496, + "step": 3820 + }, + { + "epoch": 0.6, + "learning_rate": 0.0002415777141369435, + "loss": 2.2071, + "step": 3840 + }, + { + "epoch": 0.6, + "learning_rate": 0.00024126529549596459, + "loss": 2.189, + "step": 3860 + }, + { + "epoch": 0.6, + "learning_rate": 0.00024095287685498565, + "loss": 2.1838, + "step": 3880 + }, + { + "epoch": 0.61, + "learning_rate": 0.00024064045821400675, + "loss": 2.2292, + "step": 3900 + }, + { + "epoch": 0.61, + "learning_rate": 0.00024032803957302782, + "loss": 2.1931, + "step": 3920 + }, + { + "epoch": 0.61, + "learning_rate": 0.00024001562093204894, + "loss": 2.2293, + "step": 3940 + }, + { + "epoch": 0.62, + "learning_rate": 0.00023970320229107, + "loss": 2.2112, + "step": 3960 + }, + { + "epoch": 0.62, + "learning_rate": 0.0002393907836500911, + "loss": 2.1479, + "step": 3980 + }, + { + "epoch": 0.62, + "learning_rate": 0.00023907836500911217, + "loss": 2.1661, + "step": 4000 + }, + { + "epoch": 0.62, + "eval_loss": 2.383505344390869, + "eval_runtime": 69.2876, + "eval_samples_per_second": 28.865, + "eval_steps_per_second": 1.804, + "step": 4000 + }, + { + "epoch": 0.62, + "learning_rate": 0.0002387659463681333, + "loss": 2.1783, + "step": 4020 + }, + { + "epoch": 0.63, + "learning_rate": 0.00023845352772715436, + "loss": 2.1975, + "step": 4040 + }, + { + "epoch": 0.63, + "learning_rate": 0.00023814110908617546, + "loss": 2.2268, + "step": 4060 + }, + { + "epoch": 0.63, + "learning_rate": 0.00023782869044519653, + "loss": 2.1815, + "step": 4080 + }, + { + "epoch": 0.64, + "learning_rate": 0.00023751627180421765, + "loss": 2.2305, + "step": 4100 + }, + { + "epoch": 0.64, + "learning_rate": 0.00023720385316323872, + "loss": 2.2087, + "step": 4120 + }, + { + "epoch": 0.64, + "learning_rate": 0.0002368914345222598, + "loss": 2.2204, + "step": 4140 + }, + { + "epoch": 0.65, + "learning_rate": 0.00023657901588128088, + "loss": 2.2138, + "step": 4160 + }, + { + "epoch": 0.65, + "learning_rate": 0.000236266597240302, + "loss": 2.2071, + "step": 4180 + }, + { + "epoch": 0.65, + "learning_rate": 0.00023595417859932307, + "loss": 2.1728, + "step": 4200 + }, + { + "epoch": 0.65, + "eval_loss": 2.3820013999938965, + "eval_runtime": 69.3049, + "eval_samples_per_second": 28.858, + "eval_steps_per_second": 1.804, + "step": 4200 + }, + { + "epoch": 0.66, + "learning_rate": 0.00023564175995834417, + "loss": 2.182, + "step": 4220 + }, + { + "epoch": 0.66, + "learning_rate": 0.00023532934131736524, + "loss": 2.1948, + "step": 4240 + }, + { + "epoch": 0.66, + "learning_rate": 0.00023501692267638633, + "loss": 2.2178, + "step": 4260 + }, + { + "epoch": 0.67, + "learning_rate": 0.00023470450403540743, + "loss": 2.1979, + "step": 4280 + }, + { + "epoch": 0.67, + "learning_rate": 0.00023439208539442852, + "loss": 2.222, + "step": 4300 + }, + { + "epoch": 0.67, + "learning_rate": 0.0002340796667534496, + "loss": 2.221, + "step": 4320 + }, + { + "epoch": 0.67, + "learning_rate": 0.00023376724811247069, + "loss": 2.208, + "step": 4340 + }, + { + "epoch": 0.68, + "learning_rate": 0.00023345482947149175, + "loss": 2.1502, + "step": 4360 + }, + { + "epoch": 0.68, + "learning_rate": 0.00023314241083051288, + "loss": 2.1628, + "step": 4380 + }, + { + "epoch": 0.68, + "learning_rate": 0.00023282999218953395, + "loss": 2.1933, + "step": 4400 + }, + { + "epoch": 0.68, + "eval_loss": 2.380128860473633, + "eval_runtime": 69.2864, + "eval_samples_per_second": 28.866, + "eval_steps_per_second": 1.804, + "step": 4400 + }, + { + "epoch": 0.69, + "learning_rate": 0.00023251757354855504, + "loss": 2.2204, + "step": 4420 + }, + { + "epoch": 0.69, + "learning_rate": 0.0002322051549075761, + "loss": 2.218, + "step": 4440 + }, + { + "epoch": 0.69, + "learning_rate": 0.00023189273626659723, + "loss": 2.199, + "step": 4460 + }, + { + "epoch": 0.7, + "learning_rate": 0.0002315803176256183, + "loss": 2.1826, + "step": 4480 + }, + { + "epoch": 0.7, + "learning_rate": 0.0002312678989846394, + "loss": 2.174, + "step": 4500 + }, + { + "epoch": 0.7, + "learning_rate": 0.00023095548034366046, + "loss": 2.2011, + "step": 4520 + }, + { + "epoch": 0.71, + "learning_rate": 0.00023064306170268159, + "loss": 2.1951, + "step": 4540 + }, + { + "epoch": 0.71, + "learning_rate": 0.00023033064306170265, + "loss": 2.2189, + "step": 4560 + }, + { + "epoch": 0.71, + "learning_rate": 0.00023001822442072375, + "loss": 2.1891, + "step": 4580 + }, + { + "epoch": 0.71, + "learning_rate": 0.00022970580577974482, + "loss": 2.1873, + "step": 4600 + }, + { + "epoch": 0.71, + "eval_loss": 2.379713296890259, + "eval_runtime": 69.3005, + "eval_samples_per_second": 28.86, + "eval_steps_per_second": 1.804, + "step": 4600 + }, + { + "epoch": 0.72, + "learning_rate": 0.00022939338713876591, + "loss": 2.2191, + "step": 4620 + }, + { + "epoch": 0.72, + "learning_rate": 0.000229080968497787, + "loss": 2.1966, + "step": 4640 + }, + { + "epoch": 0.72, + "learning_rate": 0.0002287685498568081, + "loss": 2.2062, + "step": 4660 + }, + { + "epoch": 0.73, + "learning_rate": 0.00022845613121582917, + "loss": 2.1888, + "step": 4680 + }, + { + "epoch": 0.73, + "learning_rate": 0.00022814371257485027, + "loss": 2.1938, + "step": 4700 + }, + { + "epoch": 0.73, + "learning_rate": 0.0002278312939338714, + "loss": 2.206, + "step": 4720 + }, + { + "epoch": 0.74, + "learning_rate": 0.00022751887529289246, + "loss": 2.1584, + "step": 4740 + }, + { + "epoch": 0.74, + "learning_rate": 0.00022720645665191355, + "loss": 2.1933, + "step": 4760 + }, + { + "epoch": 0.74, + "learning_rate": 0.00022689403801093462, + "loss": 2.2087, + "step": 4780 + }, + { + "epoch": 0.75, + "learning_rate": 0.00022658161936995575, + "loss": 2.2239, + "step": 4800 + }, + { + "epoch": 0.75, + "eval_loss": 2.3774757385253906, + "eval_runtime": 69.3137, + "eval_samples_per_second": 28.854, + "eval_steps_per_second": 1.803, + "step": 4800 + }, + { + "epoch": 0.75, + "learning_rate": 0.00022626920072897681, + "loss": 2.2136, + "step": 4820 + }, + { + "epoch": 0.75, + "learning_rate": 0.0002259567820879979, + "loss": 2.2046, + "step": 4840 + }, + { + "epoch": 0.76, + "learning_rate": 0.00022564436344701898, + "loss": 2.2031, + "step": 4860 + }, + { + "epoch": 0.76, + "learning_rate": 0.0002253319448060401, + "loss": 2.171, + "step": 4880 + }, + { + "epoch": 0.76, + "learning_rate": 0.00022501952616506117, + "loss": 2.2101, + "step": 4900 + }, + { + "epoch": 0.76, + "learning_rate": 0.00022470710752408226, + "loss": 2.1306, + "step": 4920 + }, + { + "epoch": 0.77, + "learning_rate": 0.00022439468888310333, + "loss": 2.1754, + "step": 4940 + }, + { + "epoch": 0.77, + "learning_rate": 0.00022408227024212443, + "loss": 2.1972, + "step": 4960 + }, + { + "epoch": 0.77, + "learning_rate": 0.00022376985160114552, + "loss": 2.2175, + "step": 4980 + }, + { + "epoch": 0.78, + "learning_rate": 0.00022345743296016662, + "loss": 2.139, + "step": 5000 + }, + { + "epoch": 0.78, + "eval_loss": 2.3760337829589844, + "eval_runtime": 69.3092, + "eval_samples_per_second": 28.856, + "eval_steps_per_second": 1.804, + "step": 5000 + }, + { + "epoch": 0.78, + "learning_rate": 0.0002231450143191877, + "loss": 2.1912, + "step": 5020 + }, + { + "epoch": 0.78, + "learning_rate": 0.00022283259567820878, + "loss": 2.2036, + "step": 5040 + }, + { + "epoch": 0.79, + "learning_rate": 0.00022252017703722985, + "loss": 2.1852, + "step": 5060 + }, + { + "epoch": 0.79, + "learning_rate": 0.00022220775839625097, + "loss": 2.1672, + "step": 5080 + }, + { + "epoch": 0.79, + "learning_rate": 0.00022189533975527204, + "loss": 2.1828, + "step": 5100 + }, + { + "epoch": 0.8, + "learning_rate": 0.00022158292111429314, + "loss": 2.1875, + "step": 5120 + }, + { + "epoch": 0.8, + "learning_rate": 0.0002212705024733142, + "loss": 2.1997, + "step": 5140 + }, + { + "epoch": 0.8, + "learning_rate": 0.00022095808383233533, + "loss": 2.2162, + "step": 5160 + }, + { + "epoch": 0.8, + "learning_rate": 0.0002206456651913564, + "loss": 2.2213, + "step": 5180 + }, + { + "epoch": 0.81, + "learning_rate": 0.0002203332465503775, + "loss": 2.1972, + "step": 5200 + }, + { + "epoch": 0.81, + "eval_loss": 2.374734878540039, + "eval_runtime": 69.2582, + "eval_samples_per_second": 28.877, + "eval_steps_per_second": 1.805, + "step": 5200 + }, + { + "epoch": 0.81, + "learning_rate": 0.00022002082790939856, + "loss": 2.175, + "step": 5220 + }, + { + "epoch": 0.81, + "learning_rate": 0.00021970840926841968, + "loss": 2.1951, + "step": 5240 + }, + { + "epoch": 0.82, + "learning_rate": 0.00021939599062744075, + "loss": 2.1493, + "step": 5260 + }, + { + "epoch": 0.82, + "learning_rate": 0.00021908357198646185, + "loss": 2.1611, + "step": 5280 + }, + { + "epoch": 0.82, + "learning_rate": 0.00021877115334548291, + "loss": 2.1621, + "step": 5300 + }, + { + "epoch": 0.83, + "learning_rate": 0.00021845873470450404, + "loss": 2.1875, + "step": 5320 + }, + { + "epoch": 0.83, + "learning_rate": 0.0002181463160635251, + "loss": 2.1733, + "step": 5340 + }, + { + "epoch": 0.83, + "learning_rate": 0.0002178338974225462, + "loss": 2.242, + "step": 5360 + }, + { + "epoch": 0.84, + "learning_rate": 0.00021752147878156727, + "loss": 2.2154, + "step": 5380 + }, + { + "epoch": 0.84, + "learning_rate": 0.00021720906014058836, + "loss": 2.1969, + "step": 5400 + }, + { + "epoch": 0.84, + "eval_loss": 2.372680902481079, + "eval_runtime": 69.283, + "eval_samples_per_second": 28.867, + "eval_steps_per_second": 1.804, + "step": 5400 + }, + { + "epoch": 0.84, + "learning_rate": 0.00021689664149960946, + "loss": 2.1245, + "step": 5420 + }, + { + "epoch": 0.85, + "learning_rate": 0.00021658422285863056, + "loss": 2.2049, + "step": 5440 + }, + { + "epoch": 0.85, + "learning_rate": 0.00021627180421765162, + "loss": 2.1716, + "step": 5460 + }, + { + "epoch": 0.85, + "learning_rate": 0.00021595938557667272, + "loss": 2.1891, + "step": 5480 + }, + { + "epoch": 0.85, + "learning_rate": 0.0002156469669356938, + "loss": 2.1963, + "step": 5500 + }, + { + "epoch": 0.86, + "learning_rate": 0.0002153345482947149, + "loss": 2.1946, + "step": 5520 + }, + { + "epoch": 0.86, + "learning_rate": 0.00021502212965373598, + "loss": 2.1982, + "step": 5540 + }, + { + "epoch": 0.86, + "learning_rate": 0.00021470971101275707, + "loss": 2.1759, + "step": 5560 + }, + { + "epoch": 0.87, + "learning_rate": 0.00021439729237177814, + "loss": 2.1661, + "step": 5580 + }, + { + "epoch": 0.87, + "learning_rate": 0.00021408487373079926, + "loss": 2.2051, + "step": 5600 + }, + { + "epoch": 0.87, + "eval_loss": 2.3719565868377686, + "eval_runtime": 69.321, + "eval_samples_per_second": 28.851, + "eval_steps_per_second": 1.803, + "step": 5600 + }, + { + "epoch": 0.87, + "learning_rate": 0.00021377245508982033, + "loss": 2.1605, + "step": 5620 + }, + { + "epoch": 0.88, + "learning_rate": 0.00021346003644884143, + "loss": 2.1375, + "step": 5640 + }, + { + "epoch": 0.88, + "learning_rate": 0.0002131476178078625, + "loss": 2.1293, + "step": 5660 + }, + { + "epoch": 0.88, + "learning_rate": 0.00021283519916688362, + "loss": 2.2189, + "step": 5680 + }, + { + "epoch": 0.89, + "learning_rate": 0.0002125227805259047, + "loss": 2.1784, + "step": 5700 + }, + { + "epoch": 0.89, + "learning_rate": 0.00021221036188492578, + "loss": 2.1764, + "step": 5720 + }, + { + "epoch": 0.89, + "learning_rate": 0.00021189794324394685, + "loss": 2.1569, + "step": 5740 + }, + { + "epoch": 0.9, + "learning_rate": 0.00021158552460296795, + "loss": 2.1704, + "step": 5760 + }, + { + "epoch": 0.9, + "learning_rate": 0.00021127310596198904, + "loss": 2.1614, + "step": 5780 + }, + { + "epoch": 0.9, + "learning_rate": 0.00021096068732101014, + "loss": 2.2078, + "step": 5800 + }, + { + "epoch": 0.9, + "eval_loss": 2.370939016342163, + "eval_runtime": 69.2728, + "eval_samples_per_second": 28.871, + "eval_steps_per_second": 1.804, + "step": 5800 + }, + { + "epoch": 0.9, + "learning_rate": 0.0002106482686800312, + "loss": 2.198, + "step": 5820 + }, + { + "epoch": 0.91, + "learning_rate": 0.0002103358500390523, + "loss": 2.1735, + "step": 5840 + }, + { + "epoch": 0.91, + "learning_rate": 0.00021002343139807342, + "loss": 2.1936, + "step": 5860 + }, + { + "epoch": 0.91, + "learning_rate": 0.0002097110127570945, + "loss": 2.1559, + "step": 5880 + }, + { + "epoch": 0.92, + "learning_rate": 0.0002093985941161156, + "loss": 2.1856, + "step": 5900 + }, + { + "epoch": 0.92, + "learning_rate": 0.00020908617547513666, + "loss": 2.194, + "step": 5920 + }, + { + "epoch": 0.92, + "learning_rate": 0.00020877375683415778, + "loss": 2.1983, + "step": 5940 + }, + { + "epoch": 0.93, + "learning_rate": 0.00020846133819317885, + "loss": 2.1788, + "step": 5960 + }, + { + "epoch": 0.93, + "learning_rate": 0.00020814891955219994, + "loss": 2.2126, + "step": 5980 + }, + { + "epoch": 0.93, + "learning_rate": 0.000207836500911221, + "loss": 2.1454, + "step": 6000 + }, + { + "epoch": 0.93, + "eval_loss": 2.369137763977051, + "eval_runtime": 69.3036, + "eval_samples_per_second": 28.859, + "eval_steps_per_second": 1.804, + "step": 6000 + }, + { + "epoch": 0.94, + "learning_rate": 0.00020752408227024213, + "loss": 2.1603, + "step": 6020 + }, + { + "epoch": 0.94, + "learning_rate": 0.0002072116636292632, + "loss": 2.2075, + "step": 6040 + }, + { + "epoch": 0.94, + "learning_rate": 0.0002068992449882843, + "loss": 2.1817, + "step": 6060 + }, + { + "epoch": 0.94, + "learning_rate": 0.00020658682634730537, + "loss": 2.1917, + "step": 6080 + }, + { + "epoch": 0.95, + "learning_rate": 0.00020627440770632646, + "loss": 2.1727, + "step": 6100 + }, + { + "epoch": 0.95, + "learning_rate": 0.00020596198906534756, + "loss": 2.1985, + "step": 6120 + }, + { + "epoch": 0.95, + "learning_rate": 0.00020564957042436865, + "loss": 2.1888, + "step": 6140 + }, + { + "epoch": 0.96, + "learning_rate": 0.00020533715178338972, + "loss": 2.1425, + "step": 6160 + }, + { + "epoch": 0.96, + "learning_rate": 0.00020502473314241082, + "loss": 2.1659, + "step": 6180 + }, + { + "epoch": 0.96, + "learning_rate": 0.00020471231450143188, + "loss": 2.1768, + "step": 6200 + }, + { + "epoch": 0.96, + "eval_loss": 2.368589162826538, + "eval_runtime": 69.4033, + "eval_samples_per_second": 28.817, + "eval_steps_per_second": 1.801, + "step": 6200 + }, + { + "epoch": 0.97, + "learning_rate": 0.000204399895860453, + "loss": 2.1744, + "step": 6220 + }, + { + "epoch": 0.97, + "learning_rate": 0.00020408747721947407, + "loss": 2.1484, + "step": 6240 + }, + { + "epoch": 0.97, + "learning_rate": 0.00020377505857849517, + "loss": 2.2154, + "step": 6260 + }, + { + "epoch": 0.98, + "learning_rate": 0.00020346263993751624, + "loss": 2.1358, + "step": 6280 + }, + { + "epoch": 0.98, + "learning_rate": 0.00020315022129653736, + "loss": 2.1809, + "step": 6300 + }, + { + "epoch": 0.98, + "learning_rate": 0.00020283780265555843, + "loss": 2.1813, + "step": 6320 + }, + { + "epoch": 0.99, + "learning_rate": 0.00020252538401457952, + "loss": 2.1903, + "step": 6340 + }, + { + "epoch": 0.99, + "learning_rate": 0.0002022129653736006, + "loss": 2.1971, + "step": 6360 + }, + { + "epoch": 0.99, + "learning_rate": 0.00020190054673262172, + "loss": 2.2041, + "step": 6380 + }, + { + "epoch": 0.99, + "learning_rate": 0.00020158812809164278, + "loss": 2.2169, + "step": 6400 + }, + { + "epoch": 0.99, + "eval_loss": 2.3672330379486084, + "eval_runtime": 69.3516, + "eval_samples_per_second": 28.839, + "eval_steps_per_second": 1.802, + "step": 6400 + }, + { + "epoch": 1.0, + "learning_rate": 0.00020127570945066388, + "loss": 2.2101, + "step": 6420 + }, + { + "epoch": 1.0, + "learning_rate": 0.00020096329080968495, + "loss": 2.1739, + "step": 6440 + }, + { + "epoch": 1.0, + "learning_rate": 0.00020065087216870604, + "loss": 2.1764, + "step": 6460 + }, + { + "epoch": 1.01, + "learning_rate": 0.00020033845352772714, + "loss": 2.1718, + "step": 6480 + }, + { + "epoch": 1.01, + "learning_rate": 0.00020002603488674823, + "loss": 2.1688, + "step": 6500 + }, + { + "epoch": 1.01, + "learning_rate": 0.0001997136162457693, + "loss": 2.1322, + "step": 6520 + }, + { + "epoch": 1.02, + "learning_rate": 0.0001994011976047904, + "loss": 2.1593, + "step": 6540 + }, + { + "epoch": 1.02, + "learning_rate": 0.0001990887789638115, + "loss": 2.179, + "step": 6560 + }, + { + "epoch": 1.02, + "learning_rate": 0.0001987763603228326, + "loss": 2.139, + "step": 6580 + }, + { + "epoch": 1.03, + "learning_rate": 0.00019846394168185366, + "loss": 2.1594, + "step": 6600 + }, + { + "epoch": 1.03, + "eval_loss": 2.367051839828491, + "eval_runtime": 69.3473, + "eval_samples_per_second": 28.84, + "eval_steps_per_second": 1.803, + "step": 6600 + }, + { + "epoch": 1.03, + "learning_rate": 0.00019815152304087475, + "loss": 2.2033, + "step": 6620 + }, + { + "epoch": 1.03, + "learning_rate": 0.00019783910439989582, + "loss": 2.183, + "step": 6640 + }, + { + "epoch": 1.03, + "learning_rate": 0.00019752668575891694, + "loss": 2.1517, + "step": 6660 + }, + { + "epoch": 1.04, + "learning_rate": 0.000197214267117938, + "loss": 2.183, + "step": 6680 + }, + { + "epoch": 1.04, + "learning_rate": 0.0001969018484769591, + "loss": 2.197, + "step": 6700 + }, + { + "epoch": 1.04, + "learning_rate": 0.00019658942983598017, + "loss": 2.1778, + "step": 6720 + }, + { + "epoch": 1.05, + "learning_rate": 0.0001962770111950013, + "loss": 2.1745, + "step": 6740 + }, + { + "epoch": 1.05, + "learning_rate": 0.00019596459255402237, + "loss": 2.1585, + "step": 6760 + }, + { + "epoch": 1.05, + "learning_rate": 0.00019565217391304346, + "loss": 2.1708, + "step": 6780 + }, + { + "epoch": 1.06, + "learning_rate": 0.00019533975527206453, + "loss": 2.1649, + "step": 6800 + }, + { + "epoch": 1.06, + "eval_loss": 2.363710880279541, + "eval_runtime": 69.2642, + "eval_samples_per_second": 28.875, + "eval_steps_per_second": 1.805, + "step": 6800 + }, + { + "epoch": 1.06, + "learning_rate": 0.00019502733663108565, + "loss": 2.1391, + "step": 6820 + }, + { + "epoch": 1.06, + "learning_rate": 0.00019471491799010672, + "loss": 2.1939, + "step": 6840 + }, + { + "epoch": 1.07, + "learning_rate": 0.00019440249934912782, + "loss": 2.1558, + "step": 6860 + }, + { + "epoch": 1.07, + "learning_rate": 0.00019409008070814888, + "loss": 2.173, + "step": 6880 + }, + { + "epoch": 1.07, + "learning_rate": 0.00019377766206716998, + "loss": 2.1821, + "step": 6900 + }, + { + "epoch": 1.08, + "learning_rate": 0.00019346524342619107, + "loss": 2.16, + "step": 6920 + }, + { + "epoch": 1.08, + "learning_rate": 0.00019315282478521217, + "loss": 2.1808, + "step": 6940 + }, + { + "epoch": 1.08, + "learning_rate": 0.00019284040614423324, + "loss": 2.1355, + "step": 6960 + }, + { + "epoch": 1.08, + "learning_rate": 0.00019252798750325433, + "loss": 2.1813, + "step": 6980 + }, + { + "epoch": 1.09, + "learning_rate": 0.00019221556886227546, + "loss": 2.1677, + "step": 7000 + }, + { + "epoch": 1.09, + "eval_loss": 2.3648109436035156, + "eval_runtime": 69.3675, + "eval_samples_per_second": 28.832, + "eval_steps_per_second": 1.802, + "step": 7000 + }, + { + "epoch": 1.09, + "learning_rate": 0.00019190315022129652, + "loss": 2.1479, + "step": 7020 + }, + { + "epoch": 1.09, + "learning_rate": 0.00019159073158031762, + "loss": 2.1852, + "step": 7040 + }, + { + "epoch": 1.1, + "learning_rate": 0.0001912783129393387, + "loss": 2.14, + "step": 7060 + }, + { + "epoch": 1.1, + "learning_rate": 0.0001909658942983598, + "loss": 2.1332, + "step": 7080 + }, + { + "epoch": 1.1, + "learning_rate": 0.00019065347565738088, + "loss": 2.178, + "step": 7100 + }, + { + "epoch": 1.11, + "learning_rate": 0.00019034105701640197, + "loss": 2.1661, + "step": 7120 + }, + { + "epoch": 1.11, + "learning_rate": 0.00019002863837542304, + "loss": 2.1902, + "step": 7140 + }, + { + "epoch": 1.11, + "learning_rate": 0.00018971621973444417, + "loss": 2.1775, + "step": 7160 + }, + { + "epoch": 1.12, + "learning_rate": 0.00018940380109346523, + "loss": 2.2007, + "step": 7180 + }, + { + "epoch": 1.12, + "learning_rate": 0.00018909138245248633, + "loss": 2.2078, + "step": 7200 + }, + { + "epoch": 1.12, + "eval_loss": 2.3642289638519287, + "eval_runtime": 69.5476, + "eval_samples_per_second": 28.757, + "eval_steps_per_second": 1.797, + "step": 7200 + }, + { + "epoch": 1.12, + "learning_rate": 0.0001887789638115074, + "loss": 2.185, + "step": 7220 + }, + { + "epoch": 1.13, + "learning_rate": 0.0001884665451705285, + "loss": 2.1856, + "step": 7240 + }, + { + "epoch": 1.13, + "learning_rate": 0.0001881541265295496, + "loss": 2.2049, + "step": 7260 + }, + { + "epoch": 1.13, + "learning_rate": 0.00018784170788857068, + "loss": 2.1376, + "step": 7280 + }, + { + "epoch": 1.13, + "learning_rate": 0.00018752928924759175, + "loss": 2.1693, + "step": 7300 + }, + { + "epoch": 1.14, + "learning_rate": 0.00018721687060661285, + "loss": 2.1825, + "step": 7320 + }, + { + "epoch": 1.14, + "learning_rate": 0.00018690445196563392, + "loss": 2.1649, + "step": 7340 + }, + { + "epoch": 1.14, + "learning_rate": 0.00018659203332465504, + "loss": 2.1936, + "step": 7360 + }, + { + "epoch": 1.15, + "learning_rate": 0.0001862796146836761, + "loss": 2.143, + "step": 7380 + }, + { + "epoch": 1.15, + "learning_rate": 0.0001859671960426972, + "loss": 2.1617, + "step": 7400 + }, + { + "epoch": 1.15, + "eval_loss": 2.362150192260742, + "eval_runtime": 69.3218, + "eval_samples_per_second": 28.851, + "eval_steps_per_second": 1.803, + "step": 7400 + }, + { + "epoch": 1.15, + "learning_rate": 0.00018565477740171827, + "loss": 2.1555, + "step": 7420 + }, + { + "epoch": 1.16, + "learning_rate": 0.0001853423587607394, + "loss": 2.1639, + "step": 7440 + }, + { + "epoch": 1.16, + "learning_rate": 0.00018502994011976046, + "loss": 2.1678, + "step": 7460 + }, + { + "epoch": 1.16, + "learning_rate": 0.00018471752147878156, + "loss": 2.1775, + "step": 7480 + }, + { + "epoch": 1.17, + "learning_rate": 0.00018440510283780263, + "loss": 2.1784, + "step": 7500 + }, + { + "epoch": 1.17, + "learning_rate": 0.00018409268419682375, + "loss": 2.1499, + "step": 7520 + }, + { + "epoch": 1.17, + "learning_rate": 0.00018378026555584482, + "loss": 2.154, + "step": 7540 + }, + { + "epoch": 1.17, + "learning_rate": 0.0001834678469148659, + "loss": 2.1793, + "step": 7560 + }, + { + "epoch": 1.18, + "learning_rate": 0.00018315542827388698, + "loss": 2.2292, + "step": 7580 + }, + { + "epoch": 1.18, + "learning_rate": 0.00018284300963290808, + "loss": 2.1578, + "step": 7600 + }, + { + "epoch": 1.18, + "eval_loss": 2.3628857135772705, + "eval_runtime": 69.2564, + "eval_samples_per_second": 28.878, + "eval_steps_per_second": 1.805, + "step": 7600 + }, + { + "epoch": 1.18, + "learning_rate": 0.00018253059099192917, + "loss": 2.1494, + "step": 7620 + }, + { + "epoch": 1.19, + "learning_rate": 0.00018221817235095027, + "loss": 2.1669, + "step": 7640 + }, + { + "epoch": 1.19, + "learning_rate": 0.00018190575370997133, + "loss": 2.1447, + "step": 7660 + }, + { + "epoch": 1.19, + "learning_rate": 0.00018159333506899243, + "loss": 2.1663, + "step": 7680 + }, + { + "epoch": 1.2, + "learning_rate": 0.0001812809164280135, + "loss": 2.1871, + "step": 7700 + }, + { + "epoch": 1.2, + "learning_rate": 0.00018096849778703462, + "loss": 2.1338, + "step": 7720 + }, + { + "epoch": 1.2, + "learning_rate": 0.0001806560791460557, + "loss": 2.1767, + "step": 7740 + }, + { + "epoch": 1.21, + "learning_rate": 0.00018034366050507678, + "loss": 2.1694, + "step": 7760 + }, + { + "epoch": 1.21, + "learning_rate": 0.00018003124186409785, + "loss": 2.1674, + "step": 7780 + }, + { + "epoch": 1.21, + "learning_rate": 0.00017971882322311898, + "loss": 2.1863, + "step": 7800 + }, + { + "epoch": 1.21, + "eval_loss": 2.3613035678863525, + "eval_runtime": 69.2881, + "eval_samples_per_second": 28.865, + "eval_steps_per_second": 1.804, + "step": 7800 + }, + { + "epoch": 1.22, + "learning_rate": 0.00017940640458214004, + "loss": 2.1441, + "step": 7820 + }, + { + "epoch": 1.22, + "learning_rate": 0.00017909398594116114, + "loss": 2.1885, + "step": 7840 + }, + { + "epoch": 1.22, + "learning_rate": 0.0001787815673001822, + "loss": 2.1514, + "step": 7860 + }, + { + "epoch": 1.22, + "learning_rate": 0.00017846914865920333, + "loss": 2.2002, + "step": 7880 + }, + { + "epoch": 1.23, + "learning_rate": 0.0001781567300182244, + "loss": 2.1759, + "step": 7900 + }, + { + "epoch": 1.23, + "learning_rate": 0.0001778443113772455, + "loss": 2.1611, + "step": 7920 + }, + { + "epoch": 1.23, + "learning_rate": 0.00017753189273626656, + "loss": 2.1667, + "step": 7940 + }, + { + "epoch": 1.24, + "learning_rate": 0.00017721947409528768, + "loss": 2.1717, + "step": 7960 + }, + { + "epoch": 1.24, + "learning_rate": 0.00017690705545430875, + "loss": 2.1983, + "step": 7980 + }, + { + "epoch": 1.24, + "learning_rate": 0.00017659463681332985, + "loss": 2.2092, + "step": 8000 + }, + { + "epoch": 1.24, + "eval_loss": 2.3608274459838867, + "eval_runtime": 69.3364, + "eval_samples_per_second": 28.845, + "eval_steps_per_second": 1.803, + "step": 8000 + }, + { + "epoch": 1.25, + "learning_rate": 0.00017628221817235092, + "loss": 2.1305, + "step": 8020 + }, + { + "epoch": 1.25, + "learning_rate": 0.000175969799531372, + "loss": 2.1431, + "step": 8040 + }, + { + "epoch": 1.25, + "learning_rate": 0.0001756573808903931, + "loss": 2.1384, + "step": 8060 + }, + { + "epoch": 1.26, + "learning_rate": 0.0001753449622494142, + "loss": 2.2093, + "step": 8080 + }, + { + "epoch": 1.26, + "learning_rate": 0.00017503254360843527, + "loss": 2.1271, + "step": 8100 + }, + { + "epoch": 1.26, + "learning_rate": 0.00017472012496745637, + "loss": 2.1466, + "step": 8120 + }, + { + "epoch": 1.26, + "learning_rate": 0.0001744077063264775, + "loss": 2.1578, + "step": 8140 + }, + { + "epoch": 1.27, + "learning_rate": 0.00017409528768549856, + "loss": 2.1632, + "step": 8160 + }, + { + "epoch": 1.27, + "learning_rate": 0.00017378286904451965, + "loss": 2.1465, + "step": 8180 + }, + { + "epoch": 1.27, + "learning_rate": 0.00017347045040354072, + "loss": 2.2226, + "step": 8200 + }, + { + "epoch": 1.27, + "eval_loss": 2.35835599899292, + "eval_runtime": 69.2657, + "eval_samples_per_second": 28.874, + "eval_steps_per_second": 1.805, + "step": 8200 + }, + { + "epoch": 1.28, + "learning_rate": 0.00017315803176256184, + "loss": 2.1585, + "step": 8220 + }, + { + "epoch": 1.28, + "learning_rate": 0.0001728456131215829, + "loss": 2.1529, + "step": 8240 + }, + { + "epoch": 1.28, + "learning_rate": 0.000172533194480604, + "loss": 2.1663, + "step": 8260 + }, + { + "epoch": 1.29, + "learning_rate": 0.00017222077583962508, + "loss": 2.1422, + "step": 8280 + }, + { + "epoch": 1.29, + "learning_rate": 0.00017190835719864617, + "loss": 2.158, + "step": 8300 + }, + { + "epoch": 1.29, + "learning_rate": 0.00017159593855766727, + "loss": 2.1984, + "step": 8320 + }, + { + "epoch": 1.3, + "learning_rate": 0.00017128351991668836, + "loss": 2.1395, + "step": 8340 + }, + { + "epoch": 1.3, + "learning_rate": 0.00017097110127570943, + "loss": 2.14, + "step": 8360 + }, + { + "epoch": 1.3, + "learning_rate": 0.00017065868263473053, + "loss": 2.1657, + "step": 8380 + }, + { + "epoch": 1.31, + "learning_rate": 0.00017036188492580056, + "loss": 2.167, + "step": 8400 + }, + { + "epoch": 1.31, + "eval_loss": 2.35697603225708, + "eval_runtime": 69.2685, + "eval_samples_per_second": 28.873, + "eval_steps_per_second": 1.805, + "step": 8400 + }, + { + "epoch": 1.31, + "learning_rate": 0.00017004946628482165, + "loss": 2.1396, + "step": 8420 + }, + { + "epoch": 1.31, + "learning_rate": 0.00016973704764384272, + "loss": 2.1777, + "step": 8440 + }, + { + "epoch": 1.31, + "learning_rate": 0.00016942462900286384, + "loss": 2.1366, + "step": 8460 + }, + { + "epoch": 1.32, + "learning_rate": 0.0001691122103618849, + "loss": 2.1625, + "step": 8480 + }, + { + "epoch": 1.32, + "learning_rate": 0.000168799791720906, + "loss": 2.1859, + "step": 8500 + }, + { + "epoch": 1.32, + "learning_rate": 0.00016848737307992707, + "loss": 2.1705, + "step": 8520 + }, + { + "epoch": 1.33, + "learning_rate": 0.0001681749544389482, + "loss": 2.1971, + "step": 8540 + }, + { + "epoch": 1.33, + "learning_rate": 0.00016786253579796927, + "loss": 2.1937, + "step": 8560 + }, + { + "epoch": 1.33, + "learning_rate": 0.00016755011715699036, + "loss": 2.1436, + "step": 8580 + }, + { + "epoch": 1.34, + "learning_rate": 0.00016723769851601143, + "loss": 2.1592, + "step": 8600 + }, + { + "epoch": 1.34, + "eval_loss": 2.3576247692108154, + "eval_runtime": 69.277, + "eval_samples_per_second": 28.87, + "eval_steps_per_second": 1.804, + "step": 8600 + }, + { + "epoch": 1.34, + "learning_rate": 0.00016692527987503252, + "loss": 2.1745, + "step": 8620 + }, + { + "epoch": 1.34, + "learning_rate": 0.00016661286123405362, + "loss": 2.1517, + "step": 8640 + }, + { + "epoch": 1.35, + "learning_rate": 0.00016630044259307472, + "loss": 2.1921, + "step": 8660 + }, + { + "epoch": 1.35, + "learning_rate": 0.00016598802395209578, + "loss": 2.1703, + "step": 8680 + }, + { + "epoch": 1.35, + "learning_rate": 0.00016567560531111688, + "loss": 2.1223, + "step": 8700 + }, + { + "epoch": 1.36, + "learning_rate": 0.00016536318667013795, + "loss": 2.1748, + "step": 8720 + }, + { + "epoch": 1.36, + "learning_rate": 0.00016505076802915907, + "loss": 2.145, + "step": 8740 + }, + { + "epoch": 1.36, + "learning_rate": 0.00016473834938818014, + "loss": 2.1077, + "step": 8760 + }, + { + "epoch": 1.36, + "learning_rate": 0.00016442593074720123, + "loss": 2.1571, + "step": 8780 + }, + { + "epoch": 1.37, + "learning_rate": 0.0001641135121062223, + "loss": 2.1946, + "step": 8800 + }, + { + "epoch": 1.37, + "eval_loss": 2.3559648990631104, + "eval_runtime": 69.3886, + "eval_samples_per_second": 28.823, + "eval_steps_per_second": 1.801, + "step": 8800 + }, + { + "epoch": 1.37, + "learning_rate": 0.00016380109346524342, + "loss": 2.1635, + "step": 8820 + }, + { + "epoch": 1.37, + "learning_rate": 0.0001634886748242645, + "loss": 2.1546, + "step": 8840 + }, + { + "epoch": 1.38, + "learning_rate": 0.0001631762561832856, + "loss": 2.1359, + "step": 8860 + }, + { + "epoch": 1.38, + "learning_rate": 0.00016286383754230666, + "loss": 2.1741, + "step": 8880 + }, + { + "epoch": 1.38, + "learning_rate": 0.00016255141890132778, + "loss": 2.1382, + "step": 8900 + }, + { + "epoch": 1.39, + "learning_rate": 0.00016223900026034885, + "loss": 2.1514, + "step": 8920 + }, + { + "epoch": 1.39, + "learning_rate": 0.00016192658161936994, + "loss": 2.17, + "step": 8940 + }, + { + "epoch": 1.39, + "learning_rate": 0.000161614162978391, + "loss": 2.1784, + "step": 8960 + }, + { + "epoch": 1.4, + "learning_rate": 0.0001613017443374121, + "loss": 2.1869, + "step": 8980 + }, + { + "epoch": 1.4, + "learning_rate": 0.0001609893256964332, + "loss": 2.155, + "step": 9000 + }, + { + "epoch": 1.4, + "eval_loss": 2.3562612533569336, + "eval_runtime": 70.7208, + "eval_samples_per_second": 28.28, + "eval_steps_per_second": 1.768, + "step": 9000 + }, + { + "epoch": 1.4, + "learning_rate": 0.0001606769070554543, + "loss": 2.1467, + "step": 9020 + }, + { + "epoch": 1.4, + "learning_rate": 0.00016036448841447537, + "loss": 2.1662, + "step": 9040 + }, + { + "epoch": 1.41, + "learning_rate": 0.00016005206977349646, + "loss": 2.1928, + "step": 9060 + }, + { + "epoch": 1.41, + "learning_rate": 0.00015973965113251756, + "loss": 2.1084, + "step": 9080 + }, + { + "epoch": 1.41, + "learning_rate": 0.00015942723249153865, + "loss": 2.182, + "step": 9100 + }, + { + "epoch": 1.42, + "learning_rate": 0.00015911481385055975, + "loss": 2.1502, + "step": 9120 + }, + { + "epoch": 1.42, + "learning_rate": 0.00015880239520958082, + "loss": 2.1645, + "step": 9140 + }, + { + "epoch": 1.42, + "learning_rate": 0.00015848997656860194, + "loss": 2.1246, + "step": 9160 + }, + { + "epoch": 1.43, + "learning_rate": 0.000158177557927623, + "loss": 2.1769, + "step": 9180 + }, + { + "epoch": 1.43, + "learning_rate": 0.0001578651392866441, + "loss": 2.1772, + "step": 9200 + }, + { + "epoch": 1.43, + "eval_loss": 2.354128360748291, + "eval_runtime": 70.4883, + "eval_samples_per_second": 28.374, + "eval_steps_per_second": 1.773, + "step": 9200 + }, + { + "epoch": 1.43, + "learning_rate": 0.00015755272064566517, + "loss": 2.1777, + "step": 9220 + }, + { + "epoch": 1.44, + "learning_rate": 0.0001572403020046863, + "loss": 2.1749, + "step": 9240 + }, + { + "epoch": 1.44, + "learning_rate": 0.00015692788336370736, + "loss": 2.1861, + "step": 9260 + }, + { + "epoch": 1.44, + "learning_rate": 0.00015661546472272846, + "loss": 2.1567, + "step": 9280 + }, + { + "epoch": 1.45, + "learning_rate": 0.00015630304608174952, + "loss": 2.1426, + "step": 9300 + }, + { + "epoch": 1.45, + "learning_rate": 0.00015599062744077062, + "loss": 2.1658, + "step": 9320 + }, + { + "epoch": 1.45, + "learning_rate": 0.00015567820879979172, + "loss": 2.1639, + "step": 9340 + }, + { + "epoch": 1.45, + "learning_rate": 0.0001553657901588128, + "loss": 2.1897, + "step": 9360 + }, + { + "epoch": 1.46, + "learning_rate": 0.00015505337151783388, + "loss": 2.1439, + "step": 9380 + }, + { + "epoch": 1.46, + "learning_rate": 0.00015474095287685497, + "loss": 2.1326, + "step": 9400 + }, + { + "epoch": 1.46, + "eval_loss": 2.352673292160034, + "eval_runtime": 69.2871, + "eval_samples_per_second": 28.865, + "eval_steps_per_second": 1.804, + "step": 9400 + }, + { + "epoch": 1.46, + "learning_rate": 0.00015442853423587604, + "loss": 2.139, + "step": 9420 + }, + { + "epoch": 1.47, + "learning_rate": 0.00015411611559489717, + "loss": 2.1087, + "step": 9440 + }, + { + "epoch": 1.47, + "learning_rate": 0.00015380369695391823, + "loss": 2.1528, + "step": 9460 + }, + { + "epoch": 1.47, + "learning_rate": 0.00015349127831293933, + "loss": 2.1866, + "step": 9480 + }, + { + "epoch": 1.48, + "learning_rate": 0.0001531788596719604, + "loss": 2.1436, + "step": 9500 + }, + { + "epoch": 1.48, + "learning_rate": 0.00015286644103098152, + "loss": 2.1699, + "step": 9520 + }, + { + "epoch": 1.48, + "learning_rate": 0.0001525540223900026, + "loss": 2.1415, + "step": 9540 + }, + { + "epoch": 1.49, + "learning_rate": 0.00015224160374902368, + "loss": 2.1092, + "step": 9560 + }, + { + "epoch": 1.49, + "learning_rate": 0.00015192918510804475, + "loss": 2.1422, + "step": 9580 + }, + { + "epoch": 1.49, + "learning_rate": 0.00015161676646706587, + "loss": 2.1677, + "step": 9600 + }, + { + "epoch": 1.49, + "eval_loss": 2.3518292903900146, + "eval_runtime": 69.3029, + "eval_samples_per_second": 28.859, + "eval_steps_per_second": 1.804, + "step": 9600 + }, + { + "epoch": 1.49, + "learning_rate": 0.00015130434782608694, + "loss": 2.1594, + "step": 9620 + }, + { + "epoch": 1.5, + "learning_rate": 0.00015099192918510804, + "loss": 2.1539, + "step": 9640 + }, + { + "epoch": 1.5, + "learning_rate": 0.0001506795105441291, + "loss": 2.1343, + "step": 9660 + }, + { + "epoch": 1.5, + "learning_rate": 0.00015036709190315023, + "loss": 2.1386, + "step": 9680 + }, + { + "epoch": 1.51, + "learning_rate": 0.0001500546732621713, + "loss": 2.1512, + "step": 9700 + }, + { + "epoch": 1.51, + "learning_rate": 0.0001497422546211924, + "loss": 2.1669, + "step": 9720 + }, + { + "epoch": 1.51, + "learning_rate": 0.0001494298359802135, + "loss": 2.158, + "step": 9740 + }, + { + "epoch": 1.52, + "learning_rate": 0.00014911741733923456, + "loss": 2.1643, + "step": 9760 + }, + { + "epoch": 1.52, + "learning_rate": 0.00014880499869825565, + "loss": 2.1612, + "step": 9780 + }, + { + "epoch": 1.52, + "learning_rate": 0.00014849258005727675, + "loss": 2.1441, + "step": 9800 + }, + { + "epoch": 1.52, + "eval_loss": 2.35211181640625, + "eval_runtime": 69.2821, + "eval_samples_per_second": 28.867, + "eval_steps_per_second": 1.804, + "step": 9800 + }, + { + "epoch": 1.53, + "learning_rate": 0.00014818016141629784, + "loss": 2.1704, + "step": 9820 + }, + { + "epoch": 1.53, + "learning_rate": 0.0001478677427753189, + "loss": 2.1546, + "step": 9840 + }, + { + "epoch": 1.53, + "learning_rate": 0.00014755532413434, + "loss": 2.1909, + "step": 9860 + }, + { + "epoch": 1.54, + "learning_rate": 0.0001472429054933611, + "loss": 2.149, + "step": 9880 + }, + { + "epoch": 1.54, + "learning_rate": 0.00014693048685238217, + "loss": 2.1419, + "step": 9900 + }, + { + "epoch": 1.54, + "learning_rate": 0.00014661806821140327, + "loss": 2.1465, + "step": 9920 + }, + { + "epoch": 1.54, + "learning_rate": 0.00014630564957042436, + "loss": 2.1551, + "step": 9940 + }, + { + "epoch": 1.55, + "learning_rate": 0.00014599323092944546, + "loss": 2.1526, + "step": 9960 + }, + { + "epoch": 1.55, + "learning_rate": 0.00014568081228846653, + "loss": 2.1437, + "step": 9980 + }, + { + "epoch": 1.55, + "learning_rate": 0.00014536839364748762, + "loss": 2.1659, + "step": 10000 + }, + { + "epoch": 1.55, + "eval_loss": 2.3507654666900635, + "eval_runtime": 69.2997, + "eval_samples_per_second": 28.86, + "eval_steps_per_second": 1.804, + "step": 10000 + }, + { + "epoch": 1.56, + "learning_rate": 0.00014505597500650872, + "loss": 2.14, + "step": 10020 + }, + { + "epoch": 1.56, + "learning_rate": 0.0001447435563655298, + "loss": 2.1289, + "step": 10040 + }, + { + "epoch": 1.56, + "learning_rate": 0.00014443113772455088, + "loss": 2.1226, + "step": 10060 + }, + { + "epoch": 1.57, + "learning_rate": 0.00014411871908357198, + "loss": 2.1627, + "step": 10080 + }, + { + "epoch": 1.57, + "learning_rate": 0.00014380630044259307, + "loss": 2.1759, + "step": 10100 + }, + { + "epoch": 1.57, + "learning_rate": 0.00014349388180161414, + "loss": 2.1511, + "step": 10120 + }, + { + "epoch": 1.58, + "learning_rate": 0.00014318146316063523, + "loss": 2.1275, + "step": 10140 + }, + { + "epoch": 1.58, + "learning_rate": 0.00014286904451965633, + "loss": 2.1638, + "step": 10160 + }, + { + "epoch": 1.58, + "learning_rate": 0.00014255662587867743, + "loss": 2.1494, + "step": 10180 + }, + { + "epoch": 1.59, + "learning_rate": 0.0001422442072376985, + "loss": 2.1554, + "step": 10200 + }, + { + "epoch": 1.59, + "eval_loss": 2.349271059036255, + "eval_runtime": 69.2627, + "eval_samples_per_second": 28.876, + "eval_steps_per_second": 1.805, + "step": 10200 + }, + { + "epoch": 1.59, + "learning_rate": 0.0001419317885967196, + "loss": 2.133, + "step": 10220 + }, + { + "epoch": 1.59, + "learning_rate": 0.00014161936995574068, + "loss": 2.1515, + "step": 10240 + }, + { + "epoch": 1.59, + "learning_rate": 0.00014130695131476178, + "loss": 2.1262, + "step": 10260 + }, + { + "epoch": 1.6, + "learning_rate": 0.00014099453267378285, + "loss": 2.142, + "step": 10280 + }, + { + "epoch": 1.6, + "learning_rate": 0.00014068211403280394, + "loss": 2.1578, + "step": 10300 + }, + { + "epoch": 1.6, + "learning_rate": 0.00014036969539182504, + "loss": 2.1583, + "step": 10320 + }, + { + "epoch": 1.61, + "learning_rate": 0.0001400572767508461, + "loss": 2.1043, + "step": 10340 + }, + { + "epoch": 1.61, + "learning_rate": 0.0001397448581098672, + "loss": 2.1539, + "step": 10360 + }, + { + "epoch": 1.61, + "learning_rate": 0.0001394324394688883, + "loss": 2.1189, + "step": 10380 + }, + { + "epoch": 1.62, + "learning_rate": 0.0001391200208279094, + "loss": 2.1484, + "step": 10400 + }, + { + "epoch": 1.62, + "eval_loss": 2.3479487895965576, + "eval_runtime": 69.2625, + "eval_samples_per_second": 28.876, + "eval_steps_per_second": 1.805, + "step": 10400 + }, + { + "epoch": 1.62, + "learning_rate": 0.00013880760218693046, + "loss": 2.1993, + "step": 10420 + }, + { + "epoch": 1.62, + "learning_rate": 0.00013849518354595156, + "loss": 2.1869, + "step": 10440 + }, + { + "epoch": 1.63, + "learning_rate": 0.00013818276490497265, + "loss": 2.1644, + "step": 10460 + }, + { + "epoch": 1.63, + "learning_rate": 0.00013787034626399375, + "loss": 2.1751, + "step": 10480 + }, + { + "epoch": 1.63, + "learning_rate": 0.00013755792762301482, + "loss": 2.1416, + "step": 10500 + }, + { + "epoch": 1.63, + "learning_rate": 0.0001372455089820359, + "loss": 2.1809, + "step": 10520 + }, + { + "epoch": 1.64, + "learning_rate": 0.000136933090341057, + "loss": 2.1653, + "step": 10540 + }, + { + "epoch": 1.64, + "learning_rate": 0.00013662067170007808, + "loss": 2.1026, + "step": 10560 + }, + { + "epoch": 1.64, + "learning_rate": 0.00013630825305909917, + "loss": 2.1503, + "step": 10580 + }, + { + "epoch": 1.65, + "learning_rate": 0.00013599583441812027, + "loss": 2.1289, + "step": 10600 + }, + { + "epoch": 1.65, + "eval_loss": 2.3468515872955322, + "eval_runtime": 69.2274, + "eval_samples_per_second": 28.89, + "eval_steps_per_second": 1.806, + "step": 10600 + }, + { + "epoch": 1.65, + "learning_rate": 0.00013568341577714136, + "loss": 2.1929, + "step": 10620 + }, + { + "epoch": 1.65, + "learning_rate": 0.00013537099713616243, + "loss": 2.1547, + "step": 10640 + }, + { + "epoch": 1.66, + "learning_rate": 0.00013505857849518353, + "loss": 2.1571, + "step": 10660 + }, + { + "epoch": 1.66, + "learning_rate": 0.00013474615985420462, + "loss": 2.1649, + "step": 10680 + }, + { + "epoch": 1.66, + "learning_rate": 0.00013443374121322572, + "loss": 2.1647, + "step": 10700 + }, + { + "epoch": 1.67, + "learning_rate": 0.00013412132257224679, + "loss": 2.206, + "step": 10720 + }, + { + "epoch": 1.67, + "learning_rate": 0.00013380890393126788, + "loss": 2.1377, + "step": 10740 + }, + { + "epoch": 1.67, + "learning_rate": 0.00013349648529028898, + "loss": 2.1347, + "step": 10760 + }, + { + "epoch": 1.68, + "learning_rate": 0.00013318406664931004, + "loss": 2.1948, + "step": 10780 + }, + { + "epoch": 1.68, + "learning_rate": 0.00013287164800833114, + "loss": 2.1844, + "step": 10800 + }, + { + "epoch": 1.68, + "eval_loss": 2.347837209701538, + "eval_runtime": 69.2425, + "eval_samples_per_second": 28.884, + "eval_steps_per_second": 1.805, + "step": 10800 + }, + { + "epoch": 1.68, + "learning_rate": 0.00013255922936735224, + "loss": 2.1515, + "step": 10820 + }, + { + "epoch": 1.68, + "learning_rate": 0.00013224681072637333, + "loss": 2.1885, + "step": 10840 + }, + { + "epoch": 1.69, + "learning_rate": 0.00013193439208539443, + "loss": 2.143, + "step": 10860 + }, + { + "epoch": 1.69, + "learning_rate": 0.00013162197344441552, + "loss": 2.1671, + "step": 10880 + }, + { + "epoch": 1.69, + "learning_rate": 0.0001313095548034366, + "loss": 2.1426, + "step": 10900 + }, + { + "epoch": 1.7, + "learning_rate": 0.00013099713616245769, + "loss": 2.1653, + "step": 10920 + }, + { + "epoch": 1.7, + "learning_rate": 0.00013068471752147878, + "loss": 2.1774, + "step": 10940 + }, + { + "epoch": 1.7, + "learning_rate": 0.00013037229888049988, + "loss": 2.1344, + "step": 10960 + }, + { + "epoch": 1.71, + "learning_rate": 0.00013005988023952094, + "loss": 2.1217, + "step": 10980 + }, + { + "epoch": 1.71, + "learning_rate": 0.00012974746159854204, + "loss": 2.1281, + "step": 11000 + }, + { + "epoch": 1.71, + "eval_loss": 2.345808982849121, + "eval_runtime": 69.2499, + "eval_samples_per_second": 28.881, + "eval_steps_per_second": 1.805, + "step": 11000 + }, + { + "epoch": 1.71, + "learning_rate": 0.00012943504295756314, + "loss": 2.1459, + "step": 11020 + }, + { + "epoch": 1.72, + "learning_rate": 0.0001291226243165842, + "loss": 2.1294, + "step": 11040 + }, + { + "epoch": 1.72, + "learning_rate": 0.0001288102056756053, + "loss": 2.1455, + "step": 11060 + }, + { + "epoch": 1.72, + "learning_rate": 0.0001284977870346264, + "loss": 2.1219, + "step": 11080 + }, + { + "epoch": 1.72, + "learning_rate": 0.0001281853683936475, + "loss": 2.1696, + "step": 11100 + }, + { + "epoch": 1.73, + "learning_rate": 0.00012787294975266856, + "loss": 2.1474, + "step": 11120 + }, + { + "epoch": 1.73, + "learning_rate": 0.00012756053111168965, + "loss": 2.1436, + "step": 11140 + }, + { + "epoch": 1.73, + "learning_rate": 0.00012724811247071075, + "loss": 2.1785, + "step": 11160 + }, + { + "epoch": 1.74, + "learning_rate": 0.00012693569382973184, + "loss": 2.1677, + "step": 11180 + }, + { + "epoch": 1.74, + "learning_rate": 0.0001266232751887529, + "loss": 2.1564, + "step": 11200 + }, + { + "epoch": 1.74, + "eval_loss": 2.3451294898986816, + "eval_runtime": 69.2454, + "eval_samples_per_second": 28.883, + "eval_steps_per_second": 1.805, + "step": 11200 + }, + { + "epoch": 1.74, + "learning_rate": 0.000126310856547774, + "loss": 2.1793, + "step": 11220 + }, + { + "epoch": 1.75, + "learning_rate": 0.0001259984379067951, + "loss": 2.1583, + "step": 11240 + }, + { + "epoch": 1.75, + "learning_rate": 0.00012568601926581617, + "loss": 2.1482, + "step": 11260 + }, + { + "epoch": 1.75, + "learning_rate": 0.00012537360062483727, + "loss": 2.1393, + "step": 11280 + }, + { + "epoch": 1.76, + "learning_rate": 0.00012506118198385836, + "loss": 2.1586, + "step": 11300 + }, + { + "epoch": 1.76, + "learning_rate": 0.00012474876334287946, + "loss": 2.1533, + "step": 11320 + }, + { + "epoch": 1.76, + "learning_rate": 0.00012443634470190053, + "loss": 2.1516, + "step": 11340 + }, + { + "epoch": 1.77, + "learning_rate": 0.00012412392606092162, + "loss": 2.1184, + "step": 11360 + }, + { + "epoch": 1.77, + "learning_rate": 0.00012381150741994272, + "loss": 2.1162, + "step": 11380 + }, + { + "epoch": 1.77, + "learning_rate": 0.0001234990887789638, + "loss": 2.1588, + "step": 11400 + }, + { + "epoch": 1.77, + "eval_loss": 2.3451669216156006, + "eval_runtime": 69.2383, + "eval_samples_per_second": 28.886, + "eval_steps_per_second": 1.805, + "step": 11400 + }, + { + "epoch": 1.77, + "learning_rate": 0.00012318667013798488, + "loss": 2.1588, + "step": 11420 + }, + { + "epoch": 1.78, + "learning_rate": 0.00012287425149700598, + "loss": 2.1463, + "step": 11440 + }, + { + "epoch": 1.78, + "learning_rate": 0.00012256183285602707, + "loss": 2.1498, + "step": 11460 + }, + { + "epoch": 1.78, + "learning_rate": 0.00012224941421504814, + "loss": 2.1663, + "step": 11480 + }, + { + "epoch": 1.79, + "learning_rate": 0.00012193699557406924, + "loss": 2.1306, + "step": 11500 + }, + { + "epoch": 1.79, + "learning_rate": 0.00012162457693309033, + "loss": 2.1542, + "step": 11520 + }, + { + "epoch": 1.79, + "learning_rate": 0.00012131215829211141, + "loss": 2.1513, + "step": 11540 + }, + { + "epoch": 1.8, + "learning_rate": 0.00012099973965113251, + "loss": 2.2031, + "step": 11560 + }, + { + "epoch": 1.8, + "learning_rate": 0.00012068732101015359, + "loss": 2.1438, + "step": 11580 + }, + { + "epoch": 1.8, + "learning_rate": 0.00012037490236917469, + "loss": 2.1431, + "step": 11600 + }, + { + "epoch": 1.8, + "eval_loss": 2.3447554111480713, + "eval_runtime": 69.2865, + "eval_samples_per_second": 28.866, + "eval_steps_per_second": 1.804, + "step": 11600 + }, + { + "epoch": 1.81, + "learning_rate": 0.00012006248372819577, + "loss": 2.1272, + "step": 11620 + }, + { + "epoch": 1.81, + "learning_rate": 0.00011975006508721686, + "loss": 2.1584, + "step": 11640 + }, + { + "epoch": 1.81, + "learning_rate": 0.00011943764644623794, + "loss": 2.128, + "step": 11660 + }, + { + "epoch": 1.82, + "learning_rate": 0.00011912522780525903, + "loss": 2.1461, + "step": 11680 + }, + { + "epoch": 1.82, + "learning_rate": 0.00011881280916428012, + "loss": 2.1411, + "step": 11700 + }, + { + "epoch": 1.82, + "learning_rate": 0.0001185003905233012, + "loss": 2.1592, + "step": 11720 + }, + { + "epoch": 1.82, + "learning_rate": 0.0001181879718823223, + "loss": 2.1642, + "step": 11740 + }, + { + "epoch": 1.83, + "learning_rate": 0.00011787555324134338, + "loss": 2.1914, + "step": 11760 + }, + { + "epoch": 1.83, + "learning_rate": 0.00011756313460036448, + "loss": 2.1612, + "step": 11780 + }, + { + "epoch": 1.83, + "learning_rate": 0.00011725071595938556, + "loss": 2.1452, + "step": 11800 + }, + { + "epoch": 1.83, + "eval_loss": 2.3442630767822266, + "eval_runtime": 69.2459, + "eval_samples_per_second": 28.883, + "eval_steps_per_second": 1.805, + "step": 11800 + }, + { + "epoch": 1.84, + "learning_rate": 0.00011693829731840665, + "loss": 2.1453, + "step": 11820 + }, + { + "epoch": 1.84, + "learning_rate": 0.00011662587867742774, + "loss": 2.1251, + "step": 11840 + }, + { + "epoch": 1.84, + "learning_rate": 0.00011631346003644882, + "loss": 2.1412, + "step": 11860 + }, + { + "epoch": 1.85, + "learning_rate": 0.00011600104139546991, + "loss": 2.1033, + "step": 11880 + }, + { + "epoch": 1.85, + "learning_rate": 0.000115688622754491, + "loss": 2.1219, + "step": 11900 + }, + { + "epoch": 1.85, + "learning_rate": 0.00011537620411351209, + "loss": 2.1831, + "step": 11920 + }, + { + "epoch": 1.86, + "learning_rate": 0.00011506378547253317, + "loss": 2.1434, + "step": 11940 + }, + { + "epoch": 1.86, + "learning_rate": 0.00011475136683155427, + "loss": 2.1439, + "step": 11960 + }, + { + "epoch": 1.86, + "learning_rate": 0.00011443894819057536, + "loss": 2.1377, + "step": 11980 + }, + { + "epoch": 1.86, + "learning_rate": 0.00011412652954959646, + "loss": 2.1345, + "step": 12000 + }, + { + "epoch": 1.86, + "eval_loss": 2.342855453491211, + "eval_runtime": 69.2714, + "eval_samples_per_second": 28.872, + "eval_steps_per_second": 1.804, + "step": 12000 + }, + { + "epoch": 1.87, + "learning_rate": 0.00011381411090861754, + "loss": 2.1527, + "step": 12020 + }, + { + "epoch": 1.87, + "learning_rate": 0.00011350169226763864, + "loss": 2.1737, + "step": 12040 + }, + { + "epoch": 1.87, + "learning_rate": 0.00011318927362665972, + "loss": 2.137, + "step": 12060 + }, + { + "epoch": 1.88, + "learning_rate": 0.00011287685498568081, + "loss": 2.1616, + "step": 12080 + }, + { + "epoch": 1.88, + "learning_rate": 0.0001125644363447019, + "loss": 2.1688, + "step": 12100 + }, + { + "epoch": 1.88, + "learning_rate": 0.00011225201770372299, + "loss": 2.1746, + "step": 12120 + }, + { + "epoch": 1.89, + "learning_rate": 0.00011193959906274407, + "loss": 2.1552, + "step": 12140 + }, + { + "epoch": 1.89, + "learning_rate": 0.00011162718042176515, + "loss": 2.1643, + "step": 12160 + }, + { + "epoch": 1.89, + "learning_rate": 0.00011131476178078625, + "loss": 2.1494, + "step": 12180 + }, + { + "epoch": 1.9, + "learning_rate": 0.00011100234313980733, + "loss": 2.1112, + "step": 12200 + }, + { + "epoch": 1.9, + "eval_loss": 2.34304141998291, + "eval_runtime": 72.1422, + "eval_samples_per_second": 27.723, + "eval_steps_per_second": 1.733, + "step": 12200 + }, + { + "epoch": 1.9, + "learning_rate": 0.00011068992449882843, + "loss": 2.1505, + "step": 12220 + }, + { + "epoch": 1.9, + "learning_rate": 0.00011037750585784951, + "loss": 2.1722, + "step": 12240 + }, + { + "epoch": 1.91, + "learning_rate": 0.0001100650872168706, + "loss": 2.1582, + "step": 12260 + }, + { + "epoch": 1.91, + "learning_rate": 0.00010975266857589169, + "loss": 2.1806, + "step": 12280 + }, + { + "epoch": 1.91, + "learning_rate": 0.00010944024993491278, + "loss": 2.1508, + "step": 12300 + }, + { + "epoch": 1.91, + "learning_rate": 0.00010912783129393386, + "loss": 2.1654, + "step": 12320 + }, + { + "epoch": 1.92, + "learning_rate": 0.00010881541265295496, + "loss": 2.131, + "step": 12340 + }, + { + "epoch": 1.92, + "learning_rate": 0.00010850299401197604, + "loss": 2.1301, + "step": 12360 + }, + { + "epoch": 1.92, + "learning_rate": 0.00010819057537099712, + "loss": 2.1312, + "step": 12380 + }, + { + "epoch": 1.93, + "learning_rate": 0.00010787815673001822, + "loss": 2.1301, + "step": 12400 + }, + { + "epoch": 1.93, + "eval_loss": 2.3404922485351562, + "eval_runtime": 71.3367, + "eval_samples_per_second": 28.036, + "eval_steps_per_second": 1.752, + "step": 12400 + }, + { + "epoch": 1.93, + "learning_rate": 0.00010758135902108825, + "loss": 2.1398, + "step": 12420 + }, + { + "epoch": 1.93, + "learning_rate": 0.00010726894038010933, + "loss": 2.1449, + "step": 12440 + }, + { + "epoch": 1.94, + "learning_rate": 0.00010695652173913043, + "loss": 2.1498, + "step": 12460 + }, + { + "epoch": 1.94, + "learning_rate": 0.00010664410309815151, + "loss": 2.1484, + "step": 12480 + }, + { + "epoch": 1.94, + "learning_rate": 0.0001063316844571726, + "loss": 2.1705, + "step": 12500 + }, + { + "epoch": 1.95, + "learning_rate": 0.00010601926581619368, + "loss": 2.1236, + "step": 12520 + }, + { + "epoch": 1.95, + "learning_rate": 0.00010570684717521478, + "loss": 2.1435, + "step": 12540 + }, + { + "epoch": 1.95, + "learning_rate": 0.00010539442853423586, + "loss": 2.1656, + "step": 12560 + }, + { + "epoch": 1.95, + "learning_rate": 0.00010508200989325696, + "loss": 2.1459, + "step": 12580 + }, + { + "epoch": 1.96, + "learning_rate": 0.00010476959125227804, + "loss": 2.1392, + "step": 12600 + }, + { + "epoch": 1.96, + "eval_loss": 2.3410892486572266, + "eval_runtime": 72.1407, + "eval_samples_per_second": 27.724, + "eval_steps_per_second": 1.733, + "step": 12600 + }, + { + "epoch": 1.96, + "learning_rate": 0.00010445717261129913, + "loss": 2.1399, + "step": 12620 + }, + { + "epoch": 1.96, + "learning_rate": 0.00010414475397032022, + "loss": 2.1979, + "step": 12640 + }, + { + "epoch": 1.97, + "learning_rate": 0.0001038323353293413, + "loss": 2.1596, + "step": 12660 + }, + { + "epoch": 1.97, + "learning_rate": 0.0001035199166883624, + "loss": 2.1817, + "step": 12680 + }, + { + "epoch": 1.97, + "learning_rate": 0.00010320749804738348, + "loss": 2.0972, + "step": 12700 + }, + { + "epoch": 1.98, + "learning_rate": 0.00010289507940640457, + "loss": 2.1293, + "step": 12720 + }, + { + "epoch": 1.98, + "learning_rate": 0.00010258266076542565, + "loss": 2.1362, + "step": 12740 + }, + { + "epoch": 1.98, + "learning_rate": 0.00010227024212444675, + "loss": 2.1474, + "step": 12760 + }, + { + "epoch": 1.99, + "learning_rate": 0.00010195782348346783, + "loss": 2.2004, + "step": 12780 + }, + { + "epoch": 1.99, + "learning_rate": 0.00010164540484248893, + "loss": 2.1221, + "step": 12800 + }, + { + "epoch": 1.99, + "eval_loss": 2.340029716491699, + "eval_runtime": 72.0796, + "eval_samples_per_second": 27.747, + "eval_steps_per_second": 1.734, + "step": 12800 + }, + { + "epoch": 1.99, + "learning_rate": 0.00010133298620151001, + "loss": 2.1782, + "step": 12820 + }, + { + "epoch": 2.0, + "learning_rate": 0.00010102056756053109, + "loss": 2.1358, + "step": 12840 + }, + { + "epoch": 2.0, + "learning_rate": 0.00010070814891955218, + "loss": 2.122, + "step": 12860 + }, + { + "epoch": 2.0, + "learning_rate": 0.00010039573027857327, + "loss": 2.1494, + "step": 12880 + }, + { + "epoch": 2.0, + "learning_rate": 0.00010008331163759436, + "loss": 2.1522, + "step": 12900 + }, + { + "epoch": 2.01, + "learning_rate": 9.977089299661544e-05, + "loss": 2.1241, + "step": 12920 + }, + { + "epoch": 2.01, + "learning_rate": 9.945847435563654e-05, + "loss": 2.1456, + "step": 12940 + }, + { + "epoch": 2.01, + "learning_rate": 9.914605571465763e-05, + "loss": 2.1495, + "step": 12960 + }, + { + "epoch": 2.02, + "learning_rate": 9.883363707367873e-05, + "loss": 2.1734, + "step": 12980 + }, + { + "epoch": 2.02, + "learning_rate": 9.852121843269981e-05, + "loss": 2.1711, + "step": 13000 + }, + { + "epoch": 2.02, + "eval_loss": 2.339312791824341, + "eval_runtime": 69.2994, + "eval_samples_per_second": 28.86, + "eval_steps_per_second": 1.804, + "step": 13000 + }, + { + "epoch": 2.02, + "learning_rate": 9.820879979172091e-05, + "loss": 2.1483, + "step": 13020 + }, + { + "epoch": 2.03, + "learning_rate": 9.789638115074199e-05, + "loss": 2.124, + "step": 13040 + }, + { + "epoch": 2.03, + "learning_rate": 9.758396250976308e-05, + "loss": 2.1337, + "step": 13060 + }, + { + "epoch": 2.03, + "learning_rate": 9.727154386878417e-05, + "loss": 2.137, + "step": 13080 + }, + { + "epoch": 2.04, + "learning_rate": 9.695912522780526e-05, + "loss": 2.1225, + "step": 13100 + }, + { + "epoch": 2.04, + "learning_rate": 9.664670658682634e-05, + "loss": 2.1384, + "step": 13120 + }, + { + "epoch": 2.04, + "learning_rate": 9.633428794584743e-05, + "loss": 2.1052, + "step": 13140 + }, + { + "epoch": 2.05, + "learning_rate": 9.602186930486852e-05, + "loss": 2.1489, + "step": 13160 + }, + { + "epoch": 2.05, + "learning_rate": 9.57094506638896e-05, + "loss": 2.1154, + "step": 13180 + }, + { + "epoch": 2.05, + "learning_rate": 9.53970320229107e-05, + "loss": 2.1476, + "step": 13200 + }, + { + "epoch": 2.05, + "eval_loss": 2.3396096229553223, + "eval_runtime": 69.2833, + "eval_samples_per_second": 28.867, + "eval_steps_per_second": 1.804, + "step": 13200 + }, + { + "epoch": 2.05, + "learning_rate": 9.508461338193178e-05, + "loss": 2.1109, + "step": 13220 + }, + { + "epoch": 2.06, + "learning_rate": 9.477219474095288e-05, + "loss": 2.0973, + "step": 13240 + }, + { + "epoch": 2.06, + "learning_rate": 9.445977609997396e-05, + "loss": 2.1281, + "step": 13260 + }, + { + "epoch": 2.06, + "learning_rate": 9.414735745899505e-05, + "loss": 2.1216, + "step": 13280 + }, + { + "epoch": 2.07, + "learning_rate": 9.383493881801614e-05, + "loss": 2.1323, + "step": 13300 + }, + { + "epoch": 2.07, + "learning_rate": 9.352252017703723e-05, + "loss": 2.1477, + "step": 13320 + }, + { + "epoch": 2.07, + "learning_rate": 9.321010153605831e-05, + "loss": 2.1309, + "step": 13340 + }, + { + "epoch": 2.08, + "learning_rate": 9.28976828950794e-05, + "loss": 2.0899, + "step": 13360 + }, + { + "epoch": 2.08, + "learning_rate": 9.258526425410049e-05, + "loss": 2.1402, + "step": 13380 + }, + { + "epoch": 2.08, + "learning_rate": 9.227284561312157e-05, + "loss": 2.0768, + "step": 13400 + }, + { + "epoch": 2.08, + "eval_loss": 2.3376858234405518, + "eval_runtime": 69.4568, + "eval_samples_per_second": 28.795, + "eval_steps_per_second": 1.8, + "step": 13400 + }, + { + "epoch": 2.09, + "learning_rate": 9.196042697214267e-05, + "loss": 2.1405, + "step": 13420 + }, + { + "epoch": 2.09, + "learning_rate": 9.164800833116375e-05, + "loss": 2.1118, + "step": 13440 + }, + { + "epoch": 2.09, + "learning_rate": 9.133558969018484e-05, + "loss": 2.1525, + "step": 13460 + }, + { + "epoch": 2.09, + "learning_rate": 9.102317104920593e-05, + "loss": 2.1369, + "step": 13480 + }, + { + "epoch": 2.1, + "learning_rate": 9.071075240822702e-05, + "loss": 2.1683, + "step": 13500 + }, + { + "epoch": 2.1, + "learning_rate": 9.03983337672481e-05, + "loss": 2.1193, + "step": 13520 + }, + { + "epoch": 2.1, + "learning_rate": 9.00859151262692e-05, + "loss": 2.1222, + "step": 13540 + }, + { + "epoch": 2.11, + "learning_rate": 8.977349648529028e-05, + "loss": 2.1461, + "step": 13560 + }, + { + "epoch": 2.11, + "learning_rate": 8.946107784431136e-05, + "loss": 2.1106, + "step": 13580 + }, + { + "epoch": 2.11, + "learning_rate": 8.914865920333246e-05, + "loss": 2.1307, + "step": 13600 + }, + { + "epoch": 2.11, + "eval_loss": 2.3381118774414062, + "eval_runtime": 69.5609, + "eval_samples_per_second": 28.752, + "eval_steps_per_second": 1.797, + "step": 13600 + }, + { + "epoch": 2.12, + "learning_rate": 8.883624056235354e-05, + "loss": 2.1679, + "step": 13620 + }, + { + "epoch": 2.12, + "learning_rate": 8.852382192137464e-05, + "loss": 2.1418, + "step": 13640 + }, + { + "epoch": 2.12, + "learning_rate": 8.821140328039572e-05, + "loss": 2.1238, + "step": 13660 + }, + { + "epoch": 2.13, + "learning_rate": 8.789898463941681e-05, + "loss": 2.0995, + "step": 13680 + }, + { + "epoch": 2.13, + "learning_rate": 8.75865659984379e-05, + "loss": 2.1596, + "step": 13700 + }, + { + "epoch": 2.13, + "learning_rate": 8.727414735745899e-05, + "loss": 2.1478, + "step": 13720 + }, + { + "epoch": 2.14, + "learning_rate": 8.696172871648007e-05, + "loss": 2.1299, + "step": 13740 + }, + { + "epoch": 2.14, + "learning_rate": 8.664931007550115e-05, + "loss": 2.1405, + "step": 13760 + }, + { + "epoch": 2.14, + "learning_rate": 8.633689143452225e-05, + "loss": 2.174, + "step": 13780 + }, + { + "epoch": 2.14, + "learning_rate": 8.602447279354333e-05, + "loss": 2.129, + "step": 13800 + }, + { + "epoch": 2.14, + "eval_loss": 2.337769031524658, + "eval_runtime": 69.7472, + "eval_samples_per_second": 28.675, + "eval_steps_per_second": 1.792, + "step": 13800 + }, + { + "epoch": 2.15, + "learning_rate": 8.571205415256443e-05, + "loss": 2.1368, + "step": 13820 + }, + { + "epoch": 2.15, + "learning_rate": 8.539963551158551e-05, + "loss": 2.1573, + "step": 13840 + }, + { + "epoch": 2.15, + "learning_rate": 8.50872168706066e-05, + "loss": 2.1132, + "step": 13860 + }, + { + "epoch": 2.16, + "learning_rate": 8.477479822962769e-05, + "loss": 2.1131, + "step": 13880 + }, + { + "epoch": 2.16, + "learning_rate": 8.446237958864878e-05, + "loss": 2.1351, + "step": 13900 + }, + { + "epoch": 2.16, + "learning_rate": 8.414996094766986e-05, + "loss": 2.1738, + "step": 13920 + }, + { + "epoch": 2.17, + "learning_rate": 8.383754230669096e-05, + "loss": 2.1551, + "step": 13940 + }, + { + "epoch": 2.17, + "learning_rate": 8.352512366571204e-05, + "loss": 2.1195, + "step": 13960 + }, + { + "epoch": 2.17, + "learning_rate": 8.321270502473312e-05, + "loss": 2.1125, + "step": 13980 + }, + { + "epoch": 2.18, + "learning_rate": 8.290028638375422e-05, + "loss": 2.1549, + "step": 14000 + }, + { + "epoch": 2.18, + "eval_loss": 2.337301731109619, + "eval_runtime": 69.7462, + "eval_samples_per_second": 28.675, + "eval_steps_per_second": 1.792, + "step": 14000 + }, + { + "epoch": 2.18, + "learning_rate": 8.25878677427753e-05, + "loss": 2.1573, + "step": 14020 + }, + { + "epoch": 2.18, + "learning_rate": 8.22754491017964e-05, + "loss": 2.1125, + "step": 14040 + }, + { + "epoch": 2.18, + "learning_rate": 8.196303046081748e-05, + "loss": 2.161, + "step": 14060 + }, + { + "epoch": 2.19, + "learning_rate": 8.165061181983857e-05, + "loss": 2.1511, + "step": 14080 + }, + { + "epoch": 2.19, + "learning_rate": 8.133819317885967e-05, + "loss": 2.1737, + "step": 14100 + }, + { + "epoch": 2.19, + "learning_rate": 8.102577453788076e-05, + "loss": 2.1158, + "step": 14120 + }, + { + "epoch": 2.2, + "learning_rate": 8.071335589690184e-05, + "loss": 2.1398, + "step": 14140 + }, + { + "epoch": 2.2, + "learning_rate": 8.040093725592294e-05, + "loss": 2.1183, + "step": 14160 + }, + { + "epoch": 2.2, + "learning_rate": 8.008851861494402e-05, + "loss": 2.1295, + "step": 14180 + }, + { + "epoch": 2.21, + "learning_rate": 7.977609997396512e-05, + "loss": 2.1416, + "step": 14200 + }, + { + "epoch": 2.21, + "eval_loss": 2.336796760559082, + "eval_runtime": 69.3578, + "eval_samples_per_second": 28.836, + "eval_steps_per_second": 1.802, + "step": 14200 + }, + { + "epoch": 2.21, + "learning_rate": 7.94636813329862e-05, + "loss": 2.1461, + "step": 14220 + }, + { + "epoch": 2.21, + "learning_rate": 7.91512626920073e-05, + "loss": 2.0931, + "step": 14240 + }, + { + "epoch": 2.22, + "learning_rate": 7.883884405102838e-05, + "loss": 2.1341, + "step": 14260 + }, + { + "epoch": 2.22, + "learning_rate": 7.852642541004946e-05, + "loss": 2.1369, + "step": 14280 + }, + { + "epoch": 2.22, + "learning_rate": 7.821400676907055e-05, + "loss": 2.1431, + "step": 14300 + }, + { + "epoch": 2.23, + "learning_rate": 7.790158812809164e-05, + "loss": 2.1508, + "step": 14320 + }, + { + "epoch": 2.23, + "learning_rate": 7.758916948711273e-05, + "loss": 2.1456, + "step": 14340 + }, + { + "epoch": 2.23, + "learning_rate": 7.727675084613381e-05, + "loss": 2.1448, + "step": 14360 + }, + { + "epoch": 2.23, + "learning_rate": 7.696433220515491e-05, + "loss": 2.1637, + "step": 14380 + }, + { + "epoch": 2.24, + "learning_rate": 7.665191356417599e-05, + "loss": 2.114, + "step": 14400 + }, + { + "epoch": 2.24, + "eval_loss": 2.3362655639648438, + "eval_runtime": 69.5792, + "eval_samples_per_second": 28.744, + "eval_steps_per_second": 1.797, + "step": 14400 + }, + { + "epoch": 2.24, + "learning_rate": 7.633949492319709e-05, + "loss": 2.1222, + "step": 14420 + }, + { + "epoch": 2.24, + "learning_rate": 7.602707628221817e-05, + "loss": 2.1776, + "step": 14440 + }, + { + "epoch": 2.25, + "learning_rate": 7.57302785732882e-05, + "loss": 2.1414, + "step": 14460 + }, + { + "epoch": 2.25, + "learning_rate": 7.541785993230929e-05, + "loss": 2.1231, + "step": 14480 + }, + { + "epoch": 2.25, + "learning_rate": 7.510544129133038e-05, + "loss": 2.1345, + "step": 14500 + }, + { + "epoch": 2.26, + "learning_rate": 7.479302265035147e-05, + "loss": 2.1339, + "step": 14520 + }, + { + "epoch": 2.26, + "learning_rate": 7.448060400937255e-05, + "loss": 2.1562, + "step": 14540 + }, + { + "epoch": 2.26, + "learning_rate": 7.416818536839363e-05, + "loss": 2.1649, + "step": 14560 + }, + { + "epoch": 2.27, + "learning_rate": 7.385576672741473e-05, + "loss": 2.1339, + "step": 14580 + }, + { + "epoch": 2.27, + "learning_rate": 7.354334808643581e-05, + "loss": 2.1347, + "step": 14600 + }, + { + "epoch": 2.27, + "eval_loss": 2.335818290710449, + "eval_runtime": 69.5131, + "eval_samples_per_second": 28.772, + "eval_steps_per_second": 1.798, + "step": 14600 + }, + { + "epoch": 2.27, + "learning_rate": 7.323092944545691e-05, + "loss": 2.1078, + "step": 14620 + }, + { + "epoch": 2.28, + "learning_rate": 7.291851080447799e-05, + "loss": 2.1446, + "step": 14640 + }, + { + "epoch": 2.28, + "learning_rate": 7.260609216349908e-05, + "loss": 2.1076, + "step": 14660 + }, + { + "epoch": 2.28, + "learning_rate": 7.229367352252017e-05, + "loss": 2.1548, + "step": 14680 + }, + { + "epoch": 2.28, + "learning_rate": 7.198125488154126e-05, + "loss": 2.1317, + "step": 14700 + }, + { + "epoch": 2.29, + "learning_rate": 7.166883624056234e-05, + "loss": 2.0991, + "step": 14720 + }, + { + "epoch": 2.29, + "learning_rate": 7.135641759958343e-05, + "loss": 2.1507, + "step": 14740 + }, + { + "epoch": 2.29, + "learning_rate": 7.104399895860452e-05, + "loss": 2.1173, + "step": 14760 + }, + { + "epoch": 2.3, + "learning_rate": 7.073158031762562e-05, + "loss": 2.104, + "step": 14780 + }, + { + "epoch": 2.3, + "learning_rate": 7.043478260869565e-05, + "loss": 2.1118, + "step": 14800 + }, + { + "epoch": 2.3, + "eval_loss": 2.334048271179199, + "eval_runtime": 69.3816, + "eval_samples_per_second": 28.826, + "eval_steps_per_second": 1.802, + "step": 14800 + }, + { + "epoch": 2.3, + "learning_rate": 7.012236396771674e-05, + "loss": 2.0738, + "step": 14820 + }, + { + "epoch": 2.31, + "learning_rate": 6.980994532673782e-05, + "loss": 2.1221, + "step": 14840 + }, + { + "epoch": 2.31, + "learning_rate": 6.94975266857589e-05, + "loss": 2.1531, + "step": 14860 + }, + { + "epoch": 2.31, + "learning_rate": 6.918510804478e-05, + "loss": 2.1318, + "step": 14880 + }, + { + "epoch": 2.32, + "learning_rate": 6.887268940380108e-05, + "loss": 2.1251, + "step": 14900 + }, + { + "epoch": 2.32, + "learning_rate": 6.856027076282218e-05, + "loss": 2.1212, + "step": 14920 + }, + { + "epoch": 2.32, + "learning_rate": 6.824785212184326e-05, + "loss": 2.0927, + "step": 14940 + }, + { + "epoch": 2.32, + "learning_rate": 6.793543348086436e-05, + "loss": 2.1277, + "step": 14960 + }, + { + "epoch": 2.33, + "learning_rate": 6.762301483988544e-05, + "loss": 2.156, + "step": 14980 + }, + { + "epoch": 2.33, + "learning_rate": 6.731059619890653e-05, + "loss": 2.1276, + "step": 15000 + }, + { + "epoch": 2.33, + "eval_loss": 2.3340351581573486, + "eval_runtime": 69.2926, + "eval_samples_per_second": 28.863, + "eval_steps_per_second": 1.804, + "step": 15000 + }, + { + "epoch": 2.33, + "learning_rate": 6.699817755792761e-05, + "loss": 2.1313, + "step": 15020 + }, + { + "epoch": 2.34, + "learning_rate": 6.668575891694871e-05, + "loss": 2.1452, + "step": 15040 + }, + { + "epoch": 2.34, + "learning_rate": 6.637334027596979e-05, + "loss": 2.1148, + "step": 15060 + }, + { + "epoch": 2.34, + "learning_rate": 6.606092163499087e-05, + "loss": 2.1193, + "step": 15080 + }, + { + "epoch": 2.35, + "learning_rate": 6.574850299401197e-05, + "loss": 2.1672, + "step": 15100 + }, + { + "epoch": 2.35, + "learning_rate": 6.543608435303305e-05, + "loss": 2.0789, + "step": 15120 + }, + { + "epoch": 2.35, + "learning_rate": 6.512366571205415e-05, + "loss": 2.1438, + "step": 15140 + }, + { + "epoch": 2.36, + "learning_rate": 6.481124707107523e-05, + "loss": 2.1597, + "step": 15160 + }, + { + "epoch": 2.36, + "learning_rate": 6.449882843009632e-05, + "loss": 2.11, + "step": 15180 + }, + { + "epoch": 2.36, + "learning_rate": 6.418640978911742e-05, + "loss": 2.1279, + "step": 15200 + }, + { + "epoch": 2.36, + "eval_loss": 2.3344008922576904, + "eval_runtime": 69.3363, + "eval_samples_per_second": 28.845, + "eval_steps_per_second": 1.803, + "step": 15200 + }, + { + "epoch": 2.37, + "learning_rate": 6.38739911481385e-05, + "loss": 2.1459, + "step": 15220 + }, + { + "epoch": 2.37, + "learning_rate": 6.35615725071596e-05, + "loss": 2.1702, + "step": 15240 + }, + { + "epoch": 2.37, + "learning_rate": 6.324915386618068e-05, + "loss": 2.1262, + "step": 15260 + }, + { + "epoch": 2.37, + "learning_rate": 6.293673522520177e-05, + "loss": 2.0988, + "step": 15280 + }, + { + "epoch": 2.38, + "learning_rate": 6.262431658422286e-05, + "loss": 2.1224, + "step": 15300 + }, + { + "epoch": 2.38, + "learning_rate": 6.231189794324394e-05, + "loss": 2.1102, + "step": 15320 + }, + { + "epoch": 2.38, + "learning_rate": 6.199947930226503e-05, + "loss": 2.1168, + "step": 15340 + }, + { + "epoch": 2.39, + "learning_rate": 6.168706066128611e-05, + "loss": 2.1205, + "step": 15360 + }, + { + "epoch": 2.39, + "learning_rate": 6.137464202030721e-05, + "loss": 2.0855, + "step": 15380 + }, + { + "epoch": 2.39, + "learning_rate": 6.106222337932829e-05, + "loss": 2.1548, + "step": 15400 + }, + { + "epoch": 2.39, + "eval_loss": 2.333451271057129, + "eval_runtime": 69.3334, + "eval_samples_per_second": 28.846, + "eval_steps_per_second": 1.803, + "step": 15400 + }, + { + "epoch": 2.4, + "learning_rate": 6.074980473834938e-05, + "loss": 2.1433, + "step": 15420 + }, + { + "epoch": 2.4, + "learning_rate": 6.043738609737047e-05, + "loss": 2.123, + "step": 15440 + }, + { + "epoch": 2.4, + "learning_rate": 6.012496745639156e-05, + "loss": 2.0965, + "step": 15460 + }, + { + "epoch": 2.41, + "learning_rate": 5.9812548815412647e-05, + "loss": 2.1498, + "step": 15480 + }, + { + "epoch": 2.41, + "learning_rate": 5.9500130174433735e-05, + "loss": 2.1456, + "step": 15500 + }, + { + "epoch": 2.41, + "learning_rate": 5.9187711533454824e-05, + "loss": 2.1295, + "step": 15520 + }, + { + "epoch": 2.41, + "learning_rate": 5.887529289247591e-05, + "loss": 2.108, + "step": 15540 + }, + { + "epoch": 2.42, + "learning_rate": 5.8562874251497e-05, + "loss": 2.1592, + "step": 15560 + }, + { + "epoch": 2.42, + "learning_rate": 5.825045561051809e-05, + "loss": 2.1214, + "step": 15580 + }, + { + "epoch": 2.42, + "learning_rate": 5.793803696953918e-05, + "loss": 2.1561, + "step": 15600 + }, + { + "epoch": 2.42, + "eval_loss": 2.3329403400421143, + "eval_runtime": 69.6034, + "eval_samples_per_second": 28.734, + "eval_steps_per_second": 1.796, + "step": 15600 + }, + { + "epoch": 2.43, + "learning_rate": 5.762561832856026e-05, + "loss": 2.1382, + "step": 15620 + }, + { + "epoch": 2.43, + "learning_rate": 5.731319968758135e-05, + "loss": 2.109, + "step": 15640 + }, + { + "epoch": 2.43, + "learning_rate": 5.700078104660244e-05, + "loss": 2.1283, + "step": 15660 + }, + { + "epoch": 2.44, + "learning_rate": 5.6688362405623526e-05, + "loss": 2.15, + "step": 15680 + }, + { + "epoch": 2.44, + "learning_rate": 5.6375943764644615e-05, + "loss": 2.1125, + "step": 15700 + }, + { + "epoch": 2.44, + "learning_rate": 5.6063525123665704e-05, + "loss": 2.1709, + "step": 15720 + }, + { + "epoch": 2.45, + "learning_rate": 5.575110648268679e-05, + "loss": 2.1622, + "step": 15740 + }, + { + "epoch": 2.45, + "learning_rate": 5.543868784170789e-05, + "loss": 2.0769, + "step": 15760 + }, + { + "epoch": 2.45, + "learning_rate": 5.5126269200728976e-05, + "loss": 2.137, + "step": 15780 + }, + { + "epoch": 2.46, + "learning_rate": 5.4813850559750065e-05, + "loss": 2.1294, + "step": 15800 + }, + { + "epoch": 2.46, + "eval_loss": 2.3324475288391113, + "eval_runtime": 69.559, + "eval_samples_per_second": 28.753, + "eval_steps_per_second": 1.797, + "step": 15800 + }, + { + "epoch": 2.46, + "learning_rate": 5.4501431918771154e-05, + "loss": 2.1425, + "step": 15820 + }, + { + "epoch": 2.46, + "learning_rate": 5.418901327779224e-05, + "loss": 2.128, + "step": 15840 + }, + { + "epoch": 2.46, + "learning_rate": 5.387659463681333e-05, + "loss": 2.1553, + "step": 15860 + }, + { + "epoch": 2.47, + "learning_rate": 5.356417599583441e-05, + "loss": 2.1339, + "step": 15880 + }, + { + "epoch": 2.47, + "learning_rate": 5.32517573548555e-05, + "loss": 2.1536, + "step": 15900 + }, + { + "epoch": 2.47, + "learning_rate": 5.293933871387659e-05, + "loss": 2.1669, + "step": 15920 + }, + { + "epoch": 2.48, + "learning_rate": 5.262692007289768e-05, + "loss": 2.122, + "step": 15940 + }, + { + "epoch": 2.48, + "learning_rate": 5.231450143191877e-05, + "loss": 2.1435, + "step": 15960 + }, + { + "epoch": 2.48, + "learning_rate": 5.2002082790939856e-05, + "loss": 2.1406, + "step": 15980 + }, + { + "epoch": 2.49, + "learning_rate": 5.1689664149960945e-05, + "loss": 2.1174, + "step": 16000 + }, + { + "epoch": 2.49, + "eval_loss": 2.332836866378784, + "eval_runtime": 69.3739, + "eval_samples_per_second": 28.829, + "eval_steps_per_second": 1.802, + "step": 16000 + }, + { + "epoch": 2.49, + "learning_rate": 5.137724550898203e-05, + "loss": 2.1286, + "step": 16020 + }, + { + "epoch": 2.49, + "learning_rate": 5.106482686800312e-05, + "loss": 2.1343, + "step": 16040 + }, + { + "epoch": 2.5, + "learning_rate": 5.075240822702421e-05, + "loss": 2.1134, + "step": 16060 + }, + { + "epoch": 2.5, + "learning_rate": 5.043998958604529e-05, + "loss": 2.1633, + "step": 16080 + }, + { + "epoch": 2.5, + "learning_rate": 5.012757094506638e-05, + "loss": 2.1473, + "step": 16100 + }, + { + "epoch": 2.5, + "learning_rate": 4.981515230408747e-05, + "loss": 2.1535, + "step": 16120 + }, + { + "epoch": 2.51, + "learning_rate": 4.950273366310856e-05, + "loss": 2.112, + "step": 16140 + }, + { + "epoch": 2.51, + "learning_rate": 4.919031502212965e-05, + "loss": 2.1399, + "step": 16160 + }, + { + "epoch": 2.51, + "learning_rate": 4.8877896381150736e-05, + "loss": 2.0913, + "step": 16180 + }, + { + "epoch": 2.52, + "learning_rate": 4.8565477740171824e-05, + "loss": 2.1179, + "step": 16200 + }, + { + "epoch": 2.52, + "eval_loss": 2.332409143447876, + "eval_runtime": 69.3294, + "eval_samples_per_second": 28.848, + "eval_steps_per_second": 1.803, + "step": 16200 + }, + { + "epoch": 2.52, + "learning_rate": 4.825305909919291e-05, + "loss": 2.1756, + "step": 16220 + }, + { + "epoch": 2.52, + "learning_rate": 4.7940640458214e-05, + "loss": 2.1466, + "step": 16240 + }, + { + "epoch": 2.53, + "learning_rate": 4.762822181723509e-05, + "loss": 2.1443, + "step": 16260 + }, + { + "epoch": 2.53, + "learning_rate": 4.731580317625618e-05, + "loss": 2.1207, + "step": 16280 + }, + { + "epoch": 2.53, + "learning_rate": 4.700338453527726e-05, + "loss": 2.1275, + "step": 16300 + }, + { + "epoch": 2.54, + "learning_rate": 4.669096589429835e-05, + "loss": 2.1305, + "step": 16320 + }, + { + "epoch": 2.54, + "learning_rate": 4.6378547253319445e-05, + "loss": 2.134, + "step": 16340 + }, + { + "epoch": 2.54, + "learning_rate": 4.6066128612340534e-05, + "loss": 2.1681, + "step": 16360 + }, + { + "epoch": 2.55, + "learning_rate": 4.575370997136162e-05, + "loss": 2.1627, + "step": 16380 + }, + { + "epoch": 2.55, + "learning_rate": 4.544129133038271e-05, + "loss": 2.1421, + "step": 16400 + }, + { + "epoch": 2.55, + "eval_loss": 2.3318614959716797, + "eval_runtime": 69.3251, + "eval_samples_per_second": 28.85, + "eval_steps_per_second": 1.803, + "step": 16400 + }, + { + "epoch": 2.55, + "learning_rate": 4.51288726894038e-05, + "loss": 2.1225, + "step": 16420 + }, + { + "epoch": 2.55, + "learning_rate": 4.481645404842489e-05, + "loss": 2.156, + "step": 16440 + }, + { + "epoch": 2.56, + "learning_rate": 4.450403540744598e-05, + "loss": 2.1573, + "step": 16460 + }, + { + "epoch": 2.56, + "learning_rate": 4.4191616766467066e-05, + "loss": 2.1295, + "step": 16480 + }, + { + "epoch": 2.56, + "learning_rate": 4.3879198125488154e-05, + "loss": 2.14, + "step": 16500 + }, + { + "epoch": 2.57, + "learning_rate": 4.356677948450924e-05, + "loss": 2.1046, + "step": 16520 + }, + { + "epoch": 2.57, + "learning_rate": 4.3254360843530325e-05, + "loss": 2.1201, + "step": 16540 + }, + { + "epoch": 2.57, + "learning_rate": 4.2941942202551413e-05, + "loss": 2.1767, + "step": 16560 + }, + { + "epoch": 2.58, + "learning_rate": 4.26295235615725e-05, + "loss": 2.1244, + "step": 16580 + }, + { + "epoch": 2.58, + "learning_rate": 4.231710492059359e-05, + "loss": 2.1301, + "step": 16600 + }, + { + "epoch": 2.58, + "eval_loss": 2.331899881362915, + "eval_runtime": 69.3398, + "eval_samples_per_second": 28.843, + "eval_steps_per_second": 1.803, + "step": 16600 + }, + { + "epoch": 2.58, + "learning_rate": 4.200468627961468e-05, + "loss": 2.1022, + "step": 16620 + }, + { + "epoch": 2.59, + "learning_rate": 4.169226763863577e-05, + "loss": 2.1121, + "step": 16640 + }, + { + "epoch": 2.59, + "learning_rate": 4.137984899765686e-05, + "loss": 2.1014, + "step": 16660 + }, + { + "epoch": 2.59, + "learning_rate": 4.1067430356677945e-05, + "loss": 2.1867, + "step": 16680 + }, + { + "epoch": 2.6, + "learning_rate": 4.0755011715699034e-05, + "loss": 2.1055, + "step": 16700 + }, + { + "epoch": 2.6, + "learning_rate": 4.044259307472012e-05, + "loss": 2.1435, + "step": 16720 + }, + { + "epoch": 2.6, + "learning_rate": 4.013017443374121e-05, + "loss": 2.09, + "step": 16740 + }, + { + "epoch": 2.6, + "learning_rate": 3.981775579276229e-05, + "loss": 2.1317, + "step": 16760 + }, + { + "epoch": 2.61, + "learning_rate": 3.950533715178338e-05, + "loss": 2.0683, + "step": 16780 + }, + { + "epoch": 2.61, + "learning_rate": 3.919291851080447e-05, + "loss": 2.1249, + "step": 16800 + }, + { + "epoch": 2.61, + "eval_loss": 2.331566572189331, + "eval_runtime": 69.3154, + "eval_samples_per_second": 28.854, + "eval_steps_per_second": 1.803, + "step": 16800 + }, + { + "epoch": 2.61, + "learning_rate": 3.888049986982556e-05, + "loss": 2.164, + "step": 16820 + }, + { + "epoch": 2.62, + "learning_rate": 3.856808122884665e-05, + "loss": 2.16, + "step": 16840 + }, + { + "epoch": 2.62, + "learning_rate": 3.8255662587867736e-05, + "loss": 2.1603, + "step": 16860 + }, + { + "epoch": 2.62, + "learning_rate": 3.7943243946888825e-05, + "loss": 2.1346, + "step": 16880 + }, + { + "epoch": 2.63, + "learning_rate": 3.7630825305909914e-05, + "loss": 2.1082, + "step": 16900 + }, + { + "epoch": 2.63, + "learning_rate": 3.7318406664931e-05, + "loss": 2.1014, + "step": 16920 + }, + { + "epoch": 2.63, + "learning_rate": 3.700598802395209e-05, + "loss": 2.1088, + "step": 16940 + }, + { + "epoch": 2.64, + "learning_rate": 3.669356938297318e-05, + "loss": 2.0975, + "step": 16960 + }, + { + "epoch": 2.64, + "learning_rate": 3.638115074199427e-05, + "loss": 2.1212, + "step": 16980 + }, + { + "epoch": 2.64, + "learning_rate": 3.606873210101536e-05, + "loss": 2.1226, + "step": 17000 + }, + { + "epoch": 2.64, + "eval_loss": 2.3310983180999756, + "eval_runtime": 69.3945, + "eval_samples_per_second": 28.821, + "eval_steps_per_second": 1.801, + "step": 17000 + }, + { + "epoch": 2.64, + "learning_rate": 3.5756313460036446e-05, + "loss": 2.1318, + "step": 17020 + }, + { + "epoch": 2.65, + "learning_rate": 3.5443894819057534e-05, + "loss": 2.1073, + "step": 17040 + }, + { + "epoch": 2.65, + "learning_rate": 3.513147617807862e-05, + "loss": 2.1411, + "step": 17060 + }, + { + "epoch": 2.65, + "learning_rate": 3.481905753709971e-05, + "loss": 2.0959, + "step": 17080 + }, + { + "epoch": 2.66, + "learning_rate": 3.45066388961208e-05, + "loss": 2.0858, + "step": 17100 + }, + { + "epoch": 2.66, + "learning_rate": 3.419422025514189e-05, + "loss": 2.1174, + "step": 17120 + }, + { + "epoch": 2.66, + "learning_rate": 3.388180161416298e-05, + "loss": 2.1459, + "step": 17140 + }, + { + "epoch": 2.67, + "learning_rate": 3.3569382973184066e-05, + "loss": 2.1425, + "step": 17160 + }, + { + "epoch": 2.67, + "learning_rate": 3.3256964332205155e-05, + "loss": 2.0971, + "step": 17180 + }, + { + "epoch": 2.67, + "learning_rate": 3.2944545691226243e-05, + "loss": 2.1176, + "step": 17200 + }, + { + "epoch": 2.67, + "eval_loss": 2.330962896347046, + "eval_runtime": 69.3407, + "eval_samples_per_second": 28.843, + "eval_steps_per_second": 1.803, + "step": 17200 + }, + { + "epoch": 2.68, + "learning_rate": 3.2632127050247325e-05, + "loss": 2.1471, + "step": 17220 + }, + { + "epoch": 2.68, + "learning_rate": 3.2319708409268414e-05, + "loss": 2.1064, + "step": 17240 + }, + { + "epoch": 2.68, + "learning_rate": 3.20072897682895e-05, + "loss": 2.1347, + "step": 17260 + }, + { + "epoch": 2.69, + "learning_rate": 3.169487112731059e-05, + "loss": 2.142, + "step": 17280 + }, + { + "epoch": 2.69, + "learning_rate": 3.138245248633168e-05, + "loss": 2.1773, + "step": 17300 + }, + { + "epoch": 2.69, + "learning_rate": 3.107003384535277e-05, + "loss": 2.1489, + "step": 17320 + }, + { + "epoch": 2.69, + "learning_rate": 3.075761520437386e-05, + "loss": 2.1257, + "step": 17340 + }, + { + "epoch": 2.7, + "learning_rate": 3.044519656339495e-05, + "loss": 2.1288, + "step": 17360 + }, + { + "epoch": 2.7, + "learning_rate": 3.0132777922416038e-05, + "loss": 2.1258, + "step": 17380 + }, + { + "epoch": 2.7, + "learning_rate": 2.9820359281437123e-05, + "loss": 2.1322, + "step": 17400 + }, + { + "epoch": 2.7, + "eval_loss": 2.3309593200683594, + "eval_runtime": 69.3923, + "eval_samples_per_second": 28.822, + "eval_steps_per_second": 1.801, + "step": 17400 + }, + { + "epoch": 2.71, + "learning_rate": 2.9507940640458212e-05, + "loss": 2.1495, + "step": 17420 + }, + { + "epoch": 2.71, + "learning_rate": 2.91955219994793e-05, + "loss": 2.0843, + "step": 17440 + }, + { + "epoch": 2.71, + "learning_rate": 2.888310335850039e-05, + "loss": 2.11, + "step": 17460 + }, + { + "epoch": 2.72, + "learning_rate": 2.8570684717521478e-05, + "loss": 2.1005, + "step": 17480 + }, + { + "epoch": 2.72, + "learning_rate": 2.827388700859151e-05, + "loss": 2.1302, + "step": 17500 + }, + { + "epoch": 2.72, + "learning_rate": 2.79614683676126e-05, + "loss": 2.1086, + "step": 17520 + }, + { + "epoch": 2.73, + "learning_rate": 2.7649049726633688e-05, + "loss": 2.1302, + "step": 17540 + }, + { + "epoch": 2.73, + "learning_rate": 2.7336631085654777e-05, + "loss": 2.1417, + "step": 17560 + }, + { + "epoch": 2.73, + "learning_rate": 2.7024212444675862e-05, + "loss": 2.1369, + "step": 17580 + }, + { + "epoch": 2.73, + "learning_rate": 2.671179380369695e-05, + "loss": 2.1384, + "step": 17600 + }, + { + "epoch": 2.73, + "eval_loss": 2.33089017868042, + "eval_runtime": 69.3747, + "eval_samples_per_second": 28.829, + "eval_steps_per_second": 1.802, + "step": 17600 + }, + { + "epoch": 2.74, + "learning_rate": 2.639937516271804e-05, + "loss": 2.1243, + "step": 17620 + }, + { + "epoch": 2.74, + "learning_rate": 2.6086956521739128e-05, + "loss": 2.1161, + "step": 17640 + }, + { + "epoch": 2.74, + "learning_rate": 2.5774537880760217e-05, + "loss": 2.1051, + "step": 17660 + }, + { + "epoch": 2.75, + "learning_rate": 2.5462119239781302e-05, + "loss": 2.0762, + "step": 17680 + }, + { + "epoch": 2.75, + "learning_rate": 2.514970059880239e-05, + "loss": 2.1105, + "step": 17700 + }, + { + "epoch": 2.75, + "learning_rate": 2.483728195782348e-05, + "loss": 2.1535, + "step": 17720 + }, + { + "epoch": 2.76, + "learning_rate": 2.452486331684457e-05, + "loss": 2.1706, + "step": 17740 + }, + { + "epoch": 2.76, + "learning_rate": 2.421244467586566e-05, + "loss": 2.0857, + "step": 17760 + }, + { + "epoch": 2.76, + "learning_rate": 2.390002603488675e-05, + "loss": 2.1553, + "step": 17780 + }, + { + "epoch": 2.77, + "learning_rate": 2.3587607393907834e-05, + "loss": 2.0983, + "step": 17800 + }, + { + "epoch": 2.77, + "eval_loss": 2.3304569721221924, + "eval_runtime": 69.35, + "eval_samples_per_second": 28.839, + "eval_steps_per_second": 1.802, + "step": 17800 + }, + { + "epoch": 2.77, + "learning_rate": 2.3275188752928923e-05, + "loss": 2.1212, + "step": 17820 + }, + { + "epoch": 2.77, + "learning_rate": 2.296277011195001e-05, + "loss": 2.0816, + "step": 17840 + }, + { + "epoch": 2.78, + "learning_rate": 2.26503514709711e-05, + "loss": 2.0935, + "step": 17860 + }, + { + "epoch": 2.78, + "learning_rate": 2.233793282999219e-05, + "loss": 2.1576, + "step": 17880 + }, + { + "epoch": 2.78, + "learning_rate": 2.2025514189013274e-05, + "loss": 2.1076, + "step": 17900 + }, + { + "epoch": 2.78, + "learning_rate": 2.1713095548034362e-05, + "loss": 2.1184, + "step": 17920 + }, + { + "epoch": 2.79, + "learning_rate": 2.140067690705545e-05, + "loss": 2.1169, + "step": 17940 + }, + { + "epoch": 2.79, + "learning_rate": 2.108825826607654e-05, + "loss": 2.1442, + "step": 17960 + }, + { + "epoch": 2.79, + "learning_rate": 2.077583962509763e-05, + "loss": 2.1332, + "step": 17980 + }, + { + "epoch": 2.8, + "learning_rate": 2.0463420984118717e-05, + "loss": 2.1553, + "step": 18000 + }, + { + "epoch": 2.8, + "eval_loss": 2.330599069595337, + "eval_runtime": 69.346, + "eval_samples_per_second": 28.841, + "eval_steps_per_second": 1.803, + "step": 18000 + }, + { + "epoch": 2.8, + "learning_rate": 2.0151002343139802e-05, + "loss": 2.1055, + "step": 18020 + }, + { + "epoch": 2.8, + "learning_rate": 1.9838583702160894e-05, + "loss": 2.0778, + "step": 18040 + }, + { + "epoch": 2.81, + "learning_rate": 1.9526165061181983e-05, + "loss": 2.143, + "step": 18060 + }, + { + "epoch": 2.81, + "learning_rate": 1.921374642020307e-05, + "loss": 2.0886, + "step": 18080 + }, + { + "epoch": 2.81, + "learning_rate": 1.890132777922416e-05, + "loss": 2.1236, + "step": 18100 + }, + { + "epoch": 2.82, + "learning_rate": 1.858890913824525e-05, + "loss": 2.1307, + "step": 18120 + }, + { + "epoch": 2.82, + "learning_rate": 1.8276490497266334e-05, + "loss": 2.1192, + "step": 18140 + }, + { + "epoch": 2.82, + "learning_rate": 1.7964071856287423e-05, + "loss": 2.0999, + "step": 18160 + }, + { + "epoch": 2.83, + "learning_rate": 1.765165321530851e-05, + "loss": 2.0792, + "step": 18180 + }, + { + "epoch": 2.83, + "learning_rate": 1.73392345743296e-05, + "loss": 2.1015, + "step": 18200 + }, + { + "epoch": 2.83, + "eval_loss": 2.330050230026245, + "eval_runtime": 69.3278, + "eval_samples_per_second": 28.848, + "eval_steps_per_second": 1.803, + "step": 18200 + }, + { + "epoch": 2.83, + "learning_rate": 1.702681593335069e-05, + "loss": 2.1226, + "step": 18220 + }, + { + "epoch": 2.83, + "learning_rate": 1.6714397292371778e-05, + "loss": 2.0924, + "step": 18240 + }, + { + "epoch": 2.84, + "learning_rate": 1.6401978651392866e-05, + "loss": 2.1272, + "step": 18260 + }, + { + "epoch": 2.84, + "learning_rate": 1.6089560010413955e-05, + "loss": 2.1175, + "step": 18280 + }, + { + "epoch": 2.84, + "learning_rate": 1.577714136943504e-05, + "loss": 2.1396, + "step": 18300 + }, + { + "epoch": 2.85, + "learning_rate": 1.546472272845613e-05, + "loss": 2.1514, + "step": 18320 + }, + { + "epoch": 2.85, + "learning_rate": 1.5152304087477217e-05, + "loss": 2.1257, + "step": 18340 + }, + { + "epoch": 2.85, + "learning_rate": 1.4839885446498306e-05, + "loss": 2.1459, + "step": 18360 + }, + { + "epoch": 2.86, + "learning_rate": 1.4527466805519396e-05, + "loss": 2.09, + "step": 18380 + }, + { + "epoch": 2.86, + "learning_rate": 1.4215048164540483e-05, + "loss": 2.1442, + "step": 18400 + }, + { + "epoch": 2.86, + "eval_loss": 2.330048084259033, + "eval_runtime": 69.2975, + "eval_samples_per_second": 28.861, + "eval_steps_per_second": 1.804, + "step": 18400 + }, + { + "epoch": 2.86, + "learning_rate": 1.3902629523561572e-05, + "loss": 2.1816, + "step": 18420 + }, + { + "epoch": 2.87, + "learning_rate": 1.3590210882582659e-05, + "loss": 2.0965, + "step": 18440 + }, + { + "epoch": 2.87, + "learning_rate": 1.3277792241603748e-05, + "loss": 2.1178, + "step": 18460 + }, + { + "epoch": 2.87, + "learning_rate": 1.2965373600624836e-05, + "loss": 2.1562, + "step": 18480 + }, + { + "epoch": 2.87, + "learning_rate": 1.2652954959645923e-05, + "loss": 2.095, + "step": 18500 + }, + { + "epoch": 2.88, + "learning_rate": 1.2340536318667012e-05, + "loss": 2.1522, + "step": 18520 + }, + { + "epoch": 2.88, + "learning_rate": 1.2028117677688102e-05, + "loss": 2.1729, + "step": 18540 + }, + { + "epoch": 2.88, + "learning_rate": 1.1715699036709189e-05, + "loss": 2.141, + "step": 18560 + }, + { + "epoch": 2.89, + "learning_rate": 1.1403280395730278e-05, + "loss": 2.148, + "step": 18580 + }, + { + "epoch": 2.89, + "learning_rate": 1.1090861754751366e-05, + "loss": 2.1619, + "step": 18600 + }, + { + "epoch": 2.89, + "eval_loss": 2.329728603363037, + "eval_runtime": 69.3412, + "eval_samples_per_second": 28.843, + "eval_steps_per_second": 1.803, + "step": 18600 + }, + { + "epoch": 2.89, + "learning_rate": 1.0778443113772453e-05, + "loss": 2.1199, + "step": 18620 + }, + { + "epoch": 2.9, + "learning_rate": 1.0466024472793542e-05, + "loss": 2.131, + "step": 18640 + }, + { + "epoch": 2.9, + "learning_rate": 1.0153605831814629e-05, + "loss": 2.1512, + "step": 18660 + }, + { + "epoch": 2.9, + "learning_rate": 9.84118719083572e-06, + "loss": 2.1292, + "step": 18680 + }, + { + "epoch": 2.91, + "learning_rate": 9.528768549856808e-06, + "loss": 2.0928, + "step": 18700 + }, + { + "epoch": 2.91, + "learning_rate": 9.216349908877897e-06, + "loss": 2.1168, + "step": 18720 + }, + { + "epoch": 2.91, + "learning_rate": 8.903931267898984e-06, + "loss": 2.1316, + "step": 18740 + }, + { + "epoch": 2.92, + "learning_rate": 8.591512626920072e-06, + "loss": 2.1198, + "step": 18760 + }, + { + "epoch": 2.92, + "learning_rate": 8.279093985941161e-06, + "loss": 2.1226, + "step": 18780 + }, + { + "epoch": 2.92, + "learning_rate": 7.96667534496225e-06, + "loss": 2.1234, + "step": 18800 + }, + { + "epoch": 2.92, + "eval_loss": 2.3294034004211426, + "eval_runtime": 69.3303, + "eval_samples_per_second": 28.847, + "eval_steps_per_second": 1.803, + "step": 18800 + } + ], + "max_steps": 19305, + "num_train_epochs": 3, + "total_flos": 5.259835058140322e+19, + "trial_name": null, + "trial_params": null +} diff --git a/adapters/saved_bloomfirefly/checkpoint-18800/training_args.bin b/adapters/saved_bloomfirefly/checkpoint-18800/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..87b78c4a804a75de63299eeb2fc899bcd70e34ae --- /dev/null +++ b/adapters/saved_bloomfirefly/checkpoint-18800/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:23cad2e050712e91e38e68c484cff1b7c0ef83524df4c9bb983745b616667737 +size 3643 diff --git a/adapters/saved_bloomfirefly/checkpoint-19000/optimizer.pt b/adapters/saved_bloomfirefly/checkpoint-19000/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..c8fa3aa111c6d6bcef681e9b49a693675ec5c558 --- /dev/null +++ b/adapters/saved_bloomfirefly/checkpoint-19000/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a9b7fd4985629765ba88f650c1af64b83e429a7f28c6dabbeaa0da617b383331 +size 31492741 diff --git a/adapters/saved_bloomfirefly/checkpoint-19000/pytorch_model.bin b/adapters/saved_bloomfirefly/checkpoint-19000/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..86743880c666aaf5a83ae1b48533bd1983aaece1 --- /dev/null +++ b/adapters/saved_bloomfirefly/checkpoint-19000/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:349e1f3b4adc95df174f5e5614611bc4983abbfd9ce82150ca847a5d0511c474 +size 15750885 diff --git a/adapters/saved_bloomfirefly/checkpoint-19000/rng_state_0.pth b/adapters/saved_bloomfirefly/checkpoint-19000/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..67690da1b8c80c1e40e57077d76c7395e18f6842 --- /dev/null +++ b/adapters/saved_bloomfirefly/checkpoint-19000/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4249483e92cc9f60669ea9254c11f24bd606796883aa9d5a7bbcc1ab92a3c6d4 +size 14583 diff --git a/adapters/saved_bloomfirefly/checkpoint-19000/rng_state_1.pth b/adapters/saved_bloomfirefly/checkpoint-19000/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..1cb579dbde0e09898906acd8921ad9cdab074c8f --- /dev/null +++ b/adapters/saved_bloomfirefly/checkpoint-19000/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dbfb0d9313511d7ebdf35df19f5fd36836cb33d7c9c5c2747cb610330b8a3cf4 +size 14583 diff --git a/adapters/saved_bloomfirefly/checkpoint-19000/scaler.pt b/adapters/saved_bloomfirefly/checkpoint-19000/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..0a3de4b231fe93f08a999fb50d1d57cd7d14bc63 --- /dev/null +++ b/adapters/saved_bloomfirefly/checkpoint-19000/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d0108901e5714062e1d1207611fc298f5850e4afdc39062bf2ed42e1cbd55014 +size 557 diff --git a/adapters/saved_bloomfirefly/checkpoint-19000/scheduler.pt b/adapters/saved_bloomfirefly/checkpoint-19000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..fde55a6feb9915f755113bc8c15607fc7abe71d1 --- /dev/null +++ b/adapters/saved_bloomfirefly/checkpoint-19000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:511ca7621e9abead89221ec3a0b1d45b2de13dbc7be5f61fe66e28d8501ef1f3 +size 627 diff --git a/adapters/saved_bloomfirefly/checkpoint-19000/trainer_state.json b/adapters/saved_bloomfirefly/checkpoint-19000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..fcd9c108892be23f7be3c4fd1f34c5e119b88bcf --- /dev/null +++ b/adapters/saved_bloomfirefly/checkpoint-19000/trainer_state.json @@ -0,0 +1,6476 @@ +{ + "best_metric": 2.3293075561523438, + "best_model_checkpoint": "/mnt/bn/qingyi-bn-lq/llama/saved_bloomfirefly/checkpoint-19000", + "epoch": 2.952531261381571, + "global_step": 19000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 5.9999999999999995e-05, + "loss": 2.9733, + "step": 20 + }, + { + "epoch": 0.01, + "learning_rate": 0.00011999999999999999, + "loss": 2.7809, + "step": 40 + }, + { + "epoch": 0.01, + "learning_rate": 0.00017999999999999998, + "loss": 2.6052, + "step": 60 + }, + { + "epoch": 0.01, + "learning_rate": 0.00023999999999999998, + "loss": 2.4925, + "step": 80 + }, + { + "epoch": 0.02, + "learning_rate": 0.0003, + "loss": 2.458, + "step": 100 + }, + { + "epoch": 0.02, + "learning_rate": 0.00029968758135902107, + "loss": 2.4281, + "step": 120 + }, + { + "epoch": 0.02, + "learning_rate": 0.00029937516271804216, + "loss": 2.4178, + "step": 140 + }, + { + "epoch": 0.02, + "learning_rate": 0.00029906274407706326, + "loss": 2.3839, + "step": 160 + }, + { + "epoch": 0.03, + "learning_rate": 0.0002987503254360843, + "loss": 2.3521, + "step": 180 + }, + { + "epoch": 0.03, + "learning_rate": 0.00029843790679510545, + "loss": 2.338, + "step": 200 + }, + { + "epoch": 0.03, + "eval_loss": 2.510117292404175, + "eval_runtime": 69.1765, + "eval_samples_per_second": 28.912, + "eval_steps_per_second": 1.807, + "step": 200 + }, + { + "epoch": 0.03, + "learning_rate": 0.0002981254881541265, + "loss": 2.3401, + "step": 220 + }, + { + "epoch": 0.04, + "learning_rate": 0.0002978130695131476, + "loss": 2.3665, + "step": 240 + }, + { + "epoch": 0.04, + "learning_rate": 0.0002975006508721687, + "loss": 2.3691, + "step": 260 + }, + { + "epoch": 0.04, + "learning_rate": 0.0002971882322311898, + "loss": 2.3514, + "step": 280 + }, + { + "epoch": 0.05, + "learning_rate": 0.0002968758135902109, + "loss": 2.3203, + "step": 300 + }, + { + "epoch": 0.05, + "learning_rate": 0.00029656339494923197, + "loss": 2.3393, + "step": 320 + }, + { + "epoch": 0.05, + "learning_rate": 0.000296250976308253, + "loss": 2.3289, + "step": 340 + }, + { + "epoch": 0.06, + "learning_rate": 0.00029593855766727416, + "loss": 2.3407, + "step": 360 + }, + { + "epoch": 0.06, + "learning_rate": 0.0002956261390262952, + "loss": 2.3163, + "step": 380 + }, + { + "epoch": 0.06, + "learning_rate": 0.0002953137203853163, + "loss": 2.3212, + "step": 400 + }, + { + "epoch": 0.06, + "eval_loss": 2.473245620727539, + "eval_runtime": 69.0219, + "eval_samples_per_second": 28.976, + "eval_steps_per_second": 1.811, + "step": 400 + }, + { + "epoch": 0.07, + "learning_rate": 0.0002950013017443374, + "loss": 2.2927, + "step": 420 + }, + { + "epoch": 0.07, + "learning_rate": 0.0002946888831033585, + "loss": 2.2927, + "step": 440 + }, + { + "epoch": 0.07, + "learning_rate": 0.0002943764644623796, + "loss": 2.29, + "step": 460 + }, + { + "epoch": 0.07, + "learning_rate": 0.0002940640458214007, + "loss": 2.3099, + "step": 480 + }, + { + "epoch": 0.08, + "learning_rate": 0.0002937516271804217, + "loss": 2.3286, + "step": 500 + }, + { + "epoch": 0.08, + "learning_rate": 0.0002934392085394428, + "loss": 2.2928, + "step": 520 + }, + { + "epoch": 0.08, + "learning_rate": 0.0002931267898984639, + "loss": 2.2956, + "step": 540 + }, + { + "epoch": 0.09, + "learning_rate": 0.000292814371257485, + "loss": 2.2627, + "step": 560 + }, + { + "epoch": 0.09, + "learning_rate": 0.0002925019526165061, + "loss": 2.2897, + "step": 580 + }, + { + "epoch": 0.09, + "learning_rate": 0.0002921895339755272, + "loss": 2.2994, + "step": 600 + }, + { + "epoch": 0.09, + "eval_loss": 2.455402374267578, + "eval_runtime": 69.1315, + "eval_samples_per_second": 28.93, + "eval_steps_per_second": 1.808, + "step": 600 + }, + { + "epoch": 0.1, + "learning_rate": 0.00029187711533454824, + "loss": 2.3232, + "step": 620 + }, + { + "epoch": 0.1, + "learning_rate": 0.0002915646966935694, + "loss": 2.2515, + "step": 640 + }, + { + "epoch": 0.1, + "learning_rate": 0.00029125227805259043, + "loss": 2.2856, + "step": 660 + }, + { + "epoch": 0.11, + "learning_rate": 0.0002909398594116115, + "loss": 2.252, + "step": 680 + }, + { + "epoch": 0.11, + "learning_rate": 0.0002906274407706326, + "loss": 2.2891, + "step": 700 + }, + { + "epoch": 0.11, + "learning_rate": 0.0002903150221296537, + "loss": 2.2769, + "step": 720 + }, + { + "epoch": 0.11, + "learning_rate": 0.0002900026034886748, + "loss": 2.2763, + "step": 740 + }, + { + "epoch": 0.12, + "learning_rate": 0.0002896901848476959, + "loss": 2.278, + "step": 760 + }, + { + "epoch": 0.12, + "learning_rate": 0.00028937776620671695, + "loss": 2.3126, + "step": 780 + }, + { + "epoch": 0.12, + "learning_rate": 0.0002890653475657381, + "loss": 2.2698, + "step": 800 + }, + { + "epoch": 0.12, + "eval_loss": 2.4434444904327393, + "eval_runtime": 69.7211, + "eval_samples_per_second": 28.686, + "eval_steps_per_second": 1.793, + "step": 800 + }, + { + "epoch": 0.13, + "learning_rate": 0.00028875292892475914, + "loss": 2.2587, + "step": 820 + }, + { + "epoch": 0.13, + "learning_rate": 0.00028844051028378023, + "loss": 2.2954, + "step": 840 + }, + { + "epoch": 0.13, + "learning_rate": 0.00028812809164280133, + "loss": 2.3102, + "step": 860 + }, + { + "epoch": 0.14, + "learning_rate": 0.0002878156730018224, + "loss": 2.2918, + "step": 880 + }, + { + "epoch": 0.14, + "learning_rate": 0.0002875032543608435, + "loss": 2.2698, + "step": 900 + }, + { + "epoch": 0.14, + "learning_rate": 0.0002871908357198646, + "loss": 2.2514, + "step": 920 + }, + { + "epoch": 0.15, + "learning_rate": 0.00028687841707888566, + "loss": 2.2684, + "step": 940 + }, + { + "epoch": 0.15, + "learning_rate": 0.00028656599843790675, + "loss": 2.2833, + "step": 960 + }, + { + "epoch": 0.15, + "learning_rate": 0.00028625357979692785, + "loss": 2.2709, + "step": 980 + }, + { + "epoch": 0.16, + "learning_rate": 0.00028594116115594894, + "loss": 2.2596, + "step": 1000 + }, + { + "epoch": 0.16, + "eval_loss": 2.436037302017212, + "eval_runtime": 69.727, + "eval_samples_per_second": 28.683, + "eval_steps_per_second": 1.793, + "step": 1000 + }, + { + "epoch": 0.16, + "learning_rate": 0.00028562874251497004, + "loss": 2.2743, + "step": 1020 + }, + { + "epoch": 0.16, + "learning_rate": 0.00028531632387399113, + "loss": 2.23, + "step": 1040 + }, + { + "epoch": 0.16, + "learning_rate": 0.0002850039052330122, + "loss": 2.2723, + "step": 1060 + }, + { + "epoch": 0.17, + "learning_rate": 0.0002846914865920333, + "loss": 2.2585, + "step": 1080 + }, + { + "epoch": 0.17, + "learning_rate": 0.00028437906795105437, + "loss": 2.2463, + "step": 1100 + }, + { + "epoch": 0.17, + "learning_rate": 0.00028406664931007546, + "loss": 2.2264, + "step": 1120 + }, + { + "epoch": 0.18, + "learning_rate": 0.00028375423066909656, + "loss": 2.223, + "step": 1140 + }, + { + "epoch": 0.18, + "learning_rate": 0.00028344181202811765, + "loss": 2.2412, + "step": 1160 + }, + { + "epoch": 0.18, + "learning_rate": 0.00028312939338713875, + "loss": 2.2714, + "step": 1180 + }, + { + "epoch": 0.19, + "learning_rate": 0.00028281697474615984, + "loss": 2.2638, + "step": 1200 + }, + { + "epoch": 0.19, + "eval_loss": 2.4272871017456055, + "eval_runtime": 69.3748, + "eval_samples_per_second": 28.829, + "eval_steps_per_second": 1.802, + "step": 1200 + }, + { + "epoch": 0.19, + "learning_rate": 0.0002825045561051809, + "loss": 2.2303, + "step": 1220 + }, + { + "epoch": 0.19, + "learning_rate": 0.000282192137464202, + "loss": 2.2491, + "step": 1240 + }, + { + "epoch": 0.2, + "learning_rate": 0.00028187971882322313, + "loss": 2.2598, + "step": 1260 + }, + { + "epoch": 0.2, + "learning_rate": 0.00028156730018224417, + "loss": 2.2566, + "step": 1280 + }, + { + "epoch": 0.2, + "learning_rate": 0.00028125488154126527, + "loss": 2.2642, + "step": 1300 + }, + { + "epoch": 0.21, + "learning_rate": 0.00028094246290028636, + "loss": 2.2976, + "step": 1320 + }, + { + "epoch": 0.21, + "learning_rate": 0.00028063004425930746, + "loss": 2.2144, + "step": 1340 + }, + { + "epoch": 0.21, + "learning_rate": 0.00028031762561832855, + "loss": 2.2618, + "step": 1360 + }, + { + "epoch": 0.21, + "learning_rate": 0.00028000520697734965, + "loss": 2.2232, + "step": 1380 + }, + { + "epoch": 0.22, + "learning_rate": 0.0002796927883363707, + "loss": 2.2349, + "step": 1400 + }, + { + "epoch": 0.22, + "eval_loss": 2.422177314758301, + "eval_runtime": 69.7796, + "eval_samples_per_second": 28.662, + "eval_steps_per_second": 1.791, + "step": 1400 + }, + { + "epoch": 0.22, + "learning_rate": 0.00027938036969539184, + "loss": 2.2655, + "step": 1420 + }, + { + "epoch": 0.22, + "learning_rate": 0.0002790679510544129, + "loss": 2.265, + "step": 1440 + }, + { + "epoch": 0.23, + "learning_rate": 0.000278755532413434, + "loss": 2.2552, + "step": 1460 + }, + { + "epoch": 0.23, + "learning_rate": 0.00027844311377245507, + "loss": 2.252, + "step": 1480 + }, + { + "epoch": 0.23, + "learning_rate": 0.00027813069513147617, + "loss": 2.255, + "step": 1500 + }, + { + "epoch": 0.24, + "learning_rate": 0.00027781827649049726, + "loss": 2.1869, + "step": 1520 + }, + { + "epoch": 0.24, + "learning_rate": 0.00027750585784951836, + "loss": 2.2601, + "step": 1540 + }, + { + "epoch": 0.24, + "learning_rate": 0.0002771934392085394, + "loss": 2.2607, + "step": 1560 + }, + { + "epoch": 0.25, + "learning_rate": 0.0002768810205675605, + "loss": 2.2245, + "step": 1580 + }, + { + "epoch": 0.25, + "learning_rate": 0.0002765686019265816, + "loss": 2.2561, + "step": 1600 + }, + { + "epoch": 0.25, + "eval_loss": 2.4173202514648438, + "eval_runtime": 69.7813, + "eval_samples_per_second": 28.661, + "eval_steps_per_second": 1.791, + "step": 1600 + }, + { + "epoch": 0.25, + "learning_rate": 0.0002762561832856027, + "loss": 2.2472, + "step": 1620 + }, + { + "epoch": 0.25, + "learning_rate": 0.0002759437646446238, + "loss": 2.2952, + "step": 1640 + }, + { + "epoch": 0.26, + "learning_rate": 0.0002756313460036449, + "loss": 2.1941, + "step": 1660 + }, + { + "epoch": 0.26, + "learning_rate": 0.0002753189273626659, + "loss": 2.2396, + "step": 1680 + }, + { + "epoch": 0.26, + "learning_rate": 0.00027500650872168707, + "loss": 2.2325, + "step": 1700 + }, + { + "epoch": 0.27, + "learning_rate": 0.0002746940900807081, + "loss": 2.2458, + "step": 1720 + }, + { + "epoch": 0.27, + "learning_rate": 0.0002743816714397292, + "loss": 2.2464, + "step": 1740 + }, + { + "epoch": 0.27, + "learning_rate": 0.0002740692527987503, + "loss": 2.2487, + "step": 1760 + }, + { + "epoch": 0.28, + "learning_rate": 0.0002737568341577714, + "loss": 2.2609, + "step": 1780 + }, + { + "epoch": 0.28, + "learning_rate": 0.0002734444155167925, + "loss": 2.3016, + "step": 1800 + }, + { + "epoch": 0.28, + "eval_loss": 2.4146716594696045, + "eval_runtime": 69.513, + "eval_samples_per_second": 28.772, + "eval_steps_per_second": 1.798, + "step": 1800 + }, + { + "epoch": 0.28, + "learning_rate": 0.0002731319968758136, + "loss": 2.2415, + "step": 1820 + }, + { + "epoch": 0.29, + "learning_rate": 0.0002728195782348346, + "loss": 2.2512, + "step": 1840 + }, + { + "epoch": 0.29, + "learning_rate": 0.0002725071595938558, + "loss": 2.2186, + "step": 1860 + }, + { + "epoch": 0.29, + "learning_rate": 0.0002721947409528768, + "loss": 2.1982, + "step": 1880 + }, + { + "epoch": 0.3, + "learning_rate": 0.0002718823223118979, + "loss": 2.2358, + "step": 1900 + }, + { + "epoch": 0.3, + "learning_rate": 0.000271569903670919, + "loss": 2.2359, + "step": 1920 + }, + { + "epoch": 0.3, + "learning_rate": 0.0002712574850299401, + "loss": 2.2367, + "step": 1940 + }, + { + "epoch": 0.3, + "learning_rate": 0.0002709450663889612, + "loss": 2.2209, + "step": 1960 + }, + { + "epoch": 0.31, + "learning_rate": 0.0002706326477479823, + "loss": 2.2026, + "step": 1980 + }, + { + "epoch": 0.31, + "learning_rate": 0.00027032022910700333, + "loss": 2.2302, + "step": 2000 + }, + { + "epoch": 0.31, + "eval_loss": 2.4096806049346924, + "eval_runtime": 69.8744, + "eval_samples_per_second": 28.623, + "eval_steps_per_second": 1.789, + "step": 2000 + }, + { + "epoch": 0.31, + "learning_rate": 0.00027000781046602443, + "loss": 2.2516, + "step": 2020 + }, + { + "epoch": 0.32, + "learning_rate": 0.0002696953918250455, + "loss": 2.2173, + "step": 2040 + }, + { + "epoch": 0.32, + "learning_rate": 0.0002693829731840666, + "loss": 2.2414, + "step": 2060 + }, + { + "epoch": 0.32, + "learning_rate": 0.0002690705545430877, + "loss": 2.1922, + "step": 2080 + }, + { + "epoch": 0.33, + "learning_rate": 0.0002687581359021088, + "loss": 2.2396, + "step": 2100 + }, + { + "epoch": 0.33, + "learning_rate": 0.00026844571726112985, + "loss": 2.2602, + "step": 2120 + }, + { + "epoch": 0.33, + "learning_rate": 0.000268133298620151, + "loss": 2.2263, + "step": 2140 + }, + { + "epoch": 0.34, + "learning_rate": 0.00026782087997917204, + "loss": 2.2082, + "step": 2160 + }, + { + "epoch": 0.34, + "learning_rate": 0.00026750846133819314, + "loss": 2.2144, + "step": 2180 + }, + { + "epoch": 0.34, + "learning_rate": 0.00026719604269721423, + "loss": 2.2066, + "step": 2200 + }, + { + "epoch": 0.34, + "eval_loss": 2.4065375328063965, + "eval_runtime": 69.933, + "eval_samples_per_second": 28.599, + "eval_steps_per_second": 1.787, + "step": 2200 + }, + { + "epoch": 0.34, + "learning_rate": 0.00026688362405623533, + "loss": 2.2494, + "step": 2220 + }, + { + "epoch": 0.35, + "learning_rate": 0.0002665712054152564, + "loss": 2.2471, + "step": 2240 + }, + { + "epoch": 0.35, + "learning_rate": 0.0002662587867742775, + "loss": 2.2512, + "step": 2260 + }, + { + "epoch": 0.35, + "learning_rate": 0.00026594636813329856, + "loss": 2.2249, + "step": 2280 + }, + { + "epoch": 0.36, + "learning_rate": 0.0002656339494923197, + "loss": 2.2526, + "step": 2300 + }, + { + "epoch": 0.36, + "learning_rate": 0.00026532153085134075, + "loss": 2.2375, + "step": 2320 + }, + { + "epoch": 0.36, + "learning_rate": 0.00026500911221036185, + "loss": 2.169, + "step": 2340 + }, + { + "epoch": 0.37, + "learning_rate": 0.00026469669356938294, + "loss": 2.2206, + "step": 2360 + }, + { + "epoch": 0.37, + "learning_rate": 0.00026438427492840404, + "loss": 2.2284, + "step": 2380 + }, + { + "epoch": 0.37, + "learning_rate": 0.00026407185628742513, + "loss": 2.2116, + "step": 2400 + }, + { + "epoch": 0.37, + "eval_loss": 2.402400255203247, + "eval_runtime": 70.6508, + "eval_samples_per_second": 28.308, + "eval_steps_per_second": 1.769, + "step": 2400 + }, + { + "epoch": 0.38, + "learning_rate": 0.00026375943764644623, + "loss": 2.2228, + "step": 2420 + }, + { + "epoch": 0.38, + "learning_rate": 0.0002634470190054673, + "loss": 2.2264, + "step": 2440 + }, + { + "epoch": 0.38, + "learning_rate": 0.00026313460036448837, + "loss": 2.2212, + "step": 2460 + }, + { + "epoch": 0.39, + "learning_rate": 0.0002628221817235095, + "loss": 2.2164, + "step": 2480 + }, + { + "epoch": 0.39, + "learning_rate": 0.00026250976308253056, + "loss": 2.2523, + "step": 2500 + }, + { + "epoch": 0.39, + "learning_rate": 0.00026219734444155165, + "loss": 2.2272, + "step": 2520 + }, + { + "epoch": 0.39, + "learning_rate": 0.00026188492580057275, + "loss": 2.2381, + "step": 2540 + }, + { + "epoch": 0.4, + "learning_rate": 0.00026157250715959384, + "loss": 2.2149, + "step": 2560 + }, + { + "epoch": 0.4, + "learning_rate": 0.00026126008851861494, + "loss": 2.228, + "step": 2580 + }, + { + "epoch": 0.4, + "learning_rate": 0.00026094766987763603, + "loss": 2.2145, + "step": 2600 + }, + { + "epoch": 0.4, + "eval_loss": 2.399576425552368, + "eval_runtime": 69.9194, + "eval_samples_per_second": 28.604, + "eval_steps_per_second": 1.788, + "step": 2600 + }, + { + "epoch": 0.41, + "learning_rate": 0.0002606352512366571, + "loss": 2.18, + "step": 2620 + }, + { + "epoch": 0.41, + "learning_rate": 0.0002603228325956782, + "loss": 2.1965, + "step": 2640 + }, + { + "epoch": 0.41, + "learning_rate": 0.00026001041395469927, + "loss": 2.178, + "step": 2660 + }, + { + "epoch": 0.42, + "learning_rate": 0.00025969799531372036, + "loss": 2.194, + "step": 2680 + }, + { + "epoch": 0.42, + "learning_rate": 0.00025938557667274146, + "loss": 2.2024, + "step": 2700 + }, + { + "epoch": 0.42, + "learning_rate": 0.00025907315803176255, + "loss": 2.2427, + "step": 2720 + }, + { + "epoch": 0.43, + "learning_rate": 0.00025876073939078365, + "loss": 2.2246, + "step": 2740 + }, + { + "epoch": 0.43, + "learning_rate": 0.00025844832074980474, + "loss": 2.2169, + "step": 2760 + }, + { + "epoch": 0.43, + "learning_rate": 0.0002581359021088258, + "loss": 2.2154, + "step": 2780 + }, + { + "epoch": 0.44, + "learning_rate": 0.0002578234834678469, + "loss": 2.1732, + "step": 2800 + }, + { + "epoch": 0.44, + "eval_loss": 2.3982491493225098, + "eval_runtime": 70.2191, + "eval_samples_per_second": 28.482, + "eval_steps_per_second": 1.78, + "step": 2800 + }, + { + "epoch": 0.44, + "learning_rate": 0.000257511064826868, + "loss": 2.1951, + "step": 2820 + }, + { + "epoch": 0.44, + "learning_rate": 0.00025719864618588907, + "loss": 2.2139, + "step": 2840 + }, + { + "epoch": 0.44, + "learning_rate": 0.00025688622754491017, + "loss": 2.197, + "step": 2860 + }, + { + "epoch": 0.45, + "learning_rate": 0.00025657380890393126, + "loss": 2.2317, + "step": 2880 + }, + { + "epoch": 0.45, + "learning_rate": 0.0002562613902629523, + "loss": 2.2107, + "step": 2900 + }, + { + "epoch": 0.45, + "learning_rate": 0.00025594897162197345, + "loss": 2.2087, + "step": 2920 + }, + { + "epoch": 0.46, + "learning_rate": 0.0002556365529809945, + "loss": 2.2124, + "step": 2940 + }, + { + "epoch": 0.46, + "learning_rate": 0.0002553241343400156, + "loss": 2.1762, + "step": 2960 + }, + { + "epoch": 0.46, + "learning_rate": 0.0002550117156990367, + "loss": 2.2488, + "step": 2980 + }, + { + "epoch": 0.47, + "learning_rate": 0.0002546992970580578, + "loss": 2.2316, + "step": 3000 + }, + { + "epoch": 0.47, + "eval_loss": 2.394296646118164, + "eval_runtime": 70.2494, + "eval_samples_per_second": 28.47, + "eval_steps_per_second": 1.779, + "step": 3000 + }, + { + "epoch": 0.47, + "learning_rate": 0.0002543868784170789, + "loss": 2.2386, + "step": 3020 + }, + { + "epoch": 0.47, + "learning_rate": 0.00025407445977609997, + "loss": 2.224, + "step": 3040 + }, + { + "epoch": 0.48, + "learning_rate": 0.000253762041135121, + "loss": 2.2479, + "step": 3060 + }, + { + "epoch": 0.48, + "learning_rate": 0.0002534496224941421, + "loss": 2.2396, + "step": 3080 + }, + { + "epoch": 0.48, + "learning_rate": 0.0002531372038531632, + "loss": 2.2405, + "step": 3100 + }, + { + "epoch": 0.48, + "learning_rate": 0.0002528247852121843, + "loss": 2.1969, + "step": 3120 + }, + { + "epoch": 0.49, + "learning_rate": 0.0002525123665712054, + "loss": 2.2095, + "step": 3140 + }, + { + "epoch": 0.49, + "learning_rate": 0.0002521999479302265, + "loss": 2.2202, + "step": 3160 + }, + { + "epoch": 0.49, + "learning_rate": 0.0002518875292892476, + "loss": 2.2088, + "step": 3180 + }, + { + "epoch": 0.5, + "learning_rate": 0.0002515751106482687, + "loss": 2.2075, + "step": 3200 + }, + { + "epoch": 0.5, + "eval_loss": 2.3918581008911133, + "eval_runtime": 69.2896, + "eval_samples_per_second": 28.864, + "eval_steps_per_second": 1.804, + "step": 3200 + }, + { + "epoch": 0.5, + "learning_rate": 0.0002512626920072897, + "loss": 2.1993, + "step": 3220 + }, + { + "epoch": 0.5, + "learning_rate": 0.0002509502733663108, + "loss": 2.2406, + "step": 3240 + }, + { + "epoch": 0.51, + "learning_rate": 0.0002506378547253319, + "loss": 2.2352, + "step": 3260 + }, + { + "epoch": 0.51, + "learning_rate": 0.000250325436084353, + "loss": 2.236, + "step": 3280 + }, + { + "epoch": 0.51, + "learning_rate": 0.0002500130174433741, + "loss": 2.1805, + "step": 3300 + }, + { + "epoch": 0.52, + "learning_rate": 0.0002497005988023952, + "loss": 2.2249, + "step": 3320 + }, + { + "epoch": 0.52, + "learning_rate": 0.00024938818016141624, + "loss": 2.2153, + "step": 3340 + }, + { + "epoch": 0.52, + "learning_rate": 0.0002490757615204374, + "loss": 2.2115, + "step": 3360 + }, + { + "epoch": 0.53, + "learning_rate": 0.00024876334287945843, + "loss": 2.2284, + "step": 3380 + }, + { + "epoch": 0.53, + "learning_rate": 0.0002484509242384795, + "loss": 2.184, + "step": 3400 + }, + { + "epoch": 0.53, + "eval_loss": 2.3887791633605957, + "eval_runtime": 69.2387, + "eval_samples_per_second": 28.886, + "eval_steps_per_second": 1.805, + "step": 3400 + }, + { + "epoch": 0.53, + "learning_rate": 0.0002481385055975006, + "loss": 2.2172, + "step": 3420 + }, + { + "epoch": 0.53, + "learning_rate": 0.0002478260869565217, + "loss": 2.2347, + "step": 3440 + }, + { + "epoch": 0.54, + "learning_rate": 0.0002475136683155428, + "loss": 2.2213, + "step": 3460 + }, + { + "epoch": 0.54, + "learning_rate": 0.0002472012496745639, + "loss": 2.2215, + "step": 3480 + }, + { + "epoch": 0.54, + "learning_rate": 0.00024688883103358495, + "loss": 2.2058, + "step": 3500 + }, + { + "epoch": 0.55, + "learning_rate": 0.00024657641239260604, + "loss": 2.1918, + "step": 3520 + }, + { + "epoch": 0.55, + "learning_rate": 0.0002462639937516272, + "loss": 2.2021, + "step": 3540 + }, + { + "epoch": 0.55, + "learning_rate": 0.00024595157511064824, + "loss": 2.1832, + "step": 3560 + }, + { + "epoch": 0.56, + "learning_rate": 0.00024563915646966933, + "loss": 2.2199, + "step": 3580 + }, + { + "epoch": 0.56, + "learning_rate": 0.0002453267378286904, + "loss": 2.1997, + "step": 3600 + }, + { + "epoch": 0.56, + "eval_loss": 2.386540412902832, + "eval_runtime": 69.2123, + "eval_samples_per_second": 28.897, + "eval_steps_per_second": 1.806, + "step": 3600 + }, + { + "epoch": 0.56, + "learning_rate": 0.0002450143191877115, + "loss": 2.2009, + "step": 3620 + }, + { + "epoch": 0.57, + "learning_rate": 0.0002447019005467326, + "loss": 2.2045, + "step": 3640 + }, + { + "epoch": 0.57, + "learning_rate": 0.0002443894819057537, + "loss": 2.2231, + "step": 3660 + }, + { + "epoch": 0.57, + "learning_rate": 0.00024407706326477478, + "loss": 2.211, + "step": 3680 + }, + { + "epoch": 0.57, + "learning_rate": 0.00024376464462379588, + "loss": 2.1904, + "step": 3700 + }, + { + "epoch": 0.58, + "learning_rate": 0.00024345222598281694, + "loss": 2.1492, + "step": 3720 + }, + { + "epoch": 0.58, + "learning_rate": 0.00024313980734183807, + "loss": 2.2368, + "step": 3740 + }, + { + "epoch": 0.58, + "learning_rate": 0.00024282738870085914, + "loss": 2.1753, + "step": 3760 + }, + { + "epoch": 0.59, + "learning_rate": 0.00024251497005988023, + "loss": 2.179, + "step": 3780 + }, + { + "epoch": 0.59, + "learning_rate": 0.0002422025514189013, + "loss": 2.1811, + "step": 3800 + }, + { + "epoch": 0.59, + "eval_loss": 2.3864212036132812, + "eval_runtime": 69.2951, + "eval_samples_per_second": 28.862, + "eval_steps_per_second": 1.804, + "step": 3800 + }, + { + "epoch": 0.59, + "learning_rate": 0.0002418901327779224, + "loss": 2.1496, + "step": 3820 + }, + { + "epoch": 0.6, + "learning_rate": 0.0002415777141369435, + "loss": 2.2071, + "step": 3840 + }, + { + "epoch": 0.6, + "learning_rate": 0.00024126529549596459, + "loss": 2.189, + "step": 3860 + }, + { + "epoch": 0.6, + "learning_rate": 0.00024095287685498565, + "loss": 2.1838, + "step": 3880 + }, + { + "epoch": 0.61, + "learning_rate": 0.00024064045821400675, + "loss": 2.2292, + "step": 3900 + }, + { + "epoch": 0.61, + "learning_rate": 0.00024032803957302782, + "loss": 2.1931, + "step": 3920 + }, + { + "epoch": 0.61, + "learning_rate": 0.00024001562093204894, + "loss": 2.2293, + "step": 3940 + }, + { + "epoch": 0.62, + "learning_rate": 0.00023970320229107, + "loss": 2.2112, + "step": 3960 + }, + { + "epoch": 0.62, + "learning_rate": 0.0002393907836500911, + "loss": 2.1479, + "step": 3980 + }, + { + "epoch": 0.62, + "learning_rate": 0.00023907836500911217, + "loss": 2.1661, + "step": 4000 + }, + { + "epoch": 0.62, + "eval_loss": 2.383505344390869, + "eval_runtime": 69.2876, + "eval_samples_per_second": 28.865, + "eval_steps_per_second": 1.804, + "step": 4000 + }, + { + "epoch": 0.62, + "learning_rate": 0.0002387659463681333, + "loss": 2.1783, + "step": 4020 + }, + { + "epoch": 0.63, + "learning_rate": 0.00023845352772715436, + "loss": 2.1975, + "step": 4040 + }, + { + "epoch": 0.63, + "learning_rate": 0.00023814110908617546, + "loss": 2.2268, + "step": 4060 + }, + { + "epoch": 0.63, + "learning_rate": 0.00023782869044519653, + "loss": 2.1815, + "step": 4080 + }, + { + "epoch": 0.64, + "learning_rate": 0.00023751627180421765, + "loss": 2.2305, + "step": 4100 + }, + { + "epoch": 0.64, + "learning_rate": 0.00023720385316323872, + "loss": 2.2087, + "step": 4120 + }, + { + "epoch": 0.64, + "learning_rate": 0.0002368914345222598, + "loss": 2.2204, + "step": 4140 + }, + { + "epoch": 0.65, + "learning_rate": 0.00023657901588128088, + "loss": 2.2138, + "step": 4160 + }, + { + "epoch": 0.65, + "learning_rate": 0.000236266597240302, + "loss": 2.2071, + "step": 4180 + }, + { + "epoch": 0.65, + "learning_rate": 0.00023595417859932307, + "loss": 2.1728, + "step": 4200 + }, + { + "epoch": 0.65, + "eval_loss": 2.3820013999938965, + "eval_runtime": 69.3049, + "eval_samples_per_second": 28.858, + "eval_steps_per_second": 1.804, + "step": 4200 + }, + { + "epoch": 0.66, + "learning_rate": 0.00023564175995834417, + "loss": 2.182, + "step": 4220 + }, + { + "epoch": 0.66, + "learning_rate": 0.00023532934131736524, + "loss": 2.1948, + "step": 4240 + }, + { + "epoch": 0.66, + "learning_rate": 0.00023501692267638633, + "loss": 2.2178, + "step": 4260 + }, + { + "epoch": 0.67, + "learning_rate": 0.00023470450403540743, + "loss": 2.1979, + "step": 4280 + }, + { + "epoch": 0.67, + "learning_rate": 0.00023439208539442852, + "loss": 2.222, + "step": 4300 + }, + { + "epoch": 0.67, + "learning_rate": 0.0002340796667534496, + "loss": 2.221, + "step": 4320 + }, + { + "epoch": 0.67, + "learning_rate": 0.00023376724811247069, + "loss": 2.208, + "step": 4340 + }, + { + "epoch": 0.68, + "learning_rate": 0.00023345482947149175, + "loss": 2.1502, + "step": 4360 + }, + { + "epoch": 0.68, + "learning_rate": 0.00023314241083051288, + "loss": 2.1628, + "step": 4380 + }, + { + "epoch": 0.68, + "learning_rate": 0.00023282999218953395, + "loss": 2.1933, + "step": 4400 + }, + { + "epoch": 0.68, + "eval_loss": 2.380128860473633, + "eval_runtime": 69.2864, + "eval_samples_per_second": 28.866, + "eval_steps_per_second": 1.804, + "step": 4400 + }, + { + "epoch": 0.69, + "learning_rate": 0.00023251757354855504, + "loss": 2.2204, + "step": 4420 + }, + { + "epoch": 0.69, + "learning_rate": 0.0002322051549075761, + "loss": 2.218, + "step": 4440 + }, + { + "epoch": 0.69, + "learning_rate": 0.00023189273626659723, + "loss": 2.199, + "step": 4460 + }, + { + "epoch": 0.7, + "learning_rate": 0.0002315803176256183, + "loss": 2.1826, + "step": 4480 + }, + { + "epoch": 0.7, + "learning_rate": 0.0002312678989846394, + "loss": 2.174, + "step": 4500 + }, + { + "epoch": 0.7, + "learning_rate": 0.00023095548034366046, + "loss": 2.2011, + "step": 4520 + }, + { + "epoch": 0.71, + "learning_rate": 0.00023064306170268159, + "loss": 2.1951, + "step": 4540 + }, + { + "epoch": 0.71, + "learning_rate": 0.00023033064306170265, + "loss": 2.2189, + "step": 4560 + }, + { + "epoch": 0.71, + "learning_rate": 0.00023001822442072375, + "loss": 2.1891, + "step": 4580 + }, + { + "epoch": 0.71, + "learning_rate": 0.00022970580577974482, + "loss": 2.1873, + "step": 4600 + }, + { + "epoch": 0.71, + "eval_loss": 2.379713296890259, + "eval_runtime": 69.3005, + "eval_samples_per_second": 28.86, + "eval_steps_per_second": 1.804, + "step": 4600 + }, + { + "epoch": 0.72, + "learning_rate": 0.00022939338713876591, + "loss": 2.2191, + "step": 4620 + }, + { + "epoch": 0.72, + "learning_rate": 0.000229080968497787, + "loss": 2.1966, + "step": 4640 + }, + { + "epoch": 0.72, + "learning_rate": 0.0002287685498568081, + "loss": 2.2062, + "step": 4660 + }, + { + "epoch": 0.73, + "learning_rate": 0.00022845613121582917, + "loss": 2.1888, + "step": 4680 + }, + { + "epoch": 0.73, + "learning_rate": 0.00022814371257485027, + "loss": 2.1938, + "step": 4700 + }, + { + "epoch": 0.73, + "learning_rate": 0.0002278312939338714, + "loss": 2.206, + "step": 4720 + }, + { + "epoch": 0.74, + "learning_rate": 0.00022751887529289246, + "loss": 2.1584, + "step": 4740 + }, + { + "epoch": 0.74, + "learning_rate": 0.00022720645665191355, + "loss": 2.1933, + "step": 4760 + }, + { + "epoch": 0.74, + "learning_rate": 0.00022689403801093462, + "loss": 2.2087, + "step": 4780 + }, + { + "epoch": 0.75, + "learning_rate": 0.00022658161936995575, + "loss": 2.2239, + "step": 4800 + }, + { + "epoch": 0.75, + "eval_loss": 2.3774757385253906, + "eval_runtime": 69.3137, + "eval_samples_per_second": 28.854, + "eval_steps_per_second": 1.803, + "step": 4800 + }, + { + "epoch": 0.75, + "learning_rate": 0.00022626920072897681, + "loss": 2.2136, + "step": 4820 + }, + { + "epoch": 0.75, + "learning_rate": 0.0002259567820879979, + "loss": 2.2046, + "step": 4840 + }, + { + "epoch": 0.76, + "learning_rate": 0.00022564436344701898, + "loss": 2.2031, + "step": 4860 + }, + { + "epoch": 0.76, + "learning_rate": 0.0002253319448060401, + "loss": 2.171, + "step": 4880 + }, + { + "epoch": 0.76, + "learning_rate": 0.00022501952616506117, + "loss": 2.2101, + "step": 4900 + }, + { + "epoch": 0.76, + "learning_rate": 0.00022470710752408226, + "loss": 2.1306, + "step": 4920 + }, + { + "epoch": 0.77, + "learning_rate": 0.00022439468888310333, + "loss": 2.1754, + "step": 4940 + }, + { + "epoch": 0.77, + "learning_rate": 0.00022408227024212443, + "loss": 2.1972, + "step": 4960 + }, + { + "epoch": 0.77, + "learning_rate": 0.00022376985160114552, + "loss": 2.2175, + "step": 4980 + }, + { + "epoch": 0.78, + "learning_rate": 0.00022345743296016662, + "loss": 2.139, + "step": 5000 + }, + { + "epoch": 0.78, + "eval_loss": 2.3760337829589844, + "eval_runtime": 69.3092, + "eval_samples_per_second": 28.856, + "eval_steps_per_second": 1.804, + "step": 5000 + }, + { + "epoch": 0.78, + "learning_rate": 0.0002231450143191877, + "loss": 2.1912, + "step": 5020 + }, + { + "epoch": 0.78, + "learning_rate": 0.00022283259567820878, + "loss": 2.2036, + "step": 5040 + }, + { + "epoch": 0.79, + "learning_rate": 0.00022252017703722985, + "loss": 2.1852, + "step": 5060 + }, + { + "epoch": 0.79, + "learning_rate": 0.00022220775839625097, + "loss": 2.1672, + "step": 5080 + }, + { + "epoch": 0.79, + "learning_rate": 0.00022189533975527204, + "loss": 2.1828, + "step": 5100 + }, + { + "epoch": 0.8, + "learning_rate": 0.00022158292111429314, + "loss": 2.1875, + "step": 5120 + }, + { + "epoch": 0.8, + "learning_rate": 0.0002212705024733142, + "loss": 2.1997, + "step": 5140 + }, + { + "epoch": 0.8, + "learning_rate": 0.00022095808383233533, + "loss": 2.2162, + "step": 5160 + }, + { + "epoch": 0.8, + "learning_rate": 0.0002206456651913564, + "loss": 2.2213, + "step": 5180 + }, + { + "epoch": 0.81, + "learning_rate": 0.0002203332465503775, + "loss": 2.1972, + "step": 5200 + }, + { + "epoch": 0.81, + "eval_loss": 2.374734878540039, + "eval_runtime": 69.2582, + "eval_samples_per_second": 28.877, + "eval_steps_per_second": 1.805, + "step": 5200 + }, + { + "epoch": 0.81, + "learning_rate": 0.00022002082790939856, + "loss": 2.175, + "step": 5220 + }, + { + "epoch": 0.81, + "learning_rate": 0.00021970840926841968, + "loss": 2.1951, + "step": 5240 + }, + { + "epoch": 0.82, + "learning_rate": 0.00021939599062744075, + "loss": 2.1493, + "step": 5260 + }, + { + "epoch": 0.82, + "learning_rate": 0.00021908357198646185, + "loss": 2.1611, + "step": 5280 + }, + { + "epoch": 0.82, + "learning_rate": 0.00021877115334548291, + "loss": 2.1621, + "step": 5300 + }, + { + "epoch": 0.83, + "learning_rate": 0.00021845873470450404, + "loss": 2.1875, + "step": 5320 + }, + { + "epoch": 0.83, + "learning_rate": 0.0002181463160635251, + "loss": 2.1733, + "step": 5340 + }, + { + "epoch": 0.83, + "learning_rate": 0.0002178338974225462, + "loss": 2.242, + "step": 5360 + }, + { + "epoch": 0.84, + "learning_rate": 0.00021752147878156727, + "loss": 2.2154, + "step": 5380 + }, + { + "epoch": 0.84, + "learning_rate": 0.00021720906014058836, + "loss": 2.1969, + "step": 5400 + }, + { + "epoch": 0.84, + "eval_loss": 2.372680902481079, + "eval_runtime": 69.283, + "eval_samples_per_second": 28.867, + "eval_steps_per_second": 1.804, + "step": 5400 + }, + { + "epoch": 0.84, + "learning_rate": 0.00021689664149960946, + "loss": 2.1245, + "step": 5420 + }, + { + "epoch": 0.85, + "learning_rate": 0.00021658422285863056, + "loss": 2.2049, + "step": 5440 + }, + { + "epoch": 0.85, + "learning_rate": 0.00021627180421765162, + "loss": 2.1716, + "step": 5460 + }, + { + "epoch": 0.85, + "learning_rate": 0.00021595938557667272, + "loss": 2.1891, + "step": 5480 + }, + { + "epoch": 0.85, + "learning_rate": 0.0002156469669356938, + "loss": 2.1963, + "step": 5500 + }, + { + "epoch": 0.86, + "learning_rate": 0.0002153345482947149, + "loss": 2.1946, + "step": 5520 + }, + { + "epoch": 0.86, + "learning_rate": 0.00021502212965373598, + "loss": 2.1982, + "step": 5540 + }, + { + "epoch": 0.86, + "learning_rate": 0.00021470971101275707, + "loss": 2.1759, + "step": 5560 + }, + { + "epoch": 0.87, + "learning_rate": 0.00021439729237177814, + "loss": 2.1661, + "step": 5580 + }, + { + "epoch": 0.87, + "learning_rate": 0.00021408487373079926, + "loss": 2.2051, + "step": 5600 + }, + { + "epoch": 0.87, + "eval_loss": 2.3719565868377686, + "eval_runtime": 69.321, + "eval_samples_per_second": 28.851, + "eval_steps_per_second": 1.803, + "step": 5600 + }, + { + "epoch": 0.87, + "learning_rate": 0.00021377245508982033, + "loss": 2.1605, + "step": 5620 + }, + { + "epoch": 0.88, + "learning_rate": 0.00021346003644884143, + "loss": 2.1375, + "step": 5640 + }, + { + "epoch": 0.88, + "learning_rate": 0.0002131476178078625, + "loss": 2.1293, + "step": 5660 + }, + { + "epoch": 0.88, + "learning_rate": 0.00021283519916688362, + "loss": 2.2189, + "step": 5680 + }, + { + "epoch": 0.89, + "learning_rate": 0.0002125227805259047, + "loss": 2.1784, + "step": 5700 + }, + { + "epoch": 0.89, + "learning_rate": 0.00021221036188492578, + "loss": 2.1764, + "step": 5720 + }, + { + "epoch": 0.89, + "learning_rate": 0.00021189794324394685, + "loss": 2.1569, + "step": 5740 + }, + { + "epoch": 0.9, + "learning_rate": 0.00021158552460296795, + "loss": 2.1704, + "step": 5760 + }, + { + "epoch": 0.9, + "learning_rate": 0.00021127310596198904, + "loss": 2.1614, + "step": 5780 + }, + { + "epoch": 0.9, + "learning_rate": 0.00021096068732101014, + "loss": 2.2078, + "step": 5800 + }, + { + "epoch": 0.9, + "eval_loss": 2.370939016342163, + "eval_runtime": 69.2728, + "eval_samples_per_second": 28.871, + "eval_steps_per_second": 1.804, + "step": 5800 + }, + { + "epoch": 0.9, + "learning_rate": 0.0002106482686800312, + "loss": 2.198, + "step": 5820 + }, + { + "epoch": 0.91, + "learning_rate": 0.0002103358500390523, + "loss": 2.1735, + "step": 5840 + }, + { + "epoch": 0.91, + "learning_rate": 0.00021002343139807342, + "loss": 2.1936, + "step": 5860 + }, + { + "epoch": 0.91, + "learning_rate": 0.0002097110127570945, + "loss": 2.1559, + "step": 5880 + }, + { + "epoch": 0.92, + "learning_rate": 0.0002093985941161156, + "loss": 2.1856, + "step": 5900 + }, + { + "epoch": 0.92, + "learning_rate": 0.00020908617547513666, + "loss": 2.194, + "step": 5920 + }, + { + "epoch": 0.92, + "learning_rate": 0.00020877375683415778, + "loss": 2.1983, + "step": 5940 + }, + { + "epoch": 0.93, + "learning_rate": 0.00020846133819317885, + "loss": 2.1788, + "step": 5960 + }, + { + "epoch": 0.93, + "learning_rate": 0.00020814891955219994, + "loss": 2.2126, + "step": 5980 + }, + { + "epoch": 0.93, + "learning_rate": 0.000207836500911221, + "loss": 2.1454, + "step": 6000 + }, + { + "epoch": 0.93, + "eval_loss": 2.369137763977051, + "eval_runtime": 69.3036, + "eval_samples_per_second": 28.859, + "eval_steps_per_second": 1.804, + "step": 6000 + }, + { + "epoch": 0.94, + "learning_rate": 0.00020752408227024213, + "loss": 2.1603, + "step": 6020 + }, + { + "epoch": 0.94, + "learning_rate": 0.0002072116636292632, + "loss": 2.2075, + "step": 6040 + }, + { + "epoch": 0.94, + "learning_rate": 0.0002068992449882843, + "loss": 2.1817, + "step": 6060 + }, + { + "epoch": 0.94, + "learning_rate": 0.00020658682634730537, + "loss": 2.1917, + "step": 6080 + }, + { + "epoch": 0.95, + "learning_rate": 0.00020627440770632646, + "loss": 2.1727, + "step": 6100 + }, + { + "epoch": 0.95, + "learning_rate": 0.00020596198906534756, + "loss": 2.1985, + "step": 6120 + }, + { + "epoch": 0.95, + "learning_rate": 0.00020564957042436865, + "loss": 2.1888, + "step": 6140 + }, + { + "epoch": 0.96, + "learning_rate": 0.00020533715178338972, + "loss": 2.1425, + "step": 6160 + }, + { + "epoch": 0.96, + "learning_rate": 0.00020502473314241082, + "loss": 2.1659, + "step": 6180 + }, + { + "epoch": 0.96, + "learning_rate": 0.00020471231450143188, + "loss": 2.1768, + "step": 6200 + }, + { + "epoch": 0.96, + "eval_loss": 2.368589162826538, + "eval_runtime": 69.4033, + "eval_samples_per_second": 28.817, + "eval_steps_per_second": 1.801, + "step": 6200 + }, + { + "epoch": 0.97, + "learning_rate": 0.000204399895860453, + "loss": 2.1744, + "step": 6220 + }, + { + "epoch": 0.97, + "learning_rate": 0.00020408747721947407, + "loss": 2.1484, + "step": 6240 + }, + { + "epoch": 0.97, + "learning_rate": 0.00020377505857849517, + "loss": 2.2154, + "step": 6260 + }, + { + "epoch": 0.98, + "learning_rate": 0.00020346263993751624, + "loss": 2.1358, + "step": 6280 + }, + { + "epoch": 0.98, + "learning_rate": 0.00020315022129653736, + "loss": 2.1809, + "step": 6300 + }, + { + "epoch": 0.98, + "learning_rate": 0.00020283780265555843, + "loss": 2.1813, + "step": 6320 + }, + { + "epoch": 0.99, + "learning_rate": 0.00020252538401457952, + "loss": 2.1903, + "step": 6340 + }, + { + "epoch": 0.99, + "learning_rate": 0.0002022129653736006, + "loss": 2.1971, + "step": 6360 + }, + { + "epoch": 0.99, + "learning_rate": 0.00020190054673262172, + "loss": 2.2041, + "step": 6380 + }, + { + "epoch": 0.99, + "learning_rate": 0.00020158812809164278, + "loss": 2.2169, + "step": 6400 + }, + { + "epoch": 0.99, + "eval_loss": 2.3672330379486084, + "eval_runtime": 69.3516, + "eval_samples_per_second": 28.839, + "eval_steps_per_second": 1.802, + "step": 6400 + }, + { + "epoch": 1.0, + "learning_rate": 0.00020127570945066388, + "loss": 2.2101, + "step": 6420 + }, + { + "epoch": 1.0, + "learning_rate": 0.00020096329080968495, + "loss": 2.1739, + "step": 6440 + }, + { + "epoch": 1.0, + "learning_rate": 0.00020065087216870604, + "loss": 2.1764, + "step": 6460 + }, + { + "epoch": 1.01, + "learning_rate": 0.00020033845352772714, + "loss": 2.1718, + "step": 6480 + }, + { + "epoch": 1.01, + "learning_rate": 0.00020002603488674823, + "loss": 2.1688, + "step": 6500 + }, + { + "epoch": 1.01, + "learning_rate": 0.0001997136162457693, + "loss": 2.1322, + "step": 6520 + }, + { + "epoch": 1.02, + "learning_rate": 0.0001994011976047904, + "loss": 2.1593, + "step": 6540 + }, + { + "epoch": 1.02, + "learning_rate": 0.0001990887789638115, + "loss": 2.179, + "step": 6560 + }, + { + "epoch": 1.02, + "learning_rate": 0.0001987763603228326, + "loss": 2.139, + "step": 6580 + }, + { + "epoch": 1.03, + "learning_rate": 0.00019846394168185366, + "loss": 2.1594, + "step": 6600 + }, + { + "epoch": 1.03, + "eval_loss": 2.367051839828491, + "eval_runtime": 69.3473, + "eval_samples_per_second": 28.84, + "eval_steps_per_second": 1.803, + "step": 6600 + }, + { + "epoch": 1.03, + "learning_rate": 0.00019815152304087475, + "loss": 2.2033, + "step": 6620 + }, + { + "epoch": 1.03, + "learning_rate": 0.00019783910439989582, + "loss": 2.183, + "step": 6640 + }, + { + "epoch": 1.03, + "learning_rate": 0.00019752668575891694, + "loss": 2.1517, + "step": 6660 + }, + { + "epoch": 1.04, + "learning_rate": 0.000197214267117938, + "loss": 2.183, + "step": 6680 + }, + { + "epoch": 1.04, + "learning_rate": 0.0001969018484769591, + "loss": 2.197, + "step": 6700 + }, + { + "epoch": 1.04, + "learning_rate": 0.00019658942983598017, + "loss": 2.1778, + "step": 6720 + }, + { + "epoch": 1.05, + "learning_rate": 0.0001962770111950013, + "loss": 2.1745, + "step": 6740 + }, + { + "epoch": 1.05, + "learning_rate": 0.00019596459255402237, + "loss": 2.1585, + "step": 6760 + }, + { + "epoch": 1.05, + "learning_rate": 0.00019565217391304346, + "loss": 2.1708, + "step": 6780 + }, + { + "epoch": 1.06, + "learning_rate": 0.00019533975527206453, + "loss": 2.1649, + "step": 6800 + }, + { + "epoch": 1.06, + "eval_loss": 2.363710880279541, + "eval_runtime": 69.2642, + "eval_samples_per_second": 28.875, + "eval_steps_per_second": 1.805, + "step": 6800 + }, + { + "epoch": 1.06, + "learning_rate": 0.00019502733663108565, + "loss": 2.1391, + "step": 6820 + }, + { + "epoch": 1.06, + "learning_rate": 0.00019471491799010672, + "loss": 2.1939, + "step": 6840 + }, + { + "epoch": 1.07, + "learning_rate": 0.00019440249934912782, + "loss": 2.1558, + "step": 6860 + }, + { + "epoch": 1.07, + "learning_rate": 0.00019409008070814888, + "loss": 2.173, + "step": 6880 + }, + { + "epoch": 1.07, + "learning_rate": 0.00019377766206716998, + "loss": 2.1821, + "step": 6900 + }, + { + "epoch": 1.08, + "learning_rate": 0.00019346524342619107, + "loss": 2.16, + "step": 6920 + }, + { + "epoch": 1.08, + "learning_rate": 0.00019315282478521217, + "loss": 2.1808, + "step": 6940 + }, + { + "epoch": 1.08, + "learning_rate": 0.00019284040614423324, + "loss": 2.1355, + "step": 6960 + }, + { + "epoch": 1.08, + "learning_rate": 0.00019252798750325433, + "loss": 2.1813, + "step": 6980 + }, + { + "epoch": 1.09, + "learning_rate": 0.00019221556886227546, + "loss": 2.1677, + "step": 7000 + }, + { + "epoch": 1.09, + "eval_loss": 2.3648109436035156, + "eval_runtime": 69.3675, + "eval_samples_per_second": 28.832, + "eval_steps_per_second": 1.802, + "step": 7000 + }, + { + "epoch": 1.09, + "learning_rate": 0.00019190315022129652, + "loss": 2.1479, + "step": 7020 + }, + { + "epoch": 1.09, + "learning_rate": 0.00019159073158031762, + "loss": 2.1852, + "step": 7040 + }, + { + "epoch": 1.1, + "learning_rate": 0.0001912783129393387, + "loss": 2.14, + "step": 7060 + }, + { + "epoch": 1.1, + "learning_rate": 0.0001909658942983598, + "loss": 2.1332, + "step": 7080 + }, + { + "epoch": 1.1, + "learning_rate": 0.00019065347565738088, + "loss": 2.178, + "step": 7100 + }, + { + "epoch": 1.11, + "learning_rate": 0.00019034105701640197, + "loss": 2.1661, + "step": 7120 + }, + { + "epoch": 1.11, + "learning_rate": 0.00019002863837542304, + "loss": 2.1902, + "step": 7140 + }, + { + "epoch": 1.11, + "learning_rate": 0.00018971621973444417, + "loss": 2.1775, + "step": 7160 + }, + { + "epoch": 1.12, + "learning_rate": 0.00018940380109346523, + "loss": 2.2007, + "step": 7180 + }, + { + "epoch": 1.12, + "learning_rate": 0.00018909138245248633, + "loss": 2.2078, + "step": 7200 + }, + { + "epoch": 1.12, + "eval_loss": 2.3642289638519287, + "eval_runtime": 69.5476, + "eval_samples_per_second": 28.757, + "eval_steps_per_second": 1.797, + "step": 7200 + }, + { + "epoch": 1.12, + "learning_rate": 0.0001887789638115074, + "loss": 2.185, + "step": 7220 + }, + { + "epoch": 1.13, + "learning_rate": 0.0001884665451705285, + "loss": 2.1856, + "step": 7240 + }, + { + "epoch": 1.13, + "learning_rate": 0.0001881541265295496, + "loss": 2.2049, + "step": 7260 + }, + { + "epoch": 1.13, + "learning_rate": 0.00018784170788857068, + "loss": 2.1376, + "step": 7280 + }, + { + "epoch": 1.13, + "learning_rate": 0.00018752928924759175, + "loss": 2.1693, + "step": 7300 + }, + { + "epoch": 1.14, + "learning_rate": 0.00018721687060661285, + "loss": 2.1825, + "step": 7320 + }, + { + "epoch": 1.14, + "learning_rate": 0.00018690445196563392, + "loss": 2.1649, + "step": 7340 + }, + { + "epoch": 1.14, + "learning_rate": 0.00018659203332465504, + "loss": 2.1936, + "step": 7360 + }, + { + "epoch": 1.15, + "learning_rate": 0.0001862796146836761, + "loss": 2.143, + "step": 7380 + }, + { + "epoch": 1.15, + "learning_rate": 0.0001859671960426972, + "loss": 2.1617, + "step": 7400 + }, + { + "epoch": 1.15, + "eval_loss": 2.362150192260742, + "eval_runtime": 69.3218, + "eval_samples_per_second": 28.851, + "eval_steps_per_second": 1.803, + "step": 7400 + }, + { + "epoch": 1.15, + "learning_rate": 0.00018565477740171827, + "loss": 2.1555, + "step": 7420 + }, + { + "epoch": 1.16, + "learning_rate": 0.0001853423587607394, + "loss": 2.1639, + "step": 7440 + }, + { + "epoch": 1.16, + "learning_rate": 0.00018502994011976046, + "loss": 2.1678, + "step": 7460 + }, + { + "epoch": 1.16, + "learning_rate": 0.00018471752147878156, + "loss": 2.1775, + "step": 7480 + }, + { + "epoch": 1.17, + "learning_rate": 0.00018440510283780263, + "loss": 2.1784, + "step": 7500 + }, + { + "epoch": 1.17, + "learning_rate": 0.00018409268419682375, + "loss": 2.1499, + "step": 7520 + }, + { + "epoch": 1.17, + "learning_rate": 0.00018378026555584482, + "loss": 2.154, + "step": 7540 + }, + { + "epoch": 1.17, + "learning_rate": 0.0001834678469148659, + "loss": 2.1793, + "step": 7560 + }, + { + "epoch": 1.18, + "learning_rate": 0.00018315542827388698, + "loss": 2.2292, + "step": 7580 + }, + { + "epoch": 1.18, + "learning_rate": 0.00018284300963290808, + "loss": 2.1578, + "step": 7600 + }, + { + "epoch": 1.18, + "eval_loss": 2.3628857135772705, + "eval_runtime": 69.2564, + "eval_samples_per_second": 28.878, + "eval_steps_per_second": 1.805, + "step": 7600 + }, + { + "epoch": 1.18, + "learning_rate": 0.00018253059099192917, + "loss": 2.1494, + "step": 7620 + }, + { + "epoch": 1.19, + "learning_rate": 0.00018221817235095027, + "loss": 2.1669, + "step": 7640 + }, + { + "epoch": 1.19, + "learning_rate": 0.00018190575370997133, + "loss": 2.1447, + "step": 7660 + }, + { + "epoch": 1.19, + "learning_rate": 0.00018159333506899243, + "loss": 2.1663, + "step": 7680 + }, + { + "epoch": 1.2, + "learning_rate": 0.0001812809164280135, + "loss": 2.1871, + "step": 7700 + }, + { + "epoch": 1.2, + "learning_rate": 0.00018096849778703462, + "loss": 2.1338, + "step": 7720 + }, + { + "epoch": 1.2, + "learning_rate": 0.0001806560791460557, + "loss": 2.1767, + "step": 7740 + }, + { + "epoch": 1.21, + "learning_rate": 0.00018034366050507678, + "loss": 2.1694, + "step": 7760 + }, + { + "epoch": 1.21, + "learning_rate": 0.00018003124186409785, + "loss": 2.1674, + "step": 7780 + }, + { + "epoch": 1.21, + "learning_rate": 0.00017971882322311898, + "loss": 2.1863, + "step": 7800 + }, + { + "epoch": 1.21, + "eval_loss": 2.3613035678863525, + "eval_runtime": 69.2881, + "eval_samples_per_second": 28.865, + "eval_steps_per_second": 1.804, + "step": 7800 + }, + { + "epoch": 1.22, + "learning_rate": 0.00017940640458214004, + "loss": 2.1441, + "step": 7820 + }, + { + "epoch": 1.22, + "learning_rate": 0.00017909398594116114, + "loss": 2.1885, + "step": 7840 + }, + { + "epoch": 1.22, + "learning_rate": 0.0001787815673001822, + "loss": 2.1514, + "step": 7860 + }, + { + "epoch": 1.22, + "learning_rate": 0.00017846914865920333, + "loss": 2.2002, + "step": 7880 + }, + { + "epoch": 1.23, + "learning_rate": 0.0001781567300182244, + "loss": 2.1759, + "step": 7900 + }, + { + "epoch": 1.23, + "learning_rate": 0.0001778443113772455, + "loss": 2.1611, + "step": 7920 + }, + { + "epoch": 1.23, + "learning_rate": 0.00017753189273626656, + "loss": 2.1667, + "step": 7940 + }, + { + "epoch": 1.24, + "learning_rate": 0.00017721947409528768, + "loss": 2.1717, + "step": 7960 + }, + { + "epoch": 1.24, + "learning_rate": 0.00017690705545430875, + "loss": 2.1983, + "step": 7980 + }, + { + "epoch": 1.24, + "learning_rate": 0.00017659463681332985, + "loss": 2.2092, + "step": 8000 + }, + { + "epoch": 1.24, + "eval_loss": 2.3608274459838867, + "eval_runtime": 69.3364, + "eval_samples_per_second": 28.845, + "eval_steps_per_second": 1.803, + "step": 8000 + }, + { + "epoch": 1.25, + "learning_rate": 0.00017628221817235092, + "loss": 2.1305, + "step": 8020 + }, + { + "epoch": 1.25, + "learning_rate": 0.000175969799531372, + "loss": 2.1431, + "step": 8040 + }, + { + "epoch": 1.25, + "learning_rate": 0.0001756573808903931, + "loss": 2.1384, + "step": 8060 + }, + { + "epoch": 1.26, + "learning_rate": 0.0001753449622494142, + "loss": 2.2093, + "step": 8080 + }, + { + "epoch": 1.26, + "learning_rate": 0.00017503254360843527, + "loss": 2.1271, + "step": 8100 + }, + { + "epoch": 1.26, + "learning_rate": 0.00017472012496745637, + "loss": 2.1466, + "step": 8120 + }, + { + "epoch": 1.26, + "learning_rate": 0.0001744077063264775, + "loss": 2.1578, + "step": 8140 + }, + { + "epoch": 1.27, + "learning_rate": 0.00017409528768549856, + "loss": 2.1632, + "step": 8160 + }, + { + "epoch": 1.27, + "learning_rate": 0.00017378286904451965, + "loss": 2.1465, + "step": 8180 + }, + { + "epoch": 1.27, + "learning_rate": 0.00017347045040354072, + "loss": 2.2226, + "step": 8200 + }, + { + "epoch": 1.27, + "eval_loss": 2.35835599899292, + "eval_runtime": 69.2657, + "eval_samples_per_second": 28.874, + "eval_steps_per_second": 1.805, + "step": 8200 + }, + { + "epoch": 1.28, + "learning_rate": 0.00017315803176256184, + "loss": 2.1585, + "step": 8220 + }, + { + "epoch": 1.28, + "learning_rate": 0.0001728456131215829, + "loss": 2.1529, + "step": 8240 + }, + { + "epoch": 1.28, + "learning_rate": 0.000172533194480604, + "loss": 2.1663, + "step": 8260 + }, + { + "epoch": 1.29, + "learning_rate": 0.00017222077583962508, + "loss": 2.1422, + "step": 8280 + }, + { + "epoch": 1.29, + "learning_rate": 0.00017190835719864617, + "loss": 2.158, + "step": 8300 + }, + { + "epoch": 1.29, + "learning_rate": 0.00017159593855766727, + "loss": 2.1984, + "step": 8320 + }, + { + "epoch": 1.3, + "learning_rate": 0.00017128351991668836, + "loss": 2.1395, + "step": 8340 + }, + { + "epoch": 1.3, + "learning_rate": 0.00017097110127570943, + "loss": 2.14, + "step": 8360 + }, + { + "epoch": 1.3, + "learning_rate": 0.00017065868263473053, + "loss": 2.1657, + "step": 8380 + }, + { + "epoch": 1.31, + "learning_rate": 0.00017036188492580056, + "loss": 2.167, + "step": 8400 + }, + { + "epoch": 1.31, + "eval_loss": 2.35697603225708, + "eval_runtime": 69.2685, + "eval_samples_per_second": 28.873, + "eval_steps_per_second": 1.805, + "step": 8400 + }, + { + "epoch": 1.31, + "learning_rate": 0.00017004946628482165, + "loss": 2.1396, + "step": 8420 + }, + { + "epoch": 1.31, + "learning_rate": 0.00016973704764384272, + "loss": 2.1777, + "step": 8440 + }, + { + "epoch": 1.31, + "learning_rate": 0.00016942462900286384, + "loss": 2.1366, + "step": 8460 + }, + { + "epoch": 1.32, + "learning_rate": 0.0001691122103618849, + "loss": 2.1625, + "step": 8480 + }, + { + "epoch": 1.32, + "learning_rate": 0.000168799791720906, + "loss": 2.1859, + "step": 8500 + }, + { + "epoch": 1.32, + "learning_rate": 0.00016848737307992707, + "loss": 2.1705, + "step": 8520 + }, + { + "epoch": 1.33, + "learning_rate": 0.0001681749544389482, + "loss": 2.1971, + "step": 8540 + }, + { + "epoch": 1.33, + "learning_rate": 0.00016786253579796927, + "loss": 2.1937, + "step": 8560 + }, + { + "epoch": 1.33, + "learning_rate": 0.00016755011715699036, + "loss": 2.1436, + "step": 8580 + }, + { + "epoch": 1.34, + "learning_rate": 0.00016723769851601143, + "loss": 2.1592, + "step": 8600 + }, + { + "epoch": 1.34, + "eval_loss": 2.3576247692108154, + "eval_runtime": 69.277, + "eval_samples_per_second": 28.87, + "eval_steps_per_second": 1.804, + "step": 8600 + }, + { + "epoch": 1.34, + "learning_rate": 0.00016692527987503252, + "loss": 2.1745, + "step": 8620 + }, + { + "epoch": 1.34, + "learning_rate": 0.00016661286123405362, + "loss": 2.1517, + "step": 8640 + }, + { + "epoch": 1.35, + "learning_rate": 0.00016630044259307472, + "loss": 2.1921, + "step": 8660 + }, + { + "epoch": 1.35, + "learning_rate": 0.00016598802395209578, + "loss": 2.1703, + "step": 8680 + }, + { + "epoch": 1.35, + "learning_rate": 0.00016567560531111688, + "loss": 2.1223, + "step": 8700 + }, + { + "epoch": 1.36, + "learning_rate": 0.00016536318667013795, + "loss": 2.1748, + "step": 8720 + }, + { + "epoch": 1.36, + "learning_rate": 0.00016505076802915907, + "loss": 2.145, + "step": 8740 + }, + { + "epoch": 1.36, + "learning_rate": 0.00016473834938818014, + "loss": 2.1077, + "step": 8760 + }, + { + "epoch": 1.36, + "learning_rate": 0.00016442593074720123, + "loss": 2.1571, + "step": 8780 + }, + { + "epoch": 1.37, + "learning_rate": 0.0001641135121062223, + "loss": 2.1946, + "step": 8800 + }, + { + "epoch": 1.37, + "eval_loss": 2.3559648990631104, + "eval_runtime": 69.3886, + "eval_samples_per_second": 28.823, + "eval_steps_per_second": 1.801, + "step": 8800 + }, + { + "epoch": 1.37, + "learning_rate": 0.00016380109346524342, + "loss": 2.1635, + "step": 8820 + }, + { + "epoch": 1.37, + "learning_rate": 0.0001634886748242645, + "loss": 2.1546, + "step": 8840 + }, + { + "epoch": 1.38, + "learning_rate": 0.0001631762561832856, + "loss": 2.1359, + "step": 8860 + }, + { + "epoch": 1.38, + "learning_rate": 0.00016286383754230666, + "loss": 2.1741, + "step": 8880 + }, + { + "epoch": 1.38, + "learning_rate": 0.00016255141890132778, + "loss": 2.1382, + "step": 8900 + }, + { + "epoch": 1.39, + "learning_rate": 0.00016223900026034885, + "loss": 2.1514, + "step": 8920 + }, + { + "epoch": 1.39, + "learning_rate": 0.00016192658161936994, + "loss": 2.17, + "step": 8940 + }, + { + "epoch": 1.39, + "learning_rate": 0.000161614162978391, + "loss": 2.1784, + "step": 8960 + }, + { + "epoch": 1.4, + "learning_rate": 0.0001613017443374121, + "loss": 2.1869, + "step": 8980 + }, + { + "epoch": 1.4, + "learning_rate": 0.0001609893256964332, + "loss": 2.155, + "step": 9000 + }, + { + "epoch": 1.4, + "eval_loss": 2.3562612533569336, + "eval_runtime": 70.7208, + "eval_samples_per_second": 28.28, + "eval_steps_per_second": 1.768, + "step": 9000 + }, + { + "epoch": 1.4, + "learning_rate": 0.0001606769070554543, + "loss": 2.1467, + "step": 9020 + }, + { + "epoch": 1.4, + "learning_rate": 0.00016036448841447537, + "loss": 2.1662, + "step": 9040 + }, + { + "epoch": 1.41, + "learning_rate": 0.00016005206977349646, + "loss": 2.1928, + "step": 9060 + }, + { + "epoch": 1.41, + "learning_rate": 0.00015973965113251756, + "loss": 2.1084, + "step": 9080 + }, + { + "epoch": 1.41, + "learning_rate": 0.00015942723249153865, + "loss": 2.182, + "step": 9100 + }, + { + "epoch": 1.42, + "learning_rate": 0.00015911481385055975, + "loss": 2.1502, + "step": 9120 + }, + { + "epoch": 1.42, + "learning_rate": 0.00015880239520958082, + "loss": 2.1645, + "step": 9140 + }, + { + "epoch": 1.42, + "learning_rate": 0.00015848997656860194, + "loss": 2.1246, + "step": 9160 + }, + { + "epoch": 1.43, + "learning_rate": 0.000158177557927623, + "loss": 2.1769, + "step": 9180 + }, + { + "epoch": 1.43, + "learning_rate": 0.0001578651392866441, + "loss": 2.1772, + "step": 9200 + }, + { + "epoch": 1.43, + "eval_loss": 2.354128360748291, + "eval_runtime": 70.4883, + "eval_samples_per_second": 28.374, + "eval_steps_per_second": 1.773, + "step": 9200 + }, + { + "epoch": 1.43, + "learning_rate": 0.00015755272064566517, + "loss": 2.1777, + "step": 9220 + }, + { + "epoch": 1.44, + "learning_rate": 0.0001572403020046863, + "loss": 2.1749, + "step": 9240 + }, + { + "epoch": 1.44, + "learning_rate": 0.00015692788336370736, + "loss": 2.1861, + "step": 9260 + }, + { + "epoch": 1.44, + "learning_rate": 0.00015661546472272846, + "loss": 2.1567, + "step": 9280 + }, + { + "epoch": 1.45, + "learning_rate": 0.00015630304608174952, + "loss": 2.1426, + "step": 9300 + }, + { + "epoch": 1.45, + "learning_rate": 0.00015599062744077062, + "loss": 2.1658, + "step": 9320 + }, + { + "epoch": 1.45, + "learning_rate": 0.00015567820879979172, + "loss": 2.1639, + "step": 9340 + }, + { + "epoch": 1.45, + "learning_rate": 0.0001553657901588128, + "loss": 2.1897, + "step": 9360 + }, + { + "epoch": 1.46, + "learning_rate": 0.00015505337151783388, + "loss": 2.1439, + "step": 9380 + }, + { + "epoch": 1.46, + "learning_rate": 0.00015474095287685497, + "loss": 2.1326, + "step": 9400 + }, + { + "epoch": 1.46, + "eval_loss": 2.352673292160034, + "eval_runtime": 69.2871, + "eval_samples_per_second": 28.865, + "eval_steps_per_second": 1.804, + "step": 9400 + }, + { + "epoch": 1.46, + "learning_rate": 0.00015442853423587604, + "loss": 2.139, + "step": 9420 + }, + { + "epoch": 1.47, + "learning_rate": 0.00015411611559489717, + "loss": 2.1087, + "step": 9440 + }, + { + "epoch": 1.47, + "learning_rate": 0.00015380369695391823, + "loss": 2.1528, + "step": 9460 + }, + { + "epoch": 1.47, + "learning_rate": 0.00015349127831293933, + "loss": 2.1866, + "step": 9480 + }, + { + "epoch": 1.48, + "learning_rate": 0.0001531788596719604, + "loss": 2.1436, + "step": 9500 + }, + { + "epoch": 1.48, + "learning_rate": 0.00015286644103098152, + "loss": 2.1699, + "step": 9520 + }, + { + "epoch": 1.48, + "learning_rate": 0.0001525540223900026, + "loss": 2.1415, + "step": 9540 + }, + { + "epoch": 1.49, + "learning_rate": 0.00015224160374902368, + "loss": 2.1092, + "step": 9560 + }, + { + "epoch": 1.49, + "learning_rate": 0.00015192918510804475, + "loss": 2.1422, + "step": 9580 + }, + { + "epoch": 1.49, + "learning_rate": 0.00015161676646706587, + "loss": 2.1677, + "step": 9600 + }, + { + "epoch": 1.49, + "eval_loss": 2.3518292903900146, + "eval_runtime": 69.3029, + "eval_samples_per_second": 28.859, + "eval_steps_per_second": 1.804, + "step": 9600 + }, + { + "epoch": 1.49, + "learning_rate": 0.00015130434782608694, + "loss": 2.1594, + "step": 9620 + }, + { + "epoch": 1.5, + "learning_rate": 0.00015099192918510804, + "loss": 2.1539, + "step": 9640 + }, + { + "epoch": 1.5, + "learning_rate": 0.0001506795105441291, + "loss": 2.1343, + "step": 9660 + }, + { + "epoch": 1.5, + "learning_rate": 0.00015036709190315023, + "loss": 2.1386, + "step": 9680 + }, + { + "epoch": 1.51, + "learning_rate": 0.0001500546732621713, + "loss": 2.1512, + "step": 9700 + }, + { + "epoch": 1.51, + "learning_rate": 0.0001497422546211924, + "loss": 2.1669, + "step": 9720 + }, + { + "epoch": 1.51, + "learning_rate": 0.0001494298359802135, + "loss": 2.158, + "step": 9740 + }, + { + "epoch": 1.52, + "learning_rate": 0.00014911741733923456, + "loss": 2.1643, + "step": 9760 + }, + { + "epoch": 1.52, + "learning_rate": 0.00014880499869825565, + "loss": 2.1612, + "step": 9780 + }, + { + "epoch": 1.52, + "learning_rate": 0.00014849258005727675, + "loss": 2.1441, + "step": 9800 + }, + { + "epoch": 1.52, + "eval_loss": 2.35211181640625, + "eval_runtime": 69.2821, + "eval_samples_per_second": 28.867, + "eval_steps_per_second": 1.804, + "step": 9800 + }, + { + "epoch": 1.53, + "learning_rate": 0.00014818016141629784, + "loss": 2.1704, + "step": 9820 + }, + { + "epoch": 1.53, + "learning_rate": 0.0001478677427753189, + "loss": 2.1546, + "step": 9840 + }, + { + "epoch": 1.53, + "learning_rate": 0.00014755532413434, + "loss": 2.1909, + "step": 9860 + }, + { + "epoch": 1.54, + "learning_rate": 0.0001472429054933611, + "loss": 2.149, + "step": 9880 + }, + { + "epoch": 1.54, + "learning_rate": 0.00014693048685238217, + "loss": 2.1419, + "step": 9900 + }, + { + "epoch": 1.54, + "learning_rate": 0.00014661806821140327, + "loss": 2.1465, + "step": 9920 + }, + { + "epoch": 1.54, + "learning_rate": 0.00014630564957042436, + "loss": 2.1551, + "step": 9940 + }, + { + "epoch": 1.55, + "learning_rate": 0.00014599323092944546, + "loss": 2.1526, + "step": 9960 + }, + { + "epoch": 1.55, + "learning_rate": 0.00014568081228846653, + "loss": 2.1437, + "step": 9980 + }, + { + "epoch": 1.55, + "learning_rate": 0.00014536839364748762, + "loss": 2.1659, + "step": 10000 + }, + { + "epoch": 1.55, + "eval_loss": 2.3507654666900635, + "eval_runtime": 69.2997, + "eval_samples_per_second": 28.86, + "eval_steps_per_second": 1.804, + "step": 10000 + }, + { + "epoch": 1.56, + "learning_rate": 0.00014505597500650872, + "loss": 2.14, + "step": 10020 + }, + { + "epoch": 1.56, + "learning_rate": 0.0001447435563655298, + "loss": 2.1289, + "step": 10040 + }, + { + "epoch": 1.56, + "learning_rate": 0.00014443113772455088, + "loss": 2.1226, + "step": 10060 + }, + { + "epoch": 1.57, + "learning_rate": 0.00014411871908357198, + "loss": 2.1627, + "step": 10080 + }, + { + "epoch": 1.57, + "learning_rate": 0.00014380630044259307, + "loss": 2.1759, + "step": 10100 + }, + { + "epoch": 1.57, + "learning_rate": 0.00014349388180161414, + "loss": 2.1511, + "step": 10120 + }, + { + "epoch": 1.58, + "learning_rate": 0.00014318146316063523, + "loss": 2.1275, + "step": 10140 + }, + { + "epoch": 1.58, + "learning_rate": 0.00014286904451965633, + "loss": 2.1638, + "step": 10160 + }, + { + "epoch": 1.58, + "learning_rate": 0.00014255662587867743, + "loss": 2.1494, + "step": 10180 + }, + { + "epoch": 1.59, + "learning_rate": 0.0001422442072376985, + "loss": 2.1554, + "step": 10200 + }, + { + "epoch": 1.59, + "eval_loss": 2.349271059036255, + "eval_runtime": 69.2627, + "eval_samples_per_second": 28.876, + "eval_steps_per_second": 1.805, + "step": 10200 + }, + { + "epoch": 1.59, + "learning_rate": 0.0001419317885967196, + "loss": 2.133, + "step": 10220 + }, + { + "epoch": 1.59, + "learning_rate": 0.00014161936995574068, + "loss": 2.1515, + "step": 10240 + }, + { + "epoch": 1.59, + "learning_rate": 0.00014130695131476178, + "loss": 2.1262, + "step": 10260 + }, + { + "epoch": 1.6, + "learning_rate": 0.00014099453267378285, + "loss": 2.142, + "step": 10280 + }, + { + "epoch": 1.6, + "learning_rate": 0.00014068211403280394, + "loss": 2.1578, + "step": 10300 + }, + { + "epoch": 1.6, + "learning_rate": 0.00014036969539182504, + "loss": 2.1583, + "step": 10320 + }, + { + "epoch": 1.61, + "learning_rate": 0.0001400572767508461, + "loss": 2.1043, + "step": 10340 + }, + { + "epoch": 1.61, + "learning_rate": 0.0001397448581098672, + "loss": 2.1539, + "step": 10360 + }, + { + "epoch": 1.61, + "learning_rate": 0.0001394324394688883, + "loss": 2.1189, + "step": 10380 + }, + { + "epoch": 1.62, + "learning_rate": 0.0001391200208279094, + "loss": 2.1484, + "step": 10400 + }, + { + "epoch": 1.62, + "eval_loss": 2.3479487895965576, + "eval_runtime": 69.2625, + "eval_samples_per_second": 28.876, + "eval_steps_per_second": 1.805, + "step": 10400 + }, + { + "epoch": 1.62, + "learning_rate": 0.00013880760218693046, + "loss": 2.1993, + "step": 10420 + }, + { + "epoch": 1.62, + "learning_rate": 0.00013849518354595156, + "loss": 2.1869, + "step": 10440 + }, + { + "epoch": 1.63, + "learning_rate": 0.00013818276490497265, + "loss": 2.1644, + "step": 10460 + }, + { + "epoch": 1.63, + "learning_rate": 0.00013787034626399375, + "loss": 2.1751, + "step": 10480 + }, + { + "epoch": 1.63, + "learning_rate": 0.00013755792762301482, + "loss": 2.1416, + "step": 10500 + }, + { + "epoch": 1.63, + "learning_rate": 0.0001372455089820359, + "loss": 2.1809, + "step": 10520 + }, + { + "epoch": 1.64, + "learning_rate": 0.000136933090341057, + "loss": 2.1653, + "step": 10540 + }, + { + "epoch": 1.64, + "learning_rate": 0.00013662067170007808, + "loss": 2.1026, + "step": 10560 + }, + { + "epoch": 1.64, + "learning_rate": 0.00013630825305909917, + "loss": 2.1503, + "step": 10580 + }, + { + "epoch": 1.65, + "learning_rate": 0.00013599583441812027, + "loss": 2.1289, + "step": 10600 + }, + { + "epoch": 1.65, + "eval_loss": 2.3468515872955322, + "eval_runtime": 69.2274, + "eval_samples_per_second": 28.89, + "eval_steps_per_second": 1.806, + "step": 10600 + }, + { + "epoch": 1.65, + "learning_rate": 0.00013568341577714136, + "loss": 2.1929, + "step": 10620 + }, + { + "epoch": 1.65, + "learning_rate": 0.00013537099713616243, + "loss": 2.1547, + "step": 10640 + }, + { + "epoch": 1.66, + "learning_rate": 0.00013505857849518353, + "loss": 2.1571, + "step": 10660 + }, + { + "epoch": 1.66, + "learning_rate": 0.00013474615985420462, + "loss": 2.1649, + "step": 10680 + }, + { + "epoch": 1.66, + "learning_rate": 0.00013443374121322572, + "loss": 2.1647, + "step": 10700 + }, + { + "epoch": 1.67, + "learning_rate": 0.00013412132257224679, + "loss": 2.206, + "step": 10720 + }, + { + "epoch": 1.67, + "learning_rate": 0.00013380890393126788, + "loss": 2.1377, + "step": 10740 + }, + { + "epoch": 1.67, + "learning_rate": 0.00013349648529028898, + "loss": 2.1347, + "step": 10760 + }, + { + "epoch": 1.68, + "learning_rate": 0.00013318406664931004, + "loss": 2.1948, + "step": 10780 + }, + { + "epoch": 1.68, + "learning_rate": 0.00013287164800833114, + "loss": 2.1844, + "step": 10800 + }, + { + "epoch": 1.68, + "eval_loss": 2.347837209701538, + "eval_runtime": 69.2425, + "eval_samples_per_second": 28.884, + "eval_steps_per_second": 1.805, + "step": 10800 + }, + { + "epoch": 1.68, + "learning_rate": 0.00013255922936735224, + "loss": 2.1515, + "step": 10820 + }, + { + "epoch": 1.68, + "learning_rate": 0.00013224681072637333, + "loss": 2.1885, + "step": 10840 + }, + { + "epoch": 1.69, + "learning_rate": 0.00013193439208539443, + "loss": 2.143, + "step": 10860 + }, + { + "epoch": 1.69, + "learning_rate": 0.00013162197344441552, + "loss": 2.1671, + "step": 10880 + }, + { + "epoch": 1.69, + "learning_rate": 0.0001313095548034366, + "loss": 2.1426, + "step": 10900 + }, + { + "epoch": 1.7, + "learning_rate": 0.00013099713616245769, + "loss": 2.1653, + "step": 10920 + }, + { + "epoch": 1.7, + "learning_rate": 0.00013068471752147878, + "loss": 2.1774, + "step": 10940 + }, + { + "epoch": 1.7, + "learning_rate": 0.00013037229888049988, + "loss": 2.1344, + "step": 10960 + }, + { + "epoch": 1.71, + "learning_rate": 0.00013005988023952094, + "loss": 2.1217, + "step": 10980 + }, + { + "epoch": 1.71, + "learning_rate": 0.00012974746159854204, + "loss": 2.1281, + "step": 11000 + }, + { + "epoch": 1.71, + "eval_loss": 2.345808982849121, + "eval_runtime": 69.2499, + "eval_samples_per_second": 28.881, + "eval_steps_per_second": 1.805, + "step": 11000 + }, + { + "epoch": 1.71, + "learning_rate": 0.00012943504295756314, + "loss": 2.1459, + "step": 11020 + }, + { + "epoch": 1.72, + "learning_rate": 0.0001291226243165842, + "loss": 2.1294, + "step": 11040 + }, + { + "epoch": 1.72, + "learning_rate": 0.0001288102056756053, + "loss": 2.1455, + "step": 11060 + }, + { + "epoch": 1.72, + "learning_rate": 0.0001284977870346264, + "loss": 2.1219, + "step": 11080 + }, + { + "epoch": 1.72, + "learning_rate": 0.0001281853683936475, + "loss": 2.1696, + "step": 11100 + }, + { + "epoch": 1.73, + "learning_rate": 0.00012787294975266856, + "loss": 2.1474, + "step": 11120 + }, + { + "epoch": 1.73, + "learning_rate": 0.00012756053111168965, + "loss": 2.1436, + "step": 11140 + }, + { + "epoch": 1.73, + "learning_rate": 0.00012724811247071075, + "loss": 2.1785, + "step": 11160 + }, + { + "epoch": 1.74, + "learning_rate": 0.00012693569382973184, + "loss": 2.1677, + "step": 11180 + }, + { + "epoch": 1.74, + "learning_rate": 0.0001266232751887529, + "loss": 2.1564, + "step": 11200 + }, + { + "epoch": 1.74, + "eval_loss": 2.3451294898986816, + "eval_runtime": 69.2454, + "eval_samples_per_second": 28.883, + "eval_steps_per_second": 1.805, + "step": 11200 + }, + { + "epoch": 1.74, + "learning_rate": 0.000126310856547774, + "loss": 2.1793, + "step": 11220 + }, + { + "epoch": 1.75, + "learning_rate": 0.0001259984379067951, + "loss": 2.1583, + "step": 11240 + }, + { + "epoch": 1.75, + "learning_rate": 0.00012568601926581617, + "loss": 2.1482, + "step": 11260 + }, + { + "epoch": 1.75, + "learning_rate": 0.00012537360062483727, + "loss": 2.1393, + "step": 11280 + }, + { + "epoch": 1.76, + "learning_rate": 0.00012506118198385836, + "loss": 2.1586, + "step": 11300 + }, + { + "epoch": 1.76, + "learning_rate": 0.00012474876334287946, + "loss": 2.1533, + "step": 11320 + }, + { + "epoch": 1.76, + "learning_rate": 0.00012443634470190053, + "loss": 2.1516, + "step": 11340 + }, + { + "epoch": 1.77, + "learning_rate": 0.00012412392606092162, + "loss": 2.1184, + "step": 11360 + }, + { + "epoch": 1.77, + "learning_rate": 0.00012381150741994272, + "loss": 2.1162, + "step": 11380 + }, + { + "epoch": 1.77, + "learning_rate": 0.0001234990887789638, + "loss": 2.1588, + "step": 11400 + }, + { + "epoch": 1.77, + "eval_loss": 2.3451669216156006, + "eval_runtime": 69.2383, + "eval_samples_per_second": 28.886, + "eval_steps_per_second": 1.805, + "step": 11400 + }, + { + "epoch": 1.77, + "learning_rate": 0.00012318667013798488, + "loss": 2.1588, + "step": 11420 + }, + { + "epoch": 1.78, + "learning_rate": 0.00012287425149700598, + "loss": 2.1463, + "step": 11440 + }, + { + "epoch": 1.78, + "learning_rate": 0.00012256183285602707, + "loss": 2.1498, + "step": 11460 + }, + { + "epoch": 1.78, + "learning_rate": 0.00012224941421504814, + "loss": 2.1663, + "step": 11480 + }, + { + "epoch": 1.79, + "learning_rate": 0.00012193699557406924, + "loss": 2.1306, + "step": 11500 + }, + { + "epoch": 1.79, + "learning_rate": 0.00012162457693309033, + "loss": 2.1542, + "step": 11520 + }, + { + "epoch": 1.79, + "learning_rate": 0.00012131215829211141, + "loss": 2.1513, + "step": 11540 + }, + { + "epoch": 1.8, + "learning_rate": 0.00012099973965113251, + "loss": 2.2031, + "step": 11560 + }, + { + "epoch": 1.8, + "learning_rate": 0.00012068732101015359, + "loss": 2.1438, + "step": 11580 + }, + { + "epoch": 1.8, + "learning_rate": 0.00012037490236917469, + "loss": 2.1431, + "step": 11600 + }, + { + "epoch": 1.8, + "eval_loss": 2.3447554111480713, + "eval_runtime": 69.2865, + "eval_samples_per_second": 28.866, + "eval_steps_per_second": 1.804, + "step": 11600 + }, + { + "epoch": 1.81, + "learning_rate": 0.00012006248372819577, + "loss": 2.1272, + "step": 11620 + }, + { + "epoch": 1.81, + "learning_rate": 0.00011975006508721686, + "loss": 2.1584, + "step": 11640 + }, + { + "epoch": 1.81, + "learning_rate": 0.00011943764644623794, + "loss": 2.128, + "step": 11660 + }, + { + "epoch": 1.82, + "learning_rate": 0.00011912522780525903, + "loss": 2.1461, + "step": 11680 + }, + { + "epoch": 1.82, + "learning_rate": 0.00011881280916428012, + "loss": 2.1411, + "step": 11700 + }, + { + "epoch": 1.82, + "learning_rate": 0.0001185003905233012, + "loss": 2.1592, + "step": 11720 + }, + { + "epoch": 1.82, + "learning_rate": 0.0001181879718823223, + "loss": 2.1642, + "step": 11740 + }, + { + "epoch": 1.83, + "learning_rate": 0.00011787555324134338, + "loss": 2.1914, + "step": 11760 + }, + { + "epoch": 1.83, + "learning_rate": 0.00011756313460036448, + "loss": 2.1612, + "step": 11780 + }, + { + "epoch": 1.83, + "learning_rate": 0.00011725071595938556, + "loss": 2.1452, + "step": 11800 + }, + { + "epoch": 1.83, + "eval_loss": 2.3442630767822266, + "eval_runtime": 69.2459, + "eval_samples_per_second": 28.883, + "eval_steps_per_second": 1.805, + "step": 11800 + }, + { + "epoch": 1.84, + "learning_rate": 0.00011693829731840665, + "loss": 2.1453, + "step": 11820 + }, + { + "epoch": 1.84, + "learning_rate": 0.00011662587867742774, + "loss": 2.1251, + "step": 11840 + }, + { + "epoch": 1.84, + "learning_rate": 0.00011631346003644882, + "loss": 2.1412, + "step": 11860 + }, + { + "epoch": 1.85, + "learning_rate": 0.00011600104139546991, + "loss": 2.1033, + "step": 11880 + }, + { + "epoch": 1.85, + "learning_rate": 0.000115688622754491, + "loss": 2.1219, + "step": 11900 + }, + { + "epoch": 1.85, + "learning_rate": 0.00011537620411351209, + "loss": 2.1831, + "step": 11920 + }, + { + "epoch": 1.86, + "learning_rate": 0.00011506378547253317, + "loss": 2.1434, + "step": 11940 + }, + { + "epoch": 1.86, + "learning_rate": 0.00011475136683155427, + "loss": 2.1439, + "step": 11960 + }, + { + "epoch": 1.86, + "learning_rate": 0.00011443894819057536, + "loss": 2.1377, + "step": 11980 + }, + { + "epoch": 1.86, + "learning_rate": 0.00011412652954959646, + "loss": 2.1345, + "step": 12000 + }, + { + "epoch": 1.86, + "eval_loss": 2.342855453491211, + "eval_runtime": 69.2714, + "eval_samples_per_second": 28.872, + "eval_steps_per_second": 1.804, + "step": 12000 + }, + { + "epoch": 1.87, + "learning_rate": 0.00011381411090861754, + "loss": 2.1527, + "step": 12020 + }, + { + "epoch": 1.87, + "learning_rate": 0.00011350169226763864, + "loss": 2.1737, + "step": 12040 + }, + { + "epoch": 1.87, + "learning_rate": 0.00011318927362665972, + "loss": 2.137, + "step": 12060 + }, + { + "epoch": 1.88, + "learning_rate": 0.00011287685498568081, + "loss": 2.1616, + "step": 12080 + }, + { + "epoch": 1.88, + "learning_rate": 0.0001125644363447019, + "loss": 2.1688, + "step": 12100 + }, + { + "epoch": 1.88, + "learning_rate": 0.00011225201770372299, + "loss": 2.1746, + "step": 12120 + }, + { + "epoch": 1.89, + "learning_rate": 0.00011193959906274407, + "loss": 2.1552, + "step": 12140 + }, + { + "epoch": 1.89, + "learning_rate": 0.00011162718042176515, + "loss": 2.1643, + "step": 12160 + }, + { + "epoch": 1.89, + "learning_rate": 0.00011131476178078625, + "loss": 2.1494, + "step": 12180 + }, + { + "epoch": 1.9, + "learning_rate": 0.00011100234313980733, + "loss": 2.1112, + "step": 12200 + }, + { + "epoch": 1.9, + "eval_loss": 2.34304141998291, + "eval_runtime": 72.1422, + "eval_samples_per_second": 27.723, + "eval_steps_per_second": 1.733, + "step": 12200 + }, + { + "epoch": 1.9, + "learning_rate": 0.00011068992449882843, + "loss": 2.1505, + "step": 12220 + }, + { + "epoch": 1.9, + "learning_rate": 0.00011037750585784951, + "loss": 2.1722, + "step": 12240 + }, + { + "epoch": 1.91, + "learning_rate": 0.0001100650872168706, + "loss": 2.1582, + "step": 12260 + }, + { + "epoch": 1.91, + "learning_rate": 0.00010975266857589169, + "loss": 2.1806, + "step": 12280 + }, + { + "epoch": 1.91, + "learning_rate": 0.00010944024993491278, + "loss": 2.1508, + "step": 12300 + }, + { + "epoch": 1.91, + "learning_rate": 0.00010912783129393386, + "loss": 2.1654, + "step": 12320 + }, + { + "epoch": 1.92, + "learning_rate": 0.00010881541265295496, + "loss": 2.131, + "step": 12340 + }, + { + "epoch": 1.92, + "learning_rate": 0.00010850299401197604, + "loss": 2.1301, + "step": 12360 + }, + { + "epoch": 1.92, + "learning_rate": 0.00010819057537099712, + "loss": 2.1312, + "step": 12380 + }, + { + "epoch": 1.93, + "learning_rate": 0.00010787815673001822, + "loss": 2.1301, + "step": 12400 + }, + { + "epoch": 1.93, + "eval_loss": 2.3404922485351562, + "eval_runtime": 71.3367, + "eval_samples_per_second": 28.036, + "eval_steps_per_second": 1.752, + "step": 12400 + }, + { + "epoch": 1.93, + "learning_rate": 0.00010758135902108825, + "loss": 2.1398, + "step": 12420 + }, + { + "epoch": 1.93, + "learning_rate": 0.00010726894038010933, + "loss": 2.1449, + "step": 12440 + }, + { + "epoch": 1.94, + "learning_rate": 0.00010695652173913043, + "loss": 2.1498, + "step": 12460 + }, + { + "epoch": 1.94, + "learning_rate": 0.00010664410309815151, + "loss": 2.1484, + "step": 12480 + }, + { + "epoch": 1.94, + "learning_rate": 0.0001063316844571726, + "loss": 2.1705, + "step": 12500 + }, + { + "epoch": 1.95, + "learning_rate": 0.00010601926581619368, + "loss": 2.1236, + "step": 12520 + }, + { + "epoch": 1.95, + "learning_rate": 0.00010570684717521478, + "loss": 2.1435, + "step": 12540 + }, + { + "epoch": 1.95, + "learning_rate": 0.00010539442853423586, + "loss": 2.1656, + "step": 12560 + }, + { + "epoch": 1.95, + "learning_rate": 0.00010508200989325696, + "loss": 2.1459, + "step": 12580 + }, + { + "epoch": 1.96, + "learning_rate": 0.00010476959125227804, + "loss": 2.1392, + "step": 12600 + }, + { + "epoch": 1.96, + "eval_loss": 2.3410892486572266, + "eval_runtime": 72.1407, + "eval_samples_per_second": 27.724, + "eval_steps_per_second": 1.733, + "step": 12600 + }, + { + "epoch": 1.96, + "learning_rate": 0.00010445717261129913, + "loss": 2.1399, + "step": 12620 + }, + { + "epoch": 1.96, + "learning_rate": 0.00010414475397032022, + "loss": 2.1979, + "step": 12640 + }, + { + "epoch": 1.97, + "learning_rate": 0.0001038323353293413, + "loss": 2.1596, + "step": 12660 + }, + { + "epoch": 1.97, + "learning_rate": 0.0001035199166883624, + "loss": 2.1817, + "step": 12680 + }, + { + "epoch": 1.97, + "learning_rate": 0.00010320749804738348, + "loss": 2.0972, + "step": 12700 + }, + { + "epoch": 1.98, + "learning_rate": 0.00010289507940640457, + "loss": 2.1293, + "step": 12720 + }, + { + "epoch": 1.98, + "learning_rate": 0.00010258266076542565, + "loss": 2.1362, + "step": 12740 + }, + { + "epoch": 1.98, + "learning_rate": 0.00010227024212444675, + "loss": 2.1474, + "step": 12760 + }, + { + "epoch": 1.99, + "learning_rate": 0.00010195782348346783, + "loss": 2.2004, + "step": 12780 + }, + { + "epoch": 1.99, + "learning_rate": 0.00010164540484248893, + "loss": 2.1221, + "step": 12800 + }, + { + "epoch": 1.99, + "eval_loss": 2.340029716491699, + "eval_runtime": 72.0796, + "eval_samples_per_second": 27.747, + "eval_steps_per_second": 1.734, + "step": 12800 + }, + { + "epoch": 1.99, + "learning_rate": 0.00010133298620151001, + "loss": 2.1782, + "step": 12820 + }, + { + "epoch": 2.0, + "learning_rate": 0.00010102056756053109, + "loss": 2.1358, + "step": 12840 + }, + { + "epoch": 2.0, + "learning_rate": 0.00010070814891955218, + "loss": 2.122, + "step": 12860 + }, + { + "epoch": 2.0, + "learning_rate": 0.00010039573027857327, + "loss": 2.1494, + "step": 12880 + }, + { + "epoch": 2.0, + "learning_rate": 0.00010008331163759436, + "loss": 2.1522, + "step": 12900 + }, + { + "epoch": 2.01, + "learning_rate": 9.977089299661544e-05, + "loss": 2.1241, + "step": 12920 + }, + { + "epoch": 2.01, + "learning_rate": 9.945847435563654e-05, + "loss": 2.1456, + "step": 12940 + }, + { + "epoch": 2.01, + "learning_rate": 9.914605571465763e-05, + "loss": 2.1495, + "step": 12960 + }, + { + "epoch": 2.02, + "learning_rate": 9.883363707367873e-05, + "loss": 2.1734, + "step": 12980 + }, + { + "epoch": 2.02, + "learning_rate": 9.852121843269981e-05, + "loss": 2.1711, + "step": 13000 + }, + { + "epoch": 2.02, + "eval_loss": 2.339312791824341, + "eval_runtime": 69.2994, + "eval_samples_per_second": 28.86, + "eval_steps_per_second": 1.804, + "step": 13000 + }, + { + "epoch": 2.02, + "learning_rate": 9.820879979172091e-05, + "loss": 2.1483, + "step": 13020 + }, + { + "epoch": 2.03, + "learning_rate": 9.789638115074199e-05, + "loss": 2.124, + "step": 13040 + }, + { + "epoch": 2.03, + "learning_rate": 9.758396250976308e-05, + "loss": 2.1337, + "step": 13060 + }, + { + "epoch": 2.03, + "learning_rate": 9.727154386878417e-05, + "loss": 2.137, + "step": 13080 + }, + { + "epoch": 2.04, + "learning_rate": 9.695912522780526e-05, + "loss": 2.1225, + "step": 13100 + }, + { + "epoch": 2.04, + "learning_rate": 9.664670658682634e-05, + "loss": 2.1384, + "step": 13120 + }, + { + "epoch": 2.04, + "learning_rate": 9.633428794584743e-05, + "loss": 2.1052, + "step": 13140 + }, + { + "epoch": 2.05, + "learning_rate": 9.602186930486852e-05, + "loss": 2.1489, + "step": 13160 + }, + { + "epoch": 2.05, + "learning_rate": 9.57094506638896e-05, + "loss": 2.1154, + "step": 13180 + }, + { + "epoch": 2.05, + "learning_rate": 9.53970320229107e-05, + "loss": 2.1476, + "step": 13200 + }, + { + "epoch": 2.05, + "eval_loss": 2.3396096229553223, + "eval_runtime": 69.2833, + "eval_samples_per_second": 28.867, + "eval_steps_per_second": 1.804, + "step": 13200 + }, + { + "epoch": 2.05, + "learning_rate": 9.508461338193178e-05, + "loss": 2.1109, + "step": 13220 + }, + { + "epoch": 2.06, + "learning_rate": 9.477219474095288e-05, + "loss": 2.0973, + "step": 13240 + }, + { + "epoch": 2.06, + "learning_rate": 9.445977609997396e-05, + "loss": 2.1281, + "step": 13260 + }, + { + "epoch": 2.06, + "learning_rate": 9.414735745899505e-05, + "loss": 2.1216, + "step": 13280 + }, + { + "epoch": 2.07, + "learning_rate": 9.383493881801614e-05, + "loss": 2.1323, + "step": 13300 + }, + { + "epoch": 2.07, + "learning_rate": 9.352252017703723e-05, + "loss": 2.1477, + "step": 13320 + }, + { + "epoch": 2.07, + "learning_rate": 9.321010153605831e-05, + "loss": 2.1309, + "step": 13340 + }, + { + "epoch": 2.08, + "learning_rate": 9.28976828950794e-05, + "loss": 2.0899, + "step": 13360 + }, + { + "epoch": 2.08, + "learning_rate": 9.258526425410049e-05, + "loss": 2.1402, + "step": 13380 + }, + { + "epoch": 2.08, + "learning_rate": 9.227284561312157e-05, + "loss": 2.0768, + "step": 13400 + }, + { + "epoch": 2.08, + "eval_loss": 2.3376858234405518, + "eval_runtime": 69.4568, + "eval_samples_per_second": 28.795, + "eval_steps_per_second": 1.8, + "step": 13400 + }, + { + "epoch": 2.09, + "learning_rate": 9.196042697214267e-05, + "loss": 2.1405, + "step": 13420 + }, + { + "epoch": 2.09, + "learning_rate": 9.164800833116375e-05, + "loss": 2.1118, + "step": 13440 + }, + { + "epoch": 2.09, + "learning_rate": 9.133558969018484e-05, + "loss": 2.1525, + "step": 13460 + }, + { + "epoch": 2.09, + "learning_rate": 9.102317104920593e-05, + "loss": 2.1369, + "step": 13480 + }, + { + "epoch": 2.1, + "learning_rate": 9.071075240822702e-05, + "loss": 2.1683, + "step": 13500 + }, + { + "epoch": 2.1, + "learning_rate": 9.03983337672481e-05, + "loss": 2.1193, + "step": 13520 + }, + { + "epoch": 2.1, + "learning_rate": 9.00859151262692e-05, + "loss": 2.1222, + "step": 13540 + }, + { + "epoch": 2.11, + "learning_rate": 8.977349648529028e-05, + "loss": 2.1461, + "step": 13560 + }, + { + "epoch": 2.11, + "learning_rate": 8.946107784431136e-05, + "loss": 2.1106, + "step": 13580 + }, + { + "epoch": 2.11, + "learning_rate": 8.914865920333246e-05, + "loss": 2.1307, + "step": 13600 + }, + { + "epoch": 2.11, + "eval_loss": 2.3381118774414062, + "eval_runtime": 69.5609, + "eval_samples_per_second": 28.752, + "eval_steps_per_second": 1.797, + "step": 13600 + }, + { + "epoch": 2.12, + "learning_rate": 8.883624056235354e-05, + "loss": 2.1679, + "step": 13620 + }, + { + "epoch": 2.12, + "learning_rate": 8.852382192137464e-05, + "loss": 2.1418, + "step": 13640 + }, + { + "epoch": 2.12, + "learning_rate": 8.821140328039572e-05, + "loss": 2.1238, + "step": 13660 + }, + { + "epoch": 2.13, + "learning_rate": 8.789898463941681e-05, + "loss": 2.0995, + "step": 13680 + }, + { + "epoch": 2.13, + "learning_rate": 8.75865659984379e-05, + "loss": 2.1596, + "step": 13700 + }, + { + "epoch": 2.13, + "learning_rate": 8.727414735745899e-05, + "loss": 2.1478, + "step": 13720 + }, + { + "epoch": 2.14, + "learning_rate": 8.696172871648007e-05, + "loss": 2.1299, + "step": 13740 + }, + { + "epoch": 2.14, + "learning_rate": 8.664931007550115e-05, + "loss": 2.1405, + "step": 13760 + }, + { + "epoch": 2.14, + "learning_rate": 8.633689143452225e-05, + "loss": 2.174, + "step": 13780 + }, + { + "epoch": 2.14, + "learning_rate": 8.602447279354333e-05, + "loss": 2.129, + "step": 13800 + }, + { + "epoch": 2.14, + "eval_loss": 2.337769031524658, + "eval_runtime": 69.7472, + "eval_samples_per_second": 28.675, + "eval_steps_per_second": 1.792, + "step": 13800 + }, + { + "epoch": 2.15, + "learning_rate": 8.571205415256443e-05, + "loss": 2.1368, + "step": 13820 + }, + { + "epoch": 2.15, + "learning_rate": 8.539963551158551e-05, + "loss": 2.1573, + "step": 13840 + }, + { + "epoch": 2.15, + "learning_rate": 8.50872168706066e-05, + "loss": 2.1132, + "step": 13860 + }, + { + "epoch": 2.16, + "learning_rate": 8.477479822962769e-05, + "loss": 2.1131, + "step": 13880 + }, + { + "epoch": 2.16, + "learning_rate": 8.446237958864878e-05, + "loss": 2.1351, + "step": 13900 + }, + { + "epoch": 2.16, + "learning_rate": 8.414996094766986e-05, + "loss": 2.1738, + "step": 13920 + }, + { + "epoch": 2.17, + "learning_rate": 8.383754230669096e-05, + "loss": 2.1551, + "step": 13940 + }, + { + "epoch": 2.17, + "learning_rate": 8.352512366571204e-05, + "loss": 2.1195, + "step": 13960 + }, + { + "epoch": 2.17, + "learning_rate": 8.321270502473312e-05, + "loss": 2.1125, + "step": 13980 + }, + { + "epoch": 2.18, + "learning_rate": 8.290028638375422e-05, + "loss": 2.1549, + "step": 14000 + }, + { + "epoch": 2.18, + "eval_loss": 2.337301731109619, + "eval_runtime": 69.7462, + "eval_samples_per_second": 28.675, + "eval_steps_per_second": 1.792, + "step": 14000 + }, + { + "epoch": 2.18, + "learning_rate": 8.25878677427753e-05, + "loss": 2.1573, + "step": 14020 + }, + { + "epoch": 2.18, + "learning_rate": 8.22754491017964e-05, + "loss": 2.1125, + "step": 14040 + }, + { + "epoch": 2.18, + "learning_rate": 8.196303046081748e-05, + "loss": 2.161, + "step": 14060 + }, + { + "epoch": 2.19, + "learning_rate": 8.165061181983857e-05, + "loss": 2.1511, + "step": 14080 + }, + { + "epoch": 2.19, + "learning_rate": 8.133819317885967e-05, + "loss": 2.1737, + "step": 14100 + }, + { + "epoch": 2.19, + "learning_rate": 8.102577453788076e-05, + "loss": 2.1158, + "step": 14120 + }, + { + "epoch": 2.2, + "learning_rate": 8.071335589690184e-05, + "loss": 2.1398, + "step": 14140 + }, + { + "epoch": 2.2, + "learning_rate": 8.040093725592294e-05, + "loss": 2.1183, + "step": 14160 + }, + { + "epoch": 2.2, + "learning_rate": 8.008851861494402e-05, + "loss": 2.1295, + "step": 14180 + }, + { + "epoch": 2.21, + "learning_rate": 7.977609997396512e-05, + "loss": 2.1416, + "step": 14200 + }, + { + "epoch": 2.21, + "eval_loss": 2.336796760559082, + "eval_runtime": 69.3578, + "eval_samples_per_second": 28.836, + "eval_steps_per_second": 1.802, + "step": 14200 + }, + { + "epoch": 2.21, + "learning_rate": 7.94636813329862e-05, + "loss": 2.1461, + "step": 14220 + }, + { + "epoch": 2.21, + "learning_rate": 7.91512626920073e-05, + "loss": 2.0931, + "step": 14240 + }, + { + "epoch": 2.22, + "learning_rate": 7.883884405102838e-05, + "loss": 2.1341, + "step": 14260 + }, + { + "epoch": 2.22, + "learning_rate": 7.852642541004946e-05, + "loss": 2.1369, + "step": 14280 + }, + { + "epoch": 2.22, + "learning_rate": 7.821400676907055e-05, + "loss": 2.1431, + "step": 14300 + }, + { + "epoch": 2.23, + "learning_rate": 7.790158812809164e-05, + "loss": 2.1508, + "step": 14320 + }, + { + "epoch": 2.23, + "learning_rate": 7.758916948711273e-05, + "loss": 2.1456, + "step": 14340 + }, + { + "epoch": 2.23, + "learning_rate": 7.727675084613381e-05, + "loss": 2.1448, + "step": 14360 + }, + { + "epoch": 2.23, + "learning_rate": 7.696433220515491e-05, + "loss": 2.1637, + "step": 14380 + }, + { + "epoch": 2.24, + "learning_rate": 7.665191356417599e-05, + "loss": 2.114, + "step": 14400 + }, + { + "epoch": 2.24, + "eval_loss": 2.3362655639648438, + "eval_runtime": 69.5792, + "eval_samples_per_second": 28.744, + "eval_steps_per_second": 1.797, + "step": 14400 + }, + { + "epoch": 2.24, + "learning_rate": 7.633949492319709e-05, + "loss": 2.1222, + "step": 14420 + }, + { + "epoch": 2.24, + "learning_rate": 7.602707628221817e-05, + "loss": 2.1776, + "step": 14440 + }, + { + "epoch": 2.25, + "learning_rate": 7.57302785732882e-05, + "loss": 2.1414, + "step": 14460 + }, + { + "epoch": 2.25, + "learning_rate": 7.541785993230929e-05, + "loss": 2.1231, + "step": 14480 + }, + { + "epoch": 2.25, + "learning_rate": 7.510544129133038e-05, + "loss": 2.1345, + "step": 14500 + }, + { + "epoch": 2.26, + "learning_rate": 7.479302265035147e-05, + "loss": 2.1339, + "step": 14520 + }, + { + "epoch": 2.26, + "learning_rate": 7.448060400937255e-05, + "loss": 2.1562, + "step": 14540 + }, + { + "epoch": 2.26, + "learning_rate": 7.416818536839363e-05, + "loss": 2.1649, + "step": 14560 + }, + { + "epoch": 2.27, + "learning_rate": 7.385576672741473e-05, + "loss": 2.1339, + "step": 14580 + }, + { + "epoch": 2.27, + "learning_rate": 7.354334808643581e-05, + "loss": 2.1347, + "step": 14600 + }, + { + "epoch": 2.27, + "eval_loss": 2.335818290710449, + "eval_runtime": 69.5131, + "eval_samples_per_second": 28.772, + "eval_steps_per_second": 1.798, + "step": 14600 + }, + { + "epoch": 2.27, + "learning_rate": 7.323092944545691e-05, + "loss": 2.1078, + "step": 14620 + }, + { + "epoch": 2.28, + "learning_rate": 7.291851080447799e-05, + "loss": 2.1446, + "step": 14640 + }, + { + "epoch": 2.28, + "learning_rate": 7.260609216349908e-05, + "loss": 2.1076, + "step": 14660 + }, + { + "epoch": 2.28, + "learning_rate": 7.229367352252017e-05, + "loss": 2.1548, + "step": 14680 + }, + { + "epoch": 2.28, + "learning_rate": 7.198125488154126e-05, + "loss": 2.1317, + "step": 14700 + }, + { + "epoch": 2.29, + "learning_rate": 7.166883624056234e-05, + "loss": 2.0991, + "step": 14720 + }, + { + "epoch": 2.29, + "learning_rate": 7.135641759958343e-05, + "loss": 2.1507, + "step": 14740 + }, + { + "epoch": 2.29, + "learning_rate": 7.104399895860452e-05, + "loss": 2.1173, + "step": 14760 + }, + { + "epoch": 2.3, + "learning_rate": 7.073158031762562e-05, + "loss": 2.104, + "step": 14780 + }, + { + "epoch": 2.3, + "learning_rate": 7.043478260869565e-05, + "loss": 2.1118, + "step": 14800 + }, + { + "epoch": 2.3, + "eval_loss": 2.334048271179199, + "eval_runtime": 69.3816, + "eval_samples_per_second": 28.826, + "eval_steps_per_second": 1.802, + "step": 14800 + }, + { + "epoch": 2.3, + "learning_rate": 7.012236396771674e-05, + "loss": 2.0738, + "step": 14820 + }, + { + "epoch": 2.31, + "learning_rate": 6.980994532673782e-05, + "loss": 2.1221, + "step": 14840 + }, + { + "epoch": 2.31, + "learning_rate": 6.94975266857589e-05, + "loss": 2.1531, + "step": 14860 + }, + { + "epoch": 2.31, + "learning_rate": 6.918510804478e-05, + "loss": 2.1318, + "step": 14880 + }, + { + "epoch": 2.32, + "learning_rate": 6.887268940380108e-05, + "loss": 2.1251, + "step": 14900 + }, + { + "epoch": 2.32, + "learning_rate": 6.856027076282218e-05, + "loss": 2.1212, + "step": 14920 + }, + { + "epoch": 2.32, + "learning_rate": 6.824785212184326e-05, + "loss": 2.0927, + "step": 14940 + }, + { + "epoch": 2.32, + "learning_rate": 6.793543348086436e-05, + "loss": 2.1277, + "step": 14960 + }, + { + "epoch": 2.33, + "learning_rate": 6.762301483988544e-05, + "loss": 2.156, + "step": 14980 + }, + { + "epoch": 2.33, + "learning_rate": 6.731059619890653e-05, + "loss": 2.1276, + "step": 15000 + }, + { + "epoch": 2.33, + "eval_loss": 2.3340351581573486, + "eval_runtime": 69.2926, + "eval_samples_per_second": 28.863, + "eval_steps_per_second": 1.804, + "step": 15000 + }, + { + "epoch": 2.33, + "learning_rate": 6.699817755792761e-05, + "loss": 2.1313, + "step": 15020 + }, + { + "epoch": 2.34, + "learning_rate": 6.668575891694871e-05, + "loss": 2.1452, + "step": 15040 + }, + { + "epoch": 2.34, + "learning_rate": 6.637334027596979e-05, + "loss": 2.1148, + "step": 15060 + }, + { + "epoch": 2.34, + "learning_rate": 6.606092163499087e-05, + "loss": 2.1193, + "step": 15080 + }, + { + "epoch": 2.35, + "learning_rate": 6.574850299401197e-05, + "loss": 2.1672, + "step": 15100 + }, + { + "epoch": 2.35, + "learning_rate": 6.543608435303305e-05, + "loss": 2.0789, + "step": 15120 + }, + { + "epoch": 2.35, + "learning_rate": 6.512366571205415e-05, + "loss": 2.1438, + "step": 15140 + }, + { + "epoch": 2.36, + "learning_rate": 6.481124707107523e-05, + "loss": 2.1597, + "step": 15160 + }, + { + "epoch": 2.36, + "learning_rate": 6.449882843009632e-05, + "loss": 2.11, + "step": 15180 + }, + { + "epoch": 2.36, + "learning_rate": 6.418640978911742e-05, + "loss": 2.1279, + "step": 15200 + }, + { + "epoch": 2.36, + "eval_loss": 2.3344008922576904, + "eval_runtime": 69.3363, + "eval_samples_per_second": 28.845, + "eval_steps_per_second": 1.803, + "step": 15200 + }, + { + "epoch": 2.37, + "learning_rate": 6.38739911481385e-05, + "loss": 2.1459, + "step": 15220 + }, + { + "epoch": 2.37, + "learning_rate": 6.35615725071596e-05, + "loss": 2.1702, + "step": 15240 + }, + { + "epoch": 2.37, + "learning_rate": 6.324915386618068e-05, + "loss": 2.1262, + "step": 15260 + }, + { + "epoch": 2.37, + "learning_rate": 6.293673522520177e-05, + "loss": 2.0988, + "step": 15280 + }, + { + "epoch": 2.38, + "learning_rate": 6.262431658422286e-05, + "loss": 2.1224, + "step": 15300 + }, + { + "epoch": 2.38, + "learning_rate": 6.231189794324394e-05, + "loss": 2.1102, + "step": 15320 + }, + { + "epoch": 2.38, + "learning_rate": 6.199947930226503e-05, + "loss": 2.1168, + "step": 15340 + }, + { + "epoch": 2.39, + "learning_rate": 6.168706066128611e-05, + "loss": 2.1205, + "step": 15360 + }, + { + "epoch": 2.39, + "learning_rate": 6.137464202030721e-05, + "loss": 2.0855, + "step": 15380 + }, + { + "epoch": 2.39, + "learning_rate": 6.106222337932829e-05, + "loss": 2.1548, + "step": 15400 + }, + { + "epoch": 2.39, + "eval_loss": 2.333451271057129, + "eval_runtime": 69.3334, + "eval_samples_per_second": 28.846, + "eval_steps_per_second": 1.803, + "step": 15400 + }, + { + "epoch": 2.4, + "learning_rate": 6.074980473834938e-05, + "loss": 2.1433, + "step": 15420 + }, + { + "epoch": 2.4, + "learning_rate": 6.043738609737047e-05, + "loss": 2.123, + "step": 15440 + }, + { + "epoch": 2.4, + "learning_rate": 6.012496745639156e-05, + "loss": 2.0965, + "step": 15460 + }, + { + "epoch": 2.41, + "learning_rate": 5.9812548815412647e-05, + "loss": 2.1498, + "step": 15480 + }, + { + "epoch": 2.41, + "learning_rate": 5.9500130174433735e-05, + "loss": 2.1456, + "step": 15500 + }, + { + "epoch": 2.41, + "learning_rate": 5.9187711533454824e-05, + "loss": 2.1295, + "step": 15520 + }, + { + "epoch": 2.41, + "learning_rate": 5.887529289247591e-05, + "loss": 2.108, + "step": 15540 + }, + { + "epoch": 2.42, + "learning_rate": 5.8562874251497e-05, + "loss": 2.1592, + "step": 15560 + }, + { + "epoch": 2.42, + "learning_rate": 5.825045561051809e-05, + "loss": 2.1214, + "step": 15580 + }, + { + "epoch": 2.42, + "learning_rate": 5.793803696953918e-05, + "loss": 2.1561, + "step": 15600 + }, + { + "epoch": 2.42, + "eval_loss": 2.3329403400421143, + "eval_runtime": 69.6034, + "eval_samples_per_second": 28.734, + "eval_steps_per_second": 1.796, + "step": 15600 + }, + { + "epoch": 2.43, + "learning_rate": 5.762561832856026e-05, + "loss": 2.1382, + "step": 15620 + }, + { + "epoch": 2.43, + "learning_rate": 5.731319968758135e-05, + "loss": 2.109, + "step": 15640 + }, + { + "epoch": 2.43, + "learning_rate": 5.700078104660244e-05, + "loss": 2.1283, + "step": 15660 + }, + { + "epoch": 2.44, + "learning_rate": 5.6688362405623526e-05, + "loss": 2.15, + "step": 15680 + }, + { + "epoch": 2.44, + "learning_rate": 5.6375943764644615e-05, + "loss": 2.1125, + "step": 15700 + }, + { + "epoch": 2.44, + "learning_rate": 5.6063525123665704e-05, + "loss": 2.1709, + "step": 15720 + }, + { + "epoch": 2.45, + "learning_rate": 5.575110648268679e-05, + "loss": 2.1622, + "step": 15740 + }, + { + "epoch": 2.45, + "learning_rate": 5.543868784170789e-05, + "loss": 2.0769, + "step": 15760 + }, + { + "epoch": 2.45, + "learning_rate": 5.5126269200728976e-05, + "loss": 2.137, + "step": 15780 + }, + { + "epoch": 2.46, + "learning_rate": 5.4813850559750065e-05, + "loss": 2.1294, + "step": 15800 + }, + { + "epoch": 2.46, + "eval_loss": 2.3324475288391113, + "eval_runtime": 69.559, + "eval_samples_per_second": 28.753, + "eval_steps_per_second": 1.797, + "step": 15800 + }, + { + "epoch": 2.46, + "learning_rate": 5.4501431918771154e-05, + "loss": 2.1425, + "step": 15820 + }, + { + "epoch": 2.46, + "learning_rate": 5.418901327779224e-05, + "loss": 2.128, + "step": 15840 + }, + { + "epoch": 2.46, + "learning_rate": 5.387659463681333e-05, + "loss": 2.1553, + "step": 15860 + }, + { + "epoch": 2.47, + "learning_rate": 5.356417599583441e-05, + "loss": 2.1339, + "step": 15880 + }, + { + "epoch": 2.47, + "learning_rate": 5.32517573548555e-05, + "loss": 2.1536, + "step": 15900 + }, + { + "epoch": 2.47, + "learning_rate": 5.293933871387659e-05, + "loss": 2.1669, + "step": 15920 + }, + { + "epoch": 2.48, + "learning_rate": 5.262692007289768e-05, + "loss": 2.122, + "step": 15940 + }, + { + "epoch": 2.48, + "learning_rate": 5.231450143191877e-05, + "loss": 2.1435, + "step": 15960 + }, + { + "epoch": 2.48, + "learning_rate": 5.2002082790939856e-05, + "loss": 2.1406, + "step": 15980 + }, + { + "epoch": 2.49, + "learning_rate": 5.1689664149960945e-05, + "loss": 2.1174, + "step": 16000 + }, + { + "epoch": 2.49, + "eval_loss": 2.332836866378784, + "eval_runtime": 69.3739, + "eval_samples_per_second": 28.829, + "eval_steps_per_second": 1.802, + "step": 16000 + }, + { + "epoch": 2.49, + "learning_rate": 5.137724550898203e-05, + "loss": 2.1286, + "step": 16020 + }, + { + "epoch": 2.49, + "learning_rate": 5.106482686800312e-05, + "loss": 2.1343, + "step": 16040 + }, + { + "epoch": 2.5, + "learning_rate": 5.075240822702421e-05, + "loss": 2.1134, + "step": 16060 + }, + { + "epoch": 2.5, + "learning_rate": 5.043998958604529e-05, + "loss": 2.1633, + "step": 16080 + }, + { + "epoch": 2.5, + "learning_rate": 5.012757094506638e-05, + "loss": 2.1473, + "step": 16100 + }, + { + "epoch": 2.5, + "learning_rate": 4.981515230408747e-05, + "loss": 2.1535, + "step": 16120 + }, + { + "epoch": 2.51, + "learning_rate": 4.950273366310856e-05, + "loss": 2.112, + "step": 16140 + }, + { + "epoch": 2.51, + "learning_rate": 4.919031502212965e-05, + "loss": 2.1399, + "step": 16160 + }, + { + "epoch": 2.51, + "learning_rate": 4.8877896381150736e-05, + "loss": 2.0913, + "step": 16180 + }, + { + "epoch": 2.52, + "learning_rate": 4.8565477740171824e-05, + "loss": 2.1179, + "step": 16200 + }, + { + "epoch": 2.52, + "eval_loss": 2.332409143447876, + "eval_runtime": 69.3294, + "eval_samples_per_second": 28.848, + "eval_steps_per_second": 1.803, + "step": 16200 + }, + { + "epoch": 2.52, + "learning_rate": 4.825305909919291e-05, + "loss": 2.1756, + "step": 16220 + }, + { + "epoch": 2.52, + "learning_rate": 4.7940640458214e-05, + "loss": 2.1466, + "step": 16240 + }, + { + "epoch": 2.53, + "learning_rate": 4.762822181723509e-05, + "loss": 2.1443, + "step": 16260 + }, + { + "epoch": 2.53, + "learning_rate": 4.731580317625618e-05, + "loss": 2.1207, + "step": 16280 + }, + { + "epoch": 2.53, + "learning_rate": 4.700338453527726e-05, + "loss": 2.1275, + "step": 16300 + }, + { + "epoch": 2.54, + "learning_rate": 4.669096589429835e-05, + "loss": 2.1305, + "step": 16320 + }, + { + "epoch": 2.54, + "learning_rate": 4.6378547253319445e-05, + "loss": 2.134, + "step": 16340 + }, + { + "epoch": 2.54, + "learning_rate": 4.6066128612340534e-05, + "loss": 2.1681, + "step": 16360 + }, + { + "epoch": 2.55, + "learning_rate": 4.575370997136162e-05, + "loss": 2.1627, + "step": 16380 + }, + { + "epoch": 2.55, + "learning_rate": 4.544129133038271e-05, + "loss": 2.1421, + "step": 16400 + }, + { + "epoch": 2.55, + "eval_loss": 2.3318614959716797, + "eval_runtime": 69.3251, + "eval_samples_per_second": 28.85, + "eval_steps_per_second": 1.803, + "step": 16400 + }, + { + "epoch": 2.55, + "learning_rate": 4.51288726894038e-05, + "loss": 2.1225, + "step": 16420 + }, + { + "epoch": 2.55, + "learning_rate": 4.481645404842489e-05, + "loss": 2.156, + "step": 16440 + }, + { + "epoch": 2.56, + "learning_rate": 4.450403540744598e-05, + "loss": 2.1573, + "step": 16460 + }, + { + "epoch": 2.56, + "learning_rate": 4.4191616766467066e-05, + "loss": 2.1295, + "step": 16480 + }, + { + "epoch": 2.56, + "learning_rate": 4.3879198125488154e-05, + "loss": 2.14, + "step": 16500 + }, + { + "epoch": 2.57, + "learning_rate": 4.356677948450924e-05, + "loss": 2.1046, + "step": 16520 + }, + { + "epoch": 2.57, + "learning_rate": 4.3254360843530325e-05, + "loss": 2.1201, + "step": 16540 + }, + { + "epoch": 2.57, + "learning_rate": 4.2941942202551413e-05, + "loss": 2.1767, + "step": 16560 + }, + { + "epoch": 2.58, + "learning_rate": 4.26295235615725e-05, + "loss": 2.1244, + "step": 16580 + }, + { + "epoch": 2.58, + "learning_rate": 4.231710492059359e-05, + "loss": 2.1301, + "step": 16600 + }, + { + "epoch": 2.58, + "eval_loss": 2.331899881362915, + "eval_runtime": 69.3398, + "eval_samples_per_second": 28.843, + "eval_steps_per_second": 1.803, + "step": 16600 + }, + { + "epoch": 2.58, + "learning_rate": 4.200468627961468e-05, + "loss": 2.1022, + "step": 16620 + }, + { + "epoch": 2.59, + "learning_rate": 4.169226763863577e-05, + "loss": 2.1121, + "step": 16640 + }, + { + "epoch": 2.59, + "learning_rate": 4.137984899765686e-05, + "loss": 2.1014, + "step": 16660 + }, + { + "epoch": 2.59, + "learning_rate": 4.1067430356677945e-05, + "loss": 2.1867, + "step": 16680 + }, + { + "epoch": 2.6, + "learning_rate": 4.0755011715699034e-05, + "loss": 2.1055, + "step": 16700 + }, + { + "epoch": 2.6, + "learning_rate": 4.044259307472012e-05, + "loss": 2.1435, + "step": 16720 + }, + { + "epoch": 2.6, + "learning_rate": 4.013017443374121e-05, + "loss": 2.09, + "step": 16740 + }, + { + "epoch": 2.6, + "learning_rate": 3.981775579276229e-05, + "loss": 2.1317, + "step": 16760 + }, + { + "epoch": 2.61, + "learning_rate": 3.950533715178338e-05, + "loss": 2.0683, + "step": 16780 + }, + { + "epoch": 2.61, + "learning_rate": 3.919291851080447e-05, + "loss": 2.1249, + "step": 16800 + }, + { + "epoch": 2.61, + "eval_loss": 2.331566572189331, + "eval_runtime": 69.3154, + "eval_samples_per_second": 28.854, + "eval_steps_per_second": 1.803, + "step": 16800 + }, + { + "epoch": 2.61, + "learning_rate": 3.888049986982556e-05, + "loss": 2.164, + "step": 16820 + }, + { + "epoch": 2.62, + "learning_rate": 3.856808122884665e-05, + "loss": 2.16, + "step": 16840 + }, + { + "epoch": 2.62, + "learning_rate": 3.8255662587867736e-05, + "loss": 2.1603, + "step": 16860 + }, + { + "epoch": 2.62, + "learning_rate": 3.7943243946888825e-05, + "loss": 2.1346, + "step": 16880 + }, + { + "epoch": 2.63, + "learning_rate": 3.7630825305909914e-05, + "loss": 2.1082, + "step": 16900 + }, + { + "epoch": 2.63, + "learning_rate": 3.7318406664931e-05, + "loss": 2.1014, + "step": 16920 + }, + { + "epoch": 2.63, + "learning_rate": 3.700598802395209e-05, + "loss": 2.1088, + "step": 16940 + }, + { + "epoch": 2.64, + "learning_rate": 3.669356938297318e-05, + "loss": 2.0975, + "step": 16960 + }, + { + "epoch": 2.64, + "learning_rate": 3.638115074199427e-05, + "loss": 2.1212, + "step": 16980 + }, + { + "epoch": 2.64, + "learning_rate": 3.606873210101536e-05, + "loss": 2.1226, + "step": 17000 + }, + { + "epoch": 2.64, + "eval_loss": 2.3310983180999756, + "eval_runtime": 69.3945, + "eval_samples_per_second": 28.821, + "eval_steps_per_second": 1.801, + "step": 17000 + }, + { + "epoch": 2.64, + "learning_rate": 3.5756313460036446e-05, + "loss": 2.1318, + "step": 17020 + }, + { + "epoch": 2.65, + "learning_rate": 3.5443894819057534e-05, + "loss": 2.1073, + "step": 17040 + }, + { + "epoch": 2.65, + "learning_rate": 3.513147617807862e-05, + "loss": 2.1411, + "step": 17060 + }, + { + "epoch": 2.65, + "learning_rate": 3.481905753709971e-05, + "loss": 2.0959, + "step": 17080 + }, + { + "epoch": 2.66, + "learning_rate": 3.45066388961208e-05, + "loss": 2.0858, + "step": 17100 + }, + { + "epoch": 2.66, + "learning_rate": 3.419422025514189e-05, + "loss": 2.1174, + "step": 17120 + }, + { + "epoch": 2.66, + "learning_rate": 3.388180161416298e-05, + "loss": 2.1459, + "step": 17140 + }, + { + "epoch": 2.67, + "learning_rate": 3.3569382973184066e-05, + "loss": 2.1425, + "step": 17160 + }, + { + "epoch": 2.67, + "learning_rate": 3.3256964332205155e-05, + "loss": 2.0971, + "step": 17180 + }, + { + "epoch": 2.67, + "learning_rate": 3.2944545691226243e-05, + "loss": 2.1176, + "step": 17200 + }, + { + "epoch": 2.67, + "eval_loss": 2.330962896347046, + "eval_runtime": 69.3407, + "eval_samples_per_second": 28.843, + "eval_steps_per_second": 1.803, + "step": 17200 + }, + { + "epoch": 2.68, + "learning_rate": 3.2632127050247325e-05, + "loss": 2.1471, + "step": 17220 + }, + { + "epoch": 2.68, + "learning_rate": 3.2319708409268414e-05, + "loss": 2.1064, + "step": 17240 + }, + { + "epoch": 2.68, + "learning_rate": 3.20072897682895e-05, + "loss": 2.1347, + "step": 17260 + }, + { + "epoch": 2.69, + "learning_rate": 3.169487112731059e-05, + "loss": 2.142, + "step": 17280 + }, + { + "epoch": 2.69, + "learning_rate": 3.138245248633168e-05, + "loss": 2.1773, + "step": 17300 + }, + { + "epoch": 2.69, + "learning_rate": 3.107003384535277e-05, + "loss": 2.1489, + "step": 17320 + }, + { + "epoch": 2.69, + "learning_rate": 3.075761520437386e-05, + "loss": 2.1257, + "step": 17340 + }, + { + "epoch": 2.7, + "learning_rate": 3.044519656339495e-05, + "loss": 2.1288, + "step": 17360 + }, + { + "epoch": 2.7, + "learning_rate": 3.0132777922416038e-05, + "loss": 2.1258, + "step": 17380 + }, + { + "epoch": 2.7, + "learning_rate": 2.9820359281437123e-05, + "loss": 2.1322, + "step": 17400 + }, + { + "epoch": 2.7, + "eval_loss": 2.3309593200683594, + "eval_runtime": 69.3923, + "eval_samples_per_second": 28.822, + "eval_steps_per_second": 1.801, + "step": 17400 + }, + { + "epoch": 2.71, + "learning_rate": 2.9507940640458212e-05, + "loss": 2.1495, + "step": 17420 + }, + { + "epoch": 2.71, + "learning_rate": 2.91955219994793e-05, + "loss": 2.0843, + "step": 17440 + }, + { + "epoch": 2.71, + "learning_rate": 2.888310335850039e-05, + "loss": 2.11, + "step": 17460 + }, + { + "epoch": 2.72, + "learning_rate": 2.8570684717521478e-05, + "loss": 2.1005, + "step": 17480 + }, + { + "epoch": 2.72, + "learning_rate": 2.827388700859151e-05, + "loss": 2.1302, + "step": 17500 + }, + { + "epoch": 2.72, + "learning_rate": 2.79614683676126e-05, + "loss": 2.1086, + "step": 17520 + }, + { + "epoch": 2.73, + "learning_rate": 2.7649049726633688e-05, + "loss": 2.1302, + "step": 17540 + }, + { + "epoch": 2.73, + "learning_rate": 2.7336631085654777e-05, + "loss": 2.1417, + "step": 17560 + }, + { + "epoch": 2.73, + "learning_rate": 2.7024212444675862e-05, + "loss": 2.1369, + "step": 17580 + }, + { + "epoch": 2.73, + "learning_rate": 2.671179380369695e-05, + "loss": 2.1384, + "step": 17600 + }, + { + "epoch": 2.73, + "eval_loss": 2.33089017868042, + "eval_runtime": 69.3747, + "eval_samples_per_second": 28.829, + "eval_steps_per_second": 1.802, + "step": 17600 + }, + { + "epoch": 2.74, + "learning_rate": 2.639937516271804e-05, + "loss": 2.1243, + "step": 17620 + }, + { + "epoch": 2.74, + "learning_rate": 2.6086956521739128e-05, + "loss": 2.1161, + "step": 17640 + }, + { + "epoch": 2.74, + "learning_rate": 2.5774537880760217e-05, + "loss": 2.1051, + "step": 17660 + }, + { + "epoch": 2.75, + "learning_rate": 2.5462119239781302e-05, + "loss": 2.0762, + "step": 17680 + }, + { + "epoch": 2.75, + "learning_rate": 2.514970059880239e-05, + "loss": 2.1105, + "step": 17700 + }, + { + "epoch": 2.75, + "learning_rate": 2.483728195782348e-05, + "loss": 2.1535, + "step": 17720 + }, + { + "epoch": 2.76, + "learning_rate": 2.452486331684457e-05, + "loss": 2.1706, + "step": 17740 + }, + { + "epoch": 2.76, + "learning_rate": 2.421244467586566e-05, + "loss": 2.0857, + "step": 17760 + }, + { + "epoch": 2.76, + "learning_rate": 2.390002603488675e-05, + "loss": 2.1553, + "step": 17780 + }, + { + "epoch": 2.77, + "learning_rate": 2.3587607393907834e-05, + "loss": 2.0983, + "step": 17800 + }, + { + "epoch": 2.77, + "eval_loss": 2.3304569721221924, + "eval_runtime": 69.35, + "eval_samples_per_second": 28.839, + "eval_steps_per_second": 1.802, + "step": 17800 + }, + { + "epoch": 2.77, + "learning_rate": 2.3275188752928923e-05, + "loss": 2.1212, + "step": 17820 + }, + { + "epoch": 2.77, + "learning_rate": 2.296277011195001e-05, + "loss": 2.0816, + "step": 17840 + }, + { + "epoch": 2.78, + "learning_rate": 2.26503514709711e-05, + "loss": 2.0935, + "step": 17860 + }, + { + "epoch": 2.78, + "learning_rate": 2.233793282999219e-05, + "loss": 2.1576, + "step": 17880 + }, + { + "epoch": 2.78, + "learning_rate": 2.2025514189013274e-05, + "loss": 2.1076, + "step": 17900 + }, + { + "epoch": 2.78, + "learning_rate": 2.1713095548034362e-05, + "loss": 2.1184, + "step": 17920 + }, + { + "epoch": 2.79, + "learning_rate": 2.140067690705545e-05, + "loss": 2.1169, + "step": 17940 + }, + { + "epoch": 2.79, + "learning_rate": 2.108825826607654e-05, + "loss": 2.1442, + "step": 17960 + }, + { + "epoch": 2.79, + "learning_rate": 2.077583962509763e-05, + "loss": 2.1332, + "step": 17980 + }, + { + "epoch": 2.8, + "learning_rate": 2.0463420984118717e-05, + "loss": 2.1553, + "step": 18000 + }, + { + "epoch": 2.8, + "eval_loss": 2.330599069595337, + "eval_runtime": 69.346, + "eval_samples_per_second": 28.841, + "eval_steps_per_second": 1.803, + "step": 18000 + }, + { + "epoch": 2.8, + "learning_rate": 2.0151002343139802e-05, + "loss": 2.1055, + "step": 18020 + }, + { + "epoch": 2.8, + "learning_rate": 1.9838583702160894e-05, + "loss": 2.0778, + "step": 18040 + }, + { + "epoch": 2.81, + "learning_rate": 1.9526165061181983e-05, + "loss": 2.143, + "step": 18060 + }, + { + "epoch": 2.81, + "learning_rate": 1.921374642020307e-05, + "loss": 2.0886, + "step": 18080 + }, + { + "epoch": 2.81, + "learning_rate": 1.890132777922416e-05, + "loss": 2.1236, + "step": 18100 + }, + { + "epoch": 2.82, + "learning_rate": 1.858890913824525e-05, + "loss": 2.1307, + "step": 18120 + }, + { + "epoch": 2.82, + "learning_rate": 1.8276490497266334e-05, + "loss": 2.1192, + "step": 18140 + }, + { + "epoch": 2.82, + "learning_rate": 1.7964071856287423e-05, + "loss": 2.0999, + "step": 18160 + }, + { + "epoch": 2.83, + "learning_rate": 1.765165321530851e-05, + "loss": 2.0792, + "step": 18180 + }, + { + "epoch": 2.83, + "learning_rate": 1.73392345743296e-05, + "loss": 2.1015, + "step": 18200 + }, + { + "epoch": 2.83, + "eval_loss": 2.330050230026245, + "eval_runtime": 69.3278, + "eval_samples_per_second": 28.848, + "eval_steps_per_second": 1.803, + "step": 18200 + }, + { + "epoch": 2.83, + "learning_rate": 1.702681593335069e-05, + "loss": 2.1226, + "step": 18220 + }, + { + "epoch": 2.83, + "learning_rate": 1.6714397292371778e-05, + "loss": 2.0924, + "step": 18240 + }, + { + "epoch": 2.84, + "learning_rate": 1.6401978651392866e-05, + "loss": 2.1272, + "step": 18260 + }, + { + "epoch": 2.84, + "learning_rate": 1.6089560010413955e-05, + "loss": 2.1175, + "step": 18280 + }, + { + "epoch": 2.84, + "learning_rate": 1.577714136943504e-05, + "loss": 2.1396, + "step": 18300 + }, + { + "epoch": 2.85, + "learning_rate": 1.546472272845613e-05, + "loss": 2.1514, + "step": 18320 + }, + { + "epoch": 2.85, + "learning_rate": 1.5152304087477217e-05, + "loss": 2.1257, + "step": 18340 + }, + { + "epoch": 2.85, + "learning_rate": 1.4839885446498306e-05, + "loss": 2.1459, + "step": 18360 + }, + { + "epoch": 2.86, + "learning_rate": 1.4527466805519396e-05, + "loss": 2.09, + "step": 18380 + }, + { + "epoch": 2.86, + "learning_rate": 1.4215048164540483e-05, + "loss": 2.1442, + "step": 18400 + }, + { + "epoch": 2.86, + "eval_loss": 2.330048084259033, + "eval_runtime": 69.2975, + "eval_samples_per_second": 28.861, + "eval_steps_per_second": 1.804, + "step": 18400 + }, + { + "epoch": 2.86, + "learning_rate": 1.3902629523561572e-05, + "loss": 2.1816, + "step": 18420 + }, + { + "epoch": 2.87, + "learning_rate": 1.3590210882582659e-05, + "loss": 2.0965, + "step": 18440 + }, + { + "epoch": 2.87, + "learning_rate": 1.3277792241603748e-05, + "loss": 2.1178, + "step": 18460 + }, + { + "epoch": 2.87, + "learning_rate": 1.2965373600624836e-05, + "loss": 2.1562, + "step": 18480 + }, + { + "epoch": 2.87, + "learning_rate": 1.2652954959645923e-05, + "loss": 2.095, + "step": 18500 + }, + { + "epoch": 2.88, + "learning_rate": 1.2340536318667012e-05, + "loss": 2.1522, + "step": 18520 + }, + { + "epoch": 2.88, + "learning_rate": 1.2028117677688102e-05, + "loss": 2.1729, + "step": 18540 + }, + { + "epoch": 2.88, + "learning_rate": 1.1715699036709189e-05, + "loss": 2.141, + "step": 18560 + }, + { + "epoch": 2.89, + "learning_rate": 1.1403280395730278e-05, + "loss": 2.148, + "step": 18580 + }, + { + "epoch": 2.89, + "learning_rate": 1.1090861754751366e-05, + "loss": 2.1619, + "step": 18600 + }, + { + "epoch": 2.89, + "eval_loss": 2.329728603363037, + "eval_runtime": 69.3412, + "eval_samples_per_second": 28.843, + "eval_steps_per_second": 1.803, + "step": 18600 + }, + { + "epoch": 2.89, + "learning_rate": 1.0778443113772453e-05, + "loss": 2.1199, + "step": 18620 + }, + { + "epoch": 2.9, + "learning_rate": 1.0466024472793542e-05, + "loss": 2.131, + "step": 18640 + }, + { + "epoch": 2.9, + "learning_rate": 1.0153605831814629e-05, + "loss": 2.1512, + "step": 18660 + }, + { + "epoch": 2.9, + "learning_rate": 9.84118719083572e-06, + "loss": 2.1292, + "step": 18680 + }, + { + "epoch": 2.91, + "learning_rate": 9.528768549856808e-06, + "loss": 2.0928, + "step": 18700 + }, + { + "epoch": 2.91, + "learning_rate": 9.216349908877897e-06, + "loss": 2.1168, + "step": 18720 + }, + { + "epoch": 2.91, + "learning_rate": 8.903931267898984e-06, + "loss": 2.1316, + "step": 18740 + }, + { + "epoch": 2.92, + "learning_rate": 8.591512626920072e-06, + "loss": 2.1198, + "step": 18760 + }, + { + "epoch": 2.92, + "learning_rate": 8.279093985941161e-06, + "loss": 2.1226, + "step": 18780 + }, + { + "epoch": 2.92, + "learning_rate": 7.96667534496225e-06, + "loss": 2.1234, + "step": 18800 + }, + { + "epoch": 2.92, + "eval_loss": 2.3294034004211426, + "eval_runtime": 69.3303, + "eval_samples_per_second": 28.847, + "eval_steps_per_second": 1.803, + "step": 18800 + }, + { + "epoch": 2.92, + "learning_rate": 7.654256703983337e-06, + "loss": 2.1251, + "step": 18820 + }, + { + "epoch": 2.93, + "learning_rate": 7.341838063004425e-06, + "loss": 2.1278, + "step": 18840 + }, + { + "epoch": 2.93, + "learning_rate": 7.029419422025514e-06, + "loss": 2.1115, + "step": 18860 + }, + { + "epoch": 2.93, + "learning_rate": 6.717000781046602e-06, + "loss": 2.1468, + "step": 18880 + }, + { + "epoch": 2.94, + "learning_rate": 6.4045821400676894e-06, + "loss": 2.0903, + "step": 18900 + }, + { + "epoch": 2.94, + "learning_rate": 6.092163499088779e-06, + "loss": 2.1271, + "step": 18920 + }, + { + "epoch": 2.94, + "learning_rate": 5.779744858109867e-06, + "loss": 2.1253, + "step": 18940 + }, + { + "epoch": 2.95, + "learning_rate": 5.4673262171309545e-06, + "loss": 2.0903, + "step": 18960 + }, + { + "epoch": 2.95, + "learning_rate": 5.154907576152043e-06, + "loss": 2.1566, + "step": 18980 + }, + { + "epoch": 2.95, + "learning_rate": 4.842488935173132e-06, + "loss": 2.1477, + "step": 19000 + }, + { + "epoch": 2.95, + "eval_loss": 2.3293075561523438, + "eval_runtime": 69.6518, + "eval_samples_per_second": 28.714, + "eval_steps_per_second": 1.795, + "step": 19000 + } + ], + "max_steps": 19305, + "num_train_epochs": 3, + "total_flos": 5.3158443458154725e+19, + "trial_name": null, + "trial_params": null +} diff --git a/adapters/saved_bloomfirefly/checkpoint-19000/training_args.bin b/adapters/saved_bloomfirefly/checkpoint-19000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..87b78c4a804a75de63299eeb2fc899bcd70e34ae --- /dev/null +++ b/adapters/saved_bloomfirefly/checkpoint-19000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:23cad2e050712e91e38e68c484cff1b7c0ef83524df4c9bb983745b616667737 +size 3643 diff --git a/adapters/saved_bloomfirefly/checkpoint-19200/optimizer.pt b/adapters/saved_bloomfirefly/checkpoint-19200/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..8c676294776c1f9496aa2a2d0f96a7076e278e0b --- /dev/null +++ b/adapters/saved_bloomfirefly/checkpoint-19200/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f1c3427788407e1ce1359b461f0119a674fb7334dbee7b9daf995bd3cbbe79ee +size 31492741 diff --git a/adapters/saved_bloomfirefly/checkpoint-19200/pytorch_model.bin b/adapters/saved_bloomfirefly/checkpoint-19200/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..4325014ef41d714ef0f7d34711582d856d9bbde5 --- /dev/null +++ b/adapters/saved_bloomfirefly/checkpoint-19200/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9811ba709c8fbf328aed339d629c33c4363f9c811d3ede0467d1126f2eb183d6 +size 15750885 diff --git a/adapters/saved_bloomfirefly/checkpoint-19200/rng_state_0.pth b/adapters/saved_bloomfirefly/checkpoint-19200/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..d95b3331794e21ba094fdf69f32e525a61d664e9 --- /dev/null +++ b/adapters/saved_bloomfirefly/checkpoint-19200/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:86ba7071d1fde110016d2988a0f876ea0c6cffeec8d6db0e3abbaccd608a9e70 +size 14583 diff --git a/adapters/saved_bloomfirefly/checkpoint-19200/rng_state_1.pth b/adapters/saved_bloomfirefly/checkpoint-19200/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..2d324131097923a335d3acae972fe488a0c88c31 --- /dev/null +++ b/adapters/saved_bloomfirefly/checkpoint-19200/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2962d4d389c2f2459f54619d4a2ee34baba822a12db95e8244c3e2fcdca68231 +size 14583 diff --git a/adapters/saved_bloomfirefly/checkpoint-19200/scaler.pt b/adapters/saved_bloomfirefly/checkpoint-19200/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..4d273a63712a69a514b213f27797af93bfebfabb --- /dev/null +++ b/adapters/saved_bloomfirefly/checkpoint-19200/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d16a690b43366775691bbec0623ff922ac015111105267a6c1b0d11be97cf091 +size 557 diff --git a/adapters/saved_bloomfirefly/checkpoint-19200/scheduler.pt b/adapters/saved_bloomfirefly/checkpoint-19200/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..ebea65229852da0e358514f24b93bf7eb41c5386 --- /dev/null +++ b/adapters/saved_bloomfirefly/checkpoint-19200/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:65a64fd7fd10817e80a553e348a43546f999e56915fca0f6a9615662ac301390 +size 627 diff --git a/adapters/saved_bloomfirefly/checkpoint-19200/trainer_state.json b/adapters/saved_bloomfirefly/checkpoint-19200/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..953b2653e7760ffe217341abd7a809a8a597ec58 --- /dev/null +++ b/adapters/saved_bloomfirefly/checkpoint-19200/trainer_state.json @@ -0,0 +1,6544 @@ +{ + "best_metric": 2.329240560531616, + "best_model_checkpoint": "/mnt/bn/qingyi-bn-lq/llama/saved_bloomfirefly/checkpoint-19200", + "epoch": 2.9836105378171665, + "global_step": 19200, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 5.9999999999999995e-05, + "loss": 2.9733, + "step": 20 + }, + { + "epoch": 0.01, + "learning_rate": 0.00011999999999999999, + "loss": 2.7809, + "step": 40 + }, + { + "epoch": 0.01, + "learning_rate": 0.00017999999999999998, + "loss": 2.6052, + "step": 60 + }, + { + "epoch": 0.01, + "learning_rate": 0.00023999999999999998, + "loss": 2.4925, + "step": 80 + }, + { + "epoch": 0.02, + "learning_rate": 0.0003, + "loss": 2.458, + "step": 100 + }, + { + "epoch": 0.02, + "learning_rate": 0.00029968758135902107, + "loss": 2.4281, + "step": 120 + }, + { + "epoch": 0.02, + "learning_rate": 0.00029937516271804216, + "loss": 2.4178, + "step": 140 + }, + { + "epoch": 0.02, + "learning_rate": 0.00029906274407706326, + "loss": 2.3839, + "step": 160 + }, + { + "epoch": 0.03, + "learning_rate": 0.0002987503254360843, + "loss": 2.3521, + "step": 180 + }, + { + "epoch": 0.03, + "learning_rate": 0.00029843790679510545, + "loss": 2.338, + "step": 200 + }, + { + "epoch": 0.03, + "eval_loss": 2.510117292404175, + "eval_runtime": 69.1765, + "eval_samples_per_second": 28.912, + "eval_steps_per_second": 1.807, + "step": 200 + }, + { + "epoch": 0.03, + "learning_rate": 0.0002981254881541265, + "loss": 2.3401, + "step": 220 + }, + { + "epoch": 0.04, + "learning_rate": 0.0002978130695131476, + "loss": 2.3665, + "step": 240 + }, + { + "epoch": 0.04, + "learning_rate": 0.0002975006508721687, + "loss": 2.3691, + "step": 260 + }, + { + "epoch": 0.04, + "learning_rate": 0.0002971882322311898, + "loss": 2.3514, + "step": 280 + }, + { + "epoch": 0.05, + "learning_rate": 0.0002968758135902109, + "loss": 2.3203, + "step": 300 + }, + { + "epoch": 0.05, + "learning_rate": 0.00029656339494923197, + "loss": 2.3393, + "step": 320 + }, + { + "epoch": 0.05, + "learning_rate": 0.000296250976308253, + "loss": 2.3289, + "step": 340 + }, + { + "epoch": 0.06, + "learning_rate": 0.00029593855766727416, + "loss": 2.3407, + "step": 360 + }, + { + "epoch": 0.06, + "learning_rate": 0.0002956261390262952, + "loss": 2.3163, + "step": 380 + }, + { + "epoch": 0.06, + "learning_rate": 0.0002953137203853163, + "loss": 2.3212, + "step": 400 + }, + { + "epoch": 0.06, + "eval_loss": 2.473245620727539, + "eval_runtime": 69.0219, + "eval_samples_per_second": 28.976, + "eval_steps_per_second": 1.811, + "step": 400 + }, + { + "epoch": 0.07, + "learning_rate": 0.0002950013017443374, + "loss": 2.2927, + "step": 420 + }, + { + "epoch": 0.07, + "learning_rate": 0.0002946888831033585, + "loss": 2.2927, + "step": 440 + }, + { + "epoch": 0.07, + "learning_rate": 0.0002943764644623796, + "loss": 2.29, + "step": 460 + }, + { + "epoch": 0.07, + "learning_rate": 0.0002940640458214007, + "loss": 2.3099, + "step": 480 + }, + { + "epoch": 0.08, + "learning_rate": 0.0002937516271804217, + "loss": 2.3286, + "step": 500 + }, + { + "epoch": 0.08, + "learning_rate": 0.0002934392085394428, + "loss": 2.2928, + "step": 520 + }, + { + "epoch": 0.08, + "learning_rate": 0.0002931267898984639, + "loss": 2.2956, + "step": 540 + }, + { + "epoch": 0.09, + "learning_rate": 0.000292814371257485, + "loss": 2.2627, + "step": 560 + }, + { + "epoch": 0.09, + "learning_rate": 0.0002925019526165061, + "loss": 2.2897, + "step": 580 + }, + { + "epoch": 0.09, + "learning_rate": 0.0002921895339755272, + "loss": 2.2994, + "step": 600 + }, + { + "epoch": 0.09, + "eval_loss": 2.455402374267578, + "eval_runtime": 69.1315, + "eval_samples_per_second": 28.93, + "eval_steps_per_second": 1.808, + "step": 600 + }, + { + "epoch": 0.1, + "learning_rate": 0.00029187711533454824, + "loss": 2.3232, + "step": 620 + }, + { + "epoch": 0.1, + "learning_rate": 0.0002915646966935694, + "loss": 2.2515, + "step": 640 + }, + { + "epoch": 0.1, + "learning_rate": 0.00029125227805259043, + "loss": 2.2856, + "step": 660 + }, + { + "epoch": 0.11, + "learning_rate": 0.0002909398594116115, + "loss": 2.252, + "step": 680 + }, + { + "epoch": 0.11, + "learning_rate": 0.0002906274407706326, + "loss": 2.2891, + "step": 700 + }, + { + "epoch": 0.11, + "learning_rate": 0.0002903150221296537, + "loss": 2.2769, + "step": 720 + }, + { + "epoch": 0.11, + "learning_rate": 0.0002900026034886748, + "loss": 2.2763, + "step": 740 + }, + { + "epoch": 0.12, + "learning_rate": 0.0002896901848476959, + "loss": 2.278, + "step": 760 + }, + { + "epoch": 0.12, + "learning_rate": 0.00028937776620671695, + "loss": 2.3126, + "step": 780 + }, + { + "epoch": 0.12, + "learning_rate": 0.0002890653475657381, + "loss": 2.2698, + "step": 800 + }, + { + "epoch": 0.12, + "eval_loss": 2.4434444904327393, + "eval_runtime": 69.7211, + "eval_samples_per_second": 28.686, + "eval_steps_per_second": 1.793, + "step": 800 + }, + { + "epoch": 0.13, + "learning_rate": 0.00028875292892475914, + "loss": 2.2587, + "step": 820 + }, + { + "epoch": 0.13, + "learning_rate": 0.00028844051028378023, + "loss": 2.2954, + "step": 840 + }, + { + "epoch": 0.13, + "learning_rate": 0.00028812809164280133, + "loss": 2.3102, + "step": 860 + }, + { + "epoch": 0.14, + "learning_rate": 0.0002878156730018224, + "loss": 2.2918, + "step": 880 + }, + { + "epoch": 0.14, + "learning_rate": 0.0002875032543608435, + "loss": 2.2698, + "step": 900 + }, + { + "epoch": 0.14, + "learning_rate": 0.0002871908357198646, + "loss": 2.2514, + "step": 920 + }, + { + "epoch": 0.15, + "learning_rate": 0.00028687841707888566, + "loss": 2.2684, + "step": 940 + }, + { + "epoch": 0.15, + "learning_rate": 0.00028656599843790675, + "loss": 2.2833, + "step": 960 + }, + { + "epoch": 0.15, + "learning_rate": 0.00028625357979692785, + "loss": 2.2709, + "step": 980 + }, + { + "epoch": 0.16, + "learning_rate": 0.00028594116115594894, + "loss": 2.2596, + "step": 1000 + }, + { + "epoch": 0.16, + "eval_loss": 2.436037302017212, + "eval_runtime": 69.727, + "eval_samples_per_second": 28.683, + "eval_steps_per_second": 1.793, + "step": 1000 + }, + { + "epoch": 0.16, + "learning_rate": 0.00028562874251497004, + "loss": 2.2743, + "step": 1020 + }, + { + "epoch": 0.16, + "learning_rate": 0.00028531632387399113, + "loss": 2.23, + "step": 1040 + }, + { + "epoch": 0.16, + "learning_rate": 0.0002850039052330122, + "loss": 2.2723, + "step": 1060 + }, + { + "epoch": 0.17, + "learning_rate": 0.0002846914865920333, + "loss": 2.2585, + "step": 1080 + }, + { + "epoch": 0.17, + "learning_rate": 0.00028437906795105437, + "loss": 2.2463, + "step": 1100 + }, + { + "epoch": 0.17, + "learning_rate": 0.00028406664931007546, + "loss": 2.2264, + "step": 1120 + }, + { + "epoch": 0.18, + "learning_rate": 0.00028375423066909656, + "loss": 2.223, + "step": 1140 + }, + { + "epoch": 0.18, + "learning_rate": 0.00028344181202811765, + "loss": 2.2412, + "step": 1160 + }, + { + "epoch": 0.18, + "learning_rate": 0.00028312939338713875, + "loss": 2.2714, + "step": 1180 + }, + { + "epoch": 0.19, + "learning_rate": 0.00028281697474615984, + "loss": 2.2638, + "step": 1200 + }, + { + "epoch": 0.19, + "eval_loss": 2.4272871017456055, + "eval_runtime": 69.3748, + "eval_samples_per_second": 28.829, + "eval_steps_per_second": 1.802, + "step": 1200 + }, + { + "epoch": 0.19, + "learning_rate": 0.0002825045561051809, + "loss": 2.2303, + "step": 1220 + }, + { + "epoch": 0.19, + "learning_rate": 0.000282192137464202, + "loss": 2.2491, + "step": 1240 + }, + { + "epoch": 0.2, + "learning_rate": 0.00028187971882322313, + "loss": 2.2598, + "step": 1260 + }, + { + "epoch": 0.2, + "learning_rate": 0.00028156730018224417, + "loss": 2.2566, + "step": 1280 + }, + { + "epoch": 0.2, + "learning_rate": 0.00028125488154126527, + "loss": 2.2642, + "step": 1300 + }, + { + "epoch": 0.21, + "learning_rate": 0.00028094246290028636, + "loss": 2.2976, + "step": 1320 + }, + { + "epoch": 0.21, + "learning_rate": 0.00028063004425930746, + "loss": 2.2144, + "step": 1340 + }, + { + "epoch": 0.21, + "learning_rate": 0.00028031762561832855, + "loss": 2.2618, + "step": 1360 + }, + { + "epoch": 0.21, + "learning_rate": 0.00028000520697734965, + "loss": 2.2232, + "step": 1380 + }, + { + "epoch": 0.22, + "learning_rate": 0.0002796927883363707, + "loss": 2.2349, + "step": 1400 + }, + { + "epoch": 0.22, + "eval_loss": 2.422177314758301, + "eval_runtime": 69.7796, + "eval_samples_per_second": 28.662, + "eval_steps_per_second": 1.791, + "step": 1400 + }, + { + "epoch": 0.22, + "learning_rate": 0.00027938036969539184, + "loss": 2.2655, + "step": 1420 + }, + { + "epoch": 0.22, + "learning_rate": 0.0002790679510544129, + "loss": 2.265, + "step": 1440 + }, + { + "epoch": 0.23, + "learning_rate": 0.000278755532413434, + "loss": 2.2552, + "step": 1460 + }, + { + "epoch": 0.23, + "learning_rate": 0.00027844311377245507, + "loss": 2.252, + "step": 1480 + }, + { + "epoch": 0.23, + "learning_rate": 0.00027813069513147617, + "loss": 2.255, + "step": 1500 + }, + { + "epoch": 0.24, + "learning_rate": 0.00027781827649049726, + "loss": 2.1869, + "step": 1520 + }, + { + "epoch": 0.24, + "learning_rate": 0.00027750585784951836, + "loss": 2.2601, + "step": 1540 + }, + { + "epoch": 0.24, + "learning_rate": 0.0002771934392085394, + "loss": 2.2607, + "step": 1560 + }, + { + "epoch": 0.25, + "learning_rate": 0.0002768810205675605, + "loss": 2.2245, + "step": 1580 + }, + { + "epoch": 0.25, + "learning_rate": 0.0002765686019265816, + "loss": 2.2561, + "step": 1600 + }, + { + "epoch": 0.25, + "eval_loss": 2.4173202514648438, + "eval_runtime": 69.7813, + "eval_samples_per_second": 28.661, + "eval_steps_per_second": 1.791, + "step": 1600 + }, + { + "epoch": 0.25, + "learning_rate": 0.0002762561832856027, + "loss": 2.2472, + "step": 1620 + }, + { + "epoch": 0.25, + "learning_rate": 0.0002759437646446238, + "loss": 2.2952, + "step": 1640 + }, + { + "epoch": 0.26, + "learning_rate": 0.0002756313460036449, + "loss": 2.1941, + "step": 1660 + }, + { + "epoch": 0.26, + "learning_rate": 0.0002753189273626659, + "loss": 2.2396, + "step": 1680 + }, + { + "epoch": 0.26, + "learning_rate": 0.00027500650872168707, + "loss": 2.2325, + "step": 1700 + }, + { + "epoch": 0.27, + "learning_rate": 0.0002746940900807081, + "loss": 2.2458, + "step": 1720 + }, + { + "epoch": 0.27, + "learning_rate": 0.0002743816714397292, + "loss": 2.2464, + "step": 1740 + }, + { + "epoch": 0.27, + "learning_rate": 0.0002740692527987503, + "loss": 2.2487, + "step": 1760 + }, + { + "epoch": 0.28, + "learning_rate": 0.0002737568341577714, + "loss": 2.2609, + "step": 1780 + }, + { + "epoch": 0.28, + "learning_rate": 0.0002734444155167925, + "loss": 2.3016, + "step": 1800 + }, + { + "epoch": 0.28, + "eval_loss": 2.4146716594696045, + "eval_runtime": 69.513, + "eval_samples_per_second": 28.772, + "eval_steps_per_second": 1.798, + "step": 1800 + }, + { + "epoch": 0.28, + "learning_rate": 0.0002731319968758136, + "loss": 2.2415, + "step": 1820 + }, + { + "epoch": 0.29, + "learning_rate": 0.0002728195782348346, + "loss": 2.2512, + "step": 1840 + }, + { + "epoch": 0.29, + "learning_rate": 0.0002725071595938558, + "loss": 2.2186, + "step": 1860 + }, + { + "epoch": 0.29, + "learning_rate": 0.0002721947409528768, + "loss": 2.1982, + "step": 1880 + }, + { + "epoch": 0.3, + "learning_rate": 0.0002718823223118979, + "loss": 2.2358, + "step": 1900 + }, + { + "epoch": 0.3, + "learning_rate": 0.000271569903670919, + "loss": 2.2359, + "step": 1920 + }, + { + "epoch": 0.3, + "learning_rate": 0.0002712574850299401, + "loss": 2.2367, + "step": 1940 + }, + { + "epoch": 0.3, + "learning_rate": 0.0002709450663889612, + "loss": 2.2209, + "step": 1960 + }, + { + "epoch": 0.31, + "learning_rate": 0.0002706326477479823, + "loss": 2.2026, + "step": 1980 + }, + { + "epoch": 0.31, + "learning_rate": 0.00027032022910700333, + "loss": 2.2302, + "step": 2000 + }, + { + "epoch": 0.31, + "eval_loss": 2.4096806049346924, + "eval_runtime": 69.8744, + "eval_samples_per_second": 28.623, + "eval_steps_per_second": 1.789, + "step": 2000 + }, + { + "epoch": 0.31, + "learning_rate": 0.00027000781046602443, + "loss": 2.2516, + "step": 2020 + }, + { + "epoch": 0.32, + "learning_rate": 0.0002696953918250455, + "loss": 2.2173, + "step": 2040 + }, + { + "epoch": 0.32, + "learning_rate": 0.0002693829731840666, + "loss": 2.2414, + "step": 2060 + }, + { + "epoch": 0.32, + "learning_rate": 0.0002690705545430877, + "loss": 2.1922, + "step": 2080 + }, + { + "epoch": 0.33, + "learning_rate": 0.0002687581359021088, + "loss": 2.2396, + "step": 2100 + }, + { + "epoch": 0.33, + "learning_rate": 0.00026844571726112985, + "loss": 2.2602, + "step": 2120 + }, + { + "epoch": 0.33, + "learning_rate": 0.000268133298620151, + "loss": 2.2263, + "step": 2140 + }, + { + "epoch": 0.34, + "learning_rate": 0.00026782087997917204, + "loss": 2.2082, + "step": 2160 + }, + { + "epoch": 0.34, + "learning_rate": 0.00026750846133819314, + "loss": 2.2144, + "step": 2180 + }, + { + "epoch": 0.34, + "learning_rate": 0.00026719604269721423, + "loss": 2.2066, + "step": 2200 + }, + { + "epoch": 0.34, + "eval_loss": 2.4065375328063965, + "eval_runtime": 69.933, + "eval_samples_per_second": 28.599, + "eval_steps_per_second": 1.787, + "step": 2200 + }, + { + "epoch": 0.34, + "learning_rate": 0.00026688362405623533, + "loss": 2.2494, + "step": 2220 + }, + { + "epoch": 0.35, + "learning_rate": 0.0002665712054152564, + "loss": 2.2471, + "step": 2240 + }, + { + "epoch": 0.35, + "learning_rate": 0.0002662587867742775, + "loss": 2.2512, + "step": 2260 + }, + { + "epoch": 0.35, + "learning_rate": 0.00026594636813329856, + "loss": 2.2249, + "step": 2280 + }, + { + "epoch": 0.36, + "learning_rate": 0.0002656339494923197, + "loss": 2.2526, + "step": 2300 + }, + { + "epoch": 0.36, + "learning_rate": 0.00026532153085134075, + "loss": 2.2375, + "step": 2320 + }, + { + "epoch": 0.36, + "learning_rate": 0.00026500911221036185, + "loss": 2.169, + "step": 2340 + }, + { + "epoch": 0.37, + "learning_rate": 0.00026469669356938294, + "loss": 2.2206, + "step": 2360 + }, + { + "epoch": 0.37, + "learning_rate": 0.00026438427492840404, + "loss": 2.2284, + "step": 2380 + }, + { + "epoch": 0.37, + "learning_rate": 0.00026407185628742513, + "loss": 2.2116, + "step": 2400 + }, + { + "epoch": 0.37, + "eval_loss": 2.402400255203247, + "eval_runtime": 70.6508, + "eval_samples_per_second": 28.308, + "eval_steps_per_second": 1.769, + "step": 2400 + }, + { + "epoch": 0.38, + "learning_rate": 0.00026375943764644623, + "loss": 2.2228, + "step": 2420 + }, + { + "epoch": 0.38, + "learning_rate": 0.0002634470190054673, + "loss": 2.2264, + "step": 2440 + }, + { + "epoch": 0.38, + "learning_rate": 0.00026313460036448837, + "loss": 2.2212, + "step": 2460 + }, + { + "epoch": 0.39, + "learning_rate": 0.0002628221817235095, + "loss": 2.2164, + "step": 2480 + }, + { + "epoch": 0.39, + "learning_rate": 0.00026250976308253056, + "loss": 2.2523, + "step": 2500 + }, + { + "epoch": 0.39, + "learning_rate": 0.00026219734444155165, + "loss": 2.2272, + "step": 2520 + }, + { + "epoch": 0.39, + "learning_rate": 0.00026188492580057275, + "loss": 2.2381, + "step": 2540 + }, + { + "epoch": 0.4, + "learning_rate": 0.00026157250715959384, + "loss": 2.2149, + "step": 2560 + }, + { + "epoch": 0.4, + "learning_rate": 0.00026126008851861494, + "loss": 2.228, + "step": 2580 + }, + { + "epoch": 0.4, + "learning_rate": 0.00026094766987763603, + "loss": 2.2145, + "step": 2600 + }, + { + "epoch": 0.4, + "eval_loss": 2.399576425552368, + "eval_runtime": 69.9194, + "eval_samples_per_second": 28.604, + "eval_steps_per_second": 1.788, + "step": 2600 + }, + { + "epoch": 0.41, + "learning_rate": 0.0002606352512366571, + "loss": 2.18, + "step": 2620 + }, + { + "epoch": 0.41, + "learning_rate": 0.0002603228325956782, + "loss": 2.1965, + "step": 2640 + }, + { + "epoch": 0.41, + "learning_rate": 0.00026001041395469927, + "loss": 2.178, + "step": 2660 + }, + { + "epoch": 0.42, + "learning_rate": 0.00025969799531372036, + "loss": 2.194, + "step": 2680 + }, + { + "epoch": 0.42, + "learning_rate": 0.00025938557667274146, + "loss": 2.2024, + "step": 2700 + }, + { + "epoch": 0.42, + "learning_rate": 0.00025907315803176255, + "loss": 2.2427, + "step": 2720 + }, + { + "epoch": 0.43, + "learning_rate": 0.00025876073939078365, + "loss": 2.2246, + "step": 2740 + }, + { + "epoch": 0.43, + "learning_rate": 0.00025844832074980474, + "loss": 2.2169, + "step": 2760 + }, + { + "epoch": 0.43, + "learning_rate": 0.0002581359021088258, + "loss": 2.2154, + "step": 2780 + }, + { + "epoch": 0.44, + "learning_rate": 0.0002578234834678469, + "loss": 2.1732, + "step": 2800 + }, + { + "epoch": 0.44, + "eval_loss": 2.3982491493225098, + "eval_runtime": 70.2191, + "eval_samples_per_second": 28.482, + "eval_steps_per_second": 1.78, + "step": 2800 + }, + { + "epoch": 0.44, + "learning_rate": 0.000257511064826868, + "loss": 2.1951, + "step": 2820 + }, + { + "epoch": 0.44, + "learning_rate": 0.00025719864618588907, + "loss": 2.2139, + "step": 2840 + }, + { + "epoch": 0.44, + "learning_rate": 0.00025688622754491017, + "loss": 2.197, + "step": 2860 + }, + { + "epoch": 0.45, + "learning_rate": 0.00025657380890393126, + "loss": 2.2317, + "step": 2880 + }, + { + "epoch": 0.45, + "learning_rate": 0.0002562613902629523, + "loss": 2.2107, + "step": 2900 + }, + { + "epoch": 0.45, + "learning_rate": 0.00025594897162197345, + "loss": 2.2087, + "step": 2920 + }, + { + "epoch": 0.46, + "learning_rate": 0.0002556365529809945, + "loss": 2.2124, + "step": 2940 + }, + { + "epoch": 0.46, + "learning_rate": 0.0002553241343400156, + "loss": 2.1762, + "step": 2960 + }, + { + "epoch": 0.46, + "learning_rate": 0.0002550117156990367, + "loss": 2.2488, + "step": 2980 + }, + { + "epoch": 0.47, + "learning_rate": 0.0002546992970580578, + "loss": 2.2316, + "step": 3000 + }, + { + "epoch": 0.47, + "eval_loss": 2.394296646118164, + "eval_runtime": 70.2494, + "eval_samples_per_second": 28.47, + "eval_steps_per_second": 1.779, + "step": 3000 + }, + { + "epoch": 0.47, + "learning_rate": 0.0002543868784170789, + "loss": 2.2386, + "step": 3020 + }, + { + "epoch": 0.47, + "learning_rate": 0.00025407445977609997, + "loss": 2.224, + "step": 3040 + }, + { + "epoch": 0.48, + "learning_rate": 0.000253762041135121, + "loss": 2.2479, + "step": 3060 + }, + { + "epoch": 0.48, + "learning_rate": 0.0002534496224941421, + "loss": 2.2396, + "step": 3080 + }, + { + "epoch": 0.48, + "learning_rate": 0.0002531372038531632, + "loss": 2.2405, + "step": 3100 + }, + { + "epoch": 0.48, + "learning_rate": 0.0002528247852121843, + "loss": 2.1969, + "step": 3120 + }, + { + "epoch": 0.49, + "learning_rate": 0.0002525123665712054, + "loss": 2.2095, + "step": 3140 + }, + { + "epoch": 0.49, + "learning_rate": 0.0002521999479302265, + "loss": 2.2202, + "step": 3160 + }, + { + "epoch": 0.49, + "learning_rate": 0.0002518875292892476, + "loss": 2.2088, + "step": 3180 + }, + { + "epoch": 0.5, + "learning_rate": 0.0002515751106482687, + "loss": 2.2075, + "step": 3200 + }, + { + "epoch": 0.5, + "eval_loss": 2.3918581008911133, + "eval_runtime": 69.2896, + "eval_samples_per_second": 28.864, + "eval_steps_per_second": 1.804, + "step": 3200 + }, + { + "epoch": 0.5, + "learning_rate": 0.0002512626920072897, + "loss": 2.1993, + "step": 3220 + }, + { + "epoch": 0.5, + "learning_rate": 0.0002509502733663108, + "loss": 2.2406, + "step": 3240 + }, + { + "epoch": 0.51, + "learning_rate": 0.0002506378547253319, + "loss": 2.2352, + "step": 3260 + }, + { + "epoch": 0.51, + "learning_rate": 0.000250325436084353, + "loss": 2.236, + "step": 3280 + }, + { + "epoch": 0.51, + "learning_rate": 0.0002500130174433741, + "loss": 2.1805, + "step": 3300 + }, + { + "epoch": 0.52, + "learning_rate": 0.0002497005988023952, + "loss": 2.2249, + "step": 3320 + }, + { + "epoch": 0.52, + "learning_rate": 0.00024938818016141624, + "loss": 2.2153, + "step": 3340 + }, + { + "epoch": 0.52, + "learning_rate": 0.0002490757615204374, + "loss": 2.2115, + "step": 3360 + }, + { + "epoch": 0.53, + "learning_rate": 0.00024876334287945843, + "loss": 2.2284, + "step": 3380 + }, + { + "epoch": 0.53, + "learning_rate": 0.0002484509242384795, + "loss": 2.184, + "step": 3400 + }, + { + "epoch": 0.53, + "eval_loss": 2.3887791633605957, + "eval_runtime": 69.2387, + "eval_samples_per_second": 28.886, + "eval_steps_per_second": 1.805, + "step": 3400 + }, + { + "epoch": 0.53, + "learning_rate": 0.0002481385055975006, + "loss": 2.2172, + "step": 3420 + }, + { + "epoch": 0.53, + "learning_rate": 0.0002478260869565217, + "loss": 2.2347, + "step": 3440 + }, + { + "epoch": 0.54, + "learning_rate": 0.0002475136683155428, + "loss": 2.2213, + "step": 3460 + }, + { + "epoch": 0.54, + "learning_rate": 0.0002472012496745639, + "loss": 2.2215, + "step": 3480 + }, + { + "epoch": 0.54, + "learning_rate": 0.00024688883103358495, + "loss": 2.2058, + "step": 3500 + }, + { + "epoch": 0.55, + "learning_rate": 0.00024657641239260604, + "loss": 2.1918, + "step": 3520 + }, + { + "epoch": 0.55, + "learning_rate": 0.0002462639937516272, + "loss": 2.2021, + "step": 3540 + }, + { + "epoch": 0.55, + "learning_rate": 0.00024595157511064824, + "loss": 2.1832, + "step": 3560 + }, + { + "epoch": 0.56, + "learning_rate": 0.00024563915646966933, + "loss": 2.2199, + "step": 3580 + }, + { + "epoch": 0.56, + "learning_rate": 0.0002453267378286904, + "loss": 2.1997, + "step": 3600 + }, + { + "epoch": 0.56, + "eval_loss": 2.386540412902832, + "eval_runtime": 69.2123, + "eval_samples_per_second": 28.897, + "eval_steps_per_second": 1.806, + "step": 3600 + }, + { + "epoch": 0.56, + "learning_rate": 0.0002450143191877115, + "loss": 2.2009, + "step": 3620 + }, + { + "epoch": 0.57, + "learning_rate": 0.0002447019005467326, + "loss": 2.2045, + "step": 3640 + }, + { + "epoch": 0.57, + "learning_rate": 0.0002443894819057537, + "loss": 2.2231, + "step": 3660 + }, + { + "epoch": 0.57, + "learning_rate": 0.00024407706326477478, + "loss": 2.211, + "step": 3680 + }, + { + "epoch": 0.57, + "learning_rate": 0.00024376464462379588, + "loss": 2.1904, + "step": 3700 + }, + { + "epoch": 0.58, + "learning_rate": 0.00024345222598281694, + "loss": 2.1492, + "step": 3720 + }, + { + "epoch": 0.58, + "learning_rate": 0.00024313980734183807, + "loss": 2.2368, + "step": 3740 + }, + { + "epoch": 0.58, + "learning_rate": 0.00024282738870085914, + "loss": 2.1753, + "step": 3760 + }, + { + "epoch": 0.59, + "learning_rate": 0.00024251497005988023, + "loss": 2.179, + "step": 3780 + }, + { + "epoch": 0.59, + "learning_rate": 0.0002422025514189013, + "loss": 2.1811, + "step": 3800 + }, + { + "epoch": 0.59, + "eval_loss": 2.3864212036132812, + "eval_runtime": 69.2951, + "eval_samples_per_second": 28.862, + "eval_steps_per_second": 1.804, + "step": 3800 + }, + { + "epoch": 0.59, + "learning_rate": 0.0002418901327779224, + "loss": 2.1496, + "step": 3820 + }, + { + "epoch": 0.6, + "learning_rate": 0.0002415777141369435, + "loss": 2.2071, + "step": 3840 + }, + { + "epoch": 0.6, + "learning_rate": 0.00024126529549596459, + "loss": 2.189, + "step": 3860 + }, + { + "epoch": 0.6, + "learning_rate": 0.00024095287685498565, + "loss": 2.1838, + "step": 3880 + }, + { + "epoch": 0.61, + "learning_rate": 0.00024064045821400675, + "loss": 2.2292, + "step": 3900 + }, + { + "epoch": 0.61, + "learning_rate": 0.00024032803957302782, + "loss": 2.1931, + "step": 3920 + }, + { + "epoch": 0.61, + "learning_rate": 0.00024001562093204894, + "loss": 2.2293, + "step": 3940 + }, + { + "epoch": 0.62, + "learning_rate": 0.00023970320229107, + "loss": 2.2112, + "step": 3960 + }, + { + "epoch": 0.62, + "learning_rate": 0.0002393907836500911, + "loss": 2.1479, + "step": 3980 + }, + { + "epoch": 0.62, + "learning_rate": 0.00023907836500911217, + "loss": 2.1661, + "step": 4000 + }, + { + "epoch": 0.62, + "eval_loss": 2.383505344390869, + "eval_runtime": 69.2876, + "eval_samples_per_second": 28.865, + "eval_steps_per_second": 1.804, + "step": 4000 + }, + { + "epoch": 0.62, + "learning_rate": 0.0002387659463681333, + "loss": 2.1783, + "step": 4020 + }, + { + "epoch": 0.63, + "learning_rate": 0.00023845352772715436, + "loss": 2.1975, + "step": 4040 + }, + { + "epoch": 0.63, + "learning_rate": 0.00023814110908617546, + "loss": 2.2268, + "step": 4060 + }, + { + "epoch": 0.63, + "learning_rate": 0.00023782869044519653, + "loss": 2.1815, + "step": 4080 + }, + { + "epoch": 0.64, + "learning_rate": 0.00023751627180421765, + "loss": 2.2305, + "step": 4100 + }, + { + "epoch": 0.64, + "learning_rate": 0.00023720385316323872, + "loss": 2.2087, + "step": 4120 + }, + { + "epoch": 0.64, + "learning_rate": 0.0002368914345222598, + "loss": 2.2204, + "step": 4140 + }, + { + "epoch": 0.65, + "learning_rate": 0.00023657901588128088, + "loss": 2.2138, + "step": 4160 + }, + { + "epoch": 0.65, + "learning_rate": 0.000236266597240302, + "loss": 2.2071, + "step": 4180 + }, + { + "epoch": 0.65, + "learning_rate": 0.00023595417859932307, + "loss": 2.1728, + "step": 4200 + }, + { + "epoch": 0.65, + "eval_loss": 2.3820013999938965, + "eval_runtime": 69.3049, + "eval_samples_per_second": 28.858, + "eval_steps_per_second": 1.804, + "step": 4200 + }, + { + "epoch": 0.66, + "learning_rate": 0.00023564175995834417, + "loss": 2.182, + "step": 4220 + }, + { + "epoch": 0.66, + "learning_rate": 0.00023532934131736524, + "loss": 2.1948, + "step": 4240 + }, + { + "epoch": 0.66, + "learning_rate": 0.00023501692267638633, + "loss": 2.2178, + "step": 4260 + }, + { + "epoch": 0.67, + "learning_rate": 0.00023470450403540743, + "loss": 2.1979, + "step": 4280 + }, + { + "epoch": 0.67, + "learning_rate": 0.00023439208539442852, + "loss": 2.222, + "step": 4300 + }, + { + "epoch": 0.67, + "learning_rate": 0.0002340796667534496, + "loss": 2.221, + "step": 4320 + }, + { + "epoch": 0.67, + "learning_rate": 0.00023376724811247069, + "loss": 2.208, + "step": 4340 + }, + { + "epoch": 0.68, + "learning_rate": 0.00023345482947149175, + "loss": 2.1502, + "step": 4360 + }, + { + "epoch": 0.68, + "learning_rate": 0.00023314241083051288, + "loss": 2.1628, + "step": 4380 + }, + { + "epoch": 0.68, + "learning_rate": 0.00023282999218953395, + "loss": 2.1933, + "step": 4400 + }, + { + "epoch": 0.68, + "eval_loss": 2.380128860473633, + "eval_runtime": 69.2864, + "eval_samples_per_second": 28.866, + "eval_steps_per_second": 1.804, + "step": 4400 + }, + { + "epoch": 0.69, + "learning_rate": 0.00023251757354855504, + "loss": 2.2204, + "step": 4420 + }, + { + "epoch": 0.69, + "learning_rate": 0.0002322051549075761, + "loss": 2.218, + "step": 4440 + }, + { + "epoch": 0.69, + "learning_rate": 0.00023189273626659723, + "loss": 2.199, + "step": 4460 + }, + { + "epoch": 0.7, + "learning_rate": 0.0002315803176256183, + "loss": 2.1826, + "step": 4480 + }, + { + "epoch": 0.7, + "learning_rate": 0.0002312678989846394, + "loss": 2.174, + "step": 4500 + }, + { + "epoch": 0.7, + "learning_rate": 0.00023095548034366046, + "loss": 2.2011, + "step": 4520 + }, + { + "epoch": 0.71, + "learning_rate": 0.00023064306170268159, + "loss": 2.1951, + "step": 4540 + }, + { + "epoch": 0.71, + "learning_rate": 0.00023033064306170265, + "loss": 2.2189, + "step": 4560 + }, + { + "epoch": 0.71, + "learning_rate": 0.00023001822442072375, + "loss": 2.1891, + "step": 4580 + }, + { + "epoch": 0.71, + "learning_rate": 0.00022970580577974482, + "loss": 2.1873, + "step": 4600 + }, + { + "epoch": 0.71, + "eval_loss": 2.379713296890259, + "eval_runtime": 69.3005, + "eval_samples_per_second": 28.86, + "eval_steps_per_second": 1.804, + "step": 4600 + }, + { + "epoch": 0.72, + "learning_rate": 0.00022939338713876591, + "loss": 2.2191, + "step": 4620 + }, + { + "epoch": 0.72, + "learning_rate": 0.000229080968497787, + "loss": 2.1966, + "step": 4640 + }, + { + "epoch": 0.72, + "learning_rate": 0.0002287685498568081, + "loss": 2.2062, + "step": 4660 + }, + { + "epoch": 0.73, + "learning_rate": 0.00022845613121582917, + "loss": 2.1888, + "step": 4680 + }, + { + "epoch": 0.73, + "learning_rate": 0.00022814371257485027, + "loss": 2.1938, + "step": 4700 + }, + { + "epoch": 0.73, + "learning_rate": 0.0002278312939338714, + "loss": 2.206, + "step": 4720 + }, + { + "epoch": 0.74, + "learning_rate": 0.00022751887529289246, + "loss": 2.1584, + "step": 4740 + }, + { + "epoch": 0.74, + "learning_rate": 0.00022720645665191355, + "loss": 2.1933, + "step": 4760 + }, + { + "epoch": 0.74, + "learning_rate": 0.00022689403801093462, + "loss": 2.2087, + "step": 4780 + }, + { + "epoch": 0.75, + "learning_rate": 0.00022658161936995575, + "loss": 2.2239, + "step": 4800 + }, + { + "epoch": 0.75, + "eval_loss": 2.3774757385253906, + "eval_runtime": 69.3137, + "eval_samples_per_second": 28.854, + "eval_steps_per_second": 1.803, + "step": 4800 + }, + { + "epoch": 0.75, + "learning_rate": 0.00022626920072897681, + "loss": 2.2136, + "step": 4820 + }, + { + "epoch": 0.75, + "learning_rate": 0.0002259567820879979, + "loss": 2.2046, + "step": 4840 + }, + { + "epoch": 0.76, + "learning_rate": 0.00022564436344701898, + "loss": 2.2031, + "step": 4860 + }, + { + "epoch": 0.76, + "learning_rate": 0.0002253319448060401, + "loss": 2.171, + "step": 4880 + }, + { + "epoch": 0.76, + "learning_rate": 0.00022501952616506117, + "loss": 2.2101, + "step": 4900 + }, + { + "epoch": 0.76, + "learning_rate": 0.00022470710752408226, + "loss": 2.1306, + "step": 4920 + }, + { + "epoch": 0.77, + "learning_rate": 0.00022439468888310333, + "loss": 2.1754, + "step": 4940 + }, + { + "epoch": 0.77, + "learning_rate": 0.00022408227024212443, + "loss": 2.1972, + "step": 4960 + }, + { + "epoch": 0.77, + "learning_rate": 0.00022376985160114552, + "loss": 2.2175, + "step": 4980 + }, + { + "epoch": 0.78, + "learning_rate": 0.00022345743296016662, + "loss": 2.139, + "step": 5000 + }, + { + "epoch": 0.78, + "eval_loss": 2.3760337829589844, + "eval_runtime": 69.3092, + "eval_samples_per_second": 28.856, + "eval_steps_per_second": 1.804, + "step": 5000 + }, + { + "epoch": 0.78, + "learning_rate": 0.0002231450143191877, + "loss": 2.1912, + "step": 5020 + }, + { + "epoch": 0.78, + "learning_rate": 0.00022283259567820878, + "loss": 2.2036, + "step": 5040 + }, + { + "epoch": 0.79, + "learning_rate": 0.00022252017703722985, + "loss": 2.1852, + "step": 5060 + }, + { + "epoch": 0.79, + "learning_rate": 0.00022220775839625097, + "loss": 2.1672, + "step": 5080 + }, + { + "epoch": 0.79, + "learning_rate": 0.00022189533975527204, + "loss": 2.1828, + "step": 5100 + }, + { + "epoch": 0.8, + "learning_rate": 0.00022158292111429314, + "loss": 2.1875, + "step": 5120 + }, + { + "epoch": 0.8, + "learning_rate": 0.0002212705024733142, + "loss": 2.1997, + "step": 5140 + }, + { + "epoch": 0.8, + "learning_rate": 0.00022095808383233533, + "loss": 2.2162, + "step": 5160 + }, + { + "epoch": 0.8, + "learning_rate": 0.0002206456651913564, + "loss": 2.2213, + "step": 5180 + }, + { + "epoch": 0.81, + "learning_rate": 0.0002203332465503775, + "loss": 2.1972, + "step": 5200 + }, + { + "epoch": 0.81, + "eval_loss": 2.374734878540039, + "eval_runtime": 69.2582, + "eval_samples_per_second": 28.877, + "eval_steps_per_second": 1.805, + "step": 5200 + }, + { + "epoch": 0.81, + "learning_rate": 0.00022002082790939856, + "loss": 2.175, + "step": 5220 + }, + { + "epoch": 0.81, + "learning_rate": 0.00021970840926841968, + "loss": 2.1951, + "step": 5240 + }, + { + "epoch": 0.82, + "learning_rate": 0.00021939599062744075, + "loss": 2.1493, + "step": 5260 + }, + { + "epoch": 0.82, + "learning_rate": 0.00021908357198646185, + "loss": 2.1611, + "step": 5280 + }, + { + "epoch": 0.82, + "learning_rate": 0.00021877115334548291, + "loss": 2.1621, + "step": 5300 + }, + { + "epoch": 0.83, + "learning_rate": 0.00021845873470450404, + "loss": 2.1875, + "step": 5320 + }, + { + "epoch": 0.83, + "learning_rate": 0.0002181463160635251, + "loss": 2.1733, + "step": 5340 + }, + { + "epoch": 0.83, + "learning_rate": 0.0002178338974225462, + "loss": 2.242, + "step": 5360 + }, + { + "epoch": 0.84, + "learning_rate": 0.00021752147878156727, + "loss": 2.2154, + "step": 5380 + }, + { + "epoch": 0.84, + "learning_rate": 0.00021720906014058836, + "loss": 2.1969, + "step": 5400 + }, + { + "epoch": 0.84, + "eval_loss": 2.372680902481079, + "eval_runtime": 69.283, + "eval_samples_per_second": 28.867, + "eval_steps_per_second": 1.804, + "step": 5400 + }, + { + "epoch": 0.84, + "learning_rate": 0.00021689664149960946, + "loss": 2.1245, + "step": 5420 + }, + { + "epoch": 0.85, + "learning_rate": 0.00021658422285863056, + "loss": 2.2049, + "step": 5440 + }, + { + "epoch": 0.85, + "learning_rate": 0.00021627180421765162, + "loss": 2.1716, + "step": 5460 + }, + { + "epoch": 0.85, + "learning_rate": 0.00021595938557667272, + "loss": 2.1891, + "step": 5480 + }, + { + "epoch": 0.85, + "learning_rate": 0.0002156469669356938, + "loss": 2.1963, + "step": 5500 + }, + { + "epoch": 0.86, + "learning_rate": 0.0002153345482947149, + "loss": 2.1946, + "step": 5520 + }, + { + "epoch": 0.86, + "learning_rate": 0.00021502212965373598, + "loss": 2.1982, + "step": 5540 + }, + { + "epoch": 0.86, + "learning_rate": 0.00021470971101275707, + "loss": 2.1759, + "step": 5560 + }, + { + "epoch": 0.87, + "learning_rate": 0.00021439729237177814, + "loss": 2.1661, + "step": 5580 + }, + { + "epoch": 0.87, + "learning_rate": 0.00021408487373079926, + "loss": 2.2051, + "step": 5600 + }, + { + "epoch": 0.87, + "eval_loss": 2.3719565868377686, + "eval_runtime": 69.321, + "eval_samples_per_second": 28.851, + "eval_steps_per_second": 1.803, + "step": 5600 + }, + { + "epoch": 0.87, + "learning_rate": 0.00021377245508982033, + "loss": 2.1605, + "step": 5620 + }, + { + "epoch": 0.88, + "learning_rate": 0.00021346003644884143, + "loss": 2.1375, + "step": 5640 + }, + { + "epoch": 0.88, + "learning_rate": 0.0002131476178078625, + "loss": 2.1293, + "step": 5660 + }, + { + "epoch": 0.88, + "learning_rate": 0.00021283519916688362, + "loss": 2.2189, + "step": 5680 + }, + { + "epoch": 0.89, + "learning_rate": 0.0002125227805259047, + "loss": 2.1784, + "step": 5700 + }, + { + "epoch": 0.89, + "learning_rate": 0.00021221036188492578, + "loss": 2.1764, + "step": 5720 + }, + { + "epoch": 0.89, + "learning_rate": 0.00021189794324394685, + "loss": 2.1569, + "step": 5740 + }, + { + "epoch": 0.9, + "learning_rate": 0.00021158552460296795, + "loss": 2.1704, + "step": 5760 + }, + { + "epoch": 0.9, + "learning_rate": 0.00021127310596198904, + "loss": 2.1614, + "step": 5780 + }, + { + "epoch": 0.9, + "learning_rate": 0.00021096068732101014, + "loss": 2.2078, + "step": 5800 + }, + { + "epoch": 0.9, + "eval_loss": 2.370939016342163, + "eval_runtime": 69.2728, + "eval_samples_per_second": 28.871, + "eval_steps_per_second": 1.804, + "step": 5800 + }, + { + "epoch": 0.9, + "learning_rate": 0.0002106482686800312, + "loss": 2.198, + "step": 5820 + }, + { + "epoch": 0.91, + "learning_rate": 0.0002103358500390523, + "loss": 2.1735, + "step": 5840 + }, + { + "epoch": 0.91, + "learning_rate": 0.00021002343139807342, + "loss": 2.1936, + "step": 5860 + }, + { + "epoch": 0.91, + "learning_rate": 0.0002097110127570945, + "loss": 2.1559, + "step": 5880 + }, + { + "epoch": 0.92, + "learning_rate": 0.0002093985941161156, + "loss": 2.1856, + "step": 5900 + }, + { + "epoch": 0.92, + "learning_rate": 0.00020908617547513666, + "loss": 2.194, + "step": 5920 + }, + { + "epoch": 0.92, + "learning_rate": 0.00020877375683415778, + "loss": 2.1983, + "step": 5940 + }, + { + "epoch": 0.93, + "learning_rate": 0.00020846133819317885, + "loss": 2.1788, + "step": 5960 + }, + { + "epoch": 0.93, + "learning_rate": 0.00020814891955219994, + "loss": 2.2126, + "step": 5980 + }, + { + "epoch": 0.93, + "learning_rate": 0.000207836500911221, + "loss": 2.1454, + "step": 6000 + }, + { + "epoch": 0.93, + "eval_loss": 2.369137763977051, + "eval_runtime": 69.3036, + "eval_samples_per_second": 28.859, + "eval_steps_per_second": 1.804, + "step": 6000 + }, + { + "epoch": 0.94, + "learning_rate": 0.00020752408227024213, + "loss": 2.1603, + "step": 6020 + }, + { + "epoch": 0.94, + "learning_rate": 0.0002072116636292632, + "loss": 2.2075, + "step": 6040 + }, + { + "epoch": 0.94, + "learning_rate": 0.0002068992449882843, + "loss": 2.1817, + "step": 6060 + }, + { + "epoch": 0.94, + "learning_rate": 0.00020658682634730537, + "loss": 2.1917, + "step": 6080 + }, + { + "epoch": 0.95, + "learning_rate": 0.00020627440770632646, + "loss": 2.1727, + "step": 6100 + }, + { + "epoch": 0.95, + "learning_rate": 0.00020596198906534756, + "loss": 2.1985, + "step": 6120 + }, + { + "epoch": 0.95, + "learning_rate": 0.00020564957042436865, + "loss": 2.1888, + "step": 6140 + }, + { + "epoch": 0.96, + "learning_rate": 0.00020533715178338972, + "loss": 2.1425, + "step": 6160 + }, + { + "epoch": 0.96, + "learning_rate": 0.00020502473314241082, + "loss": 2.1659, + "step": 6180 + }, + { + "epoch": 0.96, + "learning_rate": 0.00020471231450143188, + "loss": 2.1768, + "step": 6200 + }, + { + "epoch": 0.96, + "eval_loss": 2.368589162826538, + "eval_runtime": 69.4033, + "eval_samples_per_second": 28.817, + "eval_steps_per_second": 1.801, + "step": 6200 + }, + { + "epoch": 0.97, + "learning_rate": 0.000204399895860453, + "loss": 2.1744, + "step": 6220 + }, + { + "epoch": 0.97, + "learning_rate": 0.00020408747721947407, + "loss": 2.1484, + "step": 6240 + }, + { + "epoch": 0.97, + "learning_rate": 0.00020377505857849517, + "loss": 2.2154, + "step": 6260 + }, + { + "epoch": 0.98, + "learning_rate": 0.00020346263993751624, + "loss": 2.1358, + "step": 6280 + }, + { + "epoch": 0.98, + "learning_rate": 0.00020315022129653736, + "loss": 2.1809, + "step": 6300 + }, + { + "epoch": 0.98, + "learning_rate": 0.00020283780265555843, + "loss": 2.1813, + "step": 6320 + }, + { + "epoch": 0.99, + "learning_rate": 0.00020252538401457952, + "loss": 2.1903, + "step": 6340 + }, + { + "epoch": 0.99, + "learning_rate": 0.0002022129653736006, + "loss": 2.1971, + "step": 6360 + }, + { + "epoch": 0.99, + "learning_rate": 0.00020190054673262172, + "loss": 2.2041, + "step": 6380 + }, + { + "epoch": 0.99, + "learning_rate": 0.00020158812809164278, + "loss": 2.2169, + "step": 6400 + }, + { + "epoch": 0.99, + "eval_loss": 2.3672330379486084, + "eval_runtime": 69.3516, + "eval_samples_per_second": 28.839, + "eval_steps_per_second": 1.802, + "step": 6400 + }, + { + "epoch": 1.0, + "learning_rate": 0.00020127570945066388, + "loss": 2.2101, + "step": 6420 + }, + { + "epoch": 1.0, + "learning_rate": 0.00020096329080968495, + "loss": 2.1739, + "step": 6440 + }, + { + "epoch": 1.0, + "learning_rate": 0.00020065087216870604, + "loss": 2.1764, + "step": 6460 + }, + { + "epoch": 1.01, + "learning_rate": 0.00020033845352772714, + "loss": 2.1718, + "step": 6480 + }, + { + "epoch": 1.01, + "learning_rate": 0.00020002603488674823, + "loss": 2.1688, + "step": 6500 + }, + { + "epoch": 1.01, + "learning_rate": 0.0001997136162457693, + "loss": 2.1322, + "step": 6520 + }, + { + "epoch": 1.02, + "learning_rate": 0.0001994011976047904, + "loss": 2.1593, + "step": 6540 + }, + { + "epoch": 1.02, + "learning_rate": 0.0001990887789638115, + "loss": 2.179, + "step": 6560 + }, + { + "epoch": 1.02, + "learning_rate": 0.0001987763603228326, + "loss": 2.139, + "step": 6580 + }, + { + "epoch": 1.03, + "learning_rate": 0.00019846394168185366, + "loss": 2.1594, + "step": 6600 + }, + { + "epoch": 1.03, + "eval_loss": 2.367051839828491, + "eval_runtime": 69.3473, + "eval_samples_per_second": 28.84, + "eval_steps_per_second": 1.803, + "step": 6600 + }, + { + "epoch": 1.03, + "learning_rate": 0.00019815152304087475, + "loss": 2.2033, + "step": 6620 + }, + { + "epoch": 1.03, + "learning_rate": 0.00019783910439989582, + "loss": 2.183, + "step": 6640 + }, + { + "epoch": 1.03, + "learning_rate": 0.00019752668575891694, + "loss": 2.1517, + "step": 6660 + }, + { + "epoch": 1.04, + "learning_rate": 0.000197214267117938, + "loss": 2.183, + "step": 6680 + }, + { + "epoch": 1.04, + "learning_rate": 0.0001969018484769591, + "loss": 2.197, + "step": 6700 + }, + { + "epoch": 1.04, + "learning_rate": 0.00019658942983598017, + "loss": 2.1778, + "step": 6720 + }, + { + "epoch": 1.05, + "learning_rate": 0.0001962770111950013, + "loss": 2.1745, + "step": 6740 + }, + { + "epoch": 1.05, + "learning_rate": 0.00019596459255402237, + "loss": 2.1585, + "step": 6760 + }, + { + "epoch": 1.05, + "learning_rate": 0.00019565217391304346, + "loss": 2.1708, + "step": 6780 + }, + { + "epoch": 1.06, + "learning_rate": 0.00019533975527206453, + "loss": 2.1649, + "step": 6800 + }, + { + "epoch": 1.06, + "eval_loss": 2.363710880279541, + "eval_runtime": 69.2642, + "eval_samples_per_second": 28.875, + "eval_steps_per_second": 1.805, + "step": 6800 + }, + { + "epoch": 1.06, + "learning_rate": 0.00019502733663108565, + "loss": 2.1391, + "step": 6820 + }, + { + "epoch": 1.06, + "learning_rate": 0.00019471491799010672, + "loss": 2.1939, + "step": 6840 + }, + { + "epoch": 1.07, + "learning_rate": 0.00019440249934912782, + "loss": 2.1558, + "step": 6860 + }, + { + "epoch": 1.07, + "learning_rate": 0.00019409008070814888, + "loss": 2.173, + "step": 6880 + }, + { + "epoch": 1.07, + "learning_rate": 0.00019377766206716998, + "loss": 2.1821, + "step": 6900 + }, + { + "epoch": 1.08, + "learning_rate": 0.00019346524342619107, + "loss": 2.16, + "step": 6920 + }, + { + "epoch": 1.08, + "learning_rate": 0.00019315282478521217, + "loss": 2.1808, + "step": 6940 + }, + { + "epoch": 1.08, + "learning_rate": 0.00019284040614423324, + "loss": 2.1355, + "step": 6960 + }, + { + "epoch": 1.08, + "learning_rate": 0.00019252798750325433, + "loss": 2.1813, + "step": 6980 + }, + { + "epoch": 1.09, + "learning_rate": 0.00019221556886227546, + "loss": 2.1677, + "step": 7000 + }, + { + "epoch": 1.09, + "eval_loss": 2.3648109436035156, + "eval_runtime": 69.3675, + "eval_samples_per_second": 28.832, + "eval_steps_per_second": 1.802, + "step": 7000 + }, + { + "epoch": 1.09, + "learning_rate": 0.00019190315022129652, + "loss": 2.1479, + "step": 7020 + }, + { + "epoch": 1.09, + "learning_rate": 0.00019159073158031762, + "loss": 2.1852, + "step": 7040 + }, + { + "epoch": 1.1, + "learning_rate": 0.0001912783129393387, + "loss": 2.14, + "step": 7060 + }, + { + "epoch": 1.1, + "learning_rate": 0.0001909658942983598, + "loss": 2.1332, + "step": 7080 + }, + { + "epoch": 1.1, + "learning_rate": 0.00019065347565738088, + "loss": 2.178, + "step": 7100 + }, + { + "epoch": 1.11, + "learning_rate": 0.00019034105701640197, + "loss": 2.1661, + "step": 7120 + }, + { + "epoch": 1.11, + "learning_rate": 0.00019002863837542304, + "loss": 2.1902, + "step": 7140 + }, + { + "epoch": 1.11, + "learning_rate": 0.00018971621973444417, + "loss": 2.1775, + "step": 7160 + }, + { + "epoch": 1.12, + "learning_rate": 0.00018940380109346523, + "loss": 2.2007, + "step": 7180 + }, + { + "epoch": 1.12, + "learning_rate": 0.00018909138245248633, + "loss": 2.2078, + "step": 7200 + }, + { + "epoch": 1.12, + "eval_loss": 2.3642289638519287, + "eval_runtime": 69.5476, + "eval_samples_per_second": 28.757, + "eval_steps_per_second": 1.797, + "step": 7200 + }, + { + "epoch": 1.12, + "learning_rate": 0.0001887789638115074, + "loss": 2.185, + "step": 7220 + }, + { + "epoch": 1.13, + "learning_rate": 0.0001884665451705285, + "loss": 2.1856, + "step": 7240 + }, + { + "epoch": 1.13, + "learning_rate": 0.0001881541265295496, + "loss": 2.2049, + "step": 7260 + }, + { + "epoch": 1.13, + "learning_rate": 0.00018784170788857068, + "loss": 2.1376, + "step": 7280 + }, + { + "epoch": 1.13, + "learning_rate": 0.00018752928924759175, + "loss": 2.1693, + "step": 7300 + }, + { + "epoch": 1.14, + "learning_rate": 0.00018721687060661285, + "loss": 2.1825, + "step": 7320 + }, + { + "epoch": 1.14, + "learning_rate": 0.00018690445196563392, + "loss": 2.1649, + "step": 7340 + }, + { + "epoch": 1.14, + "learning_rate": 0.00018659203332465504, + "loss": 2.1936, + "step": 7360 + }, + { + "epoch": 1.15, + "learning_rate": 0.0001862796146836761, + "loss": 2.143, + "step": 7380 + }, + { + "epoch": 1.15, + "learning_rate": 0.0001859671960426972, + "loss": 2.1617, + "step": 7400 + }, + { + "epoch": 1.15, + "eval_loss": 2.362150192260742, + "eval_runtime": 69.3218, + "eval_samples_per_second": 28.851, + "eval_steps_per_second": 1.803, + "step": 7400 + }, + { + "epoch": 1.15, + "learning_rate": 0.00018565477740171827, + "loss": 2.1555, + "step": 7420 + }, + { + "epoch": 1.16, + "learning_rate": 0.0001853423587607394, + "loss": 2.1639, + "step": 7440 + }, + { + "epoch": 1.16, + "learning_rate": 0.00018502994011976046, + "loss": 2.1678, + "step": 7460 + }, + { + "epoch": 1.16, + "learning_rate": 0.00018471752147878156, + "loss": 2.1775, + "step": 7480 + }, + { + "epoch": 1.17, + "learning_rate": 0.00018440510283780263, + "loss": 2.1784, + "step": 7500 + }, + { + "epoch": 1.17, + "learning_rate": 0.00018409268419682375, + "loss": 2.1499, + "step": 7520 + }, + { + "epoch": 1.17, + "learning_rate": 0.00018378026555584482, + "loss": 2.154, + "step": 7540 + }, + { + "epoch": 1.17, + "learning_rate": 0.0001834678469148659, + "loss": 2.1793, + "step": 7560 + }, + { + "epoch": 1.18, + "learning_rate": 0.00018315542827388698, + "loss": 2.2292, + "step": 7580 + }, + { + "epoch": 1.18, + "learning_rate": 0.00018284300963290808, + "loss": 2.1578, + "step": 7600 + }, + { + "epoch": 1.18, + "eval_loss": 2.3628857135772705, + "eval_runtime": 69.2564, + "eval_samples_per_second": 28.878, + "eval_steps_per_second": 1.805, + "step": 7600 + }, + { + "epoch": 1.18, + "learning_rate": 0.00018253059099192917, + "loss": 2.1494, + "step": 7620 + }, + { + "epoch": 1.19, + "learning_rate": 0.00018221817235095027, + "loss": 2.1669, + "step": 7640 + }, + { + "epoch": 1.19, + "learning_rate": 0.00018190575370997133, + "loss": 2.1447, + "step": 7660 + }, + { + "epoch": 1.19, + "learning_rate": 0.00018159333506899243, + "loss": 2.1663, + "step": 7680 + }, + { + "epoch": 1.2, + "learning_rate": 0.0001812809164280135, + "loss": 2.1871, + "step": 7700 + }, + { + "epoch": 1.2, + "learning_rate": 0.00018096849778703462, + "loss": 2.1338, + "step": 7720 + }, + { + "epoch": 1.2, + "learning_rate": 0.0001806560791460557, + "loss": 2.1767, + "step": 7740 + }, + { + "epoch": 1.21, + "learning_rate": 0.00018034366050507678, + "loss": 2.1694, + "step": 7760 + }, + { + "epoch": 1.21, + "learning_rate": 0.00018003124186409785, + "loss": 2.1674, + "step": 7780 + }, + { + "epoch": 1.21, + "learning_rate": 0.00017971882322311898, + "loss": 2.1863, + "step": 7800 + }, + { + "epoch": 1.21, + "eval_loss": 2.3613035678863525, + "eval_runtime": 69.2881, + "eval_samples_per_second": 28.865, + "eval_steps_per_second": 1.804, + "step": 7800 + }, + { + "epoch": 1.22, + "learning_rate": 0.00017940640458214004, + "loss": 2.1441, + "step": 7820 + }, + { + "epoch": 1.22, + "learning_rate": 0.00017909398594116114, + "loss": 2.1885, + "step": 7840 + }, + { + "epoch": 1.22, + "learning_rate": 0.0001787815673001822, + "loss": 2.1514, + "step": 7860 + }, + { + "epoch": 1.22, + "learning_rate": 0.00017846914865920333, + "loss": 2.2002, + "step": 7880 + }, + { + "epoch": 1.23, + "learning_rate": 0.0001781567300182244, + "loss": 2.1759, + "step": 7900 + }, + { + "epoch": 1.23, + "learning_rate": 0.0001778443113772455, + "loss": 2.1611, + "step": 7920 + }, + { + "epoch": 1.23, + "learning_rate": 0.00017753189273626656, + "loss": 2.1667, + "step": 7940 + }, + { + "epoch": 1.24, + "learning_rate": 0.00017721947409528768, + "loss": 2.1717, + "step": 7960 + }, + { + "epoch": 1.24, + "learning_rate": 0.00017690705545430875, + "loss": 2.1983, + "step": 7980 + }, + { + "epoch": 1.24, + "learning_rate": 0.00017659463681332985, + "loss": 2.2092, + "step": 8000 + }, + { + "epoch": 1.24, + "eval_loss": 2.3608274459838867, + "eval_runtime": 69.3364, + "eval_samples_per_second": 28.845, + "eval_steps_per_second": 1.803, + "step": 8000 + }, + { + "epoch": 1.25, + "learning_rate": 0.00017628221817235092, + "loss": 2.1305, + "step": 8020 + }, + { + "epoch": 1.25, + "learning_rate": 0.000175969799531372, + "loss": 2.1431, + "step": 8040 + }, + { + "epoch": 1.25, + "learning_rate": 0.0001756573808903931, + "loss": 2.1384, + "step": 8060 + }, + { + "epoch": 1.26, + "learning_rate": 0.0001753449622494142, + "loss": 2.2093, + "step": 8080 + }, + { + "epoch": 1.26, + "learning_rate": 0.00017503254360843527, + "loss": 2.1271, + "step": 8100 + }, + { + "epoch": 1.26, + "learning_rate": 0.00017472012496745637, + "loss": 2.1466, + "step": 8120 + }, + { + "epoch": 1.26, + "learning_rate": 0.0001744077063264775, + "loss": 2.1578, + "step": 8140 + }, + { + "epoch": 1.27, + "learning_rate": 0.00017409528768549856, + "loss": 2.1632, + "step": 8160 + }, + { + "epoch": 1.27, + "learning_rate": 0.00017378286904451965, + "loss": 2.1465, + "step": 8180 + }, + { + "epoch": 1.27, + "learning_rate": 0.00017347045040354072, + "loss": 2.2226, + "step": 8200 + }, + { + "epoch": 1.27, + "eval_loss": 2.35835599899292, + "eval_runtime": 69.2657, + "eval_samples_per_second": 28.874, + "eval_steps_per_second": 1.805, + "step": 8200 + }, + { + "epoch": 1.28, + "learning_rate": 0.00017315803176256184, + "loss": 2.1585, + "step": 8220 + }, + { + "epoch": 1.28, + "learning_rate": 0.0001728456131215829, + "loss": 2.1529, + "step": 8240 + }, + { + "epoch": 1.28, + "learning_rate": 0.000172533194480604, + "loss": 2.1663, + "step": 8260 + }, + { + "epoch": 1.29, + "learning_rate": 0.00017222077583962508, + "loss": 2.1422, + "step": 8280 + }, + { + "epoch": 1.29, + "learning_rate": 0.00017190835719864617, + "loss": 2.158, + "step": 8300 + }, + { + "epoch": 1.29, + "learning_rate": 0.00017159593855766727, + "loss": 2.1984, + "step": 8320 + }, + { + "epoch": 1.3, + "learning_rate": 0.00017128351991668836, + "loss": 2.1395, + "step": 8340 + }, + { + "epoch": 1.3, + "learning_rate": 0.00017097110127570943, + "loss": 2.14, + "step": 8360 + }, + { + "epoch": 1.3, + "learning_rate": 0.00017065868263473053, + "loss": 2.1657, + "step": 8380 + }, + { + "epoch": 1.31, + "learning_rate": 0.00017036188492580056, + "loss": 2.167, + "step": 8400 + }, + { + "epoch": 1.31, + "eval_loss": 2.35697603225708, + "eval_runtime": 69.2685, + "eval_samples_per_second": 28.873, + "eval_steps_per_second": 1.805, + "step": 8400 + }, + { + "epoch": 1.31, + "learning_rate": 0.00017004946628482165, + "loss": 2.1396, + "step": 8420 + }, + { + "epoch": 1.31, + "learning_rate": 0.00016973704764384272, + "loss": 2.1777, + "step": 8440 + }, + { + "epoch": 1.31, + "learning_rate": 0.00016942462900286384, + "loss": 2.1366, + "step": 8460 + }, + { + "epoch": 1.32, + "learning_rate": 0.0001691122103618849, + "loss": 2.1625, + "step": 8480 + }, + { + "epoch": 1.32, + "learning_rate": 0.000168799791720906, + "loss": 2.1859, + "step": 8500 + }, + { + "epoch": 1.32, + "learning_rate": 0.00016848737307992707, + "loss": 2.1705, + "step": 8520 + }, + { + "epoch": 1.33, + "learning_rate": 0.0001681749544389482, + "loss": 2.1971, + "step": 8540 + }, + { + "epoch": 1.33, + "learning_rate": 0.00016786253579796927, + "loss": 2.1937, + "step": 8560 + }, + { + "epoch": 1.33, + "learning_rate": 0.00016755011715699036, + "loss": 2.1436, + "step": 8580 + }, + { + "epoch": 1.34, + "learning_rate": 0.00016723769851601143, + "loss": 2.1592, + "step": 8600 + }, + { + "epoch": 1.34, + "eval_loss": 2.3576247692108154, + "eval_runtime": 69.277, + "eval_samples_per_second": 28.87, + "eval_steps_per_second": 1.804, + "step": 8600 + }, + { + "epoch": 1.34, + "learning_rate": 0.00016692527987503252, + "loss": 2.1745, + "step": 8620 + }, + { + "epoch": 1.34, + "learning_rate": 0.00016661286123405362, + "loss": 2.1517, + "step": 8640 + }, + { + "epoch": 1.35, + "learning_rate": 0.00016630044259307472, + "loss": 2.1921, + "step": 8660 + }, + { + "epoch": 1.35, + "learning_rate": 0.00016598802395209578, + "loss": 2.1703, + "step": 8680 + }, + { + "epoch": 1.35, + "learning_rate": 0.00016567560531111688, + "loss": 2.1223, + "step": 8700 + }, + { + "epoch": 1.36, + "learning_rate": 0.00016536318667013795, + "loss": 2.1748, + "step": 8720 + }, + { + "epoch": 1.36, + "learning_rate": 0.00016505076802915907, + "loss": 2.145, + "step": 8740 + }, + { + "epoch": 1.36, + "learning_rate": 0.00016473834938818014, + "loss": 2.1077, + "step": 8760 + }, + { + "epoch": 1.36, + "learning_rate": 0.00016442593074720123, + "loss": 2.1571, + "step": 8780 + }, + { + "epoch": 1.37, + "learning_rate": 0.0001641135121062223, + "loss": 2.1946, + "step": 8800 + }, + { + "epoch": 1.37, + "eval_loss": 2.3559648990631104, + "eval_runtime": 69.3886, + "eval_samples_per_second": 28.823, + "eval_steps_per_second": 1.801, + "step": 8800 + }, + { + "epoch": 1.37, + "learning_rate": 0.00016380109346524342, + "loss": 2.1635, + "step": 8820 + }, + { + "epoch": 1.37, + "learning_rate": 0.0001634886748242645, + "loss": 2.1546, + "step": 8840 + }, + { + "epoch": 1.38, + "learning_rate": 0.0001631762561832856, + "loss": 2.1359, + "step": 8860 + }, + { + "epoch": 1.38, + "learning_rate": 0.00016286383754230666, + "loss": 2.1741, + "step": 8880 + }, + { + "epoch": 1.38, + "learning_rate": 0.00016255141890132778, + "loss": 2.1382, + "step": 8900 + }, + { + "epoch": 1.39, + "learning_rate": 0.00016223900026034885, + "loss": 2.1514, + "step": 8920 + }, + { + "epoch": 1.39, + "learning_rate": 0.00016192658161936994, + "loss": 2.17, + "step": 8940 + }, + { + "epoch": 1.39, + "learning_rate": 0.000161614162978391, + "loss": 2.1784, + "step": 8960 + }, + { + "epoch": 1.4, + "learning_rate": 0.0001613017443374121, + "loss": 2.1869, + "step": 8980 + }, + { + "epoch": 1.4, + "learning_rate": 0.0001609893256964332, + "loss": 2.155, + "step": 9000 + }, + { + "epoch": 1.4, + "eval_loss": 2.3562612533569336, + "eval_runtime": 70.7208, + "eval_samples_per_second": 28.28, + "eval_steps_per_second": 1.768, + "step": 9000 + }, + { + "epoch": 1.4, + "learning_rate": 0.0001606769070554543, + "loss": 2.1467, + "step": 9020 + }, + { + "epoch": 1.4, + "learning_rate": 0.00016036448841447537, + "loss": 2.1662, + "step": 9040 + }, + { + "epoch": 1.41, + "learning_rate": 0.00016005206977349646, + "loss": 2.1928, + "step": 9060 + }, + { + "epoch": 1.41, + "learning_rate": 0.00015973965113251756, + "loss": 2.1084, + "step": 9080 + }, + { + "epoch": 1.41, + "learning_rate": 0.00015942723249153865, + "loss": 2.182, + "step": 9100 + }, + { + "epoch": 1.42, + "learning_rate": 0.00015911481385055975, + "loss": 2.1502, + "step": 9120 + }, + { + "epoch": 1.42, + "learning_rate": 0.00015880239520958082, + "loss": 2.1645, + "step": 9140 + }, + { + "epoch": 1.42, + "learning_rate": 0.00015848997656860194, + "loss": 2.1246, + "step": 9160 + }, + { + "epoch": 1.43, + "learning_rate": 0.000158177557927623, + "loss": 2.1769, + "step": 9180 + }, + { + "epoch": 1.43, + "learning_rate": 0.0001578651392866441, + "loss": 2.1772, + "step": 9200 + }, + { + "epoch": 1.43, + "eval_loss": 2.354128360748291, + "eval_runtime": 70.4883, + "eval_samples_per_second": 28.374, + "eval_steps_per_second": 1.773, + "step": 9200 + }, + { + "epoch": 1.43, + "learning_rate": 0.00015755272064566517, + "loss": 2.1777, + "step": 9220 + }, + { + "epoch": 1.44, + "learning_rate": 0.0001572403020046863, + "loss": 2.1749, + "step": 9240 + }, + { + "epoch": 1.44, + "learning_rate": 0.00015692788336370736, + "loss": 2.1861, + "step": 9260 + }, + { + "epoch": 1.44, + "learning_rate": 0.00015661546472272846, + "loss": 2.1567, + "step": 9280 + }, + { + "epoch": 1.45, + "learning_rate": 0.00015630304608174952, + "loss": 2.1426, + "step": 9300 + }, + { + "epoch": 1.45, + "learning_rate": 0.00015599062744077062, + "loss": 2.1658, + "step": 9320 + }, + { + "epoch": 1.45, + "learning_rate": 0.00015567820879979172, + "loss": 2.1639, + "step": 9340 + }, + { + "epoch": 1.45, + "learning_rate": 0.0001553657901588128, + "loss": 2.1897, + "step": 9360 + }, + { + "epoch": 1.46, + "learning_rate": 0.00015505337151783388, + "loss": 2.1439, + "step": 9380 + }, + { + "epoch": 1.46, + "learning_rate": 0.00015474095287685497, + "loss": 2.1326, + "step": 9400 + }, + { + "epoch": 1.46, + "eval_loss": 2.352673292160034, + "eval_runtime": 69.2871, + "eval_samples_per_second": 28.865, + "eval_steps_per_second": 1.804, + "step": 9400 + }, + { + "epoch": 1.46, + "learning_rate": 0.00015442853423587604, + "loss": 2.139, + "step": 9420 + }, + { + "epoch": 1.47, + "learning_rate": 0.00015411611559489717, + "loss": 2.1087, + "step": 9440 + }, + { + "epoch": 1.47, + "learning_rate": 0.00015380369695391823, + "loss": 2.1528, + "step": 9460 + }, + { + "epoch": 1.47, + "learning_rate": 0.00015349127831293933, + "loss": 2.1866, + "step": 9480 + }, + { + "epoch": 1.48, + "learning_rate": 0.0001531788596719604, + "loss": 2.1436, + "step": 9500 + }, + { + "epoch": 1.48, + "learning_rate": 0.00015286644103098152, + "loss": 2.1699, + "step": 9520 + }, + { + "epoch": 1.48, + "learning_rate": 0.0001525540223900026, + "loss": 2.1415, + "step": 9540 + }, + { + "epoch": 1.49, + "learning_rate": 0.00015224160374902368, + "loss": 2.1092, + "step": 9560 + }, + { + "epoch": 1.49, + "learning_rate": 0.00015192918510804475, + "loss": 2.1422, + "step": 9580 + }, + { + "epoch": 1.49, + "learning_rate": 0.00015161676646706587, + "loss": 2.1677, + "step": 9600 + }, + { + "epoch": 1.49, + "eval_loss": 2.3518292903900146, + "eval_runtime": 69.3029, + "eval_samples_per_second": 28.859, + "eval_steps_per_second": 1.804, + "step": 9600 + }, + { + "epoch": 1.49, + "learning_rate": 0.00015130434782608694, + "loss": 2.1594, + "step": 9620 + }, + { + "epoch": 1.5, + "learning_rate": 0.00015099192918510804, + "loss": 2.1539, + "step": 9640 + }, + { + "epoch": 1.5, + "learning_rate": 0.0001506795105441291, + "loss": 2.1343, + "step": 9660 + }, + { + "epoch": 1.5, + "learning_rate": 0.00015036709190315023, + "loss": 2.1386, + "step": 9680 + }, + { + "epoch": 1.51, + "learning_rate": 0.0001500546732621713, + "loss": 2.1512, + "step": 9700 + }, + { + "epoch": 1.51, + "learning_rate": 0.0001497422546211924, + "loss": 2.1669, + "step": 9720 + }, + { + "epoch": 1.51, + "learning_rate": 0.0001494298359802135, + "loss": 2.158, + "step": 9740 + }, + { + "epoch": 1.52, + "learning_rate": 0.00014911741733923456, + "loss": 2.1643, + "step": 9760 + }, + { + "epoch": 1.52, + "learning_rate": 0.00014880499869825565, + "loss": 2.1612, + "step": 9780 + }, + { + "epoch": 1.52, + "learning_rate": 0.00014849258005727675, + "loss": 2.1441, + "step": 9800 + }, + { + "epoch": 1.52, + "eval_loss": 2.35211181640625, + "eval_runtime": 69.2821, + "eval_samples_per_second": 28.867, + "eval_steps_per_second": 1.804, + "step": 9800 + }, + { + "epoch": 1.53, + "learning_rate": 0.00014818016141629784, + "loss": 2.1704, + "step": 9820 + }, + { + "epoch": 1.53, + "learning_rate": 0.0001478677427753189, + "loss": 2.1546, + "step": 9840 + }, + { + "epoch": 1.53, + "learning_rate": 0.00014755532413434, + "loss": 2.1909, + "step": 9860 + }, + { + "epoch": 1.54, + "learning_rate": 0.0001472429054933611, + "loss": 2.149, + "step": 9880 + }, + { + "epoch": 1.54, + "learning_rate": 0.00014693048685238217, + "loss": 2.1419, + "step": 9900 + }, + { + "epoch": 1.54, + "learning_rate": 0.00014661806821140327, + "loss": 2.1465, + "step": 9920 + }, + { + "epoch": 1.54, + "learning_rate": 0.00014630564957042436, + "loss": 2.1551, + "step": 9940 + }, + { + "epoch": 1.55, + "learning_rate": 0.00014599323092944546, + "loss": 2.1526, + "step": 9960 + }, + { + "epoch": 1.55, + "learning_rate": 0.00014568081228846653, + "loss": 2.1437, + "step": 9980 + }, + { + "epoch": 1.55, + "learning_rate": 0.00014536839364748762, + "loss": 2.1659, + "step": 10000 + }, + { + "epoch": 1.55, + "eval_loss": 2.3507654666900635, + "eval_runtime": 69.2997, + "eval_samples_per_second": 28.86, + "eval_steps_per_second": 1.804, + "step": 10000 + }, + { + "epoch": 1.56, + "learning_rate": 0.00014505597500650872, + "loss": 2.14, + "step": 10020 + }, + { + "epoch": 1.56, + "learning_rate": 0.0001447435563655298, + "loss": 2.1289, + "step": 10040 + }, + { + "epoch": 1.56, + "learning_rate": 0.00014443113772455088, + "loss": 2.1226, + "step": 10060 + }, + { + "epoch": 1.57, + "learning_rate": 0.00014411871908357198, + "loss": 2.1627, + "step": 10080 + }, + { + "epoch": 1.57, + "learning_rate": 0.00014380630044259307, + "loss": 2.1759, + "step": 10100 + }, + { + "epoch": 1.57, + "learning_rate": 0.00014349388180161414, + "loss": 2.1511, + "step": 10120 + }, + { + "epoch": 1.58, + "learning_rate": 0.00014318146316063523, + "loss": 2.1275, + "step": 10140 + }, + { + "epoch": 1.58, + "learning_rate": 0.00014286904451965633, + "loss": 2.1638, + "step": 10160 + }, + { + "epoch": 1.58, + "learning_rate": 0.00014255662587867743, + "loss": 2.1494, + "step": 10180 + }, + { + "epoch": 1.59, + "learning_rate": 0.0001422442072376985, + "loss": 2.1554, + "step": 10200 + }, + { + "epoch": 1.59, + "eval_loss": 2.349271059036255, + "eval_runtime": 69.2627, + "eval_samples_per_second": 28.876, + "eval_steps_per_second": 1.805, + "step": 10200 + }, + { + "epoch": 1.59, + "learning_rate": 0.0001419317885967196, + "loss": 2.133, + "step": 10220 + }, + { + "epoch": 1.59, + "learning_rate": 0.00014161936995574068, + "loss": 2.1515, + "step": 10240 + }, + { + "epoch": 1.59, + "learning_rate": 0.00014130695131476178, + "loss": 2.1262, + "step": 10260 + }, + { + "epoch": 1.6, + "learning_rate": 0.00014099453267378285, + "loss": 2.142, + "step": 10280 + }, + { + "epoch": 1.6, + "learning_rate": 0.00014068211403280394, + "loss": 2.1578, + "step": 10300 + }, + { + "epoch": 1.6, + "learning_rate": 0.00014036969539182504, + "loss": 2.1583, + "step": 10320 + }, + { + "epoch": 1.61, + "learning_rate": 0.0001400572767508461, + "loss": 2.1043, + "step": 10340 + }, + { + "epoch": 1.61, + "learning_rate": 0.0001397448581098672, + "loss": 2.1539, + "step": 10360 + }, + { + "epoch": 1.61, + "learning_rate": 0.0001394324394688883, + "loss": 2.1189, + "step": 10380 + }, + { + "epoch": 1.62, + "learning_rate": 0.0001391200208279094, + "loss": 2.1484, + "step": 10400 + }, + { + "epoch": 1.62, + "eval_loss": 2.3479487895965576, + "eval_runtime": 69.2625, + "eval_samples_per_second": 28.876, + "eval_steps_per_second": 1.805, + "step": 10400 + }, + { + "epoch": 1.62, + "learning_rate": 0.00013880760218693046, + "loss": 2.1993, + "step": 10420 + }, + { + "epoch": 1.62, + "learning_rate": 0.00013849518354595156, + "loss": 2.1869, + "step": 10440 + }, + { + "epoch": 1.63, + "learning_rate": 0.00013818276490497265, + "loss": 2.1644, + "step": 10460 + }, + { + "epoch": 1.63, + "learning_rate": 0.00013787034626399375, + "loss": 2.1751, + "step": 10480 + }, + { + "epoch": 1.63, + "learning_rate": 0.00013755792762301482, + "loss": 2.1416, + "step": 10500 + }, + { + "epoch": 1.63, + "learning_rate": 0.0001372455089820359, + "loss": 2.1809, + "step": 10520 + }, + { + "epoch": 1.64, + "learning_rate": 0.000136933090341057, + "loss": 2.1653, + "step": 10540 + }, + { + "epoch": 1.64, + "learning_rate": 0.00013662067170007808, + "loss": 2.1026, + "step": 10560 + }, + { + "epoch": 1.64, + "learning_rate": 0.00013630825305909917, + "loss": 2.1503, + "step": 10580 + }, + { + "epoch": 1.65, + "learning_rate": 0.00013599583441812027, + "loss": 2.1289, + "step": 10600 + }, + { + "epoch": 1.65, + "eval_loss": 2.3468515872955322, + "eval_runtime": 69.2274, + "eval_samples_per_second": 28.89, + "eval_steps_per_second": 1.806, + "step": 10600 + }, + { + "epoch": 1.65, + "learning_rate": 0.00013568341577714136, + "loss": 2.1929, + "step": 10620 + }, + { + "epoch": 1.65, + "learning_rate": 0.00013537099713616243, + "loss": 2.1547, + "step": 10640 + }, + { + "epoch": 1.66, + "learning_rate": 0.00013505857849518353, + "loss": 2.1571, + "step": 10660 + }, + { + "epoch": 1.66, + "learning_rate": 0.00013474615985420462, + "loss": 2.1649, + "step": 10680 + }, + { + "epoch": 1.66, + "learning_rate": 0.00013443374121322572, + "loss": 2.1647, + "step": 10700 + }, + { + "epoch": 1.67, + "learning_rate": 0.00013412132257224679, + "loss": 2.206, + "step": 10720 + }, + { + "epoch": 1.67, + "learning_rate": 0.00013380890393126788, + "loss": 2.1377, + "step": 10740 + }, + { + "epoch": 1.67, + "learning_rate": 0.00013349648529028898, + "loss": 2.1347, + "step": 10760 + }, + { + "epoch": 1.68, + "learning_rate": 0.00013318406664931004, + "loss": 2.1948, + "step": 10780 + }, + { + "epoch": 1.68, + "learning_rate": 0.00013287164800833114, + "loss": 2.1844, + "step": 10800 + }, + { + "epoch": 1.68, + "eval_loss": 2.347837209701538, + "eval_runtime": 69.2425, + "eval_samples_per_second": 28.884, + "eval_steps_per_second": 1.805, + "step": 10800 + }, + { + "epoch": 1.68, + "learning_rate": 0.00013255922936735224, + "loss": 2.1515, + "step": 10820 + }, + { + "epoch": 1.68, + "learning_rate": 0.00013224681072637333, + "loss": 2.1885, + "step": 10840 + }, + { + "epoch": 1.69, + "learning_rate": 0.00013193439208539443, + "loss": 2.143, + "step": 10860 + }, + { + "epoch": 1.69, + "learning_rate": 0.00013162197344441552, + "loss": 2.1671, + "step": 10880 + }, + { + "epoch": 1.69, + "learning_rate": 0.0001313095548034366, + "loss": 2.1426, + "step": 10900 + }, + { + "epoch": 1.7, + "learning_rate": 0.00013099713616245769, + "loss": 2.1653, + "step": 10920 + }, + { + "epoch": 1.7, + "learning_rate": 0.00013068471752147878, + "loss": 2.1774, + "step": 10940 + }, + { + "epoch": 1.7, + "learning_rate": 0.00013037229888049988, + "loss": 2.1344, + "step": 10960 + }, + { + "epoch": 1.71, + "learning_rate": 0.00013005988023952094, + "loss": 2.1217, + "step": 10980 + }, + { + "epoch": 1.71, + "learning_rate": 0.00012974746159854204, + "loss": 2.1281, + "step": 11000 + }, + { + "epoch": 1.71, + "eval_loss": 2.345808982849121, + "eval_runtime": 69.2499, + "eval_samples_per_second": 28.881, + "eval_steps_per_second": 1.805, + "step": 11000 + }, + { + "epoch": 1.71, + "learning_rate": 0.00012943504295756314, + "loss": 2.1459, + "step": 11020 + }, + { + "epoch": 1.72, + "learning_rate": 0.0001291226243165842, + "loss": 2.1294, + "step": 11040 + }, + { + "epoch": 1.72, + "learning_rate": 0.0001288102056756053, + "loss": 2.1455, + "step": 11060 + }, + { + "epoch": 1.72, + "learning_rate": 0.0001284977870346264, + "loss": 2.1219, + "step": 11080 + }, + { + "epoch": 1.72, + "learning_rate": 0.0001281853683936475, + "loss": 2.1696, + "step": 11100 + }, + { + "epoch": 1.73, + "learning_rate": 0.00012787294975266856, + "loss": 2.1474, + "step": 11120 + }, + { + "epoch": 1.73, + "learning_rate": 0.00012756053111168965, + "loss": 2.1436, + "step": 11140 + }, + { + "epoch": 1.73, + "learning_rate": 0.00012724811247071075, + "loss": 2.1785, + "step": 11160 + }, + { + "epoch": 1.74, + "learning_rate": 0.00012693569382973184, + "loss": 2.1677, + "step": 11180 + }, + { + "epoch": 1.74, + "learning_rate": 0.0001266232751887529, + "loss": 2.1564, + "step": 11200 + }, + { + "epoch": 1.74, + "eval_loss": 2.3451294898986816, + "eval_runtime": 69.2454, + "eval_samples_per_second": 28.883, + "eval_steps_per_second": 1.805, + "step": 11200 + }, + { + "epoch": 1.74, + "learning_rate": 0.000126310856547774, + "loss": 2.1793, + "step": 11220 + }, + { + "epoch": 1.75, + "learning_rate": 0.0001259984379067951, + "loss": 2.1583, + "step": 11240 + }, + { + "epoch": 1.75, + "learning_rate": 0.00012568601926581617, + "loss": 2.1482, + "step": 11260 + }, + { + "epoch": 1.75, + "learning_rate": 0.00012537360062483727, + "loss": 2.1393, + "step": 11280 + }, + { + "epoch": 1.76, + "learning_rate": 0.00012506118198385836, + "loss": 2.1586, + "step": 11300 + }, + { + "epoch": 1.76, + "learning_rate": 0.00012474876334287946, + "loss": 2.1533, + "step": 11320 + }, + { + "epoch": 1.76, + "learning_rate": 0.00012443634470190053, + "loss": 2.1516, + "step": 11340 + }, + { + "epoch": 1.77, + "learning_rate": 0.00012412392606092162, + "loss": 2.1184, + "step": 11360 + }, + { + "epoch": 1.77, + "learning_rate": 0.00012381150741994272, + "loss": 2.1162, + "step": 11380 + }, + { + "epoch": 1.77, + "learning_rate": 0.0001234990887789638, + "loss": 2.1588, + "step": 11400 + }, + { + "epoch": 1.77, + "eval_loss": 2.3451669216156006, + "eval_runtime": 69.2383, + "eval_samples_per_second": 28.886, + "eval_steps_per_second": 1.805, + "step": 11400 + }, + { + "epoch": 1.77, + "learning_rate": 0.00012318667013798488, + "loss": 2.1588, + "step": 11420 + }, + { + "epoch": 1.78, + "learning_rate": 0.00012287425149700598, + "loss": 2.1463, + "step": 11440 + }, + { + "epoch": 1.78, + "learning_rate": 0.00012256183285602707, + "loss": 2.1498, + "step": 11460 + }, + { + "epoch": 1.78, + "learning_rate": 0.00012224941421504814, + "loss": 2.1663, + "step": 11480 + }, + { + "epoch": 1.79, + "learning_rate": 0.00012193699557406924, + "loss": 2.1306, + "step": 11500 + }, + { + "epoch": 1.79, + "learning_rate": 0.00012162457693309033, + "loss": 2.1542, + "step": 11520 + }, + { + "epoch": 1.79, + "learning_rate": 0.00012131215829211141, + "loss": 2.1513, + "step": 11540 + }, + { + "epoch": 1.8, + "learning_rate": 0.00012099973965113251, + "loss": 2.2031, + "step": 11560 + }, + { + "epoch": 1.8, + "learning_rate": 0.00012068732101015359, + "loss": 2.1438, + "step": 11580 + }, + { + "epoch": 1.8, + "learning_rate": 0.00012037490236917469, + "loss": 2.1431, + "step": 11600 + }, + { + "epoch": 1.8, + "eval_loss": 2.3447554111480713, + "eval_runtime": 69.2865, + "eval_samples_per_second": 28.866, + "eval_steps_per_second": 1.804, + "step": 11600 + }, + { + "epoch": 1.81, + "learning_rate": 0.00012006248372819577, + "loss": 2.1272, + "step": 11620 + }, + { + "epoch": 1.81, + "learning_rate": 0.00011975006508721686, + "loss": 2.1584, + "step": 11640 + }, + { + "epoch": 1.81, + "learning_rate": 0.00011943764644623794, + "loss": 2.128, + "step": 11660 + }, + { + "epoch": 1.82, + "learning_rate": 0.00011912522780525903, + "loss": 2.1461, + "step": 11680 + }, + { + "epoch": 1.82, + "learning_rate": 0.00011881280916428012, + "loss": 2.1411, + "step": 11700 + }, + { + "epoch": 1.82, + "learning_rate": 0.0001185003905233012, + "loss": 2.1592, + "step": 11720 + }, + { + "epoch": 1.82, + "learning_rate": 0.0001181879718823223, + "loss": 2.1642, + "step": 11740 + }, + { + "epoch": 1.83, + "learning_rate": 0.00011787555324134338, + "loss": 2.1914, + "step": 11760 + }, + { + "epoch": 1.83, + "learning_rate": 0.00011756313460036448, + "loss": 2.1612, + "step": 11780 + }, + { + "epoch": 1.83, + "learning_rate": 0.00011725071595938556, + "loss": 2.1452, + "step": 11800 + }, + { + "epoch": 1.83, + "eval_loss": 2.3442630767822266, + "eval_runtime": 69.2459, + "eval_samples_per_second": 28.883, + "eval_steps_per_second": 1.805, + "step": 11800 + }, + { + "epoch": 1.84, + "learning_rate": 0.00011693829731840665, + "loss": 2.1453, + "step": 11820 + }, + { + "epoch": 1.84, + "learning_rate": 0.00011662587867742774, + "loss": 2.1251, + "step": 11840 + }, + { + "epoch": 1.84, + "learning_rate": 0.00011631346003644882, + "loss": 2.1412, + "step": 11860 + }, + { + "epoch": 1.85, + "learning_rate": 0.00011600104139546991, + "loss": 2.1033, + "step": 11880 + }, + { + "epoch": 1.85, + "learning_rate": 0.000115688622754491, + "loss": 2.1219, + "step": 11900 + }, + { + "epoch": 1.85, + "learning_rate": 0.00011537620411351209, + "loss": 2.1831, + "step": 11920 + }, + { + "epoch": 1.86, + "learning_rate": 0.00011506378547253317, + "loss": 2.1434, + "step": 11940 + }, + { + "epoch": 1.86, + "learning_rate": 0.00011475136683155427, + "loss": 2.1439, + "step": 11960 + }, + { + "epoch": 1.86, + "learning_rate": 0.00011443894819057536, + "loss": 2.1377, + "step": 11980 + }, + { + "epoch": 1.86, + "learning_rate": 0.00011412652954959646, + "loss": 2.1345, + "step": 12000 + }, + { + "epoch": 1.86, + "eval_loss": 2.342855453491211, + "eval_runtime": 69.2714, + "eval_samples_per_second": 28.872, + "eval_steps_per_second": 1.804, + "step": 12000 + }, + { + "epoch": 1.87, + "learning_rate": 0.00011381411090861754, + "loss": 2.1527, + "step": 12020 + }, + { + "epoch": 1.87, + "learning_rate": 0.00011350169226763864, + "loss": 2.1737, + "step": 12040 + }, + { + "epoch": 1.87, + "learning_rate": 0.00011318927362665972, + "loss": 2.137, + "step": 12060 + }, + { + "epoch": 1.88, + "learning_rate": 0.00011287685498568081, + "loss": 2.1616, + "step": 12080 + }, + { + "epoch": 1.88, + "learning_rate": 0.0001125644363447019, + "loss": 2.1688, + "step": 12100 + }, + { + "epoch": 1.88, + "learning_rate": 0.00011225201770372299, + "loss": 2.1746, + "step": 12120 + }, + { + "epoch": 1.89, + "learning_rate": 0.00011193959906274407, + "loss": 2.1552, + "step": 12140 + }, + { + "epoch": 1.89, + "learning_rate": 0.00011162718042176515, + "loss": 2.1643, + "step": 12160 + }, + { + "epoch": 1.89, + "learning_rate": 0.00011131476178078625, + "loss": 2.1494, + "step": 12180 + }, + { + "epoch": 1.9, + "learning_rate": 0.00011100234313980733, + "loss": 2.1112, + "step": 12200 + }, + { + "epoch": 1.9, + "eval_loss": 2.34304141998291, + "eval_runtime": 72.1422, + "eval_samples_per_second": 27.723, + "eval_steps_per_second": 1.733, + "step": 12200 + }, + { + "epoch": 1.9, + "learning_rate": 0.00011068992449882843, + "loss": 2.1505, + "step": 12220 + }, + { + "epoch": 1.9, + "learning_rate": 0.00011037750585784951, + "loss": 2.1722, + "step": 12240 + }, + { + "epoch": 1.91, + "learning_rate": 0.0001100650872168706, + "loss": 2.1582, + "step": 12260 + }, + { + "epoch": 1.91, + "learning_rate": 0.00010975266857589169, + "loss": 2.1806, + "step": 12280 + }, + { + "epoch": 1.91, + "learning_rate": 0.00010944024993491278, + "loss": 2.1508, + "step": 12300 + }, + { + "epoch": 1.91, + "learning_rate": 0.00010912783129393386, + "loss": 2.1654, + "step": 12320 + }, + { + "epoch": 1.92, + "learning_rate": 0.00010881541265295496, + "loss": 2.131, + "step": 12340 + }, + { + "epoch": 1.92, + "learning_rate": 0.00010850299401197604, + "loss": 2.1301, + "step": 12360 + }, + { + "epoch": 1.92, + "learning_rate": 0.00010819057537099712, + "loss": 2.1312, + "step": 12380 + }, + { + "epoch": 1.93, + "learning_rate": 0.00010787815673001822, + "loss": 2.1301, + "step": 12400 + }, + { + "epoch": 1.93, + "eval_loss": 2.3404922485351562, + "eval_runtime": 71.3367, + "eval_samples_per_second": 28.036, + "eval_steps_per_second": 1.752, + "step": 12400 + }, + { + "epoch": 1.93, + "learning_rate": 0.00010758135902108825, + "loss": 2.1398, + "step": 12420 + }, + { + "epoch": 1.93, + "learning_rate": 0.00010726894038010933, + "loss": 2.1449, + "step": 12440 + }, + { + "epoch": 1.94, + "learning_rate": 0.00010695652173913043, + "loss": 2.1498, + "step": 12460 + }, + { + "epoch": 1.94, + "learning_rate": 0.00010664410309815151, + "loss": 2.1484, + "step": 12480 + }, + { + "epoch": 1.94, + "learning_rate": 0.0001063316844571726, + "loss": 2.1705, + "step": 12500 + }, + { + "epoch": 1.95, + "learning_rate": 0.00010601926581619368, + "loss": 2.1236, + "step": 12520 + }, + { + "epoch": 1.95, + "learning_rate": 0.00010570684717521478, + "loss": 2.1435, + "step": 12540 + }, + { + "epoch": 1.95, + "learning_rate": 0.00010539442853423586, + "loss": 2.1656, + "step": 12560 + }, + { + "epoch": 1.95, + "learning_rate": 0.00010508200989325696, + "loss": 2.1459, + "step": 12580 + }, + { + "epoch": 1.96, + "learning_rate": 0.00010476959125227804, + "loss": 2.1392, + "step": 12600 + }, + { + "epoch": 1.96, + "eval_loss": 2.3410892486572266, + "eval_runtime": 72.1407, + "eval_samples_per_second": 27.724, + "eval_steps_per_second": 1.733, + "step": 12600 + }, + { + "epoch": 1.96, + "learning_rate": 0.00010445717261129913, + "loss": 2.1399, + "step": 12620 + }, + { + "epoch": 1.96, + "learning_rate": 0.00010414475397032022, + "loss": 2.1979, + "step": 12640 + }, + { + "epoch": 1.97, + "learning_rate": 0.0001038323353293413, + "loss": 2.1596, + "step": 12660 + }, + { + "epoch": 1.97, + "learning_rate": 0.0001035199166883624, + "loss": 2.1817, + "step": 12680 + }, + { + "epoch": 1.97, + "learning_rate": 0.00010320749804738348, + "loss": 2.0972, + "step": 12700 + }, + { + "epoch": 1.98, + "learning_rate": 0.00010289507940640457, + "loss": 2.1293, + "step": 12720 + }, + { + "epoch": 1.98, + "learning_rate": 0.00010258266076542565, + "loss": 2.1362, + "step": 12740 + }, + { + "epoch": 1.98, + "learning_rate": 0.00010227024212444675, + "loss": 2.1474, + "step": 12760 + }, + { + "epoch": 1.99, + "learning_rate": 0.00010195782348346783, + "loss": 2.2004, + "step": 12780 + }, + { + "epoch": 1.99, + "learning_rate": 0.00010164540484248893, + "loss": 2.1221, + "step": 12800 + }, + { + "epoch": 1.99, + "eval_loss": 2.340029716491699, + "eval_runtime": 72.0796, + "eval_samples_per_second": 27.747, + "eval_steps_per_second": 1.734, + "step": 12800 + }, + { + "epoch": 1.99, + "learning_rate": 0.00010133298620151001, + "loss": 2.1782, + "step": 12820 + }, + { + "epoch": 2.0, + "learning_rate": 0.00010102056756053109, + "loss": 2.1358, + "step": 12840 + }, + { + "epoch": 2.0, + "learning_rate": 0.00010070814891955218, + "loss": 2.122, + "step": 12860 + }, + { + "epoch": 2.0, + "learning_rate": 0.00010039573027857327, + "loss": 2.1494, + "step": 12880 + }, + { + "epoch": 2.0, + "learning_rate": 0.00010008331163759436, + "loss": 2.1522, + "step": 12900 + }, + { + "epoch": 2.01, + "learning_rate": 9.977089299661544e-05, + "loss": 2.1241, + "step": 12920 + }, + { + "epoch": 2.01, + "learning_rate": 9.945847435563654e-05, + "loss": 2.1456, + "step": 12940 + }, + { + "epoch": 2.01, + "learning_rate": 9.914605571465763e-05, + "loss": 2.1495, + "step": 12960 + }, + { + "epoch": 2.02, + "learning_rate": 9.883363707367873e-05, + "loss": 2.1734, + "step": 12980 + }, + { + "epoch": 2.02, + "learning_rate": 9.852121843269981e-05, + "loss": 2.1711, + "step": 13000 + }, + { + "epoch": 2.02, + "eval_loss": 2.339312791824341, + "eval_runtime": 69.2994, + "eval_samples_per_second": 28.86, + "eval_steps_per_second": 1.804, + "step": 13000 + }, + { + "epoch": 2.02, + "learning_rate": 9.820879979172091e-05, + "loss": 2.1483, + "step": 13020 + }, + { + "epoch": 2.03, + "learning_rate": 9.789638115074199e-05, + "loss": 2.124, + "step": 13040 + }, + { + "epoch": 2.03, + "learning_rate": 9.758396250976308e-05, + "loss": 2.1337, + "step": 13060 + }, + { + "epoch": 2.03, + "learning_rate": 9.727154386878417e-05, + "loss": 2.137, + "step": 13080 + }, + { + "epoch": 2.04, + "learning_rate": 9.695912522780526e-05, + "loss": 2.1225, + "step": 13100 + }, + { + "epoch": 2.04, + "learning_rate": 9.664670658682634e-05, + "loss": 2.1384, + "step": 13120 + }, + { + "epoch": 2.04, + "learning_rate": 9.633428794584743e-05, + "loss": 2.1052, + "step": 13140 + }, + { + "epoch": 2.05, + "learning_rate": 9.602186930486852e-05, + "loss": 2.1489, + "step": 13160 + }, + { + "epoch": 2.05, + "learning_rate": 9.57094506638896e-05, + "loss": 2.1154, + "step": 13180 + }, + { + "epoch": 2.05, + "learning_rate": 9.53970320229107e-05, + "loss": 2.1476, + "step": 13200 + }, + { + "epoch": 2.05, + "eval_loss": 2.3396096229553223, + "eval_runtime": 69.2833, + "eval_samples_per_second": 28.867, + "eval_steps_per_second": 1.804, + "step": 13200 + }, + { + "epoch": 2.05, + "learning_rate": 9.508461338193178e-05, + "loss": 2.1109, + "step": 13220 + }, + { + "epoch": 2.06, + "learning_rate": 9.477219474095288e-05, + "loss": 2.0973, + "step": 13240 + }, + { + "epoch": 2.06, + "learning_rate": 9.445977609997396e-05, + "loss": 2.1281, + "step": 13260 + }, + { + "epoch": 2.06, + "learning_rate": 9.414735745899505e-05, + "loss": 2.1216, + "step": 13280 + }, + { + "epoch": 2.07, + "learning_rate": 9.383493881801614e-05, + "loss": 2.1323, + "step": 13300 + }, + { + "epoch": 2.07, + "learning_rate": 9.352252017703723e-05, + "loss": 2.1477, + "step": 13320 + }, + { + "epoch": 2.07, + "learning_rate": 9.321010153605831e-05, + "loss": 2.1309, + "step": 13340 + }, + { + "epoch": 2.08, + "learning_rate": 9.28976828950794e-05, + "loss": 2.0899, + "step": 13360 + }, + { + "epoch": 2.08, + "learning_rate": 9.258526425410049e-05, + "loss": 2.1402, + "step": 13380 + }, + { + "epoch": 2.08, + "learning_rate": 9.227284561312157e-05, + "loss": 2.0768, + "step": 13400 + }, + { + "epoch": 2.08, + "eval_loss": 2.3376858234405518, + "eval_runtime": 69.4568, + "eval_samples_per_second": 28.795, + "eval_steps_per_second": 1.8, + "step": 13400 + }, + { + "epoch": 2.09, + "learning_rate": 9.196042697214267e-05, + "loss": 2.1405, + "step": 13420 + }, + { + "epoch": 2.09, + "learning_rate": 9.164800833116375e-05, + "loss": 2.1118, + "step": 13440 + }, + { + "epoch": 2.09, + "learning_rate": 9.133558969018484e-05, + "loss": 2.1525, + "step": 13460 + }, + { + "epoch": 2.09, + "learning_rate": 9.102317104920593e-05, + "loss": 2.1369, + "step": 13480 + }, + { + "epoch": 2.1, + "learning_rate": 9.071075240822702e-05, + "loss": 2.1683, + "step": 13500 + }, + { + "epoch": 2.1, + "learning_rate": 9.03983337672481e-05, + "loss": 2.1193, + "step": 13520 + }, + { + "epoch": 2.1, + "learning_rate": 9.00859151262692e-05, + "loss": 2.1222, + "step": 13540 + }, + { + "epoch": 2.11, + "learning_rate": 8.977349648529028e-05, + "loss": 2.1461, + "step": 13560 + }, + { + "epoch": 2.11, + "learning_rate": 8.946107784431136e-05, + "loss": 2.1106, + "step": 13580 + }, + { + "epoch": 2.11, + "learning_rate": 8.914865920333246e-05, + "loss": 2.1307, + "step": 13600 + }, + { + "epoch": 2.11, + "eval_loss": 2.3381118774414062, + "eval_runtime": 69.5609, + "eval_samples_per_second": 28.752, + "eval_steps_per_second": 1.797, + "step": 13600 + }, + { + "epoch": 2.12, + "learning_rate": 8.883624056235354e-05, + "loss": 2.1679, + "step": 13620 + }, + { + "epoch": 2.12, + "learning_rate": 8.852382192137464e-05, + "loss": 2.1418, + "step": 13640 + }, + { + "epoch": 2.12, + "learning_rate": 8.821140328039572e-05, + "loss": 2.1238, + "step": 13660 + }, + { + "epoch": 2.13, + "learning_rate": 8.789898463941681e-05, + "loss": 2.0995, + "step": 13680 + }, + { + "epoch": 2.13, + "learning_rate": 8.75865659984379e-05, + "loss": 2.1596, + "step": 13700 + }, + { + "epoch": 2.13, + "learning_rate": 8.727414735745899e-05, + "loss": 2.1478, + "step": 13720 + }, + { + "epoch": 2.14, + "learning_rate": 8.696172871648007e-05, + "loss": 2.1299, + "step": 13740 + }, + { + "epoch": 2.14, + "learning_rate": 8.664931007550115e-05, + "loss": 2.1405, + "step": 13760 + }, + { + "epoch": 2.14, + "learning_rate": 8.633689143452225e-05, + "loss": 2.174, + "step": 13780 + }, + { + "epoch": 2.14, + "learning_rate": 8.602447279354333e-05, + "loss": 2.129, + "step": 13800 + }, + { + "epoch": 2.14, + "eval_loss": 2.337769031524658, + "eval_runtime": 69.7472, + "eval_samples_per_second": 28.675, + "eval_steps_per_second": 1.792, + "step": 13800 + }, + { + "epoch": 2.15, + "learning_rate": 8.571205415256443e-05, + "loss": 2.1368, + "step": 13820 + }, + { + "epoch": 2.15, + "learning_rate": 8.539963551158551e-05, + "loss": 2.1573, + "step": 13840 + }, + { + "epoch": 2.15, + "learning_rate": 8.50872168706066e-05, + "loss": 2.1132, + "step": 13860 + }, + { + "epoch": 2.16, + "learning_rate": 8.477479822962769e-05, + "loss": 2.1131, + "step": 13880 + }, + { + "epoch": 2.16, + "learning_rate": 8.446237958864878e-05, + "loss": 2.1351, + "step": 13900 + }, + { + "epoch": 2.16, + "learning_rate": 8.414996094766986e-05, + "loss": 2.1738, + "step": 13920 + }, + { + "epoch": 2.17, + "learning_rate": 8.383754230669096e-05, + "loss": 2.1551, + "step": 13940 + }, + { + "epoch": 2.17, + "learning_rate": 8.352512366571204e-05, + "loss": 2.1195, + "step": 13960 + }, + { + "epoch": 2.17, + "learning_rate": 8.321270502473312e-05, + "loss": 2.1125, + "step": 13980 + }, + { + "epoch": 2.18, + "learning_rate": 8.290028638375422e-05, + "loss": 2.1549, + "step": 14000 + }, + { + "epoch": 2.18, + "eval_loss": 2.337301731109619, + "eval_runtime": 69.7462, + "eval_samples_per_second": 28.675, + "eval_steps_per_second": 1.792, + "step": 14000 + }, + { + "epoch": 2.18, + "learning_rate": 8.25878677427753e-05, + "loss": 2.1573, + "step": 14020 + }, + { + "epoch": 2.18, + "learning_rate": 8.22754491017964e-05, + "loss": 2.1125, + "step": 14040 + }, + { + "epoch": 2.18, + "learning_rate": 8.196303046081748e-05, + "loss": 2.161, + "step": 14060 + }, + { + "epoch": 2.19, + "learning_rate": 8.165061181983857e-05, + "loss": 2.1511, + "step": 14080 + }, + { + "epoch": 2.19, + "learning_rate": 8.133819317885967e-05, + "loss": 2.1737, + "step": 14100 + }, + { + "epoch": 2.19, + "learning_rate": 8.102577453788076e-05, + "loss": 2.1158, + "step": 14120 + }, + { + "epoch": 2.2, + "learning_rate": 8.071335589690184e-05, + "loss": 2.1398, + "step": 14140 + }, + { + "epoch": 2.2, + "learning_rate": 8.040093725592294e-05, + "loss": 2.1183, + "step": 14160 + }, + { + "epoch": 2.2, + "learning_rate": 8.008851861494402e-05, + "loss": 2.1295, + "step": 14180 + }, + { + "epoch": 2.21, + "learning_rate": 7.977609997396512e-05, + "loss": 2.1416, + "step": 14200 + }, + { + "epoch": 2.21, + "eval_loss": 2.336796760559082, + "eval_runtime": 69.3578, + "eval_samples_per_second": 28.836, + "eval_steps_per_second": 1.802, + "step": 14200 + }, + { + "epoch": 2.21, + "learning_rate": 7.94636813329862e-05, + "loss": 2.1461, + "step": 14220 + }, + { + "epoch": 2.21, + "learning_rate": 7.91512626920073e-05, + "loss": 2.0931, + "step": 14240 + }, + { + "epoch": 2.22, + "learning_rate": 7.883884405102838e-05, + "loss": 2.1341, + "step": 14260 + }, + { + "epoch": 2.22, + "learning_rate": 7.852642541004946e-05, + "loss": 2.1369, + "step": 14280 + }, + { + "epoch": 2.22, + "learning_rate": 7.821400676907055e-05, + "loss": 2.1431, + "step": 14300 + }, + { + "epoch": 2.23, + "learning_rate": 7.790158812809164e-05, + "loss": 2.1508, + "step": 14320 + }, + { + "epoch": 2.23, + "learning_rate": 7.758916948711273e-05, + "loss": 2.1456, + "step": 14340 + }, + { + "epoch": 2.23, + "learning_rate": 7.727675084613381e-05, + "loss": 2.1448, + "step": 14360 + }, + { + "epoch": 2.23, + "learning_rate": 7.696433220515491e-05, + "loss": 2.1637, + "step": 14380 + }, + { + "epoch": 2.24, + "learning_rate": 7.665191356417599e-05, + "loss": 2.114, + "step": 14400 + }, + { + "epoch": 2.24, + "eval_loss": 2.3362655639648438, + "eval_runtime": 69.5792, + "eval_samples_per_second": 28.744, + "eval_steps_per_second": 1.797, + "step": 14400 + }, + { + "epoch": 2.24, + "learning_rate": 7.633949492319709e-05, + "loss": 2.1222, + "step": 14420 + }, + { + "epoch": 2.24, + "learning_rate": 7.602707628221817e-05, + "loss": 2.1776, + "step": 14440 + }, + { + "epoch": 2.25, + "learning_rate": 7.57302785732882e-05, + "loss": 2.1414, + "step": 14460 + }, + { + "epoch": 2.25, + "learning_rate": 7.541785993230929e-05, + "loss": 2.1231, + "step": 14480 + }, + { + "epoch": 2.25, + "learning_rate": 7.510544129133038e-05, + "loss": 2.1345, + "step": 14500 + }, + { + "epoch": 2.26, + "learning_rate": 7.479302265035147e-05, + "loss": 2.1339, + "step": 14520 + }, + { + "epoch": 2.26, + "learning_rate": 7.448060400937255e-05, + "loss": 2.1562, + "step": 14540 + }, + { + "epoch": 2.26, + "learning_rate": 7.416818536839363e-05, + "loss": 2.1649, + "step": 14560 + }, + { + "epoch": 2.27, + "learning_rate": 7.385576672741473e-05, + "loss": 2.1339, + "step": 14580 + }, + { + "epoch": 2.27, + "learning_rate": 7.354334808643581e-05, + "loss": 2.1347, + "step": 14600 + }, + { + "epoch": 2.27, + "eval_loss": 2.335818290710449, + "eval_runtime": 69.5131, + "eval_samples_per_second": 28.772, + "eval_steps_per_second": 1.798, + "step": 14600 + }, + { + "epoch": 2.27, + "learning_rate": 7.323092944545691e-05, + "loss": 2.1078, + "step": 14620 + }, + { + "epoch": 2.28, + "learning_rate": 7.291851080447799e-05, + "loss": 2.1446, + "step": 14640 + }, + { + "epoch": 2.28, + "learning_rate": 7.260609216349908e-05, + "loss": 2.1076, + "step": 14660 + }, + { + "epoch": 2.28, + "learning_rate": 7.229367352252017e-05, + "loss": 2.1548, + "step": 14680 + }, + { + "epoch": 2.28, + "learning_rate": 7.198125488154126e-05, + "loss": 2.1317, + "step": 14700 + }, + { + "epoch": 2.29, + "learning_rate": 7.166883624056234e-05, + "loss": 2.0991, + "step": 14720 + }, + { + "epoch": 2.29, + "learning_rate": 7.135641759958343e-05, + "loss": 2.1507, + "step": 14740 + }, + { + "epoch": 2.29, + "learning_rate": 7.104399895860452e-05, + "loss": 2.1173, + "step": 14760 + }, + { + "epoch": 2.3, + "learning_rate": 7.073158031762562e-05, + "loss": 2.104, + "step": 14780 + }, + { + "epoch": 2.3, + "learning_rate": 7.043478260869565e-05, + "loss": 2.1118, + "step": 14800 + }, + { + "epoch": 2.3, + "eval_loss": 2.334048271179199, + "eval_runtime": 69.3816, + "eval_samples_per_second": 28.826, + "eval_steps_per_second": 1.802, + "step": 14800 + }, + { + "epoch": 2.3, + "learning_rate": 7.012236396771674e-05, + "loss": 2.0738, + "step": 14820 + }, + { + "epoch": 2.31, + "learning_rate": 6.980994532673782e-05, + "loss": 2.1221, + "step": 14840 + }, + { + "epoch": 2.31, + "learning_rate": 6.94975266857589e-05, + "loss": 2.1531, + "step": 14860 + }, + { + "epoch": 2.31, + "learning_rate": 6.918510804478e-05, + "loss": 2.1318, + "step": 14880 + }, + { + "epoch": 2.32, + "learning_rate": 6.887268940380108e-05, + "loss": 2.1251, + "step": 14900 + }, + { + "epoch": 2.32, + "learning_rate": 6.856027076282218e-05, + "loss": 2.1212, + "step": 14920 + }, + { + "epoch": 2.32, + "learning_rate": 6.824785212184326e-05, + "loss": 2.0927, + "step": 14940 + }, + { + "epoch": 2.32, + "learning_rate": 6.793543348086436e-05, + "loss": 2.1277, + "step": 14960 + }, + { + "epoch": 2.33, + "learning_rate": 6.762301483988544e-05, + "loss": 2.156, + "step": 14980 + }, + { + "epoch": 2.33, + "learning_rate": 6.731059619890653e-05, + "loss": 2.1276, + "step": 15000 + }, + { + "epoch": 2.33, + "eval_loss": 2.3340351581573486, + "eval_runtime": 69.2926, + "eval_samples_per_second": 28.863, + "eval_steps_per_second": 1.804, + "step": 15000 + }, + { + "epoch": 2.33, + "learning_rate": 6.699817755792761e-05, + "loss": 2.1313, + "step": 15020 + }, + { + "epoch": 2.34, + "learning_rate": 6.668575891694871e-05, + "loss": 2.1452, + "step": 15040 + }, + { + "epoch": 2.34, + "learning_rate": 6.637334027596979e-05, + "loss": 2.1148, + "step": 15060 + }, + { + "epoch": 2.34, + "learning_rate": 6.606092163499087e-05, + "loss": 2.1193, + "step": 15080 + }, + { + "epoch": 2.35, + "learning_rate": 6.574850299401197e-05, + "loss": 2.1672, + "step": 15100 + }, + { + "epoch": 2.35, + "learning_rate": 6.543608435303305e-05, + "loss": 2.0789, + "step": 15120 + }, + { + "epoch": 2.35, + "learning_rate": 6.512366571205415e-05, + "loss": 2.1438, + "step": 15140 + }, + { + "epoch": 2.36, + "learning_rate": 6.481124707107523e-05, + "loss": 2.1597, + "step": 15160 + }, + { + "epoch": 2.36, + "learning_rate": 6.449882843009632e-05, + "loss": 2.11, + "step": 15180 + }, + { + "epoch": 2.36, + "learning_rate": 6.418640978911742e-05, + "loss": 2.1279, + "step": 15200 + }, + { + "epoch": 2.36, + "eval_loss": 2.3344008922576904, + "eval_runtime": 69.3363, + "eval_samples_per_second": 28.845, + "eval_steps_per_second": 1.803, + "step": 15200 + }, + { + "epoch": 2.37, + "learning_rate": 6.38739911481385e-05, + "loss": 2.1459, + "step": 15220 + }, + { + "epoch": 2.37, + "learning_rate": 6.35615725071596e-05, + "loss": 2.1702, + "step": 15240 + }, + { + "epoch": 2.37, + "learning_rate": 6.324915386618068e-05, + "loss": 2.1262, + "step": 15260 + }, + { + "epoch": 2.37, + "learning_rate": 6.293673522520177e-05, + "loss": 2.0988, + "step": 15280 + }, + { + "epoch": 2.38, + "learning_rate": 6.262431658422286e-05, + "loss": 2.1224, + "step": 15300 + }, + { + "epoch": 2.38, + "learning_rate": 6.231189794324394e-05, + "loss": 2.1102, + "step": 15320 + }, + { + "epoch": 2.38, + "learning_rate": 6.199947930226503e-05, + "loss": 2.1168, + "step": 15340 + }, + { + "epoch": 2.39, + "learning_rate": 6.168706066128611e-05, + "loss": 2.1205, + "step": 15360 + }, + { + "epoch": 2.39, + "learning_rate": 6.137464202030721e-05, + "loss": 2.0855, + "step": 15380 + }, + { + "epoch": 2.39, + "learning_rate": 6.106222337932829e-05, + "loss": 2.1548, + "step": 15400 + }, + { + "epoch": 2.39, + "eval_loss": 2.333451271057129, + "eval_runtime": 69.3334, + "eval_samples_per_second": 28.846, + "eval_steps_per_second": 1.803, + "step": 15400 + }, + { + "epoch": 2.4, + "learning_rate": 6.074980473834938e-05, + "loss": 2.1433, + "step": 15420 + }, + { + "epoch": 2.4, + "learning_rate": 6.043738609737047e-05, + "loss": 2.123, + "step": 15440 + }, + { + "epoch": 2.4, + "learning_rate": 6.012496745639156e-05, + "loss": 2.0965, + "step": 15460 + }, + { + "epoch": 2.41, + "learning_rate": 5.9812548815412647e-05, + "loss": 2.1498, + "step": 15480 + }, + { + "epoch": 2.41, + "learning_rate": 5.9500130174433735e-05, + "loss": 2.1456, + "step": 15500 + }, + { + "epoch": 2.41, + "learning_rate": 5.9187711533454824e-05, + "loss": 2.1295, + "step": 15520 + }, + { + "epoch": 2.41, + "learning_rate": 5.887529289247591e-05, + "loss": 2.108, + "step": 15540 + }, + { + "epoch": 2.42, + "learning_rate": 5.8562874251497e-05, + "loss": 2.1592, + "step": 15560 + }, + { + "epoch": 2.42, + "learning_rate": 5.825045561051809e-05, + "loss": 2.1214, + "step": 15580 + }, + { + "epoch": 2.42, + "learning_rate": 5.793803696953918e-05, + "loss": 2.1561, + "step": 15600 + }, + { + "epoch": 2.42, + "eval_loss": 2.3329403400421143, + "eval_runtime": 69.6034, + "eval_samples_per_second": 28.734, + "eval_steps_per_second": 1.796, + "step": 15600 + }, + { + "epoch": 2.43, + "learning_rate": 5.762561832856026e-05, + "loss": 2.1382, + "step": 15620 + }, + { + "epoch": 2.43, + "learning_rate": 5.731319968758135e-05, + "loss": 2.109, + "step": 15640 + }, + { + "epoch": 2.43, + "learning_rate": 5.700078104660244e-05, + "loss": 2.1283, + "step": 15660 + }, + { + "epoch": 2.44, + "learning_rate": 5.6688362405623526e-05, + "loss": 2.15, + "step": 15680 + }, + { + "epoch": 2.44, + "learning_rate": 5.6375943764644615e-05, + "loss": 2.1125, + "step": 15700 + }, + { + "epoch": 2.44, + "learning_rate": 5.6063525123665704e-05, + "loss": 2.1709, + "step": 15720 + }, + { + "epoch": 2.45, + "learning_rate": 5.575110648268679e-05, + "loss": 2.1622, + "step": 15740 + }, + { + "epoch": 2.45, + "learning_rate": 5.543868784170789e-05, + "loss": 2.0769, + "step": 15760 + }, + { + "epoch": 2.45, + "learning_rate": 5.5126269200728976e-05, + "loss": 2.137, + "step": 15780 + }, + { + "epoch": 2.46, + "learning_rate": 5.4813850559750065e-05, + "loss": 2.1294, + "step": 15800 + }, + { + "epoch": 2.46, + "eval_loss": 2.3324475288391113, + "eval_runtime": 69.559, + "eval_samples_per_second": 28.753, + "eval_steps_per_second": 1.797, + "step": 15800 + }, + { + "epoch": 2.46, + "learning_rate": 5.4501431918771154e-05, + "loss": 2.1425, + "step": 15820 + }, + { + "epoch": 2.46, + "learning_rate": 5.418901327779224e-05, + "loss": 2.128, + "step": 15840 + }, + { + "epoch": 2.46, + "learning_rate": 5.387659463681333e-05, + "loss": 2.1553, + "step": 15860 + }, + { + "epoch": 2.47, + "learning_rate": 5.356417599583441e-05, + "loss": 2.1339, + "step": 15880 + }, + { + "epoch": 2.47, + "learning_rate": 5.32517573548555e-05, + "loss": 2.1536, + "step": 15900 + }, + { + "epoch": 2.47, + "learning_rate": 5.293933871387659e-05, + "loss": 2.1669, + "step": 15920 + }, + { + "epoch": 2.48, + "learning_rate": 5.262692007289768e-05, + "loss": 2.122, + "step": 15940 + }, + { + "epoch": 2.48, + "learning_rate": 5.231450143191877e-05, + "loss": 2.1435, + "step": 15960 + }, + { + "epoch": 2.48, + "learning_rate": 5.2002082790939856e-05, + "loss": 2.1406, + "step": 15980 + }, + { + "epoch": 2.49, + "learning_rate": 5.1689664149960945e-05, + "loss": 2.1174, + "step": 16000 + }, + { + "epoch": 2.49, + "eval_loss": 2.332836866378784, + "eval_runtime": 69.3739, + "eval_samples_per_second": 28.829, + "eval_steps_per_second": 1.802, + "step": 16000 + }, + { + "epoch": 2.49, + "learning_rate": 5.137724550898203e-05, + "loss": 2.1286, + "step": 16020 + }, + { + "epoch": 2.49, + "learning_rate": 5.106482686800312e-05, + "loss": 2.1343, + "step": 16040 + }, + { + "epoch": 2.5, + "learning_rate": 5.075240822702421e-05, + "loss": 2.1134, + "step": 16060 + }, + { + "epoch": 2.5, + "learning_rate": 5.043998958604529e-05, + "loss": 2.1633, + "step": 16080 + }, + { + "epoch": 2.5, + "learning_rate": 5.012757094506638e-05, + "loss": 2.1473, + "step": 16100 + }, + { + "epoch": 2.5, + "learning_rate": 4.981515230408747e-05, + "loss": 2.1535, + "step": 16120 + }, + { + "epoch": 2.51, + "learning_rate": 4.950273366310856e-05, + "loss": 2.112, + "step": 16140 + }, + { + "epoch": 2.51, + "learning_rate": 4.919031502212965e-05, + "loss": 2.1399, + "step": 16160 + }, + { + "epoch": 2.51, + "learning_rate": 4.8877896381150736e-05, + "loss": 2.0913, + "step": 16180 + }, + { + "epoch": 2.52, + "learning_rate": 4.8565477740171824e-05, + "loss": 2.1179, + "step": 16200 + }, + { + "epoch": 2.52, + "eval_loss": 2.332409143447876, + "eval_runtime": 69.3294, + "eval_samples_per_second": 28.848, + "eval_steps_per_second": 1.803, + "step": 16200 + }, + { + "epoch": 2.52, + "learning_rate": 4.825305909919291e-05, + "loss": 2.1756, + "step": 16220 + }, + { + "epoch": 2.52, + "learning_rate": 4.7940640458214e-05, + "loss": 2.1466, + "step": 16240 + }, + { + "epoch": 2.53, + "learning_rate": 4.762822181723509e-05, + "loss": 2.1443, + "step": 16260 + }, + { + "epoch": 2.53, + "learning_rate": 4.731580317625618e-05, + "loss": 2.1207, + "step": 16280 + }, + { + "epoch": 2.53, + "learning_rate": 4.700338453527726e-05, + "loss": 2.1275, + "step": 16300 + }, + { + "epoch": 2.54, + "learning_rate": 4.669096589429835e-05, + "loss": 2.1305, + "step": 16320 + }, + { + "epoch": 2.54, + "learning_rate": 4.6378547253319445e-05, + "loss": 2.134, + "step": 16340 + }, + { + "epoch": 2.54, + "learning_rate": 4.6066128612340534e-05, + "loss": 2.1681, + "step": 16360 + }, + { + "epoch": 2.55, + "learning_rate": 4.575370997136162e-05, + "loss": 2.1627, + "step": 16380 + }, + { + "epoch": 2.55, + "learning_rate": 4.544129133038271e-05, + "loss": 2.1421, + "step": 16400 + }, + { + "epoch": 2.55, + "eval_loss": 2.3318614959716797, + "eval_runtime": 69.3251, + "eval_samples_per_second": 28.85, + "eval_steps_per_second": 1.803, + "step": 16400 + }, + { + "epoch": 2.55, + "learning_rate": 4.51288726894038e-05, + "loss": 2.1225, + "step": 16420 + }, + { + "epoch": 2.55, + "learning_rate": 4.481645404842489e-05, + "loss": 2.156, + "step": 16440 + }, + { + "epoch": 2.56, + "learning_rate": 4.450403540744598e-05, + "loss": 2.1573, + "step": 16460 + }, + { + "epoch": 2.56, + "learning_rate": 4.4191616766467066e-05, + "loss": 2.1295, + "step": 16480 + }, + { + "epoch": 2.56, + "learning_rate": 4.3879198125488154e-05, + "loss": 2.14, + "step": 16500 + }, + { + "epoch": 2.57, + "learning_rate": 4.356677948450924e-05, + "loss": 2.1046, + "step": 16520 + }, + { + "epoch": 2.57, + "learning_rate": 4.3254360843530325e-05, + "loss": 2.1201, + "step": 16540 + }, + { + "epoch": 2.57, + "learning_rate": 4.2941942202551413e-05, + "loss": 2.1767, + "step": 16560 + }, + { + "epoch": 2.58, + "learning_rate": 4.26295235615725e-05, + "loss": 2.1244, + "step": 16580 + }, + { + "epoch": 2.58, + "learning_rate": 4.231710492059359e-05, + "loss": 2.1301, + "step": 16600 + }, + { + "epoch": 2.58, + "eval_loss": 2.331899881362915, + "eval_runtime": 69.3398, + "eval_samples_per_second": 28.843, + "eval_steps_per_second": 1.803, + "step": 16600 + }, + { + "epoch": 2.58, + "learning_rate": 4.200468627961468e-05, + "loss": 2.1022, + "step": 16620 + }, + { + "epoch": 2.59, + "learning_rate": 4.169226763863577e-05, + "loss": 2.1121, + "step": 16640 + }, + { + "epoch": 2.59, + "learning_rate": 4.137984899765686e-05, + "loss": 2.1014, + "step": 16660 + }, + { + "epoch": 2.59, + "learning_rate": 4.1067430356677945e-05, + "loss": 2.1867, + "step": 16680 + }, + { + "epoch": 2.6, + "learning_rate": 4.0755011715699034e-05, + "loss": 2.1055, + "step": 16700 + }, + { + "epoch": 2.6, + "learning_rate": 4.044259307472012e-05, + "loss": 2.1435, + "step": 16720 + }, + { + "epoch": 2.6, + "learning_rate": 4.013017443374121e-05, + "loss": 2.09, + "step": 16740 + }, + { + "epoch": 2.6, + "learning_rate": 3.981775579276229e-05, + "loss": 2.1317, + "step": 16760 + }, + { + "epoch": 2.61, + "learning_rate": 3.950533715178338e-05, + "loss": 2.0683, + "step": 16780 + }, + { + "epoch": 2.61, + "learning_rate": 3.919291851080447e-05, + "loss": 2.1249, + "step": 16800 + }, + { + "epoch": 2.61, + "eval_loss": 2.331566572189331, + "eval_runtime": 69.3154, + "eval_samples_per_second": 28.854, + "eval_steps_per_second": 1.803, + "step": 16800 + }, + { + "epoch": 2.61, + "learning_rate": 3.888049986982556e-05, + "loss": 2.164, + "step": 16820 + }, + { + "epoch": 2.62, + "learning_rate": 3.856808122884665e-05, + "loss": 2.16, + "step": 16840 + }, + { + "epoch": 2.62, + "learning_rate": 3.8255662587867736e-05, + "loss": 2.1603, + "step": 16860 + }, + { + "epoch": 2.62, + "learning_rate": 3.7943243946888825e-05, + "loss": 2.1346, + "step": 16880 + }, + { + "epoch": 2.63, + "learning_rate": 3.7630825305909914e-05, + "loss": 2.1082, + "step": 16900 + }, + { + "epoch": 2.63, + "learning_rate": 3.7318406664931e-05, + "loss": 2.1014, + "step": 16920 + }, + { + "epoch": 2.63, + "learning_rate": 3.700598802395209e-05, + "loss": 2.1088, + "step": 16940 + }, + { + "epoch": 2.64, + "learning_rate": 3.669356938297318e-05, + "loss": 2.0975, + "step": 16960 + }, + { + "epoch": 2.64, + "learning_rate": 3.638115074199427e-05, + "loss": 2.1212, + "step": 16980 + }, + { + "epoch": 2.64, + "learning_rate": 3.606873210101536e-05, + "loss": 2.1226, + "step": 17000 + }, + { + "epoch": 2.64, + "eval_loss": 2.3310983180999756, + "eval_runtime": 69.3945, + "eval_samples_per_second": 28.821, + "eval_steps_per_second": 1.801, + "step": 17000 + }, + { + "epoch": 2.64, + "learning_rate": 3.5756313460036446e-05, + "loss": 2.1318, + "step": 17020 + }, + { + "epoch": 2.65, + "learning_rate": 3.5443894819057534e-05, + "loss": 2.1073, + "step": 17040 + }, + { + "epoch": 2.65, + "learning_rate": 3.513147617807862e-05, + "loss": 2.1411, + "step": 17060 + }, + { + "epoch": 2.65, + "learning_rate": 3.481905753709971e-05, + "loss": 2.0959, + "step": 17080 + }, + { + "epoch": 2.66, + "learning_rate": 3.45066388961208e-05, + "loss": 2.0858, + "step": 17100 + }, + { + "epoch": 2.66, + "learning_rate": 3.419422025514189e-05, + "loss": 2.1174, + "step": 17120 + }, + { + "epoch": 2.66, + "learning_rate": 3.388180161416298e-05, + "loss": 2.1459, + "step": 17140 + }, + { + "epoch": 2.67, + "learning_rate": 3.3569382973184066e-05, + "loss": 2.1425, + "step": 17160 + }, + { + "epoch": 2.67, + "learning_rate": 3.3256964332205155e-05, + "loss": 2.0971, + "step": 17180 + }, + { + "epoch": 2.67, + "learning_rate": 3.2944545691226243e-05, + "loss": 2.1176, + "step": 17200 + }, + { + "epoch": 2.67, + "eval_loss": 2.330962896347046, + "eval_runtime": 69.3407, + "eval_samples_per_second": 28.843, + "eval_steps_per_second": 1.803, + "step": 17200 + }, + { + "epoch": 2.68, + "learning_rate": 3.2632127050247325e-05, + "loss": 2.1471, + "step": 17220 + }, + { + "epoch": 2.68, + "learning_rate": 3.2319708409268414e-05, + "loss": 2.1064, + "step": 17240 + }, + { + "epoch": 2.68, + "learning_rate": 3.20072897682895e-05, + "loss": 2.1347, + "step": 17260 + }, + { + "epoch": 2.69, + "learning_rate": 3.169487112731059e-05, + "loss": 2.142, + "step": 17280 + }, + { + "epoch": 2.69, + "learning_rate": 3.138245248633168e-05, + "loss": 2.1773, + "step": 17300 + }, + { + "epoch": 2.69, + "learning_rate": 3.107003384535277e-05, + "loss": 2.1489, + "step": 17320 + }, + { + "epoch": 2.69, + "learning_rate": 3.075761520437386e-05, + "loss": 2.1257, + "step": 17340 + }, + { + "epoch": 2.7, + "learning_rate": 3.044519656339495e-05, + "loss": 2.1288, + "step": 17360 + }, + { + "epoch": 2.7, + "learning_rate": 3.0132777922416038e-05, + "loss": 2.1258, + "step": 17380 + }, + { + "epoch": 2.7, + "learning_rate": 2.9820359281437123e-05, + "loss": 2.1322, + "step": 17400 + }, + { + "epoch": 2.7, + "eval_loss": 2.3309593200683594, + "eval_runtime": 69.3923, + "eval_samples_per_second": 28.822, + "eval_steps_per_second": 1.801, + "step": 17400 + }, + { + "epoch": 2.71, + "learning_rate": 2.9507940640458212e-05, + "loss": 2.1495, + "step": 17420 + }, + { + "epoch": 2.71, + "learning_rate": 2.91955219994793e-05, + "loss": 2.0843, + "step": 17440 + }, + { + "epoch": 2.71, + "learning_rate": 2.888310335850039e-05, + "loss": 2.11, + "step": 17460 + }, + { + "epoch": 2.72, + "learning_rate": 2.8570684717521478e-05, + "loss": 2.1005, + "step": 17480 + }, + { + "epoch": 2.72, + "learning_rate": 2.827388700859151e-05, + "loss": 2.1302, + "step": 17500 + }, + { + "epoch": 2.72, + "learning_rate": 2.79614683676126e-05, + "loss": 2.1086, + "step": 17520 + }, + { + "epoch": 2.73, + "learning_rate": 2.7649049726633688e-05, + "loss": 2.1302, + "step": 17540 + }, + { + "epoch": 2.73, + "learning_rate": 2.7336631085654777e-05, + "loss": 2.1417, + "step": 17560 + }, + { + "epoch": 2.73, + "learning_rate": 2.7024212444675862e-05, + "loss": 2.1369, + "step": 17580 + }, + { + "epoch": 2.73, + "learning_rate": 2.671179380369695e-05, + "loss": 2.1384, + "step": 17600 + }, + { + "epoch": 2.73, + "eval_loss": 2.33089017868042, + "eval_runtime": 69.3747, + "eval_samples_per_second": 28.829, + "eval_steps_per_second": 1.802, + "step": 17600 + }, + { + "epoch": 2.74, + "learning_rate": 2.639937516271804e-05, + "loss": 2.1243, + "step": 17620 + }, + { + "epoch": 2.74, + "learning_rate": 2.6086956521739128e-05, + "loss": 2.1161, + "step": 17640 + }, + { + "epoch": 2.74, + "learning_rate": 2.5774537880760217e-05, + "loss": 2.1051, + "step": 17660 + }, + { + "epoch": 2.75, + "learning_rate": 2.5462119239781302e-05, + "loss": 2.0762, + "step": 17680 + }, + { + "epoch": 2.75, + "learning_rate": 2.514970059880239e-05, + "loss": 2.1105, + "step": 17700 + }, + { + "epoch": 2.75, + "learning_rate": 2.483728195782348e-05, + "loss": 2.1535, + "step": 17720 + }, + { + "epoch": 2.76, + "learning_rate": 2.452486331684457e-05, + "loss": 2.1706, + "step": 17740 + }, + { + "epoch": 2.76, + "learning_rate": 2.421244467586566e-05, + "loss": 2.0857, + "step": 17760 + }, + { + "epoch": 2.76, + "learning_rate": 2.390002603488675e-05, + "loss": 2.1553, + "step": 17780 + }, + { + "epoch": 2.77, + "learning_rate": 2.3587607393907834e-05, + "loss": 2.0983, + "step": 17800 + }, + { + "epoch": 2.77, + "eval_loss": 2.3304569721221924, + "eval_runtime": 69.35, + "eval_samples_per_second": 28.839, + "eval_steps_per_second": 1.802, + "step": 17800 + }, + { + "epoch": 2.77, + "learning_rate": 2.3275188752928923e-05, + "loss": 2.1212, + "step": 17820 + }, + { + "epoch": 2.77, + "learning_rate": 2.296277011195001e-05, + "loss": 2.0816, + "step": 17840 + }, + { + "epoch": 2.78, + "learning_rate": 2.26503514709711e-05, + "loss": 2.0935, + "step": 17860 + }, + { + "epoch": 2.78, + "learning_rate": 2.233793282999219e-05, + "loss": 2.1576, + "step": 17880 + }, + { + "epoch": 2.78, + "learning_rate": 2.2025514189013274e-05, + "loss": 2.1076, + "step": 17900 + }, + { + "epoch": 2.78, + "learning_rate": 2.1713095548034362e-05, + "loss": 2.1184, + "step": 17920 + }, + { + "epoch": 2.79, + "learning_rate": 2.140067690705545e-05, + "loss": 2.1169, + "step": 17940 + }, + { + "epoch": 2.79, + "learning_rate": 2.108825826607654e-05, + "loss": 2.1442, + "step": 17960 + }, + { + "epoch": 2.79, + "learning_rate": 2.077583962509763e-05, + "loss": 2.1332, + "step": 17980 + }, + { + "epoch": 2.8, + "learning_rate": 2.0463420984118717e-05, + "loss": 2.1553, + "step": 18000 + }, + { + "epoch": 2.8, + "eval_loss": 2.330599069595337, + "eval_runtime": 69.346, + "eval_samples_per_second": 28.841, + "eval_steps_per_second": 1.803, + "step": 18000 + }, + { + "epoch": 2.8, + "learning_rate": 2.0151002343139802e-05, + "loss": 2.1055, + "step": 18020 + }, + { + "epoch": 2.8, + "learning_rate": 1.9838583702160894e-05, + "loss": 2.0778, + "step": 18040 + }, + { + "epoch": 2.81, + "learning_rate": 1.9526165061181983e-05, + "loss": 2.143, + "step": 18060 + }, + { + "epoch": 2.81, + "learning_rate": 1.921374642020307e-05, + "loss": 2.0886, + "step": 18080 + }, + { + "epoch": 2.81, + "learning_rate": 1.890132777922416e-05, + "loss": 2.1236, + "step": 18100 + }, + { + "epoch": 2.82, + "learning_rate": 1.858890913824525e-05, + "loss": 2.1307, + "step": 18120 + }, + { + "epoch": 2.82, + "learning_rate": 1.8276490497266334e-05, + "loss": 2.1192, + "step": 18140 + }, + { + "epoch": 2.82, + "learning_rate": 1.7964071856287423e-05, + "loss": 2.0999, + "step": 18160 + }, + { + "epoch": 2.83, + "learning_rate": 1.765165321530851e-05, + "loss": 2.0792, + "step": 18180 + }, + { + "epoch": 2.83, + "learning_rate": 1.73392345743296e-05, + "loss": 2.1015, + "step": 18200 + }, + { + "epoch": 2.83, + "eval_loss": 2.330050230026245, + "eval_runtime": 69.3278, + "eval_samples_per_second": 28.848, + "eval_steps_per_second": 1.803, + "step": 18200 + }, + { + "epoch": 2.83, + "learning_rate": 1.702681593335069e-05, + "loss": 2.1226, + "step": 18220 + }, + { + "epoch": 2.83, + "learning_rate": 1.6714397292371778e-05, + "loss": 2.0924, + "step": 18240 + }, + { + "epoch": 2.84, + "learning_rate": 1.6401978651392866e-05, + "loss": 2.1272, + "step": 18260 + }, + { + "epoch": 2.84, + "learning_rate": 1.6089560010413955e-05, + "loss": 2.1175, + "step": 18280 + }, + { + "epoch": 2.84, + "learning_rate": 1.577714136943504e-05, + "loss": 2.1396, + "step": 18300 + }, + { + "epoch": 2.85, + "learning_rate": 1.546472272845613e-05, + "loss": 2.1514, + "step": 18320 + }, + { + "epoch": 2.85, + "learning_rate": 1.5152304087477217e-05, + "loss": 2.1257, + "step": 18340 + }, + { + "epoch": 2.85, + "learning_rate": 1.4839885446498306e-05, + "loss": 2.1459, + "step": 18360 + }, + { + "epoch": 2.86, + "learning_rate": 1.4527466805519396e-05, + "loss": 2.09, + "step": 18380 + }, + { + "epoch": 2.86, + "learning_rate": 1.4215048164540483e-05, + "loss": 2.1442, + "step": 18400 + }, + { + "epoch": 2.86, + "eval_loss": 2.330048084259033, + "eval_runtime": 69.2975, + "eval_samples_per_second": 28.861, + "eval_steps_per_second": 1.804, + "step": 18400 + }, + { + "epoch": 2.86, + "learning_rate": 1.3902629523561572e-05, + "loss": 2.1816, + "step": 18420 + }, + { + "epoch": 2.87, + "learning_rate": 1.3590210882582659e-05, + "loss": 2.0965, + "step": 18440 + }, + { + "epoch": 2.87, + "learning_rate": 1.3277792241603748e-05, + "loss": 2.1178, + "step": 18460 + }, + { + "epoch": 2.87, + "learning_rate": 1.2965373600624836e-05, + "loss": 2.1562, + "step": 18480 + }, + { + "epoch": 2.87, + "learning_rate": 1.2652954959645923e-05, + "loss": 2.095, + "step": 18500 + }, + { + "epoch": 2.88, + "learning_rate": 1.2340536318667012e-05, + "loss": 2.1522, + "step": 18520 + }, + { + "epoch": 2.88, + "learning_rate": 1.2028117677688102e-05, + "loss": 2.1729, + "step": 18540 + }, + { + "epoch": 2.88, + "learning_rate": 1.1715699036709189e-05, + "loss": 2.141, + "step": 18560 + }, + { + "epoch": 2.89, + "learning_rate": 1.1403280395730278e-05, + "loss": 2.148, + "step": 18580 + }, + { + "epoch": 2.89, + "learning_rate": 1.1090861754751366e-05, + "loss": 2.1619, + "step": 18600 + }, + { + "epoch": 2.89, + "eval_loss": 2.329728603363037, + "eval_runtime": 69.3412, + "eval_samples_per_second": 28.843, + "eval_steps_per_second": 1.803, + "step": 18600 + }, + { + "epoch": 2.89, + "learning_rate": 1.0778443113772453e-05, + "loss": 2.1199, + "step": 18620 + }, + { + "epoch": 2.9, + "learning_rate": 1.0466024472793542e-05, + "loss": 2.131, + "step": 18640 + }, + { + "epoch": 2.9, + "learning_rate": 1.0153605831814629e-05, + "loss": 2.1512, + "step": 18660 + }, + { + "epoch": 2.9, + "learning_rate": 9.84118719083572e-06, + "loss": 2.1292, + "step": 18680 + }, + { + "epoch": 2.91, + "learning_rate": 9.528768549856808e-06, + "loss": 2.0928, + "step": 18700 + }, + { + "epoch": 2.91, + "learning_rate": 9.216349908877897e-06, + "loss": 2.1168, + "step": 18720 + }, + { + "epoch": 2.91, + "learning_rate": 8.903931267898984e-06, + "loss": 2.1316, + "step": 18740 + }, + { + "epoch": 2.92, + "learning_rate": 8.591512626920072e-06, + "loss": 2.1198, + "step": 18760 + }, + { + "epoch": 2.92, + "learning_rate": 8.279093985941161e-06, + "loss": 2.1226, + "step": 18780 + }, + { + "epoch": 2.92, + "learning_rate": 7.96667534496225e-06, + "loss": 2.1234, + "step": 18800 + }, + { + "epoch": 2.92, + "eval_loss": 2.3294034004211426, + "eval_runtime": 69.3303, + "eval_samples_per_second": 28.847, + "eval_steps_per_second": 1.803, + "step": 18800 + }, + { + "epoch": 2.92, + "learning_rate": 7.654256703983337e-06, + "loss": 2.1251, + "step": 18820 + }, + { + "epoch": 2.93, + "learning_rate": 7.341838063004425e-06, + "loss": 2.1278, + "step": 18840 + }, + { + "epoch": 2.93, + "learning_rate": 7.029419422025514e-06, + "loss": 2.1115, + "step": 18860 + }, + { + "epoch": 2.93, + "learning_rate": 6.717000781046602e-06, + "loss": 2.1468, + "step": 18880 + }, + { + "epoch": 2.94, + "learning_rate": 6.4045821400676894e-06, + "loss": 2.0903, + "step": 18900 + }, + { + "epoch": 2.94, + "learning_rate": 6.092163499088779e-06, + "loss": 2.1271, + "step": 18920 + }, + { + "epoch": 2.94, + "learning_rate": 5.779744858109867e-06, + "loss": 2.1253, + "step": 18940 + }, + { + "epoch": 2.95, + "learning_rate": 5.4673262171309545e-06, + "loss": 2.0903, + "step": 18960 + }, + { + "epoch": 2.95, + "learning_rate": 5.154907576152043e-06, + "loss": 2.1566, + "step": 18980 + }, + { + "epoch": 2.95, + "learning_rate": 4.842488935173132e-06, + "loss": 2.1477, + "step": 19000 + }, + { + "epoch": 2.95, + "eval_loss": 2.3293075561523438, + "eval_runtime": 69.6518, + "eval_samples_per_second": 28.714, + "eval_steps_per_second": 1.795, + "step": 19000 + }, + { + "epoch": 2.96, + "learning_rate": 4.53007029419422e-06, + "loss": 2.1145, + "step": 19020 + }, + { + "epoch": 2.96, + "learning_rate": 4.217651653215308e-06, + "loss": 2.1255, + "step": 19040 + }, + { + "epoch": 2.96, + "learning_rate": 3.905233012236396e-06, + "loss": 2.1122, + "step": 19060 + }, + { + "epoch": 2.96, + "learning_rate": 3.5928143712574848e-06, + "loss": 2.1299, + "step": 19080 + }, + { + "epoch": 2.97, + "learning_rate": 3.280395730278573e-06, + "loss": 2.1272, + "step": 19100 + }, + { + "epoch": 2.97, + "learning_rate": 2.9679770892996616e-06, + "loss": 2.1234, + "step": 19120 + }, + { + "epoch": 2.97, + "learning_rate": 2.6555584483207494e-06, + "loss": 2.1791, + "step": 19140 + }, + { + "epoch": 2.98, + "learning_rate": 2.343139807341838e-06, + "loss": 2.1565, + "step": 19160 + }, + { + "epoch": 2.98, + "learning_rate": 2.0307211663629263e-06, + "loss": 2.1256, + "step": 19180 + }, + { + "epoch": 2.98, + "learning_rate": 1.7183025253840145e-06, + "loss": 2.1641, + "step": 19200 + }, + { + "epoch": 2.98, + "eval_loss": 2.329240560531616, + "eval_runtime": 69.5753, + "eval_samples_per_second": 28.746, + "eval_steps_per_second": 1.797, + "step": 19200 + } + ], + "max_steps": 19305, + "num_train_epochs": 3, + "total_flos": 5.372174060814231e+19, + "trial_name": null, + "trial_params": null +} diff --git a/adapters/saved_bloomfirefly/checkpoint-19200/training_args.bin b/adapters/saved_bloomfirefly/checkpoint-19200/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..87b78c4a804a75de63299eeb2fc899bcd70e34ae --- /dev/null +++ b/adapters/saved_bloomfirefly/checkpoint-19200/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:23cad2e050712e91e38e68c484cff1b7c0ef83524df4c9bb983745b616667737 +size 3643 diff --git a/adapters/saved_bloomfirefly/saved_bloom7b_firefly/adapter_config.json b/adapters/saved_bloomfirefly/saved_bloom7b_firefly/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..854d12e5e9502d2be965ee9c13133dbf4c923d67 --- /dev/null +++ b/adapters/saved_bloomfirefly/saved_bloom7b_firefly/adapter_config.json @@ -0,0 +1,16 @@ +{ + "base_model_name_or_path": "bigscience/bloomz-7b1-mt", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "lora_alpha": 16, + "lora_dropout": 0.05, + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "target_modules": [ + "query_key_value" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/adapters/saved_bloomfirefly/saved_bloom7b_firefly/adapter_model.bin b/adapters/saved_bloomfirefly/saved_bloom7b_firefly/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..4325014ef41d714ef0f7d34711582d856d9bbde5 --- /dev/null +++ b/adapters/saved_bloomfirefly/saved_bloom7b_firefly/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9811ba709c8fbf328aed339d629c33c4363f9c811d3ede0467d1126f2eb183d6 +size 15750885 diff --git a/adapters/saved_bloomwild_cn/adapter_config.json b/adapters/saved_bloomwild_cn/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..896c4cf962709fe6c09e7461f3422660563f6ea4 --- /dev/null +++ b/adapters/saved_bloomwild_cn/adapter_config.json @@ -0,0 +1,22 @@ +{ + "base_model_name_or_path": "bigscience/bloomz-7b1-mt", + "bias": "none", + "enable_lora": [ + true, + false, + true + ], + "fan_in_fan_out": true, + "inference_mode": true, + "init_lora_weights": true, + "lora_alpha": 16, + "lora_dropout": 0.05, + "merge_weights": false, + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "target_modules": [ + "query_key_value" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/adapters/saved_bloomwild_cn/adapter_model.bin b/adapters/saved_bloomwild_cn/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..02da69d93ab2de350c30c9904765fcbd0a47fd07 --- /dev/null +++ b/adapters/saved_bloomwild_cn/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:280f5846ba27d653e1dec1272eb7d22f9fd91fb53fa203a6d1092fb810de1757 +size 15751077 diff --git a/adapters/saved_bloomwild_cn/runs/Apr01_03-12-51_mlxlabf2fsmfks6391d74d-20221208122341-dw3tpz-safznp-worker/1680289972.5538526/events.out.tfevents.1680289972.mlxlabf2fsmfks6391d74d-20221208122341-dw3tpz-safznp-worker.55.1 b/adapters/saved_bloomwild_cn/runs/Apr01_03-12-51_mlxlabf2fsmfks6391d74d-20221208122341-dw3tpz-safznp-worker/1680289972.5538526/events.out.tfevents.1680289972.mlxlabf2fsmfks6391d74d-20221208122341-dw3tpz-safznp-worker.55.1 new file mode 100644 index 0000000000000000000000000000000000000000..a7d370a327748ad5a6ea1145331040e5d234d74b --- /dev/null +++ b/adapters/saved_bloomwild_cn/runs/Apr01_03-12-51_mlxlabf2fsmfks6391d74d-20221208122341-dw3tpz-safznp-worker/1680289972.5538526/events.out.tfevents.1680289972.mlxlabf2fsmfks6391d74d-20221208122341-dw3tpz-safznp-worker.55.1 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de59a897e4096a4b6a4cd3aa476fc96eb4bbd13e56f77ace1d9df549e8922f5b +size 5922 diff --git a/adapters/saved_bloomwild_cn/runs/Apr01_03-12-51_mlxlabf2fsmfks6391d74d-20221208122341-dw3tpz-safznp-worker/events.out.tfevents.1680289972.mlxlabf2fsmfks6391d74d-20221208122341-dw3tpz-safznp-worker.55.0 b/adapters/saved_bloomwild_cn/runs/Apr01_03-12-51_mlxlabf2fsmfks6391d74d-20221208122341-dw3tpz-safznp-worker/events.out.tfevents.1680289972.mlxlabf2fsmfks6391d74d-20221208122341-dw3tpz-safznp-worker.55.0 new file mode 100644 index 0000000000000000000000000000000000000000..7dc62fbd4d3bd767e40387cfa92b912e81bf0e2b --- /dev/null +++ b/adapters/saved_bloomwild_cn/runs/Apr01_03-12-51_mlxlabf2fsmfks6391d74d-20221208122341-dw3tpz-safznp-worker/events.out.tfevents.1680289972.mlxlabf2fsmfks6391d74d-20221208122341-dw3tpz-safznp-worker.55.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7a5132f54f88d3f108f84907e572e6315d1a61f556d23ab35e7ad767ddbbb3f7 +size 4205 diff --git a/adapters/saved_bloomz-7b1-mt_TQA/adapter_config.json b/adapters/saved_bloomz-7b1-mt_TQA/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..854d12e5e9502d2be965ee9c13133dbf4c923d67 --- /dev/null +++ b/adapters/saved_bloomz-7b1-mt_TQA/adapter_config.json @@ -0,0 +1,16 @@ +{ + "base_model_name_or_path": "bigscience/bloomz-7b1-mt", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "lora_alpha": 16, + "lora_dropout": 0.05, + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "target_modules": [ + "query_key_value" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/adapters/saved_bloomz-7b1-mt_TQA/adapter_model.bin b/adapters/saved_bloomz-7b1-mt_TQA/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..8710093665dfdb9819e2f8817a1e25a4ccdd9935 --- /dev/null +++ b/adapters/saved_bloomz-7b1-mt_TQA/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e5e1621f48d9ad8feb1d6d31050275f0aafd080c5c07153301fe2f48411f4406 +size 443 diff --git a/adapters/saved_bloomz-7b1-mt_TQA/checkpoint-200/optimizer.pt b/adapters/saved_bloomz-7b1-mt_TQA/checkpoint-200/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..cca0c41cf5c51522b77b421cac2b7708d27627b0 --- /dev/null +++ b/adapters/saved_bloomz-7b1-mt_TQA/checkpoint-200/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d1801f2631947b256b26272f17d6fefc151f5b487355ec5378590905cc1e57fd +size 31492677 diff --git a/adapters/saved_bloomz-7b1-mt_TQA/checkpoint-200/pytorch_model.bin b/adapters/saved_bloomz-7b1-mt_TQA/checkpoint-200/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..e06b5da7a241990f44a234271e16980a64f5c3bf --- /dev/null +++ b/adapters/saved_bloomz-7b1-mt_TQA/checkpoint-200/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:400014fa541979539bc4694cf72a417863335c4fcc1010af18a8ba632adcacdd +size 15750885 diff --git a/adapters/saved_bloomz-7b1-mt_TQA/checkpoint-200/rng_state_0.pth b/adapters/saved_bloomz-7b1-mt_TQA/checkpoint-200/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..1995c2d6a6d5f71202582c9488598962bae61ff4 --- /dev/null +++ b/adapters/saved_bloomz-7b1-mt_TQA/checkpoint-200/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e5e1b43a4fd729545e3deb88ff960f19dfbe07d8a793a733c3afc3b9c2d2bbe2 +size 14583 diff --git a/adapters/saved_bloomz-7b1-mt_TQA/checkpoint-200/rng_state_1.pth b/adapters/saved_bloomz-7b1-mt_TQA/checkpoint-200/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..c0f0c73d76aed5f5db1f1a8abf3c57643b0aec46 --- /dev/null +++ b/adapters/saved_bloomz-7b1-mt_TQA/checkpoint-200/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:68e0dca85bb1796db5d947f6a99dd1260b425c62b71b2529828e030869dc4262 +size 14583 diff --git a/adapters/saved_bloomz-7b1-mt_TQA/checkpoint-200/rng_state_2.pth b/adapters/saved_bloomz-7b1-mt_TQA/checkpoint-200/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..9e4a528c4676c53aa4a26ed885f6456fb164015c --- /dev/null +++ b/adapters/saved_bloomz-7b1-mt_TQA/checkpoint-200/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0725bc06b06a77c1bdf85c7de2719211b32caf5a413011e36517101f627d7191 +size 14583 diff --git a/adapters/saved_bloomz-7b1-mt_TQA/checkpoint-200/rng_state_3.pth b/adapters/saved_bloomz-7b1-mt_TQA/checkpoint-200/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..8671f02117ba4bb9f0b99c826df35285e6746c36 --- /dev/null +++ b/adapters/saved_bloomz-7b1-mt_TQA/checkpoint-200/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9cd70eb4f526e832d4af45d03b912213225dc24ece06274c0deb4e8703072f5a +size 14583 diff --git a/adapters/saved_bloomz-7b1-mt_TQA/checkpoint-200/scaler.pt b/adapters/saved_bloomz-7b1-mt_TQA/checkpoint-200/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..87710d0ddc627c070366fcb3112b07dc60d97295 --- /dev/null +++ b/adapters/saved_bloomz-7b1-mt_TQA/checkpoint-200/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4fccf0f9be1bb8f24861e4393745b3e09cc2687125a69e3757955fb0f0925ea5 +size 557 diff --git a/adapters/saved_bloomz-7b1-mt_TQA/checkpoint-200/scheduler.pt b/adapters/saved_bloomz-7b1-mt_TQA/checkpoint-200/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..83e2d9c6a0d43e097f601aaff7d9534f83ab8fb7 --- /dev/null +++ b/adapters/saved_bloomz-7b1-mt_TQA/checkpoint-200/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:219e0a10fc905ea14817dfaa4c8f0308a0e582b129d9a3d9cfd775205a2016d3 +size 627 diff --git a/adapters/saved_bloomz-7b1-mt_TQA/checkpoint-200/trainer_state.json b/adapters/saved_bloomz-7b1-mt_TQA/checkpoint-200/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..47e0e6cacfdc2b9a4b9507917dcb5889abee868c --- /dev/null +++ b/adapters/saved_bloomz-7b1-mt_TQA/checkpoint-200/trainer_state.json @@ -0,0 +1,84 @@ +{ + "best_metric": NaN, + "best_model_checkpoint": "/mnt/bn/qingyi-bn-lq/llama/saved_bloomz-7b1-mt_TQA/checkpoint-200", + "epoch": 0.6990715456034954, + "global_step": 200, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.07, + "learning_rate": 5.9999999999999995e-05, + "loss": 56.1213, + "step": 20 + }, + { + "epoch": 0.14, + "learning_rate": 0.00011999999999999999, + "loss": 46.3785, + "step": 40 + }, + { + "epoch": 0.21, + "learning_rate": 0.00017999999999999998, + "loss": 27.2008, + "step": 60 + }, + { + "epoch": 0.28, + "learning_rate": 0.00023999999999999998, + "loss": 58.3372, + "step": 80 + }, + { + "epoch": 0.35, + "learning_rate": 0.0003, + "loss": 21.5097, + "step": 100 + }, + { + "epoch": 0.42, + "learning_rate": 0.00029208443271767806, + "loss": 149.5568, + "step": 120 + }, + { + "epoch": 0.49, + "learning_rate": 0.0002841688654353562, + "loss": 12.1125, + "step": 140 + }, + { + "epoch": 0.56, + "learning_rate": 0.0002762532981530343, + "loss": 64.6813, + "step": 160 + }, + { + "epoch": 0.63, + "learning_rate": 0.00026833773087071237, + "loss": 12.8244, + "step": 180 + }, + { + "epoch": 0.7, + "learning_rate": 0.0002604221635883905, + "loss": 13.6392, + "step": 200 + }, + { + "epoch": 0.7, + "eval_loss": NaN, + "eval_runtime": 39.2226, + "eval_samples_per_second": 50.991, + "eval_steps_per_second": 1.606, + "step": 200 + } + ], + "max_steps": 858, + "num_train_epochs": 3, + "total_flos": 1.887154186856956e+18, + "trial_name": null, + "trial_params": null +} diff --git a/adapters/saved_bloomz-7b1-mt_TQA/checkpoint-200/training_args.bin b/adapters/saved_bloomz-7b1-mt_TQA/checkpoint-200/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..1c206c9c2c135387d9c1d84b0d6b9e819390fa07 --- /dev/null +++ b/adapters/saved_bloomz-7b1-mt_TQA/checkpoint-200/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a28b62755d840e4f95c04c24d6777a170d9207a8b4d08a1ae0d1aa732742cc0 +size 3643 diff --git a/adapters/saved_bloomz-7b1-mt_TQA/checkpoint-600/optimizer.pt b/adapters/saved_bloomz-7b1-mt_TQA/checkpoint-600/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..07f8d55b589d8aa5ddfae220bb290e929d51d7e5 --- /dev/null +++ b/adapters/saved_bloomz-7b1-mt_TQA/checkpoint-600/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aca9c2224e7b884face0304a171cc7f3eb12e177830b57839af83c1ea0689bb1 +size 31492741 diff --git a/adapters/saved_bloomz-7b1-mt_TQA/checkpoint-600/pytorch_model.bin b/adapters/saved_bloomz-7b1-mt_TQA/checkpoint-600/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..2c21a162d2a0ab8900fa41028527f010ae709f52 --- /dev/null +++ b/adapters/saved_bloomz-7b1-mt_TQA/checkpoint-600/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:36dbfd25690c82aa8f01f14d44b8df270f3a45ba3c9606be3f30f7ea2eb13bba +size 15750885 diff --git a/adapters/saved_bloomz-7b1-mt_TQA/checkpoint-600/rng_state_0.pth b/adapters/saved_bloomz-7b1-mt_TQA/checkpoint-600/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..30ce5e25193f2d824a8ee51ebf4135519329a959 --- /dev/null +++ b/adapters/saved_bloomz-7b1-mt_TQA/checkpoint-600/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:349bf7ce30d9a5a8ae1f14de8f8bf890decea3359294dc427c11a58fc7834dca +size 14583 diff --git a/adapters/saved_bloomz-7b1-mt_TQA/checkpoint-600/rng_state_1.pth b/adapters/saved_bloomz-7b1-mt_TQA/checkpoint-600/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..279ecabd51c2795f5652ec341ec65236c40b956a --- /dev/null +++ b/adapters/saved_bloomz-7b1-mt_TQA/checkpoint-600/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:633b505c139deea7076399bb6cf6ada793a9fb9e55f7d8a6654c3346c1294420 +size 14583 diff --git a/adapters/saved_bloomz-7b1-mt_TQA/checkpoint-600/rng_state_2.pth b/adapters/saved_bloomz-7b1-mt_TQA/checkpoint-600/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..8f9203d44294a81e6f5120782487c75bc4042456 --- /dev/null +++ b/adapters/saved_bloomz-7b1-mt_TQA/checkpoint-600/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e7f429ab6c9809a85c398495da25e840b82dae270da2cd8ba51e47cd6c1471b2 +size 14583 diff --git a/adapters/saved_bloomz-7b1-mt_TQA/checkpoint-600/rng_state_3.pth b/adapters/saved_bloomz-7b1-mt_TQA/checkpoint-600/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..329f150ab9015b68d55941b9db1a547f0ca57812 --- /dev/null +++ b/adapters/saved_bloomz-7b1-mt_TQA/checkpoint-600/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:094fe29c55fe140ed58068002508a8897288d29922e700978ea8b4a5ee666b87 +size 14583 diff --git a/adapters/saved_bloomz-7b1-mt_TQA/checkpoint-600/scaler.pt b/adapters/saved_bloomz-7b1-mt_TQA/checkpoint-600/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..842791b612283ceb8e68b64ed8e40e81c5a97bce --- /dev/null +++ b/adapters/saved_bloomz-7b1-mt_TQA/checkpoint-600/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5dc9eacfeb00bd0bfeb98934a2309be01be65b288e0d747bbfc423b32679169f +size 557 diff --git a/adapters/saved_bloomz-7b1-mt_TQA/checkpoint-600/scheduler.pt b/adapters/saved_bloomz-7b1-mt_TQA/checkpoint-600/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..5b303739804a178938d8f8103c903ff31ca436d1 --- /dev/null +++ b/adapters/saved_bloomz-7b1-mt_TQA/checkpoint-600/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a863d33d88237ba0b779b5edb1e3643535190c7965dccb3a7b8c5151988be42f +size 627 diff --git a/adapters/saved_bloomz-7b1-mt_TQA/checkpoint-600/trainer_state.json b/adapters/saved_bloomz-7b1-mt_TQA/checkpoint-600/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..db3e3d016b95e0f5f8c201172ba23a391815de00 --- /dev/null +++ b/adapters/saved_bloomz-7b1-mt_TQA/checkpoint-600/trainer_state.json @@ -0,0 +1,220 @@ +{ + "best_metric": NaN, + "best_model_checkpoint": "/mnt/bn/qingyi-bn-lq/llama/saved_bloomz-7b1-mt_TQA/checkpoint-200", + "epoch": 2.0972146368104863, + "global_step": 600, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.07, + "learning_rate": 5.9999999999999995e-05, + "loss": 56.1213, + "step": 20 + }, + { + "epoch": 0.14, + "learning_rate": 0.00011999999999999999, + "loss": 46.3785, + "step": 40 + }, + { + "epoch": 0.21, + "learning_rate": 0.00017999999999999998, + "loss": 27.2008, + "step": 60 + }, + { + "epoch": 0.28, + "learning_rate": 0.00023999999999999998, + "loss": 58.3372, + "step": 80 + }, + { + "epoch": 0.35, + "learning_rate": 0.0003, + "loss": 21.5097, + "step": 100 + }, + { + "epoch": 0.42, + "learning_rate": 0.00029208443271767806, + "loss": 149.5568, + "step": 120 + }, + { + "epoch": 0.49, + "learning_rate": 0.0002841688654353562, + "loss": 12.1125, + "step": 140 + }, + { + "epoch": 0.56, + "learning_rate": 0.0002762532981530343, + "loss": 64.6813, + "step": 160 + }, + { + "epoch": 0.63, + "learning_rate": 0.00026833773087071237, + "loss": 12.8244, + "step": 180 + }, + { + "epoch": 0.7, + "learning_rate": 0.0002604221635883905, + "loss": 13.6392, + "step": 200 + }, + { + "epoch": 0.7, + "eval_loss": NaN, + "eval_runtime": 39.2226, + "eval_samples_per_second": 50.991, + "eval_steps_per_second": 1.606, + "step": 200 + }, + { + "epoch": 0.77, + "learning_rate": 0.0002525065963060686, + "loss": 29.1801, + "step": 220 + }, + { + "epoch": 0.84, + "learning_rate": 0.0002445910290237467, + "loss": 7.1808, + "step": 240 + }, + { + "epoch": 0.91, + "learning_rate": 0.0002366754617414248, + "loss": 17.5066, + "step": 260 + }, + { + "epoch": 0.98, + "learning_rate": 0.00022875989445910288, + "loss": 13.7954, + "step": 280 + }, + { + "epoch": 1.05, + "learning_rate": 0.000220844327176781, + "loss": 15.2006, + "step": 300 + }, + { + "epoch": 1.12, + "learning_rate": 0.00021292875989445908, + "loss": 18.5282, + "step": 320 + }, + { + "epoch": 1.19, + "learning_rate": 0.0002050131926121372, + "loss": 10.157, + "step": 340 + }, + { + "epoch": 1.26, + "learning_rate": 0.0001970976253298153, + "loss": 9.2541, + "step": 360 + }, + { + "epoch": 1.33, + "learning_rate": 0.00018918205804749339, + "loss": 9.0145, + "step": 380 + }, + { + "epoch": 1.4, + "learning_rate": 0.00018126649076517147, + "loss": 8.6491, + "step": 400 + }, + { + "epoch": 1.4, + "eval_loss": NaN, + "eval_runtime": 39.8237, + "eval_samples_per_second": 50.221, + "eval_steps_per_second": 1.582, + "step": 400 + }, + { + "epoch": 1.47, + "learning_rate": 0.0001733509234828496, + "loss": 6.757, + "step": 420 + }, + { + "epoch": 1.54, + "learning_rate": 0.0001654353562005277, + "loss": 4.1081, + "step": 440 + }, + { + "epoch": 1.61, + "learning_rate": 0.00015751978891820578, + "loss": 19.5577, + "step": 460 + }, + { + "epoch": 1.68, + "learning_rate": 0.0001496042216358839, + "loss": 14.4416, + "step": 480 + }, + { + "epoch": 1.75, + "learning_rate": 0.00014168865435356198, + "loss": 8.4368, + "step": 500 + }, + { + "epoch": 1.82, + "learning_rate": 0.0001337730870712401, + "loss": 5.4915, + "step": 520 + }, + { + "epoch": 1.89, + "learning_rate": 0.00012585751978891818, + "loss": 28.9042, + "step": 540 + }, + { + "epoch": 1.96, + "learning_rate": 0.00011794195250659629, + "loss": 25.3276, + "step": 560 + }, + { + "epoch": 2.03, + "learning_rate": 0.0001100263852242744, + "loss": 8.954, + "step": 580 + }, + { + "epoch": 2.1, + "learning_rate": 0.0001021108179419525, + "loss": 8.7963, + "step": 600 + }, + { + "epoch": 2.1, + "eval_loss": NaN, + "eval_runtime": 39.412, + "eval_samples_per_second": 50.746, + "eval_steps_per_second": 1.598, + "step": 600 + } + ], + "max_steps": 858, + "num_train_epochs": 3, + "total_flos": 5.660860955911782e+18, + "trial_name": null, + "trial_params": null +} diff --git a/adapters/saved_bloomz-7b1-mt_TQA/checkpoint-600/training_args.bin b/adapters/saved_bloomz-7b1-mt_TQA/checkpoint-600/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..1c206c9c2c135387d9c1d84b0d6b9e819390fa07 --- /dev/null +++ b/adapters/saved_bloomz-7b1-mt_TQA/checkpoint-600/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a28b62755d840e4f95c04c24d6777a170d9207a8b4d08a1ae0d1aa732742cc0 +size 3643 diff --git a/adapters/saved_bloomz-7b1-mt_TQA/checkpoint-800/optimizer.pt b/adapters/saved_bloomz-7b1-mt_TQA/checkpoint-800/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..35cc25adf09147f4962ee93b23fc4f00cb9f5fb7 --- /dev/null +++ b/adapters/saved_bloomz-7b1-mt_TQA/checkpoint-800/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ab103f70bd994f540bb71d32e30134006b75566099dddc7b4be15cec8fea4565 +size 31492741 diff --git a/adapters/saved_bloomz-7b1-mt_TQA/checkpoint-800/pytorch_model.bin b/adapters/saved_bloomz-7b1-mt_TQA/checkpoint-800/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..c615565716f92469021eea231b8b4e6987e1a8df --- /dev/null +++ b/adapters/saved_bloomz-7b1-mt_TQA/checkpoint-800/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:324dccddea6af8551f66560420068ef10a80741f6023bf14c21ce755b9589607 +size 15750885 diff --git a/adapters/saved_bloomz-7b1-mt_TQA/checkpoint-800/rng_state_0.pth b/adapters/saved_bloomz-7b1-mt_TQA/checkpoint-800/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..bb44f040bbd45939abaa22af06754cce9d2318ac --- /dev/null +++ b/adapters/saved_bloomz-7b1-mt_TQA/checkpoint-800/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d5e8280a29db51ef9ed18639ff5387e731fa317151d60540cc1a1bb95ab4da9e +size 14583 diff --git a/adapters/saved_bloomz-7b1-mt_TQA/checkpoint-800/rng_state_1.pth b/adapters/saved_bloomz-7b1-mt_TQA/checkpoint-800/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..d7597bb011318f3d32c69d494bd4df6dedc1ba32 --- /dev/null +++ b/adapters/saved_bloomz-7b1-mt_TQA/checkpoint-800/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7d7cd20dfb9a709c93ef25d2f054c5417569aaa0629908002a9c4c4f5013b708 +size 14583 diff --git a/adapters/saved_bloomz-7b1-mt_TQA/checkpoint-800/rng_state_2.pth b/adapters/saved_bloomz-7b1-mt_TQA/checkpoint-800/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..62c22c3f6593351cd21f2467595ba1cd590e37a0 --- /dev/null +++ b/adapters/saved_bloomz-7b1-mt_TQA/checkpoint-800/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9b33ddfb1428c72c7b7dd265cd62ba589618d2acfcdb8e8baf40a785310491d1 +size 14583 diff --git a/adapters/saved_bloomz-7b1-mt_TQA/checkpoint-800/rng_state_3.pth b/adapters/saved_bloomz-7b1-mt_TQA/checkpoint-800/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..18b5f70bc26a31dd3d3d582827411b577e856e45 --- /dev/null +++ b/adapters/saved_bloomz-7b1-mt_TQA/checkpoint-800/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dabdc53844ab17f5e2a60c258db9b00feb3b185480897fa4c05a6f5b7b79e841 +size 14583 diff --git a/adapters/saved_bloomz-7b1-mt_TQA/checkpoint-800/scaler.pt b/adapters/saved_bloomz-7b1-mt_TQA/checkpoint-800/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..5e01dd7b5d3a8968bb4c73a805f08f0f65c9b57f --- /dev/null +++ b/adapters/saved_bloomz-7b1-mt_TQA/checkpoint-800/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:27ec07a12731ae6f9765d05fe7c8495505f1d0f90b4cc6255a0853fec3970808 +size 557 diff --git a/adapters/saved_bloomz-7b1-mt_TQA/checkpoint-800/scheduler.pt b/adapters/saved_bloomz-7b1-mt_TQA/checkpoint-800/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..f5942cf027912a36b2eb482bac514e980c732e81 --- /dev/null +++ b/adapters/saved_bloomz-7b1-mt_TQA/checkpoint-800/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ca3f4ef34fb4c2c47d4c77348e6319c3e9249c56c306fd6f9e68a440f70e454 +size 627 diff --git a/adapters/saved_bloomz-7b1-mt_TQA/checkpoint-800/trainer_state.json b/adapters/saved_bloomz-7b1-mt_TQA/checkpoint-800/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..20918e95efcb6641a220a79b83485978d5448a49 --- /dev/null +++ b/adapters/saved_bloomz-7b1-mt_TQA/checkpoint-800/trainer_state.json @@ -0,0 +1,288 @@ +{ + "best_metric": NaN, + "best_model_checkpoint": "/mnt/bn/qingyi-bn-lq/llama/saved_bloomz-7b1-mt_TQA/checkpoint-200", + "epoch": 2.7962861824139815, + "global_step": 800, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.07, + "learning_rate": 5.9999999999999995e-05, + "loss": 56.1213, + "step": 20 + }, + { + "epoch": 0.14, + "learning_rate": 0.00011999999999999999, + "loss": 46.3785, + "step": 40 + }, + { + "epoch": 0.21, + "learning_rate": 0.00017999999999999998, + "loss": 27.2008, + "step": 60 + }, + { + "epoch": 0.28, + "learning_rate": 0.00023999999999999998, + "loss": 58.3372, + "step": 80 + }, + { + "epoch": 0.35, + "learning_rate": 0.0003, + "loss": 21.5097, + "step": 100 + }, + { + "epoch": 0.42, + "learning_rate": 0.00029208443271767806, + "loss": 149.5568, + "step": 120 + }, + { + "epoch": 0.49, + "learning_rate": 0.0002841688654353562, + "loss": 12.1125, + "step": 140 + }, + { + "epoch": 0.56, + "learning_rate": 0.0002762532981530343, + "loss": 64.6813, + "step": 160 + }, + { + "epoch": 0.63, + "learning_rate": 0.00026833773087071237, + "loss": 12.8244, + "step": 180 + }, + { + "epoch": 0.7, + "learning_rate": 0.0002604221635883905, + "loss": 13.6392, + "step": 200 + }, + { + "epoch": 0.7, + "eval_loss": NaN, + "eval_runtime": 39.2226, + "eval_samples_per_second": 50.991, + "eval_steps_per_second": 1.606, + "step": 200 + }, + { + "epoch": 0.77, + "learning_rate": 0.0002525065963060686, + "loss": 29.1801, + "step": 220 + }, + { + "epoch": 0.84, + "learning_rate": 0.0002445910290237467, + "loss": 7.1808, + "step": 240 + }, + { + "epoch": 0.91, + "learning_rate": 0.0002366754617414248, + "loss": 17.5066, + "step": 260 + }, + { + "epoch": 0.98, + "learning_rate": 0.00022875989445910288, + "loss": 13.7954, + "step": 280 + }, + { + "epoch": 1.05, + "learning_rate": 0.000220844327176781, + "loss": 15.2006, + "step": 300 + }, + { + "epoch": 1.12, + "learning_rate": 0.00021292875989445908, + "loss": 18.5282, + "step": 320 + }, + { + "epoch": 1.19, + "learning_rate": 0.0002050131926121372, + "loss": 10.157, + "step": 340 + }, + { + "epoch": 1.26, + "learning_rate": 0.0001970976253298153, + "loss": 9.2541, + "step": 360 + }, + { + "epoch": 1.33, + "learning_rate": 0.00018918205804749339, + "loss": 9.0145, + "step": 380 + }, + { + "epoch": 1.4, + "learning_rate": 0.00018126649076517147, + "loss": 8.6491, + "step": 400 + }, + { + "epoch": 1.4, + "eval_loss": NaN, + "eval_runtime": 39.8237, + "eval_samples_per_second": 50.221, + "eval_steps_per_second": 1.582, + "step": 400 + }, + { + "epoch": 1.47, + "learning_rate": 0.0001733509234828496, + "loss": 6.757, + "step": 420 + }, + { + "epoch": 1.54, + "learning_rate": 0.0001654353562005277, + "loss": 4.1081, + "step": 440 + }, + { + "epoch": 1.61, + "learning_rate": 0.00015751978891820578, + "loss": 19.5577, + "step": 460 + }, + { + "epoch": 1.68, + "learning_rate": 0.0001496042216358839, + "loss": 14.4416, + "step": 480 + }, + { + "epoch": 1.75, + "learning_rate": 0.00014168865435356198, + "loss": 8.4368, + "step": 500 + }, + { + "epoch": 1.82, + "learning_rate": 0.0001337730870712401, + "loss": 5.4915, + "step": 520 + }, + { + "epoch": 1.89, + "learning_rate": 0.00012585751978891818, + "loss": 28.9042, + "step": 540 + }, + { + "epoch": 1.96, + "learning_rate": 0.00011794195250659629, + "loss": 25.3276, + "step": 560 + }, + { + "epoch": 2.03, + "learning_rate": 0.0001100263852242744, + "loss": 8.954, + "step": 580 + }, + { + "epoch": 2.1, + "learning_rate": 0.0001021108179419525, + "loss": 8.7963, + "step": 600 + }, + { + "epoch": 2.1, + "eval_loss": NaN, + "eval_runtime": 39.412, + "eval_samples_per_second": 50.746, + "eval_steps_per_second": 1.598, + "step": 600 + }, + { + "epoch": 2.17, + "learning_rate": 9.41952506596306e-05, + "loss": 17.8767, + "step": 620 + }, + { + "epoch": 2.24, + "learning_rate": 8.62796833773087e-05, + "loss": 19.0455, + "step": 640 + }, + { + "epoch": 2.31, + "learning_rate": 7.836411609498681e-05, + "loss": 6.1526, + "step": 660 + }, + { + "epoch": 2.38, + "learning_rate": 7.044854881266491e-05, + "loss": 8.0983, + "step": 680 + }, + { + "epoch": 2.45, + "learning_rate": 6.2532981530343e-05, + "loss": 7.2175, + "step": 700 + }, + { + "epoch": 2.52, + "learning_rate": 5.461741424802111e-05, + "loss": 12.3619, + "step": 720 + }, + { + "epoch": 2.59, + "learning_rate": 4.670184696569921e-05, + "loss": 6.1864, + "step": 740 + }, + { + "epoch": 2.66, + "learning_rate": 3.8786279683377306e-05, + "loss": 6.9226, + "step": 760 + }, + { + "epoch": 2.73, + "learning_rate": 3.0870712401055405e-05, + "loss": 7.2459, + "step": 780 + }, + { + "epoch": 2.8, + "learning_rate": 2.2955145118733507e-05, + "loss": 7.1272, + "step": 800 + }, + { + "epoch": 2.8, + "eval_loss": NaN, + "eval_runtime": 39.4817, + "eval_samples_per_second": 50.656, + "eval_steps_per_second": 1.596, + "step": 800 + } + ], + "max_steps": 858, + "num_train_epochs": 3, + "total_flos": 7.546667811527983e+18, + "trial_name": null, + "trial_params": null +} diff --git a/adapters/saved_bloomz-7b1-mt_TQA/checkpoint-800/training_args.bin b/adapters/saved_bloomz-7b1-mt_TQA/checkpoint-800/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..1c206c9c2c135387d9c1d84b0d6b9e819390fa07 --- /dev/null +++ b/adapters/saved_bloomz-7b1-mt_TQA/checkpoint-800/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a28b62755d840e4f95c04c24d6777a170d9207a8b4d08a1ae0d1aa732742cc0 +size 3643 diff --git a/adapters/saved_chatglmwild_cn/adapter_config.json b/adapters/saved_chatglmwild_cn/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..b10e70867ad14f5a28f2ab61ae422103303d6e8f --- /dev/null +++ b/adapters/saved_chatglmwild_cn/adapter_config.json @@ -0,0 +1,22 @@ +{ + "base_model_name_or_path": "THUDM/chatglm-6b", + "bias": "none", + "enable_lora": [ + true, + false, + true + ], + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "lora_alpha": 32, + "lora_dropout": 0.1, + "merge_weights": false, + "modules_to_save": null, + "peft_type": "LORA", + "r": 32, + "target_modules": [ + "query_key_value" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/adapters/saved_chatglmwild_cn/adapter_model.bin b/adapters/saved_chatglmwild_cn/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..8303776e1c3e1467ecd950ece355d27950159486 --- /dev/null +++ b/adapters/saved_chatglmwild_cn/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9c3e96a5561337c97ac704c33084fddf81f1dec5f1542b79806f0833c39c7804 +size 58741145 diff --git a/adapters/saved_chatglmwild_cn/checkpoint-200/optimizer.pt b/adapters/saved_chatglmwild_cn/checkpoint-200/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..f99ac9de67a8c4c3de58de51614acefe9f7e87ae --- /dev/null +++ b/adapters/saved_chatglmwild_cn/checkpoint-200/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2bb2d7f1bff4846a37acb6db4df9cee9202e67a12acd5d2f4b698757487d2750 +size 117473861 diff --git a/adapters/saved_chatglmwild_cn/checkpoint-200/pytorch_model.bin b/adapters/saved_chatglmwild_cn/checkpoint-200/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..16932178d2fe749cda49d047aae80eaac4e1e3b4 --- /dev/null +++ b/adapters/saved_chatglmwild_cn/checkpoint-200/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b3307e72a6cc0cba9c3e1e4ead895683c009f037e1a72562beafc60e9affc16c +size 58741145 diff --git a/adapters/saved_chatglmwild_cn/checkpoint-200/rng_state_0.pth b/adapters/saved_chatglmwild_cn/checkpoint-200/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..feed0029adc2cdf491ea8261d2860ae96409abb8 --- /dev/null +++ b/adapters/saved_chatglmwild_cn/checkpoint-200/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bca2530e3c814976e42c6586a48dfa2f14f46808a2d3e57966f40d0b0f6b979e +size 14583 diff --git a/adapters/saved_chatglmwild_cn/checkpoint-200/rng_state_1.pth b/adapters/saved_chatglmwild_cn/checkpoint-200/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..10511f7ac991bb6929352687a6e619697c47185b --- /dev/null +++ b/adapters/saved_chatglmwild_cn/checkpoint-200/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:692a2b1bda18e07c3c09179fa5006d21a1bbc04a9a180a3786f7108ac1628f19 +size 14583 diff --git a/adapters/saved_chatglmwild_cn/checkpoint-200/rng_state_2.pth b/adapters/saved_chatglmwild_cn/checkpoint-200/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..53702dd48c3189ed44d83b1cee28a178b4ecc3ae --- /dev/null +++ b/adapters/saved_chatglmwild_cn/checkpoint-200/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b2c8259195fbc40039a2a906a2577eb32e44dda49e517f35041726ab5dfbb85 +size 14583 diff --git a/adapters/saved_chatglmwild_cn/checkpoint-200/rng_state_3.pth b/adapters/saved_chatglmwild_cn/checkpoint-200/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..639fb4d7cb42c1005f6effcb34e2e093362223d9 --- /dev/null +++ b/adapters/saved_chatglmwild_cn/checkpoint-200/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:078abe08c916dd6d6c154371df7c2c21ba2980260e119006963abeaa2c94f750 +size 14583 diff --git a/adapters/saved_chatglmwild_cn/checkpoint-200/rng_state_4.pth b/adapters/saved_chatglmwild_cn/checkpoint-200/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..51d93536c75114f9a56a3b21bcc42ff0f82ede9f --- /dev/null +++ b/adapters/saved_chatglmwild_cn/checkpoint-200/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8a68e51d05da4461affaccb66c019b91d667c65d8bcb02898e4380d84c475f1d +size 14583 diff --git a/adapters/saved_chatglmwild_cn/checkpoint-200/rng_state_5.pth b/adapters/saved_chatglmwild_cn/checkpoint-200/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..469abb2fa859ed8fef416207ee2d211a7be7c1bb --- /dev/null +++ b/adapters/saved_chatglmwild_cn/checkpoint-200/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:00978662975816cc3fb215367e57fe5fd47b659fb56300d05cf980d7762a0ce6 +size 14583 diff --git a/adapters/saved_chatglmwild_cn/checkpoint-200/rng_state_6.pth b/adapters/saved_chatglmwild_cn/checkpoint-200/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..bb7df7ddad0eded4c34ea507c811eadabbb23e79 --- /dev/null +++ b/adapters/saved_chatglmwild_cn/checkpoint-200/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d7ca0b6c2efedf653db2b7cc2b5845047d6e06a917d2601ee769d3adec0d7c99 +size 14583 diff --git a/adapters/saved_chatglmwild_cn/checkpoint-200/rng_state_7.pth b/adapters/saved_chatglmwild_cn/checkpoint-200/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..127ea73c41cbc586739cb0ad10cee72b83555e70 --- /dev/null +++ b/adapters/saved_chatglmwild_cn/checkpoint-200/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8fc2d8706310ec73874b8209caec3dfbe7902f72fbd7155248441f93f3020ec9 +size 14583 diff --git a/adapters/saved_chatglmwild_cn/checkpoint-200/scaler.pt b/adapters/saved_chatglmwild_cn/checkpoint-200/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..36bff09180c8704f2905476f688599630f2d92bc --- /dev/null +++ b/adapters/saved_chatglmwild_cn/checkpoint-200/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9f86b828c8e28feef514b46aa8b520a07c5de395eb7d908998548197ecf0f542 +size 557 diff --git a/adapters/saved_chatglmwild_cn/checkpoint-200/scheduler.pt b/adapters/saved_chatglmwild_cn/checkpoint-200/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..b4f9e94bd1aed871a59b990eee911274f5329e40 --- /dev/null +++ b/adapters/saved_chatglmwild_cn/checkpoint-200/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:08b67f3dd811038392db68872605006a59a1df3fb949d78fcbd38904de3ca8e4 +size 627 diff --git a/adapters/saved_chatglmwild_cn/checkpoint-200/trainer_state.json b/adapters/saved_chatglmwild_cn/checkpoint-200/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..cf2befc8aeff7581d8febdc5deac87dbd19f5e2e --- /dev/null +++ b/adapters/saved_chatglmwild_cn/checkpoint-200/trainer_state.json @@ -0,0 +1,84 @@ +{ + "best_metric": 2.574218511581421, + "best_model_checkpoint": "/mnt/bn/qingyi-bn-lq/llama/saved_chatglmwild_cn/checkpoint-200", + "epoch": 1.0342598577892697, + "global_step": 200, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.1, + "learning_rate": 3.2000000000000003e-06, + "loss": 3.3366, + "step": 20 + }, + { + "epoch": 0.21, + "learning_rate": 7e-06, + "loss": 3.2662, + "step": 40 + }, + { + "epoch": 0.31, + "learning_rate": 1.1000000000000001e-05, + "loss": 3.0126, + "step": 60 + }, + { + "epoch": 0.41, + "learning_rate": 1.5000000000000002e-05, + "loss": 2.3476, + "step": 80 + }, + { + "epoch": 0.52, + "learning_rate": 1.9e-05, + "loss": 1.2085, + "step": 100 + }, + { + "epoch": 0.62, + "learning_rate": 1.8951048951048952e-05, + "loss": 0.3935, + "step": 120 + }, + { + "epoch": 0.72, + "learning_rate": 1.7552447552447553e-05, + "loss": 0.1678, + "step": 140 + }, + { + "epoch": 0.83, + "learning_rate": 1.6153846153846154e-05, + "loss": 0.0775, + "step": 160 + }, + { + "epoch": 0.93, + "learning_rate": 1.4755244755244758e-05, + "loss": 0.0617, + "step": 180 + }, + { + "epoch": 1.03, + "learning_rate": 1.3356643356643357e-05, + "loss": 0.0479, + "step": 200 + }, + { + "epoch": 1.03, + "eval_loss": 2.574218511581421, + "eval_runtime": 11.8812, + "eval_samples_per_second": 168.332, + "eval_steps_per_second": 2.693, + "step": 200 + } + ], + "max_steps": 386, + "num_train_epochs": 2, + "total_flos": 2.313659325706404e+17, + "trial_name": null, + "trial_params": null +} diff --git a/adapters/saved_chatglmwild_cn/checkpoint-200/training_args.bin b/adapters/saved_chatglmwild_cn/checkpoint-200/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..2931809dcf749b6937e3b4f95c19b2d0705a8836 --- /dev/null +++ b/adapters/saved_chatglmwild_cn/checkpoint-200/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:def5ab529b211d0b91356abe9b26bd75dbaa5138a61deaac33479cbc22858a3f +size 3579 diff --git a/adapters/saved_llama-7b-hf_TQA/adapter_config.json b/adapters/saved_llama-7b-hf_TQA/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..8f033fdd6b7869b39cae4f1c30521c8561ed73ff --- /dev/null +++ b/adapters/saved_llama-7b-hf_TQA/adapter_config.json @@ -0,0 +1,17 @@ +{ + "base_model_name_or_path": "decapoda-research/llama-7b-hf", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "lora_alpha": 16, + "lora_dropout": 0.05, + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/adapters/saved_llama-7b-hf_TQA/adapter_model.bin b/adapters/saved_llama-7b-hf_TQA/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..8710093665dfdb9819e2f8817a1e25a4ccdd9935 --- /dev/null +++ b/adapters/saved_llama-7b-hf_TQA/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e5e1621f48d9ad8feb1d6d31050275f0aafd080c5c07153301fe2f48411f4406 +size 443 diff --git a/adapters/saved_llama-7b-hf_TQA/checkpoint-200/optimizer.pt b/adapters/saved_llama-7b-hf_TQA/checkpoint-200/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..2463ac7679432d4d8e9d5ca82d7c2a31005872f7 --- /dev/null +++ b/adapters/saved_llama-7b-hf_TQA/checkpoint-200/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe0b176b209f397dc922bb07b46c1a6f12d0e7d3379c8ff59918c5fcd7c24dfb +size 33629765 diff --git a/adapters/saved_llama-7b-hf_TQA/checkpoint-200/pytorch_model.bin b/adapters/saved_llama-7b-hf_TQA/checkpoint-200/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..849ca090516d83921653fa474eb03bb50e1fc453 --- /dev/null +++ b/adapters/saved_llama-7b-hf_TQA/checkpoint-200/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dadb113ff9a990374bd0cee17103397c19c78d86f759e66a3e9bca4437bf6bb4 +size 16822989 diff --git a/adapters/saved_llama-7b-hf_TQA/checkpoint-200/rng_state_0.pth b/adapters/saved_llama-7b-hf_TQA/checkpoint-200/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..32a42df9d9b9e4e248964d733c05cb6343f9113d --- /dev/null +++ b/adapters/saved_llama-7b-hf_TQA/checkpoint-200/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c0f06c49598735ed48959790b14b3944dc90cbc23cfde88316cc07b07387dfb +size 14583 diff --git a/adapters/saved_llama-7b-hf_TQA/checkpoint-200/rng_state_1.pth b/adapters/saved_llama-7b-hf_TQA/checkpoint-200/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..c4e44b23e3d76c87ac6208164a7ebba8138a6de0 --- /dev/null +++ b/adapters/saved_llama-7b-hf_TQA/checkpoint-200/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5b636a4c8407a5302df70d3f0b7ce82aa10800e353fb1ee3b530bfb15aedecfb +size 14583 diff --git a/adapters/saved_llama-7b-hf_TQA/checkpoint-200/rng_state_2.pth b/adapters/saved_llama-7b-hf_TQA/checkpoint-200/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..6b5fef6f58961e39c4d6a70dfaffce1cbac3457c --- /dev/null +++ b/adapters/saved_llama-7b-hf_TQA/checkpoint-200/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dab69181ca73aa3d3bca3de2c76371584b595713e3629a9a9adfd62005a33df5 +size 14583 diff --git a/adapters/saved_llama-7b-hf_TQA/checkpoint-200/rng_state_3.pth b/adapters/saved_llama-7b-hf_TQA/checkpoint-200/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..328999eded8949f21cb73509aaa596fb3a849a29 --- /dev/null +++ b/adapters/saved_llama-7b-hf_TQA/checkpoint-200/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f51035e449e8f6d5bdd9ca698f1414c34394e1281019b05bf349655b2f47fa6c +size 14583 diff --git a/adapters/saved_llama-7b-hf_TQA/checkpoint-200/scaler.pt b/adapters/saved_llama-7b-hf_TQA/checkpoint-200/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..78e91886e94a9bc7e5945678e67bd0a700f5fd3a --- /dev/null +++ b/adapters/saved_llama-7b-hf_TQA/checkpoint-200/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:20cb76aa1dd9abcaa8f7e92055edcc5773f0f883f88668ad9ebe1917d319c113 +size 557 diff --git a/adapters/saved_llama-7b-hf_TQA/checkpoint-200/scheduler.pt b/adapters/saved_llama-7b-hf_TQA/checkpoint-200/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..3b78d9d210e56106e0ff8d7aa29ee6202ce658a1 --- /dev/null +++ b/adapters/saved_llama-7b-hf_TQA/checkpoint-200/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:296cbb1e9fd8673eada44080af215a5e766b97b4a916200ba7ffd7504235021b +size 627 diff --git a/adapters/saved_llama-7b-hf_TQA/checkpoint-200/trainer_state.json b/adapters/saved_llama-7b-hf_TQA/checkpoint-200/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..7ae4f72a6d5a2d5bcdacf0f0d32bbcf955e3d5e3 --- /dev/null +++ b/adapters/saved_llama-7b-hf_TQA/checkpoint-200/trainer_state.json @@ -0,0 +1,84 @@ +{ + "best_metric": NaN, + "best_model_checkpoint": "/mnt/bn/qingyi-bn-lq/llama/saved_llama-7b-hf_TQA/checkpoint-200", + "epoch": 0.6990715456034954, + "global_step": 200, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.07, + "learning_rate": 5.1e-05, + "loss": 15278261862.4, + "step": 20 + }, + { + "epoch": 0.14, + "learning_rate": 0.00010799999999999998, + "loss": 155534963507.2, + "step": 40 + }, + { + "epoch": 0.21, + "learning_rate": 0.000168, + "loss": 26388629094.4, + "step": 60 + }, + { + "epoch": 0.28, + "learning_rate": 0.00022799999999999999, + "loss": 1740693708.8, + "step": 80 + }, + { + "epoch": 0.35, + "learning_rate": 0.00028799999999999995, + "loss": 353188315136.0, + "step": 100 + }, + { + "epoch": 0.42, + "learning_rate": 0.0002936675461741425, + "loss": 15921389568.0, + "step": 120 + }, + { + "epoch": 0.49, + "learning_rate": 0.00028575197889182057, + "loss": 8279447142.4, + "step": 140 + }, + { + "epoch": 0.56, + "learning_rate": 0.00027783641160949866, + "loss": 31305360998.4, + "step": 160 + }, + { + "epoch": 0.63, + "learning_rate": 0.00026992084432717674, + "loss": 100943947366.4, + "step": 180 + }, + { + "epoch": 0.7, + "learning_rate": 0.0002620052770448549, + "loss": 1337943859.2, + "step": 200 + }, + { + "epoch": 0.7, + "eval_loss": NaN, + "eval_runtime": 33.6458, + "eval_samples_per_second": 59.443, + "eval_steps_per_second": 1.872, + "step": 200 + } + ], + "max_steps": 858, + "num_train_epochs": 3, + "total_flos": 2.0791367854467318e+18, + "trial_name": null, + "trial_params": null +} diff --git a/adapters/saved_llama-7b-hf_TQA/checkpoint-200/training_args.bin b/adapters/saved_llama-7b-hf_TQA/checkpoint-200/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..7f907c71d42e4be1c2af48f18c1da4e0d020c7d3 --- /dev/null +++ b/adapters/saved_llama-7b-hf_TQA/checkpoint-200/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:71d64800fd6e8ff62c5c7c1beb08db15ef64c167b94aabfeda2506f26162d725 +size 3643 diff --git a/adapters/saved_llama-7b-hf_TQA/checkpoint-600/optimizer.pt b/adapters/saved_llama-7b-hf_TQA/checkpoint-600/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..c16ae91a5f491cef0e257ffc8db9eaf845517f18 --- /dev/null +++ b/adapters/saved_llama-7b-hf_TQA/checkpoint-600/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4e999c16869ea67a4173315990273e27ed64b0df2352f474562bfeab6d6df563 +size 33629893 diff --git a/adapters/saved_llama-7b-hf_TQA/checkpoint-600/pytorch_model.bin b/adapters/saved_llama-7b-hf_TQA/checkpoint-600/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..72dcd587c6cb9521d21bb141161a445c13a4722c --- /dev/null +++ b/adapters/saved_llama-7b-hf_TQA/checkpoint-600/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1dd8f0097fe164b1f6716ebeb87d7b304996c00dc284ab30c788bbda625389d5 +size 16822989 diff --git a/adapters/saved_llama-7b-hf_TQA/checkpoint-600/rng_state_0.pth b/adapters/saved_llama-7b-hf_TQA/checkpoint-600/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..5c517de9bd84d9b44e1a59d345e402761fdf4da4 --- /dev/null +++ b/adapters/saved_llama-7b-hf_TQA/checkpoint-600/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:febd419750af9f1b597dcdc0b15854c4dfff7e749d5b9f51ac7d67766d7830b8 +size 14583 diff --git a/adapters/saved_llama-7b-hf_TQA/checkpoint-600/rng_state_1.pth b/adapters/saved_llama-7b-hf_TQA/checkpoint-600/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..8cde187fc0a1dc5a00b4ab0cca74d4e14e674bb8 --- /dev/null +++ b/adapters/saved_llama-7b-hf_TQA/checkpoint-600/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0d02a5c41b712afd4f5c753f06f7b9909d4123ddc9e2db561b9640e8cf944e93 +size 14583 diff --git a/adapters/saved_llama-7b-hf_TQA/checkpoint-600/rng_state_2.pth b/adapters/saved_llama-7b-hf_TQA/checkpoint-600/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..b07689a5c962bec9155b6e92d1e9e613b37e626c --- /dev/null +++ b/adapters/saved_llama-7b-hf_TQA/checkpoint-600/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a62a7f700223b62c0a854555f16bdd6a0dc6ce86cbda3edcb737e98713cc8066 +size 14583 diff --git a/adapters/saved_llama-7b-hf_TQA/checkpoint-600/rng_state_3.pth b/adapters/saved_llama-7b-hf_TQA/checkpoint-600/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..6b6c2758f3a7bb63b95771939e268e40615eebfb --- /dev/null +++ b/adapters/saved_llama-7b-hf_TQA/checkpoint-600/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a6cba0472a8fd6e17769dc522051ac77f9d2091ada96e6504e34db18cf460bf5 +size 14583 diff --git a/adapters/saved_llama-7b-hf_TQA/checkpoint-600/scaler.pt b/adapters/saved_llama-7b-hf_TQA/checkpoint-600/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..1f47d8710099e2262611450b4452b73d1728178f --- /dev/null +++ b/adapters/saved_llama-7b-hf_TQA/checkpoint-600/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b84f4de3e6b7844aba02a230284df0e8d82136976bd1619df726d690734da6c8 +size 557 diff --git a/adapters/saved_llama-7b-hf_TQA/checkpoint-600/scheduler.pt b/adapters/saved_llama-7b-hf_TQA/checkpoint-600/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..1e0d461ca063e8a9c9ffef43b3bcee9e1fc69cb5 --- /dev/null +++ b/adapters/saved_llama-7b-hf_TQA/checkpoint-600/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fa81e53e7390699d8d45cb1e2cabb7d305c69704136754f6b8ee18434bdb67f2 +size 627 diff --git a/adapters/saved_llama-7b-hf_TQA/checkpoint-600/trainer_state.json b/adapters/saved_llama-7b-hf_TQA/checkpoint-600/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..56f63da45cd6f8053c7d182d99f2221b27db2264 --- /dev/null +++ b/adapters/saved_llama-7b-hf_TQA/checkpoint-600/trainer_state.json @@ -0,0 +1,220 @@ +{ + "best_metric": NaN, + "best_model_checkpoint": "/mnt/bn/qingyi-bn-lq/llama/saved_llama-7b-hf_TQA/checkpoint-200", + "epoch": 2.0972146368104863, + "global_step": 600, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.07, + "learning_rate": 5.1e-05, + "loss": 15278261862.4, + "step": 20 + }, + { + "epoch": 0.14, + "learning_rate": 0.00010799999999999998, + "loss": 155534963507.2, + "step": 40 + }, + { + "epoch": 0.21, + "learning_rate": 0.000168, + "loss": 26388629094.4, + "step": 60 + }, + { + "epoch": 0.28, + "learning_rate": 0.00022799999999999999, + "loss": 1740693708.8, + "step": 80 + }, + { + "epoch": 0.35, + "learning_rate": 0.00028799999999999995, + "loss": 353188315136.0, + "step": 100 + }, + { + "epoch": 0.42, + "learning_rate": 0.0002936675461741425, + "loss": 15921389568.0, + "step": 120 + }, + { + "epoch": 0.49, + "learning_rate": 0.00028575197889182057, + "loss": 8279447142.4, + "step": 140 + }, + { + "epoch": 0.56, + "learning_rate": 0.00027783641160949866, + "loss": 31305360998.4, + "step": 160 + }, + { + "epoch": 0.63, + "learning_rate": 0.00026992084432717674, + "loss": 100943947366.4, + "step": 180 + }, + { + "epoch": 0.7, + "learning_rate": 0.0002620052770448549, + "loss": 1337943859.2, + "step": 200 + }, + { + "epoch": 0.7, + "eval_loss": NaN, + "eval_runtime": 33.6458, + "eval_samples_per_second": 59.443, + "eval_steps_per_second": 1.872, + "step": 200 + }, + { + "epoch": 0.77, + "learning_rate": 0.00025408970976253297, + "loss": 13503238963.2, + "step": 220 + }, + { + "epoch": 0.84, + "learning_rate": 0.00024617414248021105, + "loss": 4071580057.6, + "step": 240 + }, + { + "epoch": 0.91, + "learning_rate": 0.00023825857519788916, + "loss": 2072524800.0, + "step": 260 + }, + { + "epoch": 0.98, + "learning_rate": 0.00023034300791556725, + "loss": 771705753.6, + "step": 280 + }, + { + "epoch": 1.05, + "learning_rate": 0.00022242744063324536, + "loss": 12987478016.0, + "step": 300 + }, + { + "epoch": 1.12, + "learning_rate": 0.00021451187335092345, + "loss": 7428961075.2, + "step": 320 + }, + { + "epoch": 1.19, + "learning_rate": 0.0002065963060686016, + "loss": 20587793612.8, + "step": 340 + }, + { + "epoch": 1.26, + "learning_rate": 0.00019868073878627967, + "loss": 4394051174.4, + "step": 360 + }, + { + "epoch": 1.33, + "learning_rate": 0.00019076517150395776, + "loss": 83019392614.4, + "step": 380 + }, + { + "epoch": 1.4, + "learning_rate": 0.00018284960422163587, + "loss": 15752210022.4, + "step": 400 + }, + { + "epoch": 1.4, + "eval_loss": NaN, + "eval_runtime": 33.6928, + "eval_samples_per_second": 59.36, + "eval_steps_per_second": 1.87, + "step": 400 + }, + { + "epoch": 1.47, + "learning_rate": 0.00017493403693931398, + "loss": 22903119872.0, + "step": 420 + }, + { + "epoch": 1.54, + "learning_rate": 0.00016701846965699207, + "loss": 3805777510.4, + "step": 440 + }, + { + "epoch": 1.61, + "learning_rate": 0.00015910290237467018, + "loss": 8790727065.6, + "step": 460 + }, + { + "epoch": 1.68, + "learning_rate": 0.00015118733509234827, + "loss": 16184664064.0, + "step": 480 + }, + { + "epoch": 1.75, + "learning_rate": 0.00014327176781002638, + "loss": 479622384844.8, + "step": 500 + }, + { + "epoch": 1.82, + "learning_rate": 0.00013535620052770446, + "loss": 59689546547.2, + "step": 520 + }, + { + "epoch": 1.89, + "learning_rate": 0.00012744063324538258, + "loss": 7117320192.0, + "step": 540 + }, + { + "epoch": 1.96, + "learning_rate": 0.00011952506596306068, + "loss": 165947716403.2, + "step": 560 + }, + { + "epoch": 2.03, + "learning_rate": 0.00011160949868073879, + "loss": 1006318694.4, + "step": 580 + }, + { + "epoch": 2.1, + "learning_rate": 0.00010369393139841687, + "loss": 476977612.8, + "step": 600 + }, + { + "epoch": 2.1, + "eval_loss": NaN, + "eval_runtime": 33.5099, + "eval_samples_per_second": 59.684, + "eval_steps_per_second": 1.88, + "step": 600 + } + ], + "max_steps": 858, + "num_train_epochs": 3, + "total_flos": 6.237188632948507e+18, + "trial_name": null, + "trial_params": null +} diff --git a/adapters/saved_llama-7b-hf_TQA/checkpoint-600/training_args.bin b/adapters/saved_llama-7b-hf_TQA/checkpoint-600/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..7f907c71d42e4be1c2af48f18c1da4e0d020c7d3 --- /dev/null +++ b/adapters/saved_llama-7b-hf_TQA/checkpoint-600/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:71d64800fd6e8ff62c5c7c1beb08db15ef64c167b94aabfeda2506f26162d725 +size 3643 diff --git a/adapters/saved_llama-7b-hf_TQA/checkpoint-800/optimizer.pt b/adapters/saved_llama-7b-hf_TQA/checkpoint-800/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..bad92a2b013597cc48c92d8ad3fe043e88994e47 --- /dev/null +++ b/adapters/saved_llama-7b-hf_TQA/checkpoint-800/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f98ca1b3aee807460799430436cb0dcd94a5c1c9a1a5133bca4f2726beaa5609 +size 33629893 diff --git a/adapters/saved_llama-7b-hf_TQA/checkpoint-800/pytorch_model.bin b/adapters/saved_llama-7b-hf_TQA/checkpoint-800/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..0e77b0701e7297048837851a3727aafc6a7fceeb --- /dev/null +++ b/adapters/saved_llama-7b-hf_TQA/checkpoint-800/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d0f9fdbf1ebdd0d8a6cbc5e9a8d698831dbaafec9851bd58856cbacae017e435 +size 16822989 diff --git a/adapters/saved_llama-7b-hf_TQA/checkpoint-800/rng_state_0.pth b/adapters/saved_llama-7b-hf_TQA/checkpoint-800/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..632da91243a10c8e73cdefd3d90d05320186a71b --- /dev/null +++ b/adapters/saved_llama-7b-hf_TQA/checkpoint-800/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d53f855e50c07c258fd9a6c04b58a6fba6eb896525f88a25e719b09e2f84ecb6 +size 14583 diff --git a/adapters/saved_llama-7b-hf_TQA/checkpoint-800/rng_state_1.pth b/adapters/saved_llama-7b-hf_TQA/checkpoint-800/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..d73de3fcb13a0cad4a76e37957df99021d3658f2 --- /dev/null +++ b/adapters/saved_llama-7b-hf_TQA/checkpoint-800/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:03e9f24665bc7cf14214d70cc88edb9a798a7698a72ee83e16e6d148e78adf1b +size 14583 diff --git a/adapters/saved_llama-7b-hf_TQA/checkpoint-800/rng_state_2.pth b/adapters/saved_llama-7b-hf_TQA/checkpoint-800/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..37fc72b9fee2c45fe64f93326719b58ba7365280 --- /dev/null +++ b/adapters/saved_llama-7b-hf_TQA/checkpoint-800/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f99b771a81983ac7aeebcf683bf0f046a0db791e0040b19cada522c2a2e00c76 +size 14583 diff --git a/adapters/saved_llama-7b-hf_TQA/checkpoint-800/rng_state_3.pth b/adapters/saved_llama-7b-hf_TQA/checkpoint-800/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..ef69745228608141d5d559179c239a0baf808b51 --- /dev/null +++ b/adapters/saved_llama-7b-hf_TQA/checkpoint-800/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0820ec1c3e9494083eaf0951a61935ff890b115b2ffd822f213cb7bf29653e20 +size 14583 diff --git a/adapters/saved_llama-7b-hf_TQA/checkpoint-800/scaler.pt b/adapters/saved_llama-7b-hf_TQA/checkpoint-800/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..ca4849d8ad4d7c36b904e6f0ba934870e7921f64 --- /dev/null +++ b/adapters/saved_llama-7b-hf_TQA/checkpoint-800/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5061cf75035729d2e42b9b7ed8612b940981eda2d629be3b715415cdcf958b58 +size 557 diff --git a/adapters/saved_llama-7b-hf_TQA/checkpoint-800/scheduler.pt b/adapters/saved_llama-7b-hf_TQA/checkpoint-800/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..2242752af023b604453fa23786fe3c5457cb9a55 --- /dev/null +++ b/adapters/saved_llama-7b-hf_TQA/checkpoint-800/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:269d11e7e2150ac73be829dd2a16d12fbd9f6bc1295a1c5ae4b615f53213a7d5 +size 627 diff --git a/adapters/saved_llama-7b-hf_TQA/checkpoint-800/trainer_state.json b/adapters/saved_llama-7b-hf_TQA/checkpoint-800/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..7a96bef7d250b8014d15af80470c2316b44d97d8 --- /dev/null +++ b/adapters/saved_llama-7b-hf_TQA/checkpoint-800/trainer_state.json @@ -0,0 +1,288 @@ +{ + "best_metric": NaN, + "best_model_checkpoint": "/mnt/bn/qingyi-bn-lq/llama/saved_llama-7b-hf_TQA/checkpoint-200", + "epoch": 2.7962861824139815, + "global_step": 800, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.07, + "learning_rate": 5.1e-05, + "loss": 15278261862.4, + "step": 20 + }, + { + "epoch": 0.14, + "learning_rate": 0.00010799999999999998, + "loss": 155534963507.2, + "step": 40 + }, + { + "epoch": 0.21, + "learning_rate": 0.000168, + "loss": 26388629094.4, + "step": 60 + }, + { + "epoch": 0.28, + "learning_rate": 0.00022799999999999999, + "loss": 1740693708.8, + "step": 80 + }, + { + "epoch": 0.35, + "learning_rate": 0.00028799999999999995, + "loss": 353188315136.0, + "step": 100 + }, + { + "epoch": 0.42, + "learning_rate": 0.0002936675461741425, + "loss": 15921389568.0, + "step": 120 + }, + { + "epoch": 0.49, + "learning_rate": 0.00028575197889182057, + "loss": 8279447142.4, + "step": 140 + }, + { + "epoch": 0.56, + "learning_rate": 0.00027783641160949866, + "loss": 31305360998.4, + "step": 160 + }, + { + "epoch": 0.63, + "learning_rate": 0.00026992084432717674, + "loss": 100943947366.4, + "step": 180 + }, + { + "epoch": 0.7, + "learning_rate": 0.0002620052770448549, + "loss": 1337943859.2, + "step": 200 + }, + { + "epoch": 0.7, + "eval_loss": NaN, + "eval_runtime": 33.6458, + "eval_samples_per_second": 59.443, + "eval_steps_per_second": 1.872, + "step": 200 + }, + { + "epoch": 0.77, + "learning_rate": 0.00025408970976253297, + "loss": 13503238963.2, + "step": 220 + }, + { + "epoch": 0.84, + "learning_rate": 0.00024617414248021105, + "loss": 4071580057.6, + "step": 240 + }, + { + "epoch": 0.91, + "learning_rate": 0.00023825857519788916, + "loss": 2072524800.0, + "step": 260 + }, + { + "epoch": 0.98, + "learning_rate": 0.00023034300791556725, + "loss": 771705753.6, + "step": 280 + }, + { + "epoch": 1.05, + "learning_rate": 0.00022242744063324536, + "loss": 12987478016.0, + "step": 300 + }, + { + "epoch": 1.12, + "learning_rate": 0.00021451187335092345, + "loss": 7428961075.2, + "step": 320 + }, + { + "epoch": 1.19, + "learning_rate": 0.0002065963060686016, + "loss": 20587793612.8, + "step": 340 + }, + { + "epoch": 1.26, + "learning_rate": 0.00019868073878627967, + "loss": 4394051174.4, + "step": 360 + }, + { + "epoch": 1.33, + "learning_rate": 0.00019076517150395776, + "loss": 83019392614.4, + "step": 380 + }, + { + "epoch": 1.4, + "learning_rate": 0.00018284960422163587, + "loss": 15752210022.4, + "step": 400 + }, + { + "epoch": 1.4, + "eval_loss": NaN, + "eval_runtime": 33.6928, + "eval_samples_per_second": 59.36, + "eval_steps_per_second": 1.87, + "step": 400 + }, + { + "epoch": 1.47, + "learning_rate": 0.00017493403693931398, + "loss": 22903119872.0, + "step": 420 + }, + { + "epoch": 1.54, + "learning_rate": 0.00016701846965699207, + "loss": 3805777510.4, + "step": 440 + }, + { + "epoch": 1.61, + "learning_rate": 0.00015910290237467018, + "loss": 8790727065.6, + "step": 460 + }, + { + "epoch": 1.68, + "learning_rate": 0.00015118733509234827, + "loss": 16184664064.0, + "step": 480 + }, + { + "epoch": 1.75, + "learning_rate": 0.00014327176781002638, + "loss": 479622384844.8, + "step": 500 + }, + { + "epoch": 1.82, + "learning_rate": 0.00013535620052770446, + "loss": 59689546547.2, + "step": 520 + }, + { + "epoch": 1.89, + "learning_rate": 0.00012744063324538258, + "loss": 7117320192.0, + "step": 540 + }, + { + "epoch": 1.96, + "learning_rate": 0.00011952506596306068, + "loss": 165947716403.2, + "step": 560 + }, + { + "epoch": 2.03, + "learning_rate": 0.00011160949868073879, + "loss": 1006318694.4, + "step": 580 + }, + { + "epoch": 2.1, + "learning_rate": 0.00010369393139841687, + "loss": 476977612.8, + "step": 600 + }, + { + "epoch": 2.1, + "eval_loss": NaN, + "eval_runtime": 33.5099, + "eval_samples_per_second": 59.684, + "eval_steps_per_second": 1.88, + "step": 600 + }, + { + "epoch": 2.17, + "learning_rate": 9.577836411609499e-05, + "loss": 13798570393.6, + "step": 620 + }, + { + "epoch": 2.24, + "learning_rate": 8.786279683377308e-05, + "loss": 1035509760.0, + "step": 640 + }, + { + "epoch": 2.31, + "learning_rate": 7.994722955145117e-05, + "loss": 27405954252.8, + "step": 660 + }, + { + "epoch": 2.38, + "learning_rate": 7.203166226912928e-05, + "loss": 26138355302.4, + "step": 680 + }, + { + "epoch": 2.45, + "learning_rate": 6.411609498680738e-05, + "loss": 13618489753.6, + "step": 700 + }, + { + "epoch": 2.52, + "learning_rate": 5.620052770448549e-05, + "loss": 389702352896.0, + "step": 720 + }, + { + "epoch": 2.59, + "learning_rate": 4.828496042216358e-05, + "loss": 128603114700.8, + "step": 740 + }, + { + "epoch": 2.66, + "learning_rate": 4.036939313984169e-05, + "loss": 342587443.2, + "step": 760 + }, + { + "epoch": 2.73, + "learning_rate": 3.2453825857519784e-05, + "loss": 27623394508.8, + "step": 780 + }, + { + "epoch": 2.8, + "learning_rate": 2.4538258575197886e-05, + "loss": 15703303782.4, + "step": 800 + }, + { + "epoch": 2.8, + "eval_loss": NaN, + "eval_runtime": 33.5559, + "eval_samples_per_second": 59.602, + "eval_steps_per_second": 1.877, + "step": 800 + } + ], + "max_steps": 858, + "num_train_epochs": 3, + "total_flos": 8.316244295052952e+18, + "trial_name": null, + "trial_params": null +} diff --git a/adapters/saved_llama-7b-hf_TQA/checkpoint-800/training_args.bin b/adapters/saved_llama-7b-hf_TQA/checkpoint-800/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..7f907c71d42e4be1c2af48f18c1da4e0d020c7d3 --- /dev/null +++ b/adapters/saved_llama-7b-hf_TQA/checkpoint-800/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:71d64800fd6e8ff62c5c7c1beb08db15ef64c167b94aabfeda2506f26162d725 +size 3643 diff --git a/adapters/saved_llama7b_codealpaca/adapter_config.json b/adapters/saved_llama7b_codealpaca/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..8f033fdd6b7869b39cae4f1c30521c8561ed73ff --- /dev/null +++ b/adapters/saved_llama7b_codealpaca/adapter_config.json @@ -0,0 +1,17 @@ +{ + "base_model_name_or_path": "decapoda-research/llama-7b-hf", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "lora_alpha": 16, + "lora_dropout": 0.05, + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/adapters/saved_llama7b_codealpaca/adapter_model.bin b/adapters/saved_llama7b_codealpaca/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..8710093665dfdb9819e2f8817a1e25a4ccdd9935 --- /dev/null +++ b/adapters/saved_llama7b_codealpaca/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e5e1621f48d9ad8feb1d6d31050275f0aafd080c5c07153301fe2f48411f4406 +size 443 diff --git a/adapters/saved_llamaGuanaco/adapter_config.json b/adapters/saved_llamaGuanaco/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..8f033fdd6b7869b39cae4f1c30521c8561ed73ff --- /dev/null +++ b/adapters/saved_llamaGuanaco/adapter_config.json @@ -0,0 +1,17 @@ +{ + "base_model_name_or_path": "decapoda-research/llama-7b-hf", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "lora_alpha": 16, + "lora_dropout": 0.05, + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/adapters/saved_llamaGuanaco/adapter_model.bin b/adapters/saved_llamaGuanaco/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..8710093665dfdb9819e2f8817a1e25a4ccdd9935 --- /dev/null +++ b/adapters/saved_llamaGuanaco/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e5e1621f48d9ad8feb1d6d31050275f0aafd080c5c07153301fe2f48411f4406 +size 443 diff --git a/adapters/saved_llamaGuanaco/checkpoint-400/optimizer.pt b/adapters/saved_llamaGuanaco/checkpoint-400/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..c7e728bf010be6f169800a384e3c549bc2658c56 --- /dev/null +++ b/adapters/saved_llamaGuanaco/checkpoint-400/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:28e3408b165e0083fab18ea519a013d30e0554bf1fc15359305d09f08520d06d +size 33629893 diff --git a/adapters/saved_llamaGuanaco/checkpoint-400/pytorch_model.bin b/adapters/saved_llamaGuanaco/checkpoint-400/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..b9894d41dfc2bd072cef0a044467c2bcd7264b77 --- /dev/null +++ b/adapters/saved_llamaGuanaco/checkpoint-400/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:420d537dcbf514b5d014bd5fec90161302bb7655b6913953a2ad121f15821348 +size 16822989 diff --git a/adapters/saved_llamaGuanaco/checkpoint-400/rng_state_0.pth b/adapters/saved_llamaGuanaco/checkpoint-400/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..fae5eb82ebef63fcb3d026c570b72d8c91bb2c54 --- /dev/null +++ b/adapters/saved_llamaGuanaco/checkpoint-400/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b27103911f883af8138ae1f5d48533f93eac127f3951a648d065f220fe7864ad +size 14583 diff --git a/adapters/saved_llamaGuanaco/checkpoint-400/rng_state_1.pth b/adapters/saved_llamaGuanaco/checkpoint-400/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..a8bbcfc9b6b00fad7e7ce4b8bfe224ebe0d1f6ef --- /dev/null +++ b/adapters/saved_llamaGuanaco/checkpoint-400/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f0445cb3f9f29c489dac1821606b6d13768dada78a81df6dfce78d11d853fae1 +size 14583 diff --git a/adapters/saved_llamaGuanaco/checkpoint-400/rng_state_2.pth b/adapters/saved_llamaGuanaco/checkpoint-400/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..843c5df80f6816213fc83816eaa42048ed9c5d7c --- /dev/null +++ b/adapters/saved_llamaGuanaco/checkpoint-400/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:872213a4d785d801ad6caecd08eaab67ccd79f108d2b6865a928cd973e97e60f +size 14583 diff --git a/adapters/saved_llamaGuanaco/checkpoint-400/rng_state_3.pth b/adapters/saved_llamaGuanaco/checkpoint-400/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..2c967cd9c83dda495685d22de52ec0d2557310a1 --- /dev/null +++ b/adapters/saved_llamaGuanaco/checkpoint-400/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:261e528c4adae4b579e380195448ec4736ffd24699fa7899fe8be0007eb044dd +size 14583 diff --git a/adapters/saved_llamaGuanaco/checkpoint-400/rng_state_4.pth b/adapters/saved_llamaGuanaco/checkpoint-400/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..f3f0b6f8885a137f7052605e5691d0a53cb668a9 --- /dev/null +++ b/adapters/saved_llamaGuanaco/checkpoint-400/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9a11ec0f6315d2d55f8a8faf6eff39bce4c300476781a8728f5ecb03a6851d1f +size 14583 diff --git a/adapters/saved_llamaGuanaco/checkpoint-400/rng_state_5.pth b/adapters/saved_llamaGuanaco/checkpoint-400/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..cb53e135808e1811143e8d8aba07839486ab4104 --- /dev/null +++ b/adapters/saved_llamaGuanaco/checkpoint-400/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:30d2e13dca7683f6b4bec1c492d5917e90e938dc4f7a26065c93ad6c330b33e8 +size 14583 diff --git a/adapters/saved_llamaGuanaco/checkpoint-400/rng_state_6.pth b/adapters/saved_llamaGuanaco/checkpoint-400/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..a3207024ae1300dc7852768c314da73d7b2c1d71 --- /dev/null +++ b/adapters/saved_llamaGuanaco/checkpoint-400/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b814f66d6ecb6fe24982b632786987722e56a801d1c1551c82a97ec63b436a20 +size 14583 diff --git a/adapters/saved_llamaGuanaco/checkpoint-400/rng_state_7.pth b/adapters/saved_llamaGuanaco/checkpoint-400/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..43530875de2413c4a68dcb851eddc69d01a7b893 --- /dev/null +++ b/adapters/saved_llamaGuanaco/checkpoint-400/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7f6d99ce047dbed02cf5760d911676f31a8d863c6050ca77b5f5934650788981 +size 14583 diff --git a/adapters/saved_llamaGuanaco/checkpoint-400/scaler.pt b/adapters/saved_llamaGuanaco/checkpoint-400/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..3ee84e1b463b64410ae6f3d5c680aef0d7e2b14f --- /dev/null +++ b/adapters/saved_llamaGuanaco/checkpoint-400/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc40a4be6a52cee4d7658df4041e660ffa02b0d8b5bd143bb8bb397f7b71b1a5 +size 557 diff --git a/adapters/saved_llamaGuanaco/checkpoint-400/scheduler.pt b/adapters/saved_llamaGuanaco/checkpoint-400/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..b03253219e6e6e7e212c468f7debc759211012c2 --- /dev/null +++ b/adapters/saved_llamaGuanaco/checkpoint-400/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e59bb3fb2cd05cf0703bd9d9f29eb1575b9bcda91b9735074c3b2fc144f91d44 +size 627 diff --git a/adapters/saved_llamaGuanaco/checkpoint-400/trainer_state.json b/adapters/saved_llamaGuanaco/checkpoint-400/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..06cc5c0a0308fdb32333a7010c1bd28536a0683b --- /dev/null +++ b/adapters/saved_llamaGuanaco/checkpoint-400/trainer_state.json @@ -0,0 +1,152 @@ +{ + "best_metric": 1.2675851583480835, + "best_model_checkpoint": "/mnt/bn/qingyi-bn-lq/llama/saved_llamaGuanaco/checkpoint-400", + "epoch": 1.2540413441755658, + "global_step": 400, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.06, + "learning_rate": 5.9999999999999995e-05, + "loss": 1.5389, + "step": 20 + }, + { + "epoch": 0.13, + "learning_rate": 0.00011999999999999999, + "loss": 1.418, + "step": 40 + }, + { + "epoch": 0.19, + "learning_rate": 0.00017999999999999998, + "loss": 1.3622, + "step": 60 + }, + { + "epoch": 0.25, + "learning_rate": 0.00023999999999999998, + "loss": 1.3334, + "step": 80 + }, + { + "epoch": 0.31, + "learning_rate": 0.0003, + "loss": 1.3109, + "step": 100 + }, + { + "epoch": 0.38, + "learning_rate": 0.00029297423887587816, + "loss": 1.3046, + "step": 120 + }, + { + "epoch": 0.44, + "learning_rate": 0.0002859484777517564, + "loss": 1.2948, + "step": 140 + }, + { + "epoch": 0.5, + "learning_rate": 0.00027892271662763465, + "loss": 1.2885, + "step": 160 + }, + { + "epoch": 0.56, + "learning_rate": 0.00027189695550351283, + "loss": 1.2817, + "step": 180 + }, + { + "epoch": 0.63, + "learning_rate": 0.0002648711943793911, + "loss": 1.2706, + "step": 200 + }, + { + "epoch": 0.63, + "eval_loss": 1.2975808382034302, + "eval_runtime": 40.2385, + "eval_samples_per_second": 49.704, + "eval_steps_per_second": 0.795, + "step": 200 + }, + { + "epoch": 0.69, + "learning_rate": 0.00025784543325526926, + "loss": 1.2668, + "step": 220 + }, + { + "epoch": 0.75, + "learning_rate": 0.00025081967213114756, + "loss": 1.2699, + "step": 240 + }, + { + "epoch": 0.82, + "learning_rate": 0.00024379391100702575, + "loss": 1.2554, + "step": 260 + }, + { + "epoch": 0.88, + "learning_rate": 0.00023676814988290396, + "loss": 1.2508, + "step": 280 + }, + { + "epoch": 0.94, + "learning_rate": 0.00022974238875878218, + "loss": 1.2556, + "step": 300 + }, + { + "epoch": 1.0, + "learning_rate": 0.00022271662763466042, + "loss": 1.2497, + "step": 320 + }, + { + "epoch": 1.07, + "learning_rate": 0.00021569086651053863, + "loss": 1.2468, + "step": 340 + }, + { + "epoch": 1.13, + "learning_rate": 0.00020866510538641685, + "loss": 1.2428, + "step": 360 + }, + { + "epoch": 1.19, + "learning_rate": 0.00020163934426229506, + "loss": 1.2397, + "step": 380 + }, + { + "epoch": 1.25, + "learning_rate": 0.0001946135831381733, + "loss": 1.2474, + "step": 400 + }, + { + "epoch": 1.25, + "eval_loss": 1.2675851583480835, + "eval_runtime": 40.2072, + "eval_samples_per_second": 49.742, + "eval_steps_per_second": 0.796, + "step": 400 + } + ], + "max_steps": 954, + "num_train_epochs": 3, + "total_flos": 5.874882938662814e+18, + "trial_name": null, + "trial_params": null +} diff --git a/adapters/saved_llamaGuanaco/checkpoint-400/training_args.bin b/adapters/saved_llamaGuanaco/checkpoint-400/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..ade53f8c2ca8b12e097b4dbefc6b39c0e1c9e1ad --- /dev/null +++ b/adapters/saved_llamaGuanaco/checkpoint-400/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a77703cb921b92e199a32b27489818de820e52081355eefe71c82f331d5760ca +size 3643 diff --git a/adapters/saved_llamaGuanaco/checkpoint-600/optimizer.pt b/adapters/saved_llamaGuanaco/checkpoint-600/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..87e2c787f335f9ed571360c76d1578d3e847d0a3 --- /dev/null +++ b/adapters/saved_llamaGuanaco/checkpoint-600/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:864bf101bdcaba00e49ea1fe3689f4c079d60c21adc9dd07ee4fb7267f1ecf93 +size 33629893 diff --git a/adapters/saved_llamaGuanaco/checkpoint-600/pytorch_model.bin b/adapters/saved_llamaGuanaco/checkpoint-600/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..a24968b062e4f33dfe0606c8185b059f772a1d7c --- /dev/null +++ b/adapters/saved_llamaGuanaco/checkpoint-600/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d306bd3ff858f3ef1ff923bd9368831a06e43923c82d5ede42214872a9d47e64 +size 16822989 diff --git a/adapters/saved_llamaGuanaco/checkpoint-600/rng_state_0.pth b/adapters/saved_llamaGuanaco/checkpoint-600/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..b11456acd685e5b4727c60645df2bac850e630d2 --- /dev/null +++ b/adapters/saved_llamaGuanaco/checkpoint-600/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:464136fdc82e969b73b211c969e9ee7bf47c3ed04cabf93ba9b9f2c2228b18f6 +size 14583 diff --git a/adapters/saved_llamaGuanaco/checkpoint-600/rng_state_1.pth b/adapters/saved_llamaGuanaco/checkpoint-600/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..3d84f494e25177d0c0fa2761b7ae99e467c83f26 --- /dev/null +++ b/adapters/saved_llamaGuanaco/checkpoint-600/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:563f42ae955552647265cd0a490965b75b280996f71a4ec13ff3435adf7d45d6 +size 14583 diff --git a/adapters/saved_llamaGuanaco/checkpoint-600/rng_state_2.pth b/adapters/saved_llamaGuanaco/checkpoint-600/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..5f329483a8568977b13a9a49bdbc4ccc62d1c9d5 --- /dev/null +++ b/adapters/saved_llamaGuanaco/checkpoint-600/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:39014f5d5ae8d909512f6b48fd6c69a31b576c4491b614c3a83f1796e1294e20 +size 14583 diff --git a/adapters/saved_llamaGuanaco/checkpoint-600/rng_state_3.pth b/adapters/saved_llamaGuanaco/checkpoint-600/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..6ef59a4d797b2ac31d620de394f401303df56c66 --- /dev/null +++ b/adapters/saved_llamaGuanaco/checkpoint-600/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d0b98d8cd228b24ac5ccf9e857925df2f95348c5bb4838a453154d998d0304e2 +size 14583 diff --git a/adapters/saved_llamaGuanaco/checkpoint-600/rng_state_4.pth b/adapters/saved_llamaGuanaco/checkpoint-600/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..742dc446571a5b81d7320a4b86a148af19a65c1b --- /dev/null +++ b/adapters/saved_llamaGuanaco/checkpoint-600/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a5a737126ab5d828bcd4d86db173721702c62be44df14c64dac177146158ed6a +size 14583 diff --git a/adapters/saved_llamaGuanaco/checkpoint-600/rng_state_5.pth b/adapters/saved_llamaGuanaco/checkpoint-600/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..576968dfbcd45ca264f6400a7e64b7135af171cc --- /dev/null +++ b/adapters/saved_llamaGuanaco/checkpoint-600/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ca014377f19378ec0707f81bff4c1b58f6d60f49f91e103265daf595810b62e3 +size 14583 diff --git a/adapters/saved_llamaGuanaco/checkpoint-600/rng_state_6.pth b/adapters/saved_llamaGuanaco/checkpoint-600/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..7be0445b754cc14edb0a73bf0f0f2dbcae6d3a22 --- /dev/null +++ b/adapters/saved_llamaGuanaco/checkpoint-600/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0c53bb84f51d54fd8357cb8abda9cc824acb1c28fa515ce74859e72785e0023d +size 14583 diff --git a/adapters/saved_llamaGuanaco/checkpoint-600/rng_state_7.pth b/adapters/saved_llamaGuanaco/checkpoint-600/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..6f435f329a5db95957be3434804aaeca48d9be12 --- /dev/null +++ b/adapters/saved_llamaGuanaco/checkpoint-600/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5b17217b24dec9c0ad3636e69cac4942bd5f7ff7c808491b3b8d4f5a293381ac +size 14583 diff --git a/adapters/saved_llamaGuanaco/checkpoint-600/scaler.pt b/adapters/saved_llamaGuanaco/checkpoint-600/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..842791b612283ceb8e68b64ed8e40e81c5a97bce --- /dev/null +++ b/adapters/saved_llamaGuanaco/checkpoint-600/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5dc9eacfeb00bd0bfeb98934a2309be01be65b288e0d747bbfc423b32679169f +size 557 diff --git a/adapters/saved_llamaGuanaco/checkpoint-600/scheduler.pt b/adapters/saved_llamaGuanaco/checkpoint-600/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..5bf98bb844768f884a6715f00470b29a8891bffc --- /dev/null +++ b/adapters/saved_llamaGuanaco/checkpoint-600/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:35104377074b95b7a8f6c9901bcbd2d0c0a9c91cb8c55f8b380b012c0b0e6e5a +size 627 diff --git a/adapters/saved_llamaGuanaco/checkpoint-600/trainer_state.json b/adapters/saved_llamaGuanaco/checkpoint-600/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..d5b66418aaf028146c92d39d5a3c8395f23ddff0 --- /dev/null +++ b/adapters/saved_llamaGuanaco/checkpoint-600/trainer_state.json @@ -0,0 +1,220 @@ +{ + "best_metric": 1.2517019510269165, + "best_model_checkpoint": "/mnt/bn/qingyi-bn-lq/llama/saved_llamaGuanaco/checkpoint-600", + "epoch": 1.8810620162633487, + "global_step": 600, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.06, + "learning_rate": 5.9999999999999995e-05, + "loss": 1.5389, + "step": 20 + }, + { + "epoch": 0.13, + "learning_rate": 0.00011999999999999999, + "loss": 1.418, + "step": 40 + }, + { + "epoch": 0.19, + "learning_rate": 0.00017999999999999998, + "loss": 1.3622, + "step": 60 + }, + { + "epoch": 0.25, + "learning_rate": 0.00023999999999999998, + "loss": 1.3334, + "step": 80 + }, + { + "epoch": 0.31, + "learning_rate": 0.0003, + "loss": 1.3109, + "step": 100 + }, + { + "epoch": 0.38, + "learning_rate": 0.00029297423887587816, + "loss": 1.3046, + "step": 120 + }, + { + "epoch": 0.44, + "learning_rate": 0.0002859484777517564, + "loss": 1.2948, + "step": 140 + }, + { + "epoch": 0.5, + "learning_rate": 0.00027892271662763465, + "loss": 1.2885, + "step": 160 + }, + { + "epoch": 0.56, + "learning_rate": 0.00027189695550351283, + "loss": 1.2817, + "step": 180 + }, + { + "epoch": 0.63, + "learning_rate": 0.0002648711943793911, + "loss": 1.2706, + "step": 200 + }, + { + "epoch": 0.63, + "eval_loss": 1.2975808382034302, + "eval_runtime": 40.2385, + "eval_samples_per_second": 49.704, + "eval_steps_per_second": 0.795, + "step": 200 + }, + { + "epoch": 0.69, + "learning_rate": 0.00025784543325526926, + "loss": 1.2668, + "step": 220 + }, + { + "epoch": 0.75, + "learning_rate": 0.00025081967213114756, + "loss": 1.2699, + "step": 240 + }, + { + "epoch": 0.82, + "learning_rate": 0.00024379391100702575, + "loss": 1.2554, + "step": 260 + }, + { + "epoch": 0.88, + "learning_rate": 0.00023676814988290396, + "loss": 1.2508, + "step": 280 + }, + { + "epoch": 0.94, + "learning_rate": 0.00022974238875878218, + "loss": 1.2556, + "step": 300 + }, + { + "epoch": 1.0, + "learning_rate": 0.00022271662763466042, + "loss": 1.2497, + "step": 320 + }, + { + "epoch": 1.07, + "learning_rate": 0.00021569086651053863, + "loss": 1.2468, + "step": 340 + }, + { + "epoch": 1.13, + "learning_rate": 0.00020866510538641685, + "loss": 1.2428, + "step": 360 + }, + { + "epoch": 1.19, + "learning_rate": 0.00020163934426229506, + "loss": 1.2397, + "step": 380 + }, + { + "epoch": 1.25, + "learning_rate": 0.0001946135831381733, + "loss": 1.2474, + "step": 400 + }, + { + "epoch": 1.25, + "eval_loss": 1.2675851583480835, + "eval_runtime": 40.2072, + "eval_samples_per_second": 49.742, + "eval_steps_per_second": 0.796, + "step": 400 + }, + { + "epoch": 1.32, + "learning_rate": 0.00018758782201405152, + "loss": 1.2407, + "step": 420 + }, + { + "epoch": 1.38, + "learning_rate": 0.00018056206088992973, + "loss": 1.2348, + "step": 440 + }, + { + "epoch": 1.44, + "learning_rate": 0.00017353629976580795, + "loss": 1.228, + "step": 460 + }, + { + "epoch": 1.5, + "learning_rate": 0.00016651053864168616, + "loss": 1.233, + "step": 480 + }, + { + "epoch": 1.57, + "learning_rate": 0.0001594847775175644, + "loss": 1.2303, + "step": 500 + }, + { + "epoch": 1.63, + "learning_rate": 0.00015245901639344262, + "loss": 1.2255, + "step": 520 + }, + { + "epoch": 1.69, + "learning_rate": 0.00014543325526932083, + "loss": 1.2191, + "step": 540 + }, + { + "epoch": 1.76, + "learning_rate": 0.00013840749414519905, + "loss": 1.2211, + "step": 560 + }, + { + "epoch": 1.82, + "learning_rate": 0.00013138173302107726, + "loss": 1.2188, + "step": 580 + }, + { + "epoch": 1.88, + "learning_rate": 0.0001243559718969555, + "loss": 1.2184, + "step": 600 + }, + { + "epoch": 1.88, + "eval_loss": 1.2517019510269165, + "eval_runtime": 40.3057, + "eval_samples_per_second": 49.621, + "eval_steps_per_second": 0.794, + "step": 600 + } + ], + "max_steps": 954, + "num_train_epochs": 3, + "total_flos": 8.806186092274385e+18, + "trial_name": null, + "trial_params": null +} diff --git a/adapters/saved_llamaGuanaco/checkpoint-600/training_args.bin b/adapters/saved_llamaGuanaco/checkpoint-600/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..ade53f8c2ca8b12e097b4dbefc6b39c0e1c9e1ad --- /dev/null +++ b/adapters/saved_llamaGuanaco/checkpoint-600/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a77703cb921b92e199a32b27489818de820e52081355eefe71c82f331d5760ca +size 3643 diff --git a/adapters/saved_llamaGuanaco/checkpoint-800/optimizer.pt b/adapters/saved_llamaGuanaco/checkpoint-800/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..e5f97b08600c0620b730c23f5356790c22f6db63 --- /dev/null +++ b/adapters/saved_llamaGuanaco/checkpoint-800/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1087238ef7bbb00d2c659376ce67b501887a45b65e0fc70961e1bd4e81912abc +size 33629893 diff --git a/adapters/saved_llamaGuanaco/checkpoint-800/pytorch_model.bin b/adapters/saved_llamaGuanaco/checkpoint-800/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..0f9d7284b27356ee90537bd98844a076f2dad59a --- /dev/null +++ b/adapters/saved_llamaGuanaco/checkpoint-800/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d6da779a55c303e810906453a951d9bf809da5c1f4f3ec0c5503b97c135785ea +size 16822989 diff --git a/adapters/saved_llamaGuanaco/checkpoint-800/rng_state_0.pth b/adapters/saved_llamaGuanaco/checkpoint-800/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..8979c0b0c5f0e4e97f657ede397781ffda05b858 --- /dev/null +++ b/adapters/saved_llamaGuanaco/checkpoint-800/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:22afa9726989fd0fe9ce9d8508e4f80a927cf48b1dd458dd3bf4800822fb1695 +size 14583 diff --git a/adapters/saved_llamaGuanaco/checkpoint-800/rng_state_1.pth b/adapters/saved_llamaGuanaco/checkpoint-800/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..9f1ae845d731e3dfd49a549dfdac3bdbb62e2a13 --- /dev/null +++ b/adapters/saved_llamaGuanaco/checkpoint-800/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f77df2aa01c01d866d1f05483e495717eab679faa1fc7a92e9c12d7fd4887949 +size 14583 diff --git a/adapters/saved_llamaGuanaco/checkpoint-800/rng_state_2.pth b/adapters/saved_llamaGuanaco/checkpoint-800/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..6599261139bc82c9c7cfc3cbee04c729bd01516d --- /dev/null +++ b/adapters/saved_llamaGuanaco/checkpoint-800/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:98ac72e7e6e871f9663c277fbc31392775eef4dd52e890d0c95fa738cd072868 +size 14583 diff --git a/adapters/saved_llamaGuanaco/checkpoint-800/rng_state_3.pth b/adapters/saved_llamaGuanaco/checkpoint-800/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..f05f8327400af4333b497e8a00ac526ed8735097 --- /dev/null +++ b/adapters/saved_llamaGuanaco/checkpoint-800/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5c812a2f5781c1946858fb15defec4b53fa36b5ce50e158ad63d60bf6e77d5cc +size 14583 diff --git a/adapters/saved_llamaGuanaco/checkpoint-800/rng_state_4.pth b/adapters/saved_llamaGuanaco/checkpoint-800/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..d399e1cf189bd0fbda818f08ad3348efb8c556ce --- /dev/null +++ b/adapters/saved_llamaGuanaco/checkpoint-800/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:809691ac638bfd4ad48a04f78b4053b618727a2b9878d58f47ef8b78bd54a8b8 +size 14583 diff --git a/adapters/saved_llamaGuanaco/checkpoint-800/rng_state_5.pth b/adapters/saved_llamaGuanaco/checkpoint-800/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..1d1db3ca83061d180a84c6998b473701b8fe7aa6 --- /dev/null +++ b/adapters/saved_llamaGuanaco/checkpoint-800/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bc2d3a8cec6d9c71547543fe9aed2c0d37f292e3d32ff5c4236f937f1bc38f54 +size 14583 diff --git a/adapters/saved_llamaGuanaco/checkpoint-800/rng_state_6.pth b/adapters/saved_llamaGuanaco/checkpoint-800/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..659f3851584d5f6ce551713e06d6e3cf91d096a5 --- /dev/null +++ b/adapters/saved_llamaGuanaco/checkpoint-800/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f7bc936c3abf64a1dfe3346975821ceeb74f5a4ed7cc1dc9c5d6ed4242f1f458 +size 14583 diff --git a/adapters/saved_llamaGuanaco/checkpoint-800/rng_state_7.pth b/adapters/saved_llamaGuanaco/checkpoint-800/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..ae6b64208206f615378b14c8dbb061ea1b1bfbec --- /dev/null +++ b/adapters/saved_llamaGuanaco/checkpoint-800/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:87373f8f7a3908159e64b63221a7086c94acb1394075c1b73b7595aa6d8159b8 +size 14583 diff --git a/adapters/saved_llamaGuanaco/checkpoint-800/scaler.pt b/adapters/saved_llamaGuanaco/checkpoint-800/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..5e01dd7b5d3a8968bb4c73a805f08f0f65c9b57f --- /dev/null +++ b/adapters/saved_llamaGuanaco/checkpoint-800/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:27ec07a12731ae6f9765d05fe7c8495505f1d0f90b4cc6255a0853fec3970808 +size 557 diff --git a/adapters/saved_llamaGuanaco/checkpoint-800/scheduler.pt b/adapters/saved_llamaGuanaco/checkpoint-800/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..a74fbd22a53f16616ac649d54f225e94a5da7e96 --- /dev/null +++ b/adapters/saved_llamaGuanaco/checkpoint-800/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:36e21e3f0ea6531248bfb23b775f97636ed373d0b41966e599a6fd8a249f7e48 +size 627 diff --git a/adapters/saved_llamaGuanaco/checkpoint-800/trainer_state.json b/adapters/saved_llamaGuanaco/checkpoint-800/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..3ecb99a896a24c263f1d7f0cb7918c4c4849c4ec --- /dev/null +++ b/adapters/saved_llamaGuanaco/checkpoint-800/trainer_state.json @@ -0,0 +1,288 @@ +{ + "best_metric": 1.2436273097991943, + "best_model_checkpoint": "/mnt/bn/qingyi-bn-lq/llama/saved_llamaGuanaco/checkpoint-800", + "epoch": 2.5080826883511316, + "global_step": 800, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.06, + "learning_rate": 5.9999999999999995e-05, + "loss": 1.5389, + "step": 20 + }, + { + "epoch": 0.13, + "learning_rate": 0.00011999999999999999, + "loss": 1.418, + "step": 40 + }, + { + "epoch": 0.19, + "learning_rate": 0.00017999999999999998, + "loss": 1.3622, + "step": 60 + }, + { + "epoch": 0.25, + "learning_rate": 0.00023999999999999998, + "loss": 1.3334, + "step": 80 + }, + { + "epoch": 0.31, + "learning_rate": 0.0003, + "loss": 1.3109, + "step": 100 + }, + { + "epoch": 0.38, + "learning_rate": 0.00029297423887587816, + "loss": 1.3046, + "step": 120 + }, + { + "epoch": 0.44, + "learning_rate": 0.0002859484777517564, + "loss": 1.2948, + "step": 140 + }, + { + "epoch": 0.5, + "learning_rate": 0.00027892271662763465, + "loss": 1.2885, + "step": 160 + }, + { + "epoch": 0.56, + "learning_rate": 0.00027189695550351283, + "loss": 1.2817, + "step": 180 + }, + { + "epoch": 0.63, + "learning_rate": 0.0002648711943793911, + "loss": 1.2706, + "step": 200 + }, + { + "epoch": 0.63, + "eval_loss": 1.2975808382034302, + "eval_runtime": 40.2385, + "eval_samples_per_second": 49.704, + "eval_steps_per_second": 0.795, + "step": 200 + }, + { + "epoch": 0.69, + "learning_rate": 0.00025784543325526926, + "loss": 1.2668, + "step": 220 + }, + { + "epoch": 0.75, + "learning_rate": 0.00025081967213114756, + "loss": 1.2699, + "step": 240 + }, + { + "epoch": 0.82, + "learning_rate": 0.00024379391100702575, + "loss": 1.2554, + "step": 260 + }, + { + "epoch": 0.88, + "learning_rate": 0.00023676814988290396, + "loss": 1.2508, + "step": 280 + }, + { + "epoch": 0.94, + "learning_rate": 0.00022974238875878218, + "loss": 1.2556, + "step": 300 + }, + { + "epoch": 1.0, + "learning_rate": 0.00022271662763466042, + "loss": 1.2497, + "step": 320 + }, + { + "epoch": 1.07, + "learning_rate": 0.00021569086651053863, + "loss": 1.2468, + "step": 340 + }, + { + "epoch": 1.13, + "learning_rate": 0.00020866510538641685, + "loss": 1.2428, + "step": 360 + }, + { + "epoch": 1.19, + "learning_rate": 0.00020163934426229506, + "loss": 1.2397, + "step": 380 + }, + { + "epoch": 1.25, + "learning_rate": 0.0001946135831381733, + "loss": 1.2474, + "step": 400 + }, + { + "epoch": 1.25, + "eval_loss": 1.2675851583480835, + "eval_runtime": 40.2072, + "eval_samples_per_second": 49.742, + "eval_steps_per_second": 0.796, + "step": 400 + }, + { + "epoch": 1.32, + "learning_rate": 0.00018758782201405152, + "loss": 1.2407, + "step": 420 + }, + { + "epoch": 1.38, + "learning_rate": 0.00018056206088992973, + "loss": 1.2348, + "step": 440 + }, + { + "epoch": 1.44, + "learning_rate": 0.00017353629976580795, + "loss": 1.228, + "step": 460 + }, + { + "epoch": 1.5, + "learning_rate": 0.00016651053864168616, + "loss": 1.233, + "step": 480 + }, + { + "epoch": 1.57, + "learning_rate": 0.0001594847775175644, + "loss": 1.2303, + "step": 500 + }, + { + "epoch": 1.63, + "learning_rate": 0.00015245901639344262, + "loss": 1.2255, + "step": 520 + }, + { + "epoch": 1.69, + "learning_rate": 0.00014543325526932083, + "loss": 1.2191, + "step": 540 + }, + { + "epoch": 1.76, + "learning_rate": 0.00013840749414519905, + "loss": 1.2211, + "step": 560 + }, + { + "epoch": 1.82, + "learning_rate": 0.00013138173302107726, + "loss": 1.2188, + "step": 580 + }, + { + "epoch": 1.88, + "learning_rate": 0.0001243559718969555, + "loss": 1.2184, + "step": 600 + }, + { + "epoch": 1.88, + "eval_loss": 1.2517019510269165, + "eval_runtime": 40.3057, + "eval_samples_per_second": 49.621, + "eval_steps_per_second": 0.794, + "step": 600 + }, + { + "epoch": 1.94, + "learning_rate": 0.0001173302107728337, + "loss": 1.2131, + "step": 620 + }, + { + "epoch": 2.01, + "learning_rate": 0.00011030444964871193, + "loss": 1.2169, + "step": 640 + }, + { + "epoch": 2.07, + "learning_rate": 0.00010327868852459015, + "loss": 1.2186, + "step": 660 + }, + { + "epoch": 2.13, + "learning_rate": 9.625292740046838e-05, + "loss": 1.2122, + "step": 680 + }, + { + "epoch": 2.19, + "learning_rate": 8.922716627634659e-05, + "loss": 1.2148, + "step": 700 + }, + { + "epoch": 2.26, + "learning_rate": 8.220140515222482e-05, + "loss": 1.2135, + "step": 720 + }, + { + "epoch": 2.32, + "learning_rate": 7.517564402810303e-05, + "loss": 1.2096, + "step": 740 + }, + { + "epoch": 2.38, + "learning_rate": 6.814988290398126e-05, + "loss": 1.2073, + "step": 760 + }, + { + "epoch": 2.45, + "learning_rate": 6.112412177985948e-05, + "loss": 1.2102, + "step": 780 + }, + { + "epoch": 2.51, + "learning_rate": 5.40983606557377e-05, + "loss": 1.2069, + "step": 800 + }, + { + "epoch": 2.51, + "eval_loss": 1.2436273097991943, + "eval_runtime": 40.1816, + "eval_samples_per_second": 49.774, + "eval_steps_per_second": 0.796, + "step": 800 + } + ], + "max_steps": 954, + "num_train_epochs": 3, + "total_flos": 1.1739466648829034e+19, + "trial_name": null, + "trial_params": null +} diff --git a/adapters/saved_llamaGuanaco/checkpoint-800/training_args.bin b/adapters/saved_llamaGuanaco/checkpoint-800/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..ade53f8c2ca8b12e097b4dbefc6b39c0e1c9e1ad --- /dev/null +++ b/adapters/saved_llamaGuanaco/checkpoint-800/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a77703cb921b92e199a32b27489818de820e52081355eefe71c82f331d5760ca +size 3643 diff --git a/adapters/saved_llamaHC3_human/adapter_config.json b/adapters/saved_llamaHC3_human/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d07a7b67f6c57422ee3209e5778eb76d5dfe37c6 --- /dev/null +++ b/adapters/saved_llamaHC3_human/adapter_config.json @@ -0,0 +1,17 @@ +{ + "base_model_name_or_path": "/mnt/bn/qingyi-bn-lq/llama/llama-7b-hf/", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "lora_alpha": 16, + "lora_dropout": 0.05, + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/adapters/saved_llamaHC3_human/adapter_model.bin b/adapters/saved_llamaHC3_human/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..8710093665dfdb9819e2f8817a1e25a4ccdd9935 --- /dev/null +++ b/adapters/saved_llamaHC3_human/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e5e1621f48d9ad8feb1d6d31050275f0aafd080c5c07153301fe2f48411f4406 +size 443 diff --git a/adapters/saved_llamaHC3_human/checkpoint-115/optimizer.pt b/adapters/saved_llamaHC3_human/checkpoint-115/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..4fb7e57bd4c5cd110a8c0764f13cd3a21e769cc8 --- /dev/null +++ b/adapters/saved_llamaHC3_human/checkpoint-115/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1d49940f803e15e3fd4286584e13a2b977521541ceaa858445c81877710a1f56 +size 33629765 diff --git a/adapters/saved_llamaHC3_human/checkpoint-115/pytorch_model.bin b/adapters/saved_llamaHC3_human/checkpoint-115/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..a4f38329d7f8244c0ae486500aef7fe18de8942e --- /dev/null +++ b/adapters/saved_llamaHC3_human/checkpoint-115/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e7a2a208a1e246368814d37f2ef741a6333fe0141b48694a3b96155d0b2c7a25 +size 16822989 diff --git a/adapters/saved_llamaHC3_human/checkpoint-115/rng_state_0.pth b/adapters/saved_llamaHC3_human/checkpoint-115/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..d6bf04e927170ed2a71b7349842abd3d22e2a7d1 --- /dev/null +++ b/adapters/saved_llamaHC3_human/checkpoint-115/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5053e961a29bfda43f02154e1c4a9e0c8160eeed8e6580d910dfd6d705ffca7f +size 14583 diff --git a/adapters/saved_llamaHC3_human/checkpoint-115/rng_state_1.pth b/adapters/saved_llamaHC3_human/checkpoint-115/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..3cd115bb3353d9b02d5a3d638f57edf31ff4f830 --- /dev/null +++ b/adapters/saved_llamaHC3_human/checkpoint-115/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4155a8c468c6c8515907e36070cfee95a91638e2c2dc4f349c7a8c4ed4d9d48d +size 14583 diff --git a/adapters/saved_llamaHC3_human/checkpoint-115/rng_state_2.pth b/adapters/saved_llamaHC3_human/checkpoint-115/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..a0ee5dde25ba0f0cb7ebd4a192e80bda167252f6 --- /dev/null +++ b/adapters/saved_llamaHC3_human/checkpoint-115/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cd62f4929a85a224c61b7fede516e0edc5356e4de227b3964336436de7dba7e7 +size 14583 diff --git a/adapters/saved_llamaHC3_human/checkpoint-115/rng_state_3.pth b/adapters/saved_llamaHC3_human/checkpoint-115/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..6036ad7100791e9d43e5a4ff5ce0c39ed115dbc2 --- /dev/null +++ b/adapters/saved_llamaHC3_human/checkpoint-115/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc81bc372be4e556a21cf1e6a9f05e287befedee71b268af2c6d79608e746f90 +size 14583 diff --git a/adapters/saved_llamaHC3_human/checkpoint-115/rng_state_4.pth b/adapters/saved_llamaHC3_human/checkpoint-115/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..8e6c7c437ab99ff9fa702115a4cc8d9dd3fdec8f --- /dev/null +++ b/adapters/saved_llamaHC3_human/checkpoint-115/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b87c8e3260a98ad219e9e193cb2a4c208762efe64dbe49308e9ffe2d0b5e3411 +size 14583 diff --git a/adapters/saved_llamaHC3_human/checkpoint-115/rng_state_5.pth b/adapters/saved_llamaHC3_human/checkpoint-115/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..ac7ee088c77c2102d4e45fc11d518f460f5bae85 --- /dev/null +++ b/adapters/saved_llamaHC3_human/checkpoint-115/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a879119a1cf092e89a47067bf0d2525e153a9c096518733bf14bbf370fd13930 +size 14583 diff --git a/adapters/saved_llamaHC3_human/checkpoint-115/rng_state_6.pth b/adapters/saved_llamaHC3_human/checkpoint-115/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..c34b0842398ad6b8e3043e0fbd0320edfce51b35 --- /dev/null +++ b/adapters/saved_llamaHC3_human/checkpoint-115/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dffaa4f2e2a4f2338807cbb57483f250247591c995f77c6451b4d42c33b3e4a5 +size 14583 diff --git a/adapters/saved_llamaHC3_human/checkpoint-115/rng_state_7.pth b/adapters/saved_llamaHC3_human/checkpoint-115/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..834c547df99bd72885171f18be6f06db77fd8ab0 --- /dev/null +++ b/adapters/saved_llamaHC3_human/checkpoint-115/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3128b45b4861ff2a00c59bb6b24bd7310ca367b5251356fb75b7c66af43fb2d1 +size 14583 diff --git a/adapters/saved_llamaHC3_human/checkpoint-115/scaler.pt b/adapters/saved_llamaHC3_human/checkpoint-115/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..5b5bd4bbf152358a09cf09614e3518d63a063457 --- /dev/null +++ b/adapters/saved_llamaHC3_human/checkpoint-115/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ed79e185ad8720d848bacebd4c4ac2d954da5529ebd7196901c3bfa11c6406d +size 557 diff --git a/adapters/saved_llamaHC3_human/checkpoint-115/scheduler.pt b/adapters/saved_llamaHC3_human/checkpoint-115/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..fc820729f711b8f56231d5bf682464e97d5ffc43 --- /dev/null +++ b/adapters/saved_llamaHC3_human/checkpoint-115/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6e068b249b5ca435101ef3909af9ae268f5e48cb43c6f4ec869fd7d330e41192 +size 627 diff --git a/adapters/saved_llamaHC3_human/checkpoint-115/trainer_state.json b/adapters/saved_llamaHC3_human/checkpoint-115/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..679c7d3147e1d7c74060ab9726a421d34e936768 --- /dev/null +++ b/adapters/saved_llamaHC3_human/checkpoint-115/trainer_state.json @@ -0,0 +1,230 @@ +{ + "best_metric": 2.126908779144287, + "best_model_checkpoint": "/mnt/bn/qingyi-bn-lq/llama/saved_llamaHC3_human/checkpoint-115", + "epoch": 2.081447963800905, + "global_step": 115, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.09, + "eval_loss": 2.3469691276550293, + "eval_runtime": 40.7893, + "eval_samples_per_second": 49.032, + "eval_steps_per_second": 0.785, + "step": 5 + }, + { + "epoch": 0.18, + "eval_loss": 2.254398822784424, + "eval_runtime": 40.7321, + "eval_samples_per_second": 49.101, + "eval_steps_per_second": 0.786, + "step": 10 + }, + { + "epoch": 0.27, + "eval_loss": 2.216655731201172, + "eval_runtime": 40.7948, + "eval_samples_per_second": 49.026, + "eval_steps_per_second": 0.784, + "step": 15 + }, + { + "epoch": 0.36, + "learning_rate": 0.000271875, + "loss": 2.265, + "step": 20 + }, + { + "epoch": 0.36, + "eval_loss": 2.192472219467163, + "eval_runtime": 40.9073, + "eval_samples_per_second": 48.891, + "eval_steps_per_second": 0.782, + "step": 20 + }, + { + "epoch": 0.45, + "eval_loss": 2.1772661209106445, + "eval_runtime": 40.8472, + "eval_samples_per_second": 48.963, + "eval_steps_per_second": 0.783, + "step": 25 + }, + { + "epoch": 0.54, + "eval_loss": 2.1680707931518555, + "eval_runtime": 40.7376, + "eval_samples_per_second": 49.095, + "eval_steps_per_second": 0.786, + "step": 30 + }, + { + "epoch": 0.63, + "eval_loss": 2.15969181060791, + "eval_runtime": 40.7043, + "eval_samples_per_second": 49.135, + "eval_steps_per_second": 0.786, + "step": 35 + }, + { + "epoch": 0.72, + "learning_rate": 0.00023624999999999997, + "loss": 2.1528, + "step": 40 + }, + { + "epoch": 0.72, + "eval_loss": 2.1535723209381104, + "eval_runtime": 40.7472, + "eval_samples_per_second": 49.083, + "eval_steps_per_second": 0.785, + "step": 40 + }, + { + "epoch": 0.81, + "eval_loss": 2.149122476577759, + "eval_runtime": 40.7594, + "eval_samples_per_second": 49.068, + "eval_steps_per_second": 0.785, + "step": 45 + }, + { + "epoch": 0.9, + "eval_loss": 2.1453487873077393, + "eval_runtime": 40.7319, + "eval_samples_per_second": 49.102, + "eval_steps_per_second": 0.786, + "step": 50 + }, + { + "epoch": 1.0, + "eval_loss": 2.142251968383789, + "eval_runtime": 40.8482, + "eval_samples_per_second": 48.962, + "eval_steps_per_second": 0.783, + "step": 55 + }, + { + "epoch": 1.09, + "learning_rate": 0.00019874999999999998, + "loss": 2.1332, + "step": 60 + }, + { + "epoch": 1.09, + "eval_loss": 2.139620065689087, + "eval_runtime": 40.7619, + "eval_samples_per_second": 49.065, + "eval_steps_per_second": 0.785, + "step": 60 + }, + { + "epoch": 1.18, + "eval_loss": 2.137423276901245, + "eval_runtime": 40.783, + "eval_samples_per_second": 49.04, + "eval_steps_per_second": 0.785, + "step": 65 + }, + { + "epoch": 1.27, + "eval_loss": 2.135718822479248, + "eval_runtime": 40.7802, + "eval_samples_per_second": 49.043, + "eval_steps_per_second": 0.785, + "step": 70 + }, + { + "epoch": 1.36, + "eval_loss": 2.134214401245117, + "eval_runtime": 40.7213, + "eval_samples_per_second": 49.114, + "eval_steps_per_second": 0.786, + "step": 75 + }, + { + "epoch": 1.45, + "learning_rate": 0.00016125, + "loss": 2.1152, + "step": 80 + }, + { + "epoch": 1.45, + "eval_loss": 2.1330068111419678, + "eval_runtime": 40.8447, + "eval_samples_per_second": 48.966, + "eval_steps_per_second": 0.783, + "step": 80 + }, + { + "epoch": 1.54, + "eval_loss": 2.131664276123047, + "eval_runtime": 40.9633, + "eval_samples_per_second": 48.824, + "eval_steps_per_second": 0.781, + "step": 85 + }, + { + "epoch": 1.63, + "eval_loss": 2.1305532455444336, + "eval_runtime": 40.7803, + "eval_samples_per_second": 49.043, + "eval_steps_per_second": 0.785, + "step": 90 + }, + { + "epoch": 1.72, + "eval_loss": 2.1295671463012695, + "eval_runtime": 40.8088, + "eval_samples_per_second": 49.009, + "eval_steps_per_second": 0.784, + "step": 95 + }, + { + "epoch": 1.81, + "learning_rate": 0.00012374999999999997, + "loss": 2.1138, + "step": 100 + }, + { + "epoch": 1.81, + "eval_loss": 2.1289596557617188, + "eval_runtime": 40.7573, + "eval_samples_per_second": 49.071, + "eval_steps_per_second": 0.785, + "step": 100 + }, + { + "epoch": 1.9, + "eval_loss": 2.1280689239501953, + "eval_runtime": 40.8322, + "eval_samples_per_second": 48.981, + "eval_steps_per_second": 0.784, + "step": 105 + }, + { + "epoch": 1.99, + "eval_loss": 2.127443552017212, + "eval_runtime": 40.9036, + "eval_samples_per_second": 48.895, + "eval_steps_per_second": 0.782, + "step": 110 + }, + { + "epoch": 2.08, + "eval_loss": 2.126908779144287, + "eval_runtime": 41.0214, + "eval_samples_per_second": 48.755, + "eval_steps_per_second": 0.78, + "step": 115 + } + ], + "max_steps": 165, + "num_train_epochs": 3, + "total_flos": 1.6896341814217277e+18, + "trial_name": null, + "trial_params": null +} diff --git a/adapters/saved_llamaHC3_human/checkpoint-115/training_args.bin b/adapters/saved_llamaHC3_human/checkpoint-115/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..08d435511e54e49bd3b0e2c715e12420f287521d --- /dev/null +++ b/adapters/saved_llamaHC3_human/checkpoint-115/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:004b0b2ef592ca03dc7186e3339fabafe55905860b1dd3e8c499f5470388c62b +size 3643 diff --git a/adapters/saved_llamaHC3_human/checkpoint-120/optimizer.pt b/adapters/saved_llamaHC3_human/checkpoint-120/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..4efb68a3419bd70978a95cc962b69b4d0b5524de --- /dev/null +++ b/adapters/saved_llamaHC3_human/checkpoint-120/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:234d15bbb61c3a4eaa522aa7b6926cf9737f40be3ab0b54a88dda14768d2a2fa +size 33629765 diff --git a/adapters/saved_llamaHC3_human/checkpoint-120/pytorch_model.bin b/adapters/saved_llamaHC3_human/checkpoint-120/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..5ae1d9fe0a3c625c7ad279495b67f0f23be25579 --- /dev/null +++ b/adapters/saved_llamaHC3_human/checkpoint-120/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:23acae97cce7210f2422c515650e7d86deeb670db08e652a0f0f9b8634d5671e +size 16822989 diff --git a/adapters/saved_llamaHC3_human/checkpoint-120/rng_state_0.pth b/adapters/saved_llamaHC3_human/checkpoint-120/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..3f7ae80b4ae15bd731cab7552c335120ebc9a947 --- /dev/null +++ b/adapters/saved_llamaHC3_human/checkpoint-120/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5a1096699174a1d22dc80f36fe940b862f53cb4c09325cfbc5d7f43a73be8c0a +size 14583 diff --git a/adapters/saved_llamaHC3_human/checkpoint-120/rng_state_1.pth b/adapters/saved_llamaHC3_human/checkpoint-120/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..95a2172021d9c4477ea1f5a271999417b3d2d5dd --- /dev/null +++ b/adapters/saved_llamaHC3_human/checkpoint-120/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d55df8e0b73dfddd03059188d057446c6491ec980900d9f4b6d7e312e90695e5 +size 14583 diff --git a/adapters/saved_llamaHC3_human/checkpoint-120/rng_state_2.pth b/adapters/saved_llamaHC3_human/checkpoint-120/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..5147171656fb4ea33a589cf21ab6ffdd0424fc78 --- /dev/null +++ b/adapters/saved_llamaHC3_human/checkpoint-120/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5e661796b852a08cca0dd3b5b60f62a3ef7aa332ffd2a94ae5c36e4c9ad1b147 +size 14583 diff --git a/adapters/saved_llamaHC3_human/checkpoint-120/rng_state_3.pth b/adapters/saved_llamaHC3_human/checkpoint-120/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..68319f22af3ed6ba10cfd05d9924844f361a8625 --- /dev/null +++ b/adapters/saved_llamaHC3_human/checkpoint-120/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eb47e66a55b0b5638c722dc45d341e7ebf0ea5c169662170888b83f2a2d185ac +size 14583 diff --git a/adapters/saved_llamaHC3_human/checkpoint-120/rng_state_4.pth b/adapters/saved_llamaHC3_human/checkpoint-120/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..d344f47b3c4c980a28ee98e5b7317e09d0a1405b --- /dev/null +++ b/adapters/saved_llamaHC3_human/checkpoint-120/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:791fd2fd4005546c778e64316f8b3257bfe5858ca9a942ea5340504615deb993 +size 14583 diff --git a/adapters/saved_llamaHC3_human/checkpoint-120/rng_state_5.pth b/adapters/saved_llamaHC3_human/checkpoint-120/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..5e66e088a0247e27131a035c4873d239ee049e66 --- /dev/null +++ b/adapters/saved_llamaHC3_human/checkpoint-120/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:30d3ce706921cb2ee748dc4ae127697b763221bd7843ba630bbebd21ff0aefff +size 14583 diff --git a/adapters/saved_llamaHC3_human/checkpoint-120/rng_state_6.pth b/adapters/saved_llamaHC3_human/checkpoint-120/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..be72caf0999f48392485ac2d66940deb6894bfa1 --- /dev/null +++ b/adapters/saved_llamaHC3_human/checkpoint-120/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1e12b0768c10fb08bea7b75b0ceb3e9c6a27584105637eb68541c077d889c664 +size 14583 diff --git a/adapters/saved_llamaHC3_human/checkpoint-120/rng_state_7.pth b/adapters/saved_llamaHC3_human/checkpoint-120/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..56f8ae982e5db786b84a11400e5feb5ab4b82543 --- /dev/null +++ b/adapters/saved_llamaHC3_human/checkpoint-120/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c90b343f7d8eede0ff59dee8cc30812d11854fd162069eaaa25fd5b23897a491 +size 14583 diff --git a/adapters/saved_llamaHC3_human/checkpoint-120/scaler.pt b/adapters/saved_llamaHC3_human/checkpoint-120/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..c74773a70b176010bdadeb2f4ec12025cef1c247 --- /dev/null +++ b/adapters/saved_llamaHC3_human/checkpoint-120/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:529c735230f832a64b98afc9ffebd60bd3bc6d1c1d0210533c2f7afd4313e1a4 +size 557 diff --git a/adapters/saved_llamaHC3_human/checkpoint-120/scheduler.pt b/adapters/saved_llamaHC3_human/checkpoint-120/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..c24d57f2567272eab705df74f9180073440c50e5 --- /dev/null +++ b/adapters/saved_llamaHC3_human/checkpoint-120/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4b67d31d1826235232579c59cb233db017f305ff2034f4cd684e41a6957a2f11 +size 627 diff --git a/adapters/saved_llamaHC3_human/checkpoint-120/trainer_state.json b/adapters/saved_llamaHC3_human/checkpoint-120/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..97a8bc743e38899873a7c5140547d527df357276 --- /dev/null +++ b/adapters/saved_llamaHC3_human/checkpoint-120/trainer_state.json @@ -0,0 +1,244 @@ +{ + "best_metric": 2.126199245452881, + "best_model_checkpoint": "/mnt/bn/qingyi-bn-lq/llama/saved_llamaHC3_human/checkpoint-120", + "epoch": 2.171945701357466, + "global_step": 120, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.09, + "eval_loss": 2.3469691276550293, + "eval_runtime": 40.7893, + "eval_samples_per_second": 49.032, + "eval_steps_per_second": 0.785, + "step": 5 + }, + { + "epoch": 0.18, + "eval_loss": 2.254398822784424, + "eval_runtime": 40.7321, + "eval_samples_per_second": 49.101, + "eval_steps_per_second": 0.786, + "step": 10 + }, + { + "epoch": 0.27, + "eval_loss": 2.216655731201172, + "eval_runtime": 40.7948, + "eval_samples_per_second": 49.026, + "eval_steps_per_second": 0.784, + "step": 15 + }, + { + "epoch": 0.36, + "learning_rate": 0.000271875, + "loss": 2.265, + "step": 20 + }, + { + "epoch": 0.36, + "eval_loss": 2.192472219467163, + "eval_runtime": 40.9073, + "eval_samples_per_second": 48.891, + "eval_steps_per_second": 0.782, + "step": 20 + }, + { + "epoch": 0.45, + "eval_loss": 2.1772661209106445, + "eval_runtime": 40.8472, + "eval_samples_per_second": 48.963, + "eval_steps_per_second": 0.783, + "step": 25 + }, + { + "epoch": 0.54, + "eval_loss": 2.1680707931518555, + "eval_runtime": 40.7376, + "eval_samples_per_second": 49.095, + "eval_steps_per_second": 0.786, + "step": 30 + }, + { + "epoch": 0.63, + "eval_loss": 2.15969181060791, + "eval_runtime": 40.7043, + "eval_samples_per_second": 49.135, + "eval_steps_per_second": 0.786, + "step": 35 + }, + { + "epoch": 0.72, + "learning_rate": 0.00023624999999999997, + "loss": 2.1528, + "step": 40 + }, + { + "epoch": 0.72, + "eval_loss": 2.1535723209381104, + "eval_runtime": 40.7472, + "eval_samples_per_second": 49.083, + "eval_steps_per_second": 0.785, + "step": 40 + }, + { + "epoch": 0.81, + "eval_loss": 2.149122476577759, + "eval_runtime": 40.7594, + "eval_samples_per_second": 49.068, + "eval_steps_per_second": 0.785, + "step": 45 + }, + { + "epoch": 0.9, + "eval_loss": 2.1453487873077393, + "eval_runtime": 40.7319, + "eval_samples_per_second": 49.102, + "eval_steps_per_second": 0.786, + "step": 50 + }, + { + "epoch": 1.0, + "eval_loss": 2.142251968383789, + "eval_runtime": 40.8482, + "eval_samples_per_second": 48.962, + "eval_steps_per_second": 0.783, + "step": 55 + }, + { + "epoch": 1.09, + "learning_rate": 0.00019874999999999998, + "loss": 2.1332, + "step": 60 + }, + { + "epoch": 1.09, + "eval_loss": 2.139620065689087, + "eval_runtime": 40.7619, + "eval_samples_per_second": 49.065, + "eval_steps_per_second": 0.785, + "step": 60 + }, + { + "epoch": 1.18, + "eval_loss": 2.137423276901245, + "eval_runtime": 40.783, + "eval_samples_per_second": 49.04, + "eval_steps_per_second": 0.785, + "step": 65 + }, + { + "epoch": 1.27, + "eval_loss": 2.135718822479248, + "eval_runtime": 40.7802, + "eval_samples_per_second": 49.043, + "eval_steps_per_second": 0.785, + "step": 70 + }, + { + "epoch": 1.36, + "eval_loss": 2.134214401245117, + "eval_runtime": 40.7213, + "eval_samples_per_second": 49.114, + "eval_steps_per_second": 0.786, + "step": 75 + }, + { + "epoch": 1.45, + "learning_rate": 0.00016125, + "loss": 2.1152, + "step": 80 + }, + { + "epoch": 1.45, + "eval_loss": 2.1330068111419678, + "eval_runtime": 40.8447, + "eval_samples_per_second": 48.966, + "eval_steps_per_second": 0.783, + "step": 80 + }, + { + "epoch": 1.54, + "eval_loss": 2.131664276123047, + "eval_runtime": 40.9633, + "eval_samples_per_second": 48.824, + "eval_steps_per_second": 0.781, + "step": 85 + }, + { + "epoch": 1.63, + "eval_loss": 2.1305532455444336, + "eval_runtime": 40.7803, + "eval_samples_per_second": 49.043, + "eval_steps_per_second": 0.785, + "step": 90 + }, + { + "epoch": 1.72, + "eval_loss": 2.1295671463012695, + "eval_runtime": 40.8088, + "eval_samples_per_second": 49.009, + "eval_steps_per_second": 0.784, + "step": 95 + }, + { + "epoch": 1.81, + "learning_rate": 0.00012374999999999997, + "loss": 2.1138, + "step": 100 + }, + { + "epoch": 1.81, + "eval_loss": 2.1289596557617188, + "eval_runtime": 40.7573, + "eval_samples_per_second": 49.071, + "eval_steps_per_second": 0.785, + "step": 100 + }, + { + "epoch": 1.9, + "eval_loss": 2.1280689239501953, + "eval_runtime": 40.8322, + "eval_samples_per_second": 48.981, + "eval_steps_per_second": 0.784, + "step": 105 + }, + { + "epoch": 1.99, + "eval_loss": 2.127443552017212, + "eval_runtime": 40.9036, + "eval_samples_per_second": 48.895, + "eval_steps_per_second": 0.782, + "step": 110 + }, + { + "epoch": 2.08, + "eval_loss": 2.126908779144287, + "eval_runtime": 41.0214, + "eval_samples_per_second": 48.755, + "eval_steps_per_second": 0.78, + "step": 115 + }, + { + "epoch": 2.17, + "learning_rate": 8.624999999999998e-05, + "loss": 2.1101, + "step": 120 + }, + { + "epoch": 2.17, + "eval_loss": 2.126199245452881, + "eval_runtime": 40.9707, + "eval_samples_per_second": 48.815, + "eval_steps_per_second": 0.781, + "step": 120 + } + ], + "max_steps": 165, + "num_train_epochs": 3, + "total_flos": 1.7637496231401882e+18, + "trial_name": null, + "trial_params": null +} diff --git a/adapters/saved_llamaHC3_human/checkpoint-120/training_args.bin b/adapters/saved_llamaHC3_human/checkpoint-120/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..08d435511e54e49bd3b0e2c715e12420f287521d --- /dev/null +++ b/adapters/saved_llamaHC3_human/checkpoint-120/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:004b0b2ef592ca03dc7186e3339fabafe55905860b1dd3e8c499f5470388c62b +size 3643 diff --git a/adapters/saved_llamaHC3_human/checkpoint-125/optimizer.pt b/adapters/saved_llamaHC3_human/checkpoint-125/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..8701adaed2e78b073e8bb9b4aa03350bdfee30db --- /dev/null +++ b/adapters/saved_llamaHC3_human/checkpoint-125/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:69315e97a164f1a911d701cffe5f54ada193479c4abe13a279f4edc9adc1b2d8 +size 33629765 diff --git a/adapters/saved_llamaHC3_human/checkpoint-125/pytorch_model.bin b/adapters/saved_llamaHC3_human/checkpoint-125/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..ede7abab29e2e86f2f3b4b85e1a279ab9752a8c6 --- /dev/null +++ b/adapters/saved_llamaHC3_human/checkpoint-125/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d204023e5d593337a74fd7214ca8f2b5e745f86f4d1e1ad0be9e6676fac3bf39 +size 16822989 diff --git a/adapters/saved_llamaHC3_human/checkpoint-125/rng_state_0.pth b/adapters/saved_llamaHC3_human/checkpoint-125/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..381b47390cf3acb1c2f71370ab41a156d24b5138 --- /dev/null +++ b/adapters/saved_llamaHC3_human/checkpoint-125/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8113ec59455e5468c7ade87ebaac61c3a5084961c8f3f24c7827ca0720255f4b +size 14583 diff --git a/adapters/saved_llamaHC3_human/checkpoint-125/rng_state_1.pth b/adapters/saved_llamaHC3_human/checkpoint-125/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..b300df83507ec5424fad282574b48ffe4d07e3c5 --- /dev/null +++ b/adapters/saved_llamaHC3_human/checkpoint-125/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc85ead2a411d59eb1461c94321af877978e06a1a99564100b88796cbc9610fe +size 14583 diff --git a/adapters/saved_llamaHC3_human/checkpoint-125/rng_state_2.pth b/adapters/saved_llamaHC3_human/checkpoint-125/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..7ac968d531aaefafbc2b1a9758356b09dd50163f --- /dev/null +++ b/adapters/saved_llamaHC3_human/checkpoint-125/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a03d96578270ac4e333cc007a1c8c8ecab6b3a04d92ef357a3306033f27d8aa2 +size 14583 diff --git a/adapters/saved_llamaHC3_human/checkpoint-125/rng_state_3.pth b/adapters/saved_llamaHC3_human/checkpoint-125/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..3428b8023d2307d41f8388e54770ee942163e38f --- /dev/null +++ b/adapters/saved_llamaHC3_human/checkpoint-125/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e1e94e9b49e8a7d6100a34aa7ef9b5630ac51ed9ce59bb55d743a09328e098b1 +size 14583 diff --git a/adapters/saved_llamaHC3_human/checkpoint-125/rng_state_4.pth b/adapters/saved_llamaHC3_human/checkpoint-125/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..8eea7401abd2447615f3081352647c4f0c80b413 --- /dev/null +++ b/adapters/saved_llamaHC3_human/checkpoint-125/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f24354e4384b9733e12f2b65347f6fba91c31aa5284c86e4eb0ed61258d51fe7 +size 14583 diff --git a/adapters/saved_llamaHC3_human/checkpoint-125/rng_state_5.pth b/adapters/saved_llamaHC3_human/checkpoint-125/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..73e87ec0a8bd8f86d47cdfff936094d1611f92d8 --- /dev/null +++ b/adapters/saved_llamaHC3_human/checkpoint-125/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:74330d01e8dd3728b00b53dddd8f79dd6574f0024165fab14750701c64737217 +size 14583 diff --git a/adapters/saved_llamaHC3_human/checkpoint-125/rng_state_6.pth b/adapters/saved_llamaHC3_human/checkpoint-125/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..a2372a606d4e09e603fef2c75c4005b894e0b275 --- /dev/null +++ b/adapters/saved_llamaHC3_human/checkpoint-125/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ec939209e3448750ad2944642b83bc7648782ca68691be1229241b1c6ad786eb +size 14583 diff --git a/adapters/saved_llamaHC3_human/checkpoint-125/rng_state_7.pth b/adapters/saved_llamaHC3_human/checkpoint-125/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..44e5fd4727cd2905259f0c781e8379028081e887 --- /dev/null +++ b/adapters/saved_llamaHC3_human/checkpoint-125/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:32e5cfd296bbd06dee9683de7f9a222dfadb14c659d8ab1d6f185ea5ac762c02 +size 14583 diff --git a/adapters/saved_llamaHC3_human/checkpoint-125/scaler.pt b/adapters/saved_llamaHC3_human/checkpoint-125/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..232769224687995e735bd8aa242b8c7f7a70aa84 --- /dev/null +++ b/adapters/saved_llamaHC3_human/checkpoint-125/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b8131f89c5cbaa717b917adf1f74e138486efeb837e12544a5f533c500fac84e +size 557 diff --git a/adapters/saved_llamaHC3_human/checkpoint-125/scheduler.pt b/adapters/saved_llamaHC3_human/checkpoint-125/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..f7a2c7ee8e19bc1d26770a4da2bd5f855a2ccc37 --- /dev/null +++ b/adapters/saved_llamaHC3_human/checkpoint-125/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b1c0ea9f1c0ec53f92ffbee9f95396f4df396795ffcdfaaa8f338f979b8732f5 +size 627 diff --git a/adapters/saved_llamaHC3_human/checkpoint-125/trainer_state.json b/adapters/saved_llamaHC3_human/checkpoint-125/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..033ea41b7e47930b5d339b547e2f946468d280c2 --- /dev/null +++ b/adapters/saved_llamaHC3_human/checkpoint-125/trainer_state.json @@ -0,0 +1,252 @@ +{ + "best_metric": 2.125767230987549, + "best_model_checkpoint": "/mnt/bn/qingyi-bn-lq/llama/saved_llamaHC3_human/checkpoint-125", + "epoch": 2.262443438914027, + "global_step": 125, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.09, + "eval_loss": 2.3469691276550293, + "eval_runtime": 40.7893, + "eval_samples_per_second": 49.032, + "eval_steps_per_second": 0.785, + "step": 5 + }, + { + "epoch": 0.18, + "eval_loss": 2.254398822784424, + "eval_runtime": 40.7321, + "eval_samples_per_second": 49.101, + "eval_steps_per_second": 0.786, + "step": 10 + }, + { + "epoch": 0.27, + "eval_loss": 2.216655731201172, + "eval_runtime": 40.7948, + "eval_samples_per_second": 49.026, + "eval_steps_per_second": 0.784, + "step": 15 + }, + { + "epoch": 0.36, + "learning_rate": 0.000271875, + "loss": 2.265, + "step": 20 + }, + { + "epoch": 0.36, + "eval_loss": 2.192472219467163, + "eval_runtime": 40.9073, + "eval_samples_per_second": 48.891, + "eval_steps_per_second": 0.782, + "step": 20 + }, + { + "epoch": 0.45, + "eval_loss": 2.1772661209106445, + "eval_runtime": 40.8472, + "eval_samples_per_second": 48.963, + "eval_steps_per_second": 0.783, + "step": 25 + }, + { + "epoch": 0.54, + "eval_loss": 2.1680707931518555, + "eval_runtime": 40.7376, + "eval_samples_per_second": 49.095, + "eval_steps_per_second": 0.786, + "step": 30 + }, + { + "epoch": 0.63, + "eval_loss": 2.15969181060791, + "eval_runtime": 40.7043, + "eval_samples_per_second": 49.135, + "eval_steps_per_second": 0.786, + "step": 35 + }, + { + "epoch": 0.72, + "learning_rate": 0.00023624999999999997, + "loss": 2.1528, + "step": 40 + }, + { + "epoch": 0.72, + "eval_loss": 2.1535723209381104, + "eval_runtime": 40.7472, + "eval_samples_per_second": 49.083, + "eval_steps_per_second": 0.785, + "step": 40 + }, + { + "epoch": 0.81, + "eval_loss": 2.149122476577759, + "eval_runtime": 40.7594, + "eval_samples_per_second": 49.068, + "eval_steps_per_second": 0.785, + "step": 45 + }, + { + "epoch": 0.9, + "eval_loss": 2.1453487873077393, + "eval_runtime": 40.7319, + "eval_samples_per_second": 49.102, + "eval_steps_per_second": 0.786, + "step": 50 + }, + { + "epoch": 1.0, + "eval_loss": 2.142251968383789, + "eval_runtime": 40.8482, + "eval_samples_per_second": 48.962, + "eval_steps_per_second": 0.783, + "step": 55 + }, + { + "epoch": 1.09, + "learning_rate": 0.00019874999999999998, + "loss": 2.1332, + "step": 60 + }, + { + "epoch": 1.09, + "eval_loss": 2.139620065689087, + "eval_runtime": 40.7619, + "eval_samples_per_second": 49.065, + "eval_steps_per_second": 0.785, + "step": 60 + }, + { + "epoch": 1.18, + "eval_loss": 2.137423276901245, + "eval_runtime": 40.783, + "eval_samples_per_second": 49.04, + "eval_steps_per_second": 0.785, + "step": 65 + }, + { + "epoch": 1.27, + "eval_loss": 2.135718822479248, + "eval_runtime": 40.7802, + "eval_samples_per_second": 49.043, + "eval_steps_per_second": 0.785, + "step": 70 + }, + { + "epoch": 1.36, + "eval_loss": 2.134214401245117, + "eval_runtime": 40.7213, + "eval_samples_per_second": 49.114, + "eval_steps_per_second": 0.786, + "step": 75 + }, + { + "epoch": 1.45, + "learning_rate": 0.00016125, + "loss": 2.1152, + "step": 80 + }, + { + "epoch": 1.45, + "eval_loss": 2.1330068111419678, + "eval_runtime": 40.8447, + "eval_samples_per_second": 48.966, + "eval_steps_per_second": 0.783, + "step": 80 + }, + { + "epoch": 1.54, + "eval_loss": 2.131664276123047, + "eval_runtime": 40.9633, + "eval_samples_per_second": 48.824, + "eval_steps_per_second": 0.781, + "step": 85 + }, + { + "epoch": 1.63, + "eval_loss": 2.1305532455444336, + "eval_runtime": 40.7803, + "eval_samples_per_second": 49.043, + "eval_steps_per_second": 0.785, + "step": 90 + }, + { + "epoch": 1.72, + "eval_loss": 2.1295671463012695, + "eval_runtime": 40.8088, + "eval_samples_per_second": 49.009, + "eval_steps_per_second": 0.784, + "step": 95 + }, + { + "epoch": 1.81, + "learning_rate": 0.00012374999999999997, + "loss": 2.1138, + "step": 100 + }, + { + "epoch": 1.81, + "eval_loss": 2.1289596557617188, + "eval_runtime": 40.7573, + "eval_samples_per_second": 49.071, + "eval_steps_per_second": 0.785, + "step": 100 + }, + { + "epoch": 1.9, + "eval_loss": 2.1280689239501953, + "eval_runtime": 40.8322, + "eval_samples_per_second": 48.981, + "eval_steps_per_second": 0.784, + "step": 105 + }, + { + "epoch": 1.99, + "eval_loss": 2.127443552017212, + "eval_runtime": 40.9036, + "eval_samples_per_second": 48.895, + "eval_steps_per_second": 0.782, + "step": 110 + }, + { + "epoch": 2.08, + "eval_loss": 2.126908779144287, + "eval_runtime": 41.0214, + "eval_samples_per_second": 48.755, + "eval_steps_per_second": 0.78, + "step": 115 + }, + { + "epoch": 2.17, + "learning_rate": 8.624999999999998e-05, + "loss": 2.1101, + "step": 120 + }, + { + "epoch": 2.17, + "eval_loss": 2.126199245452881, + "eval_runtime": 40.9707, + "eval_samples_per_second": 48.815, + "eval_steps_per_second": 0.781, + "step": 120 + }, + { + "epoch": 2.26, + "eval_loss": 2.125767230987549, + "eval_runtime": 40.7834, + "eval_samples_per_second": 49.04, + "eval_steps_per_second": 0.785, + "step": 125 + } + ], + "max_steps": 165, + "num_train_epochs": 3, + "total_flos": 1.8371572112985948e+18, + "trial_name": null, + "trial_params": null +} diff --git a/adapters/saved_llamaHC3_human/checkpoint-125/training_args.bin b/adapters/saved_llamaHC3_human/checkpoint-125/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..08d435511e54e49bd3b0e2c715e12420f287521d --- /dev/null +++ b/adapters/saved_llamaHC3_human/checkpoint-125/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:004b0b2ef592ca03dc7186e3339fabafe55905860b1dd3e8c499f5470388c62b +size 3643 diff --git a/adapters/saved_llamaHC3_human/checkpoint-130/optimizer.pt b/adapters/saved_llamaHC3_human/checkpoint-130/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..71b58e038d18e8abffed38d808a421a26c7d403a --- /dev/null +++ b/adapters/saved_llamaHC3_human/checkpoint-130/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:73c4add70b64f982e75134d9089b6b429c9ce0b05f74b1b8ca8b51462256eeda +size 33629765 diff --git a/adapters/saved_llamaHC3_human/checkpoint-130/pytorch_model.bin b/adapters/saved_llamaHC3_human/checkpoint-130/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..aed512fb883aa466ad829eae414b86edf39c037a --- /dev/null +++ b/adapters/saved_llamaHC3_human/checkpoint-130/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:09ba76792ef41596ea843137e7b5f949d97d9d2810abc4c13348292904859054 +size 16822989 diff --git a/adapters/saved_llamaHC3_human/checkpoint-130/rng_state_0.pth b/adapters/saved_llamaHC3_human/checkpoint-130/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..57139c14c605ab091526f265c5e84bbd126a5bfc --- /dev/null +++ b/adapters/saved_llamaHC3_human/checkpoint-130/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:239d59ba028e0d7de4aa2f5c448a6175950d896090844f678132cad093e5fe98 +size 14583 diff --git a/adapters/saved_llamaHC3_human/checkpoint-130/rng_state_1.pth b/adapters/saved_llamaHC3_human/checkpoint-130/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..3bf3d269252b85773d7d0dd12f52a0321ccf9ee0 --- /dev/null +++ b/adapters/saved_llamaHC3_human/checkpoint-130/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:382e5b4da91b74b6d568f3d56f1570c8e222cb94ebeb24f92484f315378f01e5 +size 14583 diff --git a/adapters/saved_llamaHC3_human/checkpoint-130/rng_state_2.pth b/adapters/saved_llamaHC3_human/checkpoint-130/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..ade5cefae10fc22c0efec82989affab32489590d --- /dev/null +++ b/adapters/saved_llamaHC3_human/checkpoint-130/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2783e433e020eb3e568433a89bb68af24eac573dae8caec46e2f60257b38497c +size 14583 diff --git a/adapters/saved_llamaHC3_human/checkpoint-130/rng_state_3.pth b/adapters/saved_llamaHC3_human/checkpoint-130/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..892c957a01702513ece559ae2600064ec5e008d6 --- /dev/null +++ b/adapters/saved_llamaHC3_human/checkpoint-130/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2f1689755479e912addd2396ac475f71da3fa3b83d9320eacae707b75bf5f8a6 +size 14583 diff --git a/adapters/saved_llamaHC3_human/checkpoint-130/rng_state_4.pth b/adapters/saved_llamaHC3_human/checkpoint-130/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..5150994a2c59f96dff6435fc2c6b855bce90998c --- /dev/null +++ b/adapters/saved_llamaHC3_human/checkpoint-130/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a68ee986cf3d2af868292c77371060b537ccd39e794757a9f94895e8d39d4a4b +size 14583 diff --git a/adapters/saved_llamaHC3_human/checkpoint-130/rng_state_5.pth b/adapters/saved_llamaHC3_human/checkpoint-130/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..083126c0ad02cc4cdf7198c26fbf0f447ef3242a --- /dev/null +++ b/adapters/saved_llamaHC3_human/checkpoint-130/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fd7f8f614f7095eaef87b6e431539898ec773e8838599d331a9f47bba3d5ee7a +size 14583 diff --git a/adapters/saved_llamaHC3_human/checkpoint-130/rng_state_6.pth b/adapters/saved_llamaHC3_human/checkpoint-130/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..2260b93ce4a4aefbddf61a0a95492a51f74193e9 --- /dev/null +++ b/adapters/saved_llamaHC3_human/checkpoint-130/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d63818224c269a4a2883c0fe3dacc8f69561c3b47a46b934c2c4b7ec62e8eee8 +size 14583 diff --git a/adapters/saved_llamaHC3_human/checkpoint-130/rng_state_7.pth b/adapters/saved_llamaHC3_human/checkpoint-130/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..83edbdc3a9ec8a49198dcb0aee237dfaf737c5a9 --- /dev/null +++ b/adapters/saved_llamaHC3_human/checkpoint-130/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7417a1a84511f33139599263e89f14c835b24436ac1708081958852902f55400 +size 14583 diff --git a/adapters/saved_llamaHC3_human/checkpoint-130/scaler.pt b/adapters/saved_llamaHC3_human/checkpoint-130/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..ad6582619ae45a7e8f6141919a12421f2477e5ee --- /dev/null +++ b/adapters/saved_llamaHC3_human/checkpoint-130/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f566ef92c5487901dfc4931d9c61e15e5f385efb7ac2d9719d45f9042d085ebe +size 557 diff --git a/adapters/saved_llamaHC3_human/checkpoint-130/scheduler.pt b/adapters/saved_llamaHC3_human/checkpoint-130/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..338357f58da484f34f273fd861e4ba4ee81cc2bb --- /dev/null +++ b/adapters/saved_llamaHC3_human/checkpoint-130/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d7dcdfb9adf051325f6d8b1916ef59492d84e7ccf56fcad27de8b2f37a1c7a35 +size 627 diff --git a/adapters/saved_llamaHC3_human/checkpoint-130/trainer_state.json b/adapters/saved_llamaHC3_human/checkpoint-130/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..ac7304dd0e2b865e48314f0e394aae3817480eb6 --- /dev/null +++ b/adapters/saved_llamaHC3_human/checkpoint-130/trainer_state.json @@ -0,0 +1,260 @@ +{ + "best_metric": 2.125452995300293, + "best_model_checkpoint": "/mnt/bn/qingyi-bn-lq/llama/saved_llamaHC3_human/checkpoint-130", + "epoch": 2.3529411764705883, + "global_step": 130, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.09, + "eval_loss": 2.3469691276550293, + "eval_runtime": 40.7893, + "eval_samples_per_second": 49.032, + "eval_steps_per_second": 0.785, + "step": 5 + }, + { + "epoch": 0.18, + "eval_loss": 2.254398822784424, + "eval_runtime": 40.7321, + "eval_samples_per_second": 49.101, + "eval_steps_per_second": 0.786, + "step": 10 + }, + { + "epoch": 0.27, + "eval_loss": 2.216655731201172, + "eval_runtime": 40.7948, + "eval_samples_per_second": 49.026, + "eval_steps_per_second": 0.784, + "step": 15 + }, + { + "epoch": 0.36, + "learning_rate": 0.000271875, + "loss": 2.265, + "step": 20 + }, + { + "epoch": 0.36, + "eval_loss": 2.192472219467163, + "eval_runtime": 40.9073, + "eval_samples_per_second": 48.891, + "eval_steps_per_second": 0.782, + "step": 20 + }, + { + "epoch": 0.45, + "eval_loss": 2.1772661209106445, + "eval_runtime": 40.8472, + "eval_samples_per_second": 48.963, + "eval_steps_per_second": 0.783, + "step": 25 + }, + { + "epoch": 0.54, + "eval_loss": 2.1680707931518555, + "eval_runtime": 40.7376, + "eval_samples_per_second": 49.095, + "eval_steps_per_second": 0.786, + "step": 30 + }, + { + "epoch": 0.63, + "eval_loss": 2.15969181060791, + "eval_runtime": 40.7043, + "eval_samples_per_second": 49.135, + "eval_steps_per_second": 0.786, + "step": 35 + }, + { + "epoch": 0.72, + "learning_rate": 0.00023624999999999997, + "loss": 2.1528, + "step": 40 + }, + { + "epoch": 0.72, + "eval_loss": 2.1535723209381104, + "eval_runtime": 40.7472, + "eval_samples_per_second": 49.083, + "eval_steps_per_second": 0.785, + "step": 40 + }, + { + "epoch": 0.81, + "eval_loss": 2.149122476577759, + "eval_runtime": 40.7594, + "eval_samples_per_second": 49.068, + "eval_steps_per_second": 0.785, + "step": 45 + }, + { + "epoch": 0.9, + "eval_loss": 2.1453487873077393, + "eval_runtime": 40.7319, + "eval_samples_per_second": 49.102, + "eval_steps_per_second": 0.786, + "step": 50 + }, + { + "epoch": 1.0, + "eval_loss": 2.142251968383789, + "eval_runtime": 40.8482, + "eval_samples_per_second": 48.962, + "eval_steps_per_second": 0.783, + "step": 55 + }, + { + "epoch": 1.09, + "learning_rate": 0.00019874999999999998, + "loss": 2.1332, + "step": 60 + }, + { + "epoch": 1.09, + "eval_loss": 2.139620065689087, + "eval_runtime": 40.7619, + "eval_samples_per_second": 49.065, + "eval_steps_per_second": 0.785, + "step": 60 + }, + { + "epoch": 1.18, + "eval_loss": 2.137423276901245, + "eval_runtime": 40.783, + "eval_samples_per_second": 49.04, + "eval_steps_per_second": 0.785, + "step": 65 + }, + { + "epoch": 1.27, + "eval_loss": 2.135718822479248, + "eval_runtime": 40.7802, + "eval_samples_per_second": 49.043, + "eval_steps_per_second": 0.785, + "step": 70 + }, + { + "epoch": 1.36, + "eval_loss": 2.134214401245117, + "eval_runtime": 40.7213, + "eval_samples_per_second": 49.114, + "eval_steps_per_second": 0.786, + "step": 75 + }, + { + "epoch": 1.45, + "learning_rate": 0.00016125, + "loss": 2.1152, + "step": 80 + }, + { + "epoch": 1.45, + "eval_loss": 2.1330068111419678, + "eval_runtime": 40.8447, + "eval_samples_per_second": 48.966, + "eval_steps_per_second": 0.783, + "step": 80 + }, + { + "epoch": 1.54, + "eval_loss": 2.131664276123047, + "eval_runtime": 40.9633, + "eval_samples_per_second": 48.824, + "eval_steps_per_second": 0.781, + "step": 85 + }, + { + "epoch": 1.63, + "eval_loss": 2.1305532455444336, + "eval_runtime": 40.7803, + "eval_samples_per_second": 49.043, + "eval_steps_per_second": 0.785, + "step": 90 + }, + { + "epoch": 1.72, + "eval_loss": 2.1295671463012695, + "eval_runtime": 40.8088, + "eval_samples_per_second": 49.009, + "eval_steps_per_second": 0.784, + "step": 95 + }, + { + "epoch": 1.81, + "learning_rate": 0.00012374999999999997, + "loss": 2.1138, + "step": 100 + }, + { + "epoch": 1.81, + "eval_loss": 2.1289596557617188, + "eval_runtime": 40.7573, + "eval_samples_per_second": 49.071, + "eval_steps_per_second": 0.785, + "step": 100 + }, + { + "epoch": 1.9, + "eval_loss": 2.1280689239501953, + "eval_runtime": 40.8322, + "eval_samples_per_second": 48.981, + "eval_steps_per_second": 0.784, + "step": 105 + }, + { + "epoch": 1.99, + "eval_loss": 2.127443552017212, + "eval_runtime": 40.9036, + "eval_samples_per_second": 48.895, + "eval_steps_per_second": 0.782, + "step": 110 + }, + { + "epoch": 2.08, + "eval_loss": 2.126908779144287, + "eval_runtime": 41.0214, + "eval_samples_per_second": 48.755, + "eval_steps_per_second": 0.78, + "step": 115 + }, + { + "epoch": 2.17, + "learning_rate": 8.624999999999998e-05, + "loss": 2.1101, + "step": 120 + }, + { + "epoch": 2.17, + "eval_loss": 2.126199245452881, + "eval_runtime": 40.9707, + "eval_samples_per_second": 48.815, + "eval_steps_per_second": 0.781, + "step": 120 + }, + { + "epoch": 2.26, + "eval_loss": 2.125767230987549, + "eval_runtime": 40.7834, + "eval_samples_per_second": 49.04, + "eval_steps_per_second": 0.785, + "step": 125 + }, + { + "epoch": 2.35, + "eval_loss": 2.125452995300293, + "eval_runtime": 40.8009, + "eval_samples_per_second": 49.019, + "eval_steps_per_second": 0.784, + "step": 130 + } + ], + "max_steps": 165, + "num_train_epochs": 3, + "total_flos": 1.910764566975873e+18, + "trial_name": null, + "trial_params": null +} diff --git a/adapters/saved_llamaHC3_human/checkpoint-130/training_args.bin b/adapters/saved_llamaHC3_human/checkpoint-130/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..08d435511e54e49bd3b0e2c715e12420f287521d --- /dev/null +++ b/adapters/saved_llamaHC3_human/checkpoint-130/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:004b0b2ef592ca03dc7186e3339fabafe55905860b1dd3e8c499f5470388c62b +size 3643 diff --git a/adapters/saved_llamaHC3_human/checkpoint-135/optimizer.pt b/adapters/saved_llamaHC3_human/checkpoint-135/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..6dde961efd298f0b6964501c4c24b245a0ff9d39 --- /dev/null +++ b/adapters/saved_llamaHC3_human/checkpoint-135/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0d59a895b781291e911d9c7c5fa8821c1be7a9d68ac6ebc11662c62bb464d818 +size 33629765 diff --git a/adapters/saved_llamaHC3_human/checkpoint-135/pytorch_model.bin b/adapters/saved_llamaHC3_human/checkpoint-135/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..92b82f02435e6b7cd1e68db506f7bbd41d27deef --- /dev/null +++ b/adapters/saved_llamaHC3_human/checkpoint-135/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7accca4b9b1ccee1a609d647437c14db8431a4c806a5a06b870f18df4fef1167 +size 16822989 diff --git a/adapters/saved_llamaHC3_human/checkpoint-135/rng_state_0.pth b/adapters/saved_llamaHC3_human/checkpoint-135/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..aa4c66ecf06b8381d7db65ad27f943fce22e70ee --- /dev/null +++ b/adapters/saved_llamaHC3_human/checkpoint-135/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6558a1b96bdec3ec5a47144fa1ab94e535fd35656dd416d8e2cbaab6d093965f +size 14583 diff --git a/adapters/saved_llamaHC3_human/checkpoint-135/rng_state_1.pth b/adapters/saved_llamaHC3_human/checkpoint-135/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..a1ab3115d33f6bfd8e47faf47324071268d307d2 --- /dev/null +++ b/adapters/saved_llamaHC3_human/checkpoint-135/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:54bc5dcb45216f6930c95743e1f352a0fa5f84222679994a8f6a97772f636254 +size 14583 diff --git a/adapters/saved_llamaHC3_human/checkpoint-135/rng_state_2.pth b/adapters/saved_llamaHC3_human/checkpoint-135/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..b4f3af981ea590fde41af57fa4461235fb1d5ad6 --- /dev/null +++ b/adapters/saved_llamaHC3_human/checkpoint-135/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5002ac3eb7595d4f8164b2467938e575a725c07482b949a3a3b783013b12b593 +size 14583 diff --git a/adapters/saved_llamaHC3_human/checkpoint-135/rng_state_3.pth b/adapters/saved_llamaHC3_human/checkpoint-135/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..9dce2c379872d8c81a92bbc2a40cfbc9b4771c53 --- /dev/null +++ b/adapters/saved_llamaHC3_human/checkpoint-135/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:354fa16568b06af5da2689f7161b7a5d2d10ef695db3831fda8181f45b45a911 +size 14583 diff --git a/adapters/saved_llamaHC3_human/checkpoint-135/rng_state_4.pth b/adapters/saved_llamaHC3_human/checkpoint-135/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..6dddb1e70127c8e81b51179a21149bf175c67ffe --- /dev/null +++ b/adapters/saved_llamaHC3_human/checkpoint-135/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1e67b6cc49ec455478278fa47b94ba97f39681789d6bed66c88d7277c6eda6f6 +size 14583 diff --git a/adapters/saved_llamaHC3_human/checkpoint-135/rng_state_5.pth b/adapters/saved_llamaHC3_human/checkpoint-135/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..3f55a659e02cbb934c533464a8e8fdb3d39b33f5 --- /dev/null +++ b/adapters/saved_llamaHC3_human/checkpoint-135/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:edc24e6b57e869a5732f98aa3c8d6c0f4e137bebecaf0a12a1da5fe22942f5fc +size 14583 diff --git a/adapters/saved_llamaHC3_human/checkpoint-135/rng_state_6.pth b/adapters/saved_llamaHC3_human/checkpoint-135/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..5d0e428c9df15994f5109e810f568925c5eac4c0 --- /dev/null +++ b/adapters/saved_llamaHC3_human/checkpoint-135/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f7fdc18e548e6b9fca60d6a4d750ed77b8265337ed3168bf0aa1ab242010b3db +size 14583 diff --git a/adapters/saved_llamaHC3_human/checkpoint-135/rng_state_7.pth b/adapters/saved_llamaHC3_human/checkpoint-135/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..aead83cd0c6bfb4954b208963afb270b17500194 --- /dev/null +++ b/adapters/saved_llamaHC3_human/checkpoint-135/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4cce64eea5f6a15c5a068c90cd6b5f3b9cf3c3b74e13ba47be6877ca18bb3b71 +size 14583 diff --git a/adapters/saved_llamaHC3_human/checkpoint-135/scaler.pt b/adapters/saved_llamaHC3_human/checkpoint-135/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..a7d0610eb63812274211d9ffa24c7db3d3c069e2 --- /dev/null +++ b/adapters/saved_llamaHC3_human/checkpoint-135/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fbf57f10ac3e8af7e2fbd8a6ecb776d489fac33a859187e26b8752fa07194f32 +size 557 diff --git a/adapters/saved_llamaHC3_human/checkpoint-135/scheduler.pt b/adapters/saved_llamaHC3_human/checkpoint-135/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..0a58e17ace7fd45e21bf2743af5988af5c82af8c --- /dev/null +++ b/adapters/saved_llamaHC3_human/checkpoint-135/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d700bc404fb458062a98246935664ebdf14c019a0e59aa23a76a1aa4db5d938 +size 627 diff --git a/adapters/saved_llamaHC3_human/checkpoint-135/trainer_state.json b/adapters/saved_llamaHC3_human/checkpoint-135/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..4ef75662ecc4ee5183f6ad1aa90d57807c6103c6 --- /dev/null +++ b/adapters/saved_llamaHC3_human/checkpoint-135/trainer_state.json @@ -0,0 +1,268 @@ +{ + "best_metric": 2.1250967979431152, + "best_model_checkpoint": "/mnt/bn/qingyi-bn-lq/llama/saved_llamaHC3_human/checkpoint-135", + "epoch": 2.4434389140271495, + "global_step": 135, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.09, + "eval_loss": 2.3469691276550293, + "eval_runtime": 40.7893, + "eval_samples_per_second": 49.032, + "eval_steps_per_second": 0.785, + "step": 5 + }, + { + "epoch": 0.18, + "eval_loss": 2.254398822784424, + "eval_runtime": 40.7321, + "eval_samples_per_second": 49.101, + "eval_steps_per_second": 0.786, + "step": 10 + }, + { + "epoch": 0.27, + "eval_loss": 2.216655731201172, + "eval_runtime": 40.7948, + "eval_samples_per_second": 49.026, + "eval_steps_per_second": 0.784, + "step": 15 + }, + { + "epoch": 0.36, + "learning_rate": 0.000271875, + "loss": 2.265, + "step": 20 + }, + { + "epoch": 0.36, + "eval_loss": 2.192472219467163, + "eval_runtime": 40.9073, + "eval_samples_per_second": 48.891, + "eval_steps_per_second": 0.782, + "step": 20 + }, + { + "epoch": 0.45, + "eval_loss": 2.1772661209106445, + "eval_runtime": 40.8472, + "eval_samples_per_second": 48.963, + "eval_steps_per_second": 0.783, + "step": 25 + }, + { + "epoch": 0.54, + "eval_loss": 2.1680707931518555, + "eval_runtime": 40.7376, + "eval_samples_per_second": 49.095, + "eval_steps_per_second": 0.786, + "step": 30 + }, + { + "epoch": 0.63, + "eval_loss": 2.15969181060791, + "eval_runtime": 40.7043, + "eval_samples_per_second": 49.135, + "eval_steps_per_second": 0.786, + "step": 35 + }, + { + "epoch": 0.72, + "learning_rate": 0.00023624999999999997, + "loss": 2.1528, + "step": 40 + }, + { + "epoch": 0.72, + "eval_loss": 2.1535723209381104, + "eval_runtime": 40.7472, + "eval_samples_per_second": 49.083, + "eval_steps_per_second": 0.785, + "step": 40 + }, + { + "epoch": 0.81, + "eval_loss": 2.149122476577759, + "eval_runtime": 40.7594, + "eval_samples_per_second": 49.068, + "eval_steps_per_second": 0.785, + "step": 45 + }, + { + "epoch": 0.9, + "eval_loss": 2.1453487873077393, + "eval_runtime": 40.7319, + "eval_samples_per_second": 49.102, + "eval_steps_per_second": 0.786, + "step": 50 + }, + { + "epoch": 1.0, + "eval_loss": 2.142251968383789, + "eval_runtime": 40.8482, + "eval_samples_per_second": 48.962, + "eval_steps_per_second": 0.783, + "step": 55 + }, + { + "epoch": 1.09, + "learning_rate": 0.00019874999999999998, + "loss": 2.1332, + "step": 60 + }, + { + "epoch": 1.09, + "eval_loss": 2.139620065689087, + "eval_runtime": 40.7619, + "eval_samples_per_second": 49.065, + "eval_steps_per_second": 0.785, + "step": 60 + }, + { + "epoch": 1.18, + "eval_loss": 2.137423276901245, + "eval_runtime": 40.783, + "eval_samples_per_second": 49.04, + "eval_steps_per_second": 0.785, + "step": 65 + }, + { + "epoch": 1.27, + "eval_loss": 2.135718822479248, + "eval_runtime": 40.7802, + "eval_samples_per_second": 49.043, + "eval_steps_per_second": 0.785, + "step": 70 + }, + { + "epoch": 1.36, + "eval_loss": 2.134214401245117, + "eval_runtime": 40.7213, + "eval_samples_per_second": 49.114, + "eval_steps_per_second": 0.786, + "step": 75 + }, + { + "epoch": 1.45, + "learning_rate": 0.00016125, + "loss": 2.1152, + "step": 80 + }, + { + "epoch": 1.45, + "eval_loss": 2.1330068111419678, + "eval_runtime": 40.8447, + "eval_samples_per_second": 48.966, + "eval_steps_per_second": 0.783, + "step": 80 + }, + { + "epoch": 1.54, + "eval_loss": 2.131664276123047, + "eval_runtime": 40.9633, + "eval_samples_per_second": 48.824, + "eval_steps_per_second": 0.781, + "step": 85 + }, + { + "epoch": 1.63, + "eval_loss": 2.1305532455444336, + "eval_runtime": 40.7803, + "eval_samples_per_second": 49.043, + "eval_steps_per_second": 0.785, + "step": 90 + }, + { + "epoch": 1.72, + "eval_loss": 2.1295671463012695, + "eval_runtime": 40.8088, + "eval_samples_per_second": 49.009, + "eval_steps_per_second": 0.784, + "step": 95 + }, + { + "epoch": 1.81, + "learning_rate": 0.00012374999999999997, + "loss": 2.1138, + "step": 100 + }, + { + "epoch": 1.81, + "eval_loss": 2.1289596557617188, + "eval_runtime": 40.7573, + "eval_samples_per_second": 49.071, + "eval_steps_per_second": 0.785, + "step": 100 + }, + { + "epoch": 1.9, + "eval_loss": 2.1280689239501953, + "eval_runtime": 40.8322, + "eval_samples_per_second": 48.981, + "eval_steps_per_second": 0.784, + "step": 105 + }, + { + "epoch": 1.99, + "eval_loss": 2.127443552017212, + "eval_runtime": 40.9036, + "eval_samples_per_second": 48.895, + "eval_steps_per_second": 0.782, + "step": 110 + }, + { + "epoch": 2.08, + "eval_loss": 2.126908779144287, + "eval_runtime": 41.0214, + "eval_samples_per_second": 48.755, + "eval_steps_per_second": 0.78, + "step": 115 + }, + { + "epoch": 2.17, + "learning_rate": 8.624999999999998e-05, + "loss": 2.1101, + "step": 120 + }, + { + "epoch": 2.17, + "eval_loss": 2.126199245452881, + "eval_runtime": 40.9707, + "eval_samples_per_second": 48.815, + "eval_steps_per_second": 0.781, + "step": 120 + }, + { + "epoch": 2.26, + "eval_loss": 2.125767230987549, + "eval_runtime": 40.7834, + "eval_samples_per_second": 49.04, + "eval_steps_per_second": 0.785, + "step": 125 + }, + { + "epoch": 2.35, + "eval_loss": 2.125452995300293, + "eval_runtime": 40.8009, + "eval_samples_per_second": 49.019, + "eval_steps_per_second": 0.784, + "step": 130 + }, + { + "epoch": 2.44, + "eval_loss": 2.1250967979431152, + "eval_runtime": 40.7689, + "eval_samples_per_second": 49.057, + "eval_steps_per_second": 0.785, + "step": 135 + } + ], + "max_steps": 165, + "num_train_epochs": 3, + "total_flos": 1.984028875025285e+18, + "trial_name": null, + "trial_params": null +} diff --git a/adapters/saved_llamaHC3_human/checkpoint-135/training_args.bin b/adapters/saved_llamaHC3_human/checkpoint-135/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..08d435511e54e49bd3b0e2c715e12420f287521d --- /dev/null +++ b/adapters/saved_llamaHC3_human/checkpoint-135/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:004b0b2ef592ca03dc7186e3339fabafe55905860b1dd3e8c499f5470388c62b +size 3643 diff --git a/adapters/saved_llamaHC3_human/checkpoint-140/optimizer.pt b/adapters/saved_llamaHC3_human/checkpoint-140/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..0360b9c07ecb28c56a32c2967547b9945d751903 --- /dev/null +++ b/adapters/saved_llamaHC3_human/checkpoint-140/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8c858c451d2a43da45155a0e26c5bf99d007093623557c987119c507f4b64a12 +size 33629765 diff --git a/adapters/saved_llamaHC3_human/checkpoint-140/pytorch_model.bin b/adapters/saved_llamaHC3_human/checkpoint-140/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..87ad0b85cc8c92d3d90c1a40d8d163f89fcd6a15 --- /dev/null +++ b/adapters/saved_llamaHC3_human/checkpoint-140/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b1d042dc9b305112d0e787f2dd3b1b9608829d290f7722f3289f5a2c10627eda +size 16822989 diff --git a/adapters/saved_llamaHC3_human/checkpoint-140/rng_state_0.pth b/adapters/saved_llamaHC3_human/checkpoint-140/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..5a4d775866b43c8c7c232835c6bf4ce02db95686 --- /dev/null +++ b/adapters/saved_llamaHC3_human/checkpoint-140/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a76e00b2caf215010351e2dc4bba9f6ec698bc5119fd1d79e773fc776b02d26e +size 14583 diff --git a/adapters/saved_llamaHC3_human/checkpoint-140/rng_state_1.pth b/adapters/saved_llamaHC3_human/checkpoint-140/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..0284d72b2fa3ec55a439095d3967be52cd7229d1 --- /dev/null +++ b/adapters/saved_llamaHC3_human/checkpoint-140/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f72ae6618093f84e59b823c63f01b259dc1cbc39021654753095e4203b155e45 +size 14583 diff --git a/adapters/saved_llamaHC3_human/checkpoint-140/rng_state_2.pth b/adapters/saved_llamaHC3_human/checkpoint-140/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..0d700f326dff9206b1c86e37a6b30b7021b3f28b --- /dev/null +++ b/adapters/saved_llamaHC3_human/checkpoint-140/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8a99d4eab0a3a022c7b6df1a9c4eb6f4676afd73fb15790804c5b03c6ca5f6aa +size 14583 diff --git a/adapters/saved_llamaHC3_human/checkpoint-140/rng_state_3.pth b/adapters/saved_llamaHC3_human/checkpoint-140/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..f1fc8c55d0e91632a46cdc6e2d02f769b9b7d7bf --- /dev/null +++ b/adapters/saved_llamaHC3_human/checkpoint-140/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f2236beaa76cafc2e4e7abc98b904581290f13e45c9e310582924ad23279078 +size 14583 diff --git a/adapters/saved_llamaHC3_human/checkpoint-140/rng_state_4.pth b/adapters/saved_llamaHC3_human/checkpoint-140/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..6bc4e6daf107cdbf328c2007147b17810f843392 --- /dev/null +++ b/adapters/saved_llamaHC3_human/checkpoint-140/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3cbdd0ec4214bbe5cdb08b1b6b5893faeb66dd5dd0229e840800deb0ce0b3816 +size 14583 diff --git a/adapters/saved_llamaHC3_human/checkpoint-140/rng_state_5.pth b/adapters/saved_llamaHC3_human/checkpoint-140/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..ca998a3b5d79e538ad6dbb87921a3d8fb7adaf41 --- /dev/null +++ b/adapters/saved_llamaHC3_human/checkpoint-140/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5cd8591c53cb91ed598e14c190392e270d25944cd0ab96e8193c111c1543c247 +size 14583 diff --git a/adapters/saved_llamaHC3_human/checkpoint-140/rng_state_6.pth b/adapters/saved_llamaHC3_human/checkpoint-140/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..9e1748a3963f0d311c1de62c601ca85427c38123 --- /dev/null +++ b/adapters/saved_llamaHC3_human/checkpoint-140/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:68eba966270eebd9c29843fb3e4d9d8f380c3023708534e82bffdc2f2c5e7eca +size 14583 diff --git a/adapters/saved_llamaHC3_human/checkpoint-140/rng_state_7.pth b/adapters/saved_llamaHC3_human/checkpoint-140/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..0a683f8430094bcbf2614a2d89b81362b2549558 --- /dev/null +++ b/adapters/saved_llamaHC3_human/checkpoint-140/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8d805a4a88a5a8410f0509ecdd724f1fb5319d40d0d288522fa46209e3d62503 +size 14583 diff --git a/adapters/saved_llamaHC3_human/checkpoint-140/scaler.pt b/adapters/saved_llamaHC3_human/checkpoint-140/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..091cf8a809165ae7f71650b5c406e3bcd0032922 --- /dev/null +++ b/adapters/saved_llamaHC3_human/checkpoint-140/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c8891658daa019107d971bac91f3ec43ad584832f53a6c8eef4b7012fcc58966 +size 557 diff --git a/adapters/saved_llamaHC3_human/checkpoint-140/scheduler.pt b/adapters/saved_llamaHC3_human/checkpoint-140/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..d76bbd2f08afab2b9df87f69e7381bca3f3f0a4a --- /dev/null +++ b/adapters/saved_llamaHC3_human/checkpoint-140/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:35b80d2312f1a96d5f4ee66835a90519be86be2480b55ab5e5ce956a4bb05e0d +size 627 diff --git a/adapters/saved_llamaHC3_human/checkpoint-140/trainer_state.json b/adapters/saved_llamaHC3_human/checkpoint-140/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..89edb2d84aafaaba932b5ea04f4ab8d6eaa0a056 --- /dev/null +++ b/adapters/saved_llamaHC3_human/checkpoint-140/trainer_state.json @@ -0,0 +1,282 @@ +{ + "best_metric": 2.124704599380493, + "best_model_checkpoint": "/mnt/bn/qingyi-bn-lq/llama/saved_llamaHC3_human/checkpoint-140", + "epoch": 2.5339366515837103, + "global_step": 140, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.09, + "eval_loss": 2.3469691276550293, + "eval_runtime": 40.7893, + "eval_samples_per_second": 49.032, + "eval_steps_per_second": 0.785, + "step": 5 + }, + { + "epoch": 0.18, + "eval_loss": 2.254398822784424, + "eval_runtime": 40.7321, + "eval_samples_per_second": 49.101, + "eval_steps_per_second": 0.786, + "step": 10 + }, + { + "epoch": 0.27, + "eval_loss": 2.216655731201172, + "eval_runtime": 40.7948, + "eval_samples_per_second": 49.026, + "eval_steps_per_second": 0.784, + "step": 15 + }, + { + "epoch": 0.36, + "learning_rate": 0.000271875, + "loss": 2.265, + "step": 20 + }, + { + "epoch": 0.36, + "eval_loss": 2.192472219467163, + "eval_runtime": 40.9073, + "eval_samples_per_second": 48.891, + "eval_steps_per_second": 0.782, + "step": 20 + }, + { + "epoch": 0.45, + "eval_loss": 2.1772661209106445, + "eval_runtime": 40.8472, + "eval_samples_per_second": 48.963, + "eval_steps_per_second": 0.783, + "step": 25 + }, + { + "epoch": 0.54, + "eval_loss": 2.1680707931518555, + "eval_runtime": 40.7376, + "eval_samples_per_second": 49.095, + "eval_steps_per_second": 0.786, + "step": 30 + }, + { + "epoch": 0.63, + "eval_loss": 2.15969181060791, + "eval_runtime": 40.7043, + "eval_samples_per_second": 49.135, + "eval_steps_per_second": 0.786, + "step": 35 + }, + { + "epoch": 0.72, + "learning_rate": 0.00023624999999999997, + "loss": 2.1528, + "step": 40 + }, + { + "epoch": 0.72, + "eval_loss": 2.1535723209381104, + "eval_runtime": 40.7472, + "eval_samples_per_second": 49.083, + "eval_steps_per_second": 0.785, + "step": 40 + }, + { + "epoch": 0.81, + "eval_loss": 2.149122476577759, + "eval_runtime": 40.7594, + "eval_samples_per_second": 49.068, + "eval_steps_per_second": 0.785, + "step": 45 + }, + { + "epoch": 0.9, + "eval_loss": 2.1453487873077393, + "eval_runtime": 40.7319, + "eval_samples_per_second": 49.102, + "eval_steps_per_second": 0.786, + "step": 50 + }, + { + "epoch": 1.0, + "eval_loss": 2.142251968383789, + "eval_runtime": 40.8482, + "eval_samples_per_second": 48.962, + "eval_steps_per_second": 0.783, + "step": 55 + }, + { + "epoch": 1.09, + "learning_rate": 0.00019874999999999998, + "loss": 2.1332, + "step": 60 + }, + { + "epoch": 1.09, + "eval_loss": 2.139620065689087, + "eval_runtime": 40.7619, + "eval_samples_per_second": 49.065, + "eval_steps_per_second": 0.785, + "step": 60 + }, + { + "epoch": 1.18, + "eval_loss": 2.137423276901245, + "eval_runtime": 40.783, + "eval_samples_per_second": 49.04, + "eval_steps_per_second": 0.785, + "step": 65 + }, + { + "epoch": 1.27, + "eval_loss": 2.135718822479248, + "eval_runtime": 40.7802, + "eval_samples_per_second": 49.043, + "eval_steps_per_second": 0.785, + "step": 70 + }, + { + "epoch": 1.36, + "eval_loss": 2.134214401245117, + "eval_runtime": 40.7213, + "eval_samples_per_second": 49.114, + "eval_steps_per_second": 0.786, + "step": 75 + }, + { + "epoch": 1.45, + "learning_rate": 0.00016125, + "loss": 2.1152, + "step": 80 + }, + { + "epoch": 1.45, + "eval_loss": 2.1330068111419678, + "eval_runtime": 40.8447, + "eval_samples_per_second": 48.966, + "eval_steps_per_second": 0.783, + "step": 80 + }, + { + "epoch": 1.54, + "eval_loss": 2.131664276123047, + "eval_runtime": 40.9633, + "eval_samples_per_second": 48.824, + "eval_steps_per_second": 0.781, + "step": 85 + }, + { + "epoch": 1.63, + "eval_loss": 2.1305532455444336, + "eval_runtime": 40.7803, + "eval_samples_per_second": 49.043, + "eval_steps_per_second": 0.785, + "step": 90 + }, + { + "epoch": 1.72, + "eval_loss": 2.1295671463012695, + "eval_runtime": 40.8088, + "eval_samples_per_second": 49.009, + "eval_steps_per_second": 0.784, + "step": 95 + }, + { + "epoch": 1.81, + "learning_rate": 0.00012374999999999997, + "loss": 2.1138, + "step": 100 + }, + { + "epoch": 1.81, + "eval_loss": 2.1289596557617188, + "eval_runtime": 40.7573, + "eval_samples_per_second": 49.071, + "eval_steps_per_second": 0.785, + "step": 100 + }, + { + "epoch": 1.9, + "eval_loss": 2.1280689239501953, + "eval_runtime": 40.8322, + "eval_samples_per_second": 48.981, + "eval_steps_per_second": 0.784, + "step": 105 + }, + { + "epoch": 1.99, + "eval_loss": 2.127443552017212, + "eval_runtime": 40.9036, + "eval_samples_per_second": 48.895, + "eval_steps_per_second": 0.782, + "step": 110 + }, + { + "epoch": 2.08, + "eval_loss": 2.126908779144287, + "eval_runtime": 41.0214, + "eval_samples_per_second": 48.755, + "eval_steps_per_second": 0.78, + "step": 115 + }, + { + "epoch": 2.17, + "learning_rate": 8.624999999999998e-05, + "loss": 2.1101, + "step": 120 + }, + { + "epoch": 2.17, + "eval_loss": 2.126199245452881, + "eval_runtime": 40.9707, + "eval_samples_per_second": 48.815, + "eval_steps_per_second": 0.781, + "step": 120 + }, + { + "epoch": 2.26, + "eval_loss": 2.125767230987549, + "eval_runtime": 40.7834, + "eval_samples_per_second": 49.04, + "eval_steps_per_second": 0.785, + "step": 125 + }, + { + "epoch": 2.35, + "eval_loss": 2.125452995300293, + "eval_runtime": 40.8009, + "eval_samples_per_second": 49.019, + "eval_steps_per_second": 0.784, + "step": 130 + }, + { + "epoch": 2.44, + "eval_loss": 2.1250967979431152, + "eval_runtime": 40.7689, + "eval_samples_per_second": 49.057, + "eval_steps_per_second": 0.785, + "step": 135 + }, + { + "epoch": 2.53, + "learning_rate": 4.875e-05, + "loss": 2.108, + "step": 140 + }, + { + "epoch": 2.53, + "eval_loss": 2.124704599380493, + "eval_runtime": 40.7682, + "eval_samples_per_second": 49.058, + "eval_steps_per_second": 0.785, + "step": 140 + } + ], + "max_steps": 165, + "num_train_epochs": 3, + "total_flos": 2.0586863673863045e+18, + "trial_name": null, + "trial_params": null +} diff --git a/adapters/saved_llamaHC3_human/checkpoint-140/training_args.bin b/adapters/saved_llamaHC3_human/checkpoint-140/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..08d435511e54e49bd3b0e2c715e12420f287521d --- /dev/null +++ b/adapters/saved_llamaHC3_human/checkpoint-140/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:004b0b2ef592ca03dc7186e3339fabafe55905860b1dd3e8c499f5470388c62b +size 3643 diff --git a/adapters/saved_llamaHC3_human/checkpoint-145/optimizer.pt b/adapters/saved_llamaHC3_human/checkpoint-145/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..234914f83c1fdbedda731d464fc6a6c4faf76bc6 --- /dev/null +++ b/adapters/saved_llamaHC3_human/checkpoint-145/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:822dbc5ec503290c08e6c9aff0561f87f1da44b908d32f96002c6df5f2c90104 +size 33629765 diff --git a/adapters/saved_llamaHC3_human/checkpoint-145/pytorch_model.bin b/adapters/saved_llamaHC3_human/checkpoint-145/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..a48c0c2459950103d7cbbd787242f63e90114a5d --- /dev/null +++ b/adapters/saved_llamaHC3_human/checkpoint-145/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:633920b11da0014ac1845fe20bdc731c4c424a55e3702c2b5e83b23f679879ff +size 16822989 diff --git a/adapters/saved_llamaHC3_human/checkpoint-145/rng_state_0.pth b/adapters/saved_llamaHC3_human/checkpoint-145/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..89024badce8f64fb2b9f8aaf070a299dbee9bb89 --- /dev/null +++ b/adapters/saved_llamaHC3_human/checkpoint-145/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eb002116c8cb002c0dcbbd108835beea8cd2e19ea5657c901efacea57f66d7b4 +size 14583 diff --git a/adapters/saved_llamaHC3_human/checkpoint-145/rng_state_1.pth b/adapters/saved_llamaHC3_human/checkpoint-145/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..39b0a5815eac131045133ec35ed6de8d7b6035cc --- /dev/null +++ b/adapters/saved_llamaHC3_human/checkpoint-145/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9b50aef4098d79cc91310365408a560064f87e381f7d65c728f3a1f96a5048a2 +size 14583 diff --git a/adapters/saved_llamaHC3_human/checkpoint-145/rng_state_2.pth b/adapters/saved_llamaHC3_human/checkpoint-145/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..f820aadb09547894d20be63a13597a6a9afdad7f --- /dev/null +++ b/adapters/saved_llamaHC3_human/checkpoint-145/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5e973ffd489001eee84274ad4daf651187be58bdb21b894c00835ca79e4f2556 +size 14583 diff --git a/adapters/saved_llamaHC3_human/checkpoint-145/rng_state_3.pth b/adapters/saved_llamaHC3_human/checkpoint-145/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..d67bfd69c8270fe93559db891258cb739a887def --- /dev/null +++ b/adapters/saved_llamaHC3_human/checkpoint-145/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:391f272b48076aaf7ca27b171e90eba5a81bfc8dfcdb0256c1e50a0fc36fd1f9 +size 14583 diff --git a/adapters/saved_llamaHC3_human/checkpoint-145/rng_state_4.pth b/adapters/saved_llamaHC3_human/checkpoint-145/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..6182c05026418e3ac9ecfd62a4f742d0b69fbc1f --- /dev/null +++ b/adapters/saved_llamaHC3_human/checkpoint-145/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d4341cd57e5dc6cb79f8ad3bd9ae37fa03a5cc42117ea8d282a69475c0ff85aa +size 14583 diff --git a/adapters/saved_llamaHC3_human/checkpoint-145/rng_state_5.pth b/adapters/saved_llamaHC3_human/checkpoint-145/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..5b1d9524f9e6d0e65ab4024552c61f2704c3a7cb --- /dev/null +++ b/adapters/saved_llamaHC3_human/checkpoint-145/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:13e15de01f220605e2edc810bb6a963c56ae3ba773fbd3eba8a293f6fc78745e +size 14583 diff --git a/adapters/saved_llamaHC3_human/checkpoint-145/rng_state_6.pth b/adapters/saved_llamaHC3_human/checkpoint-145/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..fa2586ec3a342bbcf191d84b5a8c9d3b5217481e --- /dev/null +++ b/adapters/saved_llamaHC3_human/checkpoint-145/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b45d21a5005c1289954fe2369f63fd0366d64b8b0ee2d0110ef0d0721780fb37 +size 14583 diff --git a/adapters/saved_llamaHC3_human/checkpoint-145/rng_state_7.pth b/adapters/saved_llamaHC3_human/checkpoint-145/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..3bf60b1e68b0e8c3f1b44f8c47c10f3d0c7d0112 --- /dev/null +++ b/adapters/saved_llamaHC3_human/checkpoint-145/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c9c5ddb9ed818f18217c9437563399ff7a4a90c1caf451b2e5e19f844e740db5 +size 14583 diff --git a/adapters/saved_llamaHC3_human/checkpoint-145/scaler.pt b/adapters/saved_llamaHC3_human/checkpoint-145/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..e901f12918e6fe597ffb9dfcbe805e0d6740c55d --- /dev/null +++ b/adapters/saved_llamaHC3_human/checkpoint-145/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:79c6ecc519f5c11b9917467991601fd897422716ee349e71a3604559e571b0ce +size 557 diff --git a/adapters/saved_llamaHC3_human/checkpoint-145/scheduler.pt b/adapters/saved_llamaHC3_human/checkpoint-145/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..e0dd629b2fa011b6a74f2178cb4ca0e59e540ad5 --- /dev/null +++ b/adapters/saved_llamaHC3_human/checkpoint-145/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fcb4490d9b352e73c69a27722eef051e8f2abada110240ffe554828531f90478 +size 627 diff --git a/adapters/saved_llamaHC3_human/checkpoint-145/trainer_state.json b/adapters/saved_llamaHC3_human/checkpoint-145/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..2b774107c71e6b7997422589677446224b0f21b2 --- /dev/null +++ b/adapters/saved_llamaHC3_human/checkpoint-145/trainer_state.json @@ -0,0 +1,290 @@ +{ + "best_metric": 2.124401807785034, + "best_model_checkpoint": "/mnt/bn/qingyi-bn-lq/llama/saved_llamaHC3_human/checkpoint-145", + "epoch": 2.6244343891402715, + "global_step": 145, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.09, + "eval_loss": 2.3469691276550293, + "eval_runtime": 40.7893, + "eval_samples_per_second": 49.032, + "eval_steps_per_second": 0.785, + "step": 5 + }, + { + "epoch": 0.18, + "eval_loss": 2.254398822784424, + "eval_runtime": 40.7321, + "eval_samples_per_second": 49.101, + "eval_steps_per_second": 0.786, + "step": 10 + }, + { + "epoch": 0.27, + "eval_loss": 2.216655731201172, + "eval_runtime": 40.7948, + "eval_samples_per_second": 49.026, + "eval_steps_per_second": 0.784, + "step": 15 + }, + { + "epoch": 0.36, + "learning_rate": 0.000271875, + "loss": 2.265, + "step": 20 + }, + { + "epoch": 0.36, + "eval_loss": 2.192472219467163, + "eval_runtime": 40.9073, + "eval_samples_per_second": 48.891, + "eval_steps_per_second": 0.782, + "step": 20 + }, + { + "epoch": 0.45, + "eval_loss": 2.1772661209106445, + "eval_runtime": 40.8472, + "eval_samples_per_second": 48.963, + "eval_steps_per_second": 0.783, + "step": 25 + }, + { + "epoch": 0.54, + "eval_loss": 2.1680707931518555, + "eval_runtime": 40.7376, + "eval_samples_per_second": 49.095, + "eval_steps_per_second": 0.786, + "step": 30 + }, + { + "epoch": 0.63, + "eval_loss": 2.15969181060791, + "eval_runtime": 40.7043, + "eval_samples_per_second": 49.135, + "eval_steps_per_second": 0.786, + "step": 35 + }, + { + "epoch": 0.72, + "learning_rate": 0.00023624999999999997, + "loss": 2.1528, + "step": 40 + }, + { + "epoch": 0.72, + "eval_loss": 2.1535723209381104, + "eval_runtime": 40.7472, + "eval_samples_per_second": 49.083, + "eval_steps_per_second": 0.785, + "step": 40 + }, + { + "epoch": 0.81, + "eval_loss": 2.149122476577759, + "eval_runtime": 40.7594, + "eval_samples_per_second": 49.068, + "eval_steps_per_second": 0.785, + "step": 45 + }, + { + "epoch": 0.9, + "eval_loss": 2.1453487873077393, + "eval_runtime": 40.7319, + "eval_samples_per_second": 49.102, + "eval_steps_per_second": 0.786, + "step": 50 + }, + { + "epoch": 1.0, + "eval_loss": 2.142251968383789, + "eval_runtime": 40.8482, + "eval_samples_per_second": 48.962, + "eval_steps_per_second": 0.783, + "step": 55 + }, + { + "epoch": 1.09, + "learning_rate": 0.00019874999999999998, + "loss": 2.1332, + "step": 60 + }, + { + "epoch": 1.09, + "eval_loss": 2.139620065689087, + "eval_runtime": 40.7619, + "eval_samples_per_second": 49.065, + "eval_steps_per_second": 0.785, + "step": 60 + }, + { + "epoch": 1.18, + "eval_loss": 2.137423276901245, + "eval_runtime": 40.783, + "eval_samples_per_second": 49.04, + "eval_steps_per_second": 0.785, + "step": 65 + }, + { + "epoch": 1.27, + "eval_loss": 2.135718822479248, + "eval_runtime": 40.7802, + "eval_samples_per_second": 49.043, + "eval_steps_per_second": 0.785, + "step": 70 + }, + { + "epoch": 1.36, + "eval_loss": 2.134214401245117, + "eval_runtime": 40.7213, + "eval_samples_per_second": 49.114, + "eval_steps_per_second": 0.786, + "step": 75 + }, + { + "epoch": 1.45, + "learning_rate": 0.00016125, + "loss": 2.1152, + "step": 80 + }, + { + "epoch": 1.45, + "eval_loss": 2.1330068111419678, + "eval_runtime": 40.8447, + "eval_samples_per_second": 48.966, + "eval_steps_per_second": 0.783, + "step": 80 + }, + { + "epoch": 1.54, + "eval_loss": 2.131664276123047, + "eval_runtime": 40.9633, + "eval_samples_per_second": 48.824, + "eval_steps_per_second": 0.781, + "step": 85 + }, + { + "epoch": 1.63, + "eval_loss": 2.1305532455444336, + "eval_runtime": 40.7803, + "eval_samples_per_second": 49.043, + "eval_steps_per_second": 0.785, + "step": 90 + }, + { + "epoch": 1.72, + "eval_loss": 2.1295671463012695, + "eval_runtime": 40.8088, + "eval_samples_per_second": 49.009, + "eval_steps_per_second": 0.784, + "step": 95 + }, + { + "epoch": 1.81, + "learning_rate": 0.00012374999999999997, + "loss": 2.1138, + "step": 100 + }, + { + "epoch": 1.81, + "eval_loss": 2.1289596557617188, + "eval_runtime": 40.7573, + "eval_samples_per_second": 49.071, + "eval_steps_per_second": 0.785, + "step": 100 + }, + { + "epoch": 1.9, + "eval_loss": 2.1280689239501953, + "eval_runtime": 40.8322, + "eval_samples_per_second": 48.981, + "eval_steps_per_second": 0.784, + "step": 105 + }, + { + "epoch": 1.99, + "eval_loss": 2.127443552017212, + "eval_runtime": 40.9036, + "eval_samples_per_second": 48.895, + "eval_steps_per_second": 0.782, + "step": 110 + }, + { + "epoch": 2.08, + "eval_loss": 2.126908779144287, + "eval_runtime": 41.0214, + "eval_samples_per_second": 48.755, + "eval_steps_per_second": 0.78, + "step": 115 + }, + { + "epoch": 2.17, + "learning_rate": 8.624999999999998e-05, + "loss": 2.1101, + "step": 120 + }, + { + "epoch": 2.17, + "eval_loss": 2.126199245452881, + "eval_runtime": 40.9707, + "eval_samples_per_second": 48.815, + "eval_steps_per_second": 0.781, + "step": 120 + }, + { + "epoch": 2.26, + "eval_loss": 2.125767230987549, + "eval_runtime": 40.7834, + "eval_samples_per_second": 49.04, + "eval_steps_per_second": 0.785, + "step": 125 + }, + { + "epoch": 2.35, + "eval_loss": 2.125452995300293, + "eval_runtime": 40.8009, + "eval_samples_per_second": 49.019, + "eval_steps_per_second": 0.784, + "step": 130 + }, + { + "epoch": 2.44, + "eval_loss": 2.1250967979431152, + "eval_runtime": 40.7689, + "eval_samples_per_second": 49.057, + "eval_steps_per_second": 0.785, + "step": 135 + }, + { + "epoch": 2.53, + "learning_rate": 4.875e-05, + "loss": 2.108, + "step": 140 + }, + { + "epoch": 2.53, + "eval_loss": 2.124704599380493, + "eval_runtime": 40.7682, + "eval_samples_per_second": 49.058, + "eval_steps_per_second": 0.785, + "step": 140 + }, + { + "epoch": 2.62, + "eval_loss": 2.124401807785034, + "eval_runtime": 40.7695, + "eval_samples_per_second": 49.056, + "eval_steps_per_second": 0.785, + "step": 145 + } + ], + "max_steps": 165, + "num_train_epochs": 3, + "total_flos": 2.1310292331520328e+18, + "trial_name": null, + "trial_params": null +} diff --git a/adapters/saved_llamaHC3_human/checkpoint-145/training_args.bin b/adapters/saved_llamaHC3_human/checkpoint-145/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..08d435511e54e49bd3b0e2c715e12420f287521d --- /dev/null +++ b/adapters/saved_llamaHC3_human/checkpoint-145/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:004b0b2ef592ca03dc7186e3339fabafe55905860b1dd3e8c499f5470388c62b +size 3643 diff --git a/adapters/saved_llamaHC3_human/checkpoint-150/optimizer.pt b/adapters/saved_llamaHC3_human/checkpoint-150/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..6ad04009da531cac0aaa950550b2c2def77a72ff --- /dev/null +++ b/adapters/saved_llamaHC3_human/checkpoint-150/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:58df5962f7e74fe385d1c08581f97f45228c92b041dde6680170d9fef221958a +size 33629765 diff --git a/adapters/saved_llamaHC3_human/checkpoint-150/pytorch_model.bin b/adapters/saved_llamaHC3_human/checkpoint-150/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..ef159f27b87a5de8f573a27ea83d9b70601d08a4 --- /dev/null +++ b/adapters/saved_llamaHC3_human/checkpoint-150/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ec75ca0f37fb99fcdb377b2fd7d0213e28cf928e7425627e112dbd0e4b682036 +size 16822989 diff --git a/adapters/saved_llamaHC3_human/checkpoint-150/rng_state_0.pth b/adapters/saved_llamaHC3_human/checkpoint-150/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..270c282166259dc4e05f3948701a46b73b5abf74 --- /dev/null +++ b/adapters/saved_llamaHC3_human/checkpoint-150/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:537915a708933ea92e8e0c1512d61932c18713e489e482365c5d64e02b81e9c4 +size 14583 diff --git a/adapters/saved_llamaHC3_human/checkpoint-150/rng_state_1.pth b/adapters/saved_llamaHC3_human/checkpoint-150/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..c375421873dc528537eb36db984e655c09c03de7 --- /dev/null +++ b/adapters/saved_llamaHC3_human/checkpoint-150/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:83a6cfb6689106199f6dbee1e7e4dbd14a223b7149fc0cc4864ada13663ff73c +size 14583 diff --git a/adapters/saved_llamaHC3_human/checkpoint-150/rng_state_2.pth b/adapters/saved_llamaHC3_human/checkpoint-150/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..3dc99d94c5fa9783e2bb0b4ecf4cdf9f2a7f9b54 --- /dev/null +++ b/adapters/saved_llamaHC3_human/checkpoint-150/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0676071acf8adf17a4f005c084c505a87b62d7384391993bb6914d6a185d2e50 +size 14583 diff --git a/adapters/saved_llamaHC3_human/checkpoint-150/rng_state_3.pth b/adapters/saved_llamaHC3_human/checkpoint-150/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..65040409742abcd569def4fde6c8a2baf8ce0fbe --- /dev/null +++ b/adapters/saved_llamaHC3_human/checkpoint-150/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9a0f95613e21e8ed1fa4c00965b1e341be9487a7ac67eed599423dc3f04b2839 +size 14583 diff --git a/adapters/saved_llamaHC3_human/checkpoint-150/rng_state_4.pth b/adapters/saved_llamaHC3_human/checkpoint-150/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..90b9c3acb9895ba53eb72e885b586cd7aa5b59d4 --- /dev/null +++ b/adapters/saved_llamaHC3_human/checkpoint-150/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e50334097dd1ded543223bba0bedb5c8aff8c1e1fea242aebcbaa355e6375482 +size 14583 diff --git a/adapters/saved_llamaHC3_human/checkpoint-150/rng_state_5.pth b/adapters/saved_llamaHC3_human/checkpoint-150/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..7594b3caf20b4eeef5dc20b7a0e8c5e91e33d21f --- /dev/null +++ b/adapters/saved_llamaHC3_human/checkpoint-150/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:16a10786178f9eb9c96b1cbf1290638f3ff6d3ae6ade079cb6184254fc48b4d4 +size 14583 diff --git a/adapters/saved_llamaHC3_human/checkpoint-150/rng_state_6.pth b/adapters/saved_llamaHC3_human/checkpoint-150/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..25cef97987b47227e7f4bdef40cac47dfef4d64f --- /dev/null +++ b/adapters/saved_llamaHC3_human/checkpoint-150/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:543e59d0b271785480611ad06ee50b693f1eafd191afc9808ddbb1195be24134 +size 14583 diff --git a/adapters/saved_llamaHC3_human/checkpoint-150/rng_state_7.pth b/adapters/saved_llamaHC3_human/checkpoint-150/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..6ec199ae4e2573deb364e3c47aa1eb1a399dfd78 --- /dev/null +++ b/adapters/saved_llamaHC3_human/checkpoint-150/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0e93807c7426d92ca7d8a1ba387d69f93c4a8f880b67d2ac2b0f480e7c6eccb1 +size 14583 diff --git a/adapters/saved_llamaHC3_human/checkpoint-150/scaler.pt b/adapters/saved_llamaHC3_human/checkpoint-150/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..8ba176eacb88ff7b0fc92faafdea5989cd1ea4d3 --- /dev/null +++ b/adapters/saved_llamaHC3_human/checkpoint-150/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:857c860a90747303e7bd9bdd09128c1f2428f9f884e675fc1b947c750fc26306 +size 557 diff --git a/adapters/saved_llamaHC3_human/checkpoint-150/scheduler.pt b/adapters/saved_llamaHC3_human/checkpoint-150/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..705f83b2b8367512fc7d3d817d9e2d05f48b4ff3 --- /dev/null +++ b/adapters/saved_llamaHC3_human/checkpoint-150/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:44165d72c935f395e8693dfa80bf1d7a6af1d2aceb7d416abea8b21e11950c7e +size 627 diff --git a/adapters/saved_llamaHC3_human/checkpoint-150/trainer_state.json b/adapters/saved_llamaHC3_human/checkpoint-150/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..cb76ff8efc2dd798bdecf03c4d2007432d137134 --- /dev/null +++ b/adapters/saved_llamaHC3_human/checkpoint-150/trainer_state.json @@ -0,0 +1,298 @@ +{ + "best_metric": 2.1242740154266357, + "best_model_checkpoint": "/mnt/bn/qingyi-bn-lq/llama/saved_llamaHC3_human/checkpoint-150", + "epoch": 2.7149321266968327, + "global_step": 150, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.09, + "eval_loss": 2.3469691276550293, + "eval_runtime": 40.7893, + "eval_samples_per_second": 49.032, + "eval_steps_per_second": 0.785, + "step": 5 + }, + { + "epoch": 0.18, + "eval_loss": 2.254398822784424, + "eval_runtime": 40.7321, + "eval_samples_per_second": 49.101, + "eval_steps_per_second": 0.786, + "step": 10 + }, + { + "epoch": 0.27, + "eval_loss": 2.216655731201172, + "eval_runtime": 40.7948, + "eval_samples_per_second": 49.026, + "eval_steps_per_second": 0.784, + "step": 15 + }, + { + "epoch": 0.36, + "learning_rate": 0.000271875, + "loss": 2.265, + "step": 20 + }, + { + "epoch": 0.36, + "eval_loss": 2.192472219467163, + "eval_runtime": 40.9073, + "eval_samples_per_second": 48.891, + "eval_steps_per_second": 0.782, + "step": 20 + }, + { + "epoch": 0.45, + "eval_loss": 2.1772661209106445, + "eval_runtime": 40.8472, + "eval_samples_per_second": 48.963, + "eval_steps_per_second": 0.783, + "step": 25 + }, + { + "epoch": 0.54, + "eval_loss": 2.1680707931518555, + "eval_runtime": 40.7376, + "eval_samples_per_second": 49.095, + "eval_steps_per_second": 0.786, + "step": 30 + }, + { + "epoch": 0.63, + "eval_loss": 2.15969181060791, + "eval_runtime": 40.7043, + "eval_samples_per_second": 49.135, + "eval_steps_per_second": 0.786, + "step": 35 + }, + { + "epoch": 0.72, + "learning_rate": 0.00023624999999999997, + "loss": 2.1528, + "step": 40 + }, + { + "epoch": 0.72, + "eval_loss": 2.1535723209381104, + "eval_runtime": 40.7472, + "eval_samples_per_second": 49.083, + "eval_steps_per_second": 0.785, + "step": 40 + }, + { + "epoch": 0.81, + "eval_loss": 2.149122476577759, + "eval_runtime": 40.7594, + "eval_samples_per_second": 49.068, + "eval_steps_per_second": 0.785, + "step": 45 + }, + { + "epoch": 0.9, + "eval_loss": 2.1453487873077393, + "eval_runtime": 40.7319, + "eval_samples_per_second": 49.102, + "eval_steps_per_second": 0.786, + "step": 50 + }, + { + "epoch": 1.0, + "eval_loss": 2.142251968383789, + "eval_runtime": 40.8482, + "eval_samples_per_second": 48.962, + "eval_steps_per_second": 0.783, + "step": 55 + }, + { + "epoch": 1.09, + "learning_rate": 0.00019874999999999998, + "loss": 2.1332, + "step": 60 + }, + { + "epoch": 1.09, + "eval_loss": 2.139620065689087, + "eval_runtime": 40.7619, + "eval_samples_per_second": 49.065, + "eval_steps_per_second": 0.785, + "step": 60 + }, + { + "epoch": 1.18, + "eval_loss": 2.137423276901245, + "eval_runtime": 40.783, + "eval_samples_per_second": 49.04, + "eval_steps_per_second": 0.785, + "step": 65 + }, + { + "epoch": 1.27, + "eval_loss": 2.135718822479248, + "eval_runtime": 40.7802, + "eval_samples_per_second": 49.043, + "eval_steps_per_second": 0.785, + "step": 70 + }, + { + "epoch": 1.36, + "eval_loss": 2.134214401245117, + "eval_runtime": 40.7213, + "eval_samples_per_second": 49.114, + "eval_steps_per_second": 0.786, + "step": 75 + }, + { + "epoch": 1.45, + "learning_rate": 0.00016125, + "loss": 2.1152, + "step": 80 + }, + { + "epoch": 1.45, + "eval_loss": 2.1330068111419678, + "eval_runtime": 40.8447, + "eval_samples_per_second": 48.966, + "eval_steps_per_second": 0.783, + "step": 80 + }, + { + "epoch": 1.54, + "eval_loss": 2.131664276123047, + "eval_runtime": 40.9633, + "eval_samples_per_second": 48.824, + "eval_steps_per_second": 0.781, + "step": 85 + }, + { + "epoch": 1.63, + "eval_loss": 2.1305532455444336, + "eval_runtime": 40.7803, + "eval_samples_per_second": 49.043, + "eval_steps_per_second": 0.785, + "step": 90 + }, + { + "epoch": 1.72, + "eval_loss": 2.1295671463012695, + "eval_runtime": 40.8088, + "eval_samples_per_second": 49.009, + "eval_steps_per_second": 0.784, + "step": 95 + }, + { + "epoch": 1.81, + "learning_rate": 0.00012374999999999997, + "loss": 2.1138, + "step": 100 + }, + { + "epoch": 1.81, + "eval_loss": 2.1289596557617188, + "eval_runtime": 40.7573, + "eval_samples_per_second": 49.071, + "eval_steps_per_second": 0.785, + "step": 100 + }, + { + "epoch": 1.9, + "eval_loss": 2.1280689239501953, + "eval_runtime": 40.8322, + "eval_samples_per_second": 48.981, + "eval_steps_per_second": 0.784, + "step": 105 + }, + { + "epoch": 1.99, + "eval_loss": 2.127443552017212, + "eval_runtime": 40.9036, + "eval_samples_per_second": 48.895, + "eval_steps_per_second": 0.782, + "step": 110 + }, + { + "epoch": 2.08, + "eval_loss": 2.126908779144287, + "eval_runtime": 41.0214, + "eval_samples_per_second": 48.755, + "eval_steps_per_second": 0.78, + "step": 115 + }, + { + "epoch": 2.17, + "learning_rate": 8.624999999999998e-05, + "loss": 2.1101, + "step": 120 + }, + { + "epoch": 2.17, + "eval_loss": 2.126199245452881, + "eval_runtime": 40.9707, + "eval_samples_per_second": 48.815, + "eval_steps_per_second": 0.781, + "step": 120 + }, + { + "epoch": 2.26, + "eval_loss": 2.125767230987549, + "eval_runtime": 40.7834, + "eval_samples_per_second": 49.04, + "eval_steps_per_second": 0.785, + "step": 125 + }, + { + "epoch": 2.35, + "eval_loss": 2.125452995300293, + "eval_runtime": 40.8009, + "eval_samples_per_second": 49.019, + "eval_steps_per_second": 0.784, + "step": 130 + }, + { + "epoch": 2.44, + "eval_loss": 2.1250967979431152, + "eval_runtime": 40.7689, + "eval_samples_per_second": 49.057, + "eval_steps_per_second": 0.785, + "step": 135 + }, + { + "epoch": 2.53, + "learning_rate": 4.875e-05, + "loss": 2.108, + "step": 140 + }, + { + "epoch": 2.53, + "eval_loss": 2.124704599380493, + "eval_runtime": 40.7682, + "eval_samples_per_second": 49.058, + "eval_steps_per_second": 0.785, + "step": 140 + }, + { + "epoch": 2.62, + "eval_loss": 2.124401807785034, + "eval_runtime": 40.7695, + "eval_samples_per_second": 49.056, + "eval_steps_per_second": 0.785, + "step": 145 + }, + { + "epoch": 2.71, + "eval_loss": 2.1242740154266357, + "eval_runtime": 40.8982, + "eval_samples_per_second": 48.902, + "eval_steps_per_second": 0.782, + "step": 150 + } + ], + "max_steps": 165, + "num_train_epochs": 3, + "total_flos": 2.2050761357823836e+18, + "trial_name": null, + "trial_params": null +} diff --git a/adapters/saved_llamaHC3_human/checkpoint-150/training_args.bin b/adapters/saved_llamaHC3_human/checkpoint-150/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..08d435511e54e49bd3b0e2c715e12420f287521d --- /dev/null +++ b/adapters/saved_llamaHC3_human/checkpoint-150/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:004b0b2ef592ca03dc7186e3339fabafe55905860b1dd3e8c499f5470388c62b +size 3643 diff --git a/adapters/saved_llamaHC3_human/checkpoint-155/optimizer.pt b/adapters/saved_llamaHC3_human/checkpoint-155/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..5c98af5faf0b4582ff86a5e940cda401a2608f2c --- /dev/null +++ b/adapters/saved_llamaHC3_human/checkpoint-155/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:26d31cec7d10a8b935d835e899f63ec84e05d5be393f703823cc81c598118e9c +size 33629765 diff --git a/adapters/saved_llamaHC3_human/checkpoint-155/pytorch_model.bin b/adapters/saved_llamaHC3_human/checkpoint-155/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..29a61769f64ce4e6923aaa73b2becc9c82464d98 --- /dev/null +++ b/adapters/saved_llamaHC3_human/checkpoint-155/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d6ca741e049c3882e907ef0bee3201fbc751ed60633c277da333ad3c6bba941e +size 16822989 diff --git a/adapters/saved_llamaHC3_human/checkpoint-155/rng_state_0.pth b/adapters/saved_llamaHC3_human/checkpoint-155/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..d8bbb33b7701fcdce0eeb695924e7fe0de1dda09 --- /dev/null +++ b/adapters/saved_llamaHC3_human/checkpoint-155/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:23fc307fd1569b7840e2dd63dd82d2c6416f0ce867a2e42c325ac232f6166b97 +size 14583 diff --git a/adapters/saved_llamaHC3_human/checkpoint-155/rng_state_1.pth b/adapters/saved_llamaHC3_human/checkpoint-155/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..002f4500b530fd1f8d70d91c1da3e7b405802c55 --- /dev/null +++ b/adapters/saved_llamaHC3_human/checkpoint-155/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1ed951d9e239de25ba04512a4b53a1da6d41e3b92a428cb1400316617f6af4e1 +size 14583 diff --git a/adapters/saved_llamaHC3_human/checkpoint-155/rng_state_2.pth b/adapters/saved_llamaHC3_human/checkpoint-155/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..cd426667f74f2275abd8dc7d35353b0f7cc1e5d0 --- /dev/null +++ b/adapters/saved_llamaHC3_human/checkpoint-155/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5277b21215394188f8214905207778d29c935b310fb1333f46c2dd586ea4e34d +size 14583 diff --git a/adapters/saved_llamaHC3_human/checkpoint-155/rng_state_3.pth b/adapters/saved_llamaHC3_human/checkpoint-155/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..62d179e215844eed866dda82fae008bb29fd6ce9 --- /dev/null +++ b/adapters/saved_llamaHC3_human/checkpoint-155/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:80f6378381b58dd953e6e82dbc6f67c2c0e7fa60597f5fcdc2339cd93b026b73 +size 14583 diff --git a/adapters/saved_llamaHC3_human/checkpoint-155/rng_state_4.pth b/adapters/saved_llamaHC3_human/checkpoint-155/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..8afa23c4f238eb7086e65a2efd351e3a70921a0d --- /dev/null +++ b/adapters/saved_llamaHC3_human/checkpoint-155/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:329bfabd7e137dc083a2ddac831e77dc86a926edc1110662e6d57ca2e3f2f320 +size 14583 diff --git a/adapters/saved_llamaHC3_human/checkpoint-155/rng_state_5.pth b/adapters/saved_llamaHC3_human/checkpoint-155/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..2450945b6014511d4f93471e1d5a4bbaaab9398f --- /dev/null +++ b/adapters/saved_llamaHC3_human/checkpoint-155/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b2cd1eb55c3a25cfc0c4645195105211b6fb4fc97229720d908f7d3461326364 +size 14583 diff --git a/adapters/saved_llamaHC3_human/checkpoint-155/rng_state_6.pth b/adapters/saved_llamaHC3_human/checkpoint-155/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..11531df2a83f26b21bce67235d225d47f6d89b75 --- /dev/null +++ b/adapters/saved_llamaHC3_human/checkpoint-155/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:97f1697bc63cf7056dfcee5958111e071e44c8a1aaa02d12a608859568798087 +size 14583 diff --git a/adapters/saved_llamaHC3_human/checkpoint-155/rng_state_7.pth b/adapters/saved_llamaHC3_human/checkpoint-155/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..715d11cf37700c2e44da55159fe33075575a910b --- /dev/null +++ b/adapters/saved_llamaHC3_human/checkpoint-155/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b8dfb76cb40ef62454898cf8b7e05892a482a50c034ecc3380dccd713d597e42 +size 14583 diff --git a/adapters/saved_llamaHC3_human/checkpoint-155/scaler.pt b/adapters/saved_llamaHC3_human/checkpoint-155/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..c0fc57b42d381c4c1408cfd573227e14b71908b2 --- /dev/null +++ b/adapters/saved_llamaHC3_human/checkpoint-155/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5e74d86497a5c3f81d2db12f1a59280923c00667bac1c62a48ced089007abec9 +size 557 diff --git a/adapters/saved_llamaHC3_human/checkpoint-155/scheduler.pt b/adapters/saved_llamaHC3_human/checkpoint-155/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..0ca6d747f73325f880194bad5201801dcb4e82cf --- /dev/null +++ b/adapters/saved_llamaHC3_human/checkpoint-155/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:585c1104e52aa36eefef469ae1c76a9622fe7ec7f05dfa7d9d7fea19d31d0664 +size 627 diff --git a/adapters/saved_llamaHC3_human/checkpoint-155/trainer_state.json b/adapters/saved_llamaHC3_human/checkpoint-155/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..a8db82a238ee176fc2d2bfc2e32898fa208dd851 --- /dev/null +++ b/adapters/saved_llamaHC3_human/checkpoint-155/trainer_state.json @@ -0,0 +1,306 @@ +{ + "best_metric": 2.124105930328369, + "best_model_checkpoint": "/mnt/bn/qingyi-bn-lq/llama/saved_llamaHC3_human/checkpoint-155", + "epoch": 2.8054298642533935, + "global_step": 155, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.09, + "eval_loss": 2.3469691276550293, + "eval_runtime": 40.7893, + "eval_samples_per_second": 49.032, + "eval_steps_per_second": 0.785, + "step": 5 + }, + { + "epoch": 0.18, + "eval_loss": 2.254398822784424, + "eval_runtime": 40.7321, + "eval_samples_per_second": 49.101, + "eval_steps_per_second": 0.786, + "step": 10 + }, + { + "epoch": 0.27, + "eval_loss": 2.216655731201172, + "eval_runtime": 40.7948, + "eval_samples_per_second": 49.026, + "eval_steps_per_second": 0.784, + "step": 15 + }, + { + "epoch": 0.36, + "learning_rate": 0.000271875, + "loss": 2.265, + "step": 20 + }, + { + "epoch": 0.36, + "eval_loss": 2.192472219467163, + "eval_runtime": 40.9073, + "eval_samples_per_second": 48.891, + "eval_steps_per_second": 0.782, + "step": 20 + }, + { + "epoch": 0.45, + "eval_loss": 2.1772661209106445, + "eval_runtime": 40.8472, + "eval_samples_per_second": 48.963, + "eval_steps_per_second": 0.783, + "step": 25 + }, + { + "epoch": 0.54, + "eval_loss": 2.1680707931518555, + "eval_runtime": 40.7376, + "eval_samples_per_second": 49.095, + "eval_steps_per_second": 0.786, + "step": 30 + }, + { + "epoch": 0.63, + "eval_loss": 2.15969181060791, + "eval_runtime": 40.7043, + "eval_samples_per_second": 49.135, + "eval_steps_per_second": 0.786, + "step": 35 + }, + { + "epoch": 0.72, + "learning_rate": 0.00023624999999999997, + "loss": 2.1528, + "step": 40 + }, + { + "epoch": 0.72, + "eval_loss": 2.1535723209381104, + "eval_runtime": 40.7472, + "eval_samples_per_second": 49.083, + "eval_steps_per_second": 0.785, + "step": 40 + }, + { + "epoch": 0.81, + "eval_loss": 2.149122476577759, + "eval_runtime": 40.7594, + "eval_samples_per_second": 49.068, + "eval_steps_per_second": 0.785, + "step": 45 + }, + { + "epoch": 0.9, + "eval_loss": 2.1453487873077393, + "eval_runtime": 40.7319, + "eval_samples_per_second": 49.102, + "eval_steps_per_second": 0.786, + "step": 50 + }, + { + "epoch": 1.0, + "eval_loss": 2.142251968383789, + "eval_runtime": 40.8482, + "eval_samples_per_second": 48.962, + "eval_steps_per_second": 0.783, + "step": 55 + }, + { + "epoch": 1.09, + "learning_rate": 0.00019874999999999998, + "loss": 2.1332, + "step": 60 + }, + { + "epoch": 1.09, + "eval_loss": 2.139620065689087, + "eval_runtime": 40.7619, + "eval_samples_per_second": 49.065, + "eval_steps_per_second": 0.785, + "step": 60 + }, + { + "epoch": 1.18, + "eval_loss": 2.137423276901245, + "eval_runtime": 40.783, + "eval_samples_per_second": 49.04, + "eval_steps_per_second": 0.785, + "step": 65 + }, + { + "epoch": 1.27, + "eval_loss": 2.135718822479248, + "eval_runtime": 40.7802, + "eval_samples_per_second": 49.043, + "eval_steps_per_second": 0.785, + "step": 70 + }, + { + "epoch": 1.36, + "eval_loss": 2.134214401245117, + "eval_runtime": 40.7213, + "eval_samples_per_second": 49.114, + "eval_steps_per_second": 0.786, + "step": 75 + }, + { + "epoch": 1.45, + "learning_rate": 0.00016125, + "loss": 2.1152, + "step": 80 + }, + { + "epoch": 1.45, + "eval_loss": 2.1330068111419678, + "eval_runtime": 40.8447, + "eval_samples_per_second": 48.966, + "eval_steps_per_second": 0.783, + "step": 80 + }, + { + "epoch": 1.54, + "eval_loss": 2.131664276123047, + "eval_runtime": 40.9633, + "eval_samples_per_second": 48.824, + "eval_steps_per_second": 0.781, + "step": 85 + }, + { + "epoch": 1.63, + "eval_loss": 2.1305532455444336, + "eval_runtime": 40.7803, + "eval_samples_per_second": 49.043, + "eval_steps_per_second": 0.785, + "step": 90 + }, + { + "epoch": 1.72, + "eval_loss": 2.1295671463012695, + "eval_runtime": 40.8088, + "eval_samples_per_second": 49.009, + "eval_steps_per_second": 0.784, + "step": 95 + }, + { + "epoch": 1.81, + "learning_rate": 0.00012374999999999997, + "loss": 2.1138, + "step": 100 + }, + { + "epoch": 1.81, + "eval_loss": 2.1289596557617188, + "eval_runtime": 40.7573, + "eval_samples_per_second": 49.071, + "eval_steps_per_second": 0.785, + "step": 100 + }, + { + "epoch": 1.9, + "eval_loss": 2.1280689239501953, + "eval_runtime": 40.8322, + "eval_samples_per_second": 48.981, + "eval_steps_per_second": 0.784, + "step": 105 + }, + { + "epoch": 1.99, + "eval_loss": 2.127443552017212, + "eval_runtime": 40.9036, + "eval_samples_per_second": 48.895, + "eval_steps_per_second": 0.782, + "step": 110 + }, + { + "epoch": 2.08, + "eval_loss": 2.126908779144287, + "eval_runtime": 41.0214, + "eval_samples_per_second": 48.755, + "eval_steps_per_second": 0.78, + "step": 115 + }, + { + "epoch": 2.17, + "learning_rate": 8.624999999999998e-05, + "loss": 2.1101, + "step": 120 + }, + { + "epoch": 2.17, + "eval_loss": 2.126199245452881, + "eval_runtime": 40.9707, + "eval_samples_per_second": 48.815, + "eval_steps_per_second": 0.781, + "step": 120 + }, + { + "epoch": 2.26, + "eval_loss": 2.125767230987549, + "eval_runtime": 40.7834, + "eval_samples_per_second": 49.04, + "eval_steps_per_second": 0.785, + "step": 125 + }, + { + "epoch": 2.35, + "eval_loss": 2.125452995300293, + "eval_runtime": 40.8009, + "eval_samples_per_second": 49.019, + "eval_steps_per_second": 0.784, + "step": 130 + }, + { + "epoch": 2.44, + "eval_loss": 2.1250967979431152, + "eval_runtime": 40.7689, + "eval_samples_per_second": 49.057, + "eval_steps_per_second": 0.785, + "step": 135 + }, + { + "epoch": 2.53, + "learning_rate": 4.875e-05, + "loss": 2.108, + "step": 140 + }, + { + "epoch": 2.53, + "eval_loss": 2.124704599380493, + "eval_runtime": 40.7682, + "eval_samples_per_second": 49.058, + "eval_steps_per_second": 0.785, + "step": 140 + }, + { + "epoch": 2.62, + "eval_loss": 2.124401807785034, + "eval_runtime": 40.7695, + "eval_samples_per_second": 49.056, + "eval_steps_per_second": 0.785, + "step": 145 + }, + { + "epoch": 2.71, + "eval_loss": 2.1242740154266357, + "eval_runtime": 40.8982, + "eval_samples_per_second": 48.902, + "eval_steps_per_second": 0.782, + "step": 150 + }, + { + "epoch": 2.81, + "eval_loss": 2.124105930328369, + "eval_runtime": 40.7925, + "eval_samples_per_second": 49.029, + "eval_steps_per_second": 0.784, + "step": 155 + } + ], + "max_steps": 165, + "num_train_epochs": 3, + "total_flos": 2.2800684294340608e+18, + "trial_name": null, + "trial_params": null +} diff --git a/adapters/saved_llamaHC3_human/checkpoint-155/training_args.bin b/adapters/saved_llamaHC3_human/checkpoint-155/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..08d435511e54e49bd3b0e2c715e12420f287521d --- /dev/null +++ b/adapters/saved_llamaHC3_human/checkpoint-155/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:004b0b2ef592ca03dc7186e3339fabafe55905860b1dd3e8c499f5470388c62b +size 3643 diff --git a/adapters/saved_llamaHC3_human/checkpoint-160/optimizer.pt b/adapters/saved_llamaHC3_human/checkpoint-160/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..5b7c44f0fea6c18d4d334b082dd1892694e71952 --- /dev/null +++ b/adapters/saved_llamaHC3_human/checkpoint-160/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:545769626267100c91bae7684c2c7cc15e62a68e09a56a638844a10f010de151 +size 33629765 diff --git a/adapters/saved_llamaHC3_human/checkpoint-160/pytorch_model.bin b/adapters/saved_llamaHC3_human/checkpoint-160/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..cceb904bb9480742e263a7e5528df71eef4b16f2 --- /dev/null +++ b/adapters/saved_llamaHC3_human/checkpoint-160/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e60e7a23353b9f63c7dd08161648c96f4f5902af9491723781834138710e66e6 +size 16822989 diff --git a/adapters/saved_llamaHC3_human/checkpoint-160/rng_state_0.pth b/adapters/saved_llamaHC3_human/checkpoint-160/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..5c2154cae8f94282deebfb772ebb8ebc17829f04 --- /dev/null +++ b/adapters/saved_llamaHC3_human/checkpoint-160/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:640aea51197012a8db7f8edd5d5042a1f5dac24e04f3b5368724d088457f3303 +size 14583 diff --git a/adapters/saved_llamaHC3_human/checkpoint-160/rng_state_1.pth b/adapters/saved_llamaHC3_human/checkpoint-160/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..dcb0ee7e1df32206ea902b4c6e24d802d6a8af79 --- /dev/null +++ b/adapters/saved_llamaHC3_human/checkpoint-160/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe2d6014f32775d4cc58950ac53daae7a43db33541b1c9371468389ec79b1a4b +size 14583 diff --git a/adapters/saved_llamaHC3_human/checkpoint-160/rng_state_2.pth b/adapters/saved_llamaHC3_human/checkpoint-160/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..c96eb919df94f58c2239f48c6912f7cf70f43b01 --- /dev/null +++ b/adapters/saved_llamaHC3_human/checkpoint-160/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8209edd22eae004f498b4fb3c528acc9f14d016f4cf32820370ba93769f4a06c +size 14583 diff --git a/adapters/saved_llamaHC3_human/checkpoint-160/rng_state_3.pth b/adapters/saved_llamaHC3_human/checkpoint-160/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..5f4eb0f0015aa28a19ee8552d4e96e9c55e106fa --- /dev/null +++ b/adapters/saved_llamaHC3_human/checkpoint-160/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3fc0e75155482da08faa856a87264f57073e871bcfa6f7dee46e71770c6f473a +size 14583 diff --git a/adapters/saved_llamaHC3_human/checkpoint-160/rng_state_4.pth b/adapters/saved_llamaHC3_human/checkpoint-160/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..0b125a83c3df42ee9cc2d31947ea51fb3a8158f9 --- /dev/null +++ b/adapters/saved_llamaHC3_human/checkpoint-160/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:58b2b4917d134bd1ef2c797e004c4b5ff8ae8c680b2d2c3698ca176a48d7667e +size 14583 diff --git a/adapters/saved_llamaHC3_human/checkpoint-160/rng_state_5.pth b/adapters/saved_llamaHC3_human/checkpoint-160/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..4ba10738c8ab80cd0bfea82fb706a54fdf66f7c4 --- /dev/null +++ b/adapters/saved_llamaHC3_human/checkpoint-160/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:af4f9bcb129c42bad4c0740209a764a5e1238f5cda73ac1f90c1cacdf952bc75 +size 14583 diff --git a/adapters/saved_llamaHC3_human/checkpoint-160/rng_state_6.pth b/adapters/saved_llamaHC3_human/checkpoint-160/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..f047a1fa249c277ed02c722407ccdf8f66f11c09 --- /dev/null +++ b/adapters/saved_llamaHC3_human/checkpoint-160/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dabe744f163ce2ba5371267ea33ce245dadb3a8796f64595fe9a8a3ab7a3249b +size 14583 diff --git a/adapters/saved_llamaHC3_human/checkpoint-160/rng_state_7.pth b/adapters/saved_llamaHC3_human/checkpoint-160/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..fb1752937242c5a38a45d7b9979737568b880830 --- /dev/null +++ b/adapters/saved_llamaHC3_human/checkpoint-160/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cb8fc4e5fc778c89e0508f141a51e2a20cbac9949a7a43d21bae92d788afcd4f +size 14583 diff --git a/adapters/saved_llamaHC3_human/checkpoint-160/scaler.pt b/adapters/saved_llamaHC3_human/checkpoint-160/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..161360c55fa24860eed84ea724c67eadea9d5743 --- /dev/null +++ b/adapters/saved_llamaHC3_human/checkpoint-160/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:42289e372f1018c300a1a9a2e013cd2572034427e7b64b793c31b8a21ee5627e +size 557 diff --git a/adapters/saved_llamaHC3_human/checkpoint-160/scheduler.pt b/adapters/saved_llamaHC3_human/checkpoint-160/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..0eed4af023eb67ef043196b9d9a803e5492af4e0 --- /dev/null +++ b/adapters/saved_llamaHC3_human/checkpoint-160/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4eaa6da97a4d5cb2830c5e342c4b1d3f1e08dac892aa790c75ee1b4fc3f5bc86 +size 627 diff --git a/adapters/saved_llamaHC3_human/checkpoint-160/trainer_state.json b/adapters/saved_llamaHC3_human/checkpoint-160/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..f46aa026fd4fedc10aa1a50f701024eac851685e --- /dev/null +++ b/adapters/saved_llamaHC3_human/checkpoint-160/trainer_state.json @@ -0,0 +1,320 @@ +{ + "best_metric": 2.123983860015869, + "best_model_checkpoint": "/mnt/bn/qingyi-bn-lq/llama/saved_llamaHC3_human/checkpoint-160", + "epoch": 2.8959276018099547, + "global_step": 160, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.09, + "eval_loss": 2.3469691276550293, + "eval_runtime": 40.7893, + "eval_samples_per_second": 49.032, + "eval_steps_per_second": 0.785, + "step": 5 + }, + { + "epoch": 0.18, + "eval_loss": 2.254398822784424, + "eval_runtime": 40.7321, + "eval_samples_per_second": 49.101, + "eval_steps_per_second": 0.786, + "step": 10 + }, + { + "epoch": 0.27, + "eval_loss": 2.216655731201172, + "eval_runtime": 40.7948, + "eval_samples_per_second": 49.026, + "eval_steps_per_second": 0.784, + "step": 15 + }, + { + "epoch": 0.36, + "learning_rate": 0.000271875, + "loss": 2.265, + "step": 20 + }, + { + "epoch": 0.36, + "eval_loss": 2.192472219467163, + "eval_runtime": 40.9073, + "eval_samples_per_second": 48.891, + "eval_steps_per_second": 0.782, + "step": 20 + }, + { + "epoch": 0.45, + "eval_loss": 2.1772661209106445, + "eval_runtime": 40.8472, + "eval_samples_per_second": 48.963, + "eval_steps_per_second": 0.783, + "step": 25 + }, + { + "epoch": 0.54, + "eval_loss": 2.1680707931518555, + "eval_runtime": 40.7376, + "eval_samples_per_second": 49.095, + "eval_steps_per_second": 0.786, + "step": 30 + }, + { + "epoch": 0.63, + "eval_loss": 2.15969181060791, + "eval_runtime": 40.7043, + "eval_samples_per_second": 49.135, + "eval_steps_per_second": 0.786, + "step": 35 + }, + { + "epoch": 0.72, + "learning_rate": 0.00023624999999999997, + "loss": 2.1528, + "step": 40 + }, + { + "epoch": 0.72, + "eval_loss": 2.1535723209381104, + "eval_runtime": 40.7472, + "eval_samples_per_second": 49.083, + "eval_steps_per_second": 0.785, + "step": 40 + }, + { + "epoch": 0.81, + "eval_loss": 2.149122476577759, + "eval_runtime": 40.7594, + "eval_samples_per_second": 49.068, + "eval_steps_per_second": 0.785, + "step": 45 + }, + { + "epoch": 0.9, + "eval_loss": 2.1453487873077393, + "eval_runtime": 40.7319, + "eval_samples_per_second": 49.102, + "eval_steps_per_second": 0.786, + "step": 50 + }, + { + "epoch": 1.0, + "eval_loss": 2.142251968383789, + "eval_runtime": 40.8482, + "eval_samples_per_second": 48.962, + "eval_steps_per_second": 0.783, + "step": 55 + }, + { + "epoch": 1.09, + "learning_rate": 0.00019874999999999998, + "loss": 2.1332, + "step": 60 + }, + { + "epoch": 1.09, + "eval_loss": 2.139620065689087, + "eval_runtime": 40.7619, + "eval_samples_per_second": 49.065, + "eval_steps_per_second": 0.785, + "step": 60 + }, + { + "epoch": 1.18, + "eval_loss": 2.137423276901245, + "eval_runtime": 40.783, + "eval_samples_per_second": 49.04, + "eval_steps_per_second": 0.785, + "step": 65 + }, + { + "epoch": 1.27, + "eval_loss": 2.135718822479248, + "eval_runtime": 40.7802, + "eval_samples_per_second": 49.043, + "eval_steps_per_second": 0.785, + "step": 70 + }, + { + "epoch": 1.36, + "eval_loss": 2.134214401245117, + "eval_runtime": 40.7213, + "eval_samples_per_second": 49.114, + "eval_steps_per_second": 0.786, + "step": 75 + }, + { + "epoch": 1.45, + "learning_rate": 0.00016125, + "loss": 2.1152, + "step": 80 + }, + { + "epoch": 1.45, + "eval_loss": 2.1330068111419678, + "eval_runtime": 40.8447, + "eval_samples_per_second": 48.966, + "eval_steps_per_second": 0.783, + "step": 80 + }, + { + "epoch": 1.54, + "eval_loss": 2.131664276123047, + "eval_runtime": 40.9633, + "eval_samples_per_second": 48.824, + "eval_steps_per_second": 0.781, + "step": 85 + }, + { + "epoch": 1.63, + "eval_loss": 2.1305532455444336, + "eval_runtime": 40.7803, + "eval_samples_per_second": 49.043, + "eval_steps_per_second": 0.785, + "step": 90 + }, + { + "epoch": 1.72, + "eval_loss": 2.1295671463012695, + "eval_runtime": 40.8088, + "eval_samples_per_second": 49.009, + "eval_steps_per_second": 0.784, + "step": 95 + }, + { + "epoch": 1.81, + "learning_rate": 0.00012374999999999997, + "loss": 2.1138, + "step": 100 + }, + { + "epoch": 1.81, + "eval_loss": 2.1289596557617188, + "eval_runtime": 40.7573, + "eval_samples_per_second": 49.071, + "eval_steps_per_second": 0.785, + "step": 100 + }, + { + "epoch": 1.9, + "eval_loss": 2.1280689239501953, + "eval_runtime": 40.8322, + "eval_samples_per_second": 48.981, + "eval_steps_per_second": 0.784, + "step": 105 + }, + { + "epoch": 1.99, + "eval_loss": 2.127443552017212, + "eval_runtime": 40.9036, + "eval_samples_per_second": 48.895, + "eval_steps_per_second": 0.782, + "step": 110 + }, + { + "epoch": 2.08, + "eval_loss": 2.126908779144287, + "eval_runtime": 41.0214, + "eval_samples_per_second": 48.755, + "eval_steps_per_second": 0.78, + "step": 115 + }, + { + "epoch": 2.17, + "learning_rate": 8.624999999999998e-05, + "loss": 2.1101, + "step": 120 + }, + { + "epoch": 2.17, + "eval_loss": 2.126199245452881, + "eval_runtime": 40.9707, + "eval_samples_per_second": 48.815, + "eval_steps_per_second": 0.781, + "step": 120 + }, + { + "epoch": 2.26, + "eval_loss": 2.125767230987549, + "eval_runtime": 40.7834, + "eval_samples_per_second": 49.04, + "eval_steps_per_second": 0.785, + "step": 125 + }, + { + "epoch": 2.35, + "eval_loss": 2.125452995300293, + "eval_runtime": 40.8009, + "eval_samples_per_second": 49.019, + "eval_steps_per_second": 0.784, + "step": 130 + }, + { + "epoch": 2.44, + "eval_loss": 2.1250967979431152, + "eval_runtime": 40.7689, + "eval_samples_per_second": 49.057, + "eval_steps_per_second": 0.785, + "step": 135 + }, + { + "epoch": 2.53, + "learning_rate": 4.875e-05, + "loss": 2.108, + "step": 140 + }, + { + "epoch": 2.53, + "eval_loss": 2.124704599380493, + "eval_runtime": 40.7682, + "eval_samples_per_second": 49.058, + "eval_steps_per_second": 0.785, + "step": 140 + }, + { + "epoch": 2.62, + "eval_loss": 2.124401807785034, + "eval_runtime": 40.7695, + "eval_samples_per_second": 49.056, + "eval_steps_per_second": 0.785, + "step": 145 + }, + { + "epoch": 2.71, + "eval_loss": 2.1242740154266357, + "eval_runtime": 40.8982, + "eval_samples_per_second": 48.902, + "eval_steps_per_second": 0.782, + "step": 150 + }, + { + "epoch": 2.81, + "eval_loss": 2.124105930328369, + "eval_runtime": 40.7925, + "eval_samples_per_second": 49.029, + "eval_steps_per_second": 0.784, + "step": 155 + }, + { + "epoch": 2.9, + "learning_rate": 1.1249999999999999e-05, + "loss": 2.0996, + "step": 160 + }, + { + "epoch": 2.9, + "eval_loss": 2.123983860015869, + "eval_runtime": 40.7694, + "eval_samples_per_second": 49.056, + "eval_steps_per_second": 0.785, + "step": 160 + } + ], + "max_steps": 165, + "num_train_epochs": 3, + "total_flos": 2.354051216792617e+18, + "trial_name": null, + "trial_params": null +} diff --git a/adapters/saved_llamaHC3_human/checkpoint-160/training_args.bin b/adapters/saved_llamaHC3_human/checkpoint-160/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..08d435511e54e49bd3b0e2c715e12420f287521d --- /dev/null +++ b/adapters/saved_llamaHC3_human/checkpoint-160/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:004b0b2ef592ca03dc7186e3339fabafe55905860b1dd3e8c499f5470388c62b +size 3643 diff --git a/adapters/saved_llamaHC3_human/checkpoint-165/optimizer.pt b/adapters/saved_llamaHC3_human/checkpoint-165/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..92abb4e6125d48fcbf62985c4610d0ee633176e8 --- /dev/null +++ b/adapters/saved_llamaHC3_human/checkpoint-165/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3a9a1cac4d21f536a57718869d0f9e9d151670b86d9875023af9634ed0f489ed +size 33629765 diff --git a/adapters/saved_llamaHC3_human/checkpoint-165/pytorch_model.bin b/adapters/saved_llamaHC3_human/checkpoint-165/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..6b9903eaa522c45a7a243c19db3f7fb0fcdcde31 --- /dev/null +++ b/adapters/saved_llamaHC3_human/checkpoint-165/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d04eeb6e4d3333e9e923381c3b3b657f16f2e06eba5c6d5e85872c5123057494 +size 16822989 diff --git a/adapters/saved_llamaHC3_human/checkpoint-165/rng_state_0.pth b/adapters/saved_llamaHC3_human/checkpoint-165/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..551cae8642a0b7fc2fdb3913c08df40484c4080c --- /dev/null +++ b/adapters/saved_llamaHC3_human/checkpoint-165/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0df71113d63dfac6284cda91980df51835e75d2993b61c5066c117256b33da06 +size 14583 diff --git a/adapters/saved_llamaHC3_human/checkpoint-165/rng_state_1.pth b/adapters/saved_llamaHC3_human/checkpoint-165/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..09d39e02e357adcb9d55516738dc0b208de2f55a --- /dev/null +++ b/adapters/saved_llamaHC3_human/checkpoint-165/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:35e912ca615d64168d033379d6295ad3b4f707ff889448420f408dd99174b749 +size 14583 diff --git a/adapters/saved_llamaHC3_human/checkpoint-165/rng_state_2.pth b/adapters/saved_llamaHC3_human/checkpoint-165/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..94ca67187f7ae47ba21fe17f1e0a02b33029caf0 --- /dev/null +++ b/adapters/saved_llamaHC3_human/checkpoint-165/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de957d4db5de4a9bcf4481b61cf221028642a6ab97d39b645d27ff1620931ce0 +size 14583 diff --git a/adapters/saved_llamaHC3_human/checkpoint-165/rng_state_3.pth b/adapters/saved_llamaHC3_human/checkpoint-165/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..620b1bf5562171a6a1cfe2c08bada070466f1358 --- /dev/null +++ b/adapters/saved_llamaHC3_human/checkpoint-165/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d92515fea73a0644cb60f9a8114513779003f40f53105f9c039926d34ca63d5b +size 14583 diff --git a/adapters/saved_llamaHC3_human/checkpoint-165/rng_state_4.pth b/adapters/saved_llamaHC3_human/checkpoint-165/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..693a8439c124f5c16e11f67c339fba6208d23912 --- /dev/null +++ b/adapters/saved_llamaHC3_human/checkpoint-165/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f2706a5df43cbd99d29f3c11b845251483af0685307a47f8070aa599560aff9c +size 14583 diff --git a/adapters/saved_llamaHC3_human/checkpoint-165/rng_state_5.pth b/adapters/saved_llamaHC3_human/checkpoint-165/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..846cbc50adee29a80968277431b7b535148536ee --- /dev/null +++ b/adapters/saved_llamaHC3_human/checkpoint-165/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:99af46c77cb177d18f5c260a7c9d3ad93aac87e3bfb5d66e6dd8cd2b0857e364 +size 14583 diff --git a/adapters/saved_llamaHC3_human/checkpoint-165/rng_state_6.pth b/adapters/saved_llamaHC3_human/checkpoint-165/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..59ada88327a3283390ceb03fa0d844e5c213adc9 --- /dev/null +++ b/adapters/saved_llamaHC3_human/checkpoint-165/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:62dd424a9a35a2e0d99077824b7e0fb9a52d3631fb9e6fed49f8d16c2f713d30 +size 14583 diff --git a/adapters/saved_llamaHC3_human/checkpoint-165/rng_state_7.pth b/adapters/saved_llamaHC3_human/checkpoint-165/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..3ca9a78f43ee1df912d1d0f71e7ed645aa497633 --- /dev/null +++ b/adapters/saved_llamaHC3_human/checkpoint-165/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:12de3cf5060c09bac479aabe273b3977f5c7c90b7ebe88d2d3eccde178fa9827 +size 14583 diff --git a/adapters/saved_llamaHC3_human/checkpoint-165/scaler.pt b/adapters/saved_llamaHC3_human/checkpoint-165/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..a7029fec9eb667a478dd158cc8acdd2456b7c0e4 --- /dev/null +++ b/adapters/saved_llamaHC3_human/checkpoint-165/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eab10427fed8dab31c87bbacacda6ca6d724feb5d5955984260d926594db1de2 +size 557 diff --git a/adapters/saved_llamaHC3_human/checkpoint-165/scheduler.pt b/adapters/saved_llamaHC3_human/checkpoint-165/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..a7302eba30fed1ed5988b8a618fdd3870fbe0e30 --- /dev/null +++ b/adapters/saved_llamaHC3_human/checkpoint-165/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:baaf489745255b059e313fce2c8a0eb8cb8e0292007620bf741e9ad1d15f62cd +size 627 diff --git a/adapters/saved_llamaHC3_human/checkpoint-165/trainer_state.json b/adapters/saved_llamaHC3_human/checkpoint-165/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..f540745534413c842ae6270988937fed1420d8b6 --- /dev/null +++ b/adapters/saved_llamaHC3_human/checkpoint-165/trainer_state.json @@ -0,0 +1,328 @@ +{ + "best_metric": 2.123960494995117, + "best_model_checkpoint": "/mnt/bn/qingyi-bn-lq/llama/saved_llamaHC3_human/checkpoint-165", + "epoch": 2.986425339366516, + "global_step": 165, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.09, + "eval_loss": 2.3469691276550293, + "eval_runtime": 40.7893, + "eval_samples_per_second": 49.032, + "eval_steps_per_second": 0.785, + "step": 5 + }, + { + "epoch": 0.18, + "eval_loss": 2.254398822784424, + "eval_runtime": 40.7321, + "eval_samples_per_second": 49.101, + "eval_steps_per_second": 0.786, + "step": 10 + }, + { + "epoch": 0.27, + "eval_loss": 2.216655731201172, + "eval_runtime": 40.7948, + "eval_samples_per_second": 49.026, + "eval_steps_per_second": 0.784, + "step": 15 + }, + { + "epoch": 0.36, + "learning_rate": 0.000271875, + "loss": 2.265, + "step": 20 + }, + { + "epoch": 0.36, + "eval_loss": 2.192472219467163, + "eval_runtime": 40.9073, + "eval_samples_per_second": 48.891, + "eval_steps_per_second": 0.782, + "step": 20 + }, + { + "epoch": 0.45, + "eval_loss": 2.1772661209106445, + "eval_runtime": 40.8472, + "eval_samples_per_second": 48.963, + "eval_steps_per_second": 0.783, + "step": 25 + }, + { + "epoch": 0.54, + "eval_loss": 2.1680707931518555, + "eval_runtime": 40.7376, + "eval_samples_per_second": 49.095, + "eval_steps_per_second": 0.786, + "step": 30 + }, + { + "epoch": 0.63, + "eval_loss": 2.15969181060791, + "eval_runtime": 40.7043, + "eval_samples_per_second": 49.135, + "eval_steps_per_second": 0.786, + "step": 35 + }, + { + "epoch": 0.72, + "learning_rate": 0.00023624999999999997, + "loss": 2.1528, + "step": 40 + }, + { + "epoch": 0.72, + "eval_loss": 2.1535723209381104, + "eval_runtime": 40.7472, + "eval_samples_per_second": 49.083, + "eval_steps_per_second": 0.785, + "step": 40 + }, + { + "epoch": 0.81, + "eval_loss": 2.149122476577759, + "eval_runtime": 40.7594, + "eval_samples_per_second": 49.068, + "eval_steps_per_second": 0.785, + "step": 45 + }, + { + "epoch": 0.9, + "eval_loss": 2.1453487873077393, + "eval_runtime": 40.7319, + "eval_samples_per_second": 49.102, + "eval_steps_per_second": 0.786, + "step": 50 + }, + { + "epoch": 1.0, + "eval_loss": 2.142251968383789, + "eval_runtime": 40.8482, + "eval_samples_per_second": 48.962, + "eval_steps_per_second": 0.783, + "step": 55 + }, + { + "epoch": 1.09, + "learning_rate": 0.00019874999999999998, + "loss": 2.1332, + "step": 60 + }, + { + "epoch": 1.09, + "eval_loss": 2.139620065689087, + "eval_runtime": 40.7619, + "eval_samples_per_second": 49.065, + "eval_steps_per_second": 0.785, + "step": 60 + }, + { + "epoch": 1.18, + "eval_loss": 2.137423276901245, + "eval_runtime": 40.783, + "eval_samples_per_second": 49.04, + "eval_steps_per_second": 0.785, + "step": 65 + }, + { + "epoch": 1.27, + "eval_loss": 2.135718822479248, + "eval_runtime": 40.7802, + "eval_samples_per_second": 49.043, + "eval_steps_per_second": 0.785, + "step": 70 + }, + { + "epoch": 1.36, + "eval_loss": 2.134214401245117, + "eval_runtime": 40.7213, + "eval_samples_per_second": 49.114, + "eval_steps_per_second": 0.786, + "step": 75 + }, + { + "epoch": 1.45, + "learning_rate": 0.00016125, + "loss": 2.1152, + "step": 80 + }, + { + "epoch": 1.45, + "eval_loss": 2.1330068111419678, + "eval_runtime": 40.8447, + "eval_samples_per_second": 48.966, + "eval_steps_per_second": 0.783, + "step": 80 + }, + { + "epoch": 1.54, + "eval_loss": 2.131664276123047, + "eval_runtime": 40.9633, + "eval_samples_per_second": 48.824, + "eval_steps_per_second": 0.781, + "step": 85 + }, + { + "epoch": 1.63, + "eval_loss": 2.1305532455444336, + "eval_runtime": 40.7803, + "eval_samples_per_second": 49.043, + "eval_steps_per_second": 0.785, + "step": 90 + }, + { + "epoch": 1.72, + "eval_loss": 2.1295671463012695, + "eval_runtime": 40.8088, + "eval_samples_per_second": 49.009, + "eval_steps_per_second": 0.784, + "step": 95 + }, + { + "epoch": 1.81, + "learning_rate": 0.00012374999999999997, + "loss": 2.1138, + "step": 100 + }, + { + "epoch": 1.81, + "eval_loss": 2.1289596557617188, + "eval_runtime": 40.7573, + "eval_samples_per_second": 49.071, + "eval_steps_per_second": 0.785, + "step": 100 + }, + { + "epoch": 1.9, + "eval_loss": 2.1280689239501953, + "eval_runtime": 40.8322, + "eval_samples_per_second": 48.981, + "eval_steps_per_second": 0.784, + "step": 105 + }, + { + "epoch": 1.99, + "eval_loss": 2.127443552017212, + "eval_runtime": 40.9036, + "eval_samples_per_second": 48.895, + "eval_steps_per_second": 0.782, + "step": 110 + }, + { + "epoch": 2.08, + "eval_loss": 2.126908779144287, + "eval_runtime": 41.0214, + "eval_samples_per_second": 48.755, + "eval_steps_per_second": 0.78, + "step": 115 + }, + { + "epoch": 2.17, + "learning_rate": 8.624999999999998e-05, + "loss": 2.1101, + "step": 120 + }, + { + "epoch": 2.17, + "eval_loss": 2.126199245452881, + "eval_runtime": 40.9707, + "eval_samples_per_second": 48.815, + "eval_steps_per_second": 0.781, + "step": 120 + }, + { + "epoch": 2.26, + "eval_loss": 2.125767230987549, + "eval_runtime": 40.7834, + "eval_samples_per_second": 49.04, + "eval_steps_per_second": 0.785, + "step": 125 + }, + { + "epoch": 2.35, + "eval_loss": 2.125452995300293, + "eval_runtime": 40.8009, + "eval_samples_per_second": 49.019, + "eval_steps_per_second": 0.784, + "step": 130 + }, + { + "epoch": 2.44, + "eval_loss": 2.1250967979431152, + "eval_runtime": 40.7689, + "eval_samples_per_second": 49.057, + "eval_steps_per_second": 0.785, + "step": 135 + }, + { + "epoch": 2.53, + "learning_rate": 4.875e-05, + "loss": 2.108, + "step": 140 + }, + { + "epoch": 2.53, + "eval_loss": 2.124704599380493, + "eval_runtime": 40.7682, + "eval_samples_per_second": 49.058, + "eval_steps_per_second": 0.785, + "step": 140 + }, + { + "epoch": 2.62, + "eval_loss": 2.124401807785034, + "eval_runtime": 40.7695, + "eval_samples_per_second": 49.056, + "eval_steps_per_second": 0.785, + "step": 145 + }, + { + "epoch": 2.71, + "eval_loss": 2.1242740154266357, + "eval_runtime": 40.8982, + "eval_samples_per_second": 48.902, + "eval_steps_per_second": 0.782, + "step": 150 + }, + { + "epoch": 2.81, + "eval_loss": 2.124105930328369, + "eval_runtime": 40.7925, + "eval_samples_per_second": 49.029, + "eval_steps_per_second": 0.784, + "step": 155 + }, + { + "epoch": 2.9, + "learning_rate": 1.1249999999999999e-05, + "loss": 2.0996, + "step": 160 + }, + { + "epoch": 2.9, + "eval_loss": 2.123983860015869, + "eval_runtime": 40.7694, + "eval_samples_per_second": 49.056, + "eval_steps_per_second": 0.785, + "step": 160 + }, + { + "epoch": 2.99, + "eval_loss": 2.123960494995117, + "eval_runtime": 40.8472, + "eval_samples_per_second": 48.963, + "eval_steps_per_second": 0.783, + "step": 165 + } + ], + "max_steps": 165, + "num_train_epochs": 3, + "total_flos": 2.4274443652709745e+18, + "trial_name": null, + "trial_params": null +} diff --git a/adapters/saved_llamaHC3_human/checkpoint-165/training_args.bin b/adapters/saved_llamaHC3_human/checkpoint-165/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..08d435511e54e49bd3b0e2c715e12420f287521d --- /dev/null +++ b/adapters/saved_llamaHC3_human/checkpoint-165/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:004b0b2ef592ca03dc7186e3339fabafe55905860b1dd3e8c499f5470388c62b +size 3643 diff --git a/adapters/saved_llamaalpacaGPT4/checkpoint-4/optimizer.pt b/adapters/saved_llamaalpacaGPT4/checkpoint-4/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..9470862920065ee85ef62f20ac78ad58bd11708c --- /dev/null +++ b/adapters/saved_llamaalpacaGPT4/checkpoint-4/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3e1dad334a8cf6f6c5a992cbd114d70b469e4b6752ba6dc01000aeaa00473e61 +size 33629765 diff --git a/adapters/saved_llamaalpacaGPT4/checkpoint-4/pytorch_model.bin b/adapters/saved_llamaalpacaGPT4/checkpoint-4/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..d3fa62c4a7ce528460e60d574d5050e702b18175 --- /dev/null +++ b/adapters/saved_llamaalpacaGPT4/checkpoint-4/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cee67249f32a2bf84ede01972c5406ac2838bb0c00a5357f3b8b3d24d982cdc4 +size 16822989 diff --git a/adapters/saved_llamaalpacaGPT4/checkpoint-4/rng_state_0.pth b/adapters/saved_llamaalpacaGPT4/checkpoint-4/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..689634ea738f70f571c7605e8bcf8c7233ffd340 --- /dev/null +++ b/adapters/saved_llamaalpacaGPT4/checkpoint-4/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bac0d0e15d5d789c42e0fe2b180cfb53d540c0387f35fec444e611a8fb777dc2 +size 14583 diff --git a/adapters/saved_llamaalpacaGPT4/checkpoint-4/rng_state_1.pth b/adapters/saved_llamaalpacaGPT4/checkpoint-4/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..fc530424631585f0926833da2c9a5eab775a76d2 --- /dev/null +++ b/adapters/saved_llamaalpacaGPT4/checkpoint-4/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e51a2f2112f9d6a4a8e6a98d22d5d8e19c6dd37d51376ebb512343bedc52837e +size 14583 diff --git a/adapters/saved_llamaalpacaGPT4/checkpoint-4/rng_state_2.pth b/adapters/saved_llamaalpacaGPT4/checkpoint-4/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..84a6c11f7679f88af4c259e1310d5dad9587d554 --- /dev/null +++ b/adapters/saved_llamaalpacaGPT4/checkpoint-4/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:62af3525863f24a3534075c3fa7a66ca75d0cc75dc3c0ef21afe4f1e80896bab +size 14583 diff --git a/adapters/saved_llamaalpacaGPT4/checkpoint-4/rng_state_3.pth b/adapters/saved_llamaalpacaGPT4/checkpoint-4/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..9a23ff7c60098399ab8e31e3581187055def3778 --- /dev/null +++ b/adapters/saved_llamaalpacaGPT4/checkpoint-4/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:312e3a9bded34df6c8bad0773065a6c055e2d3a52459453d337f22592a9eb4da +size 14583 diff --git a/adapters/saved_llamaalpacaGPT4/checkpoint-4/rng_state_4.pth b/adapters/saved_llamaalpacaGPT4/checkpoint-4/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..847b655bd0800b42bf1a38bed719fb78b2e93423 --- /dev/null +++ b/adapters/saved_llamaalpacaGPT4/checkpoint-4/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e22ad9c63186541f5b0a14c4c7b6651f801e3ac2fcc49058d054e2ce69c483ab +size 14583 diff --git a/adapters/saved_llamaalpacaGPT4/checkpoint-4/rng_state_5.pth b/adapters/saved_llamaalpacaGPT4/checkpoint-4/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..df6840bd54d8f2d0c5557cd2aaeb4aeafee1f3ac --- /dev/null +++ b/adapters/saved_llamaalpacaGPT4/checkpoint-4/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6eb93c9230e55892fde33a7bcf75d2495876690e7ef4d9b6d3aaae071e33c5e8 +size 14583 diff --git a/adapters/saved_llamaalpacaGPT4/checkpoint-4/rng_state_6.pth b/adapters/saved_llamaalpacaGPT4/checkpoint-4/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..6494ac8dda4a7e2497695ab177e783e90596186d --- /dev/null +++ b/adapters/saved_llamaalpacaGPT4/checkpoint-4/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4f6b061ee49f94f3820cad544147d1917766288e416f34f39611965979738ade +size 14583 diff --git a/adapters/saved_llamaalpacaGPT4/checkpoint-4/rng_state_7.pth b/adapters/saved_llamaalpacaGPT4/checkpoint-4/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..4384d335f42a25ad1b8719b2990070206dffe5c0 --- /dev/null +++ b/adapters/saved_llamaalpacaGPT4/checkpoint-4/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:343a47426c7d56821b9078fd0ec538fb7a7b5327e027b6fcd8010cae1549f135 +size 14583 diff --git a/adapters/saved_llamaalpacaGPT4/checkpoint-4/scaler.pt b/adapters/saved_llamaalpacaGPT4/checkpoint-4/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..f09d7be44441abd8e758a0f02a48131245111f91 --- /dev/null +++ b/adapters/saved_llamaalpacaGPT4/checkpoint-4/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2c68654e686758a37c420d9849703f2bc33e5969162688f5f31cfc4aa9705212 +size 557 diff --git a/adapters/saved_llamaalpacaGPT4/checkpoint-4/scheduler.pt b/adapters/saved_llamaalpacaGPT4/checkpoint-4/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..1f24113b082752638c9b9a6f8f8a8713d17f40ee --- /dev/null +++ b/adapters/saved_llamaalpacaGPT4/checkpoint-4/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ce733d9b27b8c2ed769776b49c49ce7dcbffcd8ef93a330d5052a441ae6b2e5d +size 627 diff --git a/adapters/saved_llamaalpacaGPT4/checkpoint-4/trainer_state.json b/adapters/saved_llamaalpacaGPT4/checkpoint-4/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..869bc1b06ea145ce7f6384fbb5f0dff504b5797c --- /dev/null +++ b/adapters/saved_llamaalpacaGPT4/checkpoint-4/trainer_state.json @@ -0,0 +1,24 @@ +{ + "best_metric": 1.2102158069610596, + "best_model_checkpoint": "/mnt/bn/qingyi-bn-lq/llama/saved_llamaalpacaGPT4/checkpoint-4", + "epoch": 0.0818937939859245, + "global_step": 4, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.08, + "eval_loss": 1.2102158069610596, + "eval_runtime": 40.7424, + "eval_samples_per_second": 49.089, + "eval_steps_per_second": 0.785, + "step": 4 + } + ], + "max_steps": 144, + "num_train_epochs": 3, + "total_flos": 6.17380116913193e+16, + "trial_name": null, + "trial_params": null +} diff --git a/adapters/saved_llamaalpacaGPT4/checkpoint-4/training_args.bin b/adapters/saved_llamaalpacaGPT4/checkpoint-4/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..b7091f62ab0cbc8c92f6b795980388bf1294cc3b --- /dev/null +++ b/adapters/saved_llamaalpacaGPT4/checkpoint-4/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bb435fbd20725afe7232bd58e6e6076e3d30ce1f179f9c80a9bd051bd39f79fb +size 3643 diff --git a/adapters/saved_llamaalpacaGPT4/checkpoint-8/optimizer.pt b/adapters/saved_llamaalpacaGPT4/checkpoint-8/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..68cb672b35d6b1fb2293b5f67ef800d3b8987029 --- /dev/null +++ b/adapters/saved_llamaalpacaGPT4/checkpoint-8/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:56341bcb358a394fe779268c1a50f621f29c6d6f64b66db0fe1c84e5489064d9 +size 33629765 diff --git a/adapters/saved_llamaalpacaGPT4/checkpoint-8/pytorch_model.bin b/adapters/saved_llamaalpacaGPT4/checkpoint-8/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..f23cdcc0ef737873714fbc3c19f0e1114ebab26c --- /dev/null +++ b/adapters/saved_llamaalpacaGPT4/checkpoint-8/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:95733cdc836f6b508fc3793981660092be3a0311b731e99c650717b562e5ee78 +size 16822989 diff --git a/adapters/saved_llamaalpacaGPT4/checkpoint-8/rng_state_0.pth b/adapters/saved_llamaalpacaGPT4/checkpoint-8/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..46349576985cf478deffac8a206a82039ea8774d --- /dev/null +++ b/adapters/saved_llamaalpacaGPT4/checkpoint-8/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ad5430df3864b49a8bc14c66caf0d8af260f28b5935926fe9986447fd56a52f1 +size 14583 diff --git a/adapters/saved_llamaalpacaGPT4/checkpoint-8/rng_state_1.pth b/adapters/saved_llamaalpacaGPT4/checkpoint-8/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..e5fde093ae04266f43958266b0db5c16353cdf7d --- /dev/null +++ b/adapters/saved_llamaalpacaGPT4/checkpoint-8/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a3d53270f2b6b08f7dc592a01f9c1c54fad1b0635ffb5c1a48e9423a3f5c3774 +size 14583 diff --git a/adapters/saved_llamaalpacaGPT4/checkpoint-8/rng_state_2.pth b/adapters/saved_llamaalpacaGPT4/checkpoint-8/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..e54c2910404e6b0636106be85d98b6e857d5fd7a --- /dev/null +++ b/adapters/saved_llamaalpacaGPT4/checkpoint-8/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5f436b034150f1434d762b173fe4d02a7d1b7cde6e035e37118a7ba9cdb2f52a +size 14583 diff --git a/adapters/saved_llamaalpacaGPT4/checkpoint-8/rng_state_3.pth b/adapters/saved_llamaalpacaGPT4/checkpoint-8/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..f9db84228c653b94597a71e55e06c743ed136776 --- /dev/null +++ b/adapters/saved_llamaalpacaGPT4/checkpoint-8/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1155f00593444b8f2f77dfc1bd66a92d71e19f41da9e85e66628746701cc0177 +size 14583 diff --git a/adapters/saved_llamaalpacaGPT4/checkpoint-8/rng_state_4.pth b/adapters/saved_llamaalpacaGPT4/checkpoint-8/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..88d4f671bdbf73ca451850b82c832abf6c321cdb --- /dev/null +++ b/adapters/saved_llamaalpacaGPT4/checkpoint-8/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a9d75261d34a919f6345a84869adbe4f14279ffec17804358760cb61354571aa +size 14583 diff --git a/adapters/saved_llamaalpacaGPT4/checkpoint-8/rng_state_5.pth b/adapters/saved_llamaalpacaGPT4/checkpoint-8/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..9ad1c1493c05b263c66e9a48f8b4b15966758f22 --- /dev/null +++ b/adapters/saved_llamaalpacaGPT4/checkpoint-8/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5bb654221b1e224d0e66ed73ef8069976e3f1deb2d46c2591cbc2f350d88ce6b +size 14583 diff --git a/adapters/saved_llamaalpacaGPT4/checkpoint-8/rng_state_6.pth b/adapters/saved_llamaalpacaGPT4/checkpoint-8/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..0807d8ed1629b8491ed8a48619f34a6f3abf655a --- /dev/null +++ b/adapters/saved_llamaalpacaGPT4/checkpoint-8/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:99a348bf878ca81cf377e25e8be68a7ce70c880a5de96eb90399d38997b22849 +size 14583 diff --git a/adapters/saved_llamaalpacaGPT4/checkpoint-8/rng_state_7.pth b/adapters/saved_llamaalpacaGPT4/checkpoint-8/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..d6c4ad876817c88e67b74e9a1ca9aeffbd3c173d --- /dev/null +++ b/adapters/saved_llamaalpacaGPT4/checkpoint-8/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d8674ef85c847b8dd214d8009cb18181d998986c72d3698a89a5d41dcca17950 +size 14583 diff --git a/adapters/saved_llamaalpacaGPT4/checkpoint-8/scaler.pt b/adapters/saved_llamaalpacaGPT4/checkpoint-8/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..4e487ff88bf42511dea594f60b41340e49aee7c2 --- /dev/null +++ b/adapters/saved_llamaalpacaGPT4/checkpoint-8/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a79944e2dfd765a29c7ec78561cfb25a4e79d7bdb4c76c3a8717188592da7b04 +size 557 diff --git a/adapters/saved_llamaalpacaGPT4/checkpoint-8/scheduler.pt b/adapters/saved_llamaalpacaGPT4/checkpoint-8/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..e6c4b2b044285f28e43131fcdc423b43f91cfe34 --- /dev/null +++ b/adapters/saved_llamaalpacaGPT4/checkpoint-8/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c31d01a3e517a55de146bf720fd5225445846b0fed6214cf9da7cc98a7f9609a +size 627 diff --git a/adapters/saved_llamaalpacaGPT4/checkpoint-8/trainer_state.json b/adapters/saved_llamaalpacaGPT4/checkpoint-8/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..328b3da5b5545a604112a04d3afebe937fc07b6d --- /dev/null +++ b/adapters/saved_llamaalpacaGPT4/checkpoint-8/trainer_state.json @@ -0,0 +1,32 @@ +{ + "best_metric": 1.1144840717315674, + "best_model_checkpoint": "/mnt/bn/qingyi-bn-lq/llama/saved_llamaalpacaGPT4/checkpoint-8", + "epoch": 0.163787587971849, + "global_step": 8, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.08, + "eval_loss": 1.2102158069610596, + "eval_runtime": 40.7424, + "eval_samples_per_second": 49.089, + "eval_steps_per_second": 0.785, + "step": 4 + }, + { + "epoch": 0.16, + "eval_loss": 1.1144840717315674, + "eval_runtime": 40.6507, + "eval_samples_per_second": 49.2, + "eval_steps_per_second": 0.787, + "step": 8 + } + ], + "max_steps": 144, + "num_train_epochs": 3, + "total_flos": 1.2265264808722432e+17, + "trial_name": null, + "trial_params": null +} diff --git a/adapters/saved_llamaalpacaGPT4/checkpoint-8/training_args.bin b/adapters/saved_llamaalpacaGPT4/checkpoint-8/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..b7091f62ab0cbc8c92f6b795980388bf1294cc3b --- /dev/null +++ b/adapters/saved_llamaalpacaGPT4/checkpoint-8/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bb435fbd20725afe7232bd58e6e6076e3d30ce1f179f9c80a9bd051bd39f79fb +size 3643 diff --git a/adapters/saved_llamacode_alpaca/adapter_config.json b/adapters/saved_llamacode_alpaca/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..8f033fdd6b7869b39cae4f1c30521c8561ed73ff --- /dev/null +++ b/adapters/saved_llamacode_alpaca/adapter_config.json @@ -0,0 +1,17 @@ +{ + "base_model_name_or_path": "decapoda-research/llama-7b-hf", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "lora_alpha": 16, + "lora_dropout": 0.05, + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/adapters/saved_llamacode_alpaca/adapter_model.bin b/adapters/saved_llamacode_alpaca/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..8710093665dfdb9819e2f8817a1e25a4ccdd9935 --- /dev/null +++ b/adapters/saved_llamacode_alpaca/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e5e1621f48d9ad8feb1d6d31050275f0aafd080c5c07153301fe2f48411f4406 +size 443 diff --git a/adapters/saved_llamafinance/adapter_config.json b/adapters/saved_llamafinance/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..8f033fdd6b7869b39cae4f1c30521c8561ed73ff --- /dev/null +++ b/adapters/saved_llamafinance/adapter_config.json @@ -0,0 +1,17 @@ +{ + "base_model_name_or_path": "decapoda-research/llama-7b-hf", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "lora_alpha": 16, + "lora_dropout": 0.05, + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/adapters/saved_llamafinance/adapter_model.bin b/adapters/saved_llamafinance/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..8710093665dfdb9819e2f8817a1e25a4ccdd9935 --- /dev/null +++ b/adapters/saved_llamafinance/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e5e1621f48d9ad8feb1d6d31050275f0aafd080c5c07153301fe2f48411f4406 +size 443 diff --git a/adapters/saved_llamainstinwild/adapter_config.json b/adapters/saved_llamainstinwild/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d07a7b67f6c57422ee3209e5778eb76d5dfe37c6 --- /dev/null +++ b/adapters/saved_llamainstinwild/adapter_config.json @@ -0,0 +1,17 @@ +{ + "base_model_name_or_path": "/mnt/bn/qingyi-bn-lq/llama/llama-7b-hf/", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "lora_alpha": 16, + "lora_dropout": 0.05, + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/adapters/saved_llamainstinwild/adapter_model.bin b/adapters/saved_llamainstinwild/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..8710093665dfdb9819e2f8817a1e25a4ccdd9935 --- /dev/null +++ b/adapters/saved_llamainstinwild/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e5e1621f48d9ad8feb1d6d31050275f0aafd080c5c07153301fe2f48411f4406 +size 443 diff --git a/adapters/saved_llamainstinwild/checkpoint-104/optimizer.pt b/adapters/saved_llamainstinwild/checkpoint-104/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..8053349ece2a5abf592aa691807cad49ad8b6fd9 --- /dev/null +++ b/adapters/saved_llamainstinwild/checkpoint-104/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dc3ecafe6628b4f11cc716f6160fc769060b2cbe66f00057152002ac2da7aa78 +size 33629765 diff --git a/adapters/saved_llamainstinwild/checkpoint-104/pytorch_model.bin b/adapters/saved_llamainstinwild/checkpoint-104/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..7c6b611de76aff05930cec85f8f880b1f39ad43d --- /dev/null +++ b/adapters/saved_llamainstinwild/checkpoint-104/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe53f36e405f3f0be91c7bd75da7f7b55f07ffabce8d0c83a303cf99162a6049 +size 16822989 diff --git a/adapters/saved_llamainstinwild/checkpoint-104/rng_state_0.pth b/adapters/saved_llamainstinwild/checkpoint-104/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..c734890bb6a59c3ccff30c248a49726a21cce8b2 --- /dev/null +++ b/adapters/saved_llamainstinwild/checkpoint-104/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f3f086040fb3781d9a7ae9501f3d05f20239dac26732b16b9d6a2ddbac39523c +size 14583 diff --git a/adapters/saved_llamainstinwild/checkpoint-104/rng_state_1.pth b/adapters/saved_llamainstinwild/checkpoint-104/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..c55b4312c0a8a92040b69e43f8bc6c150487ddad --- /dev/null +++ b/adapters/saved_llamainstinwild/checkpoint-104/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d36dd8c37de16ee977eb7f3859f8533c3b5fe678d1dcaf8f59a92944621ff27f +size 14583 diff --git a/adapters/saved_llamainstinwild/checkpoint-104/rng_state_2.pth b/adapters/saved_llamainstinwild/checkpoint-104/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..eb78e3d97a6a13a7fc32c48550b66ccf727133d9 --- /dev/null +++ b/adapters/saved_llamainstinwild/checkpoint-104/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:542893540b97f37a62a602842acb01686cf1a481dac18aa80c91ebf8f30eaaeb +size 14583 diff --git a/adapters/saved_llamainstinwild/checkpoint-104/rng_state_3.pth b/adapters/saved_llamainstinwild/checkpoint-104/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..0b6de744855cfa06539f7fcee4c8f4ad1691c8f2 --- /dev/null +++ b/adapters/saved_llamainstinwild/checkpoint-104/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:93673368b316df1ae2c05a8af9032a3caed919f9b6a1bc38c354eb52bbcaee86 +size 14583 diff --git a/adapters/saved_llamainstinwild/checkpoint-104/rng_state_4.pth b/adapters/saved_llamainstinwild/checkpoint-104/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..e60554a921e4c423e909066b5dd030396ee381ae --- /dev/null +++ b/adapters/saved_llamainstinwild/checkpoint-104/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:afd50a44322a8f6d5ad7513ecb3dc22ee75f9e55fd31bdd9f59adb3375ba1524 +size 14583 diff --git a/adapters/saved_llamainstinwild/checkpoint-104/rng_state_5.pth b/adapters/saved_llamainstinwild/checkpoint-104/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..bc57cc81cc960b8b8842877b1ebc2b63b48404bd --- /dev/null +++ b/adapters/saved_llamainstinwild/checkpoint-104/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:95b03e1f0f46a3d3a120817d30fb4235bcb3740a8193a5db426a81432aa22b28 +size 14583 diff --git a/adapters/saved_llamainstinwild/checkpoint-104/rng_state_6.pth b/adapters/saved_llamainstinwild/checkpoint-104/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..fa00cc06519accacc941d226f32513ca034d037c --- /dev/null +++ b/adapters/saved_llamainstinwild/checkpoint-104/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a258b9dd5b4eff0b62569267ac8ca03b534ae6eb9214539d9b299ad315e416bc +size 14583 diff --git a/adapters/saved_llamainstinwild/checkpoint-104/rng_state_7.pth b/adapters/saved_llamainstinwild/checkpoint-104/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..d8411b0b957228f5593c0a996b01dae69988f35f --- /dev/null +++ b/adapters/saved_llamainstinwild/checkpoint-104/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d0411f4a23d6f72bae17afc5e75f5224a11a287ecb3174c0faa0a29fb0934406 +size 14583 diff --git a/adapters/saved_llamainstinwild/checkpoint-104/scaler.pt b/adapters/saved_llamainstinwild/checkpoint-104/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..259d847af9920bffbee58d6ef1e9a1c5a19eabc4 --- /dev/null +++ b/adapters/saved_llamainstinwild/checkpoint-104/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ef8657b3b76e9105153d42e84ef17c3d7970afd93993db6c2238fac95a424a8c +size 557 diff --git a/adapters/saved_llamainstinwild/checkpoint-104/scheduler.pt b/adapters/saved_llamainstinwild/checkpoint-104/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..62a7eb82b1e80ac6e2074218b19c146a58dcd8f9 --- /dev/null +++ b/adapters/saved_llamainstinwild/checkpoint-104/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:251dbc1d24cfda9efd4fba7f96de2546a3f2c818e20cc08ee3b1d5225c6e0a78 +size 627 diff --git a/adapters/saved_llamainstinwild/checkpoint-104/trainer_state.json b/adapters/saved_llamainstinwild/checkpoint-104/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..3ebc45bad4eff4920864bf74948e83dec6451e83 --- /dev/null +++ b/adapters/saved_llamainstinwild/checkpoint-104/trainer_state.json @@ -0,0 +1,254 @@ +{ + "best_metric": 1.176965594291687, + "best_model_checkpoint": "/mnt/bn/qingyi-bn-lq/llama/saved_llamainstinwild/checkpoint-104", + "epoch": 2.1210962396430846, + "global_step": 104, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.08, + "eval_loss": 1.39645254611969, + "eval_runtime": 37.7333, + "eval_samples_per_second": 53.004, + "eval_steps_per_second": 0.848, + "step": 4 + }, + { + "epoch": 0.16, + "eval_loss": 1.2984137535095215, + "eval_runtime": 37.7164, + "eval_samples_per_second": 53.027, + "eval_steps_per_second": 0.848, + "step": 8 + }, + { + "epoch": 0.24, + "eval_loss": 1.2540132999420166, + "eval_runtime": 37.6847, + "eval_samples_per_second": 53.072, + "eval_steps_per_second": 0.849, + "step": 12 + }, + { + "epoch": 0.33, + "eval_loss": 1.2342448234558105, + "eval_runtime": 37.6879, + "eval_samples_per_second": 53.067, + "eval_steps_per_second": 0.849, + "step": 16 + }, + { + "epoch": 0.41, + "learning_rate": 0.0002664335664335664, + "loss": 1.3125, + "step": 20 + }, + { + "epoch": 0.41, + "eval_loss": 1.2209277153015137, + "eval_runtime": 37.7168, + "eval_samples_per_second": 53.027, + "eval_steps_per_second": 0.848, + "step": 20 + }, + { + "epoch": 0.49, + "eval_loss": 1.2117934226989746, + "eval_runtime": 37.6897, + "eval_samples_per_second": 53.065, + "eval_steps_per_second": 0.849, + "step": 24 + }, + { + "epoch": 0.57, + "eval_loss": 1.2057584524154663, + "eval_runtime": 37.7691, + "eval_samples_per_second": 52.953, + "eval_steps_per_second": 0.847, + "step": 28 + }, + { + "epoch": 0.65, + "eval_loss": 1.2009178400039673, + "eval_runtime": 37.7026, + "eval_samples_per_second": 53.047, + "eval_steps_per_second": 0.849, + "step": 32 + }, + { + "epoch": 0.73, + "eval_loss": 1.197250485420227, + "eval_runtime": 37.6765, + "eval_samples_per_second": 53.084, + "eval_steps_per_second": 0.849, + "step": 36 + }, + { + "epoch": 0.82, + "learning_rate": 0.00022447552447552445, + "loss": 1.2095, + "step": 40 + }, + { + "epoch": 0.82, + "eval_loss": 1.1943359375, + "eval_runtime": 37.6881, + "eval_samples_per_second": 53.067, + "eval_steps_per_second": 0.849, + "step": 40 + }, + { + "epoch": 0.9, + "eval_loss": 1.1920316219329834, + "eval_runtime": 37.694, + "eval_samples_per_second": 53.059, + "eval_steps_per_second": 0.849, + "step": 44 + }, + { + "epoch": 0.98, + "eval_loss": 1.1898770332336426, + "eval_runtime": 37.7089, + "eval_samples_per_second": 53.038, + "eval_steps_per_second": 0.849, + "step": 48 + }, + { + "epoch": 1.06, + "eval_loss": 1.1879488229751587, + "eval_runtime": 37.6975, + "eval_samples_per_second": 53.054, + "eval_steps_per_second": 0.849, + "step": 52 + }, + { + "epoch": 1.14, + "eval_loss": 1.1864111423492432, + "eval_runtime": 37.7188, + "eval_samples_per_second": 53.024, + "eval_steps_per_second": 0.848, + "step": 56 + }, + { + "epoch": 1.22, + "learning_rate": 0.00018251748251748253, + "loss": 1.188, + "step": 60 + }, + { + "epoch": 1.22, + "eval_loss": 1.1849411725997925, + "eval_runtime": 37.7686, + "eval_samples_per_second": 52.954, + "eval_steps_per_second": 0.847, + "step": 60 + }, + { + "epoch": 1.31, + "eval_loss": 1.1838239431381226, + "eval_runtime": 37.6945, + "eval_samples_per_second": 53.058, + "eval_steps_per_second": 0.849, + "step": 64 + }, + { + "epoch": 1.39, + "eval_loss": 1.182841181755066, + "eval_runtime": 37.6624, + "eval_samples_per_second": 53.103, + "eval_steps_per_second": 0.85, + "step": 68 + }, + { + "epoch": 1.47, + "eval_loss": 1.1818122863769531, + "eval_runtime": 37.7233, + "eval_samples_per_second": 53.018, + "eval_steps_per_second": 0.848, + "step": 72 + }, + { + "epoch": 1.55, + "eval_loss": 1.1808911561965942, + "eval_runtime": 37.7038, + "eval_samples_per_second": 53.045, + "eval_steps_per_second": 0.849, + "step": 76 + }, + { + "epoch": 1.63, + "learning_rate": 0.00014055944055944055, + "loss": 1.1876, + "step": 80 + }, + { + "epoch": 1.63, + "eval_loss": 1.1801403760910034, + "eval_runtime": 37.7277, + "eval_samples_per_second": 53.011, + "eval_steps_per_second": 0.848, + "step": 80 + }, + { + "epoch": 1.71, + "eval_loss": 1.1793811321258545, + "eval_runtime": 37.6641, + "eval_samples_per_second": 53.101, + "eval_steps_per_second": 0.85, + "step": 84 + }, + { + "epoch": 1.79, + "eval_loss": 1.1788283586502075, + "eval_runtime": 37.7097, + "eval_samples_per_second": 53.037, + "eval_steps_per_second": 0.849, + "step": 88 + }, + { + "epoch": 1.88, + "eval_loss": 1.1783016920089722, + "eval_runtime": 37.7104, + "eval_samples_per_second": 53.036, + "eval_steps_per_second": 0.849, + "step": 92 + }, + { + "epoch": 1.96, + "eval_loss": 1.177699327468872, + "eval_runtime": 37.6884, + "eval_samples_per_second": 53.067, + "eval_steps_per_second": 0.849, + "step": 96 + }, + { + "epoch": 2.04, + "learning_rate": 9.860139860139858e-05, + "loss": 1.182, + "step": 100 + }, + { + "epoch": 2.04, + "eval_loss": 1.1773412227630615, + "eval_runtime": 37.7134, + "eval_samples_per_second": 53.032, + "eval_steps_per_second": 0.849, + "step": 100 + }, + { + "epoch": 2.12, + "eval_loss": 1.176965594291687, + "eval_runtime": 37.6735, + "eval_samples_per_second": 53.088, + "eval_steps_per_second": 0.849, + "step": 104 + } + ], + "max_steps": 147, + "num_train_epochs": 3, + "total_flos": 1.2114014958413414e+18, + "trial_name": null, + "trial_params": null +} diff --git a/adapters/saved_llamainstinwild/checkpoint-104/training_args.bin b/adapters/saved_llamainstinwild/checkpoint-104/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..34b95391e17e9d44eb2a04177250246c85716097 --- /dev/null +++ b/adapters/saved_llamainstinwild/checkpoint-104/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:13fbb17fb4210eddefede6fc992fb1cd0ac7bb1d7d94253bc089086b39aba316 +size 3643 diff --git a/adapters/saved_llamainstinwild/checkpoint-108/optimizer.pt b/adapters/saved_llamainstinwild/checkpoint-108/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..e17bd639b6e1f3cdc34a20d81e5cc8c33a048473 --- /dev/null +++ b/adapters/saved_llamainstinwild/checkpoint-108/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:92cd21716c01e5567d7ab2ca5e1cf99f623c29346673fde8eb952d317bd1cf4e +size 33629765 diff --git a/adapters/saved_llamainstinwild/checkpoint-108/pytorch_model.bin b/adapters/saved_llamainstinwild/checkpoint-108/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..84a2f7446d60d7c189d7b454354dbeeb51b5461d --- /dev/null +++ b/adapters/saved_llamainstinwild/checkpoint-108/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:719946c7c0207590a728e4039fa9695e5ad576f8cdf7d14362d74f815f9e9335 +size 16822989 diff --git a/adapters/saved_llamainstinwild/checkpoint-108/rng_state_0.pth b/adapters/saved_llamainstinwild/checkpoint-108/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..245484245608fb960c96873a601b5b032cd482f0 --- /dev/null +++ b/adapters/saved_llamainstinwild/checkpoint-108/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:88d6137173c770096ae277fa30364b7ec3dc06e4ee69cab3c03d04e45918206c +size 14583 diff --git a/adapters/saved_llamainstinwild/checkpoint-108/rng_state_1.pth b/adapters/saved_llamainstinwild/checkpoint-108/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..31d1ba88c1ae5706c79003667a69b0e2dd3c1902 --- /dev/null +++ b/adapters/saved_llamainstinwild/checkpoint-108/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:55406a5d5cb31025ccc748198f4e6e29de67717cb6248ebddc03d96a4dc899f8 +size 14583 diff --git a/adapters/saved_llamainstinwild/checkpoint-108/rng_state_2.pth b/adapters/saved_llamainstinwild/checkpoint-108/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..bcff63c409ea5c8da056950ff9591c7bc3aa4127 --- /dev/null +++ b/adapters/saved_llamainstinwild/checkpoint-108/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8de6145c13fa52f26c49149d1e0dbe7afc615511dc97fe74a626a483439e6c90 +size 14583 diff --git a/adapters/saved_llamainstinwild/checkpoint-108/rng_state_3.pth b/adapters/saved_llamainstinwild/checkpoint-108/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..e7eb28eb0ad234bdc81acc684c4ea015729856b3 --- /dev/null +++ b/adapters/saved_llamainstinwild/checkpoint-108/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9fb2b6a03a1f8fb028de71f1bda227a6253616c8d5c8cc36a4afc4f5238b4c40 +size 14583 diff --git a/adapters/saved_llamainstinwild/checkpoint-108/rng_state_4.pth b/adapters/saved_llamainstinwild/checkpoint-108/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..29cc203e2ff3ab2e555753682dee1a230f890635 --- /dev/null +++ b/adapters/saved_llamainstinwild/checkpoint-108/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:66ab1f3c41374a72c2c973664a3a84365f2c7a5ef8b260c146c9560b8ed7fa6f +size 14583 diff --git a/adapters/saved_llamainstinwild/checkpoint-108/rng_state_5.pth b/adapters/saved_llamainstinwild/checkpoint-108/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..0bd72610261e1d5736ecff7e5bbf2e0bdd5fa9c1 --- /dev/null +++ b/adapters/saved_llamainstinwild/checkpoint-108/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3d0e70c9ad9c29dbf0cda590a9806ce36d76ab16510770312b519024125519bf +size 14583 diff --git a/adapters/saved_llamainstinwild/checkpoint-108/rng_state_6.pth b/adapters/saved_llamainstinwild/checkpoint-108/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..54e419397ea5f50e2ea48096903b3d4b76d50e0d --- /dev/null +++ b/adapters/saved_llamainstinwild/checkpoint-108/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ee6db1e46ed74b02bfbc4f22f853b9742f540d5fbf11d7116bb348aec50f4e4 +size 14583 diff --git a/adapters/saved_llamainstinwild/checkpoint-108/rng_state_7.pth b/adapters/saved_llamainstinwild/checkpoint-108/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..025c511aa7ff662412ef6d3c933fa5ea46f5eaba --- /dev/null +++ b/adapters/saved_llamainstinwild/checkpoint-108/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a5250184945211380d0d85ff2f2044a1eba8863f350e0ce320bfe371718ab0ac +size 14583 diff --git a/adapters/saved_llamainstinwild/checkpoint-108/scaler.pt b/adapters/saved_llamainstinwild/checkpoint-108/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..9f82ba5ca7e9606a18f0f07c4c6b35360933dfd4 --- /dev/null +++ b/adapters/saved_llamainstinwild/checkpoint-108/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:95ee313b0acb2a1ef10029766c944c75a09e2ff5936ff94af4ebd7dce05666e3 +size 557 diff --git a/adapters/saved_llamainstinwild/checkpoint-108/scheduler.pt b/adapters/saved_llamainstinwild/checkpoint-108/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..6ac4746347b3cbe8821970dd630ba54c6630233e --- /dev/null +++ b/adapters/saved_llamainstinwild/checkpoint-108/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8139b6527b8ce6d0f3002450d16ff656087559de2d58cd58563316e1db98fe24 +size 627 diff --git a/adapters/saved_llamainstinwild/checkpoint-108/trainer_state.json b/adapters/saved_llamainstinwild/checkpoint-108/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..ed8a5843f3d175e2e913d57f32c7189254ecce12 --- /dev/null +++ b/adapters/saved_llamainstinwild/checkpoint-108/trainer_state.json @@ -0,0 +1,262 @@ +{ + "best_metric": 1.1765486001968384, + "best_model_checkpoint": "/mnt/bn/qingyi-bn-lq/llama/saved_llamainstinwild/checkpoint-108", + "epoch": 2.202676864244742, + "global_step": 108, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.08, + "eval_loss": 1.39645254611969, + "eval_runtime": 37.7333, + "eval_samples_per_second": 53.004, + "eval_steps_per_second": 0.848, + "step": 4 + }, + { + "epoch": 0.16, + "eval_loss": 1.2984137535095215, + "eval_runtime": 37.7164, + "eval_samples_per_second": 53.027, + "eval_steps_per_second": 0.848, + "step": 8 + }, + { + "epoch": 0.24, + "eval_loss": 1.2540132999420166, + "eval_runtime": 37.6847, + "eval_samples_per_second": 53.072, + "eval_steps_per_second": 0.849, + "step": 12 + }, + { + "epoch": 0.33, + "eval_loss": 1.2342448234558105, + "eval_runtime": 37.6879, + "eval_samples_per_second": 53.067, + "eval_steps_per_second": 0.849, + "step": 16 + }, + { + "epoch": 0.41, + "learning_rate": 0.0002664335664335664, + "loss": 1.3125, + "step": 20 + }, + { + "epoch": 0.41, + "eval_loss": 1.2209277153015137, + "eval_runtime": 37.7168, + "eval_samples_per_second": 53.027, + "eval_steps_per_second": 0.848, + "step": 20 + }, + { + "epoch": 0.49, + "eval_loss": 1.2117934226989746, + "eval_runtime": 37.6897, + "eval_samples_per_second": 53.065, + "eval_steps_per_second": 0.849, + "step": 24 + }, + { + "epoch": 0.57, + "eval_loss": 1.2057584524154663, + "eval_runtime": 37.7691, + "eval_samples_per_second": 52.953, + "eval_steps_per_second": 0.847, + "step": 28 + }, + { + "epoch": 0.65, + "eval_loss": 1.2009178400039673, + "eval_runtime": 37.7026, + "eval_samples_per_second": 53.047, + "eval_steps_per_second": 0.849, + "step": 32 + }, + { + "epoch": 0.73, + "eval_loss": 1.197250485420227, + "eval_runtime": 37.6765, + "eval_samples_per_second": 53.084, + "eval_steps_per_second": 0.849, + "step": 36 + }, + { + "epoch": 0.82, + "learning_rate": 0.00022447552447552445, + "loss": 1.2095, + "step": 40 + }, + { + "epoch": 0.82, + "eval_loss": 1.1943359375, + "eval_runtime": 37.6881, + "eval_samples_per_second": 53.067, + "eval_steps_per_second": 0.849, + "step": 40 + }, + { + "epoch": 0.9, + "eval_loss": 1.1920316219329834, + "eval_runtime": 37.694, + "eval_samples_per_second": 53.059, + "eval_steps_per_second": 0.849, + "step": 44 + }, + { + "epoch": 0.98, + "eval_loss": 1.1898770332336426, + "eval_runtime": 37.7089, + "eval_samples_per_second": 53.038, + "eval_steps_per_second": 0.849, + "step": 48 + }, + { + "epoch": 1.06, + "eval_loss": 1.1879488229751587, + "eval_runtime": 37.6975, + "eval_samples_per_second": 53.054, + "eval_steps_per_second": 0.849, + "step": 52 + }, + { + "epoch": 1.14, + "eval_loss": 1.1864111423492432, + "eval_runtime": 37.7188, + "eval_samples_per_second": 53.024, + "eval_steps_per_second": 0.848, + "step": 56 + }, + { + "epoch": 1.22, + "learning_rate": 0.00018251748251748253, + "loss": 1.188, + "step": 60 + }, + { + "epoch": 1.22, + "eval_loss": 1.1849411725997925, + "eval_runtime": 37.7686, + "eval_samples_per_second": 52.954, + "eval_steps_per_second": 0.847, + "step": 60 + }, + { + "epoch": 1.31, + "eval_loss": 1.1838239431381226, + "eval_runtime": 37.6945, + "eval_samples_per_second": 53.058, + "eval_steps_per_second": 0.849, + "step": 64 + }, + { + "epoch": 1.39, + "eval_loss": 1.182841181755066, + "eval_runtime": 37.6624, + "eval_samples_per_second": 53.103, + "eval_steps_per_second": 0.85, + "step": 68 + }, + { + "epoch": 1.47, + "eval_loss": 1.1818122863769531, + "eval_runtime": 37.7233, + "eval_samples_per_second": 53.018, + "eval_steps_per_second": 0.848, + "step": 72 + }, + { + "epoch": 1.55, + "eval_loss": 1.1808911561965942, + "eval_runtime": 37.7038, + "eval_samples_per_second": 53.045, + "eval_steps_per_second": 0.849, + "step": 76 + }, + { + "epoch": 1.63, + "learning_rate": 0.00014055944055944055, + "loss": 1.1876, + "step": 80 + }, + { + "epoch": 1.63, + "eval_loss": 1.1801403760910034, + "eval_runtime": 37.7277, + "eval_samples_per_second": 53.011, + "eval_steps_per_second": 0.848, + "step": 80 + }, + { + "epoch": 1.71, + "eval_loss": 1.1793811321258545, + "eval_runtime": 37.6641, + "eval_samples_per_second": 53.101, + "eval_steps_per_second": 0.85, + "step": 84 + }, + { + "epoch": 1.79, + "eval_loss": 1.1788283586502075, + "eval_runtime": 37.7097, + "eval_samples_per_second": 53.037, + "eval_steps_per_second": 0.849, + "step": 88 + }, + { + "epoch": 1.88, + "eval_loss": 1.1783016920089722, + "eval_runtime": 37.7104, + "eval_samples_per_second": 53.036, + "eval_steps_per_second": 0.849, + "step": 92 + }, + { + "epoch": 1.96, + "eval_loss": 1.177699327468872, + "eval_runtime": 37.6884, + "eval_samples_per_second": 53.067, + "eval_steps_per_second": 0.849, + "step": 96 + }, + { + "epoch": 2.04, + "learning_rate": 9.860139860139858e-05, + "loss": 1.182, + "step": 100 + }, + { + "epoch": 2.04, + "eval_loss": 1.1773412227630615, + "eval_runtime": 37.7134, + "eval_samples_per_second": 53.032, + "eval_steps_per_second": 0.849, + "step": 100 + }, + { + "epoch": 2.12, + "eval_loss": 1.176965594291687, + "eval_runtime": 37.6735, + "eval_samples_per_second": 53.088, + "eval_steps_per_second": 0.849, + "step": 104 + }, + { + "epoch": 2.2, + "eval_loss": 1.1765486001968384, + "eval_runtime": 37.7771, + "eval_samples_per_second": 52.942, + "eval_steps_per_second": 0.847, + "step": 108 + } + ], + "max_steps": 147, + "num_train_epochs": 3, + "total_flos": 1.2579484186601062e+18, + "trial_name": null, + "trial_params": null +} diff --git a/adapters/saved_llamainstinwild/checkpoint-108/training_args.bin b/adapters/saved_llamainstinwild/checkpoint-108/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..34b95391e17e9d44eb2a04177250246c85716097 --- /dev/null +++ b/adapters/saved_llamainstinwild/checkpoint-108/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:13fbb17fb4210eddefede6fc992fb1cd0ac7bb1d7d94253bc089086b39aba316 +size 3643 diff --git a/adapters/saved_llamainstinwild/checkpoint-112/optimizer.pt b/adapters/saved_llamainstinwild/checkpoint-112/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..6f153613fae2e093a9a3f2b8d87dc5875a78d853 --- /dev/null +++ b/adapters/saved_llamainstinwild/checkpoint-112/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f77c7ceb09c931cfdb4f9511cdc7d32b5913440b58ac9a0218cf2213af92daa0 +size 33629765 diff --git a/adapters/saved_llamainstinwild/checkpoint-112/pytorch_model.bin b/adapters/saved_llamainstinwild/checkpoint-112/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..a59f79debac04cd494240070cdee7c779554de79 --- /dev/null +++ b/adapters/saved_llamainstinwild/checkpoint-112/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b370eb5f382e0123fa2fb055700303e49017e41b11d935da598829c0c3be91e7 +size 16822989 diff --git a/adapters/saved_llamainstinwild/checkpoint-112/rng_state_0.pth b/adapters/saved_llamainstinwild/checkpoint-112/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..bdf9bcb9deae665c2cd5627cb9c8a7c77f50d0e7 --- /dev/null +++ b/adapters/saved_llamainstinwild/checkpoint-112/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:66f643860db806940af8fb5cda6a90b0d6415d89a8243d45af123d6767378493 +size 14583 diff --git a/adapters/saved_llamainstinwild/checkpoint-112/rng_state_1.pth b/adapters/saved_llamainstinwild/checkpoint-112/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..82e5cd7345748fbc10ea917ef7dbec9f20cb87cf --- /dev/null +++ b/adapters/saved_llamainstinwild/checkpoint-112/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0250f63a786777e2f9b6f48b91ca9e1a5a5b7e25af3d887a6af8b029e1346e7d +size 14583 diff --git a/adapters/saved_llamainstinwild/checkpoint-112/rng_state_2.pth b/adapters/saved_llamainstinwild/checkpoint-112/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..ab9df8360f1865fdfdb681c168bbf8b4311642d4 --- /dev/null +++ b/adapters/saved_llamainstinwild/checkpoint-112/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1b70fb09f0c24dc5cc9c2c69940ad4e8c2745a834c11767719a6a1747f4ec044 +size 14583 diff --git a/adapters/saved_llamainstinwild/checkpoint-112/rng_state_3.pth b/adapters/saved_llamainstinwild/checkpoint-112/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..297880b2de43efd15719aa03edb7a19c91681595 --- /dev/null +++ b/adapters/saved_llamainstinwild/checkpoint-112/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bb18773cdcc97f277b51814da47a8ffd7c7eaa45ed87cea2901e9d52e3dc10dd +size 14583 diff --git a/adapters/saved_llamainstinwild/checkpoint-112/rng_state_4.pth b/adapters/saved_llamainstinwild/checkpoint-112/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..e60e22bd56ba28dd6710e4daf92931d8a349d889 --- /dev/null +++ b/adapters/saved_llamainstinwild/checkpoint-112/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5234d6326d3cc0b9c8a2d94a82385e1cb383a1f0ae036c9b052dfaae671525e9 +size 14583 diff --git a/adapters/saved_llamainstinwild/checkpoint-112/rng_state_5.pth b/adapters/saved_llamainstinwild/checkpoint-112/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..c9ab0f1ea1709da68c1e162e1ef1e9c41640f40d --- /dev/null +++ b/adapters/saved_llamainstinwild/checkpoint-112/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7be1ae1b776cf99314d4c5e93bcfdabfe3d4543f1f1e6f4c109a82a84080cfaf +size 14583 diff --git a/adapters/saved_llamainstinwild/checkpoint-112/rng_state_6.pth b/adapters/saved_llamainstinwild/checkpoint-112/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..048a2701cdcb11d0ad284075595e040ea313a1f5 --- /dev/null +++ b/adapters/saved_llamainstinwild/checkpoint-112/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:32bb5cdb19da96d3c89fdac54ce40e4840c75acb3a9b343ed63a9acf8fa8815a +size 14583 diff --git a/adapters/saved_llamainstinwild/checkpoint-112/rng_state_7.pth b/adapters/saved_llamainstinwild/checkpoint-112/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..700a4f51165ba8455d7654b254df3c724974da96 --- /dev/null +++ b/adapters/saved_llamainstinwild/checkpoint-112/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1b00035bc5c0ac4a4a033c658c6447632bce6c7d23e478b61d229f80a79385a3 +size 14583 diff --git a/adapters/saved_llamainstinwild/checkpoint-112/scaler.pt b/adapters/saved_llamainstinwild/checkpoint-112/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..77174f6d26aded41a49a9bf6166176fbb3c28b5a --- /dev/null +++ b/adapters/saved_llamainstinwild/checkpoint-112/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e6b240c32ef702ac523b294b979864a897ae8a7efa9790900a0cc2b920ea8dd5 +size 557 diff --git a/adapters/saved_llamainstinwild/checkpoint-112/scheduler.pt b/adapters/saved_llamainstinwild/checkpoint-112/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..123c4edf019a94aa1a320493f18364cf17cebfae --- /dev/null +++ b/adapters/saved_llamainstinwild/checkpoint-112/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a9afb28cf9e3c43bdf8f25bb9781725f5400ae87f51131e47006025a089acbab +size 627 diff --git a/adapters/saved_llamainstinwild/checkpoint-112/trainer_state.json b/adapters/saved_llamainstinwild/checkpoint-112/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..162d6d2f107d6b55fe99f9ffbcd54dd3d7494e9b --- /dev/null +++ b/adapters/saved_llamainstinwild/checkpoint-112/trainer_state.json @@ -0,0 +1,270 @@ +{ + "best_metric": 1.1761963367462158, + "best_model_checkpoint": "/mnt/bn/qingyi-bn-lq/llama/saved_llamainstinwild/checkpoint-112", + "epoch": 2.284257488846399, + "global_step": 112, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.08, + "eval_loss": 1.39645254611969, + "eval_runtime": 37.7333, + "eval_samples_per_second": 53.004, + "eval_steps_per_second": 0.848, + "step": 4 + }, + { + "epoch": 0.16, + "eval_loss": 1.2984137535095215, + "eval_runtime": 37.7164, + "eval_samples_per_second": 53.027, + "eval_steps_per_second": 0.848, + "step": 8 + }, + { + "epoch": 0.24, + "eval_loss": 1.2540132999420166, + "eval_runtime": 37.6847, + "eval_samples_per_second": 53.072, + "eval_steps_per_second": 0.849, + "step": 12 + }, + { + "epoch": 0.33, + "eval_loss": 1.2342448234558105, + "eval_runtime": 37.6879, + "eval_samples_per_second": 53.067, + "eval_steps_per_second": 0.849, + "step": 16 + }, + { + "epoch": 0.41, + "learning_rate": 0.0002664335664335664, + "loss": 1.3125, + "step": 20 + }, + { + "epoch": 0.41, + "eval_loss": 1.2209277153015137, + "eval_runtime": 37.7168, + "eval_samples_per_second": 53.027, + "eval_steps_per_second": 0.848, + "step": 20 + }, + { + "epoch": 0.49, + "eval_loss": 1.2117934226989746, + "eval_runtime": 37.6897, + "eval_samples_per_second": 53.065, + "eval_steps_per_second": 0.849, + "step": 24 + }, + { + "epoch": 0.57, + "eval_loss": 1.2057584524154663, + "eval_runtime": 37.7691, + "eval_samples_per_second": 52.953, + "eval_steps_per_second": 0.847, + "step": 28 + }, + { + "epoch": 0.65, + "eval_loss": 1.2009178400039673, + "eval_runtime": 37.7026, + "eval_samples_per_second": 53.047, + "eval_steps_per_second": 0.849, + "step": 32 + }, + { + "epoch": 0.73, + "eval_loss": 1.197250485420227, + "eval_runtime": 37.6765, + "eval_samples_per_second": 53.084, + "eval_steps_per_second": 0.849, + "step": 36 + }, + { + "epoch": 0.82, + "learning_rate": 0.00022447552447552445, + "loss": 1.2095, + "step": 40 + }, + { + "epoch": 0.82, + "eval_loss": 1.1943359375, + "eval_runtime": 37.6881, + "eval_samples_per_second": 53.067, + "eval_steps_per_second": 0.849, + "step": 40 + }, + { + "epoch": 0.9, + "eval_loss": 1.1920316219329834, + "eval_runtime": 37.694, + "eval_samples_per_second": 53.059, + "eval_steps_per_second": 0.849, + "step": 44 + }, + { + "epoch": 0.98, + "eval_loss": 1.1898770332336426, + "eval_runtime": 37.7089, + "eval_samples_per_second": 53.038, + "eval_steps_per_second": 0.849, + "step": 48 + }, + { + "epoch": 1.06, + "eval_loss": 1.1879488229751587, + "eval_runtime": 37.6975, + "eval_samples_per_second": 53.054, + "eval_steps_per_second": 0.849, + "step": 52 + }, + { + "epoch": 1.14, + "eval_loss": 1.1864111423492432, + "eval_runtime": 37.7188, + "eval_samples_per_second": 53.024, + "eval_steps_per_second": 0.848, + "step": 56 + }, + { + "epoch": 1.22, + "learning_rate": 0.00018251748251748253, + "loss": 1.188, + "step": 60 + }, + { + "epoch": 1.22, + "eval_loss": 1.1849411725997925, + "eval_runtime": 37.7686, + "eval_samples_per_second": 52.954, + "eval_steps_per_second": 0.847, + "step": 60 + }, + { + "epoch": 1.31, + "eval_loss": 1.1838239431381226, + "eval_runtime": 37.6945, + "eval_samples_per_second": 53.058, + "eval_steps_per_second": 0.849, + "step": 64 + }, + { + "epoch": 1.39, + "eval_loss": 1.182841181755066, + "eval_runtime": 37.6624, + "eval_samples_per_second": 53.103, + "eval_steps_per_second": 0.85, + "step": 68 + }, + { + "epoch": 1.47, + "eval_loss": 1.1818122863769531, + "eval_runtime": 37.7233, + "eval_samples_per_second": 53.018, + "eval_steps_per_second": 0.848, + "step": 72 + }, + { + "epoch": 1.55, + "eval_loss": 1.1808911561965942, + "eval_runtime": 37.7038, + "eval_samples_per_second": 53.045, + "eval_steps_per_second": 0.849, + "step": 76 + }, + { + "epoch": 1.63, + "learning_rate": 0.00014055944055944055, + "loss": 1.1876, + "step": 80 + }, + { + "epoch": 1.63, + "eval_loss": 1.1801403760910034, + "eval_runtime": 37.7277, + "eval_samples_per_second": 53.011, + "eval_steps_per_second": 0.848, + "step": 80 + }, + { + "epoch": 1.71, + "eval_loss": 1.1793811321258545, + "eval_runtime": 37.6641, + "eval_samples_per_second": 53.101, + "eval_steps_per_second": 0.85, + "step": 84 + }, + { + "epoch": 1.79, + "eval_loss": 1.1788283586502075, + "eval_runtime": 37.7097, + "eval_samples_per_second": 53.037, + "eval_steps_per_second": 0.849, + "step": 88 + }, + { + "epoch": 1.88, + "eval_loss": 1.1783016920089722, + "eval_runtime": 37.7104, + "eval_samples_per_second": 53.036, + "eval_steps_per_second": 0.849, + "step": 92 + }, + { + "epoch": 1.96, + "eval_loss": 1.177699327468872, + "eval_runtime": 37.6884, + "eval_samples_per_second": 53.067, + "eval_steps_per_second": 0.849, + "step": 96 + }, + { + "epoch": 2.04, + "learning_rate": 9.860139860139858e-05, + "loss": 1.182, + "step": 100 + }, + { + "epoch": 2.04, + "eval_loss": 1.1773412227630615, + "eval_runtime": 37.7134, + "eval_samples_per_second": 53.032, + "eval_steps_per_second": 0.849, + "step": 100 + }, + { + "epoch": 2.12, + "eval_loss": 1.176965594291687, + "eval_runtime": 37.6735, + "eval_samples_per_second": 53.088, + "eval_steps_per_second": 0.849, + "step": 104 + }, + { + "epoch": 2.2, + "eval_loss": 1.1765486001968384, + "eval_runtime": 37.7771, + "eval_samples_per_second": 52.942, + "eval_steps_per_second": 0.847, + "step": 108 + }, + { + "epoch": 2.28, + "eval_loss": 1.1761963367462158, + "eval_runtime": 37.7211, + "eval_samples_per_second": 53.021, + "eval_steps_per_second": 0.848, + "step": 112 + } + ], + "max_steps": 147, + "num_train_epochs": 3, + "total_flos": 1.3046940008961802e+18, + "trial_name": null, + "trial_params": null +} diff --git a/adapters/saved_llamainstinwild/checkpoint-112/training_args.bin b/adapters/saved_llamainstinwild/checkpoint-112/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..34b95391e17e9d44eb2a04177250246c85716097 --- /dev/null +++ b/adapters/saved_llamainstinwild/checkpoint-112/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:13fbb17fb4210eddefede6fc992fb1cd0ac7bb1d7d94253bc089086b39aba316 +size 3643 diff --git a/adapters/saved_llamainstinwild/checkpoint-116/optimizer.pt b/adapters/saved_llamainstinwild/checkpoint-116/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..6ad6b861611d56e87257dd1b1500b43ba3fc5814 --- /dev/null +++ b/adapters/saved_llamainstinwild/checkpoint-116/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7d2dceca83560110055b74f95d353201ff464c5f3332b882cb3e43dd54d1acdf +size 33629765 diff --git a/adapters/saved_llamainstinwild/checkpoint-116/pytorch_model.bin b/adapters/saved_llamainstinwild/checkpoint-116/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..e86451f7238588585ec06c1c50c8f266b9a4200b --- /dev/null +++ b/adapters/saved_llamainstinwild/checkpoint-116/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f952e670670590810ed6fcadb433805887bc941cfa1739de5c124eb3f14f163e +size 16822989 diff --git a/adapters/saved_llamainstinwild/checkpoint-116/rng_state_0.pth b/adapters/saved_llamainstinwild/checkpoint-116/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..0bb69fd1a8983476c0901428ffd6a630f53f5f68 --- /dev/null +++ b/adapters/saved_llamainstinwild/checkpoint-116/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:67a4f5270fe2dab0040560d0799188082b1f5d25c6d60c03380d64571b1f8f32 +size 14583 diff --git a/adapters/saved_llamainstinwild/checkpoint-116/rng_state_1.pth b/adapters/saved_llamainstinwild/checkpoint-116/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..59e159eb1a317af0ab0a3458a1dda830438b5f7c --- /dev/null +++ b/adapters/saved_llamainstinwild/checkpoint-116/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b7860006511b0bc8890f934e042ece1966e510b8cfd03b21f69bd3af3cd32484 +size 14583 diff --git a/adapters/saved_llamainstinwild/checkpoint-116/rng_state_2.pth b/adapters/saved_llamainstinwild/checkpoint-116/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..c10014cad11ad6838f31d2539d6053054c9ab739 --- /dev/null +++ b/adapters/saved_llamainstinwild/checkpoint-116/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:424fe210e3a3b7ecdc3c52033214bdda90ed2c31ebbd728a482f0e5d29c67666 +size 14583 diff --git a/adapters/saved_llamainstinwild/checkpoint-116/rng_state_3.pth b/adapters/saved_llamainstinwild/checkpoint-116/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..b61afbd2a40fa018f5444797a0352ea32a8c88fd --- /dev/null +++ b/adapters/saved_llamainstinwild/checkpoint-116/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:25a55486bd6498acc9b1aeab8678cdef09e3a9585dd3059f0619a0445786a2d0 +size 14583 diff --git a/adapters/saved_llamainstinwild/checkpoint-116/rng_state_4.pth b/adapters/saved_llamainstinwild/checkpoint-116/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..de599c292e4809feb4b43e454b767f530b8eaf15 --- /dev/null +++ b/adapters/saved_llamainstinwild/checkpoint-116/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b44e199beb1e0707db8f86b92313d1f97a64372abb3956dc7546fe792f89fb90 +size 14583 diff --git a/adapters/saved_llamainstinwild/checkpoint-116/rng_state_5.pth b/adapters/saved_llamainstinwild/checkpoint-116/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..0bd0397831ade1484cef606fb4edbe07212407c8 --- /dev/null +++ b/adapters/saved_llamainstinwild/checkpoint-116/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b430e9e88e479416dbd22b4d96f59e0bd32c8d22bc866c6fd553deb1fbf029b4 +size 14583 diff --git a/adapters/saved_llamainstinwild/checkpoint-116/rng_state_6.pth b/adapters/saved_llamainstinwild/checkpoint-116/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..b95075716e6d439215917305c0a7dcc9c1b2be13 --- /dev/null +++ b/adapters/saved_llamainstinwild/checkpoint-116/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de034e76de8838bdf147429b9632864fcec4fca7c31aa8655fd3c18b3c38f539 +size 14583 diff --git a/adapters/saved_llamainstinwild/checkpoint-116/rng_state_7.pth b/adapters/saved_llamainstinwild/checkpoint-116/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..8df90a99a98d5100bb49d34d9078d8513253b774 --- /dev/null +++ b/adapters/saved_llamainstinwild/checkpoint-116/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:491af29e30605058d20ef6124a8b58cdee20460d31f7ca52bba69b7999da9bf6 +size 14583 diff --git a/adapters/saved_llamainstinwild/checkpoint-116/scaler.pt b/adapters/saved_llamainstinwild/checkpoint-116/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..4a5f2f5362ebd9e0eb8132821de95a8763b78e5f --- /dev/null +++ b/adapters/saved_llamainstinwild/checkpoint-116/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3257036012d1d9d973faf15eb497c60f2dec03f49d7fac6c8f9fd2a5b9993e9a +size 557 diff --git a/adapters/saved_llamainstinwild/checkpoint-116/scheduler.pt b/adapters/saved_llamainstinwild/checkpoint-116/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..10ea098a87807811e3ea3f3b69aae5ecf9854ae5 --- /dev/null +++ b/adapters/saved_llamainstinwild/checkpoint-116/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b2e3fa7c91c3a09ffee09058737eadf04d68bae75c25698a255979291d970811 +size 627 diff --git a/adapters/saved_llamainstinwild/checkpoint-116/trainer_state.json b/adapters/saved_llamainstinwild/checkpoint-116/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..edef74fec5a87bc9f6eb0ba6ade8277da04142d3 --- /dev/null +++ b/adapters/saved_llamainstinwild/checkpoint-116/trainer_state.json @@ -0,0 +1,278 @@ +{ + "best_metric": 1.1757991313934326, + "best_model_checkpoint": "/mnt/bn/qingyi-bn-lq/llama/saved_llamainstinwild/checkpoint-116", + "epoch": 2.365838113448056, + "global_step": 116, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.08, + "eval_loss": 1.39645254611969, + "eval_runtime": 37.7333, + "eval_samples_per_second": 53.004, + "eval_steps_per_second": 0.848, + "step": 4 + }, + { + "epoch": 0.16, + "eval_loss": 1.2984137535095215, + "eval_runtime": 37.7164, + "eval_samples_per_second": 53.027, + "eval_steps_per_second": 0.848, + "step": 8 + }, + { + "epoch": 0.24, + "eval_loss": 1.2540132999420166, + "eval_runtime": 37.6847, + "eval_samples_per_second": 53.072, + "eval_steps_per_second": 0.849, + "step": 12 + }, + { + "epoch": 0.33, + "eval_loss": 1.2342448234558105, + "eval_runtime": 37.6879, + "eval_samples_per_second": 53.067, + "eval_steps_per_second": 0.849, + "step": 16 + }, + { + "epoch": 0.41, + "learning_rate": 0.0002664335664335664, + "loss": 1.3125, + "step": 20 + }, + { + "epoch": 0.41, + "eval_loss": 1.2209277153015137, + "eval_runtime": 37.7168, + "eval_samples_per_second": 53.027, + "eval_steps_per_second": 0.848, + "step": 20 + }, + { + "epoch": 0.49, + "eval_loss": 1.2117934226989746, + "eval_runtime": 37.6897, + "eval_samples_per_second": 53.065, + "eval_steps_per_second": 0.849, + "step": 24 + }, + { + "epoch": 0.57, + "eval_loss": 1.2057584524154663, + "eval_runtime": 37.7691, + "eval_samples_per_second": 52.953, + "eval_steps_per_second": 0.847, + "step": 28 + }, + { + "epoch": 0.65, + "eval_loss": 1.2009178400039673, + "eval_runtime": 37.7026, + "eval_samples_per_second": 53.047, + "eval_steps_per_second": 0.849, + "step": 32 + }, + { + "epoch": 0.73, + "eval_loss": 1.197250485420227, + "eval_runtime": 37.6765, + "eval_samples_per_second": 53.084, + "eval_steps_per_second": 0.849, + "step": 36 + }, + { + "epoch": 0.82, + "learning_rate": 0.00022447552447552445, + "loss": 1.2095, + "step": 40 + }, + { + "epoch": 0.82, + "eval_loss": 1.1943359375, + "eval_runtime": 37.6881, + "eval_samples_per_second": 53.067, + "eval_steps_per_second": 0.849, + "step": 40 + }, + { + "epoch": 0.9, + "eval_loss": 1.1920316219329834, + "eval_runtime": 37.694, + "eval_samples_per_second": 53.059, + "eval_steps_per_second": 0.849, + "step": 44 + }, + { + "epoch": 0.98, + "eval_loss": 1.1898770332336426, + "eval_runtime": 37.7089, + "eval_samples_per_second": 53.038, + "eval_steps_per_second": 0.849, + "step": 48 + }, + { + "epoch": 1.06, + "eval_loss": 1.1879488229751587, + "eval_runtime": 37.6975, + "eval_samples_per_second": 53.054, + "eval_steps_per_second": 0.849, + "step": 52 + }, + { + "epoch": 1.14, + "eval_loss": 1.1864111423492432, + "eval_runtime": 37.7188, + "eval_samples_per_second": 53.024, + "eval_steps_per_second": 0.848, + "step": 56 + }, + { + "epoch": 1.22, + "learning_rate": 0.00018251748251748253, + "loss": 1.188, + "step": 60 + }, + { + "epoch": 1.22, + "eval_loss": 1.1849411725997925, + "eval_runtime": 37.7686, + "eval_samples_per_second": 52.954, + "eval_steps_per_second": 0.847, + "step": 60 + }, + { + "epoch": 1.31, + "eval_loss": 1.1838239431381226, + "eval_runtime": 37.6945, + "eval_samples_per_second": 53.058, + "eval_steps_per_second": 0.849, + "step": 64 + }, + { + "epoch": 1.39, + "eval_loss": 1.182841181755066, + "eval_runtime": 37.6624, + "eval_samples_per_second": 53.103, + "eval_steps_per_second": 0.85, + "step": 68 + }, + { + "epoch": 1.47, + "eval_loss": 1.1818122863769531, + "eval_runtime": 37.7233, + "eval_samples_per_second": 53.018, + "eval_steps_per_second": 0.848, + "step": 72 + }, + { + "epoch": 1.55, + "eval_loss": 1.1808911561965942, + "eval_runtime": 37.7038, + "eval_samples_per_second": 53.045, + "eval_steps_per_second": 0.849, + "step": 76 + }, + { + "epoch": 1.63, + "learning_rate": 0.00014055944055944055, + "loss": 1.1876, + "step": 80 + }, + { + "epoch": 1.63, + "eval_loss": 1.1801403760910034, + "eval_runtime": 37.7277, + "eval_samples_per_second": 53.011, + "eval_steps_per_second": 0.848, + "step": 80 + }, + { + "epoch": 1.71, + "eval_loss": 1.1793811321258545, + "eval_runtime": 37.6641, + "eval_samples_per_second": 53.101, + "eval_steps_per_second": 0.85, + "step": 84 + }, + { + "epoch": 1.79, + "eval_loss": 1.1788283586502075, + "eval_runtime": 37.7097, + "eval_samples_per_second": 53.037, + "eval_steps_per_second": 0.849, + "step": 88 + }, + { + "epoch": 1.88, + "eval_loss": 1.1783016920089722, + "eval_runtime": 37.7104, + "eval_samples_per_second": 53.036, + "eval_steps_per_second": 0.849, + "step": 92 + }, + { + "epoch": 1.96, + "eval_loss": 1.177699327468872, + "eval_runtime": 37.6884, + "eval_samples_per_second": 53.067, + "eval_steps_per_second": 0.849, + "step": 96 + }, + { + "epoch": 2.04, + "learning_rate": 9.860139860139858e-05, + "loss": 1.182, + "step": 100 + }, + { + "epoch": 2.04, + "eval_loss": 1.1773412227630615, + "eval_runtime": 37.7134, + "eval_samples_per_second": 53.032, + "eval_steps_per_second": 0.849, + "step": 100 + }, + { + "epoch": 2.12, + "eval_loss": 1.176965594291687, + "eval_runtime": 37.6735, + "eval_samples_per_second": 53.088, + "eval_steps_per_second": 0.849, + "step": 104 + }, + { + "epoch": 2.2, + "eval_loss": 1.1765486001968384, + "eval_runtime": 37.7771, + "eval_samples_per_second": 52.942, + "eval_steps_per_second": 0.847, + "step": 108 + }, + { + "epoch": 2.28, + "eval_loss": 1.1761963367462158, + "eval_runtime": 37.7211, + "eval_samples_per_second": 53.021, + "eval_steps_per_second": 0.848, + "step": 112 + }, + { + "epoch": 2.37, + "eval_loss": 1.1757991313934326, + "eval_runtime": 37.7438, + "eval_samples_per_second": 52.989, + "eval_steps_per_second": 0.848, + "step": 116 + } + ], + "max_steps": 147, + "num_train_epochs": 3, + "total_flos": 1.3520679454375936e+18, + "trial_name": null, + "trial_params": null +} diff --git a/adapters/saved_llamainstinwild/checkpoint-116/training_args.bin b/adapters/saved_llamainstinwild/checkpoint-116/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..34b95391e17e9d44eb2a04177250246c85716097 --- /dev/null +++ b/adapters/saved_llamainstinwild/checkpoint-116/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:13fbb17fb4210eddefede6fc992fb1cd0ac7bb1d7d94253bc089086b39aba316 +size 3643 diff --git a/adapters/saved_llamainstinwild/checkpoint-120/optimizer.pt b/adapters/saved_llamainstinwild/checkpoint-120/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..0f06cfe1afd762ce9d3cb5cdaa234cbc196703b4 --- /dev/null +++ b/adapters/saved_llamainstinwild/checkpoint-120/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f4ba20661159ccc12b0cfca1bf867efa5d73cd7903c1a489e25f1bc46e69e0df +size 33629765 diff --git a/adapters/saved_llamainstinwild/checkpoint-120/pytorch_model.bin b/adapters/saved_llamainstinwild/checkpoint-120/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..7ca04b77b7adf5a9ac01ea5e090dcaf30496e0a5 --- /dev/null +++ b/adapters/saved_llamainstinwild/checkpoint-120/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d6b571d00d2b6ceeb8447461a6a9d3078c3fe1a794b4f99d0f252bdde5e2df3f +size 16822989 diff --git a/adapters/saved_llamainstinwild/checkpoint-120/rng_state_0.pth b/adapters/saved_llamainstinwild/checkpoint-120/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..8d4955c094aa8565ebaf5b6f90c41d2b8d0dfb82 --- /dev/null +++ b/adapters/saved_llamainstinwild/checkpoint-120/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:483b8c3b253c03c4cabff7f19e703baca43488804e35c827967be0429ec4a4e5 +size 14583 diff --git a/adapters/saved_llamainstinwild/checkpoint-120/rng_state_1.pth b/adapters/saved_llamainstinwild/checkpoint-120/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..0abe2a58bf357b1be9351c200326b2ad4f4d50a6 --- /dev/null +++ b/adapters/saved_llamainstinwild/checkpoint-120/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:84d39a0b1f9b470d2dc59d0caafaf5132958862c22c9d8abae5bd2b69e6aa4e1 +size 14583 diff --git a/adapters/saved_llamainstinwild/checkpoint-120/rng_state_2.pth b/adapters/saved_llamainstinwild/checkpoint-120/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..5517b2b9c8c0bf799109bc52c0202ad8dec1b8b0 --- /dev/null +++ b/adapters/saved_llamainstinwild/checkpoint-120/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3df193737e8d1bdefd0282feb56ed1fc5eefb0f43e3d2a3d2627d022a96e4464 +size 14583 diff --git a/adapters/saved_llamainstinwild/checkpoint-120/rng_state_3.pth b/adapters/saved_llamainstinwild/checkpoint-120/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..7d00dd08e3d88e2a13f62345f984c1395e068510 --- /dev/null +++ b/adapters/saved_llamainstinwild/checkpoint-120/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8f56cfba4592ba69085d62e884e4b6c3a1dece14e87a9fedcfb3b742e1e674ec +size 14583 diff --git a/adapters/saved_llamainstinwild/checkpoint-120/rng_state_4.pth b/adapters/saved_llamainstinwild/checkpoint-120/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..99d593eeb788481046aa7884cad08d7151276d6d --- /dev/null +++ b/adapters/saved_llamainstinwild/checkpoint-120/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c82521dd3cb396378fd6576da398263cadaf893aebe8a78d0fcd88efbbc48b77 +size 14583 diff --git a/adapters/saved_llamainstinwild/checkpoint-120/rng_state_5.pth b/adapters/saved_llamainstinwild/checkpoint-120/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..bfdeca9a4619a9767034195866efcd0162d13aeb --- /dev/null +++ b/adapters/saved_llamainstinwild/checkpoint-120/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ea2607364f4995902df7f2a55d0dc99ed26bd909248760e72b62bc2b1f300f78 +size 14583 diff --git a/adapters/saved_llamainstinwild/checkpoint-120/rng_state_6.pth b/adapters/saved_llamainstinwild/checkpoint-120/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..9792e6f8f9f38c44c977e2b0364e81fd0d7c0a3d --- /dev/null +++ b/adapters/saved_llamainstinwild/checkpoint-120/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eecd64a5dfd5a55e1028b1ed7d585dd4569595514a47074ac699d30328c0c99d +size 14583 diff --git a/adapters/saved_llamainstinwild/checkpoint-120/rng_state_7.pth b/adapters/saved_llamainstinwild/checkpoint-120/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..756276dc668e0c10086bb5967444750f7bc4fdf6 --- /dev/null +++ b/adapters/saved_llamainstinwild/checkpoint-120/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eda779a9ab905e6cf0438dc238ea919b8a2bb2338b052a4f7f0900cb80e169fb +size 14583 diff --git a/adapters/saved_llamainstinwild/checkpoint-120/scaler.pt b/adapters/saved_llamainstinwild/checkpoint-120/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..1a50e08815899670db763c19bce5e6852e1f11d4 --- /dev/null +++ b/adapters/saved_llamainstinwild/checkpoint-120/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8e09ff957dcf3f9fb1df66ef09885175387c825e4e3df09b6b2d36d79c3de1ea +size 557 diff --git a/adapters/saved_llamainstinwild/checkpoint-120/scheduler.pt b/adapters/saved_llamainstinwild/checkpoint-120/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..88af0ed4baf5187c5ac47176056e8cee9bd75155 --- /dev/null +++ b/adapters/saved_llamainstinwild/checkpoint-120/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cfa897647672ba1b9c6a7f2290aef209a42e43811d41c8cc5118e051737793b0 +size 627 diff --git a/adapters/saved_llamainstinwild/checkpoint-120/trainer_state.json b/adapters/saved_llamainstinwild/checkpoint-120/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..26552dfb127332e063fc6c7b38c85c10a0f2eeef --- /dev/null +++ b/adapters/saved_llamainstinwild/checkpoint-120/trainer_state.json @@ -0,0 +1,292 @@ +{ + "best_metric": 1.1754933595657349, + "best_model_checkpoint": "/mnt/bn/qingyi-bn-lq/llama/saved_llamainstinwild/checkpoint-120", + "epoch": 2.447418738049713, + "global_step": 120, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.08, + "eval_loss": 1.39645254611969, + "eval_runtime": 37.7333, + "eval_samples_per_second": 53.004, + "eval_steps_per_second": 0.848, + "step": 4 + }, + { + "epoch": 0.16, + "eval_loss": 1.2984137535095215, + "eval_runtime": 37.7164, + "eval_samples_per_second": 53.027, + "eval_steps_per_second": 0.848, + "step": 8 + }, + { + "epoch": 0.24, + "eval_loss": 1.2540132999420166, + "eval_runtime": 37.6847, + "eval_samples_per_second": 53.072, + "eval_steps_per_second": 0.849, + "step": 12 + }, + { + "epoch": 0.33, + "eval_loss": 1.2342448234558105, + "eval_runtime": 37.6879, + "eval_samples_per_second": 53.067, + "eval_steps_per_second": 0.849, + "step": 16 + }, + { + "epoch": 0.41, + "learning_rate": 0.0002664335664335664, + "loss": 1.3125, + "step": 20 + }, + { + "epoch": 0.41, + "eval_loss": 1.2209277153015137, + "eval_runtime": 37.7168, + "eval_samples_per_second": 53.027, + "eval_steps_per_second": 0.848, + "step": 20 + }, + { + "epoch": 0.49, + "eval_loss": 1.2117934226989746, + "eval_runtime": 37.6897, + "eval_samples_per_second": 53.065, + "eval_steps_per_second": 0.849, + "step": 24 + }, + { + "epoch": 0.57, + "eval_loss": 1.2057584524154663, + "eval_runtime": 37.7691, + "eval_samples_per_second": 52.953, + "eval_steps_per_second": 0.847, + "step": 28 + }, + { + "epoch": 0.65, + "eval_loss": 1.2009178400039673, + "eval_runtime": 37.7026, + "eval_samples_per_second": 53.047, + "eval_steps_per_second": 0.849, + "step": 32 + }, + { + "epoch": 0.73, + "eval_loss": 1.197250485420227, + "eval_runtime": 37.6765, + "eval_samples_per_second": 53.084, + "eval_steps_per_second": 0.849, + "step": 36 + }, + { + "epoch": 0.82, + "learning_rate": 0.00022447552447552445, + "loss": 1.2095, + "step": 40 + }, + { + "epoch": 0.82, + "eval_loss": 1.1943359375, + "eval_runtime": 37.6881, + "eval_samples_per_second": 53.067, + "eval_steps_per_second": 0.849, + "step": 40 + }, + { + "epoch": 0.9, + "eval_loss": 1.1920316219329834, + "eval_runtime": 37.694, + "eval_samples_per_second": 53.059, + "eval_steps_per_second": 0.849, + "step": 44 + }, + { + "epoch": 0.98, + "eval_loss": 1.1898770332336426, + "eval_runtime": 37.7089, + "eval_samples_per_second": 53.038, + "eval_steps_per_second": 0.849, + "step": 48 + }, + { + "epoch": 1.06, + "eval_loss": 1.1879488229751587, + "eval_runtime": 37.6975, + "eval_samples_per_second": 53.054, + "eval_steps_per_second": 0.849, + "step": 52 + }, + { + "epoch": 1.14, + "eval_loss": 1.1864111423492432, + "eval_runtime": 37.7188, + "eval_samples_per_second": 53.024, + "eval_steps_per_second": 0.848, + "step": 56 + }, + { + "epoch": 1.22, + "learning_rate": 0.00018251748251748253, + "loss": 1.188, + "step": 60 + }, + { + "epoch": 1.22, + "eval_loss": 1.1849411725997925, + "eval_runtime": 37.7686, + "eval_samples_per_second": 52.954, + "eval_steps_per_second": 0.847, + "step": 60 + }, + { + "epoch": 1.31, + "eval_loss": 1.1838239431381226, + "eval_runtime": 37.6945, + "eval_samples_per_second": 53.058, + "eval_steps_per_second": 0.849, + "step": 64 + }, + { + "epoch": 1.39, + "eval_loss": 1.182841181755066, + "eval_runtime": 37.6624, + "eval_samples_per_second": 53.103, + "eval_steps_per_second": 0.85, + "step": 68 + }, + { + "epoch": 1.47, + "eval_loss": 1.1818122863769531, + "eval_runtime": 37.7233, + "eval_samples_per_second": 53.018, + "eval_steps_per_second": 0.848, + "step": 72 + }, + { + "epoch": 1.55, + "eval_loss": 1.1808911561965942, + "eval_runtime": 37.7038, + "eval_samples_per_second": 53.045, + "eval_steps_per_second": 0.849, + "step": 76 + }, + { + "epoch": 1.63, + "learning_rate": 0.00014055944055944055, + "loss": 1.1876, + "step": 80 + }, + { + "epoch": 1.63, + "eval_loss": 1.1801403760910034, + "eval_runtime": 37.7277, + "eval_samples_per_second": 53.011, + "eval_steps_per_second": 0.848, + "step": 80 + }, + { + "epoch": 1.71, + "eval_loss": 1.1793811321258545, + "eval_runtime": 37.6641, + "eval_samples_per_second": 53.101, + "eval_steps_per_second": 0.85, + "step": 84 + }, + { + "epoch": 1.79, + "eval_loss": 1.1788283586502075, + "eval_runtime": 37.7097, + "eval_samples_per_second": 53.037, + "eval_steps_per_second": 0.849, + "step": 88 + }, + { + "epoch": 1.88, + "eval_loss": 1.1783016920089722, + "eval_runtime": 37.7104, + "eval_samples_per_second": 53.036, + "eval_steps_per_second": 0.849, + "step": 92 + }, + { + "epoch": 1.96, + "eval_loss": 1.177699327468872, + "eval_runtime": 37.6884, + "eval_samples_per_second": 53.067, + "eval_steps_per_second": 0.849, + "step": 96 + }, + { + "epoch": 2.04, + "learning_rate": 9.860139860139858e-05, + "loss": 1.182, + "step": 100 + }, + { + "epoch": 2.04, + "eval_loss": 1.1773412227630615, + "eval_runtime": 37.7134, + "eval_samples_per_second": 53.032, + "eval_steps_per_second": 0.849, + "step": 100 + }, + { + "epoch": 2.12, + "eval_loss": 1.176965594291687, + "eval_runtime": 37.6735, + "eval_samples_per_second": 53.088, + "eval_steps_per_second": 0.849, + "step": 104 + }, + { + "epoch": 2.2, + "eval_loss": 1.1765486001968384, + "eval_runtime": 37.7771, + "eval_samples_per_second": 52.942, + "eval_steps_per_second": 0.847, + "step": 108 + }, + { + "epoch": 2.28, + "eval_loss": 1.1761963367462158, + "eval_runtime": 37.7211, + "eval_samples_per_second": 53.021, + "eval_steps_per_second": 0.848, + "step": 112 + }, + { + "epoch": 2.37, + "eval_loss": 1.1757991313934326, + "eval_runtime": 37.7438, + "eval_samples_per_second": 52.989, + "eval_steps_per_second": 0.848, + "step": 116 + }, + { + "epoch": 2.45, + "learning_rate": 5.664335664335664e-05, + "loss": 1.1785, + "step": 120 + }, + { + "epoch": 2.45, + "eval_loss": 1.1754933595657349, + "eval_runtime": 37.7128, + "eval_samples_per_second": 53.032, + "eval_steps_per_second": 0.849, + "step": 120 + } + ], + "max_steps": 147, + "num_train_epochs": 3, + "total_flos": 1.3984182618334167e+18, + "trial_name": null, + "trial_params": null +} diff --git a/adapters/saved_llamainstinwild/checkpoint-120/training_args.bin b/adapters/saved_llamainstinwild/checkpoint-120/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..34b95391e17e9d44eb2a04177250246c85716097 --- /dev/null +++ b/adapters/saved_llamainstinwild/checkpoint-120/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:13fbb17fb4210eddefede6fc992fb1cd0ac7bb1d7d94253bc089086b39aba316 +size 3643 diff --git a/adapters/saved_llamainstinwild/checkpoint-124/optimizer.pt b/adapters/saved_llamainstinwild/checkpoint-124/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..de9cc663ee31f82da5689af2579247bc323d7fd5 --- /dev/null +++ b/adapters/saved_llamainstinwild/checkpoint-124/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:110f88aaf5f9cbf8efb41bd9e61e840a2a6a082662d63c2d36165c99dfb0adc0 +size 33629765 diff --git a/adapters/saved_llamainstinwild/checkpoint-124/pytorch_model.bin b/adapters/saved_llamainstinwild/checkpoint-124/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..b15a970885000f911f2d0288c2ac7360995a5d78 --- /dev/null +++ b/adapters/saved_llamainstinwild/checkpoint-124/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:49c5de6ea02f447479a304a97b2a93ba92e553bb80b045332110adf55865a379 +size 16822989 diff --git a/adapters/saved_llamainstinwild/checkpoint-124/rng_state_0.pth b/adapters/saved_llamainstinwild/checkpoint-124/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..5046c5d00d827fc4e026a8cb84a2fc9652738454 --- /dev/null +++ b/adapters/saved_llamainstinwild/checkpoint-124/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2ad9129bb58584677e1bedfa292567ff1db5ce499cfd2c27f3d619e219b0464c +size 14583 diff --git a/adapters/saved_llamainstinwild/checkpoint-124/rng_state_1.pth b/adapters/saved_llamainstinwild/checkpoint-124/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..595e0deda8f96220d6cf6eeb3f89381f15956144 --- /dev/null +++ b/adapters/saved_llamainstinwild/checkpoint-124/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9988953eb4812a4db3842f5c852fbee759132836d4fc232cb610021df9d9425f +size 14583 diff --git a/adapters/saved_llamainstinwild/checkpoint-124/rng_state_2.pth b/adapters/saved_llamainstinwild/checkpoint-124/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..b521934ea07ae3f438a77b063d8993a4b16b9f28 --- /dev/null +++ b/adapters/saved_llamainstinwild/checkpoint-124/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b362105d7e43f4558fc631ce0bf2a2c2fd252a49b59d130cd11cf13d68b02763 +size 14583 diff --git a/adapters/saved_llamainstinwild/checkpoint-124/rng_state_3.pth b/adapters/saved_llamainstinwild/checkpoint-124/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..b18e3d21b0c4b30e54ffc9e98654962dd0dc0358 --- /dev/null +++ b/adapters/saved_llamainstinwild/checkpoint-124/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b698144c4a3d4af161b274ab0f754030d33870fb1cce607e642e72c78fd4f8dc +size 14583 diff --git a/adapters/saved_llamainstinwild/checkpoint-124/rng_state_4.pth b/adapters/saved_llamainstinwild/checkpoint-124/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..de64797ecc20e20f20b09fef08e29cbd3655f942 --- /dev/null +++ b/adapters/saved_llamainstinwild/checkpoint-124/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b01129a43165ed4063a0621ec5d228df3c007c08eaa02338e6525857c1bea06c +size 14583 diff --git a/adapters/saved_llamainstinwild/checkpoint-124/rng_state_5.pth b/adapters/saved_llamainstinwild/checkpoint-124/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..a3c968fafaed49ec75d798b96e4185c5cd2932fb --- /dev/null +++ b/adapters/saved_llamainstinwild/checkpoint-124/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:92946ea5d3971470d75d5b50c5d9ca0b0d0276cab6a5c8877a8e3eb04cb736a5 +size 14583 diff --git a/adapters/saved_llamainstinwild/checkpoint-124/rng_state_6.pth b/adapters/saved_llamainstinwild/checkpoint-124/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..1a0f5486952dad07067ebdcbb20adabac38a175c --- /dev/null +++ b/adapters/saved_llamainstinwild/checkpoint-124/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:64a91fe7766879b4401e680abbb4e6d7222b5ff02e43132d4d4f88d666b38d22 +size 14583 diff --git a/adapters/saved_llamainstinwild/checkpoint-124/rng_state_7.pth b/adapters/saved_llamainstinwild/checkpoint-124/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..6b8095a643eb8932fdfc897978a0a106cf7973e3 --- /dev/null +++ b/adapters/saved_llamainstinwild/checkpoint-124/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6cf9876b18d0c7ae82f9bcd7fce0ddb7dfb9a9609c0ec0bcd66c1bd36291f19f +size 14583 diff --git a/adapters/saved_llamainstinwild/checkpoint-124/scaler.pt b/adapters/saved_llamainstinwild/checkpoint-124/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..e94f97e32ecb09a0c1f420eab541980a86b55674 --- /dev/null +++ b/adapters/saved_llamainstinwild/checkpoint-124/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:00b7f8b128773e6f2a0eb7923d8944641eaf2ece3a0e98cd789d99474a6b5ae1 +size 557 diff --git a/adapters/saved_llamainstinwild/checkpoint-124/scheduler.pt b/adapters/saved_llamainstinwild/checkpoint-124/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..878aa00f3ff8855db48d6568a3e2b8fa3e36558c --- /dev/null +++ b/adapters/saved_llamainstinwild/checkpoint-124/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:816529cad39474ceadf51349fa2e81c5446e220ee76ac0427733c83d9c3fc884 +size 627 diff --git a/adapters/saved_llamainstinwild/checkpoint-124/trainer_state.json b/adapters/saved_llamainstinwild/checkpoint-124/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..4f34ab291e6a9d731a6c4cb2a59d7d3c546cddc5 --- /dev/null +++ b/adapters/saved_llamainstinwild/checkpoint-124/trainer_state.json @@ -0,0 +1,300 @@ +{ + "best_metric": 1.1753153800964355, + "best_model_checkpoint": "/mnt/bn/qingyi-bn-lq/llama/saved_llamainstinwild/checkpoint-124", + "epoch": 2.52899936265137, + "global_step": 124, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.08, + "eval_loss": 1.39645254611969, + "eval_runtime": 37.7333, + "eval_samples_per_second": 53.004, + "eval_steps_per_second": 0.848, + "step": 4 + }, + { + "epoch": 0.16, + "eval_loss": 1.2984137535095215, + "eval_runtime": 37.7164, + "eval_samples_per_second": 53.027, + "eval_steps_per_second": 0.848, + "step": 8 + }, + { + "epoch": 0.24, + "eval_loss": 1.2540132999420166, + "eval_runtime": 37.6847, + "eval_samples_per_second": 53.072, + "eval_steps_per_second": 0.849, + "step": 12 + }, + { + "epoch": 0.33, + "eval_loss": 1.2342448234558105, + "eval_runtime": 37.6879, + "eval_samples_per_second": 53.067, + "eval_steps_per_second": 0.849, + "step": 16 + }, + { + "epoch": 0.41, + "learning_rate": 0.0002664335664335664, + "loss": 1.3125, + "step": 20 + }, + { + "epoch": 0.41, + "eval_loss": 1.2209277153015137, + "eval_runtime": 37.7168, + "eval_samples_per_second": 53.027, + "eval_steps_per_second": 0.848, + "step": 20 + }, + { + "epoch": 0.49, + "eval_loss": 1.2117934226989746, + "eval_runtime": 37.6897, + "eval_samples_per_second": 53.065, + "eval_steps_per_second": 0.849, + "step": 24 + }, + { + "epoch": 0.57, + "eval_loss": 1.2057584524154663, + "eval_runtime": 37.7691, + "eval_samples_per_second": 52.953, + "eval_steps_per_second": 0.847, + "step": 28 + }, + { + "epoch": 0.65, + "eval_loss": 1.2009178400039673, + "eval_runtime": 37.7026, + "eval_samples_per_second": 53.047, + "eval_steps_per_second": 0.849, + "step": 32 + }, + { + "epoch": 0.73, + "eval_loss": 1.197250485420227, + "eval_runtime": 37.6765, + "eval_samples_per_second": 53.084, + "eval_steps_per_second": 0.849, + "step": 36 + }, + { + "epoch": 0.82, + "learning_rate": 0.00022447552447552445, + "loss": 1.2095, + "step": 40 + }, + { + "epoch": 0.82, + "eval_loss": 1.1943359375, + "eval_runtime": 37.6881, + "eval_samples_per_second": 53.067, + "eval_steps_per_second": 0.849, + "step": 40 + }, + { + "epoch": 0.9, + "eval_loss": 1.1920316219329834, + "eval_runtime": 37.694, + "eval_samples_per_second": 53.059, + "eval_steps_per_second": 0.849, + "step": 44 + }, + { + "epoch": 0.98, + "eval_loss": 1.1898770332336426, + "eval_runtime": 37.7089, + "eval_samples_per_second": 53.038, + "eval_steps_per_second": 0.849, + "step": 48 + }, + { + "epoch": 1.06, + "eval_loss": 1.1879488229751587, + "eval_runtime": 37.6975, + "eval_samples_per_second": 53.054, + "eval_steps_per_second": 0.849, + "step": 52 + }, + { + "epoch": 1.14, + "eval_loss": 1.1864111423492432, + "eval_runtime": 37.7188, + "eval_samples_per_second": 53.024, + "eval_steps_per_second": 0.848, + "step": 56 + }, + { + "epoch": 1.22, + "learning_rate": 0.00018251748251748253, + "loss": 1.188, + "step": 60 + }, + { + "epoch": 1.22, + "eval_loss": 1.1849411725997925, + "eval_runtime": 37.7686, + "eval_samples_per_second": 52.954, + "eval_steps_per_second": 0.847, + "step": 60 + }, + { + "epoch": 1.31, + "eval_loss": 1.1838239431381226, + "eval_runtime": 37.6945, + "eval_samples_per_second": 53.058, + "eval_steps_per_second": 0.849, + "step": 64 + }, + { + "epoch": 1.39, + "eval_loss": 1.182841181755066, + "eval_runtime": 37.6624, + "eval_samples_per_second": 53.103, + "eval_steps_per_second": 0.85, + "step": 68 + }, + { + "epoch": 1.47, + "eval_loss": 1.1818122863769531, + "eval_runtime": 37.7233, + "eval_samples_per_second": 53.018, + "eval_steps_per_second": 0.848, + "step": 72 + }, + { + "epoch": 1.55, + "eval_loss": 1.1808911561965942, + "eval_runtime": 37.7038, + "eval_samples_per_second": 53.045, + "eval_steps_per_second": 0.849, + "step": 76 + }, + { + "epoch": 1.63, + "learning_rate": 0.00014055944055944055, + "loss": 1.1876, + "step": 80 + }, + { + "epoch": 1.63, + "eval_loss": 1.1801403760910034, + "eval_runtime": 37.7277, + "eval_samples_per_second": 53.011, + "eval_steps_per_second": 0.848, + "step": 80 + }, + { + "epoch": 1.71, + "eval_loss": 1.1793811321258545, + "eval_runtime": 37.6641, + "eval_samples_per_second": 53.101, + "eval_steps_per_second": 0.85, + "step": 84 + }, + { + "epoch": 1.79, + "eval_loss": 1.1788283586502075, + "eval_runtime": 37.7097, + "eval_samples_per_second": 53.037, + "eval_steps_per_second": 0.849, + "step": 88 + }, + { + "epoch": 1.88, + "eval_loss": 1.1783016920089722, + "eval_runtime": 37.7104, + "eval_samples_per_second": 53.036, + "eval_steps_per_second": 0.849, + "step": 92 + }, + { + "epoch": 1.96, + "eval_loss": 1.177699327468872, + "eval_runtime": 37.6884, + "eval_samples_per_second": 53.067, + "eval_steps_per_second": 0.849, + "step": 96 + }, + { + "epoch": 2.04, + "learning_rate": 9.860139860139858e-05, + "loss": 1.182, + "step": 100 + }, + { + "epoch": 2.04, + "eval_loss": 1.1773412227630615, + "eval_runtime": 37.7134, + "eval_samples_per_second": 53.032, + "eval_steps_per_second": 0.849, + "step": 100 + }, + { + "epoch": 2.12, + "eval_loss": 1.176965594291687, + "eval_runtime": 37.6735, + "eval_samples_per_second": 53.088, + "eval_steps_per_second": 0.849, + "step": 104 + }, + { + "epoch": 2.2, + "eval_loss": 1.1765486001968384, + "eval_runtime": 37.7771, + "eval_samples_per_second": 52.942, + "eval_steps_per_second": 0.847, + "step": 108 + }, + { + "epoch": 2.28, + "eval_loss": 1.1761963367462158, + "eval_runtime": 37.7211, + "eval_samples_per_second": 53.021, + "eval_steps_per_second": 0.848, + "step": 112 + }, + { + "epoch": 2.37, + "eval_loss": 1.1757991313934326, + "eval_runtime": 37.7438, + "eval_samples_per_second": 52.989, + "eval_steps_per_second": 0.848, + "step": 116 + }, + { + "epoch": 2.45, + "learning_rate": 5.664335664335664e-05, + "loss": 1.1785, + "step": 120 + }, + { + "epoch": 2.45, + "eval_loss": 1.1754933595657349, + "eval_runtime": 37.7128, + "eval_samples_per_second": 53.032, + "eval_steps_per_second": 0.849, + "step": 120 + }, + { + "epoch": 2.53, + "eval_loss": 1.1753153800964355, + "eval_runtime": 37.6929, + "eval_samples_per_second": 53.06, + "eval_steps_per_second": 0.849, + "step": 124 + } + ], + "max_steps": 147, + "num_train_epochs": 3, + "total_flos": 1.4455053197393265e+18, + "trial_name": null, + "trial_params": null +} diff --git a/adapters/saved_llamainstinwild/checkpoint-124/training_args.bin b/adapters/saved_llamainstinwild/checkpoint-124/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..34b95391e17e9d44eb2a04177250246c85716097 --- /dev/null +++ b/adapters/saved_llamainstinwild/checkpoint-124/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:13fbb17fb4210eddefede6fc992fb1cd0ac7bb1d7d94253bc089086b39aba316 +size 3643 diff --git a/adapters/saved_llamainstinwild/checkpoint-128/optimizer.pt b/adapters/saved_llamainstinwild/checkpoint-128/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..155fc0f0570268f42cbf3be9931e987aca848441 --- /dev/null +++ b/adapters/saved_llamainstinwild/checkpoint-128/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d9cdac11907899e130dcb178c438a8c31f2d34699399c4a20d8c546379e6e1d2 +size 33629765 diff --git a/adapters/saved_llamainstinwild/checkpoint-128/pytorch_model.bin b/adapters/saved_llamainstinwild/checkpoint-128/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..ff98b103f6eb816dd77aa47ab1f3c95b4c6e9f93 --- /dev/null +++ b/adapters/saved_llamainstinwild/checkpoint-128/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a88c36d4dd9dd18924a184e7a2c4346bc74407501f3b3f41d2758987c4978edf +size 16822989 diff --git a/adapters/saved_llamainstinwild/checkpoint-128/rng_state_0.pth b/adapters/saved_llamainstinwild/checkpoint-128/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..df1c2c87e0d69e71fc7406189d8e3abbddbd8c07 --- /dev/null +++ b/adapters/saved_llamainstinwild/checkpoint-128/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a47538baee6082de69e9a2fb9951ef02a1587a39101bd0b0e3df87d2e7068fa4 +size 14583 diff --git a/adapters/saved_llamainstinwild/checkpoint-128/rng_state_1.pth b/adapters/saved_llamainstinwild/checkpoint-128/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..1c83ebcacaab2faf8124d5cb8ff9e8071b06c326 --- /dev/null +++ b/adapters/saved_llamainstinwild/checkpoint-128/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a643b8142ee856c27da74547a31ca444eee99a6343dd3efeba61820a2075a025 +size 14583 diff --git a/adapters/saved_llamainstinwild/checkpoint-128/rng_state_2.pth b/adapters/saved_llamainstinwild/checkpoint-128/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..4fcda463016d22a03199fda251486b9ffa8d4f1b --- /dev/null +++ b/adapters/saved_llamainstinwild/checkpoint-128/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1aea881dbd983ccc6cdf319375011441a707e509e449dde4ac529e4ec6d45e49 +size 14583 diff --git a/adapters/saved_llamainstinwild/checkpoint-128/rng_state_3.pth b/adapters/saved_llamainstinwild/checkpoint-128/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..9e043509e61b15ce484de6ae071816fb42e2e17f --- /dev/null +++ b/adapters/saved_llamainstinwild/checkpoint-128/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8a248cae2ce9798cf5208f44048f82f9fa2752455e742c686cc6c1697421179a +size 14583 diff --git a/adapters/saved_llamainstinwild/checkpoint-128/rng_state_4.pth b/adapters/saved_llamainstinwild/checkpoint-128/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..6351eb02831192441cb5112ee61e27a08c9712eb --- /dev/null +++ b/adapters/saved_llamainstinwild/checkpoint-128/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:49a5e8c84b87d542dc0c8b4d1b82ba8275a1b2727edab9414ffecf73841eb9e5 +size 14583 diff --git a/adapters/saved_llamainstinwild/checkpoint-128/rng_state_5.pth b/adapters/saved_llamainstinwild/checkpoint-128/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..77158178a23f5da95cd6895389362124854c35ec --- /dev/null +++ b/adapters/saved_llamainstinwild/checkpoint-128/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0c06318e1cd859e3318bd1d68c16bd57b61ec06fedd4a19232f363b7045439e1 +size 14583 diff --git a/adapters/saved_llamainstinwild/checkpoint-128/rng_state_6.pth b/adapters/saved_llamainstinwild/checkpoint-128/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..f6d6d2ee7db65eadd5410b345bdd14dd65f46c76 --- /dev/null +++ b/adapters/saved_llamainstinwild/checkpoint-128/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:186b547fb596e5d7a044c03e4cf46a751587084fddb9e072f456ac2c7f088a62 +size 14583 diff --git a/adapters/saved_llamainstinwild/checkpoint-128/rng_state_7.pth b/adapters/saved_llamainstinwild/checkpoint-128/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..20f76038184ebf616e7d4deb58150c770d752b52 --- /dev/null +++ b/adapters/saved_llamainstinwild/checkpoint-128/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a0a079cff9fb982d575a6f4a80a499532ddfa6421f625714b8d1957ed5be518 +size 14583 diff --git a/adapters/saved_llamainstinwild/checkpoint-128/scaler.pt b/adapters/saved_llamainstinwild/checkpoint-128/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..b2b78a167b6dae20dc4583ccc1866c4ce7ea368a --- /dev/null +++ b/adapters/saved_llamainstinwild/checkpoint-128/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ad7613637791d17a45faf9bffbc40686cffe6671dd8744aca30a82caa79deed1 +size 557 diff --git a/adapters/saved_llamainstinwild/checkpoint-128/scheduler.pt b/adapters/saved_llamainstinwild/checkpoint-128/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..87a359b8780f5e3d80a075b8fcd45871302cf354 --- /dev/null +++ b/adapters/saved_llamainstinwild/checkpoint-128/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:58c341ef39d4af9a6500088c472f5cef8e6179f781e59c1443582ecca6b2be86 +size 627 diff --git a/adapters/saved_llamainstinwild/checkpoint-128/trainer_state.json b/adapters/saved_llamainstinwild/checkpoint-128/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..3b23b8a8f5c6b18a6507003df9f6eca4bf72b3ff --- /dev/null +++ b/adapters/saved_llamainstinwild/checkpoint-128/trainer_state.json @@ -0,0 +1,308 @@ +{ + "best_metric": 1.1751618385314941, + "best_model_checkpoint": "/mnt/bn/qingyi-bn-lq/llama/saved_llamainstinwild/checkpoint-128", + "epoch": 2.6105799872530273, + "global_step": 128, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.08, + "eval_loss": 1.39645254611969, + "eval_runtime": 37.7333, + "eval_samples_per_second": 53.004, + "eval_steps_per_second": 0.848, + "step": 4 + }, + { + "epoch": 0.16, + "eval_loss": 1.2984137535095215, + "eval_runtime": 37.7164, + "eval_samples_per_second": 53.027, + "eval_steps_per_second": 0.848, + "step": 8 + }, + { + "epoch": 0.24, + "eval_loss": 1.2540132999420166, + "eval_runtime": 37.6847, + "eval_samples_per_second": 53.072, + "eval_steps_per_second": 0.849, + "step": 12 + }, + { + "epoch": 0.33, + "eval_loss": 1.2342448234558105, + "eval_runtime": 37.6879, + "eval_samples_per_second": 53.067, + "eval_steps_per_second": 0.849, + "step": 16 + }, + { + "epoch": 0.41, + "learning_rate": 0.0002664335664335664, + "loss": 1.3125, + "step": 20 + }, + { + "epoch": 0.41, + "eval_loss": 1.2209277153015137, + "eval_runtime": 37.7168, + "eval_samples_per_second": 53.027, + "eval_steps_per_second": 0.848, + "step": 20 + }, + { + "epoch": 0.49, + "eval_loss": 1.2117934226989746, + "eval_runtime": 37.6897, + "eval_samples_per_second": 53.065, + "eval_steps_per_second": 0.849, + "step": 24 + }, + { + "epoch": 0.57, + "eval_loss": 1.2057584524154663, + "eval_runtime": 37.7691, + "eval_samples_per_second": 52.953, + "eval_steps_per_second": 0.847, + "step": 28 + }, + { + "epoch": 0.65, + "eval_loss": 1.2009178400039673, + "eval_runtime": 37.7026, + "eval_samples_per_second": 53.047, + "eval_steps_per_second": 0.849, + "step": 32 + }, + { + "epoch": 0.73, + "eval_loss": 1.197250485420227, + "eval_runtime": 37.6765, + "eval_samples_per_second": 53.084, + "eval_steps_per_second": 0.849, + "step": 36 + }, + { + "epoch": 0.82, + "learning_rate": 0.00022447552447552445, + "loss": 1.2095, + "step": 40 + }, + { + "epoch": 0.82, + "eval_loss": 1.1943359375, + "eval_runtime": 37.6881, + "eval_samples_per_second": 53.067, + "eval_steps_per_second": 0.849, + "step": 40 + }, + { + "epoch": 0.9, + "eval_loss": 1.1920316219329834, + "eval_runtime": 37.694, + "eval_samples_per_second": 53.059, + "eval_steps_per_second": 0.849, + "step": 44 + }, + { + "epoch": 0.98, + "eval_loss": 1.1898770332336426, + "eval_runtime": 37.7089, + "eval_samples_per_second": 53.038, + "eval_steps_per_second": 0.849, + "step": 48 + }, + { + "epoch": 1.06, + "eval_loss": 1.1879488229751587, + "eval_runtime": 37.6975, + "eval_samples_per_second": 53.054, + "eval_steps_per_second": 0.849, + "step": 52 + }, + { + "epoch": 1.14, + "eval_loss": 1.1864111423492432, + "eval_runtime": 37.7188, + "eval_samples_per_second": 53.024, + "eval_steps_per_second": 0.848, + "step": 56 + }, + { + "epoch": 1.22, + "learning_rate": 0.00018251748251748253, + "loss": 1.188, + "step": 60 + }, + { + "epoch": 1.22, + "eval_loss": 1.1849411725997925, + "eval_runtime": 37.7686, + "eval_samples_per_second": 52.954, + "eval_steps_per_second": 0.847, + "step": 60 + }, + { + "epoch": 1.31, + "eval_loss": 1.1838239431381226, + "eval_runtime": 37.6945, + "eval_samples_per_second": 53.058, + "eval_steps_per_second": 0.849, + "step": 64 + }, + { + "epoch": 1.39, + "eval_loss": 1.182841181755066, + "eval_runtime": 37.6624, + "eval_samples_per_second": 53.103, + "eval_steps_per_second": 0.85, + "step": 68 + }, + { + "epoch": 1.47, + "eval_loss": 1.1818122863769531, + "eval_runtime": 37.7233, + "eval_samples_per_second": 53.018, + "eval_steps_per_second": 0.848, + "step": 72 + }, + { + "epoch": 1.55, + "eval_loss": 1.1808911561965942, + "eval_runtime": 37.7038, + "eval_samples_per_second": 53.045, + "eval_steps_per_second": 0.849, + "step": 76 + }, + { + "epoch": 1.63, + "learning_rate": 0.00014055944055944055, + "loss": 1.1876, + "step": 80 + }, + { + "epoch": 1.63, + "eval_loss": 1.1801403760910034, + "eval_runtime": 37.7277, + "eval_samples_per_second": 53.011, + "eval_steps_per_second": 0.848, + "step": 80 + }, + { + "epoch": 1.71, + "eval_loss": 1.1793811321258545, + "eval_runtime": 37.6641, + "eval_samples_per_second": 53.101, + "eval_steps_per_second": 0.85, + "step": 84 + }, + { + "epoch": 1.79, + "eval_loss": 1.1788283586502075, + "eval_runtime": 37.7097, + "eval_samples_per_second": 53.037, + "eval_steps_per_second": 0.849, + "step": 88 + }, + { + "epoch": 1.88, + "eval_loss": 1.1783016920089722, + "eval_runtime": 37.7104, + "eval_samples_per_second": 53.036, + "eval_steps_per_second": 0.849, + "step": 92 + }, + { + "epoch": 1.96, + "eval_loss": 1.177699327468872, + "eval_runtime": 37.6884, + "eval_samples_per_second": 53.067, + "eval_steps_per_second": 0.849, + "step": 96 + }, + { + "epoch": 2.04, + "learning_rate": 9.860139860139858e-05, + "loss": 1.182, + "step": 100 + }, + { + "epoch": 2.04, + "eval_loss": 1.1773412227630615, + "eval_runtime": 37.7134, + "eval_samples_per_second": 53.032, + "eval_steps_per_second": 0.849, + "step": 100 + }, + { + "epoch": 2.12, + "eval_loss": 1.176965594291687, + "eval_runtime": 37.6735, + "eval_samples_per_second": 53.088, + "eval_steps_per_second": 0.849, + "step": 104 + }, + { + "epoch": 2.2, + "eval_loss": 1.1765486001968384, + "eval_runtime": 37.7771, + "eval_samples_per_second": 52.942, + "eval_steps_per_second": 0.847, + "step": 108 + }, + { + "epoch": 2.28, + "eval_loss": 1.1761963367462158, + "eval_runtime": 37.7211, + "eval_samples_per_second": 53.021, + "eval_steps_per_second": 0.848, + "step": 112 + }, + { + "epoch": 2.37, + "eval_loss": 1.1757991313934326, + "eval_runtime": 37.7438, + "eval_samples_per_second": 52.989, + "eval_steps_per_second": 0.848, + "step": 116 + }, + { + "epoch": 2.45, + "learning_rate": 5.664335664335664e-05, + "loss": 1.1785, + "step": 120 + }, + { + "epoch": 2.45, + "eval_loss": 1.1754933595657349, + "eval_runtime": 37.7128, + "eval_samples_per_second": 53.032, + "eval_steps_per_second": 0.849, + "step": 120 + }, + { + "epoch": 2.53, + "eval_loss": 1.1753153800964355, + "eval_runtime": 37.6929, + "eval_samples_per_second": 53.06, + "eval_steps_per_second": 0.849, + "step": 124 + }, + { + "epoch": 2.61, + "eval_loss": 1.1751618385314941, + "eval_runtime": 37.6913, + "eval_samples_per_second": 53.063, + "eval_steps_per_second": 0.849, + "step": 128 + } + ], + "max_steps": 147, + "num_train_epochs": 3, + "total_flos": 1.4919346592384287e+18, + "trial_name": null, + "trial_params": null +} diff --git a/adapters/saved_llamainstinwild/checkpoint-128/training_args.bin b/adapters/saved_llamainstinwild/checkpoint-128/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..34b95391e17e9d44eb2a04177250246c85716097 --- /dev/null +++ b/adapters/saved_llamainstinwild/checkpoint-128/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:13fbb17fb4210eddefede6fc992fb1cd0ac7bb1d7d94253bc089086b39aba316 +size 3643 diff --git a/adapters/saved_llamainstinwild/checkpoint-132/optimizer.pt b/adapters/saved_llamainstinwild/checkpoint-132/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..f053c02783417ee95c9b27a61931e3b40ed8666b --- /dev/null +++ b/adapters/saved_llamainstinwild/checkpoint-132/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aab8a57029b37bb04e7c0c7689a0f38fac9de9d0045a795af7675cab47e73365 +size 33629765 diff --git a/adapters/saved_llamainstinwild/checkpoint-132/pytorch_model.bin b/adapters/saved_llamainstinwild/checkpoint-132/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..8e8a5ac28e83e66587d08637c49d1942c684e4a0 --- /dev/null +++ b/adapters/saved_llamainstinwild/checkpoint-132/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ea201f51e8b6ac4db98c117887d7380373e09bc1645b987ca1ddab125c3c2afa +size 16822989 diff --git a/adapters/saved_llamainstinwild/checkpoint-132/rng_state_0.pth b/adapters/saved_llamainstinwild/checkpoint-132/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..1c2aed75a9fedfcbcdf40c8c6edbf980cf952546 --- /dev/null +++ b/adapters/saved_llamainstinwild/checkpoint-132/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:818febb10ec4ed294cc48caf146cd6b1cbbbe19c301f087675665c47915fe0ec +size 14583 diff --git a/adapters/saved_llamainstinwild/checkpoint-132/rng_state_1.pth b/adapters/saved_llamainstinwild/checkpoint-132/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..6de2990d661f3b48282eaea65d27ad3b5f03d915 --- /dev/null +++ b/adapters/saved_llamainstinwild/checkpoint-132/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:80ac2f5560304c610be4a2c6f85cf4f0197137e526d26cf4ee692b42aa69261c +size 14583 diff --git a/adapters/saved_llamainstinwild/checkpoint-132/rng_state_2.pth b/adapters/saved_llamainstinwild/checkpoint-132/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..420ce1658beae8e789608c92f8607de019fc3fed --- /dev/null +++ b/adapters/saved_llamainstinwild/checkpoint-132/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a866373463ac47ec28320549863af9255304f3e60e383f4a099d399076d28514 +size 14583 diff --git a/adapters/saved_llamainstinwild/checkpoint-132/rng_state_3.pth b/adapters/saved_llamainstinwild/checkpoint-132/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..d83d4f40ccef1af86de119c2a6323a7d7d9b9d65 --- /dev/null +++ b/adapters/saved_llamainstinwild/checkpoint-132/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3e8694590699a276cc6ca4ec164ffffab42578b2cf65f83670eac01022969d23 +size 14583 diff --git a/adapters/saved_llamainstinwild/checkpoint-132/rng_state_4.pth b/adapters/saved_llamainstinwild/checkpoint-132/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..e7dc92eb9378f713f28627a182befb46ec9c5cb0 --- /dev/null +++ b/adapters/saved_llamainstinwild/checkpoint-132/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ac0d5bd64cc323f4dc675e9d1fd6635f77774a67bee21354d878dc5ad8459d4 +size 14583 diff --git a/adapters/saved_llamainstinwild/checkpoint-132/rng_state_5.pth b/adapters/saved_llamainstinwild/checkpoint-132/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..1443e94c1facd81e528db06b7f336f846df2fead --- /dev/null +++ b/adapters/saved_llamainstinwild/checkpoint-132/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:759286006871cd76789f57611243549812102cf1fb5bd7dbed1ada8500b4c59e +size 14583 diff --git a/adapters/saved_llamainstinwild/checkpoint-132/rng_state_6.pth b/adapters/saved_llamainstinwild/checkpoint-132/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..9978916877800ef26e967ab279d3b82a5272dbda --- /dev/null +++ b/adapters/saved_llamainstinwild/checkpoint-132/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7f6e70ce2f5c41d487b4824ee5f473194621b3dfe59d9e221bf57fa3e814ddc5 +size 14583 diff --git a/adapters/saved_llamainstinwild/checkpoint-132/rng_state_7.pth b/adapters/saved_llamainstinwild/checkpoint-132/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..44421cb8f323749342306e9712dd679dfe425fdd --- /dev/null +++ b/adapters/saved_llamainstinwild/checkpoint-132/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:53c2760988034b71faf1527c46a5e5446749ab0ba2f9ebd6c1c58922b7018eb6 +size 14583 diff --git a/adapters/saved_llamainstinwild/checkpoint-132/scaler.pt b/adapters/saved_llamainstinwild/checkpoint-132/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..9adb3b3677d3191d7e887ebb4b4627a150fa2107 --- /dev/null +++ b/adapters/saved_llamainstinwild/checkpoint-132/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:38184ca1d8b647caada94496abdb64086f0cd47d27d1c76db818e85a4b26163c +size 557 diff --git a/adapters/saved_llamainstinwild/checkpoint-132/scheduler.pt b/adapters/saved_llamainstinwild/checkpoint-132/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..ebd78133d8edf92eb9550e4402e0a967c0e1d4ef --- /dev/null +++ b/adapters/saved_llamainstinwild/checkpoint-132/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ef913215f68b5bb670ae9c22aedc8adf19b7ce330215fa02d6f940a7ad38e142 +size 627 diff --git a/adapters/saved_llamainstinwild/checkpoint-132/trainer_state.json b/adapters/saved_llamainstinwild/checkpoint-132/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..3973f12d2a908b422ff9763e9ed4006141770ee2 --- /dev/null +++ b/adapters/saved_llamainstinwild/checkpoint-132/trainer_state.json @@ -0,0 +1,316 @@ +{ + "best_metric": 1.1750015020370483, + "best_model_checkpoint": "/mnt/bn/qingyi-bn-lq/llama/saved_llamainstinwild/checkpoint-132", + "epoch": 2.6921606118546846, + "global_step": 132, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.08, + "eval_loss": 1.39645254611969, + "eval_runtime": 37.7333, + "eval_samples_per_second": 53.004, + "eval_steps_per_second": 0.848, + "step": 4 + }, + { + "epoch": 0.16, + "eval_loss": 1.2984137535095215, + "eval_runtime": 37.7164, + "eval_samples_per_second": 53.027, + "eval_steps_per_second": 0.848, + "step": 8 + }, + { + "epoch": 0.24, + "eval_loss": 1.2540132999420166, + "eval_runtime": 37.6847, + "eval_samples_per_second": 53.072, + "eval_steps_per_second": 0.849, + "step": 12 + }, + { + "epoch": 0.33, + "eval_loss": 1.2342448234558105, + "eval_runtime": 37.6879, + "eval_samples_per_second": 53.067, + "eval_steps_per_second": 0.849, + "step": 16 + }, + { + "epoch": 0.41, + "learning_rate": 0.0002664335664335664, + "loss": 1.3125, + "step": 20 + }, + { + "epoch": 0.41, + "eval_loss": 1.2209277153015137, + "eval_runtime": 37.7168, + "eval_samples_per_second": 53.027, + "eval_steps_per_second": 0.848, + "step": 20 + }, + { + "epoch": 0.49, + "eval_loss": 1.2117934226989746, + "eval_runtime": 37.6897, + "eval_samples_per_second": 53.065, + "eval_steps_per_second": 0.849, + "step": 24 + }, + { + "epoch": 0.57, + "eval_loss": 1.2057584524154663, + "eval_runtime": 37.7691, + "eval_samples_per_second": 52.953, + "eval_steps_per_second": 0.847, + "step": 28 + }, + { + "epoch": 0.65, + "eval_loss": 1.2009178400039673, + "eval_runtime": 37.7026, + "eval_samples_per_second": 53.047, + "eval_steps_per_second": 0.849, + "step": 32 + }, + { + "epoch": 0.73, + "eval_loss": 1.197250485420227, + "eval_runtime": 37.6765, + "eval_samples_per_second": 53.084, + "eval_steps_per_second": 0.849, + "step": 36 + }, + { + "epoch": 0.82, + "learning_rate": 0.00022447552447552445, + "loss": 1.2095, + "step": 40 + }, + { + "epoch": 0.82, + "eval_loss": 1.1943359375, + "eval_runtime": 37.6881, + "eval_samples_per_second": 53.067, + "eval_steps_per_second": 0.849, + "step": 40 + }, + { + "epoch": 0.9, + "eval_loss": 1.1920316219329834, + "eval_runtime": 37.694, + "eval_samples_per_second": 53.059, + "eval_steps_per_second": 0.849, + "step": 44 + }, + { + "epoch": 0.98, + "eval_loss": 1.1898770332336426, + "eval_runtime": 37.7089, + "eval_samples_per_second": 53.038, + "eval_steps_per_second": 0.849, + "step": 48 + }, + { + "epoch": 1.06, + "eval_loss": 1.1879488229751587, + "eval_runtime": 37.6975, + "eval_samples_per_second": 53.054, + "eval_steps_per_second": 0.849, + "step": 52 + }, + { + "epoch": 1.14, + "eval_loss": 1.1864111423492432, + "eval_runtime": 37.7188, + "eval_samples_per_second": 53.024, + "eval_steps_per_second": 0.848, + "step": 56 + }, + { + "epoch": 1.22, + "learning_rate": 0.00018251748251748253, + "loss": 1.188, + "step": 60 + }, + { + "epoch": 1.22, + "eval_loss": 1.1849411725997925, + "eval_runtime": 37.7686, + "eval_samples_per_second": 52.954, + "eval_steps_per_second": 0.847, + "step": 60 + }, + { + "epoch": 1.31, + "eval_loss": 1.1838239431381226, + "eval_runtime": 37.6945, + "eval_samples_per_second": 53.058, + "eval_steps_per_second": 0.849, + "step": 64 + }, + { + "epoch": 1.39, + "eval_loss": 1.182841181755066, + "eval_runtime": 37.6624, + "eval_samples_per_second": 53.103, + "eval_steps_per_second": 0.85, + "step": 68 + }, + { + "epoch": 1.47, + "eval_loss": 1.1818122863769531, + "eval_runtime": 37.7233, + "eval_samples_per_second": 53.018, + "eval_steps_per_second": 0.848, + "step": 72 + }, + { + "epoch": 1.55, + "eval_loss": 1.1808911561965942, + "eval_runtime": 37.7038, + "eval_samples_per_second": 53.045, + "eval_steps_per_second": 0.849, + "step": 76 + }, + { + "epoch": 1.63, + "learning_rate": 0.00014055944055944055, + "loss": 1.1876, + "step": 80 + }, + { + "epoch": 1.63, + "eval_loss": 1.1801403760910034, + "eval_runtime": 37.7277, + "eval_samples_per_second": 53.011, + "eval_steps_per_second": 0.848, + "step": 80 + }, + { + "epoch": 1.71, + "eval_loss": 1.1793811321258545, + "eval_runtime": 37.6641, + "eval_samples_per_second": 53.101, + "eval_steps_per_second": 0.85, + "step": 84 + }, + { + "epoch": 1.79, + "eval_loss": 1.1788283586502075, + "eval_runtime": 37.7097, + "eval_samples_per_second": 53.037, + "eval_steps_per_second": 0.849, + "step": 88 + }, + { + "epoch": 1.88, + "eval_loss": 1.1783016920089722, + "eval_runtime": 37.7104, + "eval_samples_per_second": 53.036, + "eval_steps_per_second": 0.849, + "step": 92 + }, + { + "epoch": 1.96, + "eval_loss": 1.177699327468872, + "eval_runtime": 37.6884, + "eval_samples_per_second": 53.067, + "eval_steps_per_second": 0.849, + "step": 96 + }, + { + "epoch": 2.04, + "learning_rate": 9.860139860139858e-05, + "loss": 1.182, + "step": 100 + }, + { + "epoch": 2.04, + "eval_loss": 1.1773412227630615, + "eval_runtime": 37.7134, + "eval_samples_per_second": 53.032, + "eval_steps_per_second": 0.849, + "step": 100 + }, + { + "epoch": 2.12, + "eval_loss": 1.176965594291687, + "eval_runtime": 37.6735, + "eval_samples_per_second": 53.088, + "eval_steps_per_second": 0.849, + "step": 104 + }, + { + "epoch": 2.2, + "eval_loss": 1.1765486001968384, + "eval_runtime": 37.7771, + "eval_samples_per_second": 52.942, + "eval_steps_per_second": 0.847, + "step": 108 + }, + { + "epoch": 2.28, + "eval_loss": 1.1761963367462158, + "eval_runtime": 37.7211, + "eval_samples_per_second": 53.021, + "eval_steps_per_second": 0.848, + "step": 112 + }, + { + "epoch": 2.37, + "eval_loss": 1.1757991313934326, + "eval_runtime": 37.7438, + "eval_samples_per_second": 52.989, + "eval_steps_per_second": 0.848, + "step": 116 + }, + { + "epoch": 2.45, + "learning_rate": 5.664335664335664e-05, + "loss": 1.1785, + "step": 120 + }, + { + "epoch": 2.45, + "eval_loss": 1.1754933595657349, + "eval_runtime": 37.7128, + "eval_samples_per_second": 53.032, + "eval_steps_per_second": 0.849, + "step": 120 + }, + { + "epoch": 2.53, + "eval_loss": 1.1753153800964355, + "eval_runtime": 37.6929, + "eval_samples_per_second": 53.06, + "eval_steps_per_second": 0.849, + "step": 124 + }, + { + "epoch": 2.61, + "eval_loss": 1.1751618385314941, + "eval_runtime": 37.6913, + "eval_samples_per_second": 53.063, + "eval_steps_per_second": 0.849, + "step": 128 + }, + { + "epoch": 2.69, + "eval_loss": 1.1750015020370483, + "eval_runtime": 37.7589, + "eval_samples_per_second": 52.968, + "eval_steps_per_second": 0.847, + "step": 132 + } + ], + "max_steps": 147, + "num_train_epochs": 3, + "total_flos": 1.5383482920421294e+18, + "trial_name": null, + "trial_params": null +} diff --git a/adapters/saved_llamainstinwild/checkpoint-132/training_args.bin b/adapters/saved_llamainstinwild/checkpoint-132/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..34b95391e17e9d44eb2a04177250246c85716097 --- /dev/null +++ b/adapters/saved_llamainstinwild/checkpoint-132/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:13fbb17fb4210eddefede6fc992fb1cd0ac7bb1d7d94253bc089086b39aba316 +size 3643 diff --git a/adapters/saved_llamainstinwild/checkpoint-136/optimizer.pt b/adapters/saved_llamainstinwild/checkpoint-136/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..dbb4953146e15b75d3db0726ab90eb521fa273fd --- /dev/null +++ b/adapters/saved_llamainstinwild/checkpoint-136/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:af1b608dc1b6242da8ec34237ea442c69473e758c52cd067b862c117d86ab33d +size 33629765 diff --git a/adapters/saved_llamainstinwild/checkpoint-136/pytorch_model.bin b/adapters/saved_llamainstinwild/checkpoint-136/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..047946b42ba49a9a1a582c8943696b2e7544e9f7 --- /dev/null +++ b/adapters/saved_llamainstinwild/checkpoint-136/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bfaaa6ef557c9a6d4a31196bccbed2ed6350aede650b93de57144914c10d1574 +size 16822989 diff --git a/adapters/saved_llamainstinwild/checkpoint-136/rng_state_0.pth b/adapters/saved_llamainstinwild/checkpoint-136/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..aa2acb574d1dbad3cc5225529aa5d6e6b5f74ded --- /dev/null +++ b/adapters/saved_llamainstinwild/checkpoint-136/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7e44597da800be33bab5655f12b293c5864d69daa1230c3cfacd9b1504204ebd +size 14583 diff --git a/adapters/saved_llamainstinwild/checkpoint-136/rng_state_1.pth b/adapters/saved_llamainstinwild/checkpoint-136/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..31762ec963861e632b5495369e02c46e76e9d799 --- /dev/null +++ b/adapters/saved_llamainstinwild/checkpoint-136/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:033dde7db34cf75c7210865aff0601d64b42cee191c5306633a8256663aa96f4 +size 14583 diff --git a/adapters/saved_llamainstinwild/checkpoint-136/rng_state_2.pth b/adapters/saved_llamainstinwild/checkpoint-136/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..35b510b8cf7008fe300c5d66a186f54ca9f6731d --- /dev/null +++ b/adapters/saved_llamainstinwild/checkpoint-136/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:52162e1ea38f57c2f515fabbc3621ca307e768c12be0d508fad6a01dd8d696dd +size 14583 diff --git a/adapters/saved_llamainstinwild/checkpoint-136/rng_state_3.pth b/adapters/saved_llamainstinwild/checkpoint-136/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..ccbe00a8a5961c7f699903fbefc4a9ee374affdf --- /dev/null +++ b/adapters/saved_llamainstinwild/checkpoint-136/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f9b9c8dd122ca7e0a632f0e2442a7df2f348a900553332a4fe733302198c4a68 +size 14583 diff --git a/adapters/saved_llamainstinwild/checkpoint-136/rng_state_4.pth b/adapters/saved_llamainstinwild/checkpoint-136/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..7106aa268ba5d17ccde822500b101ebdc5bfcf58 --- /dev/null +++ b/adapters/saved_llamainstinwild/checkpoint-136/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9981b4234c5ef03c2296645fea2a53dd72f3fe645c7f456eeb492eae1fd50fb7 +size 14583 diff --git a/adapters/saved_llamainstinwild/checkpoint-136/rng_state_5.pth b/adapters/saved_llamainstinwild/checkpoint-136/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..fc7c6dcd8f014e9e526bf8d8cb0b5229d6dcc091 --- /dev/null +++ b/adapters/saved_llamainstinwild/checkpoint-136/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dc57365799e8a672b0ad043f83cf3190cf4ec4c4de54ae4eea5cb486575d340c +size 14583 diff --git a/adapters/saved_llamainstinwild/checkpoint-136/rng_state_6.pth b/adapters/saved_llamainstinwild/checkpoint-136/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..049849f3e1d6049606e5160523a37e16381534bd --- /dev/null +++ b/adapters/saved_llamainstinwild/checkpoint-136/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f1b7df21e5ea822b67c50733b9ed3ebb5336ea4f0bcd24047f93c6108084a43f +size 14583 diff --git a/adapters/saved_llamainstinwild/checkpoint-136/rng_state_7.pth b/adapters/saved_llamainstinwild/checkpoint-136/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..09eee9c29dcac54493f06b30317c4611416e471f --- /dev/null +++ b/adapters/saved_llamainstinwild/checkpoint-136/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:68582f8182f33b899533e244eb6c6a39a2fb3336d6a6da8156324cd5ac6e74ca +size 14583 diff --git a/adapters/saved_llamainstinwild/checkpoint-136/scaler.pt b/adapters/saved_llamainstinwild/checkpoint-136/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..ec809be0b3a877d11b9778bbe97bab10e9bd465e --- /dev/null +++ b/adapters/saved_llamainstinwild/checkpoint-136/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f3ac6fd245fa506f7838f32d5eabe7efc9703cba12b144efcfec1a2bfdae2f16 +size 557 diff --git a/adapters/saved_llamainstinwild/checkpoint-136/scheduler.pt b/adapters/saved_llamainstinwild/checkpoint-136/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..1508ca85826f82978d485b78be6afb22ad7c4566 --- /dev/null +++ b/adapters/saved_llamainstinwild/checkpoint-136/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f24fe249c5c7feeec858ed4d11ef610629707dcde73e69e7ee7d5839e930b815 +size 627 diff --git a/adapters/saved_llamainstinwild/checkpoint-136/trainer_state.json b/adapters/saved_llamainstinwild/checkpoint-136/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..531573da76fce1f2d35d8ab96f7747f462ce737f --- /dev/null +++ b/adapters/saved_llamainstinwild/checkpoint-136/trainer_state.json @@ -0,0 +1,324 @@ +{ + "best_metric": 1.1748673915863037, + "best_model_checkpoint": "/mnt/bn/qingyi-bn-lq/llama/saved_llamainstinwild/checkpoint-136", + "epoch": 2.773741236456342, + "global_step": 136, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.08, + "eval_loss": 1.39645254611969, + "eval_runtime": 37.7333, + "eval_samples_per_second": 53.004, + "eval_steps_per_second": 0.848, + "step": 4 + }, + { + "epoch": 0.16, + "eval_loss": 1.2984137535095215, + "eval_runtime": 37.7164, + "eval_samples_per_second": 53.027, + "eval_steps_per_second": 0.848, + "step": 8 + }, + { + "epoch": 0.24, + "eval_loss": 1.2540132999420166, + "eval_runtime": 37.6847, + "eval_samples_per_second": 53.072, + "eval_steps_per_second": 0.849, + "step": 12 + }, + { + "epoch": 0.33, + "eval_loss": 1.2342448234558105, + "eval_runtime": 37.6879, + "eval_samples_per_second": 53.067, + "eval_steps_per_second": 0.849, + "step": 16 + }, + { + "epoch": 0.41, + "learning_rate": 0.0002664335664335664, + "loss": 1.3125, + "step": 20 + }, + { + "epoch": 0.41, + "eval_loss": 1.2209277153015137, + "eval_runtime": 37.7168, + "eval_samples_per_second": 53.027, + "eval_steps_per_second": 0.848, + "step": 20 + }, + { + "epoch": 0.49, + "eval_loss": 1.2117934226989746, + "eval_runtime": 37.6897, + "eval_samples_per_second": 53.065, + "eval_steps_per_second": 0.849, + "step": 24 + }, + { + "epoch": 0.57, + "eval_loss": 1.2057584524154663, + "eval_runtime": 37.7691, + "eval_samples_per_second": 52.953, + "eval_steps_per_second": 0.847, + "step": 28 + }, + { + "epoch": 0.65, + "eval_loss": 1.2009178400039673, + "eval_runtime": 37.7026, + "eval_samples_per_second": 53.047, + "eval_steps_per_second": 0.849, + "step": 32 + }, + { + "epoch": 0.73, + "eval_loss": 1.197250485420227, + "eval_runtime": 37.6765, + "eval_samples_per_second": 53.084, + "eval_steps_per_second": 0.849, + "step": 36 + }, + { + "epoch": 0.82, + "learning_rate": 0.00022447552447552445, + "loss": 1.2095, + "step": 40 + }, + { + "epoch": 0.82, + "eval_loss": 1.1943359375, + "eval_runtime": 37.6881, + "eval_samples_per_second": 53.067, + "eval_steps_per_second": 0.849, + "step": 40 + }, + { + "epoch": 0.9, + "eval_loss": 1.1920316219329834, + "eval_runtime": 37.694, + "eval_samples_per_second": 53.059, + "eval_steps_per_second": 0.849, + "step": 44 + }, + { + "epoch": 0.98, + "eval_loss": 1.1898770332336426, + "eval_runtime": 37.7089, + "eval_samples_per_second": 53.038, + "eval_steps_per_second": 0.849, + "step": 48 + }, + { + "epoch": 1.06, + "eval_loss": 1.1879488229751587, + "eval_runtime": 37.6975, + "eval_samples_per_second": 53.054, + "eval_steps_per_second": 0.849, + "step": 52 + }, + { + "epoch": 1.14, + "eval_loss": 1.1864111423492432, + "eval_runtime": 37.7188, + "eval_samples_per_second": 53.024, + "eval_steps_per_second": 0.848, + "step": 56 + }, + { + "epoch": 1.22, + "learning_rate": 0.00018251748251748253, + "loss": 1.188, + "step": 60 + }, + { + "epoch": 1.22, + "eval_loss": 1.1849411725997925, + "eval_runtime": 37.7686, + "eval_samples_per_second": 52.954, + "eval_steps_per_second": 0.847, + "step": 60 + }, + { + "epoch": 1.31, + "eval_loss": 1.1838239431381226, + "eval_runtime": 37.6945, + "eval_samples_per_second": 53.058, + "eval_steps_per_second": 0.849, + "step": 64 + }, + { + "epoch": 1.39, + "eval_loss": 1.182841181755066, + "eval_runtime": 37.6624, + "eval_samples_per_second": 53.103, + "eval_steps_per_second": 0.85, + "step": 68 + }, + { + "epoch": 1.47, + "eval_loss": 1.1818122863769531, + "eval_runtime": 37.7233, + "eval_samples_per_second": 53.018, + "eval_steps_per_second": 0.848, + "step": 72 + }, + { + "epoch": 1.55, + "eval_loss": 1.1808911561965942, + "eval_runtime": 37.7038, + "eval_samples_per_second": 53.045, + "eval_steps_per_second": 0.849, + "step": 76 + }, + { + "epoch": 1.63, + "learning_rate": 0.00014055944055944055, + "loss": 1.1876, + "step": 80 + }, + { + "epoch": 1.63, + "eval_loss": 1.1801403760910034, + "eval_runtime": 37.7277, + "eval_samples_per_second": 53.011, + "eval_steps_per_second": 0.848, + "step": 80 + }, + { + "epoch": 1.71, + "eval_loss": 1.1793811321258545, + "eval_runtime": 37.6641, + "eval_samples_per_second": 53.101, + "eval_steps_per_second": 0.85, + "step": 84 + }, + { + "epoch": 1.79, + "eval_loss": 1.1788283586502075, + "eval_runtime": 37.7097, + "eval_samples_per_second": 53.037, + "eval_steps_per_second": 0.849, + "step": 88 + }, + { + "epoch": 1.88, + "eval_loss": 1.1783016920089722, + "eval_runtime": 37.7104, + "eval_samples_per_second": 53.036, + "eval_steps_per_second": 0.849, + "step": 92 + }, + { + "epoch": 1.96, + "eval_loss": 1.177699327468872, + "eval_runtime": 37.6884, + "eval_samples_per_second": 53.067, + "eval_steps_per_second": 0.849, + "step": 96 + }, + { + "epoch": 2.04, + "learning_rate": 9.860139860139858e-05, + "loss": 1.182, + "step": 100 + }, + { + "epoch": 2.04, + "eval_loss": 1.1773412227630615, + "eval_runtime": 37.7134, + "eval_samples_per_second": 53.032, + "eval_steps_per_second": 0.849, + "step": 100 + }, + { + "epoch": 2.12, + "eval_loss": 1.176965594291687, + "eval_runtime": 37.6735, + "eval_samples_per_second": 53.088, + "eval_steps_per_second": 0.849, + "step": 104 + }, + { + "epoch": 2.2, + "eval_loss": 1.1765486001968384, + "eval_runtime": 37.7771, + "eval_samples_per_second": 52.942, + "eval_steps_per_second": 0.847, + "step": 108 + }, + { + "epoch": 2.28, + "eval_loss": 1.1761963367462158, + "eval_runtime": 37.7211, + "eval_samples_per_second": 53.021, + "eval_steps_per_second": 0.848, + "step": 112 + }, + { + "epoch": 2.37, + "eval_loss": 1.1757991313934326, + "eval_runtime": 37.7438, + "eval_samples_per_second": 52.989, + "eval_steps_per_second": 0.848, + "step": 116 + }, + { + "epoch": 2.45, + "learning_rate": 5.664335664335664e-05, + "loss": 1.1785, + "step": 120 + }, + { + "epoch": 2.45, + "eval_loss": 1.1754933595657349, + "eval_runtime": 37.7128, + "eval_samples_per_second": 53.032, + "eval_steps_per_second": 0.849, + "step": 120 + }, + { + "epoch": 2.53, + "eval_loss": 1.1753153800964355, + "eval_runtime": 37.6929, + "eval_samples_per_second": 53.06, + "eval_steps_per_second": 0.849, + "step": 124 + }, + { + "epoch": 2.61, + "eval_loss": 1.1751618385314941, + "eval_runtime": 37.6913, + "eval_samples_per_second": 53.063, + "eval_steps_per_second": 0.849, + "step": 128 + }, + { + "epoch": 2.69, + "eval_loss": 1.1750015020370483, + "eval_runtime": 37.7589, + "eval_samples_per_second": 52.968, + "eval_steps_per_second": 0.847, + "step": 132 + }, + { + "epoch": 2.77, + "eval_loss": 1.1748673915863037, + "eval_runtime": 37.6659, + "eval_samples_per_second": 53.098, + "eval_steps_per_second": 0.85, + "step": 136 + } + ], + "max_steps": 147, + "num_train_epochs": 3, + "total_flos": 1.5849463077918474e+18, + "trial_name": null, + "trial_params": null +} diff --git a/adapters/saved_llamainstinwild/checkpoint-136/training_args.bin b/adapters/saved_llamainstinwild/checkpoint-136/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..34b95391e17e9d44eb2a04177250246c85716097 --- /dev/null +++ b/adapters/saved_llamainstinwild/checkpoint-136/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:13fbb17fb4210eddefede6fc992fb1cd0ac7bb1d7d94253bc089086b39aba316 +size 3643 diff --git a/adapters/saved_llamainstinwild/checkpoint-140/optimizer.pt b/adapters/saved_llamainstinwild/checkpoint-140/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..f53c00b786b09fe8f2857ddc6950851b88ed929d --- /dev/null +++ b/adapters/saved_llamainstinwild/checkpoint-140/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:560d11f3ded100e9779f5074f16f41ba4faf28b40c65a6d6af3044e1101070f2 +size 33629765 diff --git a/adapters/saved_llamainstinwild/checkpoint-140/pytorch_model.bin b/adapters/saved_llamainstinwild/checkpoint-140/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..a72995d2bc8a9a78d3d6c02abe97ac63f4404077 --- /dev/null +++ b/adapters/saved_llamainstinwild/checkpoint-140/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f47a205823b28aa209dc8ed97e0c1be54b0a340544a86924135536f9a5260111 +size 16822989 diff --git a/adapters/saved_llamainstinwild/checkpoint-140/rng_state_0.pth b/adapters/saved_llamainstinwild/checkpoint-140/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..fda2b99d12c8e774c0934173743564983a4a8d2b --- /dev/null +++ b/adapters/saved_llamainstinwild/checkpoint-140/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2bffc063a77924983be12170af61e8f1f6de119b87c93ad37b1f03ca73c65500 +size 14583 diff --git a/adapters/saved_llamainstinwild/checkpoint-140/rng_state_1.pth b/adapters/saved_llamainstinwild/checkpoint-140/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..7d3a07be8f4f7287a1c4dbd0f654bd12b4b0291c --- /dev/null +++ b/adapters/saved_llamainstinwild/checkpoint-140/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b7a52cf18ac3d649d5b73afe4d57647715a0db8c41add8c8b0017e383a88eba7 +size 14583 diff --git a/adapters/saved_llamainstinwild/checkpoint-140/rng_state_2.pth b/adapters/saved_llamainstinwild/checkpoint-140/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..91567401aa82179fefb6a6c4a44d309857c57a58 --- /dev/null +++ b/adapters/saved_llamainstinwild/checkpoint-140/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:690e857eca4109838830d22c8ea2e94e56f9a375a999cab54f118536dc72e584 +size 14583 diff --git a/adapters/saved_llamainstinwild/checkpoint-140/rng_state_3.pth b/adapters/saved_llamainstinwild/checkpoint-140/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..807ed2a600dc583d88d5a4be54f6e4a502479b0b --- /dev/null +++ b/adapters/saved_llamainstinwild/checkpoint-140/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6500a606633e4b11a743cc314524b3cf1969f164d8afb350f6aafc2385c074a4 +size 14583 diff --git a/adapters/saved_llamainstinwild/checkpoint-140/rng_state_4.pth b/adapters/saved_llamainstinwild/checkpoint-140/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..a0982852b04d0c708819a8658e67dd576956b959 --- /dev/null +++ b/adapters/saved_llamainstinwild/checkpoint-140/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:704444ace2699c7600ad864d8bf8b1434dfe163d7774314ece6ecc72b0c318ff +size 14583 diff --git a/adapters/saved_llamainstinwild/checkpoint-140/rng_state_5.pth b/adapters/saved_llamainstinwild/checkpoint-140/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..8db35d97fd9531a1504b743c32ae8e8a0484a4b7 --- /dev/null +++ b/adapters/saved_llamainstinwild/checkpoint-140/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:766cbe677b806233b243b907b94d5bdff1b6f6eee6d6dce7bf7eedc0fdc335dd +size 14583 diff --git a/adapters/saved_llamainstinwild/checkpoint-140/rng_state_6.pth b/adapters/saved_llamainstinwild/checkpoint-140/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..087821f80e3223e270714c09263dd890c672474f --- /dev/null +++ b/adapters/saved_llamainstinwild/checkpoint-140/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:28dc67d7a9823e8aa3c32fea7a2cdb4fa8d772565a7d0186659c4d64fd6b3854 +size 14583 diff --git a/adapters/saved_llamainstinwild/checkpoint-140/rng_state_7.pth b/adapters/saved_llamainstinwild/checkpoint-140/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..154eccf0576baab6455d78a03e7099c5d81b0226 --- /dev/null +++ b/adapters/saved_llamainstinwild/checkpoint-140/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cbdd7aed0ab1ca3daa4197d43a472b4f7d97cd0da2c141200ba41325347ed4a6 +size 14583 diff --git a/adapters/saved_llamainstinwild/checkpoint-140/scaler.pt b/adapters/saved_llamainstinwild/checkpoint-140/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..55fa1829a47dec88992fc07d045812a491ead8ee --- /dev/null +++ b/adapters/saved_llamainstinwild/checkpoint-140/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f7dbbfbece2fede2a53cc80ae296080e69d39722b90afa9ebeb253ed2579b6c4 +size 557 diff --git a/adapters/saved_llamainstinwild/checkpoint-140/scheduler.pt b/adapters/saved_llamainstinwild/checkpoint-140/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..af7329856e91947decaa161cd64494b6587e9c5a --- /dev/null +++ b/adapters/saved_llamainstinwild/checkpoint-140/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a3939f515bb5a1df073eed8ac8fd72d35dbaa72c1ccb9ad319bc8f05d4cbf63 +size 627 diff --git a/adapters/saved_llamainstinwild/checkpoint-140/trainer_state.json b/adapters/saved_llamainstinwild/checkpoint-140/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..de763fdce99ddcbf4c0b1510a60857a71c78173e --- /dev/null +++ b/adapters/saved_llamainstinwild/checkpoint-140/trainer_state.json @@ -0,0 +1,338 @@ +{ + "best_metric": 1.1747664213180542, + "best_model_checkpoint": "/mnt/bn/qingyi-bn-lq/llama/saved_llamainstinwild/checkpoint-140", + "epoch": 2.8553218610579987, + "global_step": 140, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.08, + "eval_loss": 1.39645254611969, + "eval_runtime": 37.7333, + "eval_samples_per_second": 53.004, + "eval_steps_per_second": 0.848, + "step": 4 + }, + { + "epoch": 0.16, + "eval_loss": 1.2984137535095215, + "eval_runtime": 37.7164, + "eval_samples_per_second": 53.027, + "eval_steps_per_second": 0.848, + "step": 8 + }, + { + "epoch": 0.24, + "eval_loss": 1.2540132999420166, + "eval_runtime": 37.6847, + "eval_samples_per_second": 53.072, + "eval_steps_per_second": 0.849, + "step": 12 + }, + { + "epoch": 0.33, + "eval_loss": 1.2342448234558105, + "eval_runtime": 37.6879, + "eval_samples_per_second": 53.067, + "eval_steps_per_second": 0.849, + "step": 16 + }, + { + "epoch": 0.41, + "learning_rate": 0.0002664335664335664, + "loss": 1.3125, + "step": 20 + }, + { + "epoch": 0.41, + "eval_loss": 1.2209277153015137, + "eval_runtime": 37.7168, + "eval_samples_per_second": 53.027, + "eval_steps_per_second": 0.848, + "step": 20 + }, + { + "epoch": 0.49, + "eval_loss": 1.2117934226989746, + "eval_runtime": 37.6897, + "eval_samples_per_second": 53.065, + "eval_steps_per_second": 0.849, + "step": 24 + }, + { + "epoch": 0.57, + "eval_loss": 1.2057584524154663, + "eval_runtime": 37.7691, + "eval_samples_per_second": 52.953, + "eval_steps_per_second": 0.847, + "step": 28 + }, + { + "epoch": 0.65, + "eval_loss": 1.2009178400039673, + "eval_runtime": 37.7026, + "eval_samples_per_second": 53.047, + "eval_steps_per_second": 0.849, + "step": 32 + }, + { + "epoch": 0.73, + "eval_loss": 1.197250485420227, + "eval_runtime": 37.6765, + "eval_samples_per_second": 53.084, + "eval_steps_per_second": 0.849, + "step": 36 + }, + { + "epoch": 0.82, + "learning_rate": 0.00022447552447552445, + "loss": 1.2095, + "step": 40 + }, + { + "epoch": 0.82, + "eval_loss": 1.1943359375, + "eval_runtime": 37.6881, + "eval_samples_per_second": 53.067, + "eval_steps_per_second": 0.849, + "step": 40 + }, + { + "epoch": 0.9, + "eval_loss": 1.1920316219329834, + "eval_runtime": 37.694, + "eval_samples_per_second": 53.059, + "eval_steps_per_second": 0.849, + "step": 44 + }, + { + "epoch": 0.98, + "eval_loss": 1.1898770332336426, + "eval_runtime": 37.7089, + "eval_samples_per_second": 53.038, + "eval_steps_per_second": 0.849, + "step": 48 + }, + { + "epoch": 1.06, + "eval_loss": 1.1879488229751587, + "eval_runtime": 37.6975, + "eval_samples_per_second": 53.054, + "eval_steps_per_second": 0.849, + "step": 52 + }, + { + "epoch": 1.14, + "eval_loss": 1.1864111423492432, + "eval_runtime": 37.7188, + "eval_samples_per_second": 53.024, + "eval_steps_per_second": 0.848, + "step": 56 + }, + { + "epoch": 1.22, + "learning_rate": 0.00018251748251748253, + "loss": 1.188, + "step": 60 + }, + { + "epoch": 1.22, + "eval_loss": 1.1849411725997925, + "eval_runtime": 37.7686, + "eval_samples_per_second": 52.954, + "eval_steps_per_second": 0.847, + "step": 60 + }, + { + "epoch": 1.31, + "eval_loss": 1.1838239431381226, + "eval_runtime": 37.6945, + "eval_samples_per_second": 53.058, + "eval_steps_per_second": 0.849, + "step": 64 + }, + { + "epoch": 1.39, + "eval_loss": 1.182841181755066, + "eval_runtime": 37.6624, + "eval_samples_per_second": 53.103, + "eval_steps_per_second": 0.85, + "step": 68 + }, + { + "epoch": 1.47, + "eval_loss": 1.1818122863769531, + "eval_runtime": 37.7233, + "eval_samples_per_second": 53.018, + "eval_steps_per_second": 0.848, + "step": 72 + }, + { + "epoch": 1.55, + "eval_loss": 1.1808911561965942, + "eval_runtime": 37.7038, + "eval_samples_per_second": 53.045, + "eval_steps_per_second": 0.849, + "step": 76 + }, + { + "epoch": 1.63, + "learning_rate": 0.00014055944055944055, + "loss": 1.1876, + "step": 80 + }, + { + "epoch": 1.63, + "eval_loss": 1.1801403760910034, + "eval_runtime": 37.7277, + "eval_samples_per_second": 53.011, + "eval_steps_per_second": 0.848, + "step": 80 + }, + { + "epoch": 1.71, + "eval_loss": 1.1793811321258545, + "eval_runtime": 37.6641, + "eval_samples_per_second": 53.101, + "eval_steps_per_second": 0.85, + "step": 84 + }, + { + "epoch": 1.79, + "eval_loss": 1.1788283586502075, + "eval_runtime": 37.7097, + "eval_samples_per_second": 53.037, + "eval_steps_per_second": 0.849, + "step": 88 + }, + { + "epoch": 1.88, + "eval_loss": 1.1783016920089722, + "eval_runtime": 37.7104, + "eval_samples_per_second": 53.036, + "eval_steps_per_second": 0.849, + "step": 92 + }, + { + "epoch": 1.96, + "eval_loss": 1.177699327468872, + "eval_runtime": 37.6884, + "eval_samples_per_second": 53.067, + "eval_steps_per_second": 0.849, + "step": 96 + }, + { + "epoch": 2.04, + "learning_rate": 9.860139860139858e-05, + "loss": 1.182, + "step": 100 + }, + { + "epoch": 2.04, + "eval_loss": 1.1773412227630615, + "eval_runtime": 37.7134, + "eval_samples_per_second": 53.032, + "eval_steps_per_second": 0.849, + "step": 100 + }, + { + "epoch": 2.12, + "eval_loss": 1.176965594291687, + "eval_runtime": 37.6735, + "eval_samples_per_second": 53.088, + "eval_steps_per_second": 0.849, + "step": 104 + }, + { + "epoch": 2.2, + "eval_loss": 1.1765486001968384, + "eval_runtime": 37.7771, + "eval_samples_per_second": 52.942, + "eval_steps_per_second": 0.847, + "step": 108 + }, + { + "epoch": 2.28, + "eval_loss": 1.1761963367462158, + "eval_runtime": 37.7211, + "eval_samples_per_second": 53.021, + "eval_steps_per_second": 0.848, + "step": 112 + }, + { + "epoch": 2.37, + "eval_loss": 1.1757991313934326, + "eval_runtime": 37.7438, + "eval_samples_per_second": 52.989, + "eval_steps_per_second": 0.848, + "step": 116 + }, + { + "epoch": 2.45, + "learning_rate": 5.664335664335664e-05, + "loss": 1.1785, + "step": 120 + }, + { + "epoch": 2.45, + "eval_loss": 1.1754933595657349, + "eval_runtime": 37.7128, + "eval_samples_per_second": 53.032, + "eval_steps_per_second": 0.849, + "step": 120 + }, + { + "epoch": 2.53, + "eval_loss": 1.1753153800964355, + "eval_runtime": 37.6929, + "eval_samples_per_second": 53.06, + "eval_steps_per_second": 0.849, + "step": 124 + }, + { + "epoch": 2.61, + "eval_loss": 1.1751618385314941, + "eval_runtime": 37.6913, + "eval_samples_per_second": 53.063, + "eval_steps_per_second": 0.849, + "step": 128 + }, + { + "epoch": 2.69, + "eval_loss": 1.1750015020370483, + "eval_runtime": 37.7589, + "eval_samples_per_second": 52.968, + "eval_steps_per_second": 0.847, + "step": 132 + }, + { + "epoch": 2.77, + "eval_loss": 1.1748673915863037, + "eval_runtime": 37.6659, + "eval_samples_per_second": 53.098, + "eval_steps_per_second": 0.85, + "step": 136 + }, + { + "epoch": 2.86, + "learning_rate": 1.4685314685314684e-05, + "loss": 1.1756, + "step": 140 + }, + { + "epoch": 2.86, + "eval_loss": 1.1747664213180542, + "eval_runtime": 37.6655, + "eval_samples_per_second": 53.099, + "eval_steps_per_second": 0.85, + "step": 140 + } + ], + "max_steps": 147, + "num_train_epochs": 3, + "total_flos": 1.6316480985413714e+18, + "trial_name": null, + "trial_params": null +} diff --git a/adapters/saved_llamainstinwild/checkpoint-140/training_args.bin b/adapters/saved_llamainstinwild/checkpoint-140/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..34b95391e17e9d44eb2a04177250246c85716097 --- /dev/null +++ b/adapters/saved_llamainstinwild/checkpoint-140/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:13fbb17fb4210eddefede6fc992fb1cd0ac7bb1d7d94253bc089086b39aba316 +size 3643 diff --git a/adapters/saved_llamainstinwild/checkpoint-144/optimizer.pt b/adapters/saved_llamainstinwild/checkpoint-144/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..5f4e3d48d2de377175902a960750b5fb9d3919e6 --- /dev/null +++ b/adapters/saved_llamainstinwild/checkpoint-144/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c7a9ba02228d953d1e6175d65dfbb7b937b414891ff49bd82595717d4a712b45 +size 33629765 diff --git a/adapters/saved_llamainstinwild/checkpoint-144/pytorch_model.bin b/adapters/saved_llamainstinwild/checkpoint-144/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..c8bb72607093c99e78a2dd3709d06114ddf80a68 --- /dev/null +++ b/adapters/saved_llamainstinwild/checkpoint-144/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:94fd3e70a0d6bf30a9c76a1c822d388b8b78363a930ca5e9f3519487dcb2b1f4 +size 16822989 diff --git a/adapters/saved_llamainstinwild/checkpoint-144/rng_state_0.pth b/adapters/saved_llamainstinwild/checkpoint-144/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..74fd75f750b39688792b7f7fc2b0ea73cef24226 --- /dev/null +++ b/adapters/saved_llamainstinwild/checkpoint-144/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:36654c91b9f1411b9afc31b742364af0e1bcc9c8c1a999571336edcf5faf02e3 +size 14583 diff --git a/adapters/saved_llamainstinwild/checkpoint-144/rng_state_1.pth b/adapters/saved_llamainstinwild/checkpoint-144/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..46720edc3a2ef0fd75c868402bcd3871d4493fe5 --- /dev/null +++ b/adapters/saved_llamainstinwild/checkpoint-144/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e964e55458570fc6e485a00a36ccb59a6c8fd2aa794e0ae4931095ae9c18f8a +size 14583 diff --git a/adapters/saved_llamainstinwild/checkpoint-144/rng_state_2.pth b/adapters/saved_llamainstinwild/checkpoint-144/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..5fc5e75bc53b52e73de1f4184cc6430848245d96 --- /dev/null +++ b/adapters/saved_llamainstinwild/checkpoint-144/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:91ff2d4cf5f1a943d97eb29f4be54f98d196a8cbce5a0ea845c71332193c9f79 +size 14583 diff --git a/adapters/saved_llamainstinwild/checkpoint-144/rng_state_3.pth b/adapters/saved_llamainstinwild/checkpoint-144/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..24679ea7eff9cdf9e483b70e652a135993065be7 --- /dev/null +++ b/adapters/saved_llamainstinwild/checkpoint-144/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:85616486c0c23789d7c05f4706e369720ed322206ce7f399c0e5c9e8b0d5836a +size 14583 diff --git a/adapters/saved_llamainstinwild/checkpoint-144/rng_state_4.pth b/adapters/saved_llamainstinwild/checkpoint-144/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..1868cc650abe1b5a4efcda242ac1fdeaa6ea9e66 --- /dev/null +++ b/adapters/saved_llamainstinwild/checkpoint-144/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8bf245c7cccac365cf4556f2224fe779f69a5fd2976511c8f12ac559e7260c66 +size 14583 diff --git a/adapters/saved_llamainstinwild/checkpoint-144/rng_state_5.pth b/adapters/saved_llamainstinwild/checkpoint-144/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..b84eeb2d353e0d716b98256b481691b23a9085c2 --- /dev/null +++ b/adapters/saved_llamainstinwild/checkpoint-144/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:325b4174f2674f64796346b7a7d7c9829f23a1721ae804d8c98c2c38b4f3a423 +size 14583 diff --git a/adapters/saved_llamainstinwild/checkpoint-144/rng_state_6.pth b/adapters/saved_llamainstinwild/checkpoint-144/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..bdbc35e6d7e6f5f5b73bcbe96c6bd40a574b9265 --- /dev/null +++ b/adapters/saved_llamainstinwild/checkpoint-144/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:462ace530b5a02bfe624c371768bad89d9560e053f11ed47874cf1a0af533841 +size 14583 diff --git a/adapters/saved_llamainstinwild/checkpoint-144/rng_state_7.pth b/adapters/saved_llamainstinwild/checkpoint-144/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..362d07c93c492b68f6016f76d87a7ad85ee6d846 --- /dev/null +++ b/adapters/saved_llamainstinwild/checkpoint-144/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bb86e64de56280ae272f8c8a41902b4993b0f79e3ee63a769b865a826f453e0c +size 14583 diff --git a/adapters/saved_llamainstinwild/checkpoint-144/scaler.pt b/adapters/saved_llamainstinwild/checkpoint-144/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..971959d27b7b9707a25391a1ef838066589b9020 --- /dev/null +++ b/adapters/saved_llamainstinwild/checkpoint-144/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e592f9928565d75dd7e8d26fe4e2f55b9133cf26cb2f665777ebcd048159241d +size 557 diff --git a/adapters/saved_llamainstinwild/checkpoint-144/scheduler.pt b/adapters/saved_llamainstinwild/checkpoint-144/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..66da239342cec6e1b650ef3d343d9b6e896caecc --- /dev/null +++ b/adapters/saved_llamainstinwild/checkpoint-144/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e9789a64e2c584b71f261c70b633cf5bfbfed74ab91b05873b0bd3efd6fc72ba +size 627 diff --git a/adapters/saved_llamainstinwild/checkpoint-144/trainer_state.json b/adapters/saved_llamainstinwild/checkpoint-144/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..f6614cd7a6085aef8727f2bca902d3323e5be4e6 --- /dev/null +++ b/adapters/saved_llamainstinwild/checkpoint-144/trainer_state.json @@ -0,0 +1,346 @@ +{ + "best_metric": 1.1747314929962158, + "best_model_checkpoint": "/mnt/bn/qingyi-bn-lq/llama/saved_llamainstinwild/checkpoint-144", + "epoch": 2.936902485659656, + "global_step": 144, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.08, + "eval_loss": 1.39645254611969, + "eval_runtime": 37.7333, + "eval_samples_per_second": 53.004, + "eval_steps_per_second": 0.848, + "step": 4 + }, + { + "epoch": 0.16, + "eval_loss": 1.2984137535095215, + "eval_runtime": 37.7164, + "eval_samples_per_second": 53.027, + "eval_steps_per_second": 0.848, + "step": 8 + }, + { + "epoch": 0.24, + "eval_loss": 1.2540132999420166, + "eval_runtime": 37.6847, + "eval_samples_per_second": 53.072, + "eval_steps_per_second": 0.849, + "step": 12 + }, + { + "epoch": 0.33, + "eval_loss": 1.2342448234558105, + "eval_runtime": 37.6879, + "eval_samples_per_second": 53.067, + "eval_steps_per_second": 0.849, + "step": 16 + }, + { + "epoch": 0.41, + "learning_rate": 0.0002664335664335664, + "loss": 1.3125, + "step": 20 + }, + { + "epoch": 0.41, + "eval_loss": 1.2209277153015137, + "eval_runtime": 37.7168, + "eval_samples_per_second": 53.027, + "eval_steps_per_second": 0.848, + "step": 20 + }, + { + "epoch": 0.49, + "eval_loss": 1.2117934226989746, + "eval_runtime": 37.6897, + "eval_samples_per_second": 53.065, + "eval_steps_per_second": 0.849, + "step": 24 + }, + { + "epoch": 0.57, + "eval_loss": 1.2057584524154663, + "eval_runtime": 37.7691, + "eval_samples_per_second": 52.953, + "eval_steps_per_second": 0.847, + "step": 28 + }, + { + "epoch": 0.65, + "eval_loss": 1.2009178400039673, + "eval_runtime": 37.7026, + "eval_samples_per_second": 53.047, + "eval_steps_per_second": 0.849, + "step": 32 + }, + { + "epoch": 0.73, + "eval_loss": 1.197250485420227, + "eval_runtime": 37.6765, + "eval_samples_per_second": 53.084, + "eval_steps_per_second": 0.849, + "step": 36 + }, + { + "epoch": 0.82, + "learning_rate": 0.00022447552447552445, + "loss": 1.2095, + "step": 40 + }, + { + "epoch": 0.82, + "eval_loss": 1.1943359375, + "eval_runtime": 37.6881, + "eval_samples_per_second": 53.067, + "eval_steps_per_second": 0.849, + "step": 40 + }, + { + "epoch": 0.9, + "eval_loss": 1.1920316219329834, + "eval_runtime": 37.694, + "eval_samples_per_second": 53.059, + "eval_steps_per_second": 0.849, + "step": 44 + }, + { + "epoch": 0.98, + "eval_loss": 1.1898770332336426, + "eval_runtime": 37.7089, + "eval_samples_per_second": 53.038, + "eval_steps_per_second": 0.849, + "step": 48 + }, + { + "epoch": 1.06, + "eval_loss": 1.1879488229751587, + "eval_runtime": 37.6975, + "eval_samples_per_second": 53.054, + "eval_steps_per_second": 0.849, + "step": 52 + }, + { + "epoch": 1.14, + "eval_loss": 1.1864111423492432, + "eval_runtime": 37.7188, + "eval_samples_per_second": 53.024, + "eval_steps_per_second": 0.848, + "step": 56 + }, + { + "epoch": 1.22, + "learning_rate": 0.00018251748251748253, + "loss": 1.188, + "step": 60 + }, + { + "epoch": 1.22, + "eval_loss": 1.1849411725997925, + "eval_runtime": 37.7686, + "eval_samples_per_second": 52.954, + "eval_steps_per_second": 0.847, + "step": 60 + }, + { + "epoch": 1.31, + "eval_loss": 1.1838239431381226, + "eval_runtime": 37.6945, + "eval_samples_per_second": 53.058, + "eval_steps_per_second": 0.849, + "step": 64 + }, + { + "epoch": 1.39, + "eval_loss": 1.182841181755066, + "eval_runtime": 37.6624, + "eval_samples_per_second": 53.103, + "eval_steps_per_second": 0.85, + "step": 68 + }, + { + "epoch": 1.47, + "eval_loss": 1.1818122863769531, + "eval_runtime": 37.7233, + "eval_samples_per_second": 53.018, + "eval_steps_per_second": 0.848, + "step": 72 + }, + { + "epoch": 1.55, + "eval_loss": 1.1808911561965942, + "eval_runtime": 37.7038, + "eval_samples_per_second": 53.045, + "eval_steps_per_second": 0.849, + "step": 76 + }, + { + "epoch": 1.63, + "learning_rate": 0.00014055944055944055, + "loss": 1.1876, + "step": 80 + }, + { + "epoch": 1.63, + "eval_loss": 1.1801403760910034, + "eval_runtime": 37.7277, + "eval_samples_per_second": 53.011, + "eval_steps_per_second": 0.848, + "step": 80 + }, + { + "epoch": 1.71, + "eval_loss": 1.1793811321258545, + "eval_runtime": 37.6641, + "eval_samples_per_second": 53.101, + "eval_steps_per_second": 0.85, + "step": 84 + }, + { + "epoch": 1.79, + "eval_loss": 1.1788283586502075, + "eval_runtime": 37.7097, + "eval_samples_per_second": 53.037, + "eval_steps_per_second": 0.849, + "step": 88 + }, + { + "epoch": 1.88, + "eval_loss": 1.1783016920089722, + "eval_runtime": 37.7104, + "eval_samples_per_second": 53.036, + "eval_steps_per_second": 0.849, + "step": 92 + }, + { + "epoch": 1.96, + "eval_loss": 1.177699327468872, + "eval_runtime": 37.6884, + "eval_samples_per_second": 53.067, + "eval_steps_per_second": 0.849, + "step": 96 + }, + { + "epoch": 2.04, + "learning_rate": 9.860139860139858e-05, + "loss": 1.182, + "step": 100 + }, + { + "epoch": 2.04, + "eval_loss": 1.1773412227630615, + "eval_runtime": 37.7134, + "eval_samples_per_second": 53.032, + "eval_steps_per_second": 0.849, + "step": 100 + }, + { + "epoch": 2.12, + "eval_loss": 1.176965594291687, + "eval_runtime": 37.6735, + "eval_samples_per_second": 53.088, + "eval_steps_per_second": 0.849, + "step": 104 + }, + { + "epoch": 2.2, + "eval_loss": 1.1765486001968384, + "eval_runtime": 37.7771, + "eval_samples_per_second": 52.942, + "eval_steps_per_second": 0.847, + "step": 108 + }, + { + "epoch": 2.28, + "eval_loss": 1.1761963367462158, + "eval_runtime": 37.7211, + "eval_samples_per_second": 53.021, + "eval_steps_per_second": 0.848, + "step": 112 + }, + { + "epoch": 2.37, + "eval_loss": 1.1757991313934326, + "eval_runtime": 37.7438, + "eval_samples_per_second": 52.989, + "eval_steps_per_second": 0.848, + "step": 116 + }, + { + "epoch": 2.45, + "learning_rate": 5.664335664335664e-05, + "loss": 1.1785, + "step": 120 + }, + { + "epoch": 2.45, + "eval_loss": 1.1754933595657349, + "eval_runtime": 37.7128, + "eval_samples_per_second": 53.032, + "eval_steps_per_second": 0.849, + "step": 120 + }, + { + "epoch": 2.53, + "eval_loss": 1.1753153800964355, + "eval_runtime": 37.6929, + "eval_samples_per_second": 53.06, + "eval_steps_per_second": 0.849, + "step": 124 + }, + { + "epoch": 2.61, + "eval_loss": 1.1751618385314941, + "eval_runtime": 37.6913, + "eval_samples_per_second": 53.063, + "eval_steps_per_second": 0.849, + "step": 128 + }, + { + "epoch": 2.69, + "eval_loss": 1.1750015020370483, + "eval_runtime": 37.7589, + "eval_samples_per_second": 52.968, + "eval_steps_per_second": 0.847, + "step": 132 + }, + { + "epoch": 2.77, + "eval_loss": 1.1748673915863037, + "eval_runtime": 37.6659, + "eval_samples_per_second": 53.098, + "eval_steps_per_second": 0.85, + "step": 136 + }, + { + "epoch": 2.86, + "learning_rate": 1.4685314685314684e-05, + "loss": 1.1756, + "step": 140 + }, + { + "epoch": 2.86, + "eval_loss": 1.1747664213180542, + "eval_runtime": 37.6655, + "eval_samples_per_second": 53.099, + "eval_steps_per_second": 0.85, + "step": 140 + }, + { + "epoch": 2.94, + "eval_loss": 1.1747314929962158, + "eval_runtime": 37.7383, + "eval_samples_per_second": 52.997, + "eval_steps_per_second": 0.848, + "step": 144 + } + ], + "max_steps": 147, + "num_train_epochs": 3, + "total_flos": 1.6787640358073795e+18, + "trial_name": null, + "trial_params": null +} diff --git a/adapters/saved_llamainstinwild/checkpoint-144/training_args.bin b/adapters/saved_llamainstinwild/checkpoint-144/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..34b95391e17e9d44eb2a04177250246c85716097 --- /dev/null +++ b/adapters/saved_llamainstinwild/checkpoint-144/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:13fbb17fb4210eddefede6fc992fb1cd0ac7bb1d7d94253bc089086b39aba316 +size 3643 diff --git a/adapters/saved_llamaprosocial_dialog/adapter_config.json b/adapters/saved_llamaprosocial_dialog/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d07a7b67f6c57422ee3209e5778eb76d5dfe37c6 --- /dev/null +++ b/adapters/saved_llamaprosocial_dialog/adapter_config.json @@ -0,0 +1,17 @@ +{ + "base_model_name_or_path": "/mnt/bn/qingyi-bn-lq/llama/llama-7b-hf/", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "lora_alpha": 16, + "lora_dropout": 0.05, + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/adapters/saved_llamaprosocial_dialog/adapter_model.bin b/adapters/saved_llamaprosocial_dialog/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..8710093665dfdb9819e2f8817a1e25a4ccdd9935 --- /dev/null +++ b/adapters/saved_llamaprosocial_dialog/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e5e1621f48d9ad8feb1d6d31050275f0aafd080c5c07153301fe2f48411f4406 +size 443 diff --git a/adapters/saved_llamaprosocial_dialog/checkpoint-231/optimizer.pt b/adapters/saved_llamaprosocial_dialog/checkpoint-231/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..859ca5e1575374327f5d07d77df15463b6b67d61 --- /dev/null +++ b/adapters/saved_llamaprosocial_dialog/checkpoint-231/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:636a289f2820df3da7735aca06c4476d8951bf4bad80a75c8966bf424505fe59 +size 33629765 diff --git a/adapters/saved_llamaprosocial_dialog/checkpoint-231/pytorch_model.bin b/adapters/saved_llamaprosocial_dialog/checkpoint-231/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..96824ca0a05c2d1d38e82bbb3d91edf70c7e1ec8 --- /dev/null +++ b/adapters/saved_llamaprosocial_dialog/checkpoint-231/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cbd3d30dfa9a1b90480a0b80b6833739638cab0603fe7ed5c3dbb9be40543f13 +size 16822989 diff --git a/adapters/saved_llamaprosocial_dialog/checkpoint-231/rng_state_0.pth b/adapters/saved_llamaprosocial_dialog/checkpoint-231/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..1a372ecef281039812df35c5de4d0bd2ed2cdb6a --- /dev/null +++ b/adapters/saved_llamaprosocial_dialog/checkpoint-231/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5c3ed3d572aec36f906781a0a8a7942e772e32f88fa160d3472a3972f75638e8 +size 14583 diff --git a/adapters/saved_llamaprosocial_dialog/checkpoint-231/rng_state_1.pth b/adapters/saved_llamaprosocial_dialog/checkpoint-231/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..e243ece975a32d593a6a41cb2006351c643032c3 --- /dev/null +++ b/adapters/saved_llamaprosocial_dialog/checkpoint-231/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bdaba8450419544c6fcf66d84d0c978470012b7361c71cc81447f19d6ac1dc65 +size 14583 diff --git a/adapters/saved_llamaprosocial_dialog/checkpoint-231/rng_state_2.pth b/adapters/saved_llamaprosocial_dialog/checkpoint-231/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..36742370333c024cdf6815e74849135f55c30ae3 --- /dev/null +++ b/adapters/saved_llamaprosocial_dialog/checkpoint-231/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f73e4d5ed143467c1751943f1967bc636d2394e2f9858eea21eda003e0836b7b +size 14583 diff --git a/adapters/saved_llamaprosocial_dialog/checkpoint-231/rng_state_3.pth b/adapters/saved_llamaprosocial_dialog/checkpoint-231/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..482d3f4cdc738e2ae132c4f2d715f9ac1b295634 --- /dev/null +++ b/adapters/saved_llamaprosocial_dialog/checkpoint-231/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f18a08c35852f8a42010c3d5149224b675eec9d80bf7af7a5e70d0f71e9fb653 +size 14583 diff --git a/adapters/saved_llamaprosocial_dialog/checkpoint-231/rng_state_4.pth b/adapters/saved_llamaprosocial_dialog/checkpoint-231/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..7400289ceb67e55d38bc339cd4781667ef608d8f --- /dev/null +++ b/adapters/saved_llamaprosocial_dialog/checkpoint-231/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4a2e59d7e30ae0acc391d0acf1c4608692b41fd98fef5117ed4542685470002c +size 14583 diff --git a/adapters/saved_llamaprosocial_dialog/checkpoint-231/rng_state_5.pth b/adapters/saved_llamaprosocial_dialog/checkpoint-231/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..6093ab22991d43b9b7a69fe7e7804688b25cba06 --- /dev/null +++ b/adapters/saved_llamaprosocial_dialog/checkpoint-231/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:98c65ae2979f0acfa13b203856c323791920d3ff73fbf068d9271c596da2ee18 +size 14583 diff --git a/adapters/saved_llamaprosocial_dialog/checkpoint-231/rng_state_6.pth b/adapters/saved_llamaprosocial_dialog/checkpoint-231/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..61a0bbf398e85ce03969cc56bc49eb9c6521261f --- /dev/null +++ b/adapters/saved_llamaprosocial_dialog/checkpoint-231/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1bc207538b14f992863106d29f1b3b8541293b89407a3330c87ec8a14ebf8695 +size 14583 diff --git a/adapters/saved_llamaprosocial_dialog/checkpoint-231/rng_state_7.pth b/adapters/saved_llamaprosocial_dialog/checkpoint-231/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..3e6c3429e887311f1ae8b953069133b745b9d4a0 --- /dev/null +++ b/adapters/saved_llamaprosocial_dialog/checkpoint-231/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7b77b19f5ba941135b3ba77c66a79e03e9c23bcd76bed5651cccee8a313633e1 +size 14583 diff --git a/adapters/saved_llamaprosocial_dialog/checkpoint-231/scaler.pt b/adapters/saved_llamaprosocial_dialog/checkpoint-231/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..331666df6bf2b0063d197d27a0b8e7d0d83533ac --- /dev/null +++ b/adapters/saved_llamaprosocial_dialog/checkpoint-231/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8a2cdac65893cb9d4c510b6a65bbebfc99723686e42d5ba328a682f39bfcd7c2 +size 557 diff --git a/adapters/saved_llamaprosocial_dialog/checkpoint-231/scheduler.pt b/adapters/saved_llamaprosocial_dialog/checkpoint-231/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..3c2e89608b115c3e678d1e351c490a977421f29b --- /dev/null +++ b/adapters/saved_llamaprosocial_dialog/checkpoint-231/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:366d4756107a34ff1984d3c354fbc0c7688851128fa1b37f8fc9c8dc2f663bd4 +size 627 diff --git a/adapters/saved_llamaprosocial_dialog/checkpoint-231/trainer_state.json b/adapters/saved_llamaprosocial_dialog/checkpoint-231/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..4bbfa23e68870a3b1f096991f324f8a14220f53a --- /dev/null +++ b/adapters/saved_llamaprosocial_dialog/checkpoint-231/trainer_state.json @@ -0,0 +1,250 @@ +{ + "best_metric": 1.7033213376998901, + "best_model_checkpoint": "/mnt/bn/qingyi-bn-lq/llama/saved_llamaprosocial_dialog/checkpoint-231", + "epoch": 2.0005412719891744, + "global_step": 231, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.1, + "eval_loss": 1.9915313720703125, + "eval_runtime": 22.7483, + "eval_samples_per_second": 87.919, + "eval_steps_per_second": 1.407, + "step": 11 + }, + { + "epoch": 0.17, + "learning_rate": 0.000292814371257485, + "loss": 2.0439, + "step": 20 + }, + { + "epoch": 0.19, + "eval_loss": 1.8029985427856445, + "eval_runtime": 22.7395, + "eval_samples_per_second": 87.953, + "eval_steps_per_second": 1.407, + "step": 22 + }, + { + "epoch": 0.29, + "eval_loss": 1.7677525281906128, + "eval_runtime": 22.7246, + "eval_samples_per_second": 88.01, + "eval_steps_per_second": 1.408, + "step": 33 + }, + { + "epoch": 0.35, + "learning_rate": 0.00027485029940119756, + "loss": 1.7686, + "step": 40 + }, + { + "epoch": 0.38, + "eval_loss": 1.7513495683670044, + "eval_runtime": 22.772, + "eval_samples_per_second": 87.827, + "eval_steps_per_second": 1.405, + "step": 44 + }, + { + "epoch": 0.48, + "eval_loss": 1.7416645288467407, + "eval_runtime": 22.7234, + "eval_samples_per_second": 88.015, + "eval_steps_per_second": 1.408, + "step": 55 + }, + { + "epoch": 0.52, + "learning_rate": 0.00025688622754491017, + "loss": 1.737, + "step": 60 + }, + { + "epoch": 0.57, + "eval_loss": 1.734699010848999, + "eval_runtime": 22.7372, + "eval_samples_per_second": 87.962, + "eval_steps_per_second": 1.407, + "step": 66 + }, + { + "epoch": 0.67, + "eval_loss": 1.72977614402771, + "eval_runtime": 22.74, + "eval_samples_per_second": 87.951, + "eval_steps_per_second": 1.407, + "step": 77 + }, + { + "epoch": 0.69, + "learning_rate": 0.00023892215568862272, + "loss": 1.7289, + "step": 80 + }, + { + "epoch": 0.76, + "eval_loss": 1.725810170173645, + "eval_runtime": 22.7407, + "eval_samples_per_second": 87.948, + "eval_steps_per_second": 1.407, + "step": 88 + }, + { + "epoch": 0.86, + "eval_loss": 1.7222310304641724, + "eval_runtime": 22.733, + "eval_samples_per_second": 87.978, + "eval_steps_per_second": 1.408, + "step": 99 + }, + { + "epoch": 0.87, + "learning_rate": 0.00022095808383233533, + "loss": 1.7172, + "step": 100 + }, + { + "epoch": 0.95, + "eval_loss": 1.7189042568206787, + "eval_runtime": 22.7271, + "eval_samples_per_second": 88.001, + "eval_steps_per_second": 1.408, + "step": 110 + }, + { + "epoch": 1.04, + "learning_rate": 0.00020299401197604788, + "loss": 1.7127, + "step": 120 + }, + { + "epoch": 1.05, + "eval_loss": 1.7177015542984009, + "eval_runtime": 22.7303, + "eval_samples_per_second": 87.988, + "eval_steps_per_second": 1.408, + "step": 121 + }, + { + "epoch": 1.14, + "eval_loss": 1.7150254249572754, + "eval_runtime": 22.7281, + "eval_samples_per_second": 87.997, + "eval_steps_per_second": 1.408, + "step": 132 + }, + { + "epoch": 1.21, + "learning_rate": 0.00018502994011976046, + "loss": 1.7017, + "step": 140 + }, + { + "epoch": 1.24, + "eval_loss": 1.7136248350143433, + "eval_runtime": 22.732, + "eval_samples_per_second": 87.982, + "eval_steps_per_second": 1.408, + "step": 143 + }, + { + "epoch": 1.33, + "eval_loss": 1.7119090557098389, + "eval_runtime": 22.7069, + "eval_samples_per_second": 88.079, + "eval_steps_per_second": 1.409, + "step": 154 + }, + { + "epoch": 1.39, + "learning_rate": 0.00016706586826347302, + "loss": 1.7003, + "step": 160 + }, + { + "epoch": 1.43, + "eval_loss": 1.7102028131484985, + "eval_runtime": 22.9016, + "eval_samples_per_second": 87.33, + "eval_steps_per_second": 1.397, + "step": 165 + }, + { + "epoch": 1.52, + "eval_loss": 1.708575963973999, + "eval_runtime": 22.7175, + "eval_samples_per_second": 88.038, + "eval_steps_per_second": 1.409, + "step": 176 + }, + { + "epoch": 1.56, + "learning_rate": 0.0001491017964071856, + "loss": 1.7025, + "step": 180 + }, + { + "epoch": 1.62, + "eval_loss": 1.7076679468154907, + "eval_runtime": 22.792, + "eval_samples_per_second": 87.75, + "eval_steps_per_second": 1.404, + "step": 187 + }, + { + "epoch": 1.71, + "eval_loss": 1.7062067985534668, + "eval_runtime": 22.7054, + "eval_samples_per_second": 88.085, + "eval_steps_per_second": 1.409, + "step": 198 + }, + { + "epoch": 1.73, + "learning_rate": 0.0001311377245508982, + "loss": 1.6976, + "step": 200 + }, + { + "epoch": 1.81, + "eval_loss": 1.7049460411071777, + "eval_runtime": 22.765, + "eval_samples_per_second": 87.854, + "eval_steps_per_second": 1.406, + "step": 209 + }, + { + "epoch": 1.91, + "learning_rate": 0.00011317365269461076, + "loss": 1.6955, + "step": 220 + }, + { + "epoch": 1.91, + "eval_loss": 1.704172134399414, + "eval_runtime": 22.8464, + "eval_samples_per_second": 87.541, + "eval_steps_per_second": 1.401, + "step": 220 + }, + { + "epoch": 2.0, + "eval_loss": 1.7033213376998901, + "eval_runtime": 22.7378, + "eval_samples_per_second": 87.959, + "eval_steps_per_second": 1.407, + "step": 231 + } + ], + "max_steps": 345, + "num_train_epochs": 3, + "total_flos": 1.76384204600456e+18, + "trial_name": null, + "trial_params": null +} diff --git a/adapters/saved_llamaprosocial_dialog/checkpoint-231/training_args.bin b/adapters/saved_llamaprosocial_dialog/checkpoint-231/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..e1825c4b29d38c869d6c9c9df59d0fd2371fc9ad --- /dev/null +++ b/adapters/saved_llamaprosocial_dialog/checkpoint-231/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:147e8e99373e1b6d6dcda22258c683d1beff1e029d1f45adaed70feec69fdbb0 +size 3643 diff --git a/adapters/saved_llamaprosocial_dialog/checkpoint-242/optimizer.pt b/adapters/saved_llamaprosocial_dialog/checkpoint-242/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..06b4d863072a53b851f7ae53ebee2612ba62b73f --- /dev/null +++ b/adapters/saved_llamaprosocial_dialog/checkpoint-242/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c7341099f89cac777a909271abaf3eb88d941986f4889b2cab8d09e974fa77ed +size 33629765 diff --git a/adapters/saved_llamaprosocial_dialog/checkpoint-242/pytorch_model.bin b/adapters/saved_llamaprosocial_dialog/checkpoint-242/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..df28d19bdc9cabf01352d7bb17d4c9754d281c34 --- /dev/null +++ b/adapters/saved_llamaprosocial_dialog/checkpoint-242/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:63638e19de2161c2f3746ba8c97ca88a9b610e7663381a098e089428e7518c55 +size 16822989 diff --git a/adapters/saved_llamaprosocial_dialog/checkpoint-242/rng_state_0.pth b/adapters/saved_llamaprosocial_dialog/checkpoint-242/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..53acc8c2293a546af95c66436064d8657ce15701 --- /dev/null +++ b/adapters/saved_llamaprosocial_dialog/checkpoint-242/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d133e4b3f06381e1d2b91dbb9994ebd2ee217084263c6348efdf891dcaa60e96 +size 14583 diff --git a/adapters/saved_llamaprosocial_dialog/checkpoint-242/rng_state_1.pth b/adapters/saved_llamaprosocial_dialog/checkpoint-242/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..125a33b9c8f3af176fcbe3834151d0b6aeb9cad0 --- /dev/null +++ b/adapters/saved_llamaprosocial_dialog/checkpoint-242/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3836616355e1cf5302f71711c61dfee19f40b0179e6c7122d49b33336c1c754d +size 14583 diff --git a/adapters/saved_llamaprosocial_dialog/checkpoint-242/rng_state_2.pth b/adapters/saved_llamaprosocial_dialog/checkpoint-242/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..26c46759c53d3feb1f04753e8d4fe98bee89e9b2 --- /dev/null +++ b/adapters/saved_llamaprosocial_dialog/checkpoint-242/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9f4ba97a62d59e53a451b4a817e3032c31bc156e47da25a1bac53c544e6fbebd +size 14583 diff --git a/adapters/saved_llamaprosocial_dialog/checkpoint-242/rng_state_3.pth b/adapters/saved_llamaprosocial_dialog/checkpoint-242/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..02185408135895f9a7e7eb3a54bd5bdc8af2fa24 --- /dev/null +++ b/adapters/saved_llamaprosocial_dialog/checkpoint-242/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:77f047ab7843ada6c80c6eb2035f8805d7707e5c6fb6da8ac715edce1d6e6dd8 +size 14583 diff --git a/adapters/saved_llamaprosocial_dialog/checkpoint-242/rng_state_4.pth b/adapters/saved_llamaprosocial_dialog/checkpoint-242/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..fd8f434e893e00cd36d76ff3d6291f2d2bd368ef --- /dev/null +++ b/adapters/saved_llamaprosocial_dialog/checkpoint-242/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:06a03edb87180fdc6961ac4d1744f754a7a57746d56a726693b0f27ade1a5746 +size 14583 diff --git a/adapters/saved_llamaprosocial_dialog/checkpoint-242/rng_state_5.pth b/adapters/saved_llamaprosocial_dialog/checkpoint-242/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..1bca0a5fcfb00d1a4cb48247fe80a9cb8b5c5128 --- /dev/null +++ b/adapters/saved_llamaprosocial_dialog/checkpoint-242/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a8155fd70ecfdc8c6ad0cebdc1a34a6f952f6b1a9e861ec3d017282f1653c7d7 +size 14583 diff --git a/adapters/saved_llamaprosocial_dialog/checkpoint-242/rng_state_6.pth b/adapters/saved_llamaprosocial_dialog/checkpoint-242/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..7dc122e7b1bc2a84335c768b49f5d3f6faf5fc34 --- /dev/null +++ b/adapters/saved_llamaprosocial_dialog/checkpoint-242/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de764a8d828a9a52e7369b0d9b00cc67d31e00e59ca64b3e4b5e023e7a685fb4 +size 14583 diff --git a/adapters/saved_llamaprosocial_dialog/checkpoint-242/rng_state_7.pth b/adapters/saved_llamaprosocial_dialog/checkpoint-242/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..c824c5b0d1234c22b725458f2b66ee95ad82cb5d --- /dev/null +++ b/adapters/saved_llamaprosocial_dialog/checkpoint-242/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ddf7ed808a45b4307ade51b5e536f94c6cbecf49b9e2e20a281a732af477d22b +size 14583 diff --git a/adapters/saved_llamaprosocial_dialog/checkpoint-242/scaler.pt b/adapters/saved_llamaprosocial_dialog/checkpoint-242/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..0c58f72c39ef2b19c720c198b98f4206a0964a10 --- /dev/null +++ b/adapters/saved_llamaprosocial_dialog/checkpoint-242/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2d92e81b5aab135eddcc5a6fbdc5ac9d87ad198203322ae6e091227716bb0db8 +size 557 diff --git a/adapters/saved_llamaprosocial_dialog/checkpoint-242/scheduler.pt b/adapters/saved_llamaprosocial_dialog/checkpoint-242/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..5d914f2937685c519fb96d3d78bbbd1b8c185d9b --- /dev/null +++ b/adapters/saved_llamaprosocial_dialog/checkpoint-242/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ede41d35fa0db2bce1d05bd6a6716e2a9327569524dd78badd1073f050ea4a14 +size 627 diff --git a/adapters/saved_llamaprosocial_dialog/checkpoint-242/trainer_state.json b/adapters/saved_llamaprosocial_dialog/checkpoint-242/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..af64ad7a4d0580a10f3760860614474a620e53e5 --- /dev/null +++ b/adapters/saved_llamaprosocial_dialog/checkpoint-242/trainer_state.json @@ -0,0 +1,264 @@ +{ + "best_metric": 1.7023844718933105, + "best_model_checkpoint": "/mnt/bn/qingyi-bn-lq/llama/saved_llamaprosocial_dialog/checkpoint-242", + "epoch": 2.095805142083897, + "global_step": 242, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.1, + "eval_loss": 1.9915313720703125, + "eval_runtime": 22.7483, + "eval_samples_per_second": 87.919, + "eval_steps_per_second": 1.407, + "step": 11 + }, + { + "epoch": 0.17, + "learning_rate": 0.000292814371257485, + "loss": 2.0439, + "step": 20 + }, + { + "epoch": 0.19, + "eval_loss": 1.8029985427856445, + "eval_runtime": 22.7395, + "eval_samples_per_second": 87.953, + "eval_steps_per_second": 1.407, + "step": 22 + }, + { + "epoch": 0.29, + "eval_loss": 1.7677525281906128, + "eval_runtime": 22.7246, + "eval_samples_per_second": 88.01, + "eval_steps_per_second": 1.408, + "step": 33 + }, + { + "epoch": 0.35, + "learning_rate": 0.00027485029940119756, + "loss": 1.7686, + "step": 40 + }, + { + "epoch": 0.38, + "eval_loss": 1.7513495683670044, + "eval_runtime": 22.772, + "eval_samples_per_second": 87.827, + "eval_steps_per_second": 1.405, + "step": 44 + }, + { + "epoch": 0.48, + "eval_loss": 1.7416645288467407, + "eval_runtime": 22.7234, + "eval_samples_per_second": 88.015, + "eval_steps_per_second": 1.408, + "step": 55 + }, + { + "epoch": 0.52, + "learning_rate": 0.00025688622754491017, + "loss": 1.737, + "step": 60 + }, + { + "epoch": 0.57, + "eval_loss": 1.734699010848999, + "eval_runtime": 22.7372, + "eval_samples_per_second": 87.962, + "eval_steps_per_second": 1.407, + "step": 66 + }, + { + "epoch": 0.67, + "eval_loss": 1.72977614402771, + "eval_runtime": 22.74, + "eval_samples_per_second": 87.951, + "eval_steps_per_second": 1.407, + "step": 77 + }, + { + "epoch": 0.69, + "learning_rate": 0.00023892215568862272, + "loss": 1.7289, + "step": 80 + }, + { + "epoch": 0.76, + "eval_loss": 1.725810170173645, + "eval_runtime": 22.7407, + "eval_samples_per_second": 87.948, + "eval_steps_per_second": 1.407, + "step": 88 + }, + { + "epoch": 0.86, + "eval_loss": 1.7222310304641724, + "eval_runtime": 22.733, + "eval_samples_per_second": 87.978, + "eval_steps_per_second": 1.408, + "step": 99 + }, + { + "epoch": 0.87, + "learning_rate": 0.00022095808383233533, + "loss": 1.7172, + "step": 100 + }, + { + "epoch": 0.95, + "eval_loss": 1.7189042568206787, + "eval_runtime": 22.7271, + "eval_samples_per_second": 88.001, + "eval_steps_per_second": 1.408, + "step": 110 + }, + { + "epoch": 1.04, + "learning_rate": 0.00020299401197604788, + "loss": 1.7127, + "step": 120 + }, + { + "epoch": 1.05, + "eval_loss": 1.7177015542984009, + "eval_runtime": 22.7303, + "eval_samples_per_second": 87.988, + "eval_steps_per_second": 1.408, + "step": 121 + }, + { + "epoch": 1.14, + "eval_loss": 1.7150254249572754, + "eval_runtime": 22.7281, + "eval_samples_per_second": 87.997, + "eval_steps_per_second": 1.408, + "step": 132 + }, + { + "epoch": 1.21, + "learning_rate": 0.00018502994011976046, + "loss": 1.7017, + "step": 140 + }, + { + "epoch": 1.24, + "eval_loss": 1.7136248350143433, + "eval_runtime": 22.732, + "eval_samples_per_second": 87.982, + "eval_steps_per_second": 1.408, + "step": 143 + }, + { + "epoch": 1.33, + "eval_loss": 1.7119090557098389, + "eval_runtime": 22.7069, + "eval_samples_per_second": 88.079, + "eval_steps_per_second": 1.409, + "step": 154 + }, + { + "epoch": 1.39, + "learning_rate": 0.00016706586826347302, + "loss": 1.7003, + "step": 160 + }, + { + "epoch": 1.43, + "eval_loss": 1.7102028131484985, + "eval_runtime": 22.9016, + "eval_samples_per_second": 87.33, + "eval_steps_per_second": 1.397, + "step": 165 + }, + { + "epoch": 1.52, + "eval_loss": 1.708575963973999, + "eval_runtime": 22.7175, + "eval_samples_per_second": 88.038, + "eval_steps_per_second": 1.409, + "step": 176 + }, + { + "epoch": 1.56, + "learning_rate": 0.0001491017964071856, + "loss": 1.7025, + "step": 180 + }, + { + "epoch": 1.62, + "eval_loss": 1.7076679468154907, + "eval_runtime": 22.792, + "eval_samples_per_second": 87.75, + "eval_steps_per_second": 1.404, + "step": 187 + }, + { + "epoch": 1.71, + "eval_loss": 1.7062067985534668, + "eval_runtime": 22.7054, + "eval_samples_per_second": 88.085, + "eval_steps_per_second": 1.409, + "step": 198 + }, + { + "epoch": 1.73, + "learning_rate": 0.0001311377245508982, + "loss": 1.6976, + "step": 200 + }, + { + "epoch": 1.81, + "eval_loss": 1.7049460411071777, + "eval_runtime": 22.765, + "eval_samples_per_second": 87.854, + "eval_steps_per_second": 1.406, + "step": 209 + }, + { + "epoch": 1.91, + "learning_rate": 0.00011317365269461076, + "loss": 1.6955, + "step": 220 + }, + { + "epoch": 1.91, + "eval_loss": 1.704172134399414, + "eval_runtime": 22.8464, + "eval_samples_per_second": 87.541, + "eval_steps_per_second": 1.401, + "step": 220 + }, + { + "epoch": 2.0, + "eval_loss": 1.7033213376998901, + "eval_runtime": 22.7378, + "eval_samples_per_second": 87.959, + "eval_steps_per_second": 1.407, + "step": 231 + }, + { + "epoch": 2.08, + "learning_rate": 9.520958083832335e-05, + "loss": 1.6893, + "step": 240 + }, + { + "epoch": 2.1, + "eval_loss": 1.7023844718933105, + "eval_runtime": 22.7421, + "eval_samples_per_second": 87.942, + "eval_steps_per_second": 1.407, + "step": 242 + } + ], + "max_steps": 345, + "num_train_epochs": 3, + "total_flos": 1.8476901005116047e+18, + "trial_name": null, + "trial_params": null +} diff --git a/adapters/saved_llamaprosocial_dialog/checkpoint-242/training_args.bin b/adapters/saved_llamaprosocial_dialog/checkpoint-242/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..e1825c4b29d38c869d6c9c9df59d0fd2371fc9ad --- /dev/null +++ b/adapters/saved_llamaprosocial_dialog/checkpoint-242/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:147e8e99373e1b6d6dcda22258c683d1beff1e029d1f45adaed70feec69fdbb0 +size 3643 diff --git a/adapters/saved_llamaprosocial_dialog/checkpoint-253/optimizer.pt b/adapters/saved_llamaprosocial_dialog/checkpoint-253/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..ecdc1bc63591ee2f83a7e9b001d5e27cd6b88909 --- /dev/null +++ b/adapters/saved_llamaprosocial_dialog/checkpoint-253/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e703f72dc81f7f5cc7bce4c0c092bff95b95a67b1625e6280d7487bf5a7a520e +size 33629765 diff --git a/adapters/saved_llamaprosocial_dialog/checkpoint-253/pytorch_model.bin b/adapters/saved_llamaprosocial_dialog/checkpoint-253/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..61c80eff54d8dbf7b2b40a6dac4d1551e0699848 --- /dev/null +++ b/adapters/saved_llamaprosocial_dialog/checkpoint-253/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0d352f8a816ff0c8eb51502820efa11e60d208dfbfcc715cc25dc520be997a00 +size 16822989 diff --git a/adapters/saved_llamaprosocial_dialog/checkpoint-253/rng_state_0.pth b/adapters/saved_llamaprosocial_dialog/checkpoint-253/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..5b3561464c4f30dfb1117a8e0c0783983dabf499 --- /dev/null +++ b/adapters/saved_llamaprosocial_dialog/checkpoint-253/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:57893516f06d701161938994e29960d59c22adb924373988c8dbea8a1f54bd97 +size 14583 diff --git a/adapters/saved_llamaprosocial_dialog/checkpoint-253/rng_state_1.pth b/adapters/saved_llamaprosocial_dialog/checkpoint-253/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..82e7759c36f9804962f0a05c544d80f901916458 --- /dev/null +++ b/adapters/saved_llamaprosocial_dialog/checkpoint-253/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf0704f8c3fa0ccee16b1f8087b331e5e330a008271a6e97460048c1f4896a45 +size 14583 diff --git a/adapters/saved_llamaprosocial_dialog/checkpoint-253/rng_state_2.pth b/adapters/saved_llamaprosocial_dialog/checkpoint-253/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..8444ca033547e722a8019119c1c0740e740813cd --- /dev/null +++ b/adapters/saved_llamaprosocial_dialog/checkpoint-253/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6f9d7bf25bb46bdb26a731ae49f0d36fbb7b6d91cd2ed6667151d9cc1d744a18 +size 14583 diff --git a/adapters/saved_llamaprosocial_dialog/checkpoint-253/rng_state_3.pth b/adapters/saved_llamaprosocial_dialog/checkpoint-253/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..bfa733721570968008f887503b97e8d8e2ff4d03 --- /dev/null +++ b/adapters/saved_llamaprosocial_dialog/checkpoint-253/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:39efc78219e5d64be971e04cc09b25e19a521a27bc223c81100cb38dfa22d85c +size 14583 diff --git a/adapters/saved_llamaprosocial_dialog/checkpoint-253/rng_state_4.pth b/adapters/saved_llamaprosocial_dialog/checkpoint-253/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..a4c572bc770bef2cb729f8d988668a11f9c8f65e --- /dev/null +++ b/adapters/saved_llamaprosocial_dialog/checkpoint-253/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7263e7de2ad55ee56841e79879dea866974004f7af360a49f043d5690b846cda +size 14583 diff --git a/adapters/saved_llamaprosocial_dialog/checkpoint-253/rng_state_5.pth b/adapters/saved_llamaprosocial_dialog/checkpoint-253/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..28533ea119aa6520f695acad803f185129be5809 --- /dev/null +++ b/adapters/saved_llamaprosocial_dialog/checkpoint-253/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eb8795de665b13c84d64b467e082f0f4eea2d60e88a02a159ef41f78bdede660 +size 14583 diff --git a/adapters/saved_llamaprosocial_dialog/checkpoint-253/rng_state_6.pth b/adapters/saved_llamaprosocial_dialog/checkpoint-253/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..81f8293be16534d9850c3c5e411a014c084ee8a4 --- /dev/null +++ b/adapters/saved_llamaprosocial_dialog/checkpoint-253/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f6b4e84652a28db3e5d165b7c386aa228ba8f10ae5b69e47f0d8e7c916cc3791 +size 14583 diff --git a/adapters/saved_llamaprosocial_dialog/checkpoint-253/rng_state_7.pth b/adapters/saved_llamaprosocial_dialog/checkpoint-253/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..3809809fd03833fd246cc46bd980463e4ba42a78 --- /dev/null +++ b/adapters/saved_llamaprosocial_dialog/checkpoint-253/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:15b2c9af52c3d2195d0eac786edf5949a49ae3131e7262c7dc5443ad77aa2e4b +size 14583 diff --git a/adapters/saved_llamaprosocial_dialog/checkpoint-253/scaler.pt b/adapters/saved_llamaprosocial_dialog/checkpoint-253/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..4a531649b316ad030e6a7a45bf860da1958e80fd --- /dev/null +++ b/adapters/saved_llamaprosocial_dialog/checkpoint-253/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a724cd960d1fb64864db1d210779b8efdf00cc2858299f58da447c2063b802bf +size 557 diff --git a/adapters/saved_llamaprosocial_dialog/checkpoint-253/scheduler.pt b/adapters/saved_llamaprosocial_dialog/checkpoint-253/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..4f3a6eb4e12ba456f023c9df2ecba74d6a853202 --- /dev/null +++ b/adapters/saved_llamaprosocial_dialog/checkpoint-253/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ac7b2c8e8d0b7abead877ecaa1c3b038ce34c35624e85d202c31eca3620655d4 +size 627 diff --git a/adapters/saved_llamaprosocial_dialog/checkpoint-253/trainer_state.json b/adapters/saved_llamaprosocial_dialog/checkpoint-253/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..43da71945718d0c5169151e427e6cd4348550ea2 --- /dev/null +++ b/adapters/saved_llamaprosocial_dialog/checkpoint-253/trainer_state.json @@ -0,0 +1,272 @@ +{ + "best_metric": 1.7013720273971558, + "best_model_checkpoint": "/mnt/bn/qingyi-bn-lq/llama/saved_llamaprosocial_dialog/checkpoint-253", + "epoch": 2.1910690121786196, + "global_step": 253, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.1, + "eval_loss": 1.9915313720703125, + "eval_runtime": 22.7483, + "eval_samples_per_second": 87.919, + "eval_steps_per_second": 1.407, + "step": 11 + }, + { + "epoch": 0.17, + "learning_rate": 0.000292814371257485, + "loss": 2.0439, + "step": 20 + }, + { + "epoch": 0.19, + "eval_loss": 1.8029985427856445, + "eval_runtime": 22.7395, + "eval_samples_per_second": 87.953, + "eval_steps_per_second": 1.407, + "step": 22 + }, + { + "epoch": 0.29, + "eval_loss": 1.7677525281906128, + "eval_runtime": 22.7246, + "eval_samples_per_second": 88.01, + "eval_steps_per_second": 1.408, + "step": 33 + }, + { + "epoch": 0.35, + "learning_rate": 0.00027485029940119756, + "loss": 1.7686, + "step": 40 + }, + { + "epoch": 0.38, + "eval_loss": 1.7513495683670044, + "eval_runtime": 22.772, + "eval_samples_per_second": 87.827, + "eval_steps_per_second": 1.405, + "step": 44 + }, + { + "epoch": 0.48, + "eval_loss": 1.7416645288467407, + "eval_runtime": 22.7234, + "eval_samples_per_second": 88.015, + "eval_steps_per_second": 1.408, + "step": 55 + }, + { + "epoch": 0.52, + "learning_rate": 0.00025688622754491017, + "loss": 1.737, + "step": 60 + }, + { + "epoch": 0.57, + "eval_loss": 1.734699010848999, + "eval_runtime": 22.7372, + "eval_samples_per_second": 87.962, + "eval_steps_per_second": 1.407, + "step": 66 + }, + { + "epoch": 0.67, + "eval_loss": 1.72977614402771, + "eval_runtime": 22.74, + "eval_samples_per_second": 87.951, + "eval_steps_per_second": 1.407, + "step": 77 + }, + { + "epoch": 0.69, + "learning_rate": 0.00023892215568862272, + "loss": 1.7289, + "step": 80 + }, + { + "epoch": 0.76, + "eval_loss": 1.725810170173645, + "eval_runtime": 22.7407, + "eval_samples_per_second": 87.948, + "eval_steps_per_second": 1.407, + "step": 88 + }, + { + "epoch": 0.86, + "eval_loss": 1.7222310304641724, + "eval_runtime": 22.733, + "eval_samples_per_second": 87.978, + "eval_steps_per_second": 1.408, + "step": 99 + }, + { + "epoch": 0.87, + "learning_rate": 0.00022095808383233533, + "loss": 1.7172, + "step": 100 + }, + { + "epoch": 0.95, + "eval_loss": 1.7189042568206787, + "eval_runtime": 22.7271, + "eval_samples_per_second": 88.001, + "eval_steps_per_second": 1.408, + "step": 110 + }, + { + "epoch": 1.04, + "learning_rate": 0.00020299401197604788, + "loss": 1.7127, + "step": 120 + }, + { + "epoch": 1.05, + "eval_loss": 1.7177015542984009, + "eval_runtime": 22.7303, + "eval_samples_per_second": 87.988, + "eval_steps_per_second": 1.408, + "step": 121 + }, + { + "epoch": 1.14, + "eval_loss": 1.7150254249572754, + "eval_runtime": 22.7281, + "eval_samples_per_second": 87.997, + "eval_steps_per_second": 1.408, + "step": 132 + }, + { + "epoch": 1.21, + "learning_rate": 0.00018502994011976046, + "loss": 1.7017, + "step": 140 + }, + { + "epoch": 1.24, + "eval_loss": 1.7136248350143433, + "eval_runtime": 22.732, + "eval_samples_per_second": 87.982, + "eval_steps_per_second": 1.408, + "step": 143 + }, + { + "epoch": 1.33, + "eval_loss": 1.7119090557098389, + "eval_runtime": 22.7069, + "eval_samples_per_second": 88.079, + "eval_steps_per_second": 1.409, + "step": 154 + }, + { + "epoch": 1.39, + "learning_rate": 0.00016706586826347302, + "loss": 1.7003, + "step": 160 + }, + { + "epoch": 1.43, + "eval_loss": 1.7102028131484985, + "eval_runtime": 22.9016, + "eval_samples_per_second": 87.33, + "eval_steps_per_second": 1.397, + "step": 165 + }, + { + "epoch": 1.52, + "eval_loss": 1.708575963973999, + "eval_runtime": 22.7175, + "eval_samples_per_second": 88.038, + "eval_steps_per_second": 1.409, + "step": 176 + }, + { + "epoch": 1.56, + "learning_rate": 0.0001491017964071856, + "loss": 1.7025, + "step": 180 + }, + { + "epoch": 1.62, + "eval_loss": 1.7076679468154907, + "eval_runtime": 22.792, + "eval_samples_per_second": 87.75, + "eval_steps_per_second": 1.404, + "step": 187 + }, + { + "epoch": 1.71, + "eval_loss": 1.7062067985534668, + "eval_runtime": 22.7054, + "eval_samples_per_second": 88.085, + "eval_steps_per_second": 1.409, + "step": 198 + }, + { + "epoch": 1.73, + "learning_rate": 0.0001311377245508982, + "loss": 1.6976, + "step": 200 + }, + { + "epoch": 1.81, + "eval_loss": 1.7049460411071777, + "eval_runtime": 22.765, + "eval_samples_per_second": 87.854, + "eval_steps_per_second": 1.406, + "step": 209 + }, + { + "epoch": 1.91, + "learning_rate": 0.00011317365269461076, + "loss": 1.6955, + "step": 220 + }, + { + "epoch": 1.91, + "eval_loss": 1.704172134399414, + "eval_runtime": 22.8464, + "eval_samples_per_second": 87.541, + "eval_steps_per_second": 1.401, + "step": 220 + }, + { + "epoch": 2.0, + "eval_loss": 1.7033213376998901, + "eval_runtime": 22.7378, + "eval_samples_per_second": 87.959, + "eval_steps_per_second": 1.407, + "step": 231 + }, + { + "epoch": 2.08, + "learning_rate": 9.520958083832335e-05, + "loss": 1.6893, + "step": 240 + }, + { + "epoch": 2.1, + "eval_loss": 1.7023844718933105, + "eval_runtime": 22.7421, + "eval_samples_per_second": 87.942, + "eval_steps_per_second": 1.407, + "step": 242 + }, + { + "epoch": 2.19, + "eval_loss": 1.7013720273971558, + "eval_runtime": 22.7212, + "eval_samples_per_second": 88.023, + "eval_steps_per_second": 1.408, + "step": 253 + } + ], + "max_steps": 345, + "num_train_epochs": 3, + "total_flos": 1.932079404649808e+18, + "trial_name": null, + "trial_params": null +} diff --git a/adapters/saved_llamaprosocial_dialog/checkpoint-253/training_args.bin b/adapters/saved_llamaprosocial_dialog/checkpoint-253/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..e1825c4b29d38c869d6c9c9df59d0fd2371fc9ad --- /dev/null +++ b/adapters/saved_llamaprosocial_dialog/checkpoint-253/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:147e8e99373e1b6d6dcda22258c683d1beff1e029d1f45adaed70feec69fdbb0 +size 3643 diff --git a/adapters/saved_llamaprosocial_dialog/checkpoint-264/optimizer.pt b/adapters/saved_llamaprosocial_dialog/checkpoint-264/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..aca907548de1e5ce5467e3caf32fd0b470ef16bc --- /dev/null +++ b/adapters/saved_llamaprosocial_dialog/checkpoint-264/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d46137404e8e5c6816cf35855b42b4aac8890e3a35fa80c1e302140ec8f8479c +size 33629893 diff --git a/adapters/saved_llamaprosocial_dialog/checkpoint-264/pytorch_model.bin b/adapters/saved_llamaprosocial_dialog/checkpoint-264/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..9f304e8cd2edc9a725f40477ca037db9e3a0f529 --- /dev/null +++ b/adapters/saved_llamaprosocial_dialog/checkpoint-264/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:746f01da78204633f7ed181f974fe197691551a2909fe0a040520a76e10894c4 +size 16822989 diff --git a/adapters/saved_llamaprosocial_dialog/checkpoint-264/rng_state_0.pth b/adapters/saved_llamaprosocial_dialog/checkpoint-264/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..fc25c29ba140f3edc4d43c8609d084538946847b --- /dev/null +++ b/adapters/saved_llamaprosocial_dialog/checkpoint-264/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:48b3f4dc1f190f65e12530db344c7970ab4c4d3abfa6bec6fe867f33be46d865 +size 14583 diff --git a/adapters/saved_llamaprosocial_dialog/checkpoint-264/rng_state_1.pth b/adapters/saved_llamaprosocial_dialog/checkpoint-264/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..769fe3e7d7dce9b3a7df3ac40f999307338825ef --- /dev/null +++ b/adapters/saved_llamaprosocial_dialog/checkpoint-264/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:32a80fce08c0aedf4c39bbb24a15f81e6f6777d664f450a25432aa41312a887b +size 14583 diff --git a/adapters/saved_llamaprosocial_dialog/checkpoint-264/rng_state_2.pth b/adapters/saved_llamaprosocial_dialog/checkpoint-264/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..eca4eef8522cf74c90b8e2e5fa1a124c96f59097 --- /dev/null +++ b/adapters/saved_llamaprosocial_dialog/checkpoint-264/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ea8cc7d9ad25c3ba18339a63e2e9b38959d8ea4d547e65ed3f1f9d0207d7516b +size 14583 diff --git a/adapters/saved_llamaprosocial_dialog/checkpoint-264/rng_state_3.pth b/adapters/saved_llamaprosocial_dialog/checkpoint-264/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..7c7db49b49dffe120658c9e52bd1e34d760f7c7c --- /dev/null +++ b/adapters/saved_llamaprosocial_dialog/checkpoint-264/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:89fcaa48484eb3b48cad67bb875442a75294f960b6db6be8049cb254d278ae0c +size 14583 diff --git a/adapters/saved_llamaprosocial_dialog/checkpoint-264/rng_state_4.pth b/adapters/saved_llamaprosocial_dialog/checkpoint-264/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..5eb857a066ca9d2f499ea2d72eca61f42e228ba0 --- /dev/null +++ b/adapters/saved_llamaprosocial_dialog/checkpoint-264/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a96815f63e331cb0ae792855b8576b4b3162a029dd812c787a2533074d4b7198 +size 14583 diff --git a/adapters/saved_llamaprosocial_dialog/checkpoint-264/rng_state_5.pth b/adapters/saved_llamaprosocial_dialog/checkpoint-264/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..1683b4fee56222fb1eacefeb1db39c3c54b06733 --- /dev/null +++ b/adapters/saved_llamaprosocial_dialog/checkpoint-264/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d6a16e5539f62a94a8c0691b4df9676d73d06696de18c7e0591e0305c8e1d701 +size 14583 diff --git a/adapters/saved_llamaprosocial_dialog/checkpoint-264/rng_state_6.pth b/adapters/saved_llamaprosocial_dialog/checkpoint-264/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..ae212368a065b0307811079b68b5a16770b9df31 --- /dev/null +++ b/adapters/saved_llamaprosocial_dialog/checkpoint-264/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dee64b35f4d8bdad2aafd3338f643bf2e3e566f1b8775c031b2665beefc74174 +size 14583 diff --git a/adapters/saved_llamaprosocial_dialog/checkpoint-264/rng_state_7.pth b/adapters/saved_llamaprosocial_dialog/checkpoint-264/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..f34662839c7cb3aeb62a7b6302901e2153a0f8d1 --- /dev/null +++ b/adapters/saved_llamaprosocial_dialog/checkpoint-264/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:91676b496edb3a639e7cbb4fe9261707667045dca76bd0839c9d1640ebfe08bc +size 14583 diff --git a/adapters/saved_llamaprosocial_dialog/checkpoint-264/scaler.pt b/adapters/saved_llamaprosocial_dialog/checkpoint-264/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..399883fdb93e2a6bdfc3151329c6354397e9ee6b --- /dev/null +++ b/adapters/saved_llamaprosocial_dialog/checkpoint-264/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4b4fa9bd479f84f872871cd89b1d86a140ade8435053238b6345907d743ccc1a +size 557 diff --git a/adapters/saved_llamaprosocial_dialog/checkpoint-264/scheduler.pt b/adapters/saved_llamaprosocial_dialog/checkpoint-264/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..f35a5c76be95bd4670c93ab07200645c47260f6b --- /dev/null +++ b/adapters/saved_llamaprosocial_dialog/checkpoint-264/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:82fcc536b6d6d6633ddc7df54bc34d3e5e56cdfcb4de427dc537ed6b222729db +size 627 diff --git a/adapters/saved_llamaprosocial_dialog/checkpoint-264/trainer_state.json b/adapters/saved_llamaprosocial_dialog/checkpoint-264/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..59d90479c01922a6f3ae1a30bded33c3a2e298f4 --- /dev/null +++ b/adapters/saved_llamaprosocial_dialog/checkpoint-264/trainer_state.json @@ -0,0 +1,286 @@ +{ + "best_metric": 1.7013109922409058, + "best_model_checkpoint": "/mnt/bn/qingyi-bn-lq/llama/saved_llamaprosocial_dialog/checkpoint-264", + "epoch": 2.286332882273342, + "global_step": 264, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.1, + "eval_loss": 1.9915313720703125, + "eval_runtime": 22.7483, + "eval_samples_per_second": 87.919, + "eval_steps_per_second": 1.407, + "step": 11 + }, + { + "epoch": 0.17, + "learning_rate": 0.000292814371257485, + "loss": 2.0439, + "step": 20 + }, + { + "epoch": 0.19, + "eval_loss": 1.8029985427856445, + "eval_runtime": 22.7395, + "eval_samples_per_second": 87.953, + "eval_steps_per_second": 1.407, + "step": 22 + }, + { + "epoch": 0.29, + "eval_loss": 1.7677525281906128, + "eval_runtime": 22.7246, + "eval_samples_per_second": 88.01, + "eval_steps_per_second": 1.408, + "step": 33 + }, + { + "epoch": 0.35, + "learning_rate": 0.00027485029940119756, + "loss": 1.7686, + "step": 40 + }, + { + "epoch": 0.38, + "eval_loss": 1.7513495683670044, + "eval_runtime": 22.772, + "eval_samples_per_second": 87.827, + "eval_steps_per_second": 1.405, + "step": 44 + }, + { + "epoch": 0.48, + "eval_loss": 1.7416645288467407, + "eval_runtime": 22.7234, + "eval_samples_per_second": 88.015, + "eval_steps_per_second": 1.408, + "step": 55 + }, + { + "epoch": 0.52, + "learning_rate": 0.00025688622754491017, + "loss": 1.737, + "step": 60 + }, + { + "epoch": 0.57, + "eval_loss": 1.734699010848999, + "eval_runtime": 22.7372, + "eval_samples_per_second": 87.962, + "eval_steps_per_second": 1.407, + "step": 66 + }, + { + "epoch": 0.67, + "eval_loss": 1.72977614402771, + "eval_runtime": 22.74, + "eval_samples_per_second": 87.951, + "eval_steps_per_second": 1.407, + "step": 77 + }, + { + "epoch": 0.69, + "learning_rate": 0.00023892215568862272, + "loss": 1.7289, + "step": 80 + }, + { + "epoch": 0.76, + "eval_loss": 1.725810170173645, + "eval_runtime": 22.7407, + "eval_samples_per_second": 87.948, + "eval_steps_per_second": 1.407, + "step": 88 + }, + { + "epoch": 0.86, + "eval_loss": 1.7222310304641724, + "eval_runtime": 22.733, + "eval_samples_per_second": 87.978, + "eval_steps_per_second": 1.408, + "step": 99 + }, + { + "epoch": 0.87, + "learning_rate": 0.00022095808383233533, + "loss": 1.7172, + "step": 100 + }, + { + "epoch": 0.95, + "eval_loss": 1.7189042568206787, + "eval_runtime": 22.7271, + "eval_samples_per_second": 88.001, + "eval_steps_per_second": 1.408, + "step": 110 + }, + { + "epoch": 1.04, + "learning_rate": 0.00020299401197604788, + "loss": 1.7127, + "step": 120 + }, + { + "epoch": 1.05, + "eval_loss": 1.7177015542984009, + "eval_runtime": 22.7303, + "eval_samples_per_second": 87.988, + "eval_steps_per_second": 1.408, + "step": 121 + }, + { + "epoch": 1.14, + "eval_loss": 1.7150254249572754, + "eval_runtime": 22.7281, + "eval_samples_per_second": 87.997, + "eval_steps_per_second": 1.408, + "step": 132 + }, + { + "epoch": 1.21, + "learning_rate": 0.00018502994011976046, + "loss": 1.7017, + "step": 140 + }, + { + "epoch": 1.24, + "eval_loss": 1.7136248350143433, + "eval_runtime": 22.732, + "eval_samples_per_second": 87.982, + "eval_steps_per_second": 1.408, + "step": 143 + }, + { + "epoch": 1.33, + "eval_loss": 1.7119090557098389, + "eval_runtime": 22.7069, + "eval_samples_per_second": 88.079, + "eval_steps_per_second": 1.409, + "step": 154 + }, + { + "epoch": 1.39, + "learning_rate": 0.00016706586826347302, + "loss": 1.7003, + "step": 160 + }, + { + "epoch": 1.43, + "eval_loss": 1.7102028131484985, + "eval_runtime": 22.9016, + "eval_samples_per_second": 87.33, + "eval_steps_per_second": 1.397, + "step": 165 + }, + { + "epoch": 1.52, + "eval_loss": 1.708575963973999, + "eval_runtime": 22.7175, + "eval_samples_per_second": 88.038, + "eval_steps_per_second": 1.409, + "step": 176 + }, + { + "epoch": 1.56, + "learning_rate": 0.0001491017964071856, + "loss": 1.7025, + "step": 180 + }, + { + "epoch": 1.62, + "eval_loss": 1.7076679468154907, + "eval_runtime": 22.792, + "eval_samples_per_second": 87.75, + "eval_steps_per_second": 1.404, + "step": 187 + }, + { + "epoch": 1.71, + "eval_loss": 1.7062067985534668, + "eval_runtime": 22.7054, + "eval_samples_per_second": 88.085, + "eval_steps_per_second": 1.409, + "step": 198 + }, + { + "epoch": 1.73, + "learning_rate": 0.0001311377245508982, + "loss": 1.6976, + "step": 200 + }, + { + "epoch": 1.81, + "eval_loss": 1.7049460411071777, + "eval_runtime": 22.765, + "eval_samples_per_second": 87.854, + "eval_steps_per_second": 1.406, + "step": 209 + }, + { + "epoch": 1.91, + "learning_rate": 0.00011317365269461076, + "loss": 1.6955, + "step": 220 + }, + { + "epoch": 1.91, + "eval_loss": 1.704172134399414, + "eval_runtime": 22.8464, + "eval_samples_per_second": 87.541, + "eval_steps_per_second": 1.401, + "step": 220 + }, + { + "epoch": 2.0, + "eval_loss": 1.7033213376998901, + "eval_runtime": 22.7378, + "eval_samples_per_second": 87.959, + "eval_steps_per_second": 1.407, + "step": 231 + }, + { + "epoch": 2.08, + "learning_rate": 9.520958083832335e-05, + "loss": 1.6893, + "step": 240 + }, + { + "epoch": 2.1, + "eval_loss": 1.7023844718933105, + "eval_runtime": 22.7421, + "eval_samples_per_second": 87.942, + "eval_steps_per_second": 1.407, + "step": 242 + }, + { + "epoch": 2.19, + "eval_loss": 1.7013720273971558, + "eval_runtime": 22.7212, + "eval_samples_per_second": 88.023, + "eval_steps_per_second": 1.408, + "step": 253 + }, + { + "epoch": 2.25, + "learning_rate": 7.724550898203592e-05, + "loss": 1.6853, + "step": 260 + }, + { + "epoch": 2.29, + "eval_loss": 1.7013109922409058, + "eval_runtime": 22.7306, + "eval_samples_per_second": 87.987, + "eval_steps_per_second": 1.408, + "step": 264 + } + ], + "max_steps": 345, + "num_train_epochs": 3, + "total_flos": 2.015703087917826e+18, + "trial_name": null, + "trial_params": null +} diff --git a/adapters/saved_llamaprosocial_dialog/checkpoint-264/training_args.bin b/adapters/saved_llamaprosocial_dialog/checkpoint-264/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..e1825c4b29d38c869d6c9c9df59d0fd2371fc9ad --- /dev/null +++ b/adapters/saved_llamaprosocial_dialog/checkpoint-264/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:147e8e99373e1b6d6dcda22258c683d1beff1e029d1f45adaed70feec69fdbb0 +size 3643 diff --git a/adapters/saved_llamaprosocial_dialog/checkpoint-275/optimizer.pt b/adapters/saved_llamaprosocial_dialog/checkpoint-275/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..11255da10f4a41cfce571495b566eef28a9a6010 --- /dev/null +++ b/adapters/saved_llamaprosocial_dialog/checkpoint-275/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5bfdd0a3b97aa3a955e6d4b2753db86d871517d3a12283230fe43a1fb9e4314b +size 33629893 diff --git a/adapters/saved_llamaprosocial_dialog/checkpoint-275/pytorch_model.bin b/adapters/saved_llamaprosocial_dialog/checkpoint-275/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..7632e143d04709c222376c506e00ea503444dcc2 --- /dev/null +++ b/adapters/saved_llamaprosocial_dialog/checkpoint-275/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:af624f52f182886118d6a7341920d64db9a0ea5258c1814a01392144b08cb842 +size 16822989 diff --git a/adapters/saved_llamaprosocial_dialog/checkpoint-275/rng_state_0.pth b/adapters/saved_llamaprosocial_dialog/checkpoint-275/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..085af5adc1ff1d6a65db7a9963af044a4adbfead --- /dev/null +++ b/adapters/saved_llamaprosocial_dialog/checkpoint-275/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:48a156be55999635ae6bc297f399932ef1fa91257856c7492cfd5a96094dbad5 +size 14583 diff --git a/adapters/saved_llamaprosocial_dialog/checkpoint-275/rng_state_1.pth b/adapters/saved_llamaprosocial_dialog/checkpoint-275/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..2992285c471f3c76f2784a89e62615c12af5bb38 --- /dev/null +++ b/adapters/saved_llamaprosocial_dialog/checkpoint-275/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8f0668c5be8ce059e13ce6b6627546f6f04b3adabfa268c986c6316be12f39c2 +size 14583 diff --git a/adapters/saved_llamaprosocial_dialog/checkpoint-275/rng_state_2.pth b/adapters/saved_llamaprosocial_dialog/checkpoint-275/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..646900f2874bdb1409e2f042c21135350b2568cb --- /dev/null +++ b/adapters/saved_llamaprosocial_dialog/checkpoint-275/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:663820df72726afeda664f6899e4cef96aceb0fd719963cfb04c4227fb817f1c +size 14583 diff --git a/adapters/saved_llamaprosocial_dialog/checkpoint-275/rng_state_3.pth b/adapters/saved_llamaprosocial_dialog/checkpoint-275/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..de56e070776aa70b96bb75f01c746a755522c3a5 --- /dev/null +++ b/adapters/saved_llamaprosocial_dialog/checkpoint-275/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e13d5e7f0a56db91c30067ce25b40eb0bfdf0f779f4445604aacc1f053ce16ef +size 14583 diff --git a/adapters/saved_llamaprosocial_dialog/checkpoint-275/rng_state_4.pth b/adapters/saved_llamaprosocial_dialog/checkpoint-275/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..1ad2abfad154541490049c3cc8639b86af298be0 --- /dev/null +++ b/adapters/saved_llamaprosocial_dialog/checkpoint-275/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5fd695f546a12f23a05cb0315aa728343d8e2f6aaaec302b01280b1984174c0e +size 14583 diff --git a/adapters/saved_llamaprosocial_dialog/checkpoint-275/rng_state_5.pth b/adapters/saved_llamaprosocial_dialog/checkpoint-275/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..706f9d3e9f646452d60fcd1784eb38b5e3844641 --- /dev/null +++ b/adapters/saved_llamaprosocial_dialog/checkpoint-275/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:131cde7b344bf06472f6527af6f835fa2f4085de277a21e681cf466d1f09ec72 +size 14583 diff --git a/adapters/saved_llamaprosocial_dialog/checkpoint-275/rng_state_6.pth b/adapters/saved_llamaprosocial_dialog/checkpoint-275/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..f97b38f4c0877c12910a5bdacf395e96847c5379 --- /dev/null +++ b/adapters/saved_llamaprosocial_dialog/checkpoint-275/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bc317f429ec6cc5f1f2bc9fcc1b97b0999e0072aa6621180b4d1bbcda3ca0a8b +size 14583 diff --git a/adapters/saved_llamaprosocial_dialog/checkpoint-275/rng_state_7.pth b/adapters/saved_llamaprosocial_dialog/checkpoint-275/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..e3e1af2aafeb915b27f64969409069ed6c83bc2f --- /dev/null +++ b/adapters/saved_llamaprosocial_dialog/checkpoint-275/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9fd57bdd512e7e2cc98a87ae03cb7776bf48ed559411ae710e4f0932b3c0251e +size 14583 diff --git a/adapters/saved_llamaprosocial_dialog/checkpoint-275/scaler.pt b/adapters/saved_llamaprosocial_dialog/checkpoint-275/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..c146a7b010111c48723da521ad13b3606060edfc --- /dev/null +++ b/adapters/saved_llamaprosocial_dialog/checkpoint-275/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e6453ba1c81b8471d42efd2fbc7ff667f20a410dd4791865ed318a0023bb9cfc +size 557 diff --git a/adapters/saved_llamaprosocial_dialog/checkpoint-275/scheduler.pt b/adapters/saved_llamaprosocial_dialog/checkpoint-275/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..17bbd09cb981f639b3883b6317f49d5d7607230a --- /dev/null +++ b/adapters/saved_llamaprosocial_dialog/checkpoint-275/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7d8238298c299d0c066670ae26ccd5386f33684146df066b2205dec05f61091c +size 627 diff --git a/adapters/saved_llamaprosocial_dialog/checkpoint-275/trainer_state.json b/adapters/saved_llamaprosocial_dialog/checkpoint-275/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..7f5922f672a4fe7269fe4085905b95cee959a1cb --- /dev/null +++ b/adapters/saved_llamaprosocial_dialog/checkpoint-275/trainer_state.json @@ -0,0 +1,294 @@ +{ + "best_metric": 1.7006632089614868, + "best_model_checkpoint": "/mnt/bn/qingyi-bn-lq/llama/saved_llamaprosocial_dialog/checkpoint-275", + "epoch": 2.381596752368065, + "global_step": 275, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.1, + "eval_loss": 1.9915313720703125, + "eval_runtime": 22.7483, + "eval_samples_per_second": 87.919, + "eval_steps_per_second": 1.407, + "step": 11 + }, + { + "epoch": 0.17, + "learning_rate": 0.000292814371257485, + "loss": 2.0439, + "step": 20 + }, + { + "epoch": 0.19, + "eval_loss": 1.8029985427856445, + "eval_runtime": 22.7395, + "eval_samples_per_second": 87.953, + "eval_steps_per_second": 1.407, + "step": 22 + }, + { + "epoch": 0.29, + "eval_loss": 1.7677525281906128, + "eval_runtime": 22.7246, + "eval_samples_per_second": 88.01, + "eval_steps_per_second": 1.408, + "step": 33 + }, + { + "epoch": 0.35, + "learning_rate": 0.00027485029940119756, + "loss": 1.7686, + "step": 40 + }, + { + "epoch": 0.38, + "eval_loss": 1.7513495683670044, + "eval_runtime": 22.772, + "eval_samples_per_second": 87.827, + "eval_steps_per_second": 1.405, + "step": 44 + }, + { + "epoch": 0.48, + "eval_loss": 1.7416645288467407, + "eval_runtime": 22.7234, + "eval_samples_per_second": 88.015, + "eval_steps_per_second": 1.408, + "step": 55 + }, + { + "epoch": 0.52, + "learning_rate": 0.00025688622754491017, + "loss": 1.737, + "step": 60 + }, + { + "epoch": 0.57, + "eval_loss": 1.734699010848999, + "eval_runtime": 22.7372, + "eval_samples_per_second": 87.962, + "eval_steps_per_second": 1.407, + "step": 66 + }, + { + "epoch": 0.67, + "eval_loss": 1.72977614402771, + "eval_runtime": 22.74, + "eval_samples_per_second": 87.951, + "eval_steps_per_second": 1.407, + "step": 77 + }, + { + "epoch": 0.69, + "learning_rate": 0.00023892215568862272, + "loss": 1.7289, + "step": 80 + }, + { + "epoch": 0.76, + "eval_loss": 1.725810170173645, + "eval_runtime": 22.7407, + "eval_samples_per_second": 87.948, + "eval_steps_per_second": 1.407, + "step": 88 + }, + { + "epoch": 0.86, + "eval_loss": 1.7222310304641724, + "eval_runtime": 22.733, + "eval_samples_per_second": 87.978, + "eval_steps_per_second": 1.408, + "step": 99 + }, + { + "epoch": 0.87, + "learning_rate": 0.00022095808383233533, + "loss": 1.7172, + "step": 100 + }, + { + "epoch": 0.95, + "eval_loss": 1.7189042568206787, + "eval_runtime": 22.7271, + "eval_samples_per_second": 88.001, + "eval_steps_per_second": 1.408, + "step": 110 + }, + { + "epoch": 1.04, + "learning_rate": 0.00020299401197604788, + "loss": 1.7127, + "step": 120 + }, + { + "epoch": 1.05, + "eval_loss": 1.7177015542984009, + "eval_runtime": 22.7303, + "eval_samples_per_second": 87.988, + "eval_steps_per_second": 1.408, + "step": 121 + }, + { + "epoch": 1.14, + "eval_loss": 1.7150254249572754, + "eval_runtime": 22.7281, + "eval_samples_per_second": 87.997, + "eval_steps_per_second": 1.408, + "step": 132 + }, + { + "epoch": 1.21, + "learning_rate": 0.00018502994011976046, + "loss": 1.7017, + "step": 140 + }, + { + "epoch": 1.24, + "eval_loss": 1.7136248350143433, + "eval_runtime": 22.732, + "eval_samples_per_second": 87.982, + "eval_steps_per_second": 1.408, + "step": 143 + }, + { + "epoch": 1.33, + "eval_loss": 1.7119090557098389, + "eval_runtime": 22.7069, + "eval_samples_per_second": 88.079, + "eval_steps_per_second": 1.409, + "step": 154 + }, + { + "epoch": 1.39, + "learning_rate": 0.00016706586826347302, + "loss": 1.7003, + "step": 160 + }, + { + "epoch": 1.43, + "eval_loss": 1.7102028131484985, + "eval_runtime": 22.9016, + "eval_samples_per_second": 87.33, + "eval_steps_per_second": 1.397, + "step": 165 + }, + { + "epoch": 1.52, + "eval_loss": 1.708575963973999, + "eval_runtime": 22.7175, + "eval_samples_per_second": 88.038, + "eval_steps_per_second": 1.409, + "step": 176 + }, + { + "epoch": 1.56, + "learning_rate": 0.0001491017964071856, + "loss": 1.7025, + "step": 180 + }, + { + "epoch": 1.62, + "eval_loss": 1.7076679468154907, + "eval_runtime": 22.792, + "eval_samples_per_second": 87.75, + "eval_steps_per_second": 1.404, + "step": 187 + }, + { + "epoch": 1.71, + "eval_loss": 1.7062067985534668, + "eval_runtime": 22.7054, + "eval_samples_per_second": 88.085, + "eval_steps_per_second": 1.409, + "step": 198 + }, + { + "epoch": 1.73, + "learning_rate": 0.0001311377245508982, + "loss": 1.6976, + "step": 200 + }, + { + "epoch": 1.81, + "eval_loss": 1.7049460411071777, + "eval_runtime": 22.765, + "eval_samples_per_second": 87.854, + "eval_steps_per_second": 1.406, + "step": 209 + }, + { + "epoch": 1.91, + "learning_rate": 0.00011317365269461076, + "loss": 1.6955, + "step": 220 + }, + { + "epoch": 1.91, + "eval_loss": 1.704172134399414, + "eval_runtime": 22.8464, + "eval_samples_per_second": 87.541, + "eval_steps_per_second": 1.401, + "step": 220 + }, + { + "epoch": 2.0, + "eval_loss": 1.7033213376998901, + "eval_runtime": 22.7378, + "eval_samples_per_second": 87.959, + "eval_steps_per_second": 1.407, + "step": 231 + }, + { + "epoch": 2.08, + "learning_rate": 9.520958083832335e-05, + "loss": 1.6893, + "step": 240 + }, + { + "epoch": 2.1, + "eval_loss": 1.7023844718933105, + "eval_runtime": 22.7421, + "eval_samples_per_second": 87.942, + "eval_steps_per_second": 1.407, + "step": 242 + }, + { + "epoch": 2.19, + "eval_loss": 1.7013720273971558, + "eval_runtime": 22.7212, + "eval_samples_per_second": 88.023, + "eval_steps_per_second": 1.408, + "step": 253 + }, + { + "epoch": 2.25, + "learning_rate": 7.724550898203592e-05, + "loss": 1.6853, + "step": 260 + }, + { + "epoch": 2.29, + "eval_loss": 1.7013109922409058, + "eval_runtime": 22.7306, + "eval_samples_per_second": 87.987, + "eval_steps_per_second": 1.408, + "step": 264 + }, + { + "epoch": 2.38, + "eval_loss": 1.7006632089614868, + "eval_runtime": 22.748, + "eval_samples_per_second": 87.92, + "eval_steps_per_second": 1.407, + "step": 275 + } + ], + "max_steps": 345, + "num_train_epochs": 3, + "total_flos": 2.0996154101680046e+18, + "trial_name": null, + "trial_params": null +} diff --git a/adapters/saved_llamaprosocial_dialog/checkpoint-275/training_args.bin b/adapters/saved_llamaprosocial_dialog/checkpoint-275/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..e1825c4b29d38c869d6c9c9df59d0fd2371fc9ad --- /dev/null +++ b/adapters/saved_llamaprosocial_dialog/checkpoint-275/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:147e8e99373e1b6d6dcda22258c683d1beff1e029d1f45adaed70feec69fdbb0 +size 3643 diff --git a/adapters/saved_llamaprosocial_dialog/checkpoint-286/optimizer.pt b/adapters/saved_llamaprosocial_dialog/checkpoint-286/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..84e0be0d38f8af2dfc107a0f445a18aa401eb621 --- /dev/null +++ b/adapters/saved_llamaprosocial_dialog/checkpoint-286/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:348dec8efb6a73dfa476791de6e56cba747fa0ab391efbcf8242198ac172d055 +size 33629893 diff --git a/adapters/saved_llamaprosocial_dialog/checkpoint-286/pytorch_model.bin b/adapters/saved_llamaprosocial_dialog/checkpoint-286/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..47fdb05320a901d31523b630dd64539a60bcd72c --- /dev/null +++ b/adapters/saved_llamaprosocial_dialog/checkpoint-286/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6bdf0bb74591800a2cc56a93a970f0da6871b7b0c728dfcae529a116c772b713 +size 16822989 diff --git a/adapters/saved_llamaprosocial_dialog/checkpoint-286/rng_state_0.pth b/adapters/saved_llamaprosocial_dialog/checkpoint-286/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..99548ce577fd61602daa93f4a8ce8a37d12b7bea --- /dev/null +++ b/adapters/saved_llamaprosocial_dialog/checkpoint-286/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:83bbd66a23608e0ca79a1151a28abc20417ad32697bb8ad7a9333dd9ff08b4d9 +size 14583 diff --git a/adapters/saved_llamaprosocial_dialog/checkpoint-286/rng_state_1.pth b/adapters/saved_llamaprosocial_dialog/checkpoint-286/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..424dde2960abf7c9a558e776a40f3b0574bcf1f4 --- /dev/null +++ b/adapters/saved_llamaprosocial_dialog/checkpoint-286/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:44a4ce4d8fabd0a42712ae5e5c45905c5092edb4154970fbe914d242f6409994 +size 14583 diff --git a/adapters/saved_llamaprosocial_dialog/checkpoint-286/rng_state_2.pth b/adapters/saved_llamaprosocial_dialog/checkpoint-286/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..ca45b47f0a0accfb30af08360b86e10b22aab44b --- /dev/null +++ b/adapters/saved_llamaprosocial_dialog/checkpoint-286/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6cd6ca3ec932c0b1f56ef00de002fffbca9c2bf1bfbbacf2735b31f2c77f595c +size 14583 diff --git a/adapters/saved_llamaprosocial_dialog/checkpoint-286/rng_state_3.pth b/adapters/saved_llamaprosocial_dialog/checkpoint-286/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..e7f282a3638f2a028a390f642ec1eeef2409eca6 --- /dev/null +++ b/adapters/saved_llamaprosocial_dialog/checkpoint-286/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:63c19886d86253d9322eb7a1564bcc7626d060dc00fda34d07d2ea4af9b04362 +size 14583 diff --git a/adapters/saved_llamaprosocial_dialog/checkpoint-286/rng_state_4.pth b/adapters/saved_llamaprosocial_dialog/checkpoint-286/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..a0764996f97fe4727015c28aa47b8b5f634e813a --- /dev/null +++ b/adapters/saved_llamaprosocial_dialog/checkpoint-286/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9a5cd6e69a23ebf7754af45c8099eeabefc4efd44330b5837ecd675345567fc4 +size 14583 diff --git a/adapters/saved_llamaprosocial_dialog/checkpoint-286/rng_state_5.pth b/adapters/saved_llamaprosocial_dialog/checkpoint-286/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..1b5111326a725312cabe32ba2228c889f405f4fd --- /dev/null +++ b/adapters/saved_llamaprosocial_dialog/checkpoint-286/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0dee47cd1d3140d439e30ec58cb99ee52a8721d784c606cbbf9f3d87e51185fd +size 14583 diff --git a/adapters/saved_llamaprosocial_dialog/checkpoint-286/rng_state_6.pth b/adapters/saved_llamaprosocial_dialog/checkpoint-286/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..d4188b104bf72cffc98421ff3a6cacb47f1e3799 --- /dev/null +++ b/adapters/saved_llamaprosocial_dialog/checkpoint-286/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eb5d13fbf88ae2d2b6a5da3e10c1de92b8b2429f20e7cfdf2af0f7fe0947b8c5 +size 14583 diff --git a/adapters/saved_llamaprosocial_dialog/checkpoint-286/rng_state_7.pth b/adapters/saved_llamaprosocial_dialog/checkpoint-286/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..964f6b35fc1412470cbf4dc5dbd1ee03b5d6505d --- /dev/null +++ b/adapters/saved_llamaprosocial_dialog/checkpoint-286/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0c3494b8b57fbaddfbc6415b3537eb0a4b7529609ae2e7a76980f79bdd7d38dd +size 14583 diff --git a/adapters/saved_llamaprosocial_dialog/checkpoint-286/scaler.pt b/adapters/saved_llamaprosocial_dialog/checkpoint-286/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..22d4ed34f55146b85e43078774ccb8bb09196a9f --- /dev/null +++ b/adapters/saved_llamaprosocial_dialog/checkpoint-286/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:23c94ae4cfb839ec800676f0de0569247ae88b93fa68a1096c3beef34ce994ae +size 557 diff --git a/adapters/saved_llamaprosocial_dialog/checkpoint-286/scheduler.pt b/adapters/saved_llamaprosocial_dialog/checkpoint-286/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..88d4df08b8ae78294ad0b7bd0dc491068464ab6c --- /dev/null +++ b/adapters/saved_llamaprosocial_dialog/checkpoint-286/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9f13a4f1a461c9e321e5dcc64cba22975e8402d9700fdce50ef9f5fbb3761416 +size 627 diff --git a/adapters/saved_llamaprosocial_dialog/checkpoint-286/trainer_state.json b/adapters/saved_llamaprosocial_dialog/checkpoint-286/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..b12fbf4e60c745fe191d1e7ddf57728cfefa0345 --- /dev/null +++ b/adapters/saved_llamaprosocial_dialog/checkpoint-286/trainer_state.json @@ -0,0 +1,308 @@ +{ + "best_metric": 1.7002646923065186, + "best_model_checkpoint": "/mnt/bn/qingyi-bn-lq/llama/saved_llamaprosocial_dialog/checkpoint-286", + "epoch": 2.4768606224627874, + "global_step": 286, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.1, + "eval_loss": 1.9915313720703125, + "eval_runtime": 22.7483, + "eval_samples_per_second": 87.919, + "eval_steps_per_second": 1.407, + "step": 11 + }, + { + "epoch": 0.17, + "learning_rate": 0.000292814371257485, + "loss": 2.0439, + "step": 20 + }, + { + "epoch": 0.19, + "eval_loss": 1.8029985427856445, + "eval_runtime": 22.7395, + "eval_samples_per_second": 87.953, + "eval_steps_per_second": 1.407, + "step": 22 + }, + { + "epoch": 0.29, + "eval_loss": 1.7677525281906128, + "eval_runtime": 22.7246, + "eval_samples_per_second": 88.01, + "eval_steps_per_second": 1.408, + "step": 33 + }, + { + "epoch": 0.35, + "learning_rate": 0.00027485029940119756, + "loss": 1.7686, + "step": 40 + }, + { + "epoch": 0.38, + "eval_loss": 1.7513495683670044, + "eval_runtime": 22.772, + "eval_samples_per_second": 87.827, + "eval_steps_per_second": 1.405, + "step": 44 + }, + { + "epoch": 0.48, + "eval_loss": 1.7416645288467407, + "eval_runtime": 22.7234, + "eval_samples_per_second": 88.015, + "eval_steps_per_second": 1.408, + "step": 55 + }, + { + "epoch": 0.52, + "learning_rate": 0.00025688622754491017, + "loss": 1.737, + "step": 60 + }, + { + "epoch": 0.57, + "eval_loss": 1.734699010848999, + "eval_runtime": 22.7372, + "eval_samples_per_second": 87.962, + "eval_steps_per_second": 1.407, + "step": 66 + }, + { + "epoch": 0.67, + "eval_loss": 1.72977614402771, + "eval_runtime": 22.74, + "eval_samples_per_second": 87.951, + "eval_steps_per_second": 1.407, + "step": 77 + }, + { + "epoch": 0.69, + "learning_rate": 0.00023892215568862272, + "loss": 1.7289, + "step": 80 + }, + { + "epoch": 0.76, + "eval_loss": 1.725810170173645, + "eval_runtime": 22.7407, + "eval_samples_per_second": 87.948, + "eval_steps_per_second": 1.407, + "step": 88 + }, + { + "epoch": 0.86, + "eval_loss": 1.7222310304641724, + "eval_runtime": 22.733, + "eval_samples_per_second": 87.978, + "eval_steps_per_second": 1.408, + "step": 99 + }, + { + "epoch": 0.87, + "learning_rate": 0.00022095808383233533, + "loss": 1.7172, + "step": 100 + }, + { + "epoch": 0.95, + "eval_loss": 1.7189042568206787, + "eval_runtime": 22.7271, + "eval_samples_per_second": 88.001, + "eval_steps_per_second": 1.408, + "step": 110 + }, + { + "epoch": 1.04, + "learning_rate": 0.00020299401197604788, + "loss": 1.7127, + "step": 120 + }, + { + "epoch": 1.05, + "eval_loss": 1.7177015542984009, + "eval_runtime": 22.7303, + "eval_samples_per_second": 87.988, + "eval_steps_per_second": 1.408, + "step": 121 + }, + { + "epoch": 1.14, + "eval_loss": 1.7150254249572754, + "eval_runtime": 22.7281, + "eval_samples_per_second": 87.997, + "eval_steps_per_second": 1.408, + "step": 132 + }, + { + "epoch": 1.21, + "learning_rate": 0.00018502994011976046, + "loss": 1.7017, + "step": 140 + }, + { + "epoch": 1.24, + "eval_loss": 1.7136248350143433, + "eval_runtime": 22.732, + "eval_samples_per_second": 87.982, + "eval_steps_per_second": 1.408, + "step": 143 + }, + { + "epoch": 1.33, + "eval_loss": 1.7119090557098389, + "eval_runtime": 22.7069, + "eval_samples_per_second": 88.079, + "eval_steps_per_second": 1.409, + "step": 154 + }, + { + "epoch": 1.39, + "learning_rate": 0.00016706586826347302, + "loss": 1.7003, + "step": 160 + }, + { + "epoch": 1.43, + "eval_loss": 1.7102028131484985, + "eval_runtime": 22.9016, + "eval_samples_per_second": 87.33, + "eval_steps_per_second": 1.397, + "step": 165 + }, + { + "epoch": 1.52, + "eval_loss": 1.708575963973999, + "eval_runtime": 22.7175, + "eval_samples_per_second": 88.038, + "eval_steps_per_second": 1.409, + "step": 176 + }, + { + "epoch": 1.56, + "learning_rate": 0.0001491017964071856, + "loss": 1.7025, + "step": 180 + }, + { + "epoch": 1.62, + "eval_loss": 1.7076679468154907, + "eval_runtime": 22.792, + "eval_samples_per_second": 87.75, + "eval_steps_per_second": 1.404, + "step": 187 + }, + { + "epoch": 1.71, + "eval_loss": 1.7062067985534668, + "eval_runtime": 22.7054, + "eval_samples_per_second": 88.085, + "eval_steps_per_second": 1.409, + "step": 198 + }, + { + "epoch": 1.73, + "learning_rate": 0.0001311377245508982, + "loss": 1.6976, + "step": 200 + }, + { + "epoch": 1.81, + "eval_loss": 1.7049460411071777, + "eval_runtime": 22.765, + "eval_samples_per_second": 87.854, + "eval_steps_per_second": 1.406, + "step": 209 + }, + { + "epoch": 1.91, + "learning_rate": 0.00011317365269461076, + "loss": 1.6955, + "step": 220 + }, + { + "epoch": 1.91, + "eval_loss": 1.704172134399414, + "eval_runtime": 22.8464, + "eval_samples_per_second": 87.541, + "eval_steps_per_second": 1.401, + "step": 220 + }, + { + "epoch": 2.0, + "eval_loss": 1.7033213376998901, + "eval_runtime": 22.7378, + "eval_samples_per_second": 87.959, + "eval_steps_per_second": 1.407, + "step": 231 + }, + { + "epoch": 2.08, + "learning_rate": 9.520958083832335e-05, + "loss": 1.6893, + "step": 240 + }, + { + "epoch": 2.1, + "eval_loss": 1.7023844718933105, + "eval_runtime": 22.7421, + "eval_samples_per_second": 87.942, + "eval_steps_per_second": 1.407, + "step": 242 + }, + { + "epoch": 2.19, + "eval_loss": 1.7013720273971558, + "eval_runtime": 22.7212, + "eval_samples_per_second": 88.023, + "eval_steps_per_second": 1.408, + "step": 253 + }, + { + "epoch": 2.25, + "learning_rate": 7.724550898203592e-05, + "loss": 1.6853, + "step": 260 + }, + { + "epoch": 2.29, + "eval_loss": 1.7013109922409058, + "eval_runtime": 22.7306, + "eval_samples_per_second": 87.987, + "eval_steps_per_second": 1.408, + "step": 264 + }, + { + "epoch": 2.38, + "eval_loss": 1.7006632089614868, + "eval_runtime": 22.748, + "eval_samples_per_second": 87.92, + "eval_steps_per_second": 1.407, + "step": 275 + }, + { + "epoch": 2.42, + "learning_rate": 5.92814371257485e-05, + "loss": 1.6877, + "step": 280 + }, + { + "epoch": 2.48, + "eval_loss": 1.7002646923065186, + "eval_runtime": 22.7267, + "eval_samples_per_second": 88.002, + "eval_steps_per_second": 1.408, + "step": 286 + } + ], + "max_steps": 345, + "num_train_epochs": 3, + "total_flos": 2.1836540345214566e+18, + "trial_name": null, + "trial_params": null +} diff --git a/adapters/saved_llamaprosocial_dialog/checkpoint-286/training_args.bin b/adapters/saved_llamaprosocial_dialog/checkpoint-286/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..e1825c4b29d38c869d6c9c9df59d0fd2371fc9ad --- /dev/null +++ b/adapters/saved_llamaprosocial_dialog/checkpoint-286/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:147e8e99373e1b6d6dcda22258c683d1beff1e029d1f45adaed70feec69fdbb0 +size 3643 diff --git a/adapters/saved_llamaprosocial_dialog/checkpoint-297/optimizer.pt b/adapters/saved_llamaprosocial_dialog/checkpoint-297/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..7163bc021180bc9dec85a162f66b9287d979cfe9 --- /dev/null +++ b/adapters/saved_llamaprosocial_dialog/checkpoint-297/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:44271a47791a0e7e6121a739aaa7d4b9ab76433459a49763b71e0e2197c62ac0 +size 33629893 diff --git a/adapters/saved_llamaprosocial_dialog/checkpoint-297/pytorch_model.bin b/adapters/saved_llamaprosocial_dialog/checkpoint-297/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..0cc4dc5dfb7385425bdfe54db73ca117cb9634ec --- /dev/null +++ b/adapters/saved_llamaprosocial_dialog/checkpoint-297/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a0d9476e98502f0277271bd6c3a2055f1b552a894ef230bed8f3a2486c8f54e3 +size 16822989 diff --git a/adapters/saved_llamaprosocial_dialog/checkpoint-297/rng_state_0.pth b/adapters/saved_llamaprosocial_dialog/checkpoint-297/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..6231ec54aab0db594d3f18b9f4ff64259c9b8ca0 --- /dev/null +++ b/adapters/saved_llamaprosocial_dialog/checkpoint-297/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8319165cc5c70c45de496437f58457ad12a1aa8aea19d8c4ed176d853c8e470a +size 14583 diff --git a/adapters/saved_llamaprosocial_dialog/checkpoint-297/rng_state_1.pth b/adapters/saved_llamaprosocial_dialog/checkpoint-297/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..fd6724607e673b4593f217b71a75143d7a07228a --- /dev/null +++ b/adapters/saved_llamaprosocial_dialog/checkpoint-297/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6c5de54af52b82c91b9c910ec93f05d4c28a9aa57447ded8b3d254aaaa5c8dce +size 14583 diff --git a/adapters/saved_llamaprosocial_dialog/checkpoint-297/rng_state_2.pth b/adapters/saved_llamaprosocial_dialog/checkpoint-297/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..9f072fd890321c145aa729febafac223e34ba8af --- /dev/null +++ b/adapters/saved_llamaprosocial_dialog/checkpoint-297/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bff7549d568661e9c698f8edecaa62d8380f48fb1778f9259020a68f37254ab5 +size 14583 diff --git a/adapters/saved_llamaprosocial_dialog/checkpoint-297/rng_state_3.pth b/adapters/saved_llamaprosocial_dialog/checkpoint-297/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..474c9fdbb48598c17661d6b0ed018e026d3da887 --- /dev/null +++ b/adapters/saved_llamaprosocial_dialog/checkpoint-297/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a2ef2bd4c43d6ce1f9f329b7932b0ea835710437afe119c359f0aecc65a01963 +size 14583 diff --git a/adapters/saved_llamaprosocial_dialog/checkpoint-297/rng_state_4.pth b/adapters/saved_llamaprosocial_dialog/checkpoint-297/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..2c587456a89a04836e2bbae331b3ceb31e7f42db --- /dev/null +++ b/adapters/saved_llamaprosocial_dialog/checkpoint-297/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:81a95b553d4ccee652304f2b51c93f0093a5a14c5646ece67bc5122ae2bb7a44 +size 14583 diff --git a/adapters/saved_llamaprosocial_dialog/checkpoint-297/rng_state_5.pth b/adapters/saved_llamaprosocial_dialog/checkpoint-297/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..957158b06e59845377c5ce6cedb612f4d64a3c62 --- /dev/null +++ b/adapters/saved_llamaprosocial_dialog/checkpoint-297/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:40e3d4cec9fa836221d415af57a5b6d4692f812fbc353196d4f77321f7ee8947 +size 14583 diff --git a/adapters/saved_llamaprosocial_dialog/checkpoint-297/rng_state_6.pth b/adapters/saved_llamaprosocial_dialog/checkpoint-297/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..a8abdfdc495e08c2276453008d7dfe94624d2d01 --- /dev/null +++ b/adapters/saved_llamaprosocial_dialog/checkpoint-297/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b39007bea1090633b99e47739bcfd1015b11379c49b986b43e475423b34db1c0 +size 14583 diff --git a/adapters/saved_llamaprosocial_dialog/checkpoint-297/rng_state_7.pth b/adapters/saved_llamaprosocial_dialog/checkpoint-297/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..e3024606f06f758156258d5869152db1347ad8ce --- /dev/null +++ b/adapters/saved_llamaprosocial_dialog/checkpoint-297/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ed9f8d1ae2c8df30631dde1ee0487e58063e79fc260e9be6e8041f875b454c97 +size 14583 diff --git a/adapters/saved_llamaprosocial_dialog/checkpoint-297/scaler.pt b/adapters/saved_llamaprosocial_dialog/checkpoint-297/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..8636c7e4aaee44e574720a6f178112e65243d1fe --- /dev/null +++ b/adapters/saved_llamaprosocial_dialog/checkpoint-297/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eeff284780a9f0736033f69513eb1de0d5537e3d93b1e3143be8d38cb1975e9a +size 557 diff --git a/adapters/saved_llamaprosocial_dialog/checkpoint-297/scheduler.pt b/adapters/saved_llamaprosocial_dialog/checkpoint-297/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..6582b737aa2ee7053ceaac9c81309ceb458e389f --- /dev/null +++ b/adapters/saved_llamaprosocial_dialog/checkpoint-297/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:df658f19192cd065ea9812641ad55763989fc347bb9e3c3182e103cdf6851ca7 +size 627 diff --git a/adapters/saved_llamaprosocial_dialog/checkpoint-297/trainer_state.json b/adapters/saved_llamaprosocial_dialog/checkpoint-297/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..e8076be883773583a32e28fb5ae9b08dc47c1280 --- /dev/null +++ b/adapters/saved_llamaprosocial_dialog/checkpoint-297/trainer_state.json @@ -0,0 +1,316 @@ +{ + "best_metric": 1.6997302770614624, + "best_model_checkpoint": "/mnt/bn/qingyi-bn-lq/llama/saved_llamaprosocial_dialog/checkpoint-297", + "epoch": 2.57212449255751, + "global_step": 297, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.1, + "eval_loss": 1.9915313720703125, + "eval_runtime": 22.7483, + "eval_samples_per_second": 87.919, + "eval_steps_per_second": 1.407, + "step": 11 + }, + { + "epoch": 0.17, + "learning_rate": 0.000292814371257485, + "loss": 2.0439, + "step": 20 + }, + { + "epoch": 0.19, + "eval_loss": 1.8029985427856445, + "eval_runtime": 22.7395, + "eval_samples_per_second": 87.953, + "eval_steps_per_second": 1.407, + "step": 22 + }, + { + "epoch": 0.29, + "eval_loss": 1.7677525281906128, + "eval_runtime": 22.7246, + "eval_samples_per_second": 88.01, + "eval_steps_per_second": 1.408, + "step": 33 + }, + { + "epoch": 0.35, + "learning_rate": 0.00027485029940119756, + "loss": 1.7686, + "step": 40 + }, + { + "epoch": 0.38, + "eval_loss": 1.7513495683670044, + "eval_runtime": 22.772, + "eval_samples_per_second": 87.827, + "eval_steps_per_second": 1.405, + "step": 44 + }, + { + "epoch": 0.48, + "eval_loss": 1.7416645288467407, + "eval_runtime": 22.7234, + "eval_samples_per_second": 88.015, + "eval_steps_per_second": 1.408, + "step": 55 + }, + { + "epoch": 0.52, + "learning_rate": 0.00025688622754491017, + "loss": 1.737, + "step": 60 + }, + { + "epoch": 0.57, + "eval_loss": 1.734699010848999, + "eval_runtime": 22.7372, + "eval_samples_per_second": 87.962, + "eval_steps_per_second": 1.407, + "step": 66 + }, + { + "epoch": 0.67, + "eval_loss": 1.72977614402771, + "eval_runtime": 22.74, + "eval_samples_per_second": 87.951, + "eval_steps_per_second": 1.407, + "step": 77 + }, + { + "epoch": 0.69, + "learning_rate": 0.00023892215568862272, + "loss": 1.7289, + "step": 80 + }, + { + "epoch": 0.76, + "eval_loss": 1.725810170173645, + "eval_runtime": 22.7407, + "eval_samples_per_second": 87.948, + "eval_steps_per_second": 1.407, + "step": 88 + }, + { + "epoch": 0.86, + "eval_loss": 1.7222310304641724, + "eval_runtime": 22.733, + "eval_samples_per_second": 87.978, + "eval_steps_per_second": 1.408, + "step": 99 + }, + { + "epoch": 0.87, + "learning_rate": 0.00022095808383233533, + "loss": 1.7172, + "step": 100 + }, + { + "epoch": 0.95, + "eval_loss": 1.7189042568206787, + "eval_runtime": 22.7271, + "eval_samples_per_second": 88.001, + "eval_steps_per_second": 1.408, + "step": 110 + }, + { + "epoch": 1.04, + "learning_rate": 0.00020299401197604788, + "loss": 1.7127, + "step": 120 + }, + { + "epoch": 1.05, + "eval_loss": 1.7177015542984009, + "eval_runtime": 22.7303, + "eval_samples_per_second": 87.988, + "eval_steps_per_second": 1.408, + "step": 121 + }, + { + "epoch": 1.14, + "eval_loss": 1.7150254249572754, + "eval_runtime": 22.7281, + "eval_samples_per_second": 87.997, + "eval_steps_per_second": 1.408, + "step": 132 + }, + { + "epoch": 1.21, + "learning_rate": 0.00018502994011976046, + "loss": 1.7017, + "step": 140 + }, + { + "epoch": 1.24, + "eval_loss": 1.7136248350143433, + "eval_runtime": 22.732, + "eval_samples_per_second": 87.982, + "eval_steps_per_second": 1.408, + "step": 143 + }, + { + "epoch": 1.33, + "eval_loss": 1.7119090557098389, + "eval_runtime": 22.7069, + "eval_samples_per_second": 88.079, + "eval_steps_per_second": 1.409, + "step": 154 + }, + { + "epoch": 1.39, + "learning_rate": 0.00016706586826347302, + "loss": 1.7003, + "step": 160 + }, + { + "epoch": 1.43, + "eval_loss": 1.7102028131484985, + "eval_runtime": 22.9016, + "eval_samples_per_second": 87.33, + "eval_steps_per_second": 1.397, + "step": 165 + }, + { + "epoch": 1.52, + "eval_loss": 1.708575963973999, + "eval_runtime": 22.7175, + "eval_samples_per_second": 88.038, + "eval_steps_per_second": 1.409, + "step": 176 + }, + { + "epoch": 1.56, + "learning_rate": 0.0001491017964071856, + "loss": 1.7025, + "step": 180 + }, + { + "epoch": 1.62, + "eval_loss": 1.7076679468154907, + "eval_runtime": 22.792, + "eval_samples_per_second": 87.75, + "eval_steps_per_second": 1.404, + "step": 187 + }, + { + "epoch": 1.71, + "eval_loss": 1.7062067985534668, + "eval_runtime": 22.7054, + "eval_samples_per_second": 88.085, + "eval_steps_per_second": 1.409, + "step": 198 + }, + { + "epoch": 1.73, + "learning_rate": 0.0001311377245508982, + "loss": 1.6976, + "step": 200 + }, + { + "epoch": 1.81, + "eval_loss": 1.7049460411071777, + "eval_runtime": 22.765, + "eval_samples_per_second": 87.854, + "eval_steps_per_second": 1.406, + "step": 209 + }, + { + "epoch": 1.91, + "learning_rate": 0.00011317365269461076, + "loss": 1.6955, + "step": 220 + }, + { + "epoch": 1.91, + "eval_loss": 1.704172134399414, + "eval_runtime": 22.8464, + "eval_samples_per_second": 87.541, + "eval_steps_per_second": 1.401, + "step": 220 + }, + { + "epoch": 2.0, + "eval_loss": 1.7033213376998901, + "eval_runtime": 22.7378, + "eval_samples_per_second": 87.959, + "eval_steps_per_second": 1.407, + "step": 231 + }, + { + "epoch": 2.08, + "learning_rate": 9.520958083832335e-05, + "loss": 1.6893, + "step": 240 + }, + { + "epoch": 2.1, + "eval_loss": 1.7023844718933105, + "eval_runtime": 22.7421, + "eval_samples_per_second": 87.942, + "eval_steps_per_second": 1.407, + "step": 242 + }, + { + "epoch": 2.19, + "eval_loss": 1.7013720273971558, + "eval_runtime": 22.7212, + "eval_samples_per_second": 88.023, + "eval_steps_per_second": 1.408, + "step": 253 + }, + { + "epoch": 2.25, + "learning_rate": 7.724550898203592e-05, + "loss": 1.6853, + "step": 260 + }, + { + "epoch": 2.29, + "eval_loss": 1.7013109922409058, + "eval_runtime": 22.7306, + "eval_samples_per_second": 87.987, + "eval_steps_per_second": 1.408, + "step": 264 + }, + { + "epoch": 2.38, + "eval_loss": 1.7006632089614868, + "eval_runtime": 22.748, + "eval_samples_per_second": 87.92, + "eval_steps_per_second": 1.407, + "step": 275 + }, + { + "epoch": 2.42, + "learning_rate": 5.92814371257485e-05, + "loss": 1.6877, + "step": 280 + }, + { + "epoch": 2.48, + "eval_loss": 1.7002646923065186, + "eval_runtime": 22.7267, + "eval_samples_per_second": 88.002, + "eval_steps_per_second": 1.408, + "step": 286 + }, + { + "epoch": 2.57, + "eval_loss": 1.6997302770614624, + "eval_runtime": 22.7256, + "eval_samples_per_second": 88.006, + "eval_steps_per_second": 1.408, + "step": 297 + } + ], + "max_steps": 345, + "num_train_epochs": 3, + "total_flos": 2.268056979475792e+18, + "trial_name": null, + "trial_params": null +} diff --git a/adapters/saved_llamaprosocial_dialog/checkpoint-297/training_args.bin b/adapters/saved_llamaprosocial_dialog/checkpoint-297/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..e1825c4b29d38c869d6c9c9df59d0fd2371fc9ad --- /dev/null +++ b/adapters/saved_llamaprosocial_dialog/checkpoint-297/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:147e8e99373e1b6d6dcda22258c683d1beff1e029d1f45adaed70feec69fdbb0 +size 3643 diff --git a/adapters/saved_llamaprosocial_dialog/checkpoint-308/optimizer.pt b/adapters/saved_llamaprosocial_dialog/checkpoint-308/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..6cd9a68377eb96dc6fb6b6853ff8f817fafc3e44 --- /dev/null +++ b/adapters/saved_llamaprosocial_dialog/checkpoint-308/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e836b57da658fe495c33d5fbcf1bba137080978399d229fb643a8654c5c1aefd +size 33629893 diff --git a/adapters/saved_llamaprosocial_dialog/checkpoint-308/pytorch_model.bin b/adapters/saved_llamaprosocial_dialog/checkpoint-308/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..32015180f4280c43bab17b0f70cf9843773adf2f --- /dev/null +++ b/adapters/saved_llamaprosocial_dialog/checkpoint-308/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:308c91b4c5abc4526c4fb4725e877282aabcbae4937abdcec6a053fe035a805c +size 16822989 diff --git a/adapters/saved_llamaprosocial_dialog/checkpoint-308/rng_state_0.pth b/adapters/saved_llamaprosocial_dialog/checkpoint-308/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..56c7bd0b268f60d8e02333dc165fec68cd5d7aec --- /dev/null +++ b/adapters/saved_llamaprosocial_dialog/checkpoint-308/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:07407b12a841bc687069afd76a1c8a73a29200578872176d4947898301a3bea0 +size 14583 diff --git a/adapters/saved_llamaprosocial_dialog/checkpoint-308/rng_state_1.pth b/adapters/saved_llamaprosocial_dialog/checkpoint-308/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..221a5c4611995d67eabf8b56cfb73a459544660c --- /dev/null +++ b/adapters/saved_llamaprosocial_dialog/checkpoint-308/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c7006fb3443ab16a21a6a276c902dc7ba307bb6fbe8e087e814766045f503215 +size 14583 diff --git a/adapters/saved_llamaprosocial_dialog/checkpoint-308/rng_state_2.pth b/adapters/saved_llamaprosocial_dialog/checkpoint-308/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..93d062ff047cb9c5cb7c7602ae027c5bd28525a3 --- /dev/null +++ b/adapters/saved_llamaprosocial_dialog/checkpoint-308/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e8214bc11402bb8bff1b9c5de23f3505d1e3196f6b21cbd0b74abe990ac0cc3b +size 14583 diff --git a/adapters/saved_llamaprosocial_dialog/checkpoint-308/rng_state_3.pth b/adapters/saved_llamaprosocial_dialog/checkpoint-308/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..9ea33b1d9e0fce3749eee9c4ae715b24571f865b --- /dev/null +++ b/adapters/saved_llamaprosocial_dialog/checkpoint-308/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ccd04aa0e1be5c5032a9ea9b09445b5d6550451d87679734bfaa7da74daab365 +size 14583 diff --git a/adapters/saved_llamaprosocial_dialog/checkpoint-308/rng_state_4.pth b/adapters/saved_llamaprosocial_dialog/checkpoint-308/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..059353a9a00263da9d321482f32c19aaed663531 --- /dev/null +++ b/adapters/saved_llamaprosocial_dialog/checkpoint-308/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe4ad1482c422ec0a104f53b8d0ded35ba15ae841b5cecfb7f33865d737c1898 +size 14583 diff --git a/adapters/saved_llamaprosocial_dialog/checkpoint-308/rng_state_5.pth b/adapters/saved_llamaprosocial_dialog/checkpoint-308/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..28e853fcbc4ba46d8f6de31204cb299a8f447107 --- /dev/null +++ b/adapters/saved_llamaprosocial_dialog/checkpoint-308/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0233910265a0bb23161b873cca945e36f492c8c59372096c4fe1fb8e7cb33635 +size 14583 diff --git a/adapters/saved_llamaprosocial_dialog/checkpoint-308/rng_state_6.pth b/adapters/saved_llamaprosocial_dialog/checkpoint-308/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..34215c7eb4f016add3ff4f0f24d3e9ad32d54569 --- /dev/null +++ b/adapters/saved_llamaprosocial_dialog/checkpoint-308/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4822812e8eeabdf2a721e58e4820dfaa1da26913051c69d3ddffbf769db88b7b +size 14583 diff --git a/adapters/saved_llamaprosocial_dialog/checkpoint-308/rng_state_7.pth b/adapters/saved_llamaprosocial_dialog/checkpoint-308/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..84a825fdf6a613fdb5cb8b70e67214452431bed3 --- /dev/null +++ b/adapters/saved_llamaprosocial_dialog/checkpoint-308/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c77956391ee3fbc12d364e730ff644ab3d175c8d56025af97469583ee6418953 +size 14583 diff --git a/adapters/saved_llamaprosocial_dialog/checkpoint-308/scaler.pt b/adapters/saved_llamaprosocial_dialog/checkpoint-308/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..432d3e74e9d32d78d2f933726dc9ba8010a377f4 --- /dev/null +++ b/adapters/saved_llamaprosocial_dialog/checkpoint-308/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ec340539b9bebe0ee525736b68c67d3394dc34da48553961b83295677b38aef9 +size 557 diff --git a/adapters/saved_llamaprosocial_dialog/checkpoint-308/scheduler.pt b/adapters/saved_llamaprosocial_dialog/checkpoint-308/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..961910a2a59b148f3104bc349b679616d3faa746 --- /dev/null +++ b/adapters/saved_llamaprosocial_dialog/checkpoint-308/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d468fe7993285beaa37883ee6760b174d385241e26b7e03c7704544238a03c6c +size 627 diff --git a/adapters/saved_llamaprosocial_dialog/checkpoint-308/trainer_state.json b/adapters/saved_llamaprosocial_dialog/checkpoint-308/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..4e3b27639433474b53e39943d20e172405fd785d --- /dev/null +++ b/adapters/saved_llamaprosocial_dialog/checkpoint-308/trainer_state.json @@ -0,0 +1,330 @@ +{ + "best_metric": 1.6993123292922974, + "best_model_checkpoint": "/mnt/bn/qingyi-bn-lq/llama/saved_llamaprosocial_dialog/checkpoint-308", + "epoch": 2.6673883626522326, + "global_step": 308, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.1, + "eval_loss": 1.9915313720703125, + "eval_runtime": 22.7483, + "eval_samples_per_second": 87.919, + "eval_steps_per_second": 1.407, + "step": 11 + }, + { + "epoch": 0.17, + "learning_rate": 0.000292814371257485, + "loss": 2.0439, + "step": 20 + }, + { + "epoch": 0.19, + "eval_loss": 1.8029985427856445, + "eval_runtime": 22.7395, + "eval_samples_per_second": 87.953, + "eval_steps_per_second": 1.407, + "step": 22 + }, + { + "epoch": 0.29, + "eval_loss": 1.7677525281906128, + "eval_runtime": 22.7246, + "eval_samples_per_second": 88.01, + "eval_steps_per_second": 1.408, + "step": 33 + }, + { + "epoch": 0.35, + "learning_rate": 0.00027485029940119756, + "loss": 1.7686, + "step": 40 + }, + { + "epoch": 0.38, + "eval_loss": 1.7513495683670044, + "eval_runtime": 22.772, + "eval_samples_per_second": 87.827, + "eval_steps_per_second": 1.405, + "step": 44 + }, + { + "epoch": 0.48, + "eval_loss": 1.7416645288467407, + "eval_runtime": 22.7234, + "eval_samples_per_second": 88.015, + "eval_steps_per_second": 1.408, + "step": 55 + }, + { + "epoch": 0.52, + "learning_rate": 0.00025688622754491017, + "loss": 1.737, + "step": 60 + }, + { + "epoch": 0.57, + "eval_loss": 1.734699010848999, + "eval_runtime": 22.7372, + "eval_samples_per_second": 87.962, + "eval_steps_per_second": 1.407, + "step": 66 + }, + { + "epoch": 0.67, + "eval_loss": 1.72977614402771, + "eval_runtime": 22.74, + "eval_samples_per_second": 87.951, + "eval_steps_per_second": 1.407, + "step": 77 + }, + { + "epoch": 0.69, + "learning_rate": 0.00023892215568862272, + "loss": 1.7289, + "step": 80 + }, + { + "epoch": 0.76, + "eval_loss": 1.725810170173645, + "eval_runtime": 22.7407, + "eval_samples_per_second": 87.948, + "eval_steps_per_second": 1.407, + "step": 88 + }, + { + "epoch": 0.86, + "eval_loss": 1.7222310304641724, + "eval_runtime": 22.733, + "eval_samples_per_second": 87.978, + "eval_steps_per_second": 1.408, + "step": 99 + }, + { + "epoch": 0.87, + "learning_rate": 0.00022095808383233533, + "loss": 1.7172, + "step": 100 + }, + { + "epoch": 0.95, + "eval_loss": 1.7189042568206787, + "eval_runtime": 22.7271, + "eval_samples_per_second": 88.001, + "eval_steps_per_second": 1.408, + "step": 110 + }, + { + "epoch": 1.04, + "learning_rate": 0.00020299401197604788, + "loss": 1.7127, + "step": 120 + }, + { + "epoch": 1.05, + "eval_loss": 1.7177015542984009, + "eval_runtime": 22.7303, + "eval_samples_per_second": 87.988, + "eval_steps_per_second": 1.408, + "step": 121 + }, + { + "epoch": 1.14, + "eval_loss": 1.7150254249572754, + "eval_runtime": 22.7281, + "eval_samples_per_second": 87.997, + "eval_steps_per_second": 1.408, + "step": 132 + }, + { + "epoch": 1.21, + "learning_rate": 0.00018502994011976046, + "loss": 1.7017, + "step": 140 + }, + { + "epoch": 1.24, + "eval_loss": 1.7136248350143433, + "eval_runtime": 22.732, + "eval_samples_per_second": 87.982, + "eval_steps_per_second": 1.408, + "step": 143 + }, + { + "epoch": 1.33, + "eval_loss": 1.7119090557098389, + "eval_runtime": 22.7069, + "eval_samples_per_second": 88.079, + "eval_steps_per_second": 1.409, + "step": 154 + }, + { + "epoch": 1.39, + "learning_rate": 0.00016706586826347302, + "loss": 1.7003, + "step": 160 + }, + { + "epoch": 1.43, + "eval_loss": 1.7102028131484985, + "eval_runtime": 22.9016, + "eval_samples_per_second": 87.33, + "eval_steps_per_second": 1.397, + "step": 165 + }, + { + "epoch": 1.52, + "eval_loss": 1.708575963973999, + "eval_runtime": 22.7175, + "eval_samples_per_second": 88.038, + "eval_steps_per_second": 1.409, + "step": 176 + }, + { + "epoch": 1.56, + "learning_rate": 0.0001491017964071856, + "loss": 1.7025, + "step": 180 + }, + { + "epoch": 1.62, + "eval_loss": 1.7076679468154907, + "eval_runtime": 22.792, + "eval_samples_per_second": 87.75, + "eval_steps_per_second": 1.404, + "step": 187 + }, + { + "epoch": 1.71, + "eval_loss": 1.7062067985534668, + "eval_runtime": 22.7054, + "eval_samples_per_second": 88.085, + "eval_steps_per_second": 1.409, + "step": 198 + }, + { + "epoch": 1.73, + "learning_rate": 0.0001311377245508982, + "loss": 1.6976, + "step": 200 + }, + { + "epoch": 1.81, + "eval_loss": 1.7049460411071777, + "eval_runtime": 22.765, + "eval_samples_per_second": 87.854, + "eval_steps_per_second": 1.406, + "step": 209 + }, + { + "epoch": 1.91, + "learning_rate": 0.00011317365269461076, + "loss": 1.6955, + "step": 220 + }, + { + "epoch": 1.91, + "eval_loss": 1.704172134399414, + "eval_runtime": 22.8464, + "eval_samples_per_second": 87.541, + "eval_steps_per_second": 1.401, + "step": 220 + }, + { + "epoch": 2.0, + "eval_loss": 1.7033213376998901, + "eval_runtime": 22.7378, + "eval_samples_per_second": 87.959, + "eval_steps_per_second": 1.407, + "step": 231 + }, + { + "epoch": 2.08, + "learning_rate": 9.520958083832335e-05, + "loss": 1.6893, + "step": 240 + }, + { + "epoch": 2.1, + "eval_loss": 1.7023844718933105, + "eval_runtime": 22.7421, + "eval_samples_per_second": 87.942, + "eval_steps_per_second": 1.407, + "step": 242 + }, + { + "epoch": 2.19, + "eval_loss": 1.7013720273971558, + "eval_runtime": 22.7212, + "eval_samples_per_second": 88.023, + "eval_steps_per_second": 1.408, + "step": 253 + }, + { + "epoch": 2.25, + "learning_rate": 7.724550898203592e-05, + "loss": 1.6853, + "step": 260 + }, + { + "epoch": 2.29, + "eval_loss": 1.7013109922409058, + "eval_runtime": 22.7306, + "eval_samples_per_second": 87.987, + "eval_steps_per_second": 1.408, + "step": 264 + }, + { + "epoch": 2.38, + "eval_loss": 1.7006632089614868, + "eval_runtime": 22.748, + "eval_samples_per_second": 87.92, + "eval_steps_per_second": 1.407, + "step": 275 + }, + { + "epoch": 2.42, + "learning_rate": 5.92814371257485e-05, + "loss": 1.6877, + "step": 280 + }, + { + "epoch": 2.48, + "eval_loss": 1.7002646923065186, + "eval_runtime": 22.7267, + "eval_samples_per_second": 88.002, + "eval_steps_per_second": 1.408, + "step": 286 + }, + { + "epoch": 2.57, + "eval_loss": 1.6997302770614624, + "eval_runtime": 22.7256, + "eval_samples_per_second": 88.006, + "eval_steps_per_second": 1.408, + "step": 297 + }, + { + "epoch": 2.6, + "learning_rate": 4.131736526946108e-05, + "loss": 1.6834, + "step": 300 + }, + { + "epoch": 2.67, + "eval_loss": 1.6993123292922974, + "eval_runtime": 22.7463, + "eval_samples_per_second": 87.926, + "eval_steps_per_second": 1.407, + "step": 308 + } + ], + "max_steps": 345, + "num_train_epochs": 3, + "total_flos": 2.3514605800296284e+18, + "trial_name": null, + "trial_params": null +} diff --git a/adapters/saved_llamaprosocial_dialog/checkpoint-308/training_args.bin b/adapters/saved_llamaprosocial_dialog/checkpoint-308/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..e1825c4b29d38c869d6c9c9df59d0fd2371fc9ad --- /dev/null +++ b/adapters/saved_llamaprosocial_dialog/checkpoint-308/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:147e8e99373e1b6d6dcda22258c683d1beff1e029d1f45adaed70feec69fdbb0 +size 3643 diff --git a/adapters/saved_llamaprosocial_dialog/checkpoint-319/optimizer.pt b/adapters/saved_llamaprosocial_dialog/checkpoint-319/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..ca8053a9669210b0499e877dfb0edb3cb5582b75 --- /dev/null +++ b/adapters/saved_llamaprosocial_dialog/checkpoint-319/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:44f5fb9079f5aa1076c567560fdf8bcd18988acb6763c671b31f6d8488a30e3f +size 33629893 diff --git a/adapters/saved_llamaprosocial_dialog/checkpoint-319/pytorch_model.bin b/adapters/saved_llamaprosocial_dialog/checkpoint-319/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..996d67a260cd6caa35e4aa428bceb6e01017b4b7 --- /dev/null +++ b/adapters/saved_llamaprosocial_dialog/checkpoint-319/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:555d370e7ab0e22c08e0d4923d3dc1fba8a54175fc7d74781ddfce90f6df2941 +size 16822989 diff --git a/adapters/saved_llamaprosocial_dialog/checkpoint-319/rng_state_0.pth b/adapters/saved_llamaprosocial_dialog/checkpoint-319/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..8fd2b309699cc5667582d487921faecda7d7f642 --- /dev/null +++ b/adapters/saved_llamaprosocial_dialog/checkpoint-319/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:51d419c8ce900f0a89c264250946e05cfe5417472baad2fd8d8463e20d2c899e +size 14583 diff --git a/adapters/saved_llamaprosocial_dialog/checkpoint-319/rng_state_1.pth b/adapters/saved_llamaprosocial_dialog/checkpoint-319/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..37c91831ed928e11c0278e98b01c49ce208b377f --- /dev/null +++ b/adapters/saved_llamaprosocial_dialog/checkpoint-319/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a8460e0231ca06bf7091c053f50fb8c7ce1593bd8d84e883ffd8bdacfbd2be01 +size 14583 diff --git a/adapters/saved_llamaprosocial_dialog/checkpoint-319/rng_state_2.pth b/adapters/saved_llamaprosocial_dialog/checkpoint-319/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..04f743ae893239a09cf20feda80efc764ef9f9ca --- /dev/null +++ b/adapters/saved_llamaprosocial_dialog/checkpoint-319/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:63da955d184f5d175bab34877d01a68aa2756868ae616c3ddfee8ee3d34e02e5 +size 14583 diff --git a/adapters/saved_llamaprosocial_dialog/checkpoint-319/rng_state_3.pth b/adapters/saved_llamaprosocial_dialog/checkpoint-319/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..42618327e373eb6460c5e7541631416120cf168c --- /dev/null +++ b/adapters/saved_llamaprosocial_dialog/checkpoint-319/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0bfe573e40496e15ce04e054268b8c04a9b4bf300eda4222a45e6bafee8aa46e +size 14583 diff --git a/adapters/saved_llamaprosocial_dialog/checkpoint-319/rng_state_4.pth b/adapters/saved_llamaprosocial_dialog/checkpoint-319/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..ce299cdbd2b84229bed80380f78995d00783a27d --- /dev/null +++ b/adapters/saved_llamaprosocial_dialog/checkpoint-319/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de07faa58fb26124513829406e190838b80f655d2ec4fa7b1e035b4e2bf5d2c9 +size 14583 diff --git a/adapters/saved_llamaprosocial_dialog/checkpoint-319/rng_state_5.pth b/adapters/saved_llamaprosocial_dialog/checkpoint-319/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..defdc91cd594df381638a550ea1f64941d27b413 --- /dev/null +++ b/adapters/saved_llamaprosocial_dialog/checkpoint-319/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4ab8cb3bf11389213bf0f0c99df12cfc4ab9ff6ad586f6c877f7b49e5cfd2dc7 +size 14583 diff --git a/adapters/saved_llamaprosocial_dialog/checkpoint-319/rng_state_6.pth b/adapters/saved_llamaprosocial_dialog/checkpoint-319/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..deb16640b44477826f047695d4507f77f13b346f --- /dev/null +++ b/adapters/saved_llamaprosocial_dialog/checkpoint-319/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4c9940baaee7e4a3d819041d60b342a6e78e0cc194eebc2563bdd5be714ab45b +size 14583 diff --git a/adapters/saved_llamaprosocial_dialog/checkpoint-319/rng_state_7.pth b/adapters/saved_llamaprosocial_dialog/checkpoint-319/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..e548abca7f2d87fcd461a63eb82343593eb67e38 --- /dev/null +++ b/adapters/saved_llamaprosocial_dialog/checkpoint-319/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:76e3f8d902c6ee804cbf041c802c034e65ba01219d444b9e287172bfecaa120e +size 14583 diff --git a/adapters/saved_llamaprosocial_dialog/checkpoint-319/scaler.pt b/adapters/saved_llamaprosocial_dialog/checkpoint-319/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..bf3e97413028706b62c7f40cc7adc24b6c63a945 --- /dev/null +++ b/adapters/saved_llamaprosocial_dialog/checkpoint-319/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e70db32a728050ad800f907c72a9972335564b8b8b5587ede679d356927a7788 +size 557 diff --git a/adapters/saved_llamaprosocial_dialog/checkpoint-319/scheduler.pt b/adapters/saved_llamaprosocial_dialog/checkpoint-319/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..72824d6434dbc0d1d47265b05e9c351107dd0c4a --- /dev/null +++ b/adapters/saved_llamaprosocial_dialog/checkpoint-319/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ea166748a9059d6e45f11023f3d66f95562a322a6a04a84ff4682a69e36e9919 +size 627 diff --git a/adapters/saved_llamaprosocial_dialog/checkpoint-319/trainer_state.json b/adapters/saved_llamaprosocial_dialog/checkpoint-319/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..74043dd20a126e194f44f9e137d1e86661d78db0 --- /dev/null +++ b/adapters/saved_llamaprosocial_dialog/checkpoint-319/trainer_state.json @@ -0,0 +1,338 @@ +{ + "best_metric": 1.6989519596099854, + "best_model_checkpoint": "/mnt/bn/qingyi-bn-lq/llama/saved_llamaprosocial_dialog/checkpoint-319", + "epoch": 2.762652232746955, + "global_step": 319, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.1, + "eval_loss": 1.9915313720703125, + "eval_runtime": 22.7483, + "eval_samples_per_second": 87.919, + "eval_steps_per_second": 1.407, + "step": 11 + }, + { + "epoch": 0.17, + "learning_rate": 0.000292814371257485, + "loss": 2.0439, + "step": 20 + }, + { + "epoch": 0.19, + "eval_loss": 1.8029985427856445, + "eval_runtime": 22.7395, + "eval_samples_per_second": 87.953, + "eval_steps_per_second": 1.407, + "step": 22 + }, + { + "epoch": 0.29, + "eval_loss": 1.7677525281906128, + "eval_runtime": 22.7246, + "eval_samples_per_second": 88.01, + "eval_steps_per_second": 1.408, + "step": 33 + }, + { + "epoch": 0.35, + "learning_rate": 0.00027485029940119756, + "loss": 1.7686, + "step": 40 + }, + { + "epoch": 0.38, + "eval_loss": 1.7513495683670044, + "eval_runtime": 22.772, + "eval_samples_per_second": 87.827, + "eval_steps_per_second": 1.405, + "step": 44 + }, + { + "epoch": 0.48, + "eval_loss": 1.7416645288467407, + "eval_runtime": 22.7234, + "eval_samples_per_second": 88.015, + "eval_steps_per_second": 1.408, + "step": 55 + }, + { + "epoch": 0.52, + "learning_rate": 0.00025688622754491017, + "loss": 1.737, + "step": 60 + }, + { + "epoch": 0.57, + "eval_loss": 1.734699010848999, + "eval_runtime": 22.7372, + "eval_samples_per_second": 87.962, + "eval_steps_per_second": 1.407, + "step": 66 + }, + { + "epoch": 0.67, + "eval_loss": 1.72977614402771, + "eval_runtime": 22.74, + "eval_samples_per_second": 87.951, + "eval_steps_per_second": 1.407, + "step": 77 + }, + { + "epoch": 0.69, + "learning_rate": 0.00023892215568862272, + "loss": 1.7289, + "step": 80 + }, + { + "epoch": 0.76, + "eval_loss": 1.725810170173645, + "eval_runtime": 22.7407, + "eval_samples_per_second": 87.948, + "eval_steps_per_second": 1.407, + "step": 88 + }, + { + "epoch": 0.86, + "eval_loss": 1.7222310304641724, + "eval_runtime": 22.733, + "eval_samples_per_second": 87.978, + "eval_steps_per_second": 1.408, + "step": 99 + }, + { + "epoch": 0.87, + "learning_rate": 0.00022095808383233533, + "loss": 1.7172, + "step": 100 + }, + { + "epoch": 0.95, + "eval_loss": 1.7189042568206787, + "eval_runtime": 22.7271, + "eval_samples_per_second": 88.001, + "eval_steps_per_second": 1.408, + "step": 110 + }, + { + "epoch": 1.04, + "learning_rate": 0.00020299401197604788, + "loss": 1.7127, + "step": 120 + }, + { + "epoch": 1.05, + "eval_loss": 1.7177015542984009, + "eval_runtime": 22.7303, + "eval_samples_per_second": 87.988, + "eval_steps_per_second": 1.408, + "step": 121 + }, + { + "epoch": 1.14, + "eval_loss": 1.7150254249572754, + "eval_runtime": 22.7281, + "eval_samples_per_second": 87.997, + "eval_steps_per_second": 1.408, + "step": 132 + }, + { + "epoch": 1.21, + "learning_rate": 0.00018502994011976046, + "loss": 1.7017, + "step": 140 + }, + { + "epoch": 1.24, + "eval_loss": 1.7136248350143433, + "eval_runtime": 22.732, + "eval_samples_per_second": 87.982, + "eval_steps_per_second": 1.408, + "step": 143 + }, + { + "epoch": 1.33, + "eval_loss": 1.7119090557098389, + "eval_runtime": 22.7069, + "eval_samples_per_second": 88.079, + "eval_steps_per_second": 1.409, + "step": 154 + }, + { + "epoch": 1.39, + "learning_rate": 0.00016706586826347302, + "loss": 1.7003, + "step": 160 + }, + { + "epoch": 1.43, + "eval_loss": 1.7102028131484985, + "eval_runtime": 22.9016, + "eval_samples_per_second": 87.33, + "eval_steps_per_second": 1.397, + "step": 165 + }, + { + "epoch": 1.52, + "eval_loss": 1.708575963973999, + "eval_runtime": 22.7175, + "eval_samples_per_second": 88.038, + "eval_steps_per_second": 1.409, + "step": 176 + }, + { + "epoch": 1.56, + "learning_rate": 0.0001491017964071856, + "loss": 1.7025, + "step": 180 + }, + { + "epoch": 1.62, + "eval_loss": 1.7076679468154907, + "eval_runtime": 22.792, + "eval_samples_per_second": 87.75, + "eval_steps_per_second": 1.404, + "step": 187 + }, + { + "epoch": 1.71, + "eval_loss": 1.7062067985534668, + "eval_runtime": 22.7054, + "eval_samples_per_second": 88.085, + "eval_steps_per_second": 1.409, + "step": 198 + }, + { + "epoch": 1.73, + "learning_rate": 0.0001311377245508982, + "loss": 1.6976, + "step": 200 + }, + { + "epoch": 1.81, + "eval_loss": 1.7049460411071777, + "eval_runtime": 22.765, + "eval_samples_per_second": 87.854, + "eval_steps_per_second": 1.406, + "step": 209 + }, + { + "epoch": 1.91, + "learning_rate": 0.00011317365269461076, + "loss": 1.6955, + "step": 220 + }, + { + "epoch": 1.91, + "eval_loss": 1.704172134399414, + "eval_runtime": 22.8464, + "eval_samples_per_second": 87.541, + "eval_steps_per_second": 1.401, + "step": 220 + }, + { + "epoch": 2.0, + "eval_loss": 1.7033213376998901, + "eval_runtime": 22.7378, + "eval_samples_per_second": 87.959, + "eval_steps_per_second": 1.407, + "step": 231 + }, + { + "epoch": 2.08, + "learning_rate": 9.520958083832335e-05, + "loss": 1.6893, + "step": 240 + }, + { + "epoch": 2.1, + "eval_loss": 1.7023844718933105, + "eval_runtime": 22.7421, + "eval_samples_per_second": 87.942, + "eval_steps_per_second": 1.407, + "step": 242 + }, + { + "epoch": 2.19, + "eval_loss": 1.7013720273971558, + "eval_runtime": 22.7212, + "eval_samples_per_second": 88.023, + "eval_steps_per_second": 1.408, + "step": 253 + }, + { + "epoch": 2.25, + "learning_rate": 7.724550898203592e-05, + "loss": 1.6853, + "step": 260 + }, + { + "epoch": 2.29, + "eval_loss": 1.7013109922409058, + "eval_runtime": 22.7306, + "eval_samples_per_second": 87.987, + "eval_steps_per_second": 1.408, + "step": 264 + }, + { + "epoch": 2.38, + "eval_loss": 1.7006632089614868, + "eval_runtime": 22.748, + "eval_samples_per_second": 87.92, + "eval_steps_per_second": 1.407, + "step": 275 + }, + { + "epoch": 2.42, + "learning_rate": 5.92814371257485e-05, + "loss": 1.6877, + "step": 280 + }, + { + "epoch": 2.48, + "eval_loss": 1.7002646923065186, + "eval_runtime": 22.7267, + "eval_samples_per_second": 88.002, + "eval_steps_per_second": 1.408, + "step": 286 + }, + { + "epoch": 2.57, + "eval_loss": 1.6997302770614624, + "eval_runtime": 22.7256, + "eval_samples_per_second": 88.006, + "eval_steps_per_second": 1.408, + "step": 297 + }, + { + "epoch": 2.6, + "learning_rate": 4.131736526946108e-05, + "loss": 1.6834, + "step": 300 + }, + { + "epoch": 2.67, + "eval_loss": 1.6993123292922974, + "eval_runtime": 22.7463, + "eval_samples_per_second": 87.926, + "eval_steps_per_second": 1.407, + "step": 308 + }, + { + "epoch": 2.76, + "eval_loss": 1.6989519596099854, + "eval_runtime": 22.7888, + "eval_samples_per_second": 87.762, + "eval_steps_per_second": 1.404, + "step": 319 + } + ], + "max_steps": 345, + "num_train_epochs": 3, + "total_flos": 2.4354170030040023e+18, + "trial_name": null, + "trial_params": null +} diff --git a/adapters/saved_llamaprosocial_dialog/checkpoint-319/training_args.bin b/adapters/saved_llamaprosocial_dialog/checkpoint-319/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..e1825c4b29d38c869d6c9c9df59d0fd2371fc9ad --- /dev/null +++ b/adapters/saved_llamaprosocial_dialog/checkpoint-319/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:147e8e99373e1b6d6dcda22258c683d1beff1e029d1f45adaed70feec69fdbb0 +size 3643 diff --git a/adapters/saved_llamaprosocial_dialog/checkpoint-330/optimizer.pt b/adapters/saved_llamaprosocial_dialog/checkpoint-330/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..92463a6281c03b4550d5c8c080dadc0aee665c4e --- /dev/null +++ b/adapters/saved_llamaprosocial_dialog/checkpoint-330/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1de80a2a0e1da596edc363db38d3dc87e86f8728f59f907f4d015a9634836db6 +size 33629893 diff --git a/adapters/saved_llamaprosocial_dialog/checkpoint-330/pytorch_model.bin b/adapters/saved_llamaprosocial_dialog/checkpoint-330/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..cca27eb811945c13568a3082b954220edee1b640 --- /dev/null +++ b/adapters/saved_llamaprosocial_dialog/checkpoint-330/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47608417d7d10aef61b23efdd92284f46ccc83300f69e9b4e1f1736d8eb35392 +size 16822989 diff --git a/adapters/saved_llamaprosocial_dialog/checkpoint-330/rng_state_0.pth b/adapters/saved_llamaprosocial_dialog/checkpoint-330/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..0657af0802d09c57459633da711b5366b1b12905 --- /dev/null +++ b/adapters/saved_llamaprosocial_dialog/checkpoint-330/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e647dde7aeebcbf6f032e7b45a6ea74ed0117ede890e259840f75724d4600ee6 +size 14583 diff --git a/adapters/saved_llamaprosocial_dialog/checkpoint-330/rng_state_1.pth b/adapters/saved_llamaprosocial_dialog/checkpoint-330/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..810e5dd566beedd8a5d8911a8415a0b9b322b5c8 --- /dev/null +++ b/adapters/saved_llamaprosocial_dialog/checkpoint-330/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ca84d48ebbc263a04a0a8d58f56baf3cacf3015a02bfe2270b80cca317dcd5f7 +size 14583 diff --git a/adapters/saved_llamaprosocial_dialog/checkpoint-330/rng_state_2.pth b/adapters/saved_llamaprosocial_dialog/checkpoint-330/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..ac8546628a7603ee18b3a6dd6f1ba190750388e2 --- /dev/null +++ b/adapters/saved_llamaprosocial_dialog/checkpoint-330/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:90bbf032da6f914bc3cc956582dd611c72d9ade4271b4ec70b99e3438106da33 +size 14583 diff --git a/adapters/saved_llamaprosocial_dialog/checkpoint-330/rng_state_3.pth b/adapters/saved_llamaprosocial_dialog/checkpoint-330/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..c04c4801642052c379de63d353b311e5ad936afe --- /dev/null +++ b/adapters/saved_llamaprosocial_dialog/checkpoint-330/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9ca5de25d4c09e34a8e6ce159235f094112a47bddcb28aa81e13d84a7f574d95 +size 14583 diff --git a/adapters/saved_llamaprosocial_dialog/checkpoint-330/rng_state_4.pth b/adapters/saved_llamaprosocial_dialog/checkpoint-330/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..1f8f8b16615498c539043823db3e82335e784b02 --- /dev/null +++ b/adapters/saved_llamaprosocial_dialog/checkpoint-330/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e2ddc91de193d3c8cd9caf4b828d55217f0eff5b582b6cf96aee31552169d6af +size 14583 diff --git a/adapters/saved_llamaprosocial_dialog/checkpoint-330/rng_state_5.pth b/adapters/saved_llamaprosocial_dialog/checkpoint-330/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..932d68ed63eb071ee497a3d445e8cbf871116596 --- /dev/null +++ b/adapters/saved_llamaprosocial_dialog/checkpoint-330/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f79966af527c21395576b5217ec160543c511163892bea4c12840f67a17c0f37 +size 14583 diff --git a/adapters/saved_llamaprosocial_dialog/checkpoint-330/rng_state_6.pth b/adapters/saved_llamaprosocial_dialog/checkpoint-330/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..9859bdc13420daa9e87b368a1a7696d19416e487 --- /dev/null +++ b/adapters/saved_llamaprosocial_dialog/checkpoint-330/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ee46d9bc101b8ebf81c1486424d856b576ef67746c03ebba74ee6bfd71060a30 +size 14583 diff --git a/adapters/saved_llamaprosocial_dialog/checkpoint-330/rng_state_7.pth b/adapters/saved_llamaprosocial_dialog/checkpoint-330/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..32388f5604d40991fac0b130a6bbd5425b711758 --- /dev/null +++ b/adapters/saved_llamaprosocial_dialog/checkpoint-330/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:249a17084d688511ede5cca8c665840f6a58c6454beff006f72551ff89e6a6e0 +size 14583 diff --git a/adapters/saved_llamaprosocial_dialog/checkpoint-330/scaler.pt b/adapters/saved_llamaprosocial_dialog/checkpoint-330/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..45015f8c3946c81584060f9d6e4b3e04806fa6b1 --- /dev/null +++ b/adapters/saved_llamaprosocial_dialog/checkpoint-330/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:11af06b4063670f6d7c0e8f678d4ee5ae5c16573c91f0087c76b587c0e9d1555 +size 557 diff --git a/adapters/saved_llamaprosocial_dialog/checkpoint-330/scheduler.pt b/adapters/saved_llamaprosocial_dialog/checkpoint-330/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..0160ae012714fe9e36769df826f111be8d44fba6 --- /dev/null +++ b/adapters/saved_llamaprosocial_dialog/checkpoint-330/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aaa94667a37c4b96044154a123f82bbb94d88c2495271c7d7d81f742b73caf99 +size 627 diff --git a/adapters/saved_llamaprosocial_dialog/checkpoint-330/trainer_state.json b/adapters/saved_llamaprosocial_dialog/checkpoint-330/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..1fad65579b3d0a551ab1a91a4a8a6c79a1726ec6 --- /dev/null +++ b/adapters/saved_llamaprosocial_dialog/checkpoint-330/trainer_state.json @@ -0,0 +1,352 @@ +{ + "best_metric": 1.698702335357666, + "best_model_checkpoint": "/mnt/bn/qingyi-bn-lq/llama/saved_llamaprosocial_dialog/checkpoint-330", + "epoch": 2.8579161028416777, + "global_step": 330, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.1, + "eval_loss": 1.9915313720703125, + "eval_runtime": 22.7483, + "eval_samples_per_second": 87.919, + "eval_steps_per_second": 1.407, + "step": 11 + }, + { + "epoch": 0.17, + "learning_rate": 0.000292814371257485, + "loss": 2.0439, + "step": 20 + }, + { + "epoch": 0.19, + "eval_loss": 1.8029985427856445, + "eval_runtime": 22.7395, + "eval_samples_per_second": 87.953, + "eval_steps_per_second": 1.407, + "step": 22 + }, + { + "epoch": 0.29, + "eval_loss": 1.7677525281906128, + "eval_runtime": 22.7246, + "eval_samples_per_second": 88.01, + "eval_steps_per_second": 1.408, + "step": 33 + }, + { + "epoch": 0.35, + "learning_rate": 0.00027485029940119756, + "loss": 1.7686, + "step": 40 + }, + { + "epoch": 0.38, + "eval_loss": 1.7513495683670044, + "eval_runtime": 22.772, + "eval_samples_per_second": 87.827, + "eval_steps_per_second": 1.405, + "step": 44 + }, + { + "epoch": 0.48, + "eval_loss": 1.7416645288467407, + "eval_runtime": 22.7234, + "eval_samples_per_second": 88.015, + "eval_steps_per_second": 1.408, + "step": 55 + }, + { + "epoch": 0.52, + "learning_rate": 0.00025688622754491017, + "loss": 1.737, + "step": 60 + }, + { + "epoch": 0.57, + "eval_loss": 1.734699010848999, + "eval_runtime": 22.7372, + "eval_samples_per_second": 87.962, + "eval_steps_per_second": 1.407, + "step": 66 + }, + { + "epoch": 0.67, + "eval_loss": 1.72977614402771, + "eval_runtime": 22.74, + "eval_samples_per_second": 87.951, + "eval_steps_per_second": 1.407, + "step": 77 + }, + { + "epoch": 0.69, + "learning_rate": 0.00023892215568862272, + "loss": 1.7289, + "step": 80 + }, + { + "epoch": 0.76, + "eval_loss": 1.725810170173645, + "eval_runtime": 22.7407, + "eval_samples_per_second": 87.948, + "eval_steps_per_second": 1.407, + "step": 88 + }, + { + "epoch": 0.86, + "eval_loss": 1.7222310304641724, + "eval_runtime": 22.733, + "eval_samples_per_second": 87.978, + "eval_steps_per_second": 1.408, + "step": 99 + }, + { + "epoch": 0.87, + "learning_rate": 0.00022095808383233533, + "loss": 1.7172, + "step": 100 + }, + { + "epoch": 0.95, + "eval_loss": 1.7189042568206787, + "eval_runtime": 22.7271, + "eval_samples_per_second": 88.001, + "eval_steps_per_second": 1.408, + "step": 110 + }, + { + "epoch": 1.04, + "learning_rate": 0.00020299401197604788, + "loss": 1.7127, + "step": 120 + }, + { + "epoch": 1.05, + "eval_loss": 1.7177015542984009, + "eval_runtime": 22.7303, + "eval_samples_per_second": 87.988, + "eval_steps_per_second": 1.408, + "step": 121 + }, + { + "epoch": 1.14, + "eval_loss": 1.7150254249572754, + "eval_runtime": 22.7281, + "eval_samples_per_second": 87.997, + "eval_steps_per_second": 1.408, + "step": 132 + }, + { + "epoch": 1.21, + "learning_rate": 0.00018502994011976046, + "loss": 1.7017, + "step": 140 + }, + { + "epoch": 1.24, + "eval_loss": 1.7136248350143433, + "eval_runtime": 22.732, + "eval_samples_per_second": 87.982, + "eval_steps_per_second": 1.408, + "step": 143 + }, + { + "epoch": 1.33, + "eval_loss": 1.7119090557098389, + "eval_runtime": 22.7069, + "eval_samples_per_second": 88.079, + "eval_steps_per_second": 1.409, + "step": 154 + }, + { + "epoch": 1.39, + "learning_rate": 0.00016706586826347302, + "loss": 1.7003, + "step": 160 + }, + { + "epoch": 1.43, + "eval_loss": 1.7102028131484985, + "eval_runtime": 22.9016, + "eval_samples_per_second": 87.33, + "eval_steps_per_second": 1.397, + "step": 165 + }, + { + "epoch": 1.52, + "eval_loss": 1.708575963973999, + "eval_runtime": 22.7175, + "eval_samples_per_second": 88.038, + "eval_steps_per_second": 1.409, + "step": 176 + }, + { + "epoch": 1.56, + "learning_rate": 0.0001491017964071856, + "loss": 1.7025, + "step": 180 + }, + { + "epoch": 1.62, + "eval_loss": 1.7076679468154907, + "eval_runtime": 22.792, + "eval_samples_per_second": 87.75, + "eval_steps_per_second": 1.404, + "step": 187 + }, + { + "epoch": 1.71, + "eval_loss": 1.7062067985534668, + "eval_runtime": 22.7054, + "eval_samples_per_second": 88.085, + "eval_steps_per_second": 1.409, + "step": 198 + }, + { + "epoch": 1.73, + "learning_rate": 0.0001311377245508982, + "loss": 1.6976, + "step": 200 + }, + { + "epoch": 1.81, + "eval_loss": 1.7049460411071777, + "eval_runtime": 22.765, + "eval_samples_per_second": 87.854, + "eval_steps_per_second": 1.406, + "step": 209 + }, + { + "epoch": 1.91, + "learning_rate": 0.00011317365269461076, + "loss": 1.6955, + "step": 220 + }, + { + "epoch": 1.91, + "eval_loss": 1.704172134399414, + "eval_runtime": 22.8464, + "eval_samples_per_second": 87.541, + "eval_steps_per_second": 1.401, + "step": 220 + }, + { + "epoch": 2.0, + "eval_loss": 1.7033213376998901, + "eval_runtime": 22.7378, + "eval_samples_per_second": 87.959, + "eval_steps_per_second": 1.407, + "step": 231 + }, + { + "epoch": 2.08, + "learning_rate": 9.520958083832335e-05, + "loss": 1.6893, + "step": 240 + }, + { + "epoch": 2.1, + "eval_loss": 1.7023844718933105, + "eval_runtime": 22.7421, + "eval_samples_per_second": 87.942, + "eval_steps_per_second": 1.407, + "step": 242 + }, + { + "epoch": 2.19, + "eval_loss": 1.7013720273971558, + "eval_runtime": 22.7212, + "eval_samples_per_second": 88.023, + "eval_steps_per_second": 1.408, + "step": 253 + }, + { + "epoch": 2.25, + "learning_rate": 7.724550898203592e-05, + "loss": 1.6853, + "step": 260 + }, + { + "epoch": 2.29, + "eval_loss": 1.7013109922409058, + "eval_runtime": 22.7306, + "eval_samples_per_second": 87.987, + "eval_steps_per_second": 1.408, + "step": 264 + }, + { + "epoch": 2.38, + "eval_loss": 1.7006632089614868, + "eval_runtime": 22.748, + "eval_samples_per_second": 87.92, + "eval_steps_per_second": 1.407, + "step": 275 + }, + { + "epoch": 2.42, + "learning_rate": 5.92814371257485e-05, + "loss": 1.6877, + "step": 280 + }, + { + "epoch": 2.48, + "eval_loss": 1.7002646923065186, + "eval_runtime": 22.7267, + "eval_samples_per_second": 88.002, + "eval_steps_per_second": 1.408, + "step": 286 + }, + { + "epoch": 2.57, + "eval_loss": 1.6997302770614624, + "eval_runtime": 22.7256, + "eval_samples_per_second": 88.006, + "eval_steps_per_second": 1.408, + "step": 297 + }, + { + "epoch": 2.6, + "learning_rate": 4.131736526946108e-05, + "loss": 1.6834, + "step": 300 + }, + { + "epoch": 2.67, + "eval_loss": 1.6993123292922974, + "eval_runtime": 22.7463, + "eval_samples_per_second": 87.926, + "eval_steps_per_second": 1.407, + "step": 308 + }, + { + "epoch": 2.76, + "eval_loss": 1.6989519596099854, + "eval_runtime": 22.7888, + "eval_samples_per_second": 87.762, + "eval_steps_per_second": 1.404, + "step": 319 + }, + { + "epoch": 2.77, + "learning_rate": 2.3353293413173652e-05, + "loss": 1.6854, + "step": 320 + }, + { + "epoch": 2.86, + "eval_loss": 1.698702335357666, + "eval_runtime": 22.724, + "eval_samples_per_second": 88.013, + "eval_steps_per_second": 1.408, + "step": 330 + } + ], + "max_steps": 345, + "num_train_epochs": 3, + "total_flos": 2.5200587588774134e+18, + "trial_name": null, + "trial_params": null +} diff --git a/adapters/saved_llamaprosocial_dialog/checkpoint-330/training_args.bin b/adapters/saved_llamaprosocial_dialog/checkpoint-330/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..e1825c4b29d38c869d6c9c9df59d0fd2371fc9ad --- /dev/null +++ b/adapters/saved_llamaprosocial_dialog/checkpoint-330/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:147e8e99373e1b6d6dcda22258c683d1beff1e029d1f45adaed70feec69fdbb0 +size 3643 diff --git a/adapters/saved_llamaprosocial_dialog/checkpoint-341/optimizer.pt b/adapters/saved_llamaprosocial_dialog/checkpoint-341/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..950b967c18d6b1ec486b4cd57873e538acd7bbf3 --- /dev/null +++ b/adapters/saved_llamaprosocial_dialog/checkpoint-341/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:070809f5d1b093706946e78224953efbc098924c516400b92793234fcf8c748a +size 33629893 diff --git a/adapters/saved_llamaprosocial_dialog/checkpoint-341/pytorch_model.bin b/adapters/saved_llamaprosocial_dialog/checkpoint-341/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..60c04a8b8e1dd43e363259e20141f88149f9bb48 --- /dev/null +++ b/adapters/saved_llamaprosocial_dialog/checkpoint-341/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8c65edc39e07975643dfc5fedb5e05030c3f73d8c58dccf3f4544551a654f359 +size 16822989 diff --git a/adapters/saved_llamaprosocial_dialog/checkpoint-341/rng_state_0.pth b/adapters/saved_llamaprosocial_dialog/checkpoint-341/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..b4ecf89c1a2df163ce88086bcb139c1e596451d1 --- /dev/null +++ b/adapters/saved_llamaprosocial_dialog/checkpoint-341/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:555d53a871583b96b0260deb5ec500b8788bb6273a6008a43f7ac6eb539d8380 +size 14583 diff --git a/adapters/saved_llamaprosocial_dialog/checkpoint-341/rng_state_1.pth b/adapters/saved_llamaprosocial_dialog/checkpoint-341/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..0dc4f5545678d0e96e63a87621a23df10725670c --- /dev/null +++ b/adapters/saved_llamaprosocial_dialog/checkpoint-341/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9448871f19bda4cac7d6514157d334ea35b7c4761931399c250b2d8c12825f22 +size 14583 diff --git a/adapters/saved_llamaprosocial_dialog/checkpoint-341/rng_state_2.pth b/adapters/saved_llamaprosocial_dialog/checkpoint-341/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..bdde4fe29e0488323a9ef67effd9cf2af73d49cf --- /dev/null +++ b/adapters/saved_llamaprosocial_dialog/checkpoint-341/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2b9407fcad9f81c04caa5dd88b27acad29161252184da4a76fe2758f28a8ccb0 +size 14583 diff --git a/adapters/saved_llamaprosocial_dialog/checkpoint-341/rng_state_3.pth b/adapters/saved_llamaprosocial_dialog/checkpoint-341/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..7740def04ddae27d79bae6b90004faafd635ab1d --- /dev/null +++ b/adapters/saved_llamaprosocial_dialog/checkpoint-341/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:669819bce38afe78cd72b57a99a1aee7a894ddd6fad7250d569ed63df704371b +size 14583 diff --git a/adapters/saved_llamaprosocial_dialog/checkpoint-341/rng_state_4.pth b/adapters/saved_llamaprosocial_dialog/checkpoint-341/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..7af66a542ef8be89023e40fdf7740ed01d13deaa --- /dev/null +++ b/adapters/saved_llamaprosocial_dialog/checkpoint-341/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e1042f29e0eed409b63ee0819827cd0bc8d9cad30c617158fa75264bfb0936d3 +size 14583 diff --git a/adapters/saved_llamaprosocial_dialog/checkpoint-341/rng_state_5.pth b/adapters/saved_llamaprosocial_dialog/checkpoint-341/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..c070c6e17d5b28a8078750342980c4fc7abdeeb2 --- /dev/null +++ b/adapters/saved_llamaprosocial_dialog/checkpoint-341/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:35fac88c3da4af4093d1e3ce16e2bafae72f68124bf95cf051149929c87b3459 +size 14583 diff --git a/adapters/saved_llamaprosocial_dialog/checkpoint-341/rng_state_6.pth b/adapters/saved_llamaprosocial_dialog/checkpoint-341/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..0d3f0b5e1d02459f7e56a16b30c2a2e57953a259 --- /dev/null +++ b/adapters/saved_llamaprosocial_dialog/checkpoint-341/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ea5abfc8f88e64a43e31d8ce30bf60b1b14156eda93de89e53a002ea34e41cb0 +size 14583 diff --git a/adapters/saved_llamaprosocial_dialog/checkpoint-341/rng_state_7.pth b/adapters/saved_llamaprosocial_dialog/checkpoint-341/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..69514bdd4482a0323046f996bd0e5f28b13b4086 --- /dev/null +++ b/adapters/saved_llamaprosocial_dialog/checkpoint-341/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6a886c33a87f168ccbffb448d827c214c443cd85048622a81817a27e54da3b74 +size 14583 diff --git a/adapters/saved_llamaprosocial_dialog/checkpoint-341/scaler.pt b/adapters/saved_llamaprosocial_dialog/checkpoint-341/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..bde2d342d09d51f33afaa1336c26dd64c70feae2 --- /dev/null +++ b/adapters/saved_llamaprosocial_dialog/checkpoint-341/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:28cd2d0839106464c865c7fb24bb7aac6f687110d99e16c4743c2fa8254935a2 +size 557 diff --git a/adapters/saved_llamaprosocial_dialog/checkpoint-341/scheduler.pt b/adapters/saved_llamaprosocial_dialog/checkpoint-341/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..90ed4a4c86483a1700cb3ed0f895a268b58e793e --- /dev/null +++ b/adapters/saved_llamaprosocial_dialog/checkpoint-341/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a53b1230b5d78d7a8d1994c091e516c8a8704b69959dfd0e177690c89bfcb658 +size 627 diff --git a/adapters/saved_llamaprosocial_dialog/checkpoint-341/trainer_state.json b/adapters/saved_llamaprosocial_dialog/checkpoint-341/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..6e6662dd5e8302984ae2750a0d848a4b9eb041ae --- /dev/null +++ b/adapters/saved_llamaprosocial_dialog/checkpoint-341/trainer_state.json @@ -0,0 +1,366 @@ +{ + "best_metric": 1.6984472274780273, + "best_model_checkpoint": "/mnt/bn/qingyi-bn-lq/llama/saved_llamaprosocial_dialog/checkpoint-341", + "epoch": 2.9531799729364003, + "global_step": 341, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.1, + "eval_loss": 1.9915313720703125, + "eval_runtime": 22.7483, + "eval_samples_per_second": 87.919, + "eval_steps_per_second": 1.407, + "step": 11 + }, + { + "epoch": 0.17, + "learning_rate": 0.000292814371257485, + "loss": 2.0439, + "step": 20 + }, + { + "epoch": 0.19, + "eval_loss": 1.8029985427856445, + "eval_runtime": 22.7395, + "eval_samples_per_second": 87.953, + "eval_steps_per_second": 1.407, + "step": 22 + }, + { + "epoch": 0.29, + "eval_loss": 1.7677525281906128, + "eval_runtime": 22.7246, + "eval_samples_per_second": 88.01, + "eval_steps_per_second": 1.408, + "step": 33 + }, + { + "epoch": 0.35, + "learning_rate": 0.00027485029940119756, + "loss": 1.7686, + "step": 40 + }, + { + "epoch": 0.38, + "eval_loss": 1.7513495683670044, + "eval_runtime": 22.772, + "eval_samples_per_second": 87.827, + "eval_steps_per_second": 1.405, + "step": 44 + }, + { + "epoch": 0.48, + "eval_loss": 1.7416645288467407, + "eval_runtime": 22.7234, + "eval_samples_per_second": 88.015, + "eval_steps_per_second": 1.408, + "step": 55 + }, + { + "epoch": 0.52, + "learning_rate": 0.00025688622754491017, + "loss": 1.737, + "step": 60 + }, + { + "epoch": 0.57, + "eval_loss": 1.734699010848999, + "eval_runtime": 22.7372, + "eval_samples_per_second": 87.962, + "eval_steps_per_second": 1.407, + "step": 66 + }, + { + "epoch": 0.67, + "eval_loss": 1.72977614402771, + "eval_runtime": 22.74, + "eval_samples_per_second": 87.951, + "eval_steps_per_second": 1.407, + "step": 77 + }, + { + "epoch": 0.69, + "learning_rate": 0.00023892215568862272, + "loss": 1.7289, + "step": 80 + }, + { + "epoch": 0.76, + "eval_loss": 1.725810170173645, + "eval_runtime": 22.7407, + "eval_samples_per_second": 87.948, + "eval_steps_per_second": 1.407, + "step": 88 + }, + { + "epoch": 0.86, + "eval_loss": 1.7222310304641724, + "eval_runtime": 22.733, + "eval_samples_per_second": 87.978, + "eval_steps_per_second": 1.408, + "step": 99 + }, + { + "epoch": 0.87, + "learning_rate": 0.00022095808383233533, + "loss": 1.7172, + "step": 100 + }, + { + "epoch": 0.95, + "eval_loss": 1.7189042568206787, + "eval_runtime": 22.7271, + "eval_samples_per_second": 88.001, + "eval_steps_per_second": 1.408, + "step": 110 + }, + { + "epoch": 1.04, + "learning_rate": 0.00020299401197604788, + "loss": 1.7127, + "step": 120 + }, + { + "epoch": 1.05, + "eval_loss": 1.7177015542984009, + "eval_runtime": 22.7303, + "eval_samples_per_second": 87.988, + "eval_steps_per_second": 1.408, + "step": 121 + }, + { + "epoch": 1.14, + "eval_loss": 1.7150254249572754, + "eval_runtime": 22.7281, + "eval_samples_per_second": 87.997, + "eval_steps_per_second": 1.408, + "step": 132 + }, + { + "epoch": 1.21, + "learning_rate": 0.00018502994011976046, + "loss": 1.7017, + "step": 140 + }, + { + "epoch": 1.24, + "eval_loss": 1.7136248350143433, + "eval_runtime": 22.732, + "eval_samples_per_second": 87.982, + "eval_steps_per_second": 1.408, + "step": 143 + }, + { + "epoch": 1.33, + "eval_loss": 1.7119090557098389, + "eval_runtime": 22.7069, + "eval_samples_per_second": 88.079, + "eval_steps_per_second": 1.409, + "step": 154 + }, + { + "epoch": 1.39, + "learning_rate": 0.00016706586826347302, + "loss": 1.7003, + "step": 160 + }, + { + "epoch": 1.43, + "eval_loss": 1.7102028131484985, + "eval_runtime": 22.9016, + "eval_samples_per_second": 87.33, + "eval_steps_per_second": 1.397, + "step": 165 + }, + { + "epoch": 1.52, + "eval_loss": 1.708575963973999, + "eval_runtime": 22.7175, + "eval_samples_per_second": 88.038, + "eval_steps_per_second": 1.409, + "step": 176 + }, + { + "epoch": 1.56, + "learning_rate": 0.0001491017964071856, + "loss": 1.7025, + "step": 180 + }, + { + "epoch": 1.62, + "eval_loss": 1.7076679468154907, + "eval_runtime": 22.792, + "eval_samples_per_second": 87.75, + "eval_steps_per_second": 1.404, + "step": 187 + }, + { + "epoch": 1.71, + "eval_loss": 1.7062067985534668, + "eval_runtime": 22.7054, + "eval_samples_per_second": 88.085, + "eval_steps_per_second": 1.409, + "step": 198 + }, + { + "epoch": 1.73, + "learning_rate": 0.0001311377245508982, + "loss": 1.6976, + "step": 200 + }, + { + "epoch": 1.81, + "eval_loss": 1.7049460411071777, + "eval_runtime": 22.765, + "eval_samples_per_second": 87.854, + "eval_steps_per_second": 1.406, + "step": 209 + }, + { + "epoch": 1.91, + "learning_rate": 0.00011317365269461076, + "loss": 1.6955, + "step": 220 + }, + { + "epoch": 1.91, + "eval_loss": 1.704172134399414, + "eval_runtime": 22.8464, + "eval_samples_per_second": 87.541, + "eval_steps_per_second": 1.401, + "step": 220 + }, + { + "epoch": 2.0, + "eval_loss": 1.7033213376998901, + "eval_runtime": 22.7378, + "eval_samples_per_second": 87.959, + "eval_steps_per_second": 1.407, + "step": 231 + }, + { + "epoch": 2.08, + "learning_rate": 9.520958083832335e-05, + "loss": 1.6893, + "step": 240 + }, + { + "epoch": 2.1, + "eval_loss": 1.7023844718933105, + "eval_runtime": 22.7421, + "eval_samples_per_second": 87.942, + "eval_steps_per_second": 1.407, + "step": 242 + }, + { + "epoch": 2.19, + "eval_loss": 1.7013720273971558, + "eval_runtime": 22.7212, + "eval_samples_per_second": 88.023, + "eval_steps_per_second": 1.408, + "step": 253 + }, + { + "epoch": 2.25, + "learning_rate": 7.724550898203592e-05, + "loss": 1.6853, + "step": 260 + }, + { + "epoch": 2.29, + "eval_loss": 1.7013109922409058, + "eval_runtime": 22.7306, + "eval_samples_per_second": 87.987, + "eval_steps_per_second": 1.408, + "step": 264 + }, + { + "epoch": 2.38, + "eval_loss": 1.7006632089614868, + "eval_runtime": 22.748, + "eval_samples_per_second": 87.92, + "eval_steps_per_second": 1.407, + "step": 275 + }, + { + "epoch": 2.42, + "learning_rate": 5.92814371257485e-05, + "loss": 1.6877, + "step": 280 + }, + { + "epoch": 2.48, + "eval_loss": 1.7002646923065186, + "eval_runtime": 22.7267, + "eval_samples_per_second": 88.002, + "eval_steps_per_second": 1.408, + "step": 286 + }, + { + "epoch": 2.57, + "eval_loss": 1.6997302770614624, + "eval_runtime": 22.7256, + "eval_samples_per_second": 88.006, + "eval_steps_per_second": 1.408, + "step": 297 + }, + { + "epoch": 2.6, + "learning_rate": 4.131736526946108e-05, + "loss": 1.6834, + "step": 300 + }, + { + "epoch": 2.67, + "eval_loss": 1.6993123292922974, + "eval_runtime": 22.7463, + "eval_samples_per_second": 87.926, + "eval_steps_per_second": 1.407, + "step": 308 + }, + { + "epoch": 2.76, + "eval_loss": 1.6989519596099854, + "eval_runtime": 22.7888, + "eval_samples_per_second": 87.762, + "eval_steps_per_second": 1.404, + "step": 319 + }, + { + "epoch": 2.77, + "learning_rate": 2.3353293413173652e-05, + "loss": 1.6854, + "step": 320 + }, + { + "epoch": 2.86, + "eval_loss": 1.698702335357666, + "eval_runtime": 22.724, + "eval_samples_per_second": 88.013, + "eval_steps_per_second": 1.408, + "step": 330 + }, + { + "epoch": 2.94, + "learning_rate": 5.389221556886227e-06, + "loss": 1.6858, + "step": 340 + }, + { + "epoch": 2.95, + "eval_loss": 1.6984472274780273, + "eval_runtime": 22.7255, + "eval_samples_per_second": 88.007, + "eval_steps_per_second": 1.408, + "step": 341 + } + ], + "max_steps": 345, + "num_train_epochs": 3, + "total_flos": 2.604629423378399e+18, + "trial_name": null, + "trial_params": null +} diff --git a/adapters/saved_llamaprosocial_dialog/checkpoint-341/training_args.bin b/adapters/saved_llamaprosocial_dialog/checkpoint-341/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..e1825c4b29d38c869d6c9c9df59d0fd2371fc9ad --- /dev/null +++ b/adapters/saved_llamaprosocial_dialog/checkpoint-341/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:147e8e99373e1b6d6dcda22258c683d1beff1e029d1f45adaed70feec69fdbb0 +size 3643