diff --git a/adapter_config.json b/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..64462f3c67ba3462e4c33d3d5440f79313b44c1b --- /dev/null +++ b/adapter_config.json @@ -0,0 +1,17 @@ +{ + "base_model_name_or_path": "decapoda-research/llama-7b-hf", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "lora_alpha": 32, + "lora_dropout": 0.1, + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/adapter_model.bin b/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..8b4c765dccc4b62a3ac0029ecb54c45ec4c3c7e8 --- /dev/null +++ b/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9c4b955087f09aace7a5d755a95a6792cb0db82f4a084b97b2efb972c9f4c0af +size 16822989 diff --git a/checkpoint-1000/optimizer.pt b/checkpoint-1000/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..0383c4bf55e5f6b79b064f30f69105e0483f156f --- /dev/null +++ b/checkpoint-1000/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:979ac3bc6cbe88f0a465b45aa635de5ec1b976163ed6db6d50f7c7d439253edd +size 33629893 diff --git a/checkpoint-1000/pytorch_model.bin b/checkpoint-1000/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..ef7a47c136c820fac470b3ca78d8dc14e601ba7d --- /dev/null +++ b/checkpoint-1000/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8234c8c2a5f460f54a4623c898674414ebe417073721e170c173183c3e51cd6d +size 7548185429 diff --git a/checkpoint-1000/rng_state.pth b/checkpoint-1000/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..94adbb40d9165a12bdc452621b37e0305b520ceb --- /dev/null +++ b/checkpoint-1000/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f9da99b2666c082cb0531c92fbc81ab3d3e8224b5a766766fe7f009db89afea6 +size 14575 diff --git a/checkpoint-1000/scaler.pt b/checkpoint-1000/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..4e7318ce3a281637fd3910c197576fec8440a6a2 --- /dev/null +++ b/checkpoint-1000/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:616f782e508bf93abe9de82fb1a8777069847068afdada0050c6f94df6e0661f +size 557 diff --git a/checkpoint-1000/scheduler.pt b/checkpoint-1000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..0f93a0feb79979406a2e1e89557cd33569aee275 --- /dev/null +++ b/checkpoint-1000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:428f2f0ea9063d04e8798a20929f1746aee4194634d5af1d36fa2ea04d562359 +size 627 diff --git a/checkpoint-1000/trainer_state.json b/checkpoint-1000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..3a450df09e1697966d993d68715417b3532c4f6d --- /dev/null +++ b/checkpoint-1000/trainer_state.json @@ -0,0 +1,316 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 7.774538386783285, + "global_step": 1000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.16, + "learning_rate": 1.9912109375000002e-05, + "loss": 1.9656, + "step": 20 + }, + { + "epoch": 0.31, + "learning_rate": 1.98193359375e-05, + "loss": 1.8445, + "step": 40 + }, + { + "epoch": 0.47, + "learning_rate": 1.97216796875e-05, + "loss": 1.59, + "step": 60 + }, + { + "epoch": 0.62, + "learning_rate": 1.9624023437500002e-05, + "loss": 1.6275, + "step": 80 + }, + { + "epoch": 0.78, + "learning_rate": 1.9526367187500002e-05, + "loss": 1.473, + "step": 100 + }, + { + "epoch": 0.93, + "learning_rate": 1.9428710937500003e-05, + "loss": 1.3701, + "step": 120 + }, + { + "epoch": 1.09, + "learning_rate": 1.93310546875e-05, + "loss": 1.3224, + "step": 140 + }, + { + "epoch": 1.24, + "learning_rate": 1.92333984375e-05, + "loss": 1.1423, + "step": 160 + }, + { + "epoch": 1.4, + "learning_rate": 1.91357421875e-05, + "loss": 1.1652, + "step": 180 + }, + { + "epoch": 1.55, + "learning_rate": 1.9038085937500002e-05, + "loss": 1.1422, + "step": 200 + }, + { + "epoch": 1.71, + "learning_rate": 1.8940429687500002e-05, + "loss": 1.002, + "step": 220 + }, + { + "epoch": 1.87, + "learning_rate": 1.8842773437500003e-05, + "loss": 1.0779, + "step": 240 + }, + { + "epoch": 2.02, + "learning_rate": 1.87451171875e-05, + "loss": 0.9887, + "step": 260 + }, + { + "epoch": 2.18, + "learning_rate": 1.86474609375e-05, + "loss": 0.9543, + "step": 280 + }, + { + "epoch": 2.33, + "learning_rate": 1.85498046875e-05, + "loss": 0.9371, + "step": 300 + }, + { + "epoch": 2.49, + "learning_rate": 1.8452148437500002e-05, + "loss": 0.8701, + "step": 320 + }, + { + "epoch": 2.64, + "learning_rate": 1.8354492187500003e-05, + "loss": 0.875, + "step": 340 + }, + { + "epoch": 2.8, + "learning_rate": 1.82568359375e-05, + "loss": 0.7843, + "step": 360 + }, + { + "epoch": 2.95, + "learning_rate": 1.81591796875e-05, + "loss": 0.7945, + "step": 380 + }, + { + "epoch": 3.11, + "learning_rate": 1.80615234375e-05, + "loss": 0.799, + "step": 400 + }, + { + "epoch": 3.27, + "learning_rate": 1.79638671875e-05, + "loss": 0.7623, + "step": 420 + }, + { + "epoch": 3.42, + "learning_rate": 1.7866210937500002e-05, + "loss": 0.7263, + "step": 440 + }, + { + "epoch": 3.58, + "learning_rate": 1.7768554687500003e-05, + "loss": 0.7779, + "step": 460 + }, + { + "epoch": 3.73, + "learning_rate": 1.76708984375e-05, + "loss": 0.695, + "step": 480 + }, + { + "epoch": 3.89, + "learning_rate": 1.75732421875e-05, + "loss": 0.7344, + "step": 500 + }, + { + "epoch": 4.04, + "learning_rate": 1.74755859375e-05, + "loss": 0.695, + "step": 520 + }, + { + "epoch": 4.2, + "learning_rate": 1.7377929687500002e-05, + "loss": 0.6504, + "step": 540 + }, + { + "epoch": 4.35, + "learning_rate": 1.7280273437500002e-05, + "loss": 0.6447, + "step": 560 + }, + { + "epoch": 4.51, + "learning_rate": 1.7182617187500003e-05, + "loss": 0.6931, + "step": 580 + }, + { + "epoch": 4.66, + "learning_rate": 1.70849609375e-05, + "loss": 0.6256, + "step": 600 + }, + { + "epoch": 4.82, + "learning_rate": 1.69873046875e-05, + "loss": 0.6132, + "step": 620 + }, + { + "epoch": 4.98, + "learning_rate": 1.68896484375e-05, + "loss": 0.6001, + "step": 640 + }, + { + "epoch": 5.13, + "learning_rate": 1.6791992187500002e-05, + "loss": 0.6176, + "step": 660 + }, + { + "epoch": 5.29, + "learning_rate": 1.6694335937500002e-05, + "loss": 0.5709, + "step": 680 + }, + { + "epoch": 5.44, + "learning_rate": 1.65966796875e-05, + "loss": 0.564, + "step": 700 + }, + { + "epoch": 5.6, + "learning_rate": 1.64990234375e-05, + "loss": 0.5969, + "step": 720 + }, + { + "epoch": 5.75, + "learning_rate": 1.64013671875e-05, + "loss": 0.5484, + "step": 740 + }, + { + "epoch": 5.91, + "learning_rate": 1.63037109375e-05, + "loss": 0.5667, + "step": 760 + }, + { + "epoch": 6.06, + "learning_rate": 1.6206054687500002e-05, + "loss": 0.5442, + "step": 780 + }, + { + "epoch": 6.22, + "learning_rate": 1.6108398437500003e-05, + "loss": 0.4857, + "step": 800 + }, + { + "epoch": 6.38, + "learning_rate": 1.60107421875e-05, + "loss": 0.5225, + "step": 820 + }, + { + "epoch": 6.53, + "learning_rate": 1.59130859375e-05, + "loss": 0.5457, + "step": 840 + }, + { + "epoch": 6.69, + "learning_rate": 1.58154296875e-05, + "loss": 0.5315, + "step": 860 + }, + { + "epoch": 6.84, + "learning_rate": 1.57177734375e-05, + "loss": 0.5345, + "step": 880 + }, + { + "epoch": 7.0, + "learning_rate": 1.5620117187500002e-05, + "loss": 0.5169, + "step": 900 + }, + { + "epoch": 7.15, + "learning_rate": 1.5522460937500003e-05, + "loss": 0.5115, + "step": 920 + }, + { + "epoch": 7.31, + "learning_rate": 1.54248046875e-05, + "loss": 0.4913, + "step": 940 + }, + { + "epoch": 7.46, + "learning_rate": 1.53271484375e-05, + "loss": 0.4868, + "step": 960 + }, + { + "epoch": 7.62, + "learning_rate": 1.5229492187500001e-05, + "loss": 0.5226, + "step": 980 + }, + { + "epoch": 7.77, + "learning_rate": 1.5131835937500002e-05, + "loss": 0.4061, + "step": 1000 + } + ], + "max_steps": 4096, + "num_train_epochs": 32, + "total_flos": 1.2995638935552e+18, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-1000/training_args.bin b/checkpoint-1000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..e2b3bee7f57e1aa04d4410b1e443601a704abcd7 --- /dev/null +++ b/checkpoint-1000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:19b61f1babbbf314354bf942366de8dc63aa97ebdccb27573b76080ab5b50935 +size 3963 diff --git a/checkpoint-1500/optimizer.pt b/checkpoint-1500/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..0f8945ff09fb06aa247da4e184f40b3b9d4bbace --- /dev/null +++ b/checkpoint-1500/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e7be759d8e3c9449c68b4477fc8e78894f12eabe3ed4d3408bfdf307b7212790 +size 33629893 diff --git a/checkpoint-1500/pytorch_model.bin b/checkpoint-1500/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..8bda2cbba557bf8aa1997f476e9de6bb18429025 --- /dev/null +++ b/checkpoint-1500/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d589a9f67ecd0e153fc208ba616150a644e7bd4d2a0266aca2d392beaadd9ad9 +size 7548185429 diff --git a/checkpoint-1500/rng_state.pth b/checkpoint-1500/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..f61d9ab88cffeea37fa60ab2ef41e3d95856d116 --- /dev/null +++ b/checkpoint-1500/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eff331b9eab2cd38e6a0fadd8adffff1a8b68b2fc0667f5221befa41e70a8b55 +size 14575 diff --git a/checkpoint-1500/scaler.pt b/checkpoint-1500/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..ae3cb34a9e6f9ecf425c3fe111264d2a9824ce96 --- /dev/null +++ b/checkpoint-1500/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:356b72cd9ae87e2bf8febf01c2dbebe9ae4ee48b6e3610acb7c32587cc971085 +size 557 diff --git a/checkpoint-1500/scheduler.pt b/checkpoint-1500/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..732d188cabb5da74ffc52c1afa70cd6c72d2886e --- /dev/null +++ b/checkpoint-1500/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:764d4d2c60f25d225c9497dc7347e571e56aa3fdf9303049b653f92a6d91c06c +size 627 diff --git a/checkpoint-1500/trainer_state.json b/checkpoint-1500/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..b000ab8b9e2b0969d3b595edf46d1b637d6b5e9c --- /dev/null +++ b/checkpoint-1500/trainer_state.json @@ -0,0 +1,466 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 11.661807580174926, + "global_step": 1500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.16, + "learning_rate": 1.9912109375000002e-05, + "loss": 1.9656, + "step": 20 + }, + { + "epoch": 0.31, + "learning_rate": 1.98193359375e-05, + "loss": 1.8445, + "step": 40 + }, + { + "epoch": 0.47, + "learning_rate": 1.97216796875e-05, + "loss": 1.59, + "step": 60 + }, + { + "epoch": 0.62, + "learning_rate": 1.9624023437500002e-05, + "loss": 1.6275, + "step": 80 + }, + { + "epoch": 0.78, + "learning_rate": 1.9526367187500002e-05, + "loss": 1.473, + "step": 100 + }, + { + "epoch": 0.93, + "learning_rate": 1.9428710937500003e-05, + "loss": 1.3701, + "step": 120 + }, + { + "epoch": 1.09, + "learning_rate": 1.93310546875e-05, + "loss": 1.3224, + "step": 140 + }, + { + "epoch": 1.24, + "learning_rate": 1.92333984375e-05, + "loss": 1.1423, + "step": 160 + }, + { + "epoch": 1.4, + "learning_rate": 1.91357421875e-05, + "loss": 1.1652, + "step": 180 + }, + { + "epoch": 1.55, + "learning_rate": 1.9038085937500002e-05, + "loss": 1.1422, + "step": 200 + }, + { + "epoch": 1.71, + "learning_rate": 1.8940429687500002e-05, + "loss": 1.002, + "step": 220 + }, + { + "epoch": 1.87, + "learning_rate": 1.8842773437500003e-05, + "loss": 1.0779, + "step": 240 + }, + { + "epoch": 2.02, + "learning_rate": 1.87451171875e-05, + "loss": 0.9887, + "step": 260 + }, + { + "epoch": 2.18, + "learning_rate": 1.86474609375e-05, + "loss": 0.9543, + "step": 280 + }, + { + "epoch": 2.33, + "learning_rate": 1.85498046875e-05, + "loss": 0.9371, + "step": 300 + }, + { + "epoch": 2.49, + "learning_rate": 1.8452148437500002e-05, + "loss": 0.8701, + "step": 320 + }, + { + "epoch": 2.64, + "learning_rate": 1.8354492187500003e-05, + "loss": 0.875, + "step": 340 + }, + { + "epoch": 2.8, + "learning_rate": 1.82568359375e-05, + "loss": 0.7843, + "step": 360 + }, + { + "epoch": 2.95, + "learning_rate": 1.81591796875e-05, + "loss": 0.7945, + "step": 380 + }, + { + "epoch": 3.11, + "learning_rate": 1.80615234375e-05, + "loss": 0.799, + "step": 400 + }, + { + "epoch": 3.27, + "learning_rate": 1.79638671875e-05, + "loss": 0.7623, + "step": 420 + }, + { + "epoch": 3.42, + "learning_rate": 1.7866210937500002e-05, + "loss": 0.7263, + "step": 440 + }, + { + "epoch": 3.58, + "learning_rate": 1.7768554687500003e-05, + "loss": 0.7779, + "step": 460 + }, + { + "epoch": 3.73, + "learning_rate": 1.76708984375e-05, + "loss": 0.695, + "step": 480 + }, + { + "epoch": 3.89, + "learning_rate": 1.75732421875e-05, + "loss": 0.7344, + "step": 500 + }, + { + "epoch": 4.04, + "learning_rate": 1.74755859375e-05, + "loss": 0.695, + "step": 520 + }, + { + "epoch": 4.2, + "learning_rate": 1.7377929687500002e-05, + "loss": 0.6504, + "step": 540 + }, + { + "epoch": 4.35, + "learning_rate": 1.7280273437500002e-05, + "loss": 0.6447, + "step": 560 + }, + { + "epoch": 4.51, + "learning_rate": 1.7182617187500003e-05, + "loss": 0.6931, + "step": 580 + }, + { + "epoch": 4.66, + "learning_rate": 1.70849609375e-05, + "loss": 0.6256, + "step": 600 + }, + { + "epoch": 4.82, + "learning_rate": 1.69873046875e-05, + "loss": 0.6132, + "step": 620 + }, + { + "epoch": 4.98, + "learning_rate": 1.68896484375e-05, + "loss": 0.6001, + "step": 640 + }, + { + "epoch": 5.13, + "learning_rate": 1.6791992187500002e-05, + "loss": 0.6176, + "step": 660 + }, + { + "epoch": 5.29, + "learning_rate": 1.6694335937500002e-05, + "loss": 0.5709, + "step": 680 + }, + { + "epoch": 5.44, + "learning_rate": 1.65966796875e-05, + "loss": 0.564, + "step": 700 + }, + { + "epoch": 5.6, + "learning_rate": 1.64990234375e-05, + "loss": 0.5969, + "step": 720 + }, + { + "epoch": 5.75, + "learning_rate": 1.64013671875e-05, + "loss": 0.5484, + "step": 740 + }, + { + "epoch": 5.91, + "learning_rate": 1.63037109375e-05, + "loss": 0.5667, + "step": 760 + }, + { + "epoch": 6.06, + "learning_rate": 1.6206054687500002e-05, + "loss": 0.5442, + "step": 780 + }, + { + "epoch": 6.22, + "learning_rate": 1.6108398437500003e-05, + "loss": 0.4857, + "step": 800 + }, + { + "epoch": 6.38, + "learning_rate": 1.60107421875e-05, + "loss": 0.5225, + "step": 820 + }, + { + "epoch": 6.53, + "learning_rate": 1.59130859375e-05, + "loss": 0.5457, + "step": 840 + }, + { + "epoch": 6.69, + "learning_rate": 1.58154296875e-05, + "loss": 0.5315, + "step": 860 + }, + { + "epoch": 6.84, + "learning_rate": 1.57177734375e-05, + "loss": 0.5345, + "step": 880 + }, + { + "epoch": 7.0, + "learning_rate": 1.5620117187500002e-05, + "loss": 0.5169, + "step": 900 + }, + { + "epoch": 7.15, + "learning_rate": 1.5522460937500003e-05, + "loss": 0.5115, + "step": 920 + }, + { + "epoch": 7.31, + "learning_rate": 1.54248046875e-05, + "loss": 0.4913, + "step": 940 + }, + { + "epoch": 7.46, + "learning_rate": 1.53271484375e-05, + "loss": 0.4868, + "step": 960 + }, + { + "epoch": 7.62, + "learning_rate": 1.5229492187500001e-05, + "loss": 0.5226, + "step": 980 + }, + { + "epoch": 7.77, + "learning_rate": 1.5131835937500002e-05, + "loss": 0.4061, + "step": 1000 + }, + { + "epoch": 7.93, + "learning_rate": 1.50341796875e-05, + "loss": 0.4488, + "step": 1020 + }, + { + "epoch": 8.09, + "learning_rate": 1.4936523437500001e-05, + "loss": 0.4443, + "step": 1040 + }, + { + "epoch": 8.24, + "learning_rate": 1.4838867187500002e-05, + "loss": 0.4646, + "step": 1060 + }, + { + "epoch": 8.4, + "learning_rate": 1.47412109375e-05, + "loss": 0.4513, + "step": 1080 + }, + { + "epoch": 8.55, + "learning_rate": 1.4643554687500001e-05, + "loss": 0.4394, + "step": 1100 + }, + { + "epoch": 8.71, + "learning_rate": 1.4545898437500002e-05, + "loss": 0.4337, + "step": 1120 + }, + { + "epoch": 8.86, + "learning_rate": 1.44482421875e-05, + "loss": 0.5037, + "step": 1140 + }, + { + "epoch": 9.02, + "learning_rate": 1.4350585937500001e-05, + "loss": 0.4209, + "step": 1160 + }, + { + "epoch": 9.17, + "learning_rate": 1.42529296875e-05, + "loss": 0.4073, + "step": 1180 + }, + { + "epoch": 9.33, + "learning_rate": 1.4155273437500001e-05, + "loss": 0.3879, + "step": 1200 + }, + { + "epoch": 9.48, + "learning_rate": 1.4057617187500002e-05, + "loss": 0.4434, + "step": 1220 + }, + { + "epoch": 9.64, + "learning_rate": 1.39599609375e-05, + "loss": 0.4062, + "step": 1240 + }, + { + "epoch": 9.8, + "learning_rate": 1.3862304687500001e-05, + "loss": 0.39, + "step": 1260 + }, + { + "epoch": 9.95, + "learning_rate": 1.3764648437500002e-05, + "loss": 0.4466, + "step": 1280 + }, + { + "epoch": 10.11, + "learning_rate": 1.36669921875e-05, + "loss": 0.4208, + "step": 1300 + }, + { + "epoch": 10.26, + "learning_rate": 1.3569335937500001e-05, + "loss": 0.3662, + "step": 1320 + }, + { + "epoch": 10.42, + "learning_rate": 1.3471679687500002e-05, + "loss": 0.4049, + "step": 1340 + }, + { + "epoch": 10.57, + "learning_rate": 1.33740234375e-05, + "loss": 0.3989, + "step": 1360 + }, + { + "epoch": 10.73, + "learning_rate": 1.3276367187500001e-05, + "loss": 0.3839, + "step": 1380 + }, + { + "epoch": 10.88, + "learning_rate": 1.3178710937500002e-05, + "loss": 0.4065, + "step": 1400 + }, + { + "epoch": 11.04, + "learning_rate": 1.30810546875e-05, + "loss": 0.3877, + "step": 1420 + }, + { + "epoch": 11.2, + "learning_rate": 1.2983398437500001e-05, + "loss": 0.4048, + "step": 1440 + }, + { + "epoch": 11.35, + "learning_rate": 1.2885742187500002e-05, + "loss": 0.3715, + "step": 1460 + }, + { + "epoch": 11.51, + "learning_rate": 1.27880859375e-05, + "loss": 0.3752, + "step": 1480 + }, + { + "epoch": 11.66, + "learning_rate": 1.2690429687500001e-05, + "loss": 0.3401, + "step": 1500 + } + ], + "max_steps": 4096, + "num_train_epochs": 32, + "total_flos": 1.9493458403328e+18, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-1500/training_args.bin b/checkpoint-1500/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..e2b3bee7f57e1aa04d4410b1e443601a704abcd7 --- /dev/null +++ b/checkpoint-1500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:19b61f1babbbf314354bf942366de8dc63aa97ebdccb27573b76080ab5b50935 +size 3963 diff --git a/checkpoint-2000/optimizer.pt b/checkpoint-2000/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..5192d0949427b5dc590ed128771d0101a0c955a0 --- /dev/null +++ b/checkpoint-2000/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dacbee21340912bc1b19620e6764f9e2dee7c229253a61caafea94ddd974a741 +size 33629893 diff --git a/checkpoint-2000/pytorch_model.bin b/checkpoint-2000/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..105c042d7743bc813f9956cfde3786c1cc35bec9 --- /dev/null +++ b/checkpoint-2000/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:92317da0dff52efc988b6d544fa6f157b2456b354033ff665573c79d2ee31eeb +size 7548185429 diff --git a/checkpoint-2000/rng_state.pth b/checkpoint-2000/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..b3327c712b0e370ed4ea8b3e15721401fc923a6c --- /dev/null +++ b/checkpoint-2000/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d02125204d43e1df0083f5b0bffbdd3804358954e6cd266141cb9fedb325cdd4 +size 14575 diff --git a/checkpoint-2000/scaler.pt b/checkpoint-2000/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..0dd68948ce9fc3cf3964b31855687c87faec57ad --- /dev/null +++ b/checkpoint-2000/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2f8b56bdf768c57d7f26c9f4a2bafcaeec7eb86582a790424fd2601003bfffdd +size 557 diff --git a/checkpoint-2000/scheduler.pt b/checkpoint-2000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..cb34ed0ca87196ae8a033a85f9a2d1fee97063de --- /dev/null +++ b/checkpoint-2000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1d318b1f56c85e8a7fb6ddaaa52d7cff0c567edba8a10f0e94e1e4ee35da82c8 +size 627 diff --git a/checkpoint-2000/trainer_state.json b/checkpoint-2000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..be8a6a1c66c64abcedf34674f7de4b1f199a34f5 --- /dev/null +++ b/checkpoint-2000/trainer_state.json @@ -0,0 +1,616 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 15.54907677356657, + "global_step": 2000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.16, + "learning_rate": 1.9912109375000002e-05, + "loss": 1.9656, + "step": 20 + }, + { + "epoch": 0.31, + "learning_rate": 1.98193359375e-05, + "loss": 1.8445, + "step": 40 + }, + { + "epoch": 0.47, + "learning_rate": 1.97216796875e-05, + "loss": 1.59, + "step": 60 + }, + { + "epoch": 0.62, + "learning_rate": 1.9624023437500002e-05, + "loss": 1.6275, + "step": 80 + }, + { + "epoch": 0.78, + "learning_rate": 1.9526367187500002e-05, + "loss": 1.473, + "step": 100 + }, + { + "epoch": 0.93, + "learning_rate": 1.9428710937500003e-05, + "loss": 1.3701, + "step": 120 + }, + { + "epoch": 1.09, + "learning_rate": 1.93310546875e-05, + "loss": 1.3224, + "step": 140 + }, + { + "epoch": 1.24, + "learning_rate": 1.92333984375e-05, + "loss": 1.1423, + "step": 160 + }, + { + "epoch": 1.4, + "learning_rate": 1.91357421875e-05, + "loss": 1.1652, + "step": 180 + }, + { + "epoch": 1.55, + "learning_rate": 1.9038085937500002e-05, + "loss": 1.1422, + "step": 200 + }, + { + "epoch": 1.71, + "learning_rate": 1.8940429687500002e-05, + "loss": 1.002, + "step": 220 + }, + { + "epoch": 1.87, + "learning_rate": 1.8842773437500003e-05, + "loss": 1.0779, + "step": 240 + }, + { + "epoch": 2.02, + "learning_rate": 1.87451171875e-05, + "loss": 0.9887, + "step": 260 + }, + { + "epoch": 2.18, + "learning_rate": 1.86474609375e-05, + "loss": 0.9543, + "step": 280 + }, + { + "epoch": 2.33, + "learning_rate": 1.85498046875e-05, + "loss": 0.9371, + "step": 300 + }, + { + "epoch": 2.49, + "learning_rate": 1.8452148437500002e-05, + "loss": 0.8701, + "step": 320 + }, + { + "epoch": 2.64, + "learning_rate": 1.8354492187500003e-05, + "loss": 0.875, + "step": 340 + }, + { + "epoch": 2.8, + "learning_rate": 1.82568359375e-05, + "loss": 0.7843, + "step": 360 + }, + { + "epoch": 2.95, + "learning_rate": 1.81591796875e-05, + "loss": 0.7945, + "step": 380 + }, + { + "epoch": 3.11, + "learning_rate": 1.80615234375e-05, + "loss": 0.799, + "step": 400 + }, + { + "epoch": 3.27, + "learning_rate": 1.79638671875e-05, + "loss": 0.7623, + "step": 420 + }, + { + "epoch": 3.42, + "learning_rate": 1.7866210937500002e-05, + "loss": 0.7263, + "step": 440 + }, + { + "epoch": 3.58, + "learning_rate": 1.7768554687500003e-05, + "loss": 0.7779, + "step": 460 + }, + { + "epoch": 3.73, + "learning_rate": 1.76708984375e-05, + "loss": 0.695, + "step": 480 + }, + { + "epoch": 3.89, + "learning_rate": 1.75732421875e-05, + "loss": 0.7344, + "step": 500 + }, + { + "epoch": 4.04, + "learning_rate": 1.74755859375e-05, + "loss": 0.695, + "step": 520 + }, + { + "epoch": 4.2, + "learning_rate": 1.7377929687500002e-05, + "loss": 0.6504, + "step": 540 + }, + { + "epoch": 4.35, + "learning_rate": 1.7280273437500002e-05, + "loss": 0.6447, + "step": 560 + }, + { + "epoch": 4.51, + "learning_rate": 1.7182617187500003e-05, + "loss": 0.6931, + "step": 580 + }, + { + "epoch": 4.66, + "learning_rate": 1.70849609375e-05, + "loss": 0.6256, + "step": 600 + }, + { + "epoch": 4.82, + "learning_rate": 1.69873046875e-05, + "loss": 0.6132, + "step": 620 + }, + { + "epoch": 4.98, + "learning_rate": 1.68896484375e-05, + "loss": 0.6001, + "step": 640 + }, + { + "epoch": 5.13, + "learning_rate": 1.6791992187500002e-05, + "loss": 0.6176, + "step": 660 + }, + { + "epoch": 5.29, + "learning_rate": 1.6694335937500002e-05, + "loss": 0.5709, + "step": 680 + }, + { + "epoch": 5.44, + "learning_rate": 1.65966796875e-05, + "loss": 0.564, + "step": 700 + }, + { + "epoch": 5.6, + "learning_rate": 1.64990234375e-05, + "loss": 0.5969, + "step": 720 + }, + { + "epoch": 5.75, + "learning_rate": 1.64013671875e-05, + "loss": 0.5484, + "step": 740 + }, + { + "epoch": 5.91, + "learning_rate": 1.63037109375e-05, + "loss": 0.5667, + "step": 760 + }, + { + "epoch": 6.06, + "learning_rate": 1.6206054687500002e-05, + "loss": 0.5442, + "step": 780 + }, + { + "epoch": 6.22, + "learning_rate": 1.6108398437500003e-05, + "loss": 0.4857, + "step": 800 + }, + { + "epoch": 6.38, + "learning_rate": 1.60107421875e-05, + "loss": 0.5225, + "step": 820 + }, + { + "epoch": 6.53, + "learning_rate": 1.59130859375e-05, + "loss": 0.5457, + "step": 840 + }, + { + "epoch": 6.69, + "learning_rate": 1.58154296875e-05, + "loss": 0.5315, + "step": 860 + }, + { + "epoch": 6.84, + "learning_rate": 1.57177734375e-05, + "loss": 0.5345, + "step": 880 + }, + { + "epoch": 7.0, + "learning_rate": 1.5620117187500002e-05, + "loss": 0.5169, + "step": 900 + }, + { + "epoch": 7.15, + "learning_rate": 1.5522460937500003e-05, + "loss": 0.5115, + "step": 920 + }, + { + "epoch": 7.31, + "learning_rate": 1.54248046875e-05, + "loss": 0.4913, + "step": 940 + }, + { + "epoch": 7.46, + "learning_rate": 1.53271484375e-05, + "loss": 0.4868, + "step": 960 + }, + { + "epoch": 7.62, + "learning_rate": 1.5229492187500001e-05, + "loss": 0.5226, + "step": 980 + }, + { + "epoch": 7.77, + "learning_rate": 1.5131835937500002e-05, + "loss": 0.4061, + "step": 1000 + }, + { + "epoch": 7.93, + "learning_rate": 1.50341796875e-05, + "loss": 0.4488, + "step": 1020 + }, + { + "epoch": 8.09, + "learning_rate": 1.4936523437500001e-05, + "loss": 0.4443, + "step": 1040 + }, + { + "epoch": 8.24, + "learning_rate": 1.4838867187500002e-05, + "loss": 0.4646, + "step": 1060 + }, + { + "epoch": 8.4, + "learning_rate": 1.47412109375e-05, + "loss": 0.4513, + "step": 1080 + }, + { + "epoch": 8.55, + "learning_rate": 1.4643554687500001e-05, + "loss": 0.4394, + "step": 1100 + }, + { + "epoch": 8.71, + "learning_rate": 1.4545898437500002e-05, + "loss": 0.4337, + "step": 1120 + }, + { + "epoch": 8.86, + "learning_rate": 1.44482421875e-05, + "loss": 0.5037, + "step": 1140 + }, + { + "epoch": 9.02, + "learning_rate": 1.4350585937500001e-05, + "loss": 0.4209, + "step": 1160 + }, + { + "epoch": 9.17, + "learning_rate": 1.42529296875e-05, + "loss": 0.4073, + "step": 1180 + }, + { + "epoch": 9.33, + "learning_rate": 1.4155273437500001e-05, + "loss": 0.3879, + "step": 1200 + }, + { + "epoch": 9.48, + "learning_rate": 1.4057617187500002e-05, + "loss": 0.4434, + "step": 1220 + }, + { + "epoch": 9.64, + "learning_rate": 1.39599609375e-05, + "loss": 0.4062, + "step": 1240 + }, + { + "epoch": 9.8, + "learning_rate": 1.3862304687500001e-05, + "loss": 0.39, + "step": 1260 + }, + { + "epoch": 9.95, + "learning_rate": 1.3764648437500002e-05, + "loss": 0.4466, + "step": 1280 + }, + { + "epoch": 10.11, + "learning_rate": 1.36669921875e-05, + "loss": 0.4208, + "step": 1300 + }, + { + "epoch": 10.26, + "learning_rate": 1.3569335937500001e-05, + "loss": 0.3662, + "step": 1320 + }, + { + "epoch": 10.42, + "learning_rate": 1.3471679687500002e-05, + "loss": 0.4049, + "step": 1340 + }, + { + "epoch": 10.57, + "learning_rate": 1.33740234375e-05, + "loss": 0.3989, + "step": 1360 + }, + { + "epoch": 10.73, + "learning_rate": 1.3276367187500001e-05, + "loss": 0.3839, + "step": 1380 + }, + { + "epoch": 10.88, + "learning_rate": 1.3178710937500002e-05, + "loss": 0.4065, + "step": 1400 + }, + { + "epoch": 11.04, + "learning_rate": 1.30810546875e-05, + "loss": 0.3877, + "step": 1420 + }, + { + "epoch": 11.2, + "learning_rate": 1.2983398437500001e-05, + "loss": 0.4048, + "step": 1440 + }, + { + "epoch": 11.35, + "learning_rate": 1.2885742187500002e-05, + "loss": 0.3715, + "step": 1460 + }, + { + "epoch": 11.51, + "learning_rate": 1.27880859375e-05, + "loss": 0.3752, + "step": 1480 + }, + { + "epoch": 11.66, + "learning_rate": 1.2690429687500001e-05, + "loss": 0.3401, + "step": 1500 + }, + { + "epoch": 11.82, + "learning_rate": 1.25927734375e-05, + "loss": 0.3545, + "step": 1520 + }, + { + "epoch": 11.97, + "learning_rate": 1.24951171875e-05, + "loss": 0.3718, + "step": 1540 + }, + { + "epoch": 12.13, + "learning_rate": 1.2397460937500001e-05, + "loss": 0.3755, + "step": 1560 + }, + { + "epoch": 12.28, + "learning_rate": 1.22998046875e-05, + "loss": 0.3865, + "step": 1580 + }, + { + "epoch": 12.44, + "learning_rate": 1.2202148437500001e-05, + "loss": 0.3237, + "step": 1600 + }, + { + "epoch": 12.59, + "learning_rate": 1.2104492187500001e-05, + "loss": 0.3702, + "step": 1620 + }, + { + "epoch": 12.75, + "learning_rate": 1.20068359375e-05, + "loss": 0.3238, + "step": 1640 + }, + { + "epoch": 12.91, + "learning_rate": 1.1909179687500001e-05, + "loss": 0.3373, + "step": 1660 + }, + { + "epoch": 13.06, + "learning_rate": 1.1811523437500002e-05, + "loss": 0.3486, + "step": 1680 + }, + { + "epoch": 13.22, + "learning_rate": 1.17138671875e-05, + "loss": 0.362, + "step": 1700 + }, + { + "epoch": 13.37, + "learning_rate": 1.1616210937500001e-05, + "loss": 0.3257, + "step": 1720 + }, + { + "epoch": 13.53, + "learning_rate": 1.1518554687500002e-05, + "loss": 0.3414, + "step": 1740 + }, + { + "epoch": 13.68, + "learning_rate": 1.14208984375e-05, + "loss": 0.3121, + "step": 1760 + }, + { + "epoch": 13.84, + "learning_rate": 1.1323242187500001e-05, + "loss": 0.3598, + "step": 1780 + }, + { + "epoch": 13.99, + "learning_rate": 1.1225585937500002e-05, + "loss": 0.2906, + "step": 1800 + }, + { + "epoch": 14.15, + "learning_rate": 1.11279296875e-05, + "loss": 0.3356, + "step": 1820 + }, + { + "epoch": 14.31, + "learning_rate": 1.1030273437500001e-05, + "loss": 0.2991, + "step": 1840 + }, + { + "epoch": 14.46, + "learning_rate": 1.09326171875e-05, + "loss": 0.2987, + "step": 1860 + }, + { + "epoch": 14.62, + "learning_rate": 1.08349609375e-05, + "loss": 0.3249, + "step": 1880 + }, + { + "epoch": 14.77, + "learning_rate": 1.0737304687500001e-05, + "loss": 0.3425, + "step": 1900 + }, + { + "epoch": 14.93, + "learning_rate": 1.06396484375e-05, + "loss": 0.3398, + "step": 1920 + }, + { + "epoch": 15.08, + "learning_rate": 1.05419921875e-05, + "loss": 0.3018, + "step": 1940 + }, + { + "epoch": 15.24, + "learning_rate": 1.0444335937500001e-05, + "loss": 0.2475, + "step": 1960 + }, + { + "epoch": 15.39, + "learning_rate": 1.03466796875e-05, + "loss": 0.3507, + "step": 1980 + }, + { + "epoch": 15.55, + "learning_rate": 1.02490234375e-05, + "loss": 0.3084, + "step": 2000 + } + ], + "max_steps": 4096, + "num_train_epochs": 32, + "total_flos": 2.5991277871104e+18, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-2000/training_args.bin b/checkpoint-2000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..e2b3bee7f57e1aa04d4410b1e443601a704abcd7 --- /dev/null +++ b/checkpoint-2000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:19b61f1babbbf314354bf942366de8dc63aa97ebdccb27573b76080ab5b50935 +size 3963 diff --git a/checkpoint-2500/optimizer.pt b/checkpoint-2500/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..da3af9fbe2a4dc15403efc55ea1c007f0a5a6312 --- /dev/null +++ b/checkpoint-2500/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6d0ce503ac3cfc0fb0bf9d5b0ba86746a0e747395397e162110c0b3055fa7b49 +size 33629893 diff --git a/checkpoint-2500/pytorch_model.bin b/checkpoint-2500/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..6eb57168cc39bbc798857ed31cd4c294ade6b95c --- /dev/null +++ b/checkpoint-2500/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2b4ea21b4a203b57985d8fcd5debaaac82022e8ea9bf67b93fae1b484be32371 +size 7548185429 diff --git a/checkpoint-2500/rng_state.pth b/checkpoint-2500/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..4ed08c537464e4249a2452a9d583286ef59c25f0 --- /dev/null +++ b/checkpoint-2500/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bca9221408904a641ab27bbba78b92b50bc20ac10c411059ee7210723c151689 +size 14575 diff --git a/checkpoint-2500/scaler.pt b/checkpoint-2500/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..949aae9404bb5b8482e3867a8cc88da86ddb5773 --- /dev/null +++ b/checkpoint-2500/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:94c8197602c1dd7f85c5171081176d8d544489fff7e8448e92707aa6eca14087 +size 557 diff --git a/checkpoint-2500/scheduler.pt b/checkpoint-2500/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..61d1311a196a5c601454bfdf516486fb8f67fb9b --- /dev/null +++ b/checkpoint-2500/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e266ba6a4bc8c85cbd44824c164d6a53bd5711d4872f2290a82858dd687d6778 +size 627 diff --git a/checkpoint-2500/trainer_state.json b/checkpoint-2500/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..203912e2282f1d1d10c29bf2395f0a0f159b98dd --- /dev/null +++ b/checkpoint-2500/trainer_state.json @@ -0,0 +1,766 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 19.43634596695821, + "global_step": 2500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.16, + "learning_rate": 1.9912109375000002e-05, + "loss": 1.9656, + "step": 20 + }, + { + "epoch": 0.31, + "learning_rate": 1.98193359375e-05, + "loss": 1.8445, + "step": 40 + }, + { + "epoch": 0.47, + "learning_rate": 1.97216796875e-05, + "loss": 1.59, + "step": 60 + }, + { + "epoch": 0.62, + "learning_rate": 1.9624023437500002e-05, + "loss": 1.6275, + "step": 80 + }, + { + "epoch": 0.78, + "learning_rate": 1.9526367187500002e-05, + "loss": 1.473, + "step": 100 + }, + { + "epoch": 0.93, + "learning_rate": 1.9428710937500003e-05, + "loss": 1.3701, + "step": 120 + }, + { + "epoch": 1.09, + "learning_rate": 1.93310546875e-05, + "loss": 1.3224, + "step": 140 + }, + { + "epoch": 1.24, + "learning_rate": 1.92333984375e-05, + "loss": 1.1423, + "step": 160 + }, + { + "epoch": 1.4, + "learning_rate": 1.91357421875e-05, + "loss": 1.1652, + "step": 180 + }, + { + "epoch": 1.55, + "learning_rate": 1.9038085937500002e-05, + "loss": 1.1422, + "step": 200 + }, + { + "epoch": 1.71, + "learning_rate": 1.8940429687500002e-05, + "loss": 1.002, + "step": 220 + }, + { + "epoch": 1.87, + "learning_rate": 1.8842773437500003e-05, + "loss": 1.0779, + "step": 240 + }, + { + "epoch": 2.02, + "learning_rate": 1.87451171875e-05, + "loss": 0.9887, + "step": 260 + }, + { + "epoch": 2.18, + "learning_rate": 1.86474609375e-05, + "loss": 0.9543, + "step": 280 + }, + { + "epoch": 2.33, + "learning_rate": 1.85498046875e-05, + "loss": 0.9371, + "step": 300 + }, + { + "epoch": 2.49, + "learning_rate": 1.8452148437500002e-05, + "loss": 0.8701, + "step": 320 + }, + { + "epoch": 2.64, + "learning_rate": 1.8354492187500003e-05, + "loss": 0.875, + "step": 340 + }, + { + "epoch": 2.8, + "learning_rate": 1.82568359375e-05, + "loss": 0.7843, + "step": 360 + }, + { + "epoch": 2.95, + "learning_rate": 1.81591796875e-05, + "loss": 0.7945, + "step": 380 + }, + { + "epoch": 3.11, + "learning_rate": 1.80615234375e-05, + "loss": 0.799, + "step": 400 + }, + { + "epoch": 3.27, + "learning_rate": 1.79638671875e-05, + "loss": 0.7623, + "step": 420 + }, + { + "epoch": 3.42, + "learning_rate": 1.7866210937500002e-05, + "loss": 0.7263, + "step": 440 + }, + { + "epoch": 3.58, + "learning_rate": 1.7768554687500003e-05, + "loss": 0.7779, + "step": 460 + }, + { + "epoch": 3.73, + "learning_rate": 1.76708984375e-05, + "loss": 0.695, + "step": 480 + }, + { + "epoch": 3.89, + "learning_rate": 1.75732421875e-05, + "loss": 0.7344, + "step": 500 + }, + { + "epoch": 4.04, + "learning_rate": 1.74755859375e-05, + "loss": 0.695, + "step": 520 + }, + { + "epoch": 4.2, + "learning_rate": 1.7377929687500002e-05, + "loss": 0.6504, + "step": 540 + }, + { + "epoch": 4.35, + "learning_rate": 1.7280273437500002e-05, + "loss": 0.6447, + "step": 560 + }, + { + "epoch": 4.51, + "learning_rate": 1.7182617187500003e-05, + "loss": 0.6931, + "step": 580 + }, + { + "epoch": 4.66, + "learning_rate": 1.70849609375e-05, + "loss": 0.6256, + "step": 600 + }, + { + "epoch": 4.82, + "learning_rate": 1.69873046875e-05, + "loss": 0.6132, + "step": 620 + }, + { + "epoch": 4.98, + "learning_rate": 1.68896484375e-05, + "loss": 0.6001, + "step": 640 + }, + { + "epoch": 5.13, + "learning_rate": 1.6791992187500002e-05, + "loss": 0.6176, + "step": 660 + }, + { + "epoch": 5.29, + "learning_rate": 1.6694335937500002e-05, + "loss": 0.5709, + "step": 680 + }, + { + "epoch": 5.44, + "learning_rate": 1.65966796875e-05, + "loss": 0.564, + "step": 700 + }, + { + "epoch": 5.6, + "learning_rate": 1.64990234375e-05, + "loss": 0.5969, + "step": 720 + }, + { + "epoch": 5.75, + "learning_rate": 1.64013671875e-05, + "loss": 0.5484, + "step": 740 + }, + { + "epoch": 5.91, + "learning_rate": 1.63037109375e-05, + "loss": 0.5667, + "step": 760 + }, + { + "epoch": 6.06, + "learning_rate": 1.6206054687500002e-05, + "loss": 0.5442, + "step": 780 + }, + { + "epoch": 6.22, + "learning_rate": 1.6108398437500003e-05, + "loss": 0.4857, + "step": 800 + }, + { + "epoch": 6.38, + "learning_rate": 1.60107421875e-05, + "loss": 0.5225, + "step": 820 + }, + { + "epoch": 6.53, + "learning_rate": 1.59130859375e-05, + "loss": 0.5457, + "step": 840 + }, + { + "epoch": 6.69, + "learning_rate": 1.58154296875e-05, + "loss": 0.5315, + "step": 860 + }, + { + "epoch": 6.84, + "learning_rate": 1.57177734375e-05, + "loss": 0.5345, + "step": 880 + }, + { + "epoch": 7.0, + "learning_rate": 1.5620117187500002e-05, + "loss": 0.5169, + "step": 900 + }, + { + "epoch": 7.15, + "learning_rate": 1.5522460937500003e-05, + "loss": 0.5115, + "step": 920 + }, + { + "epoch": 7.31, + "learning_rate": 1.54248046875e-05, + "loss": 0.4913, + "step": 940 + }, + { + "epoch": 7.46, + "learning_rate": 1.53271484375e-05, + "loss": 0.4868, + "step": 960 + }, + { + "epoch": 7.62, + "learning_rate": 1.5229492187500001e-05, + "loss": 0.5226, + "step": 980 + }, + { + "epoch": 7.77, + "learning_rate": 1.5131835937500002e-05, + "loss": 0.4061, + "step": 1000 + }, + { + "epoch": 7.93, + "learning_rate": 1.50341796875e-05, + "loss": 0.4488, + "step": 1020 + }, + { + "epoch": 8.09, + "learning_rate": 1.4936523437500001e-05, + "loss": 0.4443, + "step": 1040 + }, + { + "epoch": 8.24, + "learning_rate": 1.4838867187500002e-05, + "loss": 0.4646, + "step": 1060 + }, + { + "epoch": 8.4, + "learning_rate": 1.47412109375e-05, + "loss": 0.4513, + "step": 1080 + }, + { + "epoch": 8.55, + "learning_rate": 1.4643554687500001e-05, + "loss": 0.4394, + "step": 1100 + }, + { + "epoch": 8.71, + "learning_rate": 1.4545898437500002e-05, + "loss": 0.4337, + "step": 1120 + }, + { + "epoch": 8.86, + "learning_rate": 1.44482421875e-05, + "loss": 0.5037, + "step": 1140 + }, + { + "epoch": 9.02, + "learning_rate": 1.4350585937500001e-05, + "loss": 0.4209, + "step": 1160 + }, + { + "epoch": 9.17, + "learning_rate": 1.42529296875e-05, + "loss": 0.4073, + "step": 1180 + }, + { + "epoch": 9.33, + "learning_rate": 1.4155273437500001e-05, + "loss": 0.3879, + "step": 1200 + }, + { + "epoch": 9.48, + "learning_rate": 1.4057617187500002e-05, + "loss": 0.4434, + "step": 1220 + }, + { + "epoch": 9.64, + "learning_rate": 1.39599609375e-05, + "loss": 0.4062, + "step": 1240 + }, + { + "epoch": 9.8, + "learning_rate": 1.3862304687500001e-05, + "loss": 0.39, + "step": 1260 + }, + { + "epoch": 9.95, + "learning_rate": 1.3764648437500002e-05, + "loss": 0.4466, + "step": 1280 + }, + { + "epoch": 10.11, + "learning_rate": 1.36669921875e-05, + "loss": 0.4208, + "step": 1300 + }, + { + "epoch": 10.26, + "learning_rate": 1.3569335937500001e-05, + "loss": 0.3662, + "step": 1320 + }, + { + "epoch": 10.42, + "learning_rate": 1.3471679687500002e-05, + "loss": 0.4049, + "step": 1340 + }, + { + "epoch": 10.57, + "learning_rate": 1.33740234375e-05, + "loss": 0.3989, + "step": 1360 + }, + { + "epoch": 10.73, + "learning_rate": 1.3276367187500001e-05, + "loss": 0.3839, + "step": 1380 + }, + { + "epoch": 10.88, + "learning_rate": 1.3178710937500002e-05, + "loss": 0.4065, + "step": 1400 + }, + { + "epoch": 11.04, + "learning_rate": 1.30810546875e-05, + "loss": 0.3877, + "step": 1420 + }, + { + "epoch": 11.2, + "learning_rate": 1.2983398437500001e-05, + "loss": 0.4048, + "step": 1440 + }, + { + "epoch": 11.35, + "learning_rate": 1.2885742187500002e-05, + "loss": 0.3715, + "step": 1460 + }, + { + "epoch": 11.51, + "learning_rate": 1.27880859375e-05, + "loss": 0.3752, + "step": 1480 + }, + { + "epoch": 11.66, + "learning_rate": 1.2690429687500001e-05, + "loss": 0.3401, + "step": 1500 + }, + { + "epoch": 11.82, + "learning_rate": 1.25927734375e-05, + "loss": 0.3545, + "step": 1520 + }, + { + "epoch": 11.97, + "learning_rate": 1.24951171875e-05, + "loss": 0.3718, + "step": 1540 + }, + { + "epoch": 12.13, + "learning_rate": 1.2397460937500001e-05, + "loss": 0.3755, + "step": 1560 + }, + { + "epoch": 12.28, + "learning_rate": 1.22998046875e-05, + "loss": 0.3865, + "step": 1580 + }, + { + "epoch": 12.44, + "learning_rate": 1.2202148437500001e-05, + "loss": 0.3237, + "step": 1600 + }, + { + "epoch": 12.59, + "learning_rate": 1.2104492187500001e-05, + "loss": 0.3702, + "step": 1620 + }, + { + "epoch": 12.75, + "learning_rate": 1.20068359375e-05, + "loss": 0.3238, + "step": 1640 + }, + { + "epoch": 12.91, + "learning_rate": 1.1909179687500001e-05, + "loss": 0.3373, + "step": 1660 + }, + { + "epoch": 13.06, + "learning_rate": 1.1811523437500002e-05, + "loss": 0.3486, + "step": 1680 + }, + { + "epoch": 13.22, + "learning_rate": 1.17138671875e-05, + "loss": 0.362, + "step": 1700 + }, + { + "epoch": 13.37, + "learning_rate": 1.1616210937500001e-05, + "loss": 0.3257, + "step": 1720 + }, + { + "epoch": 13.53, + "learning_rate": 1.1518554687500002e-05, + "loss": 0.3414, + "step": 1740 + }, + { + "epoch": 13.68, + "learning_rate": 1.14208984375e-05, + "loss": 0.3121, + "step": 1760 + }, + { + "epoch": 13.84, + "learning_rate": 1.1323242187500001e-05, + "loss": 0.3598, + "step": 1780 + }, + { + "epoch": 13.99, + "learning_rate": 1.1225585937500002e-05, + "loss": 0.2906, + "step": 1800 + }, + { + "epoch": 14.15, + "learning_rate": 1.11279296875e-05, + "loss": 0.3356, + "step": 1820 + }, + { + "epoch": 14.31, + "learning_rate": 1.1030273437500001e-05, + "loss": 0.2991, + "step": 1840 + }, + { + "epoch": 14.46, + "learning_rate": 1.09326171875e-05, + "loss": 0.2987, + "step": 1860 + }, + { + "epoch": 14.62, + "learning_rate": 1.08349609375e-05, + "loss": 0.3249, + "step": 1880 + }, + { + "epoch": 14.77, + "learning_rate": 1.0737304687500001e-05, + "loss": 0.3425, + "step": 1900 + }, + { + "epoch": 14.93, + "learning_rate": 1.06396484375e-05, + "loss": 0.3398, + "step": 1920 + }, + { + "epoch": 15.08, + "learning_rate": 1.05419921875e-05, + "loss": 0.3018, + "step": 1940 + }, + { + "epoch": 15.24, + "learning_rate": 1.0444335937500001e-05, + "loss": 0.2475, + "step": 1960 + }, + { + "epoch": 15.39, + "learning_rate": 1.03466796875e-05, + "loss": 0.3507, + "step": 1980 + }, + { + "epoch": 15.55, + "learning_rate": 1.02490234375e-05, + "loss": 0.3084, + "step": 2000 + }, + { + "epoch": 15.7, + "learning_rate": 1.0151367187500001e-05, + "loss": 0.3212, + "step": 2020 + }, + { + "epoch": 15.86, + "learning_rate": 1.00537109375e-05, + "loss": 0.2831, + "step": 2040 + }, + { + "epoch": 16.02, + "learning_rate": 9.956054687500001e-06, + "loss": 0.3072, + "step": 2060 + }, + { + "epoch": 16.17, + "learning_rate": 9.858398437500002e-06, + "loss": 0.3293, + "step": 2080 + }, + { + "epoch": 16.33, + "learning_rate": 9.7607421875e-06, + "loss": 0.2738, + "step": 2100 + }, + { + "epoch": 16.48, + "learning_rate": 9.663085937500001e-06, + "loss": 0.3245, + "step": 2120 + }, + { + "epoch": 16.64, + "learning_rate": 9.565429687500002e-06, + "loss": 0.2846, + "step": 2140 + }, + { + "epoch": 16.79, + "learning_rate": 9.4677734375e-06, + "loss": 0.2906, + "step": 2160 + }, + { + "epoch": 16.95, + "learning_rate": 9.370117187500001e-06, + "loss": 0.263, + "step": 2180 + }, + { + "epoch": 17.1, + "learning_rate": 9.2724609375e-06, + "loss": 0.286, + "step": 2200 + }, + { + "epoch": 17.26, + "learning_rate": 9.1748046875e-06, + "loss": 0.3152, + "step": 2220 + }, + { + "epoch": 17.41, + "learning_rate": 9.077148437500001e-06, + "loss": 0.2811, + "step": 2240 + }, + { + "epoch": 17.57, + "learning_rate": 8.9794921875e-06, + "loss": 0.257, + "step": 2260 + }, + { + "epoch": 17.73, + "learning_rate": 8.8818359375e-06, + "loss": 0.267, + "step": 2280 + }, + { + "epoch": 17.88, + "learning_rate": 8.784179687500001e-06, + "loss": 0.3056, + "step": 2300 + }, + { + "epoch": 18.04, + "learning_rate": 8.6865234375e-06, + "loss": 0.2522, + "step": 2320 + }, + { + "epoch": 18.19, + "learning_rate": 8.5888671875e-06, + "loss": 0.2806, + "step": 2340 + }, + { + "epoch": 18.35, + "learning_rate": 8.491210937500001e-06, + "loss": 0.266, + "step": 2360 + }, + { + "epoch": 18.5, + "learning_rate": 8.3935546875e-06, + "loss": 0.288, + "step": 2380 + }, + { + "epoch": 18.66, + "learning_rate": 8.2958984375e-06, + "loss": 0.2712, + "step": 2400 + }, + { + "epoch": 18.81, + "learning_rate": 8.198242187500001e-06, + "loss": 0.3099, + "step": 2420 + }, + { + "epoch": 18.97, + "learning_rate": 8.1005859375e-06, + "loss": 0.2205, + "step": 2440 + }, + { + "epoch": 19.13, + "learning_rate": 8.002929687500001e-06, + "loss": 0.253, + "step": 2460 + }, + { + "epoch": 19.28, + "learning_rate": 7.905273437500001e-06, + "loss": 0.2885, + "step": 2480 + }, + { + "epoch": 19.44, + "learning_rate": 7.8076171875e-06, + "loss": 0.2326, + "step": 2500 + } + ], + "max_steps": 4096, + "num_train_epochs": 32, + "total_flos": 3.248909733888e+18, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-2500/training_args.bin b/checkpoint-2500/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..e2b3bee7f57e1aa04d4410b1e443601a704abcd7 --- /dev/null +++ b/checkpoint-2500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:19b61f1babbbf314354bf942366de8dc63aa97ebdccb27573b76080ab5b50935 +size 3963 diff --git a/checkpoint-3000/optimizer.pt b/checkpoint-3000/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..64b51f34018eb18e2c78c665ffa5607009396d54 --- /dev/null +++ b/checkpoint-3000/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:abd6096034801a8462add69f797380a513b314f24469c357222535bef86c3c49 +size 33629893 diff --git a/checkpoint-3000/pytorch_model.bin b/checkpoint-3000/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..018a77ef38a9b2ff365b663d971a64bbc6b808c0 --- /dev/null +++ b/checkpoint-3000/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3fadea24b4342ba15391579a69c31e51d973a4033c689ba35538b1ae86a64416 +size 7548185429 diff --git a/checkpoint-3000/rng_state.pth b/checkpoint-3000/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..a43f5b1e245938e287202d36424125eb12393d4d --- /dev/null +++ b/checkpoint-3000/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:57b99a395afb04fbb4ea183752c061c3a4f2c1c2097078e1a5e8562b178e2802 +size 14575 diff --git a/checkpoint-3000/scaler.pt b/checkpoint-3000/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..cd20ed6b99faaef100471d152db1b2cb81f21d19 --- /dev/null +++ b/checkpoint-3000/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:513a9b45b859c887bcff749f25fc1ee060b7a5baf665003a7040bbf3fcbb9016 +size 557 diff --git a/checkpoint-3000/scheduler.pt b/checkpoint-3000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..f17007bfb163010b9e8c3afc624a154b3bf268fd --- /dev/null +++ b/checkpoint-3000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc629afeb6334b604b054705c97bd270f739029d4e5678293dbec8b8ac0c9565 +size 627 diff --git a/checkpoint-3000/trainer_state.json b/checkpoint-3000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..1953959e02a6f43490ea515021b4e2307558376d --- /dev/null +++ b/checkpoint-3000/trainer_state.json @@ -0,0 +1,916 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 23.323615160349853, + "global_step": 3000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.16, + "learning_rate": 1.9912109375000002e-05, + "loss": 1.9656, + "step": 20 + }, + { + "epoch": 0.31, + "learning_rate": 1.98193359375e-05, + "loss": 1.8445, + "step": 40 + }, + { + "epoch": 0.47, + "learning_rate": 1.97216796875e-05, + "loss": 1.59, + "step": 60 + }, + { + "epoch": 0.62, + "learning_rate": 1.9624023437500002e-05, + "loss": 1.6275, + "step": 80 + }, + { + "epoch": 0.78, + "learning_rate": 1.9526367187500002e-05, + "loss": 1.473, + "step": 100 + }, + { + "epoch": 0.93, + "learning_rate": 1.9428710937500003e-05, + "loss": 1.3701, + "step": 120 + }, + { + "epoch": 1.09, + "learning_rate": 1.93310546875e-05, + "loss": 1.3224, + "step": 140 + }, + { + "epoch": 1.24, + "learning_rate": 1.92333984375e-05, + "loss": 1.1423, + "step": 160 + }, + { + "epoch": 1.4, + "learning_rate": 1.91357421875e-05, + "loss": 1.1652, + "step": 180 + }, + { + "epoch": 1.55, + "learning_rate": 1.9038085937500002e-05, + "loss": 1.1422, + "step": 200 + }, + { + "epoch": 1.71, + "learning_rate": 1.8940429687500002e-05, + "loss": 1.002, + "step": 220 + }, + { + "epoch": 1.87, + "learning_rate": 1.8842773437500003e-05, + "loss": 1.0779, + "step": 240 + }, + { + "epoch": 2.02, + "learning_rate": 1.87451171875e-05, + "loss": 0.9887, + "step": 260 + }, + { + "epoch": 2.18, + "learning_rate": 1.86474609375e-05, + "loss": 0.9543, + "step": 280 + }, + { + "epoch": 2.33, + "learning_rate": 1.85498046875e-05, + "loss": 0.9371, + "step": 300 + }, + { + "epoch": 2.49, + "learning_rate": 1.8452148437500002e-05, + "loss": 0.8701, + "step": 320 + }, + { + "epoch": 2.64, + "learning_rate": 1.8354492187500003e-05, + "loss": 0.875, + "step": 340 + }, + { + "epoch": 2.8, + "learning_rate": 1.82568359375e-05, + "loss": 0.7843, + "step": 360 + }, + { + "epoch": 2.95, + "learning_rate": 1.81591796875e-05, + "loss": 0.7945, + "step": 380 + }, + { + "epoch": 3.11, + "learning_rate": 1.80615234375e-05, + "loss": 0.799, + "step": 400 + }, + { + "epoch": 3.27, + "learning_rate": 1.79638671875e-05, + "loss": 0.7623, + "step": 420 + }, + { + "epoch": 3.42, + "learning_rate": 1.7866210937500002e-05, + "loss": 0.7263, + "step": 440 + }, + { + "epoch": 3.58, + "learning_rate": 1.7768554687500003e-05, + "loss": 0.7779, + "step": 460 + }, + { + "epoch": 3.73, + "learning_rate": 1.76708984375e-05, + "loss": 0.695, + "step": 480 + }, + { + "epoch": 3.89, + "learning_rate": 1.75732421875e-05, + "loss": 0.7344, + "step": 500 + }, + { + "epoch": 4.04, + "learning_rate": 1.74755859375e-05, + "loss": 0.695, + "step": 520 + }, + { + "epoch": 4.2, + "learning_rate": 1.7377929687500002e-05, + "loss": 0.6504, + "step": 540 + }, + { + "epoch": 4.35, + "learning_rate": 1.7280273437500002e-05, + "loss": 0.6447, + "step": 560 + }, + { + "epoch": 4.51, + "learning_rate": 1.7182617187500003e-05, + "loss": 0.6931, + "step": 580 + }, + { + "epoch": 4.66, + "learning_rate": 1.70849609375e-05, + "loss": 0.6256, + "step": 600 + }, + { + "epoch": 4.82, + "learning_rate": 1.69873046875e-05, + "loss": 0.6132, + "step": 620 + }, + { + "epoch": 4.98, + "learning_rate": 1.68896484375e-05, + "loss": 0.6001, + "step": 640 + }, + { + "epoch": 5.13, + "learning_rate": 1.6791992187500002e-05, + "loss": 0.6176, + "step": 660 + }, + { + "epoch": 5.29, + "learning_rate": 1.6694335937500002e-05, + "loss": 0.5709, + "step": 680 + }, + { + "epoch": 5.44, + "learning_rate": 1.65966796875e-05, + "loss": 0.564, + "step": 700 + }, + { + "epoch": 5.6, + "learning_rate": 1.64990234375e-05, + "loss": 0.5969, + "step": 720 + }, + { + "epoch": 5.75, + "learning_rate": 1.64013671875e-05, + "loss": 0.5484, + "step": 740 + }, + { + "epoch": 5.91, + "learning_rate": 1.63037109375e-05, + "loss": 0.5667, + "step": 760 + }, + { + "epoch": 6.06, + "learning_rate": 1.6206054687500002e-05, + "loss": 0.5442, + "step": 780 + }, + { + "epoch": 6.22, + "learning_rate": 1.6108398437500003e-05, + "loss": 0.4857, + "step": 800 + }, + { + "epoch": 6.38, + "learning_rate": 1.60107421875e-05, + "loss": 0.5225, + "step": 820 + }, + { + "epoch": 6.53, + "learning_rate": 1.59130859375e-05, + "loss": 0.5457, + "step": 840 + }, + { + "epoch": 6.69, + "learning_rate": 1.58154296875e-05, + "loss": 0.5315, + "step": 860 + }, + { + "epoch": 6.84, + "learning_rate": 1.57177734375e-05, + "loss": 0.5345, + "step": 880 + }, + { + "epoch": 7.0, + "learning_rate": 1.5620117187500002e-05, + "loss": 0.5169, + "step": 900 + }, + { + "epoch": 7.15, + "learning_rate": 1.5522460937500003e-05, + "loss": 0.5115, + "step": 920 + }, + { + "epoch": 7.31, + "learning_rate": 1.54248046875e-05, + "loss": 0.4913, + "step": 940 + }, + { + "epoch": 7.46, + "learning_rate": 1.53271484375e-05, + "loss": 0.4868, + "step": 960 + }, + { + "epoch": 7.62, + "learning_rate": 1.5229492187500001e-05, + "loss": 0.5226, + "step": 980 + }, + { + "epoch": 7.77, + "learning_rate": 1.5131835937500002e-05, + "loss": 0.4061, + "step": 1000 + }, + { + "epoch": 7.93, + "learning_rate": 1.50341796875e-05, + "loss": 0.4488, + "step": 1020 + }, + { + "epoch": 8.09, + "learning_rate": 1.4936523437500001e-05, + "loss": 0.4443, + "step": 1040 + }, + { + "epoch": 8.24, + "learning_rate": 1.4838867187500002e-05, + "loss": 0.4646, + "step": 1060 + }, + { + "epoch": 8.4, + "learning_rate": 1.47412109375e-05, + "loss": 0.4513, + "step": 1080 + }, + { + "epoch": 8.55, + "learning_rate": 1.4643554687500001e-05, + "loss": 0.4394, + "step": 1100 + }, + { + "epoch": 8.71, + "learning_rate": 1.4545898437500002e-05, + "loss": 0.4337, + "step": 1120 + }, + { + "epoch": 8.86, + "learning_rate": 1.44482421875e-05, + "loss": 0.5037, + "step": 1140 + }, + { + "epoch": 9.02, + "learning_rate": 1.4350585937500001e-05, + "loss": 0.4209, + "step": 1160 + }, + { + "epoch": 9.17, + "learning_rate": 1.42529296875e-05, + "loss": 0.4073, + "step": 1180 + }, + { + "epoch": 9.33, + "learning_rate": 1.4155273437500001e-05, + "loss": 0.3879, + "step": 1200 + }, + { + "epoch": 9.48, + "learning_rate": 1.4057617187500002e-05, + "loss": 0.4434, + "step": 1220 + }, + { + "epoch": 9.64, + "learning_rate": 1.39599609375e-05, + "loss": 0.4062, + "step": 1240 + }, + { + "epoch": 9.8, + "learning_rate": 1.3862304687500001e-05, + "loss": 0.39, + "step": 1260 + }, + { + "epoch": 9.95, + "learning_rate": 1.3764648437500002e-05, + "loss": 0.4466, + "step": 1280 + }, + { + "epoch": 10.11, + "learning_rate": 1.36669921875e-05, + "loss": 0.4208, + "step": 1300 + }, + { + "epoch": 10.26, + "learning_rate": 1.3569335937500001e-05, + "loss": 0.3662, + "step": 1320 + }, + { + "epoch": 10.42, + "learning_rate": 1.3471679687500002e-05, + "loss": 0.4049, + "step": 1340 + }, + { + "epoch": 10.57, + "learning_rate": 1.33740234375e-05, + "loss": 0.3989, + "step": 1360 + }, + { + "epoch": 10.73, + "learning_rate": 1.3276367187500001e-05, + "loss": 0.3839, + "step": 1380 + }, + { + "epoch": 10.88, + "learning_rate": 1.3178710937500002e-05, + "loss": 0.4065, + "step": 1400 + }, + { + "epoch": 11.04, + "learning_rate": 1.30810546875e-05, + "loss": 0.3877, + "step": 1420 + }, + { + "epoch": 11.2, + "learning_rate": 1.2983398437500001e-05, + "loss": 0.4048, + "step": 1440 + }, + { + "epoch": 11.35, + "learning_rate": 1.2885742187500002e-05, + "loss": 0.3715, + "step": 1460 + }, + { + "epoch": 11.51, + "learning_rate": 1.27880859375e-05, + "loss": 0.3752, + "step": 1480 + }, + { + "epoch": 11.66, + "learning_rate": 1.2690429687500001e-05, + "loss": 0.3401, + "step": 1500 + }, + { + "epoch": 11.82, + "learning_rate": 1.25927734375e-05, + "loss": 0.3545, + "step": 1520 + }, + { + "epoch": 11.97, + "learning_rate": 1.24951171875e-05, + "loss": 0.3718, + "step": 1540 + }, + { + "epoch": 12.13, + "learning_rate": 1.2397460937500001e-05, + "loss": 0.3755, + "step": 1560 + }, + { + "epoch": 12.28, + "learning_rate": 1.22998046875e-05, + "loss": 0.3865, + "step": 1580 + }, + { + "epoch": 12.44, + "learning_rate": 1.2202148437500001e-05, + "loss": 0.3237, + "step": 1600 + }, + { + "epoch": 12.59, + "learning_rate": 1.2104492187500001e-05, + "loss": 0.3702, + "step": 1620 + }, + { + "epoch": 12.75, + "learning_rate": 1.20068359375e-05, + "loss": 0.3238, + "step": 1640 + }, + { + "epoch": 12.91, + "learning_rate": 1.1909179687500001e-05, + "loss": 0.3373, + "step": 1660 + }, + { + "epoch": 13.06, + "learning_rate": 1.1811523437500002e-05, + "loss": 0.3486, + "step": 1680 + }, + { + "epoch": 13.22, + "learning_rate": 1.17138671875e-05, + "loss": 0.362, + "step": 1700 + }, + { + "epoch": 13.37, + "learning_rate": 1.1616210937500001e-05, + "loss": 0.3257, + "step": 1720 + }, + { + "epoch": 13.53, + "learning_rate": 1.1518554687500002e-05, + "loss": 0.3414, + "step": 1740 + }, + { + "epoch": 13.68, + "learning_rate": 1.14208984375e-05, + "loss": 0.3121, + "step": 1760 + }, + { + "epoch": 13.84, + "learning_rate": 1.1323242187500001e-05, + "loss": 0.3598, + "step": 1780 + }, + { + "epoch": 13.99, + "learning_rate": 1.1225585937500002e-05, + "loss": 0.2906, + "step": 1800 + }, + { + "epoch": 14.15, + "learning_rate": 1.11279296875e-05, + "loss": 0.3356, + "step": 1820 + }, + { + "epoch": 14.31, + "learning_rate": 1.1030273437500001e-05, + "loss": 0.2991, + "step": 1840 + }, + { + "epoch": 14.46, + "learning_rate": 1.09326171875e-05, + "loss": 0.2987, + "step": 1860 + }, + { + "epoch": 14.62, + "learning_rate": 1.08349609375e-05, + "loss": 0.3249, + "step": 1880 + }, + { + "epoch": 14.77, + "learning_rate": 1.0737304687500001e-05, + "loss": 0.3425, + "step": 1900 + }, + { + "epoch": 14.93, + "learning_rate": 1.06396484375e-05, + "loss": 0.3398, + "step": 1920 + }, + { + "epoch": 15.08, + "learning_rate": 1.05419921875e-05, + "loss": 0.3018, + "step": 1940 + }, + { + "epoch": 15.24, + "learning_rate": 1.0444335937500001e-05, + "loss": 0.2475, + "step": 1960 + }, + { + "epoch": 15.39, + "learning_rate": 1.03466796875e-05, + "loss": 0.3507, + "step": 1980 + }, + { + "epoch": 15.55, + "learning_rate": 1.02490234375e-05, + "loss": 0.3084, + "step": 2000 + }, + { + "epoch": 15.7, + "learning_rate": 1.0151367187500001e-05, + "loss": 0.3212, + "step": 2020 + }, + { + "epoch": 15.86, + "learning_rate": 1.00537109375e-05, + "loss": 0.2831, + "step": 2040 + }, + { + "epoch": 16.02, + "learning_rate": 9.956054687500001e-06, + "loss": 0.3072, + "step": 2060 + }, + { + "epoch": 16.17, + "learning_rate": 9.858398437500002e-06, + "loss": 0.3293, + "step": 2080 + }, + { + "epoch": 16.33, + "learning_rate": 9.7607421875e-06, + "loss": 0.2738, + "step": 2100 + }, + { + "epoch": 16.48, + "learning_rate": 9.663085937500001e-06, + "loss": 0.3245, + "step": 2120 + }, + { + "epoch": 16.64, + "learning_rate": 9.565429687500002e-06, + "loss": 0.2846, + "step": 2140 + }, + { + "epoch": 16.79, + "learning_rate": 9.4677734375e-06, + "loss": 0.2906, + "step": 2160 + }, + { + "epoch": 16.95, + "learning_rate": 9.370117187500001e-06, + "loss": 0.263, + "step": 2180 + }, + { + "epoch": 17.1, + "learning_rate": 9.2724609375e-06, + "loss": 0.286, + "step": 2200 + }, + { + "epoch": 17.26, + "learning_rate": 9.1748046875e-06, + "loss": 0.3152, + "step": 2220 + }, + { + "epoch": 17.41, + "learning_rate": 9.077148437500001e-06, + "loss": 0.2811, + "step": 2240 + }, + { + "epoch": 17.57, + "learning_rate": 8.9794921875e-06, + "loss": 0.257, + "step": 2260 + }, + { + "epoch": 17.73, + "learning_rate": 8.8818359375e-06, + "loss": 0.267, + "step": 2280 + }, + { + "epoch": 17.88, + "learning_rate": 8.784179687500001e-06, + "loss": 0.3056, + "step": 2300 + }, + { + "epoch": 18.04, + "learning_rate": 8.6865234375e-06, + "loss": 0.2522, + "step": 2320 + }, + { + "epoch": 18.19, + "learning_rate": 8.5888671875e-06, + "loss": 0.2806, + "step": 2340 + }, + { + "epoch": 18.35, + "learning_rate": 8.491210937500001e-06, + "loss": 0.266, + "step": 2360 + }, + { + "epoch": 18.5, + "learning_rate": 8.3935546875e-06, + "loss": 0.288, + "step": 2380 + }, + { + "epoch": 18.66, + "learning_rate": 8.2958984375e-06, + "loss": 0.2712, + "step": 2400 + }, + { + "epoch": 18.81, + "learning_rate": 8.198242187500001e-06, + "loss": 0.3099, + "step": 2420 + }, + { + "epoch": 18.97, + "learning_rate": 8.1005859375e-06, + "loss": 0.2205, + "step": 2440 + }, + { + "epoch": 19.13, + "learning_rate": 8.002929687500001e-06, + "loss": 0.253, + "step": 2460 + }, + { + "epoch": 19.28, + "learning_rate": 7.905273437500001e-06, + "loss": 0.2885, + "step": 2480 + }, + { + "epoch": 19.44, + "learning_rate": 7.8076171875e-06, + "loss": 0.2326, + "step": 2500 + }, + { + "epoch": 19.59, + "learning_rate": 7.709960937500001e-06, + "loss": 0.255, + "step": 2520 + }, + { + "epoch": 19.75, + "learning_rate": 7.612304687500001e-06, + "loss": 0.2698, + "step": 2540 + }, + { + "epoch": 19.9, + "learning_rate": 7.5146484375000004e-06, + "loss": 0.2532, + "step": 2560 + }, + { + "epoch": 20.06, + "learning_rate": 7.4169921875e-06, + "loss": 0.2562, + "step": 2580 + }, + { + "epoch": 20.21, + "learning_rate": 7.319335937500001e-06, + "loss": 0.2717, + "step": 2600 + }, + { + "epoch": 20.37, + "learning_rate": 7.2216796875000005e-06, + "loss": 0.2652, + "step": 2620 + }, + { + "epoch": 20.52, + "learning_rate": 7.1240234375e-06, + "loss": 0.2654, + "step": 2640 + }, + { + "epoch": 20.68, + "learning_rate": 7.026367187500001e-06, + "loss": 0.2766, + "step": 2660 + }, + { + "epoch": 20.84, + "learning_rate": 6.928710937500001e-06, + "loss": 0.2246, + "step": 2680 + }, + { + "epoch": 20.99, + "learning_rate": 6.8310546875e-06, + "loss": 0.2546, + "step": 2700 + }, + { + "epoch": 21.15, + "learning_rate": 6.733398437500001e-06, + "loss": 0.2708, + "step": 2720 + }, + { + "epoch": 21.3, + "learning_rate": 6.635742187500001e-06, + "loss": 0.2385, + "step": 2740 + }, + { + "epoch": 21.46, + "learning_rate": 6.5380859375e-06, + "loss": 0.2271, + "step": 2760 + }, + { + "epoch": 21.61, + "learning_rate": 6.4404296875e-06, + "loss": 0.2874, + "step": 2780 + }, + { + "epoch": 21.77, + "learning_rate": 6.342773437500001e-06, + "loss": 0.2447, + "step": 2800 + }, + { + "epoch": 21.92, + "learning_rate": 6.2451171875000005e-06, + "loss": 0.2309, + "step": 2820 + }, + { + "epoch": 22.08, + "learning_rate": 6.1474609375e-06, + "loss": 0.2587, + "step": 2840 + }, + { + "epoch": 22.24, + "learning_rate": 6.049804687500001e-06, + "loss": 0.2303, + "step": 2860 + }, + { + "epoch": 22.39, + "learning_rate": 5.952148437500001e-06, + "loss": 0.2376, + "step": 2880 + }, + { + "epoch": 22.55, + "learning_rate": 5.8544921875e-06, + "loss": 0.2696, + "step": 2900 + }, + { + "epoch": 22.7, + "learning_rate": 5.7568359375e-06, + "loss": 0.2459, + "step": 2920 + }, + { + "epoch": 22.86, + "learning_rate": 5.659179687500001e-06, + "loss": 0.2032, + "step": 2940 + }, + { + "epoch": 23.01, + "learning_rate": 5.5615234375e-06, + "loss": 0.2491, + "step": 2960 + }, + { + "epoch": 23.17, + "learning_rate": 5.4638671875e-06, + "loss": 0.2395, + "step": 2980 + }, + { + "epoch": 23.32, + "learning_rate": 5.366210937500001e-06, + "loss": 0.2573, + "step": 3000 + } + ], + "max_steps": 4096, + "num_train_epochs": 32, + "total_flos": 3.8986916806656e+18, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-3000/training_args.bin b/checkpoint-3000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..e2b3bee7f57e1aa04d4410b1e443601a704abcd7 --- /dev/null +++ b/checkpoint-3000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:19b61f1babbbf314354bf942366de8dc63aa97ebdccb27573b76080ab5b50935 +size 3963 diff --git a/checkpoint-3500/optimizer.pt b/checkpoint-3500/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..8daeae4989e66572dd06080970d9bcd0607284fb --- /dev/null +++ b/checkpoint-3500/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d4004f9fccd306f50b7742214a698c4f5527d7017c24d5545c4d9b3db6a85977 +size 33629893 diff --git a/checkpoint-3500/pytorch_model.bin b/checkpoint-3500/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..e23e843b7da2cd3d3fb0d940335a10900d346d37 --- /dev/null +++ b/checkpoint-3500/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dee3137d50f8ccd0d3b7e6e53eba049a5066b881ce7dedb55584a335b4f505fa +size 7548185429 diff --git a/checkpoint-3500/rng_state.pth b/checkpoint-3500/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..934ff7c5b6453685fdfb6fda600e58d124ec3b78 --- /dev/null +++ b/checkpoint-3500/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b252559e55f1cebc645f9e33d786a9d51b1c14fae33d7291871ffe2763bda82e +size 14575 diff --git a/checkpoint-3500/scaler.pt b/checkpoint-3500/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..724e80c1fb569fc9360e585ddb851a577b6cddd0 --- /dev/null +++ b/checkpoint-3500/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:32a90a65317216d30151b19ae7ffa5171734912b4676ce34dfcb16507da25212 +size 557 diff --git a/checkpoint-3500/scheduler.pt b/checkpoint-3500/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..62177e9798c1ff9b3359f3196e0f5a4a47b9c6df --- /dev/null +++ b/checkpoint-3500/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7f059ff70a4abf1019a77d47012bc09d5d465a3c63eeb7b984cf7d7d16ed46e4 +size 627 diff --git a/checkpoint-3500/trainer_state.json b/checkpoint-3500/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..534adb0584ba9bd8a021e05d38639c39ae96f5a6 --- /dev/null +++ b/checkpoint-3500/trainer_state.json @@ -0,0 +1,1066 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 27.2108843537415, + "global_step": 3500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.16, + "learning_rate": 1.9912109375000002e-05, + "loss": 1.9656, + "step": 20 + }, + { + "epoch": 0.31, + "learning_rate": 1.98193359375e-05, + "loss": 1.8445, + "step": 40 + }, + { + "epoch": 0.47, + "learning_rate": 1.97216796875e-05, + "loss": 1.59, + "step": 60 + }, + { + "epoch": 0.62, + "learning_rate": 1.9624023437500002e-05, + "loss": 1.6275, + "step": 80 + }, + { + "epoch": 0.78, + "learning_rate": 1.9526367187500002e-05, + "loss": 1.473, + "step": 100 + }, + { + "epoch": 0.93, + "learning_rate": 1.9428710937500003e-05, + "loss": 1.3701, + "step": 120 + }, + { + "epoch": 1.09, + "learning_rate": 1.93310546875e-05, + "loss": 1.3224, + "step": 140 + }, + { + "epoch": 1.24, + "learning_rate": 1.92333984375e-05, + "loss": 1.1423, + "step": 160 + }, + { + "epoch": 1.4, + "learning_rate": 1.91357421875e-05, + "loss": 1.1652, + "step": 180 + }, + { + "epoch": 1.55, + "learning_rate": 1.9038085937500002e-05, + "loss": 1.1422, + "step": 200 + }, + { + "epoch": 1.71, + "learning_rate": 1.8940429687500002e-05, + "loss": 1.002, + "step": 220 + }, + { + "epoch": 1.87, + "learning_rate": 1.8842773437500003e-05, + "loss": 1.0779, + "step": 240 + }, + { + "epoch": 2.02, + "learning_rate": 1.87451171875e-05, + "loss": 0.9887, + "step": 260 + }, + { + "epoch": 2.18, + "learning_rate": 1.86474609375e-05, + "loss": 0.9543, + "step": 280 + }, + { + "epoch": 2.33, + "learning_rate": 1.85498046875e-05, + "loss": 0.9371, + "step": 300 + }, + { + "epoch": 2.49, + "learning_rate": 1.8452148437500002e-05, + "loss": 0.8701, + "step": 320 + }, + { + "epoch": 2.64, + "learning_rate": 1.8354492187500003e-05, + "loss": 0.875, + "step": 340 + }, + { + "epoch": 2.8, + "learning_rate": 1.82568359375e-05, + "loss": 0.7843, + "step": 360 + }, + { + "epoch": 2.95, + "learning_rate": 1.81591796875e-05, + "loss": 0.7945, + "step": 380 + }, + { + "epoch": 3.11, + "learning_rate": 1.80615234375e-05, + "loss": 0.799, + "step": 400 + }, + { + "epoch": 3.27, + "learning_rate": 1.79638671875e-05, + "loss": 0.7623, + "step": 420 + }, + { + "epoch": 3.42, + "learning_rate": 1.7866210937500002e-05, + "loss": 0.7263, + "step": 440 + }, + { + "epoch": 3.58, + "learning_rate": 1.7768554687500003e-05, + "loss": 0.7779, + "step": 460 + }, + { + "epoch": 3.73, + "learning_rate": 1.76708984375e-05, + "loss": 0.695, + "step": 480 + }, + { + "epoch": 3.89, + "learning_rate": 1.75732421875e-05, + "loss": 0.7344, + "step": 500 + }, + { + "epoch": 4.04, + "learning_rate": 1.74755859375e-05, + "loss": 0.695, + "step": 520 + }, + { + "epoch": 4.2, + "learning_rate": 1.7377929687500002e-05, + "loss": 0.6504, + "step": 540 + }, + { + "epoch": 4.35, + "learning_rate": 1.7280273437500002e-05, + "loss": 0.6447, + "step": 560 + }, + { + "epoch": 4.51, + "learning_rate": 1.7182617187500003e-05, + "loss": 0.6931, + "step": 580 + }, + { + "epoch": 4.66, + "learning_rate": 1.70849609375e-05, + "loss": 0.6256, + "step": 600 + }, + { + "epoch": 4.82, + "learning_rate": 1.69873046875e-05, + "loss": 0.6132, + "step": 620 + }, + { + "epoch": 4.98, + "learning_rate": 1.68896484375e-05, + "loss": 0.6001, + "step": 640 + }, + { + "epoch": 5.13, + "learning_rate": 1.6791992187500002e-05, + "loss": 0.6176, + "step": 660 + }, + { + "epoch": 5.29, + "learning_rate": 1.6694335937500002e-05, + "loss": 0.5709, + "step": 680 + }, + { + "epoch": 5.44, + "learning_rate": 1.65966796875e-05, + "loss": 0.564, + "step": 700 + }, + { + "epoch": 5.6, + "learning_rate": 1.64990234375e-05, + "loss": 0.5969, + "step": 720 + }, + { + "epoch": 5.75, + "learning_rate": 1.64013671875e-05, + "loss": 0.5484, + "step": 740 + }, + { + "epoch": 5.91, + "learning_rate": 1.63037109375e-05, + "loss": 0.5667, + "step": 760 + }, + { + "epoch": 6.06, + "learning_rate": 1.6206054687500002e-05, + "loss": 0.5442, + "step": 780 + }, + { + "epoch": 6.22, + "learning_rate": 1.6108398437500003e-05, + "loss": 0.4857, + "step": 800 + }, + { + "epoch": 6.38, + "learning_rate": 1.60107421875e-05, + "loss": 0.5225, + "step": 820 + }, + { + "epoch": 6.53, + "learning_rate": 1.59130859375e-05, + "loss": 0.5457, + "step": 840 + }, + { + "epoch": 6.69, + "learning_rate": 1.58154296875e-05, + "loss": 0.5315, + "step": 860 + }, + { + "epoch": 6.84, + "learning_rate": 1.57177734375e-05, + "loss": 0.5345, + "step": 880 + }, + { + "epoch": 7.0, + "learning_rate": 1.5620117187500002e-05, + "loss": 0.5169, + "step": 900 + }, + { + "epoch": 7.15, + "learning_rate": 1.5522460937500003e-05, + "loss": 0.5115, + "step": 920 + }, + { + "epoch": 7.31, + "learning_rate": 1.54248046875e-05, + "loss": 0.4913, + "step": 940 + }, + { + "epoch": 7.46, + "learning_rate": 1.53271484375e-05, + "loss": 0.4868, + "step": 960 + }, + { + "epoch": 7.62, + "learning_rate": 1.5229492187500001e-05, + "loss": 0.5226, + "step": 980 + }, + { + "epoch": 7.77, + "learning_rate": 1.5131835937500002e-05, + "loss": 0.4061, + "step": 1000 + }, + { + "epoch": 7.93, + "learning_rate": 1.50341796875e-05, + "loss": 0.4488, + "step": 1020 + }, + { + "epoch": 8.09, + "learning_rate": 1.4936523437500001e-05, + "loss": 0.4443, + "step": 1040 + }, + { + "epoch": 8.24, + "learning_rate": 1.4838867187500002e-05, + "loss": 0.4646, + "step": 1060 + }, + { + "epoch": 8.4, + "learning_rate": 1.47412109375e-05, + "loss": 0.4513, + "step": 1080 + }, + { + "epoch": 8.55, + "learning_rate": 1.4643554687500001e-05, + "loss": 0.4394, + "step": 1100 + }, + { + "epoch": 8.71, + "learning_rate": 1.4545898437500002e-05, + "loss": 0.4337, + "step": 1120 + }, + { + "epoch": 8.86, + "learning_rate": 1.44482421875e-05, + "loss": 0.5037, + "step": 1140 + }, + { + "epoch": 9.02, + "learning_rate": 1.4350585937500001e-05, + "loss": 0.4209, + "step": 1160 + }, + { + "epoch": 9.17, + "learning_rate": 1.42529296875e-05, + "loss": 0.4073, + "step": 1180 + }, + { + "epoch": 9.33, + "learning_rate": 1.4155273437500001e-05, + "loss": 0.3879, + "step": 1200 + }, + { + "epoch": 9.48, + "learning_rate": 1.4057617187500002e-05, + "loss": 0.4434, + "step": 1220 + }, + { + "epoch": 9.64, + "learning_rate": 1.39599609375e-05, + "loss": 0.4062, + "step": 1240 + }, + { + "epoch": 9.8, + "learning_rate": 1.3862304687500001e-05, + "loss": 0.39, + "step": 1260 + }, + { + "epoch": 9.95, + "learning_rate": 1.3764648437500002e-05, + "loss": 0.4466, + "step": 1280 + }, + { + "epoch": 10.11, + "learning_rate": 1.36669921875e-05, + "loss": 0.4208, + "step": 1300 + }, + { + "epoch": 10.26, + "learning_rate": 1.3569335937500001e-05, + "loss": 0.3662, + "step": 1320 + }, + { + "epoch": 10.42, + "learning_rate": 1.3471679687500002e-05, + "loss": 0.4049, + "step": 1340 + }, + { + "epoch": 10.57, + "learning_rate": 1.33740234375e-05, + "loss": 0.3989, + "step": 1360 + }, + { + "epoch": 10.73, + "learning_rate": 1.3276367187500001e-05, + "loss": 0.3839, + "step": 1380 + }, + { + "epoch": 10.88, + "learning_rate": 1.3178710937500002e-05, + "loss": 0.4065, + "step": 1400 + }, + { + "epoch": 11.04, + "learning_rate": 1.30810546875e-05, + "loss": 0.3877, + "step": 1420 + }, + { + "epoch": 11.2, + "learning_rate": 1.2983398437500001e-05, + "loss": 0.4048, + "step": 1440 + }, + { + "epoch": 11.35, + "learning_rate": 1.2885742187500002e-05, + "loss": 0.3715, + "step": 1460 + }, + { + "epoch": 11.51, + "learning_rate": 1.27880859375e-05, + "loss": 0.3752, + "step": 1480 + }, + { + "epoch": 11.66, + "learning_rate": 1.2690429687500001e-05, + "loss": 0.3401, + "step": 1500 + }, + { + "epoch": 11.82, + "learning_rate": 1.25927734375e-05, + "loss": 0.3545, + "step": 1520 + }, + { + "epoch": 11.97, + "learning_rate": 1.24951171875e-05, + "loss": 0.3718, + "step": 1540 + }, + { + "epoch": 12.13, + "learning_rate": 1.2397460937500001e-05, + "loss": 0.3755, + "step": 1560 + }, + { + "epoch": 12.28, + "learning_rate": 1.22998046875e-05, + "loss": 0.3865, + "step": 1580 + }, + { + "epoch": 12.44, + "learning_rate": 1.2202148437500001e-05, + "loss": 0.3237, + "step": 1600 + }, + { + "epoch": 12.59, + "learning_rate": 1.2104492187500001e-05, + "loss": 0.3702, + "step": 1620 + }, + { + "epoch": 12.75, + "learning_rate": 1.20068359375e-05, + "loss": 0.3238, + "step": 1640 + }, + { + "epoch": 12.91, + "learning_rate": 1.1909179687500001e-05, + "loss": 0.3373, + "step": 1660 + }, + { + "epoch": 13.06, + "learning_rate": 1.1811523437500002e-05, + "loss": 0.3486, + "step": 1680 + }, + { + "epoch": 13.22, + "learning_rate": 1.17138671875e-05, + "loss": 0.362, + "step": 1700 + }, + { + "epoch": 13.37, + "learning_rate": 1.1616210937500001e-05, + "loss": 0.3257, + "step": 1720 + }, + { + "epoch": 13.53, + "learning_rate": 1.1518554687500002e-05, + "loss": 0.3414, + "step": 1740 + }, + { + "epoch": 13.68, + "learning_rate": 1.14208984375e-05, + "loss": 0.3121, + "step": 1760 + }, + { + "epoch": 13.84, + "learning_rate": 1.1323242187500001e-05, + "loss": 0.3598, + "step": 1780 + }, + { + "epoch": 13.99, + "learning_rate": 1.1225585937500002e-05, + "loss": 0.2906, + "step": 1800 + }, + { + "epoch": 14.15, + "learning_rate": 1.11279296875e-05, + "loss": 0.3356, + "step": 1820 + }, + { + "epoch": 14.31, + "learning_rate": 1.1030273437500001e-05, + "loss": 0.2991, + "step": 1840 + }, + { + "epoch": 14.46, + "learning_rate": 1.09326171875e-05, + "loss": 0.2987, + "step": 1860 + }, + { + "epoch": 14.62, + "learning_rate": 1.08349609375e-05, + "loss": 0.3249, + "step": 1880 + }, + { + "epoch": 14.77, + "learning_rate": 1.0737304687500001e-05, + "loss": 0.3425, + "step": 1900 + }, + { + "epoch": 14.93, + "learning_rate": 1.06396484375e-05, + "loss": 0.3398, + "step": 1920 + }, + { + "epoch": 15.08, + "learning_rate": 1.05419921875e-05, + "loss": 0.3018, + "step": 1940 + }, + { + "epoch": 15.24, + "learning_rate": 1.0444335937500001e-05, + "loss": 0.2475, + "step": 1960 + }, + { + "epoch": 15.39, + "learning_rate": 1.03466796875e-05, + "loss": 0.3507, + "step": 1980 + }, + { + "epoch": 15.55, + "learning_rate": 1.02490234375e-05, + "loss": 0.3084, + "step": 2000 + }, + { + "epoch": 15.7, + "learning_rate": 1.0151367187500001e-05, + "loss": 0.3212, + "step": 2020 + }, + { + "epoch": 15.86, + "learning_rate": 1.00537109375e-05, + "loss": 0.2831, + "step": 2040 + }, + { + "epoch": 16.02, + "learning_rate": 9.956054687500001e-06, + "loss": 0.3072, + "step": 2060 + }, + { + "epoch": 16.17, + "learning_rate": 9.858398437500002e-06, + "loss": 0.3293, + "step": 2080 + }, + { + "epoch": 16.33, + "learning_rate": 9.7607421875e-06, + "loss": 0.2738, + "step": 2100 + }, + { + "epoch": 16.48, + "learning_rate": 9.663085937500001e-06, + "loss": 0.3245, + "step": 2120 + }, + { + "epoch": 16.64, + "learning_rate": 9.565429687500002e-06, + "loss": 0.2846, + "step": 2140 + }, + { + "epoch": 16.79, + "learning_rate": 9.4677734375e-06, + "loss": 0.2906, + "step": 2160 + }, + { + "epoch": 16.95, + "learning_rate": 9.370117187500001e-06, + "loss": 0.263, + "step": 2180 + }, + { + "epoch": 17.1, + "learning_rate": 9.2724609375e-06, + "loss": 0.286, + "step": 2200 + }, + { + "epoch": 17.26, + "learning_rate": 9.1748046875e-06, + "loss": 0.3152, + "step": 2220 + }, + { + "epoch": 17.41, + "learning_rate": 9.077148437500001e-06, + "loss": 0.2811, + "step": 2240 + }, + { + "epoch": 17.57, + "learning_rate": 8.9794921875e-06, + "loss": 0.257, + "step": 2260 + }, + { + "epoch": 17.73, + "learning_rate": 8.8818359375e-06, + "loss": 0.267, + "step": 2280 + }, + { + "epoch": 17.88, + "learning_rate": 8.784179687500001e-06, + "loss": 0.3056, + "step": 2300 + }, + { + "epoch": 18.04, + "learning_rate": 8.6865234375e-06, + "loss": 0.2522, + "step": 2320 + }, + { + "epoch": 18.19, + "learning_rate": 8.5888671875e-06, + "loss": 0.2806, + "step": 2340 + }, + { + "epoch": 18.35, + "learning_rate": 8.491210937500001e-06, + "loss": 0.266, + "step": 2360 + }, + { + "epoch": 18.5, + "learning_rate": 8.3935546875e-06, + "loss": 0.288, + "step": 2380 + }, + { + "epoch": 18.66, + "learning_rate": 8.2958984375e-06, + "loss": 0.2712, + "step": 2400 + }, + { + "epoch": 18.81, + "learning_rate": 8.198242187500001e-06, + "loss": 0.3099, + "step": 2420 + }, + { + "epoch": 18.97, + "learning_rate": 8.1005859375e-06, + "loss": 0.2205, + "step": 2440 + }, + { + "epoch": 19.13, + "learning_rate": 8.002929687500001e-06, + "loss": 0.253, + "step": 2460 + }, + { + "epoch": 19.28, + "learning_rate": 7.905273437500001e-06, + "loss": 0.2885, + "step": 2480 + }, + { + "epoch": 19.44, + "learning_rate": 7.8076171875e-06, + "loss": 0.2326, + "step": 2500 + }, + { + "epoch": 19.59, + "learning_rate": 7.709960937500001e-06, + "loss": 0.255, + "step": 2520 + }, + { + "epoch": 19.75, + "learning_rate": 7.612304687500001e-06, + "loss": 0.2698, + "step": 2540 + }, + { + "epoch": 19.9, + "learning_rate": 7.5146484375000004e-06, + "loss": 0.2532, + "step": 2560 + }, + { + "epoch": 20.06, + "learning_rate": 7.4169921875e-06, + "loss": 0.2562, + "step": 2580 + }, + { + "epoch": 20.21, + "learning_rate": 7.319335937500001e-06, + "loss": 0.2717, + "step": 2600 + }, + { + "epoch": 20.37, + "learning_rate": 7.2216796875000005e-06, + "loss": 0.2652, + "step": 2620 + }, + { + "epoch": 20.52, + "learning_rate": 7.1240234375e-06, + "loss": 0.2654, + "step": 2640 + }, + { + "epoch": 20.68, + "learning_rate": 7.026367187500001e-06, + "loss": 0.2766, + "step": 2660 + }, + { + "epoch": 20.84, + "learning_rate": 6.928710937500001e-06, + "loss": 0.2246, + "step": 2680 + }, + { + "epoch": 20.99, + "learning_rate": 6.8310546875e-06, + "loss": 0.2546, + "step": 2700 + }, + { + "epoch": 21.15, + "learning_rate": 6.733398437500001e-06, + "loss": 0.2708, + "step": 2720 + }, + { + "epoch": 21.3, + "learning_rate": 6.635742187500001e-06, + "loss": 0.2385, + "step": 2740 + }, + { + "epoch": 21.46, + "learning_rate": 6.5380859375e-06, + "loss": 0.2271, + "step": 2760 + }, + { + "epoch": 21.61, + "learning_rate": 6.4404296875e-06, + "loss": 0.2874, + "step": 2780 + }, + { + "epoch": 21.77, + "learning_rate": 6.342773437500001e-06, + "loss": 0.2447, + "step": 2800 + }, + { + "epoch": 21.92, + "learning_rate": 6.2451171875000005e-06, + "loss": 0.2309, + "step": 2820 + }, + { + "epoch": 22.08, + "learning_rate": 6.1474609375e-06, + "loss": 0.2587, + "step": 2840 + }, + { + "epoch": 22.24, + "learning_rate": 6.049804687500001e-06, + "loss": 0.2303, + "step": 2860 + }, + { + "epoch": 22.39, + "learning_rate": 5.952148437500001e-06, + "loss": 0.2376, + "step": 2880 + }, + { + "epoch": 22.55, + "learning_rate": 5.8544921875e-06, + "loss": 0.2696, + "step": 2900 + }, + { + "epoch": 22.7, + "learning_rate": 5.7568359375e-06, + "loss": 0.2459, + "step": 2920 + }, + { + "epoch": 22.86, + "learning_rate": 5.659179687500001e-06, + "loss": 0.2032, + "step": 2940 + }, + { + "epoch": 23.01, + "learning_rate": 5.5615234375e-06, + "loss": 0.2491, + "step": 2960 + }, + { + "epoch": 23.17, + "learning_rate": 5.4638671875e-06, + "loss": 0.2395, + "step": 2980 + }, + { + "epoch": 23.32, + "learning_rate": 5.366210937500001e-06, + "loss": 0.2573, + "step": 3000 + }, + { + "epoch": 23.48, + "learning_rate": 5.2685546875000005e-06, + "loss": 0.2016, + "step": 3020 + }, + { + "epoch": 23.63, + "learning_rate": 5.1708984375e-06, + "loss": 0.2486, + "step": 3040 + }, + { + "epoch": 23.79, + "learning_rate": 5.073242187500001e-06, + "loss": 0.2209, + "step": 3060 + }, + { + "epoch": 23.95, + "learning_rate": 4.9755859375000006e-06, + "loss": 0.2373, + "step": 3080 + }, + { + "epoch": 24.1, + "learning_rate": 4.8779296875e-06, + "loss": 0.2433, + "step": 3100 + }, + { + "epoch": 24.26, + "learning_rate": 4.7802734375e-06, + "loss": 0.1869, + "step": 3120 + }, + { + "epoch": 24.41, + "learning_rate": 4.682617187500001e-06, + "loss": 0.2299, + "step": 3140 + }, + { + "epoch": 24.57, + "learning_rate": 4.5849609375e-06, + "loss": 0.2556, + "step": 3160 + }, + { + "epoch": 24.72, + "learning_rate": 4.4873046875e-06, + "loss": 0.2569, + "step": 3180 + }, + { + "epoch": 24.88, + "learning_rate": 4.389648437500001e-06, + "loss": 0.2298, + "step": 3200 + }, + { + "epoch": 25.03, + "learning_rate": 4.2919921875000005e-06, + "loss": 0.2368, + "step": 3220 + }, + { + "epoch": 25.19, + "learning_rate": 4.1943359375e-06, + "loss": 0.1928, + "step": 3240 + }, + { + "epoch": 25.34, + "learning_rate": 4.0966796875e-06, + "loss": 0.2253, + "step": 3260 + }, + { + "epoch": 25.5, + "learning_rate": 3.9990234375000005e-06, + "loss": 0.2369, + "step": 3280 + }, + { + "epoch": 25.66, + "learning_rate": 3.9013671875e-06, + "loss": 0.2117, + "step": 3300 + }, + { + "epoch": 25.81, + "learning_rate": 3.8037109375000004e-06, + "loss": 0.2407, + "step": 3320 + }, + { + "epoch": 25.97, + "learning_rate": 3.7060546875e-06, + "loss": 0.2451, + "step": 3340 + }, + { + "epoch": 26.12, + "learning_rate": 3.6083984375000004e-06, + "loss": 0.235, + "step": 3360 + }, + { + "epoch": 26.28, + "learning_rate": 3.5107421875e-06, + "loss": 0.2446, + "step": 3380 + }, + { + "epoch": 26.43, + "learning_rate": 3.4130859375000003e-06, + "loss": 0.1815, + "step": 3400 + }, + { + "epoch": 26.59, + "learning_rate": 3.3154296875000004e-06, + "loss": 0.2283, + "step": 3420 + }, + { + "epoch": 26.74, + "learning_rate": 3.2177734375e-06, + "loss": 0.2136, + "step": 3440 + }, + { + "epoch": 26.9, + "learning_rate": 3.1201171875000003e-06, + "loss": 0.2438, + "step": 3460 + }, + { + "epoch": 27.06, + "learning_rate": 3.0224609375e-06, + "loss": 0.2097, + "step": 3480 + }, + { + "epoch": 27.21, + "learning_rate": 2.9248046875000003e-06, + "loss": 0.2361, + "step": 3500 + } + ], + "max_steps": 4096, + "num_train_epochs": 32, + "total_flos": 4.5484736274432e+18, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-3500/training_args.bin b/checkpoint-3500/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..e2b3bee7f57e1aa04d4410b1e443601a704abcd7 --- /dev/null +++ b/checkpoint-3500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:19b61f1babbbf314354bf942366de8dc63aa97ebdccb27573b76080ab5b50935 +size 3963 diff --git a/checkpoint-4000/optimizer.pt b/checkpoint-4000/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..e4b77e7bff623e1d5186234acddca5b2b6017735 --- /dev/null +++ b/checkpoint-4000/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5a15a4ebf3c224f4983f72caa0e87190a30aa620985e31ed1520e454a4150d9b +size 33629893 diff --git a/checkpoint-4000/pytorch_model.bin b/checkpoint-4000/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..c5395779ef517e8cd45f5b1270dd06926808a3b5 --- /dev/null +++ b/checkpoint-4000/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:720b9d77e0ce851d217c7a957a6f612f365e218c624e061d9ea510dc8315b3f7 +size 7548185429 diff --git a/checkpoint-4000/rng_state.pth b/checkpoint-4000/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..e9ffd4ab92b1050a6adf6a10b8ce7ec5babb1677 --- /dev/null +++ b/checkpoint-4000/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3bd42de1b0b08a74a025ad71c27122cfef9f6672e215dce9e7f12d408f8f11f1 +size 14575 diff --git a/checkpoint-4000/scaler.pt b/checkpoint-4000/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..47e308fe1f5b2e80422aeb33b55cab510d21be0d --- /dev/null +++ b/checkpoint-4000/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6ee8f29e0fd7b704c75c284f9871facc4dbb4d208c091248f943fdf0558bb403 +size 557 diff --git a/checkpoint-4000/scheduler.pt b/checkpoint-4000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..ff6a3e66a3903319c80721c623139acf5c215ec6 --- /dev/null +++ b/checkpoint-4000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c3c19d74521b72228fcdac77949603cb2eeeff72403dc8a36c02e5088c9d2841 +size 627 diff --git a/checkpoint-4000/trainer_state.json b/checkpoint-4000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..09fea44255ada30f0073963af8309406342db0a1 --- /dev/null +++ b/checkpoint-4000/trainer_state.json @@ -0,0 +1,1216 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 31.09815354713314, + "global_step": 4000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.16, + "learning_rate": 1.9912109375000002e-05, + "loss": 1.9656, + "step": 20 + }, + { + "epoch": 0.31, + "learning_rate": 1.98193359375e-05, + "loss": 1.8445, + "step": 40 + }, + { + "epoch": 0.47, + "learning_rate": 1.97216796875e-05, + "loss": 1.59, + "step": 60 + }, + { + "epoch": 0.62, + "learning_rate": 1.9624023437500002e-05, + "loss": 1.6275, + "step": 80 + }, + { + "epoch": 0.78, + "learning_rate": 1.9526367187500002e-05, + "loss": 1.473, + "step": 100 + }, + { + "epoch": 0.93, + "learning_rate": 1.9428710937500003e-05, + "loss": 1.3701, + "step": 120 + }, + { + "epoch": 1.09, + "learning_rate": 1.93310546875e-05, + "loss": 1.3224, + "step": 140 + }, + { + "epoch": 1.24, + "learning_rate": 1.92333984375e-05, + "loss": 1.1423, + "step": 160 + }, + { + "epoch": 1.4, + "learning_rate": 1.91357421875e-05, + "loss": 1.1652, + "step": 180 + }, + { + "epoch": 1.55, + "learning_rate": 1.9038085937500002e-05, + "loss": 1.1422, + "step": 200 + }, + { + "epoch": 1.71, + "learning_rate": 1.8940429687500002e-05, + "loss": 1.002, + "step": 220 + }, + { + "epoch": 1.87, + "learning_rate": 1.8842773437500003e-05, + "loss": 1.0779, + "step": 240 + }, + { + "epoch": 2.02, + "learning_rate": 1.87451171875e-05, + "loss": 0.9887, + "step": 260 + }, + { + "epoch": 2.18, + "learning_rate": 1.86474609375e-05, + "loss": 0.9543, + "step": 280 + }, + { + "epoch": 2.33, + "learning_rate": 1.85498046875e-05, + "loss": 0.9371, + "step": 300 + }, + { + "epoch": 2.49, + "learning_rate": 1.8452148437500002e-05, + "loss": 0.8701, + "step": 320 + }, + { + "epoch": 2.64, + "learning_rate": 1.8354492187500003e-05, + "loss": 0.875, + "step": 340 + }, + { + "epoch": 2.8, + "learning_rate": 1.82568359375e-05, + "loss": 0.7843, + "step": 360 + }, + { + "epoch": 2.95, + "learning_rate": 1.81591796875e-05, + "loss": 0.7945, + "step": 380 + }, + { + "epoch": 3.11, + "learning_rate": 1.80615234375e-05, + "loss": 0.799, + "step": 400 + }, + { + "epoch": 3.27, + "learning_rate": 1.79638671875e-05, + "loss": 0.7623, + "step": 420 + }, + { + "epoch": 3.42, + "learning_rate": 1.7866210937500002e-05, + "loss": 0.7263, + "step": 440 + }, + { + "epoch": 3.58, + "learning_rate": 1.7768554687500003e-05, + "loss": 0.7779, + "step": 460 + }, + { + "epoch": 3.73, + "learning_rate": 1.76708984375e-05, + "loss": 0.695, + "step": 480 + }, + { + "epoch": 3.89, + "learning_rate": 1.75732421875e-05, + "loss": 0.7344, + "step": 500 + }, + { + "epoch": 4.04, + "learning_rate": 1.74755859375e-05, + "loss": 0.695, + "step": 520 + }, + { + "epoch": 4.2, + "learning_rate": 1.7377929687500002e-05, + "loss": 0.6504, + "step": 540 + }, + { + "epoch": 4.35, + "learning_rate": 1.7280273437500002e-05, + "loss": 0.6447, + "step": 560 + }, + { + "epoch": 4.51, + "learning_rate": 1.7182617187500003e-05, + "loss": 0.6931, + "step": 580 + }, + { + "epoch": 4.66, + "learning_rate": 1.70849609375e-05, + "loss": 0.6256, + "step": 600 + }, + { + "epoch": 4.82, + "learning_rate": 1.69873046875e-05, + "loss": 0.6132, + "step": 620 + }, + { + "epoch": 4.98, + "learning_rate": 1.68896484375e-05, + "loss": 0.6001, + "step": 640 + }, + { + "epoch": 5.13, + "learning_rate": 1.6791992187500002e-05, + "loss": 0.6176, + "step": 660 + }, + { + "epoch": 5.29, + "learning_rate": 1.6694335937500002e-05, + "loss": 0.5709, + "step": 680 + }, + { + "epoch": 5.44, + "learning_rate": 1.65966796875e-05, + "loss": 0.564, + "step": 700 + }, + { + "epoch": 5.6, + "learning_rate": 1.64990234375e-05, + "loss": 0.5969, + "step": 720 + }, + { + "epoch": 5.75, + "learning_rate": 1.64013671875e-05, + "loss": 0.5484, + "step": 740 + }, + { + "epoch": 5.91, + "learning_rate": 1.63037109375e-05, + "loss": 0.5667, + "step": 760 + }, + { + "epoch": 6.06, + "learning_rate": 1.6206054687500002e-05, + "loss": 0.5442, + "step": 780 + }, + { + "epoch": 6.22, + "learning_rate": 1.6108398437500003e-05, + "loss": 0.4857, + "step": 800 + }, + { + "epoch": 6.38, + "learning_rate": 1.60107421875e-05, + "loss": 0.5225, + "step": 820 + }, + { + "epoch": 6.53, + "learning_rate": 1.59130859375e-05, + "loss": 0.5457, + "step": 840 + }, + { + "epoch": 6.69, + "learning_rate": 1.58154296875e-05, + "loss": 0.5315, + "step": 860 + }, + { + "epoch": 6.84, + "learning_rate": 1.57177734375e-05, + "loss": 0.5345, + "step": 880 + }, + { + "epoch": 7.0, + "learning_rate": 1.5620117187500002e-05, + "loss": 0.5169, + "step": 900 + }, + { + "epoch": 7.15, + "learning_rate": 1.5522460937500003e-05, + "loss": 0.5115, + "step": 920 + }, + { + "epoch": 7.31, + "learning_rate": 1.54248046875e-05, + "loss": 0.4913, + "step": 940 + }, + { + "epoch": 7.46, + "learning_rate": 1.53271484375e-05, + "loss": 0.4868, + "step": 960 + }, + { + "epoch": 7.62, + "learning_rate": 1.5229492187500001e-05, + "loss": 0.5226, + "step": 980 + }, + { + "epoch": 7.77, + "learning_rate": 1.5131835937500002e-05, + "loss": 0.4061, + "step": 1000 + }, + { + "epoch": 7.93, + "learning_rate": 1.50341796875e-05, + "loss": 0.4488, + "step": 1020 + }, + { + "epoch": 8.09, + "learning_rate": 1.4936523437500001e-05, + "loss": 0.4443, + "step": 1040 + }, + { + "epoch": 8.24, + "learning_rate": 1.4838867187500002e-05, + "loss": 0.4646, + "step": 1060 + }, + { + "epoch": 8.4, + "learning_rate": 1.47412109375e-05, + "loss": 0.4513, + "step": 1080 + }, + { + "epoch": 8.55, + "learning_rate": 1.4643554687500001e-05, + "loss": 0.4394, + "step": 1100 + }, + { + "epoch": 8.71, + "learning_rate": 1.4545898437500002e-05, + "loss": 0.4337, + "step": 1120 + }, + { + "epoch": 8.86, + "learning_rate": 1.44482421875e-05, + "loss": 0.5037, + "step": 1140 + }, + { + "epoch": 9.02, + "learning_rate": 1.4350585937500001e-05, + "loss": 0.4209, + "step": 1160 + }, + { + "epoch": 9.17, + "learning_rate": 1.42529296875e-05, + "loss": 0.4073, + "step": 1180 + }, + { + "epoch": 9.33, + "learning_rate": 1.4155273437500001e-05, + "loss": 0.3879, + "step": 1200 + }, + { + "epoch": 9.48, + "learning_rate": 1.4057617187500002e-05, + "loss": 0.4434, + "step": 1220 + }, + { + "epoch": 9.64, + "learning_rate": 1.39599609375e-05, + "loss": 0.4062, + "step": 1240 + }, + { + "epoch": 9.8, + "learning_rate": 1.3862304687500001e-05, + "loss": 0.39, + "step": 1260 + }, + { + "epoch": 9.95, + "learning_rate": 1.3764648437500002e-05, + "loss": 0.4466, + "step": 1280 + }, + { + "epoch": 10.11, + "learning_rate": 1.36669921875e-05, + "loss": 0.4208, + "step": 1300 + }, + { + "epoch": 10.26, + "learning_rate": 1.3569335937500001e-05, + "loss": 0.3662, + "step": 1320 + }, + { + "epoch": 10.42, + "learning_rate": 1.3471679687500002e-05, + "loss": 0.4049, + "step": 1340 + }, + { + "epoch": 10.57, + "learning_rate": 1.33740234375e-05, + "loss": 0.3989, + "step": 1360 + }, + { + "epoch": 10.73, + "learning_rate": 1.3276367187500001e-05, + "loss": 0.3839, + "step": 1380 + }, + { + "epoch": 10.88, + "learning_rate": 1.3178710937500002e-05, + "loss": 0.4065, + "step": 1400 + }, + { + "epoch": 11.04, + "learning_rate": 1.30810546875e-05, + "loss": 0.3877, + "step": 1420 + }, + { + "epoch": 11.2, + "learning_rate": 1.2983398437500001e-05, + "loss": 0.4048, + "step": 1440 + }, + { + "epoch": 11.35, + "learning_rate": 1.2885742187500002e-05, + "loss": 0.3715, + "step": 1460 + }, + { + "epoch": 11.51, + "learning_rate": 1.27880859375e-05, + "loss": 0.3752, + "step": 1480 + }, + { + "epoch": 11.66, + "learning_rate": 1.2690429687500001e-05, + "loss": 0.3401, + "step": 1500 + }, + { + "epoch": 11.82, + "learning_rate": 1.25927734375e-05, + "loss": 0.3545, + "step": 1520 + }, + { + "epoch": 11.97, + "learning_rate": 1.24951171875e-05, + "loss": 0.3718, + "step": 1540 + }, + { + "epoch": 12.13, + "learning_rate": 1.2397460937500001e-05, + "loss": 0.3755, + "step": 1560 + }, + { + "epoch": 12.28, + "learning_rate": 1.22998046875e-05, + "loss": 0.3865, + "step": 1580 + }, + { + "epoch": 12.44, + "learning_rate": 1.2202148437500001e-05, + "loss": 0.3237, + "step": 1600 + }, + { + "epoch": 12.59, + "learning_rate": 1.2104492187500001e-05, + "loss": 0.3702, + "step": 1620 + }, + { + "epoch": 12.75, + "learning_rate": 1.20068359375e-05, + "loss": 0.3238, + "step": 1640 + }, + { + "epoch": 12.91, + "learning_rate": 1.1909179687500001e-05, + "loss": 0.3373, + "step": 1660 + }, + { + "epoch": 13.06, + "learning_rate": 1.1811523437500002e-05, + "loss": 0.3486, + "step": 1680 + }, + { + "epoch": 13.22, + "learning_rate": 1.17138671875e-05, + "loss": 0.362, + "step": 1700 + }, + { + "epoch": 13.37, + "learning_rate": 1.1616210937500001e-05, + "loss": 0.3257, + "step": 1720 + }, + { + "epoch": 13.53, + "learning_rate": 1.1518554687500002e-05, + "loss": 0.3414, + "step": 1740 + }, + { + "epoch": 13.68, + "learning_rate": 1.14208984375e-05, + "loss": 0.3121, + "step": 1760 + }, + { + "epoch": 13.84, + "learning_rate": 1.1323242187500001e-05, + "loss": 0.3598, + "step": 1780 + }, + { + "epoch": 13.99, + "learning_rate": 1.1225585937500002e-05, + "loss": 0.2906, + "step": 1800 + }, + { + "epoch": 14.15, + "learning_rate": 1.11279296875e-05, + "loss": 0.3356, + "step": 1820 + }, + { + "epoch": 14.31, + "learning_rate": 1.1030273437500001e-05, + "loss": 0.2991, + "step": 1840 + }, + { + "epoch": 14.46, + "learning_rate": 1.09326171875e-05, + "loss": 0.2987, + "step": 1860 + }, + { + "epoch": 14.62, + "learning_rate": 1.08349609375e-05, + "loss": 0.3249, + "step": 1880 + }, + { + "epoch": 14.77, + "learning_rate": 1.0737304687500001e-05, + "loss": 0.3425, + "step": 1900 + }, + { + "epoch": 14.93, + "learning_rate": 1.06396484375e-05, + "loss": 0.3398, + "step": 1920 + }, + { + "epoch": 15.08, + "learning_rate": 1.05419921875e-05, + "loss": 0.3018, + "step": 1940 + }, + { + "epoch": 15.24, + "learning_rate": 1.0444335937500001e-05, + "loss": 0.2475, + "step": 1960 + }, + { + "epoch": 15.39, + "learning_rate": 1.03466796875e-05, + "loss": 0.3507, + "step": 1980 + }, + { + "epoch": 15.55, + "learning_rate": 1.02490234375e-05, + "loss": 0.3084, + "step": 2000 + }, + { + "epoch": 15.7, + "learning_rate": 1.0151367187500001e-05, + "loss": 0.3212, + "step": 2020 + }, + { + "epoch": 15.86, + "learning_rate": 1.00537109375e-05, + "loss": 0.2831, + "step": 2040 + }, + { + "epoch": 16.02, + "learning_rate": 9.956054687500001e-06, + "loss": 0.3072, + "step": 2060 + }, + { + "epoch": 16.17, + "learning_rate": 9.858398437500002e-06, + "loss": 0.3293, + "step": 2080 + }, + { + "epoch": 16.33, + "learning_rate": 9.7607421875e-06, + "loss": 0.2738, + "step": 2100 + }, + { + "epoch": 16.48, + "learning_rate": 9.663085937500001e-06, + "loss": 0.3245, + "step": 2120 + }, + { + "epoch": 16.64, + "learning_rate": 9.565429687500002e-06, + "loss": 0.2846, + "step": 2140 + }, + { + "epoch": 16.79, + "learning_rate": 9.4677734375e-06, + "loss": 0.2906, + "step": 2160 + }, + { + "epoch": 16.95, + "learning_rate": 9.370117187500001e-06, + "loss": 0.263, + "step": 2180 + }, + { + "epoch": 17.1, + "learning_rate": 9.2724609375e-06, + "loss": 0.286, + "step": 2200 + }, + { + "epoch": 17.26, + "learning_rate": 9.1748046875e-06, + "loss": 0.3152, + "step": 2220 + }, + { + "epoch": 17.41, + "learning_rate": 9.077148437500001e-06, + "loss": 0.2811, + "step": 2240 + }, + { + "epoch": 17.57, + "learning_rate": 8.9794921875e-06, + "loss": 0.257, + "step": 2260 + }, + { + "epoch": 17.73, + "learning_rate": 8.8818359375e-06, + "loss": 0.267, + "step": 2280 + }, + { + "epoch": 17.88, + "learning_rate": 8.784179687500001e-06, + "loss": 0.3056, + "step": 2300 + }, + { + "epoch": 18.04, + "learning_rate": 8.6865234375e-06, + "loss": 0.2522, + "step": 2320 + }, + { + "epoch": 18.19, + "learning_rate": 8.5888671875e-06, + "loss": 0.2806, + "step": 2340 + }, + { + "epoch": 18.35, + "learning_rate": 8.491210937500001e-06, + "loss": 0.266, + "step": 2360 + }, + { + "epoch": 18.5, + "learning_rate": 8.3935546875e-06, + "loss": 0.288, + "step": 2380 + }, + { + "epoch": 18.66, + "learning_rate": 8.2958984375e-06, + "loss": 0.2712, + "step": 2400 + }, + { + "epoch": 18.81, + "learning_rate": 8.198242187500001e-06, + "loss": 0.3099, + "step": 2420 + }, + { + "epoch": 18.97, + "learning_rate": 8.1005859375e-06, + "loss": 0.2205, + "step": 2440 + }, + { + "epoch": 19.13, + "learning_rate": 8.002929687500001e-06, + "loss": 0.253, + "step": 2460 + }, + { + "epoch": 19.28, + "learning_rate": 7.905273437500001e-06, + "loss": 0.2885, + "step": 2480 + }, + { + "epoch": 19.44, + "learning_rate": 7.8076171875e-06, + "loss": 0.2326, + "step": 2500 + }, + { + "epoch": 19.59, + "learning_rate": 7.709960937500001e-06, + "loss": 0.255, + "step": 2520 + }, + { + "epoch": 19.75, + "learning_rate": 7.612304687500001e-06, + "loss": 0.2698, + "step": 2540 + }, + { + "epoch": 19.9, + "learning_rate": 7.5146484375000004e-06, + "loss": 0.2532, + "step": 2560 + }, + { + "epoch": 20.06, + "learning_rate": 7.4169921875e-06, + "loss": 0.2562, + "step": 2580 + }, + { + "epoch": 20.21, + "learning_rate": 7.319335937500001e-06, + "loss": 0.2717, + "step": 2600 + }, + { + "epoch": 20.37, + "learning_rate": 7.2216796875000005e-06, + "loss": 0.2652, + "step": 2620 + }, + { + "epoch": 20.52, + "learning_rate": 7.1240234375e-06, + "loss": 0.2654, + "step": 2640 + }, + { + "epoch": 20.68, + "learning_rate": 7.026367187500001e-06, + "loss": 0.2766, + "step": 2660 + }, + { + "epoch": 20.84, + "learning_rate": 6.928710937500001e-06, + "loss": 0.2246, + "step": 2680 + }, + { + "epoch": 20.99, + "learning_rate": 6.8310546875e-06, + "loss": 0.2546, + "step": 2700 + }, + { + "epoch": 21.15, + "learning_rate": 6.733398437500001e-06, + "loss": 0.2708, + "step": 2720 + }, + { + "epoch": 21.3, + "learning_rate": 6.635742187500001e-06, + "loss": 0.2385, + "step": 2740 + }, + { + "epoch": 21.46, + "learning_rate": 6.5380859375e-06, + "loss": 0.2271, + "step": 2760 + }, + { + "epoch": 21.61, + "learning_rate": 6.4404296875e-06, + "loss": 0.2874, + "step": 2780 + }, + { + "epoch": 21.77, + "learning_rate": 6.342773437500001e-06, + "loss": 0.2447, + "step": 2800 + }, + { + "epoch": 21.92, + "learning_rate": 6.2451171875000005e-06, + "loss": 0.2309, + "step": 2820 + }, + { + "epoch": 22.08, + "learning_rate": 6.1474609375e-06, + "loss": 0.2587, + "step": 2840 + }, + { + "epoch": 22.24, + "learning_rate": 6.049804687500001e-06, + "loss": 0.2303, + "step": 2860 + }, + { + "epoch": 22.39, + "learning_rate": 5.952148437500001e-06, + "loss": 0.2376, + "step": 2880 + }, + { + "epoch": 22.55, + "learning_rate": 5.8544921875e-06, + "loss": 0.2696, + "step": 2900 + }, + { + "epoch": 22.7, + "learning_rate": 5.7568359375e-06, + "loss": 0.2459, + "step": 2920 + }, + { + "epoch": 22.86, + "learning_rate": 5.659179687500001e-06, + "loss": 0.2032, + "step": 2940 + }, + { + "epoch": 23.01, + "learning_rate": 5.5615234375e-06, + "loss": 0.2491, + "step": 2960 + }, + { + "epoch": 23.17, + "learning_rate": 5.4638671875e-06, + "loss": 0.2395, + "step": 2980 + }, + { + "epoch": 23.32, + "learning_rate": 5.366210937500001e-06, + "loss": 0.2573, + "step": 3000 + }, + { + "epoch": 23.48, + "learning_rate": 5.2685546875000005e-06, + "loss": 0.2016, + "step": 3020 + }, + { + "epoch": 23.63, + "learning_rate": 5.1708984375e-06, + "loss": 0.2486, + "step": 3040 + }, + { + "epoch": 23.79, + "learning_rate": 5.073242187500001e-06, + "loss": 0.2209, + "step": 3060 + }, + { + "epoch": 23.95, + "learning_rate": 4.9755859375000006e-06, + "loss": 0.2373, + "step": 3080 + }, + { + "epoch": 24.1, + "learning_rate": 4.8779296875e-06, + "loss": 0.2433, + "step": 3100 + }, + { + "epoch": 24.26, + "learning_rate": 4.7802734375e-06, + "loss": 0.1869, + "step": 3120 + }, + { + "epoch": 24.41, + "learning_rate": 4.682617187500001e-06, + "loss": 0.2299, + "step": 3140 + }, + { + "epoch": 24.57, + "learning_rate": 4.5849609375e-06, + "loss": 0.2556, + "step": 3160 + }, + { + "epoch": 24.72, + "learning_rate": 4.4873046875e-06, + "loss": 0.2569, + "step": 3180 + }, + { + "epoch": 24.88, + "learning_rate": 4.389648437500001e-06, + "loss": 0.2298, + "step": 3200 + }, + { + "epoch": 25.03, + "learning_rate": 4.2919921875000005e-06, + "loss": 0.2368, + "step": 3220 + }, + { + "epoch": 25.19, + "learning_rate": 4.1943359375e-06, + "loss": 0.1928, + "step": 3240 + }, + { + "epoch": 25.34, + "learning_rate": 4.0966796875e-06, + "loss": 0.2253, + "step": 3260 + }, + { + "epoch": 25.5, + "learning_rate": 3.9990234375000005e-06, + "loss": 0.2369, + "step": 3280 + }, + { + "epoch": 25.66, + "learning_rate": 3.9013671875e-06, + "loss": 0.2117, + "step": 3300 + }, + { + "epoch": 25.81, + "learning_rate": 3.8037109375000004e-06, + "loss": 0.2407, + "step": 3320 + }, + { + "epoch": 25.97, + "learning_rate": 3.7060546875e-06, + "loss": 0.2451, + "step": 3340 + }, + { + "epoch": 26.12, + "learning_rate": 3.6083984375000004e-06, + "loss": 0.235, + "step": 3360 + }, + { + "epoch": 26.28, + "learning_rate": 3.5107421875e-06, + "loss": 0.2446, + "step": 3380 + }, + { + "epoch": 26.43, + "learning_rate": 3.4130859375000003e-06, + "loss": 0.1815, + "step": 3400 + }, + { + "epoch": 26.59, + "learning_rate": 3.3154296875000004e-06, + "loss": 0.2283, + "step": 3420 + }, + { + "epoch": 26.74, + "learning_rate": 3.2177734375e-06, + "loss": 0.2136, + "step": 3440 + }, + { + "epoch": 26.9, + "learning_rate": 3.1201171875000003e-06, + "loss": 0.2438, + "step": 3460 + }, + { + "epoch": 27.06, + "learning_rate": 3.0224609375e-06, + "loss": 0.2097, + "step": 3480 + }, + { + "epoch": 27.21, + "learning_rate": 2.9248046875000003e-06, + "loss": 0.2361, + "step": 3500 + }, + { + "epoch": 27.37, + "learning_rate": 2.8271484375000004e-06, + "loss": 0.2221, + "step": 3520 + }, + { + "epoch": 27.52, + "learning_rate": 2.7294921875e-06, + "loss": 0.2324, + "step": 3540 + }, + { + "epoch": 27.68, + "learning_rate": 2.6318359375000003e-06, + "loss": 0.231, + "step": 3560 + }, + { + "epoch": 27.83, + "learning_rate": 2.5341796875e-06, + "loss": 0.1879, + "step": 3580 + }, + { + "epoch": 27.99, + "learning_rate": 2.4365234375000002e-06, + "loss": 0.2024, + "step": 3600 + }, + { + "epoch": 28.14, + "learning_rate": 2.3388671875e-06, + "loss": 0.2394, + "step": 3620 + }, + { + "epoch": 28.3, + "learning_rate": 2.2412109375e-06, + "loss": 0.2174, + "step": 3640 + }, + { + "epoch": 28.45, + "learning_rate": 2.1435546875000003e-06, + "loss": 0.2288, + "step": 3660 + }, + { + "epoch": 28.61, + "learning_rate": 2.0458984375e-06, + "loss": 0.212, + "step": 3680 + }, + { + "epoch": 28.77, + "learning_rate": 1.9482421875000002e-06, + "loss": 0.2082, + "step": 3700 + }, + { + "epoch": 28.92, + "learning_rate": 1.8505859375000002e-06, + "loss": 0.1976, + "step": 3720 + }, + { + "epoch": 29.08, + "learning_rate": 1.7529296875000002e-06, + "loss": 0.2332, + "step": 3740 + }, + { + "epoch": 29.23, + "learning_rate": 1.6552734375000001e-06, + "loss": 0.2003, + "step": 3760 + }, + { + "epoch": 29.39, + "learning_rate": 1.5576171875e-06, + "loss": 0.1978, + "step": 3780 + }, + { + "epoch": 29.54, + "learning_rate": 1.4599609375e-06, + "loss": 0.2194, + "step": 3800 + }, + { + "epoch": 29.7, + "learning_rate": 1.3623046875000002e-06, + "loss": 0.231, + "step": 3820 + }, + { + "epoch": 29.85, + "learning_rate": 1.2646484375000001e-06, + "loss": 0.2196, + "step": 3840 + }, + { + "epoch": 30.01, + "learning_rate": 1.1669921875e-06, + "loss": 0.2203, + "step": 3860 + }, + { + "epoch": 30.17, + "learning_rate": 1.0693359375e-06, + "loss": 0.206, + "step": 3880 + }, + { + "epoch": 30.32, + "learning_rate": 9.716796875e-07, + "loss": 0.2211, + "step": 3900 + }, + { + "epoch": 30.48, + "learning_rate": 8.740234375000001e-07, + "loss": 0.2129, + "step": 3920 + }, + { + "epoch": 30.63, + "learning_rate": 7.763671875e-07, + "loss": 0.2085, + "step": 3940 + }, + { + "epoch": 30.79, + "learning_rate": 6.787109375000001e-07, + "loss": 0.1905, + "step": 3960 + }, + { + "epoch": 30.94, + "learning_rate": 5.810546875e-07, + "loss": 0.2433, + "step": 3980 + }, + { + "epoch": 31.1, + "learning_rate": 4.833984375e-07, + "loss": 0.1829, + "step": 4000 + } + ], + "max_steps": 4096, + "num_train_epochs": 32, + "total_flos": 5.1982555742208e+18, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-4000/training_args.bin b/checkpoint-4000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..e2b3bee7f57e1aa04d4410b1e443601a704abcd7 --- /dev/null +++ b/checkpoint-4000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:19b61f1babbbf314354bf942366de8dc63aa97ebdccb27573b76080ab5b50935 +size 3963 diff --git a/checkpoint-500/optimizer.pt b/checkpoint-500/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..3859416ef124597b0cddba22bb787ed8edd33aa7 --- /dev/null +++ b/checkpoint-500/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2306f3ae56404b7fdcccb9752ed3f0e6327c2a1a080ec1a82e4ff8d851f1976d +size 33629893 diff --git a/checkpoint-500/pytorch_model.bin b/checkpoint-500/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..8181ebef5a4340792b72fe0c487dd71346003eb8 --- /dev/null +++ b/checkpoint-500/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8c6bfb40749ec1a829821de7468d60870345a16532b3771ea6c0f005fb967d9c +size 7548185429 diff --git a/checkpoint-500/rng_state.pth b/checkpoint-500/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..97fa4bb4b1d1026e8bd52378d3d49742f5544cc5 --- /dev/null +++ b/checkpoint-500/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ead6ca894afe28b0731c7890485799191601a32cb32f0598efa1debb98849d07 +size 14575 diff --git a/checkpoint-500/scaler.pt b/checkpoint-500/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..59578bb2e92b693421810bc18e17c4f08d99245c --- /dev/null +++ b/checkpoint-500/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fc6e246c36e7291b4e14531e35fd0beee55254292ddbe1e2c5ed4e1f9d959050 +size 557 diff --git a/checkpoint-500/scheduler.pt b/checkpoint-500/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..bfa828b950dfcf1ae0f04396a5d241e52fa1eea9 --- /dev/null +++ b/checkpoint-500/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a8adf462220fc877b4fa438031fda119776a4a0595d1fb60b7b3ae82b7772d5c +size 627 diff --git a/checkpoint-500/trainer_state.json b/checkpoint-500/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..fb684d5a7b44f6501fc63b28201fb745fa4f8822 --- /dev/null +++ b/checkpoint-500/trainer_state.json @@ -0,0 +1,166 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.887269193391642, + "global_step": 500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.16, + "learning_rate": 1.9912109375000002e-05, + "loss": 1.9656, + "step": 20 + }, + { + "epoch": 0.31, + "learning_rate": 1.98193359375e-05, + "loss": 1.8445, + "step": 40 + }, + { + "epoch": 0.47, + "learning_rate": 1.97216796875e-05, + "loss": 1.59, + "step": 60 + }, + { + "epoch": 0.62, + "learning_rate": 1.9624023437500002e-05, + "loss": 1.6275, + "step": 80 + }, + { + "epoch": 0.78, + "learning_rate": 1.9526367187500002e-05, + "loss": 1.473, + "step": 100 + }, + { + "epoch": 0.93, + "learning_rate": 1.9428710937500003e-05, + "loss": 1.3701, + "step": 120 + }, + { + "epoch": 1.09, + "learning_rate": 1.93310546875e-05, + "loss": 1.3224, + "step": 140 + }, + { + "epoch": 1.24, + "learning_rate": 1.92333984375e-05, + "loss": 1.1423, + "step": 160 + }, + { + "epoch": 1.4, + "learning_rate": 1.91357421875e-05, + "loss": 1.1652, + "step": 180 + }, + { + "epoch": 1.55, + "learning_rate": 1.9038085937500002e-05, + "loss": 1.1422, + "step": 200 + }, + { + "epoch": 1.71, + "learning_rate": 1.8940429687500002e-05, + "loss": 1.002, + "step": 220 + }, + { + "epoch": 1.87, + "learning_rate": 1.8842773437500003e-05, + "loss": 1.0779, + "step": 240 + }, + { + "epoch": 2.02, + "learning_rate": 1.87451171875e-05, + "loss": 0.9887, + "step": 260 + }, + { + "epoch": 2.18, + "learning_rate": 1.86474609375e-05, + "loss": 0.9543, + "step": 280 + }, + { + "epoch": 2.33, + "learning_rate": 1.85498046875e-05, + "loss": 0.9371, + "step": 300 + }, + { + "epoch": 2.49, + "learning_rate": 1.8452148437500002e-05, + "loss": 0.8701, + "step": 320 + }, + { + "epoch": 2.64, + "learning_rate": 1.8354492187500003e-05, + "loss": 0.875, + "step": 340 + }, + { + "epoch": 2.8, + "learning_rate": 1.82568359375e-05, + "loss": 0.7843, + "step": 360 + }, + { + "epoch": 2.95, + "learning_rate": 1.81591796875e-05, + "loss": 0.7945, + "step": 380 + }, + { + "epoch": 3.11, + "learning_rate": 1.80615234375e-05, + "loss": 0.799, + "step": 400 + }, + { + "epoch": 3.27, + "learning_rate": 1.79638671875e-05, + "loss": 0.7623, + "step": 420 + }, + { + "epoch": 3.42, + "learning_rate": 1.7866210937500002e-05, + "loss": 0.7263, + "step": 440 + }, + { + "epoch": 3.58, + "learning_rate": 1.7768554687500003e-05, + "loss": 0.7779, + "step": 460 + }, + { + "epoch": 3.73, + "learning_rate": 1.76708984375e-05, + "loss": 0.695, + "step": 480 + }, + { + "epoch": 3.89, + "learning_rate": 1.75732421875e-05, + "loss": 0.7344, + "step": 500 + } + ], + "max_steps": 4096, + "num_train_epochs": 32, + "total_flos": 6.497819467776e+17, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-500/training_args.bin b/checkpoint-500/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..e2b3bee7f57e1aa04d4410b1e443601a704abcd7 --- /dev/null +++ b/checkpoint-500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:19b61f1babbbf314354bf942366de8dc63aa97ebdccb27573b76080ab5b50935 +size 3963 diff --git a/runs/May11_07-33-30_n7xzj2i618/1683790410.3703568/events.out.tfevents.1683790410.n7xzj2i618.6436.1 b/runs/May11_07-33-30_n7xzj2i618/1683790410.3703568/events.out.tfevents.1683790410.n7xzj2i618.6436.1 new file mode 100644 index 0000000000000000000000000000000000000000..7cc1e6fffb0652f00fefc43bbf63771d4e68a5a8 --- /dev/null +++ b/runs/May11_07-33-30_n7xzj2i618/1683790410.3703568/events.out.tfevents.1683790410.n7xzj2i618.6436.1 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0e7cb8b6a93e0fbc483108a2e8b8f3bd46ba17971dcab26c51ce9c0dc34a988a +size 5879 diff --git a/runs/May11_07-33-30_n7xzj2i618/events.out.tfevents.1683790410.n7xzj2i618.6436.0 b/runs/May11_07-33-30_n7xzj2i618/events.out.tfevents.1683790410.n7xzj2i618.6436.0 new file mode 100644 index 0000000000000000000000000000000000000000..e9b54e16593bd2e331bb4fb7755c1a6ee43cacf2 --- /dev/null +++ b/runs/May11_07-33-30_n7xzj2i618/events.out.tfevents.1683790410.n7xzj2i618.6436.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:872b4dee9ee5761936e86b47101f2adcca3fedcb8abb043f7cf433bf5887ccf4 +size 36464