diff --git a/.gitattributes b/.gitattributes
index e355ebee693cac228cd08974784be928b8c396a6..5f61be426746800ccc6e2c77e1f7ea674641acf6 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -29,3 +29,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 wandb/run-20220504_142129-1tmxz74i/run-1tmxz74i.wandb filter=lfs diff=lfs merge=lfs -text
 wandb/run-20220504_142129-1tmxz74i/logs/debug-internal.log filter=lfs diff=lfs merge=lfs -text
 wandb/run-20220504_142129-1tmxz74i/files/output.log filter=lfs diff=lfs merge=lfs -text
+wandb/run-20220505_173748-b097rk18/run-b097rk18.wandb filter=lfs diff=lfs merge=lfs -text
diff --git a/config.json b/config.json
index f08248a8e1ccf473be9e0bd7e085ba9cdc0c069c..823df42ec13b484c967ce7dda85288b56f65d4ec 100644
--- a/config.json
+++ b/config.json
@@ -182,7 +182,7 @@
   "forced_eos_token_id": null,
   "gradient_checkpointing": false,
   "hidden_act": "gelu",
-  "hidden_dropout": 0.16627274455555233,
+  "hidden_dropout": 0.2,
   "hidden_size": 1024,
   "id2label": {
     "0": "LABEL_0",
diff --git a/emissions.csv b/emissions.csv
index c70cc37fb905c900d62335396acae24b288f9f0d..c5dc43befa671946517125b934b516c325b4a3d5 100644
--- a/emissions.csv
+++ b/emissions.csv
@@ -4,3 +4,4 @@ timestamp,experiment_id,project_name,duration,emissions,energy_consumed,country_
 2022-05-05T16:42:47,89692b38-e983-4664-a932-8df61002625d,codecarbon,4.291411638259888,0.00019785712188713343,0.00034938569995962115,USA,USA,Iowa,Y,gcp,us-central1
 2022-05-05T16:45:58,ef3cf220-dd7d-4101-8f2e-5b58125d0f6a,codecarbon,4.533631801605225,0.00011988175208065735,0.0002116930109141045,USA,USA,Iowa,Y,gcp,us-central1
 2022-05-05T16:49:10,d2831db0-f95d-4aec-bbd0-5241d2505f54,codecarbon,4.5968592166900635,0.00010660969900888212,0.00018825657603546201,USA,USA,Iowa,Y,gcp,us-central1
+2022-05-05T17:32:21,595a622e-ed46-407f-95ef-aee33f1bdf6d,codecarbon,4.391920328140259,9.467203556285354e-05,0.0001671764710627822,USA,USA,Iowa,Y,gcp,us-central1
diff --git a/pytorch_model.bin b/pytorch_model.bin
index 26418c202b4342a8fc1f9cb0c272ec97c1a41c0a..f679051414ddb2d1953f8cf10e9654fdbbfb563d 100644
--- a/pytorch_model.bin
+++ b/pytorch_model.bin
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:ee59259a1d3d50f94ca31b73c03ef96aea7289bbdb17676e503a799ca664af3f
+oid sha256:e68181435e477b5116db4b7759d8834b718e949730cecc7dd1c173661c0b5b6c
 size 2353867057
diff --git a/runs/May03_12-39-14_sanchit--v100/1651584502.8814862/events.out.tfevents.1651584502.sanchit--v100.38894.1 b/runs/May03_12-39-14_sanchit--v100/1651584502.8814862/events.out.tfevents.1651584502.sanchit--v100.38894.1
deleted file mode 100644
index 5b338bda868aceb12cedb9eee965ddceb79420b8..0000000000000000000000000000000000000000
--- a/runs/May03_12-39-14_sanchit--v100/1651584502.8814862/events.out.tfevents.1651584502.sanchit--v100.38894.1
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:1a5e50bb326f116678bb0d2af94ee25e92d82ce3268121a68b288dd930e13470
-size 5184
diff --git a/runs/May03_12-39-14_sanchit--v100/events.out.tfevents.1651584502.sanchit--v100.38894.0 b/runs/May03_12-39-14_sanchit--v100/events.out.tfevents.1651584502.sanchit--v100.38894.0
deleted file mode 100644
index dcabe2898df9c56de4a6a6a9cbebf8130822a0ff..0000000000000000000000000000000000000000
--- a/runs/May03_12-39-14_sanchit--v100/events.out.tfevents.1651584502.sanchit--v100.38894.0
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:65a28e37ffdeeb562c734e7a4815296918b93b818763a73a70ab20d29627105c
-size 10592
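The `config.json` change above replaces the sweep-sampled value `hidden_dropout: 0.16627274455555233` with the fixed `0.2` that the updated sweep further down in this diff now uses. As a quick sanity check on which dropout value a locally cloned checkpoint actually carries, here is a minimal sketch that assumes only that `config.json` is plain JSON; the key may sit at the top level or inside a nested sub-config, so the helper walks the whole tree.

```python
import json

def find_key(obj, key):
    """Recursively yield every value stored under `key` in a JSON tree."""
    if isinstance(obj, dict):
        for k, v in obj.items():
            if k == key:
                yield v
            yield from find_key(v, key)
    elif isinstance(obj, list):
        for item in obj:
            yield from find_key(item, key)

with open("config.json") as f:
    config = json.load(f)

# Expected to print 0.2 after this commit and 0.16627274455555233 before it.
print(list(find_key(config, "hidden_dropout")))
```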
diff --git a/runs/May03_15-33-55_sanchit--v100/1651592261.409201/events.out.tfevents.1651592261.sanchit--v100.40140.1 b/runs/May03_15-33-55_sanchit--v100/1651592261.409201/events.out.tfevents.1651592261.sanchit--v100.40140.1
deleted file mode 100644
index 7a164dcf73fe51fcfe52383fe8685afd1696ec1d..0000000000000000000000000000000000000000
--- a/runs/May03_15-33-55_sanchit--v100/1651592261.409201/events.out.tfevents.1651592261.sanchit--v100.40140.1
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:7c3651206b236f0c31a3ff08bbeb2416a7fde03f54ccb583a7df7f8256f3fd29
-size 5184
diff --git a/runs/May03_15-33-55_sanchit--v100/events.out.tfevents.1651592261.sanchit--v100.40140.0 b/runs/May03_15-33-55_sanchit--v100/events.out.tfevents.1651592261.sanchit--v100.40140.0
deleted file mode 100644
index 3a7d3946f26386a762a04bf2bb4ade14e36faa9c..0000000000000000000000000000000000000000
--- a/runs/May03_15-33-55_sanchit--v100/events.out.tfevents.1651592261.sanchit--v100.40140.0
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:c965accf9ea2be03062bfecc4c45cdb54eb6b4b444ea37a2c9533c8db4fed044
-size 87941
diff --git a/runs/May03_15-58-13_sanchit--v100/1651596054.9030492/events.out.tfevents.1651596054.sanchit--v100.40903.1 b/runs/May03_15-58-13_sanchit--v100/1651596054.9030492/events.out.tfevents.1651596054.sanchit--v100.40903.1
deleted file mode 100644
index 5db30371aca52b03fa92726292650bfe256ac47a..0000000000000000000000000000000000000000
--- a/runs/May03_15-58-13_sanchit--v100/1651596054.9030492/events.out.tfevents.1651596054.sanchit--v100.40903.1
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:76e39c8c39b9be3c4531fd72d066671f7365b550e8079fee72b6060a36be59f6
-size 5184
diff --git a/runs/May03_15-58-13_sanchit--v100/events.out.tfevents.1651596054.sanchit--v100.40903.0 b/runs/May03_15-58-13_sanchit--v100/events.out.tfevents.1651596054.sanchit--v100.40903.0
deleted file mode 100644
index 9b8064756f9a25a66962fa8d23db26a725020828..0000000000000000000000000000000000000000
--- a/runs/May03_15-58-13_sanchit--v100/events.out.tfevents.1651596054.sanchit--v100.40903.0
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:410298c8949435d86a040fb00c0e0a7bde2f0445544abeadadae587ba569bfba
-size 73654
diff --git a/runs/May03_17-15-22_sanchit--v100/1651598399.7088904/events.out.tfevents.1651598399.sanchit--v100.42111.1 b/runs/May03_17-15-22_sanchit--v100/1651598399.7088904/events.out.tfevents.1651598399.sanchit--v100.42111.1
deleted file mode 100644
index 270a16228431933476cef843e4b414f0a3ee7cf2..0000000000000000000000000000000000000000
--- a/runs/May03_17-15-22_sanchit--v100/1651598399.7088904/events.out.tfevents.1651598399.sanchit--v100.42111.1
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:61b94cd769015be4bbc81b78b8074c963d947a90c91e7cf517abaad8c3ff3078
-size 5184
diff --git a/runs/May03_17-15-22_sanchit--v100/events.out.tfevents.1651598399.sanchit--v100.42111.0 b/runs/May03_17-15-22_sanchit--v100/events.out.tfevents.1651598399.sanchit--v100.42111.0
deleted file mode 100644
index e1ac7984985feebc560dfb7e260678544816bdf4..0000000000000000000000000000000000000000
--- a/runs/May03_17-15-22_sanchit--v100/events.out.tfevents.1651598399.sanchit--v100.42111.0
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:06ebac1e97e241e13d6c8a536ad57e3ef5b58c510633a0824a10536aae4662a4
-size 797661
diff --git a/runs/May04_08-29-27_sanchit--v100/1651653030.564084/events.out.tfevents.1651653030.sanchit--v100.48541.1 b/runs/May04_08-29-27_sanchit--v100/1651653030.564084/events.out.tfevents.1651653030.sanchit--v100.48541.1
deleted file mode 100644
index a19faaffbaefd0fd5bc0beefb104c28640847d15..0000000000000000000000000000000000000000
--- a/runs/May04_08-29-27_sanchit--v100/1651653030.564084/events.out.tfevents.1651653030.sanchit--v100.48541.1
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:9f818ffc5324af0a19b67e9e869654bb112292ade31d00f48263ec9cf177206c
-size 5184
diff --git a/runs/May04_08-29-27_sanchit--v100/events.out.tfevents.1651653030.sanchit--v100.48541.0 b/runs/May04_08-29-27_sanchit--v100/events.out.tfevents.1651653030.sanchit--v100.48541.0
deleted file mode 100644
index 22bd10bf7e2026cba23edf6362bbccc656299c37..0000000000000000000000000000000000000000
--- a/runs/May04_08-29-27_sanchit--v100/events.out.tfevents.1651653030.sanchit--v100.48541.0
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:6120cfa0b8dd7cc6afe852da88002229769bc8f3efa93f730fa0661088c42d36
-size 88290
diff --git a/runs/May04_13-30-37_sanchit--v100/1651674088.8879716/events.out.tfevents.1651674088.sanchit--v100.50375.1 b/runs/May04_13-30-37_sanchit--v100/1651674088.8879716/events.out.tfevents.1651674088.sanchit--v100.50375.1
deleted file mode 100644
index 5ac65fc6ceb014b77c2590e16d342dd8e702081d..0000000000000000000000000000000000000000
--- a/runs/May04_13-30-37_sanchit--v100/1651674088.8879716/events.out.tfevents.1651674088.sanchit--v100.50375.1
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:aef34e54eaac03ee713590a0eb81b34a1eb839a803375bb6e9e3a6e439991c2b
-size 5184
diff --git a/runs/May04_13-30-37_sanchit--v100/events.out.tfevents.1651674088.sanchit--v100.50375.0 b/runs/May04_13-30-37_sanchit--v100/events.out.tfevents.1651674088.sanchit--v100.50375.0
deleted file mode 100644
index de1edc40e94bdd5cfb58148f93b24b7b0ab3cb57..0000000000000000000000000000000000000000
--- a/runs/May04_13-30-37_sanchit--v100/events.out.tfevents.1651674088.sanchit--v100.50375.0
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:baefc16420fc1dd3035ed23659f51947934cd5d60be8002b0e1d45b1b6244ef1
-size 1541927
diff --git a/runs/May05_16-32-27_sanchit--v100/1651768642.7722313/events.out.tfevents.1651768642.sanchit--v100.65172.1 b/runs/May05_16-32-27_sanchit--v100/1651768642.7722313/events.out.tfevents.1651768642.sanchit--v100.65172.1
deleted file mode 100644
index 780670c99b38fc6b95bd3f8c2babc477a1f1dce8..0000000000000000000000000000000000000000
--- a/runs/May05_16-32-27_sanchit--v100/1651768642.7722313/events.out.tfevents.1651768642.sanchit--v100.65172.1
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:1fe9010d25b5426ae74e1154e5be2727f4b9d3dc3d3ac8423b4e36ab102ca5b1
-size 5184
diff --git a/runs/May05_16-32-27_sanchit--v100/events.out.tfevents.1651768642.sanchit--v100.65172.0 b/runs/May05_16-32-27_sanchit--v100/events.out.tfevents.1651768642.sanchit--v100.65172.0
deleted file mode 100644
index 948ee281b2f0105fa3f6a0c23e085f425af648ee..0000000000000000000000000000000000000000
--- a/runs/May05_16-32-27_sanchit--v100/events.out.tfevents.1651768642.sanchit--v100.65172.0
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:dffb33c2552c28d9cdf40229f277c5ce3d0f56c41496a3ad6c1fcdd0e08ec2d4
-size 10206
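Every binary touched in this commit (the TensorBoard event files above and `pytorch_model.bin` earlier) is stored in the repository as a Git LFS pointer: a three-line text stub recording the spec version, the SHA-256 of the real payload, and its size in bytes. The sketch below shows how such a pointer text could be reproduced for a locally downloaded file using only the Python standard library; the filename is just an example, not a claim about where the payload lives.

```python
import hashlib
import os

def lfs_pointer(path: str) -> str:
    """Build the Git LFS pointer text (spec v1) for a local file."""
    sha = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            sha.update(chunk)
    return (
        "version https://git-lfs.github.com/spec/v1\n"
        f"oid sha256:{sha.hexdigest()}\n"
        f"size {os.path.getsize(path)}\n"
    )

# Compare against the pointer committed for pytorch_model.bin, for example.
print(lfs_pointer("pytorch_model.bin"))
```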
diff --git a/runs/May05_16-41-38_sanchit--v100/1651768958.3876698/events.out.tfevents.1651768958.sanchit--v100.65949.1 b/runs/May05_16-41-38_sanchit--v100/1651768958.3876698/events.out.tfevents.1651768958.sanchit--v100.65949.1
deleted file mode 100644
index 36ec1fc4b3b1cc03d7af54329949f42b492c2c40..0000000000000000000000000000000000000000
--- a/runs/May05_16-41-38_sanchit--v100/1651768958.3876698/events.out.tfevents.1651768958.sanchit--v100.65949.1
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:db04a69bbb2713a70173d62a1e8ee86184b8325a2e3bde42e89647c96cbd1027
-size 5184
diff --git a/runs/May05_16-41-38_sanchit--v100/events.out.tfevents.1651768958.sanchit--v100.65949.0 b/runs/May05_16-41-38_sanchit--v100/events.out.tfevents.1651768958.sanchit--v100.65949.0
deleted file mode 100644
index 75948f40ee9bd0673757af8a0c082423cf017b84..0000000000000000000000000000000000000000
--- a/runs/May05_16-41-38_sanchit--v100/events.out.tfevents.1651768958.sanchit--v100.65949.0
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:2d852cd4da235fc0dd1135a7957f543279a67523fdbc8cdd26c32a2b17c3da93
-size 10206
diff --git a/runs/May05_16-44-56_sanchit--v100/1651769149.9394004/events.out.tfevents.1651769149.sanchit--v100.66677.1 b/runs/May05_16-44-56_sanchit--v100/1651769149.9394004/events.out.tfevents.1651769149.sanchit--v100.66677.1
deleted file mode 100644
index 8647dedc71ac19adff6d69be50ebec64b3c1647d..0000000000000000000000000000000000000000
--- a/runs/May05_16-44-56_sanchit--v100/1651769149.9394004/events.out.tfevents.1651769149.sanchit--v100.66677.1
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:bf44e1ee4066c7afb2c0126abffcdc62708cfaf679d440482c5b10bdfa09dfed
-size 5184
diff --git a/runs/May05_16-44-56_sanchit--v100/events.out.tfevents.1651769149.sanchit--v100.66677.0 b/runs/May05_16-44-56_sanchit--v100/events.out.tfevents.1651769149.sanchit--v100.66677.0
deleted file mode 100644
index 5c80c81ed532aec73aee8ed5e96243003c805613..0000000000000000000000000000000000000000
--- a/runs/May05_16-44-56_sanchit--v100/events.out.tfevents.1651769149.sanchit--v100.66677.0
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:6cf35e831e3ed592f3e7d3c82817cc2768013f4c0662d3bfcd1223cc883fbf3f
-size 10206
diff --git a/runs/May05_16-48-07_sanchit--v100/1651769342.0456421/events.out.tfevents.1651769342.sanchit--v100.67410.1 b/runs/May05_16-48-07_sanchit--v100/1651769342.0456421/events.out.tfevents.1651769342.sanchit--v100.67410.1
deleted file mode 100644
index 40980a72147054d2c69f60b3083bf7e00426d99b..0000000000000000000000000000000000000000
--- a/runs/May05_16-48-07_sanchit--v100/1651769342.0456421/events.out.tfevents.1651769342.sanchit--v100.67410.1
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:736259815069e83ea3029a853d47a1d4e7feceac2e28a92a58b2bf2531c6944e
-size 5184
diff --git a/runs/May05_16-48-07_sanchit--v100/events.out.tfevents.1651769342.sanchit--v100.67410.0 b/runs/May05_16-48-07_sanchit--v100/events.out.tfevents.1651769342.sanchit--v100.67410.0
deleted file mode 100644
index 788a0e2258602402cb37e25fe407a2bc62a9ef5e..0000000000000000000000000000000000000000
--- a/runs/May05_16-48-07_sanchit--v100/events.out.tfevents.1651769342.sanchit--v100.67410.0
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:c214fc3b70e657543c5ebfda93915ca0349eaee975294c9f561c46df5eedc39b
-size 10207
diff --git a/runs/May05_17-36-54_sanchit--v100/1651772268.485843/events.out.tfevents.1651772268.sanchit--v100.74387.1 b/runs/May05_17-36-54_sanchit--v100/1651772268.485843/events.out.tfevents.1651772268.sanchit--v100.74387.1
new file mode 100644
index 0000000000000000000000000000000000000000..ffec511a2ac2fff5efce3f0e6f924092c75dcf9f
--- /dev/null
+++ b/runs/May05_17-36-54_sanchit--v100/1651772268.485843/events.out.tfevents.1651772268.sanchit--v100.74387.1
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:151c1d4351b970fd9dd0b6408ef78ef7693e783a530f7635197797b147fdbdd8
+size 5184
diff --git a/runs/May05_17-36-54_sanchit--v100/events.out.tfevents.1651772268.sanchit--v100.74387.0 b/runs/May05_17-36-54_sanchit--v100/events.out.tfevents.1651772268.sanchit--v100.74387.0
new file mode 100644
index 0000000000000000000000000000000000000000..be412f87ae75020dd093e1eaf612f66b5ee4ee7c
--- /dev/null
+++ b/runs/May05_17-36-54_sanchit--v100/events.out.tfevents.1651772268.sanchit--v100.74387.0
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ac2d6fb8bada68aa8ed66f0a7a406bf11f62cf5dd66fb0accc7d789bec6c30f0
+size 88259
diff --git a/sweep.yaml b/sweep.yaml
index 104ce60267bd448310b73774688fedee1bd56965..c87c0bd0e27676d7d5d60e5d4b92980f797d1f07 100644
--- a/sweep.yaml
+++ b/sweep.yaml
@@ -13,7 +13,7 @@ command:
   - --push_to_hub
   - --use_auth_token
   - ${args}
-method: random
+method: grid
 metric:
   goal: maximize
   name: eval/bleu
@@ -35,19 +35,17 @@ parameters:
   per_device_eval_batch_size:
     value: 8
   gradient_accumulation_steps:
-    value: 8
+    value: 16
   generation_max_length:
     value: 40
   generation_num_beams:
     value: 1
   learning_rate:
-    distribution: log_uniform
-    max: -6.9
-    min: -9.2
+    values:
+      - 3e-5
+      - 3e-4
   hidden_dropout:
-    distribution: log_uniform
-    max: -1.6
-    min: -3.4
+    value: 0.2
   warmup_steps:
     value: 500
   evaluation_strategy:
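The `sweep.yaml` change above switches the W&B sweep from random search over log-uniform ranges to a grid over two fixed learning rates, pins `hidden_dropout` at 0.2, and doubles gradient accumulation to 16. A grid sweep enumerates every combination of the listed values, so this configuration produces exactly two runs. The short sketch below only makes that enumeration explicit; the variable names mirror the YAML keys and are not part of any W&B API.

```python
from itertools import product

# Fixed values copied from the updated sweep.yaml.
learning_rates = [3e-5, 3e-4]
hidden_dropouts = [0.2]
grad_accum_steps = [16]

# A grid sweep runs each combination once: 2 * 1 * 1 = 2 runs.
for lr, dropout, accum in product(learning_rates, hidden_dropouts, grad_accum_steps):
    print(f"learning_rate={lr}, hidden_dropout={dropout}, "
          f"gradient_accumulation_steps={accum}")
```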
diff --git a/train_results.json b/train_results.json
deleted file mode 100644
index b122afad79d33d486af0a6bdfe5c1f56ff3277f7..0000000000000000000000000000000000000000
--- a/train_results.json
+++ /dev/null
@@ -1,8 +0,0 @@
-{
-    "epoch": 3.0,
-    "train_loss": 0.0,
-    "train_runtime": 8.6541,
-    "train_samples": 207374,
-    "train_samples_per_second": 71887.743,
-    "train_steps_per_second": 1123.17
-}
\ No newline at end of file
diff --git a/trainer_state.json b/trainer_state.json
deleted file mode 100644
index 7418a8953e8ab9ff3464d2a5a0175af1cef95ca5..0000000000000000000000000000000000000000
--- a/trainer_state.json
+++ /dev/null
@@ -1,58552 +0,0 @@
-{
-  "best_metric": 2.4961869532998874e-13,
-  "best_model_checkpoint": "./checkpoint-1000",
-  "epoch": 2.999922845459455,
-  "global_step": 9720,
-  "is_hyper_param_search": false,
-  "is_local_process_zero": true,
-  "is_world_process_zero": true,
-  "log_history": [
-    {
-      "epoch": 0.0,
-      "learning_rate": 0.0,
-      "loss": 11.5079,
-      "step": 1
-    },
-    {
-      "epoch": 0.0,
-      "learning_rate": 0.0,
-      "loss": 11.1203,
-      "step": 2
-    },
-    {
-      "epoch": 0.0,
-      "learning_rate": 0.0,
-      "loss": 10.6958,
-      "step": 3
-    },
-    {
-      "epoch": 0.0,
-      "learning_rate": 0.0,
-      "loss": 11.2693,
-      "step": 4
-    },
-    {
-      "epoch": 0.0,
-      "learning_rate": 0.0,
-      "loss": 11.6199,
-      "step": 5
-    },
-    {
-      "epoch": 0.0,
-      "learning_rate": 6.569998523345044e-07,
-      "loss": 10.6054,
-      "step": 6
-    },
-    {
-      "epoch": 0.0,
-      "learning_rate": 1.3139997046690089e-06,
-      "loss": 10.9386,
-      "step": 7
-    },
-    {
-      "epoch": 0.0,
-      "learning_rate": 1.3139997046690089e-06,
-      "loss": 10.9872,
-      "step": 8
-    },
-    {
-      "epoch": 0.0,
-      "learning_rate": 1.9709995570035133e-06,
-      "loss":
10.8356, - "step": 9 - }, - { - "epoch": 0.0, - "learning_rate": 2.6279994093380178e-06, - "loss": 11.1359, - "step": 10 - }, - { - "epoch": 0.0, - "learning_rate": 3.284999261672522e-06, - "loss": 10.0616, - "step": 11 - }, - { - "epoch": 0.0, - "learning_rate": 3.941999114007027e-06, - "loss": 9.5432, - "step": 12 - }, - { - "epoch": 0.0, - "learning_rate": 4.598998966341531e-06, - "loss": 9.1687, - "step": 13 - }, - { - "epoch": 0.0, - "learning_rate": 5.2559988186760355e-06, - "loss": 9.1152, - "step": 14 - }, - { - "epoch": 0.0, - "learning_rate": 5.9129986710105395e-06, - "loss": 8.438, - "step": 15 - }, - { - "epoch": 0.0, - "learning_rate": 6.569998523345044e-06, - "loss": 8.3793, - "step": 16 - }, - { - "epoch": 0.01, - "learning_rate": 7.2269983756795484e-06, - "loss": 7.4295, - "step": 17 - }, - { - "epoch": 0.01, - "learning_rate": 7.883998228014053e-06, - "loss": 7.3944, - "step": 18 - }, - { - "epoch": 0.01, - "learning_rate": 8.540998080348557e-06, - "loss": 6.8893, - "step": 19 - }, - { - "epoch": 0.01, - "learning_rate": 9.197997932683061e-06, - "loss": 6.8123, - "step": 20 - }, - { - "epoch": 0.01, - "learning_rate": 9.854997785017565e-06, - "loss": 6.2127, - "step": 21 - }, - { - "epoch": 0.01, - "learning_rate": 1.0511997637352071e-05, - "loss": 6.5409, - "step": 22 - }, - { - "epoch": 0.01, - "learning_rate": 1.1168997489686577e-05, - "loss": 5.9446, - "step": 23 - }, - { - "epoch": 0.01, - "learning_rate": 1.1825997342021079e-05, - "loss": 5.9737, - "step": 24 - }, - { - "epoch": 0.01, - "learning_rate": 1.2482997194355583e-05, - "loss": 5.7396, - "step": 25 - }, - { - "epoch": 0.01, - "learning_rate": 1.3139997046690089e-05, - "loss": 5.4264, - "step": 26 - }, - { - "epoch": 0.01, - "learning_rate": 1.3796996899024595e-05, - "loss": 5.2598, - "step": 27 - }, - { - "epoch": 0.01, - "learning_rate": 1.4453996751359097e-05, - "loss": 5.4661, - "step": 28 - }, - { - "epoch": 0.01, - "learning_rate": 1.5110996603693601e-05, - "loss": 4.965, - "step": 29 - }, - { - "epoch": 0.01, - "learning_rate": 1.5767996456028107e-05, - "loss": 5.1495, - "step": 30 - }, - { - "epoch": 0.01, - "learning_rate": 1.642499630836261e-05, - "loss": 5.2457, - "step": 31 - }, - { - "epoch": 0.01, - "learning_rate": 1.7081996160697115e-05, - "loss": 4.7293, - "step": 32 - }, - { - "epoch": 0.01, - "learning_rate": 1.773899601303162e-05, - "loss": 4.7455, - "step": 33 - }, - { - "epoch": 0.01, - "learning_rate": 1.8395995865366123e-05, - "loss": 4.8807, - "step": 34 - }, - { - "epoch": 0.01, - "learning_rate": 1.905299571770063e-05, - "loss": 5.0133, - "step": 35 - }, - { - "epoch": 0.01, - "learning_rate": 1.970999557003513e-05, - "loss": 4.5158, - "step": 36 - }, - { - "epoch": 0.01, - "learning_rate": 2.0366995422369638e-05, - "loss": 4.6027, - "step": 37 - }, - { - "epoch": 0.01, - "learning_rate": 2.1023995274704142e-05, - "loss": 4.5853, - "step": 38 - }, - { - "epoch": 0.01, - "learning_rate": 2.1680995127038646e-05, - "loss": 4.5911, - "step": 39 - }, - { - "epoch": 0.01, - "learning_rate": 2.2337994979373154e-05, - "loss": 4.432, - "step": 40 - }, - { - "epoch": 0.01, - "learning_rate": 2.2994994831707658e-05, - "loss": 4.1745, - "step": 41 - }, - { - "epoch": 0.01, - "learning_rate": 2.3651994684042158e-05, - "loss": 4.3357, - "step": 42 - }, - { - "epoch": 0.01, - "learning_rate": 2.4308994536376662e-05, - "loss": 4.1965, - "step": 43 - }, - { - "epoch": 0.01, - "learning_rate": 2.4965994388711166e-05, - "loss": 4.11, - "step": 44 - }, - { - "epoch": 0.01, - "learning_rate": 
2.5622994241045674e-05, - "loss": 4.0005, - "step": 45 - }, - { - "epoch": 0.01, - "learning_rate": 2.6279994093380178e-05, - "loss": 3.9979, - "step": 46 - }, - { - "epoch": 0.01, - "learning_rate": 2.693699394571468e-05, - "loss": 3.808, - "step": 47 - }, - { - "epoch": 0.01, - "learning_rate": 2.759399379804919e-05, - "loss": 3.5335, - "step": 48 - }, - { - "epoch": 0.02, - "learning_rate": 2.8250993650383686e-05, - "loss": 3.2427, - "step": 49 - }, - { - "epoch": 0.02, - "learning_rate": 2.8907993502718194e-05, - "loss": 3.2449, - "step": 50 - }, - { - "epoch": 0.02, - "learning_rate": 2.9564993355052698e-05, - "loss": 6.2136, - "step": 51 - }, - { - "epoch": 0.02, - "learning_rate": 3.0221993207387202e-05, - "loss": 5.8481, - "step": 52 - }, - { - "epoch": 0.02, - "learning_rate": 3.0878993059721706e-05, - "loss": 5.5996, - "step": 53 - }, - { - "epoch": 0.02, - "learning_rate": 3.153599291205621e-05, - "loss": 5.5013, - "step": 54 - }, - { - "epoch": 0.02, - "learning_rate": 3.219299276439072e-05, - "loss": 5.3008, - "step": 55 - }, - { - "epoch": 0.02, - "learning_rate": 3.284999261672522e-05, - "loss": 5.3777, - "step": 56 - }, - { - "epoch": 0.02, - "learning_rate": 3.350699246905972e-05, - "loss": 5.0772, - "step": 57 - }, - { - "epoch": 0.02, - "learning_rate": 3.416399232139423e-05, - "loss": 4.8636, - "step": 58 - }, - { - "epoch": 0.02, - "learning_rate": 3.482099217372874e-05, - "loss": 4.8621, - "step": 59 - }, - { - "epoch": 0.02, - "learning_rate": 3.547799202606324e-05, - "loss": 4.7417, - "step": 60 - }, - { - "epoch": 0.02, - "learning_rate": 3.6134991878397745e-05, - "loss": 4.8393, - "step": 61 - }, - { - "epoch": 0.02, - "learning_rate": 3.6791991730732245e-05, - "loss": 4.6815, - "step": 62 - }, - { - "epoch": 0.02, - "learning_rate": 3.744899158306675e-05, - "loss": 4.8777, - "step": 63 - }, - { - "epoch": 0.02, - "learning_rate": 3.810599143540126e-05, - "loss": 4.8993, - "step": 64 - }, - { - "epoch": 0.02, - "learning_rate": 3.876299128773576e-05, - "loss": 4.6594, - "step": 65 - }, - { - "epoch": 0.02, - "learning_rate": 3.941999114007026e-05, - "loss": 4.6317, - "step": 66 - }, - { - "epoch": 0.02, - "learning_rate": 4.007699099240477e-05, - "loss": 4.5564, - "step": 67 - }, - { - "epoch": 0.02, - "learning_rate": 4.0733990844739276e-05, - "loss": 4.5359, - "step": 68 - }, - { - "epoch": 0.02, - "learning_rate": 4.139099069707378e-05, - "loss": 4.6462, - "step": 69 - }, - { - "epoch": 0.02, - "learning_rate": 4.2047990549408284e-05, - "loss": 4.5479, - "step": 70 - }, - { - "epoch": 0.02, - "learning_rate": 4.270499040174279e-05, - "loss": 4.6167, - "step": 71 - }, - { - "epoch": 0.02, - "learning_rate": 4.336199025407729e-05, - "loss": 4.4873, - "step": 72 - }, - { - "epoch": 0.02, - "learning_rate": 4.40189901064118e-05, - "loss": 4.5333, - "step": 73 - }, - { - "epoch": 0.02, - "learning_rate": 4.467598995874631e-05, - "loss": 4.4942, - "step": 74 - }, - { - "epoch": 0.02, - "learning_rate": 4.533298981108081e-05, - "loss": 4.5229, - "step": 75 - }, - { - "epoch": 0.02, - "learning_rate": 4.5989989663415315e-05, - "loss": 4.2522, - "step": 76 - }, - { - "epoch": 0.02, - "learning_rate": 4.664698951574981e-05, - "loss": 4.4598, - "step": 77 - }, - { - "epoch": 0.02, - "learning_rate": 4.7303989368084316e-05, - "loss": 4.3976, - "step": 78 - }, - { - "epoch": 0.02, - "learning_rate": 4.796098922041882e-05, - "loss": 4.3418, - "step": 79 - }, - { - "epoch": 0.02, - "learning_rate": 4.8617989072753324e-05, - "loss": 4.5355, - "step": 80 - }, - { - "epoch": 
0.02, - "learning_rate": 4.927498892508783e-05, - "loss": 4.1474, - "step": 81 - }, - { - "epoch": 0.03, - "learning_rate": 4.993198877742233e-05, - "loss": 4.3487, - "step": 82 - }, - { - "epoch": 0.03, - "learning_rate": 5.058898862975684e-05, - "loss": 4.2385, - "step": 83 - }, - { - "epoch": 0.03, - "learning_rate": 5.124598848209135e-05, - "loss": 3.9705, - "step": 84 - }, - { - "epoch": 0.03, - "learning_rate": 5.190298833442585e-05, - "loss": 4.0977, - "step": 85 - }, - { - "epoch": 0.03, - "learning_rate": 5.2559988186760355e-05, - "loss": 4.3691, - "step": 86 - }, - { - "epoch": 0.03, - "learning_rate": 5.321698803909486e-05, - "loss": 4.2919, - "step": 87 - }, - { - "epoch": 0.03, - "learning_rate": 5.387398789142936e-05, - "loss": 4.1494, - "step": 88 - }, - { - "epoch": 0.03, - "learning_rate": 5.453098774376387e-05, - "loss": 4.0668, - "step": 89 - }, - { - "epoch": 0.03, - "learning_rate": 5.518798759609838e-05, - "loss": 4.1362, - "step": 90 - }, - { - "epoch": 0.03, - "learning_rate": 5.584498744843288e-05, - "loss": 4.0048, - "step": 91 - }, - { - "epoch": 0.03, - "learning_rate": 5.650198730076737e-05, - "loss": 4.0575, - "step": 92 - }, - { - "epoch": 0.03, - "learning_rate": 5.715898715310188e-05, - "loss": 3.9414, - "step": 93 - }, - { - "epoch": 0.03, - "learning_rate": 5.781598700543639e-05, - "loss": 3.5729, - "step": 94 - }, - { - "epoch": 0.03, - "learning_rate": 5.847298685777089e-05, - "loss": 3.8777, - "step": 95 - }, - { - "epoch": 0.03, - "learning_rate": 5.9129986710105395e-05, - "loss": 3.4894, - "step": 96 - }, - { - "epoch": 0.03, - "learning_rate": 5.97869865624399e-05, - "loss": 3.2646, - "step": 97 - }, - { - "epoch": 0.03, - "learning_rate": 6.0443986414774403e-05, - "loss": 3.1012, - "step": 98 - }, - { - "epoch": 0.03, - "learning_rate": 6.110098626710891e-05, - "loss": 3.3252, - "step": 99 - }, - { - "epoch": 0.03, - "learning_rate": 6.175798611944341e-05, - "loss": 3.0965, - "step": 100 - }, - { - "epoch": 0.03, - "learning_rate": 6.241498597177793e-05, - "loss": 6.1741, - "step": 101 - }, - { - "epoch": 0.03, - "learning_rate": 6.307198582411243e-05, - "loss": 5.7244, - "step": 102 - }, - { - "epoch": 0.03, - "learning_rate": 6.372898567644693e-05, - "loss": 5.2937, - "step": 103 - }, - { - "epoch": 0.03, - "learning_rate": 6.438598552878144e-05, - "loss": 4.9471, - "step": 104 - }, - { - "epoch": 0.03, - "learning_rate": 6.504298538111594e-05, - "loss": 4.8998, - "step": 105 - }, - { - "epoch": 0.03, - "learning_rate": 6.569998523345044e-05, - "loss": 5.0121, - "step": 106 - }, - { - "epoch": 0.03, - "learning_rate": 6.635698508578496e-05, - "loss": 4.7973, - "step": 107 - }, - { - "epoch": 0.03, - "learning_rate": 6.701398493811944e-05, - "loss": 4.7891, - "step": 108 - }, - { - "epoch": 0.03, - "learning_rate": 6.767098479045396e-05, - "loss": 4.7888, - "step": 109 - }, - { - "epoch": 0.03, - "learning_rate": 6.832798464278846e-05, - "loss": 4.8595, - "step": 110 - }, - { - "epoch": 0.03, - "learning_rate": 6.898498449512296e-05, - "loss": 4.4903, - "step": 111 - }, - { - "epoch": 0.03, - "learning_rate": 6.964198434745747e-05, - "loss": 4.6337, - "step": 112 - }, - { - "epoch": 0.03, - "learning_rate": 7.029898419979197e-05, - "loss": 4.4506, - "step": 113 - }, - { - "epoch": 0.04, - "learning_rate": 7.095598405212647e-05, - "loss": 4.526, - "step": 114 - }, - { - "epoch": 0.04, - "learning_rate": 7.161298390446099e-05, - "loss": 4.5671, - "step": 115 - }, - { - "epoch": 0.04, - "learning_rate": 7.226998375679549e-05, - "loss": 4.4214, - 
"step": 116 - }, - { - "epoch": 0.04, - "learning_rate": 7.292698360912999e-05, - "loss": 4.6995, - "step": 117 - }, - { - "epoch": 0.04, - "learning_rate": 7.358398346146449e-05, - "loss": 4.4702, - "step": 118 - }, - { - "epoch": 0.04, - "learning_rate": 7.4240983313799e-05, - "loss": 4.4675, - "step": 119 - }, - { - "epoch": 0.04, - "learning_rate": 7.48979831661335e-05, - "loss": 4.5423, - "step": 120 - }, - { - "epoch": 0.04, - "learning_rate": 7.5554983018468e-05, - "loss": 4.3751, - "step": 121 - }, - { - "epoch": 0.04, - "learning_rate": 7.621198287080252e-05, - "loss": 4.3965, - "step": 122 - }, - { - "epoch": 0.04, - "learning_rate": 7.686898272313702e-05, - "loss": 4.3742, - "step": 123 - }, - { - "epoch": 0.04, - "learning_rate": 7.752598257547152e-05, - "loss": 4.2151, - "step": 124 - }, - { - "epoch": 0.04, - "learning_rate": 7.818298242780602e-05, - "loss": 4.2806, - "step": 125 - }, - { - "epoch": 0.04, - "learning_rate": 7.883998228014052e-05, - "loss": 4.2132, - "step": 126 - }, - { - "epoch": 0.04, - "learning_rate": 7.949698213247504e-05, - "loss": 4.2403, - "step": 127 - }, - { - "epoch": 0.04, - "learning_rate": 8.015398198480954e-05, - "loss": 4.3122, - "step": 128 - }, - { - "epoch": 0.04, - "learning_rate": 8.081098183714404e-05, - "loss": 4.4731, - "step": 129 - }, - { - "epoch": 0.04, - "learning_rate": 8.146798168947855e-05, - "loss": 4.2146, - "step": 130 - }, - { - "epoch": 0.04, - "learning_rate": 8.212498154181305e-05, - "loss": 4.0394, - "step": 131 - }, - { - "epoch": 0.04, - "learning_rate": 8.278198139414755e-05, - "loss": 4.1229, - "step": 132 - }, - { - "epoch": 0.04, - "learning_rate": 8.343898124648207e-05, - "loss": 3.9938, - "step": 133 - }, - { - "epoch": 0.04, - "learning_rate": 8.409598109881657e-05, - "loss": 4.0056, - "step": 134 - }, - { - "epoch": 0.04, - "learning_rate": 8.475298095115107e-05, - "loss": 3.9828, - "step": 135 - }, - { - "epoch": 0.04, - "learning_rate": 8.540998080348558e-05, - "loss": 4.1734, - "step": 136 - }, - { - "epoch": 0.04, - "learning_rate": 8.606698065582008e-05, - "loss": 3.8288, - "step": 137 - }, - { - "epoch": 0.04, - "learning_rate": 8.672398050815458e-05, - "loss": 4.073, - "step": 138 - }, - { - "epoch": 0.04, - "learning_rate": 8.73809803604891e-05, - "loss": 3.9704, - "step": 139 - }, - { - "epoch": 0.04, - "learning_rate": 8.80379802128236e-05, - "loss": 3.773, - "step": 140 - }, - { - "epoch": 0.04, - "learning_rate": 8.86949800651581e-05, - "loss": 3.7891, - "step": 141 - }, - { - "epoch": 0.04, - "learning_rate": 8.935197991749261e-05, - "loss": 3.697, - "step": 142 - }, - { - "epoch": 0.04, - "learning_rate": 9.000897976982711e-05, - "loss": 3.7628, - "step": 143 - }, - { - "epoch": 0.04, - "learning_rate": 9.066597962216162e-05, - "loss": 3.7288, - "step": 144 - }, - { - "epoch": 0.04, - "learning_rate": 9.132297947449613e-05, - "loss": 3.7362, - "step": 145 - }, - { - "epoch": 0.05, - "learning_rate": 9.197997932683063e-05, - "loss": 3.6183, - "step": 146 - }, - { - "epoch": 0.05, - "learning_rate": 9.263697917916512e-05, - "loss": 3.4819, - "step": 147 - }, - { - "epoch": 0.05, - "learning_rate": 9.329397903149962e-05, - "loss": 3.4482, - "step": 148 - }, - { - "epoch": 0.05, - "learning_rate": 9.395097888383412e-05, - "loss": 3.0363, - "step": 149 - }, - { - "epoch": 0.05, - "learning_rate": 9.460797873616863e-05, - "loss": 2.9486, - "step": 150 - }, - { - "epoch": 0.05, - "learning_rate": 9.526497858850313e-05, - "loss": 6.0248, - "step": 151 - }, - { - "epoch": 0.05, - "learning_rate": 
9.592197844083763e-05, - "loss": 5.5281, - "step": 152 - }, - { - "epoch": 0.05, - "learning_rate": 9.657897829317215e-05, - "loss": 5.1906, - "step": 153 - }, - { - "epoch": 0.05, - "learning_rate": 9.723597814550665e-05, - "loss": 5.0286, - "step": 154 - }, - { - "epoch": 0.05, - "learning_rate": 9.789297799784115e-05, - "loss": 4.7783, - "step": 155 - }, - { - "epoch": 0.05, - "learning_rate": 9.854997785017566e-05, - "loss": 4.5936, - "step": 156 - }, - { - "epoch": 0.05, - "learning_rate": 9.920697770251016e-05, - "loss": 4.7536, - "step": 157 - }, - { - "epoch": 0.05, - "learning_rate": 9.986397755484466e-05, - "loss": 4.5334, - "step": 158 - }, - { - "epoch": 0.05, - "learning_rate": 0.00010052097740717918, - "loss": 4.557, - "step": 159 - }, - { - "epoch": 0.05, - "learning_rate": 0.00010117797725951368, - "loss": 4.4231, - "step": 160 - }, - { - "epoch": 0.05, - "learning_rate": 0.00010183497711184818, - "loss": 4.3759, - "step": 161 - }, - { - "epoch": 0.05, - "learning_rate": 0.0001024919769641827, - "loss": 4.4743, - "step": 162 - }, - { - "epoch": 0.05, - "learning_rate": 0.0001031489768165172, - "loss": 4.4708, - "step": 163 - }, - { - "epoch": 0.05, - "learning_rate": 0.0001038059766688517, - "loss": 4.4006, - "step": 164 - }, - { - "epoch": 0.05, - "learning_rate": 0.00010446297652118621, - "loss": 4.5357, - "step": 165 - }, - { - "epoch": 0.05, - "learning_rate": 0.00010511997637352071, - "loss": 4.2262, - "step": 166 - }, - { - "epoch": 0.05, - "learning_rate": 0.00010577697622585521, - "loss": 4.6843, - "step": 167 - }, - { - "epoch": 0.05, - "learning_rate": 0.00010643397607818973, - "loss": 4.3731, - "step": 168 - }, - { - "epoch": 0.05, - "learning_rate": 0.00010709097593052423, - "loss": 4.247, - "step": 169 - }, - { - "epoch": 0.05, - "learning_rate": 0.00010774797578285873, - "loss": 4.2101, - "step": 170 - }, - { - "epoch": 0.05, - "learning_rate": 0.00010840497563519324, - "loss": 4.1394, - "step": 171 - }, - { - "epoch": 0.05, - "learning_rate": 0.00010906197548752774, - "loss": 4.2254, - "step": 172 - }, - { - "epoch": 0.05, - "learning_rate": 0.00010971897533986224, - "loss": 4.5508, - "step": 173 - }, - { - "epoch": 0.05, - "learning_rate": 0.00011037597519219676, - "loss": 4.0966, - "step": 174 - }, - { - "epoch": 0.05, - "learning_rate": 0.00011103297504453126, - "loss": 4.3946, - "step": 175 - }, - { - "epoch": 0.05, - "learning_rate": 0.00011168997489686576, - "loss": 4.1662, - "step": 176 - }, - { - "epoch": 0.05, - "learning_rate": 0.00011234697474920027, - "loss": 4.3428, - "step": 177 - }, - { - "epoch": 0.05, - "learning_rate": 0.00011300397460153475, - "loss": 4.3491, - "step": 178 - }, - { - "epoch": 0.06, - "learning_rate": 0.00011366097445386926, - "loss": 4.3456, - "step": 179 - }, - { - "epoch": 0.06, - "learning_rate": 0.00011431797430620376, - "loss": 4.0542, - "step": 180 - }, - { - "epoch": 0.06, - "learning_rate": 0.00011497497415853826, - "loss": 4.2489, - "step": 181 - }, - { - "epoch": 0.06, - "learning_rate": 0.00011563197401087277, - "loss": 3.9877, - "step": 182 - }, - { - "epoch": 0.06, - "learning_rate": 0.00011628897386320728, - "loss": 4.057, - "step": 183 - }, - { - "epoch": 0.06, - "learning_rate": 0.00011694597371554178, - "loss": 4.1179, - "step": 184 - }, - { - "epoch": 0.06, - "learning_rate": 0.00011760297356787629, - "loss": 4.1351, - "step": 185 - }, - { - "epoch": 0.06, - "learning_rate": 0.00011825997342021079, - "loss": 4.1048, - "step": 186 - }, - { - "epoch": 0.06, - "learning_rate": 0.00011891697327254529, - 
"loss": 4.0181, - "step": 187 - }, - { - "epoch": 0.06, - "learning_rate": 0.0001195739731248798, - "loss": 3.973, - "step": 188 - }, - { - "epoch": 0.06, - "learning_rate": 0.0001202309729772143, - "loss": 3.9068, - "step": 189 - }, - { - "epoch": 0.06, - "learning_rate": 0.00012088797282954881, - "loss": 3.7849, - "step": 190 - }, - { - "epoch": 0.06, - "learning_rate": 0.00012154497268188332, - "loss": 3.9339, - "step": 191 - }, - { - "epoch": 0.06, - "learning_rate": 0.00012220197253421782, - "loss": 3.6721, - "step": 192 - }, - { - "epoch": 0.06, - "learning_rate": 0.00012285897238655232, - "loss": 3.7949, - "step": 193 - }, - { - "epoch": 0.06, - "learning_rate": 0.00012351597223888682, - "loss": 3.3948, - "step": 194 - }, - { - "epoch": 0.06, - "learning_rate": 0.00012417297209122132, - "loss": 3.6419, - "step": 195 - }, - { - "epoch": 0.06, - "learning_rate": 0.00012482997194355585, - "loss": 3.228, - "step": 196 - }, - { - "epoch": 0.06, - "learning_rate": 0.00012548697179589035, - "loss": 3.2732, - "step": 197 - }, - { - "epoch": 0.06, - "learning_rate": 0.00012614397164822485, - "loss": 3.3839, - "step": 198 - }, - { - "epoch": 0.06, - "learning_rate": 0.00012680097150055935, - "loss": 2.9711, - "step": 199 - }, - { - "epoch": 0.06, - "learning_rate": 0.00012745797135289385, - "loss": 2.9525, - "step": 200 - }, - { - "epoch": 0.06, - "learning_rate": 0.00012811497120522835, - "loss": 6.2523, - "step": 201 - }, - { - "epoch": 0.06, - "learning_rate": 0.00012877197105756288, - "loss": 5.485, - "step": 202 - }, - { - "epoch": 0.06, - "learning_rate": 0.00012942897090989738, - "loss": 5.1387, - "step": 203 - }, - { - "epoch": 0.06, - "learning_rate": 0.00013008597076223188, - "loss": 4.9817, - "step": 204 - }, - { - "epoch": 0.06, - "learning_rate": 0.00013074297061456638, - "loss": 4.5802, - "step": 205 - }, - { - "epoch": 0.06, - "learning_rate": 0.00013139997046690088, - "loss": 4.6599, - "step": 206 - }, - { - "epoch": 0.06, - "learning_rate": 0.00013205697031923539, - "loss": 4.5162, - "step": 207 - }, - { - "epoch": 0.06, - "learning_rate": 0.0001327139701715699, - "loss": 4.5406, - "step": 208 - }, - { - "epoch": 0.06, - "learning_rate": 0.00013337097002390441, - "loss": 4.7411, - "step": 209 - }, - { - "epoch": 0.06, - "learning_rate": 0.0001340279698762389, - "loss": 4.3877, - "step": 210 - }, - { - "epoch": 0.07, - "learning_rate": 0.0001346849697285734, - "loss": 4.4414, - "step": 211 - }, - { - "epoch": 0.07, - "learning_rate": 0.00013534196958090792, - "loss": 4.4552, - "step": 212 - }, - { - "epoch": 0.07, - "learning_rate": 0.00013599896943324242, - "loss": 4.3745, - "step": 213 - }, - { - "epoch": 0.07, - "learning_rate": 0.00013665596928557692, - "loss": 4.4989, - "step": 214 - }, - { - "epoch": 0.07, - "learning_rate": 0.00013731296913791142, - "loss": 4.34, - "step": 215 - }, - { - "epoch": 0.07, - "learning_rate": 0.00013796996899024592, - "loss": 4.4169, - "step": 216 - }, - { - "epoch": 0.07, - "learning_rate": 0.00013862696884258042, - "loss": 4.2321, - "step": 217 - }, - { - "epoch": 0.07, - "learning_rate": 0.00013928396869491495, - "loss": 4.1761, - "step": 218 - }, - { - "epoch": 0.07, - "learning_rate": 0.00013994096854724945, - "loss": 4.2764, - "step": 219 - }, - { - "epoch": 0.07, - "learning_rate": 0.00014059796839958395, - "loss": 4.3849, - "step": 220 - }, - { - "epoch": 0.07, - "learning_rate": 0.00014125496825191845, - "loss": 4.4056, - "step": 221 - }, - { - "epoch": 0.07, - "learning_rate": 0.00014191196810425295, - "loss": 4.2634, - "step": 222 
- }, - { - "epoch": 0.07, - "learning_rate": 0.00014256896795658745, - "loss": 4.5204, - "step": 223 - }, - { - "epoch": 0.07, - "learning_rate": 0.00014322596780892198, - "loss": 4.3385, - "step": 224 - }, - { - "epoch": 0.07, - "learning_rate": 0.00014388296766125648, - "loss": 4.3638, - "step": 225 - }, - { - "epoch": 0.07, - "learning_rate": 0.00014453996751359098, - "loss": 4.1867, - "step": 226 - }, - { - "epoch": 0.07, - "learning_rate": 0.00014519696736592548, - "loss": 4.4033, - "step": 227 - }, - { - "epoch": 0.07, - "learning_rate": 0.00014585396721825998, - "loss": 4.2058, - "step": 228 - }, - { - "epoch": 0.07, - "learning_rate": 0.00014651096707059448, - "loss": 4.313, - "step": 229 - }, - { - "epoch": 0.07, - "learning_rate": 0.00014716796692292898, - "loss": 4.2648, - "step": 230 - }, - { - "epoch": 0.07, - "learning_rate": 0.0001478249667752635, - "loss": 4.3322, - "step": 231 - }, - { - "epoch": 0.07, - "learning_rate": 0.000148481966627598, - "loss": 4.2088, - "step": 232 - }, - { - "epoch": 0.07, - "learning_rate": 0.0001491389664799325, - "loss": 4.3424, - "step": 233 - }, - { - "epoch": 0.07, - "learning_rate": 0.000149795966332267, - "loss": 3.9832, - "step": 234 - }, - { - "epoch": 0.07, - "learning_rate": 0.0001504529661846015, - "loss": 4.2187, - "step": 235 - }, - { - "epoch": 0.07, - "learning_rate": 0.000151109966036936, - "loss": 3.9592, - "step": 236 - }, - { - "epoch": 0.07, - "learning_rate": 0.00015176696588927054, - "loss": 4.0884, - "step": 237 - }, - { - "epoch": 0.07, - "learning_rate": 0.00015242396574160504, - "loss": 4.0517, - "step": 238 - }, - { - "epoch": 0.07, - "learning_rate": 0.00015308096559393954, - "loss": 4.1011, - "step": 239 - }, - { - "epoch": 0.07, - "learning_rate": 0.00015373796544627404, - "loss": 4.0743, - "step": 240 - }, - { - "epoch": 0.07, - "learning_rate": 0.00015439496529860854, - "loss": 3.7718, - "step": 241 - }, - { - "epoch": 0.07, - "learning_rate": 0.00015505196515094304, - "loss": 4.1137, - "step": 242 - }, - { - "epoch": 0.07, - "learning_rate": 0.00015570896500327754, - "loss": 3.8398, - "step": 243 - }, - { - "epoch": 0.08, - "learning_rate": 0.00015636596485561204, - "loss": 3.7475, - "step": 244 - }, - { - "epoch": 0.08, - "learning_rate": 0.00015702296470794654, - "loss": 3.3048, - "step": 245 - }, - { - "epoch": 0.08, - "learning_rate": 0.00015767996456028105, - "loss": 3.4871, - "step": 246 - }, - { - "epoch": 0.08, - "learning_rate": 0.00015833696441261557, - "loss": 3.4238, - "step": 247 - }, - { - "epoch": 0.08, - "learning_rate": 0.00015899396426495007, - "loss": 3.1184, - "step": 248 - }, - { - "epoch": 0.08, - "learning_rate": 0.00015965096411728457, - "loss": 2.9245, - "step": 249 - }, - { - "epoch": 0.08, - "learning_rate": 0.00016030796396961908, - "loss": 2.8914, - "step": 250 - }, - { - "epoch": 0.08, - "learning_rate": 0.00016096496382195358, - "loss": 5.7254, - "step": 251 - }, - { - "epoch": 0.08, - "learning_rate": 0.00016162196367428808, - "loss": 5.2385, - "step": 252 - }, - { - "epoch": 0.08, - "learning_rate": 0.0001622789635266226, - "loss": 4.9273, - "step": 253 - }, - { - "epoch": 0.08, - "learning_rate": 0.0001629359633789571, - "loss": 4.8504, - "step": 254 - }, - { - "epoch": 0.08, - "learning_rate": 0.0001635929632312916, - "loss": 4.7786, - "step": 255 - }, - { - "epoch": 0.08, - "learning_rate": 0.0001642499630836261, - "loss": 4.4117, - "step": 256 - }, - { - "epoch": 0.08, - "learning_rate": 0.0001649069629359606, - "loss": 4.6535, - "step": 257 - }, - { - "epoch": 0.08, - 
"learning_rate": 0.0001655639627882951, - "loss": 4.6439, - "step": 258 - }, - { - "epoch": 0.08, - "learning_rate": 0.0001662209626406296, - "loss": 4.4627, - "step": 259 - }, - { - "epoch": 0.08, - "learning_rate": 0.00016687796249296414, - "loss": 4.4111, - "step": 260 - }, - { - "epoch": 0.08, - "learning_rate": 0.00016753496234529864, - "loss": 4.338, - "step": 261 - }, - { - "epoch": 0.08, - "learning_rate": 0.00016819196219763314, - "loss": 4.4795, - "step": 262 - }, - { - "epoch": 0.08, - "learning_rate": 0.00016884896204996764, - "loss": 4.3954, - "step": 263 - }, - { - "epoch": 0.08, - "learning_rate": 0.00016950596190230214, - "loss": 4.517, - "step": 264 - }, - { - "epoch": 0.08, - "learning_rate": 0.00017016296175463664, - "loss": 4.4148, - "step": 265 - }, - { - "epoch": 0.08, - "learning_rate": 0.00017081996160697117, - "loss": 4.707, - "step": 266 - }, - { - "epoch": 0.08, - "learning_rate": 0.00017147696145930567, - "loss": 4.5414, - "step": 267 - }, - { - "epoch": 0.08, - "learning_rate": 0.00017213396131164017, - "loss": 4.38, - "step": 268 - }, - { - "epoch": 0.08, - "learning_rate": 0.00017279096116397467, - "loss": 4.6632, - "step": 269 - }, - { - "epoch": 0.08, - "learning_rate": 0.00017344796101630917, - "loss": 4.3069, - "step": 270 - }, - { - "epoch": 0.08, - "learning_rate": 0.00017410496086864367, - "loss": 4.4658, - "step": 271 - }, - { - "epoch": 0.08, - "learning_rate": 0.0001747619607209782, - "loss": 4.6071, - "step": 272 - }, - { - "epoch": 0.08, - "learning_rate": 0.0001754189605733127, - "loss": 4.6362, - "step": 273 - }, - { - "epoch": 0.08, - "learning_rate": 0.0001760759604256472, - "loss": 4.4037, - "step": 274 - }, - { - "epoch": 0.08, - "learning_rate": 0.0001767329602779817, - "loss": 4.2221, - "step": 275 - }, - { - "epoch": 0.09, - "learning_rate": 0.0001773899601303162, - "loss": 4.4658, - "step": 276 - }, - { - "epoch": 0.09, - "learning_rate": 0.0001780469599826507, - "loss": 4.3507, - "step": 277 - }, - { - "epoch": 0.09, - "learning_rate": 0.00017870395983498523, - "loss": 4.6712, - "step": 278 - }, - { - "epoch": 0.09, - "learning_rate": 0.00017936095968731973, - "loss": 4.4491, - "step": 279 - }, - { - "epoch": 0.09, - "learning_rate": 0.00018001795953965423, - "loss": 4.0762, - "step": 280 - }, - { - "epoch": 0.09, - "learning_rate": 0.00018067495939198873, - "loss": 4.2759, - "step": 281 - }, - { - "epoch": 0.09, - "learning_rate": 0.00018133195924432323, - "loss": 4.0968, - "step": 282 - }, - { - "epoch": 0.09, - "learning_rate": 0.00018198895909665773, - "loss": 4.2658, - "step": 283 - }, - { - "epoch": 0.09, - "learning_rate": 0.00018264595894899226, - "loss": 4.3617, - "step": 284 - }, - { - "epoch": 0.09, - "learning_rate": 0.00018330295880132676, - "loss": 3.9741, - "step": 285 - }, - { - "epoch": 0.09, - "learning_rate": 0.00018395995865366126, - "loss": 4.0995, - "step": 286 - }, - { - "epoch": 0.09, - "learning_rate": 0.00018461695850599576, - "loss": 3.9358, - "step": 287 - }, - { - "epoch": 0.09, - "learning_rate": 0.00018527395835833023, - "loss": 3.9463, - "step": 288 - }, - { - "epoch": 0.09, - "learning_rate": 0.00018593095821066474, - "loss": 4.2269, - "step": 289 - }, - { - "epoch": 0.09, - "learning_rate": 0.00018658795806299924, - "loss": 3.7091, - "step": 290 - }, - { - "epoch": 0.09, - "learning_rate": 0.00018724495791533374, - "loss": 3.7328, - "step": 291 - }, - { - "epoch": 0.09, - "learning_rate": 0.00018790195776766824, - "loss": 3.9004, - "step": 292 - }, - { - "epoch": 0.09, - "learning_rate": 
0.00018855895762000276, - "loss": 3.4115, - "step": 293 - }, - { - "epoch": 0.09, - "learning_rate": 0.00018921595747233727, - "loss": 3.309, - "step": 294 - }, - { - "epoch": 0.09, - "learning_rate": 0.00018987295732467177, - "loss": 3.7864, - "step": 295 - }, - { - "epoch": 0.09, - "learning_rate": 0.00019052995717700627, - "loss": 3.2292, - "step": 296 - }, - { - "epoch": 0.09, - "learning_rate": 0.00019118695702934077, - "loss": 3.3521, - "step": 297 - }, - { - "epoch": 0.09, - "learning_rate": 0.00019184395688167527, - "loss": 3.4121, - "step": 298 - }, - { - "epoch": 0.09, - "learning_rate": 0.0001925009567340098, - "loss": 3.2202, - "step": 299 - }, - { - "epoch": 0.09, - "learning_rate": 0.0001931579565863443, - "loss": 2.961, - "step": 300 - }, - { - "epoch": 0.09, - "learning_rate": 0.0001938149564386788, - "loss": 5.8341, - "step": 301 - }, - { - "epoch": 0.09, - "learning_rate": 0.0001944719562910133, - "loss": 5.1676, - "step": 302 - }, - { - "epoch": 0.09, - "learning_rate": 0.0001951289561433478, - "loss": 5.1416, - "step": 303 - }, - { - "epoch": 0.09, - "learning_rate": 0.0001957859559956823, - "loss": 4.8569, - "step": 304 - }, - { - "epoch": 0.09, - "learning_rate": 0.00019644295584801683, - "loss": 4.7374, - "step": 305 - }, - { - "epoch": 0.09, - "learning_rate": 0.00019709995570035133, - "loss": 4.6583, - "step": 306 - }, - { - "epoch": 0.09, - "learning_rate": 0.00019775695555268583, - "loss": 4.838, - "step": 307 - }, - { - "epoch": 0.1, - "learning_rate": 0.00019841395540502033, - "loss": 4.5544, - "step": 308 - }, - { - "epoch": 0.1, - "learning_rate": 0.00019907095525735483, - "loss": 4.6132, - "step": 309 - }, - { - "epoch": 0.1, - "learning_rate": 0.00019972795510968933, - "loss": 4.2333, - "step": 310 - }, - { - "epoch": 0.1, - "learning_rate": 0.00020038495496202386, - "loss": 4.4807, - "step": 311 - }, - { - "epoch": 0.1, - "learning_rate": 0.00020104195481435836, - "loss": 4.4115, - "step": 312 - }, - { - "epoch": 0.1, - "learning_rate": 0.00020169895466669286, - "loss": 5.0106, - "step": 313 - }, - { - "epoch": 0.1, - "learning_rate": 0.00020235595451902736, - "loss": 4.5739, - "step": 314 - }, - { - "epoch": 0.1, - "learning_rate": 0.00020301295437136186, - "loss": 4.4143, - "step": 315 - }, - { - "epoch": 0.1, - "learning_rate": 0.00020366995422369636, - "loss": 4.5588, - "step": 316 - }, - { - "epoch": 0.1, - "learning_rate": 0.0002043269540760309, - "loss": 4.5803, - "step": 317 - }, - { - "epoch": 0.1, - "learning_rate": 0.0002049839539283654, - "loss": 4.6836, - "step": 318 - }, - { - "epoch": 0.1, - "learning_rate": 0.0002056409537806999, - "loss": 4.305, - "step": 319 - }, - { - "epoch": 0.1, - "learning_rate": 0.0002062979536330344, - "loss": 4.2923, - "step": 320 - }, - { - "epoch": 0.1, - "learning_rate": 0.0002069549534853689, - "loss": 4.4126, - "step": 321 - }, - { - "epoch": 0.1, - "learning_rate": 0.0002076119533377034, - "loss": 4.3654, - "step": 322 - }, - { - "epoch": 0.1, - "learning_rate": 0.0002082689531900379, - "loss": 4.4962, - "step": 323 - }, - { - "epoch": 0.1, - "learning_rate": 0.00020892595304237242, - "loss": 4.1178, - "step": 324 - }, - { - "epoch": 0.1, - "learning_rate": 0.00020958295289470692, - "loss": 4.4859, - "step": 325 - }, - { - "epoch": 0.1, - "learning_rate": 0.00021023995274704142, - "loss": 4.273, - "step": 326 - }, - { - "epoch": 0.1, - "learning_rate": 0.00021089695259937592, - "loss": 4.5106, - "step": 327 - }, - { - "epoch": 0.1, - "learning_rate": 0.00021155395245171042, - "loss": 4.048, - "step": 328 - 
}, - { - "epoch": 0.1, - "learning_rate": 0.00021221095230404492, - "loss": 4.2437, - "step": 329 - }, - { - "epoch": 0.1, - "learning_rate": 0.00021286795215637945, - "loss": 4.1273, - "step": 330 - }, - { - "epoch": 0.1, - "learning_rate": 0.00021352495200871395, - "loss": 4.3951, - "step": 331 - }, - { - "epoch": 0.1, - "learning_rate": 0.00021418195186104845, - "loss": 4.1273, - "step": 332 - }, - { - "epoch": 0.1, - "learning_rate": 0.00021483895171338295, - "loss": 4.1606, - "step": 333 - }, - { - "epoch": 0.1, - "learning_rate": 0.00021549595156571745, - "loss": 3.9064, - "step": 334 - }, - { - "epoch": 0.1, - "learning_rate": 0.00021615295141805195, - "loss": 4.1578, - "step": 335 - }, - { - "epoch": 0.1, - "learning_rate": 0.00021680995127038648, - "loss": 3.7668, - "step": 336 - }, - { - "epoch": 0.1, - "learning_rate": 0.00021746695112272098, - "loss": 4.0668, - "step": 337 - }, - { - "epoch": 0.1, - "learning_rate": 0.00021812395097505548, - "loss": 4.1243, - "step": 338 - }, - { - "epoch": 0.1, - "learning_rate": 0.00021878095082738998, - "loss": 3.9848, - "step": 339 - }, - { - "epoch": 0.1, - "learning_rate": 0.00021943795067972448, - "loss": 3.7768, - "step": 340 - }, - { - "epoch": 0.11, - "learning_rate": 0.00022009495053205898, - "loss": 3.7398, - "step": 341 - }, - { - "epoch": 0.11, - "learning_rate": 0.0002207519503843935, - "loss": 3.6799, - "step": 342 - }, - { - "epoch": 0.11, - "learning_rate": 0.000221408950236728, - "loss": 3.4966, - "step": 343 - }, - { - "epoch": 0.11, - "learning_rate": 0.00022206595008906251, - "loss": 3.2034, - "step": 344 - }, - { - "epoch": 0.11, - "learning_rate": 0.00022272294994139701, - "loss": 3.4896, - "step": 345 - }, - { - "epoch": 0.11, - "learning_rate": 0.00022337994979373151, - "loss": 3.4473, - "step": 346 - }, - { - "epoch": 0.11, - "learning_rate": 0.00022403694964606602, - "loss": 3.4358, - "step": 347 - }, - { - "epoch": 0.11, - "learning_rate": 0.00022469394949840054, - "loss": 3.371, - "step": 348 - }, - { - "epoch": 0.11, - "learning_rate": 0.00022535094935073504, - "loss": 3.3348, - "step": 349 - }, - { - "epoch": 0.11, - "learning_rate": 0.0002260079492030695, - "loss": 3.4836, - "step": 350 - }, - { - "epoch": 0.11, - "learning_rate": 0.00022666494905540402, - "loss": 7.9123, - "step": 351 - }, - { - "epoch": 0.11, - "learning_rate": 0.00022732194890773852, - "loss": 6.0544, - "step": 352 - }, - { - "epoch": 0.11, - "learning_rate": 0.00022797894876007302, - "loss": 5.1407, - "step": 353 - }, - { - "epoch": 0.11, - "learning_rate": 0.00022863594861240752, - "loss": 4.9978, - "step": 354 - }, - { - "epoch": 0.11, - "learning_rate": 0.00022929294846474202, - "loss": 4.8652, - "step": 355 - }, - { - "epoch": 0.11, - "learning_rate": 0.00022994994831707652, - "loss": 4.7007, - "step": 356 - }, - { - "epoch": 0.11, - "learning_rate": 0.00023060694816941105, - "loss": 4.4921, - "step": 357 - }, - { - "epoch": 0.11, - "learning_rate": 0.00023126394802174555, - "loss": 4.6645, - "step": 358 - }, - { - "epoch": 0.11, - "learning_rate": 0.00023192094787408005, - "loss": 4.4648, - "step": 359 - }, - { - "epoch": 0.11, - "learning_rate": 0.00023257794772641455, - "loss": 4.6883, - "step": 360 - }, - { - "epoch": 0.11, - "learning_rate": 0.00023323494757874905, - "loss": 4.5958, - "step": 361 - }, - { - "epoch": 0.11, - "learning_rate": 0.00023389194743108355, - "loss": 4.5218, - "step": 362 - }, - { - "epoch": 0.11, - "learning_rate": 0.00023454894728341808, - "loss": 4.6279, - "step": 363 - }, - { - "epoch": 0.11, - 
"learning_rate": 0.00023520594713575258, - "loss": 4.5532, - "step": 364 - }, - { - "epoch": 0.11, - "learning_rate": 0.00023586294698808708, - "loss": 4.5643, - "step": 365 - }, - { - "epoch": 0.11, - "learning_rate": 0.00023651994684042158, - "loss": 4.6094, - "step": 366 - }, - { - "epoch": 0.11, - "learning_rate": 0.00023717694669275608, - "loss": 4.3253, - "step": 367 - }, - { - "epoch": 0.11, - "learning_rate": 0.00023783394654509058, - "loss": 4.4929, - "step": 368 - }, - { - "epoch": 0.11, - "learning_rate": 0.0002384909463974251, - "loss": 4.4787, - "step": 369 - }, - { - "epoch": 0.11, - "learning_rate": 0.0002391479462497596, - "loss": 4.6377, - "step": 370 - }, - { - "epoch": 0.11, - "learning_rate": 0.0002398049461020941, - "loss": 4.3335, - "step": 371 - }, - { - "epoch": 0.11, - "learning_rate": 0.0002404619459544286, - "loss": 4.3176, - "step": 372 - }, - { - "epoch": 0.12, - "learning_rate": 0.0002411189458067631, - "loss": 4.8085, - "step": 373 - }, - { - "epoch": 0.12, - "learning_rate": 0.00024177594565909761, - "loss": 4.5685, - "step": 374 - }, - { - "epoch": 0.12, - "learning_rate": 0.00024243294551143214, - "loss": 4.4958, - "step": 375 - }, - { - "epoch": 0.12, - "learning_rate": 0.00024308994536376664, - "loss": 4.5047, - "step": 376 - }, - { - "epoch": 0.12, - "learning_rate": 0.00024374694521610114, - "loss": 4.2801, - "step": 377 - }, - { - "epoch": 0.12, - "learning_rate": 0.00024440394506843564, - "loss": 4.4224, - "step": 378 - }, - { - "epoch": 0.12, - "learning_rate": 0.00024506094492077017, - "loss": 4.3793, - "step": 379 - }, - { - "epoch": 0.12, - "learning_rate": 0.00024571794477310464, - "loss": 4.1101, - "step": 380 - }, - { - "epoch": 0.12, - "learning_rate": 0.00024637494462543917, - "loss": 4.2544, - "step": 381 - }, - { - "epoch": 0.12, - "learning_rate": 0.00024703194447777365, - "loss": 4.4169, - "step": 382 - }, - { - "epoch": 0.12, - "learning_rate": 0.0002476889443301082, - "loss": 4.295, - "step": 383 - }, - { - "epoch": 0.12, - "learning_rate": 0.00024834594418244265, - "loss": 4.5854, - "step": 384 - }, - { - "epoch": 0.12, - "learning_rate": 0.0002490029440347772, - "loss": 4.1853, - "step": 385 - }, - { - "epoch": 0.12, - "learning_rate": 0.0002496599438871117, - "loss": 4.1355, - "step": 386 - }, - { - "epoch": 0.12, - "learning_rate": 0.0002503169437394462, - "loss": 4.1311, - "step": 387 - }, - { - "epoch": 0.12, - "learning_rate": 0.0002509739435917807, - "loss": 4.1599, - "step": 388 - }, - { - "epoch": 0.12, - "learning_rate": 0.0002516309434441152, - "loss": 3.9209, - "step": 389 - }, - { - "epoch": 0.12, - "learning_rate": 0.0002522879432964497, - "loss": 3.866, - "step": 390 - }, - { - "epoch": 0.12, - "learning_rate": 0.00025294494314878423, - "loss": 3.6069, - "step": 391 - }, - { - "epoch": 0.12, - "learning_rate": 0.0002536019430011187, - "loss": 3.8343, - "step": 392 - }, - { - "epoch": 0.12, - "learning_rate": 0.00025425894285345323, - "loss": 3.8334, - "step": 393 - }, - { - "epoch": 0.12, - "learning_rate": 0.0002549159427057877, - "loss": 3.6018, - "step": 394 - }, - { - "epoch": 0.12, - "learning_rate": 0.00025557294255812224, - "loss": 3.5376, - "step": 395 - }, - { - "epoch": 0.12, - "learning_rate": 0.0002562299424104567, - "loss": 3.48, - "step": 396 - }, - { - "epoch": 0.12, - "learning_rate": 0.00025688694226279124, - "loss": 3.5055, - "step": 397 - }, - { - "epoch": 0.12, - "learning_rate": 0.00025754394211512576, - "loss": 3.2152, - "step": 398 - }, - { - "epoch": 0.12, - "learning_rate": 
0.00025820094196746024, - "loss": 3.0883, - "step": 399 - }, - { - "epoch": 0.12, - "learning_rate": 0.00025885794181979477, - "loss": 3.0299, - "step": 400 - }, - { - "epoch": 0.12, - "learning_rate": 0.00025951494167212924, - "loss": 6.0321, - "step": 401 - }, - { - "epoch": 0.12, - "learning_rate": 0.00026017194152446377, - "loss": 5.3678, - "step": 402 - }, - { - "epoch": 0.12, - "learning_rate": 0.0002608289413767983, - "loss": 4.7082, - "step": 403 - }, - { - "epoch": 0.12, - "learning_rate": 0.00026148594122913277, - "loss": 4.8758, - "step": 404 - }, - { - "epoch": 0.12, - "learning_rate": 0.0002621429410814673, - "loss": 5.021, - "step": 405 - }, - { - "epoch": 0.13, - "learning_rate": 0.00026279994093380177, - "loss": 4.6093, - "step": 406 - }, - { - "epoch": 0.13, - "learning_rate": 0.0002634569407861363, - "loss": 4.705, - "step": 407 - }, - { - "epoch": 0.13, - "learning_rate": 0.00026411394063847077, - "loss": 4.652, - "step": 408 - }, - { - "epoch": 0.13, - "learning_rate": 0.0002647709404908053, - "loss": 4.554, - "step": 409 - }, - { - "epoch": 0.13, - "learning_rate": 0.0002654279403431398, - "loss": 4.4065, - "step": 410 - }, - { - "epoch": 0.13, - "learning_rate": 0.0002660849401954743, - "loss": 4.4372, - "step": 411 - }, - { - "epoch": 0.13, - "learning_rate": 0.00026674194004780883, - "loss": 4.3657, - "step": 412 - }, - { - "epoch": 0.13, - "learning_rate": 0.0002673989399001433, - "loss": 4.4588, - "step": 413 - }, - { - "epoch": 0.13, - "learning_rate": 0.0002680559397524778, - "loss": 4.3591, - "step": 414 - }, - { - "epoch": 0.13, - "learning_rate": 0.0002687129396048123, - "loss": 4.6762, - "step": 415 - }, - { - "epoch": 0.13, - "learning_rate": 0.0002693699394571468, - "loss": 4.4518, - "step": 416 - }, - { - "epoch": 0.13, - "learning_rate": 0.0002700269393094813, - "loss": 4.6972, - "step": 417 - }, - { - "epoch": 0.13, - "learning_rate": 0.00027068393916181583, - "loss": 4.4306, - "step": 418 - }, - { - "epoch": 0.13, - "learning_rate": 0.0002713409390141503, - "loss": 4.4664, - "step": 419 - }, - { - "epoch": 0.13, - "learning_rate": 0.00027199793886648483, - "loss": 4.4462, - "step": 420 - }, - { - "epoch": 0.13, - "learning_rate": 0.0002726549387188193, - "loss": 4.3648, - "step": 421 - }, - { - "epoch": 0.13, - "learning_rate": 0.00027331193857115383, - "loss": 4.3616, - "step": 422 - }, - { - "epoch": 0.13, - "learning_rate": 0.0002739689384234883, - "loss": 4.3607, - "step": 423 - }, - { - "epoch": 0.13, - "learning_rate": 0.00027462593827582284, - "loss": 4.1811, - "step": 424 - }, - { - "epoch": 0.13, - "learning_rate": 0.00027528293812815736, - "loss": 4.2743, - "step": 425 - }, - { - "epoch": 0.13, - "learning_rate": 0.00027593993798049184, - "loss": 3.996, - "step": 426 - }, - { - "epoch": 0.13, - "learning_rate": 0.00027659693783282636, - "loss": 4.3638, - "step": 427 - }, - { - "epoch": 0.13, - "learning_rate": 0.00027725393768516084, - "loss": 4.2065, - "step": 428 - }, - { - "epoch": 0.13, - "learning_rate": 0.00027791093753749537, - "loss": 4.3029, - "step": 429 - }, - { - "epoch": 0.13, - "learning_rate": 0.0002785679373898299, - "loss": 4.1737, - "step": 430 - }, - { - "epoch": 0.13, - "learning_rate": 0.00027922493724216437, - "loss": 4.1625, - "step": 431 - }, - { - "epoch": 0.13, - "learning_rate": 0.0002798819370944989, - "loss": 4.2028, - "step": 432 - }, - { - "epoch": 0.13, - "learning_rate": 0.00028053893694683337, - "loss": 4.2445, - "step": 433 - }, - { - "epoch": 0.13, - "learning_rate": 0.0002811959367991679, - "loss": 4.1621, 
- "step": 434 - }, - { - "epoch": 0.13, - "learning_rate": 0.00028185293665150237, - "loss": 4.2919, - "step": 435 - }, - { - "epoch": 0.13, - "learning_rate": 0.0002825099365038369, - "loss": 3.8902, - "step": 436 - }, - { - "epoch": 0.13, - "learning_rate": 0.0002831669363561714, - "loss": 4.1929, - "step": 437 - }, - { - "epoch": 0.14, - "learning_rate": 0.0002838239362085059, - "loss": 4.1263, - "step": 438 - }, - { - "epoch": 0.14, - "learning_rate": 0.0002844809360608404, - "loss": 4.0029, - "step": 439 - }, - { - "epoch": 0.14, - "learning_rate": 0.0002851379359131749, - "loss": 3.9629, - "step": 440 - }, - { - "epoch": 0.14, - "learning_rate": 0.0002857949357655094, - "loss": 3.9936, - "step": 441 - }, - { - "epoch": 0.14, - "learning_rate": 0.00028645193561784395, - "loss": 3.8461, - "step": 442 - }, - { - "epoch": 0.14, - "learning_rate": 0.00028710893547017843, - "loss": 3.7729, - "step": 443 - }, - { - "epoch": 0.14, - "learning_rate": 0.00028776593532251296, - "loss": 3.8678, - "step": 444 - }, - { - "epoch": 0.14, - "learning_rate": 0.00028842293517484743, - "loss": 3.8235, - "step": 445 - }, - { - "epoch": 0.14, - "learning_rate": 0.00028907993502718196, - "loss": 3.6168, - "step": 446 - }, - { - "epoch": 0.14, - "learning_rate": 0.00028973693487951643, - "loss": 4.0479, - "step": 447 - }, - { - "epoch": 0.14, - "learning_rate": 0.00029039393473185096, - "loss": 3.4743, - "step": 448 - }, - { - "epoch": 0.14, - "learning_rate": 0.0002910509345841855, - "loss": 3.3913, - "step": 449 - }, - { - "epoch": 0.14, - "learning_rate": 0.00029170793443651996, - "loss": 3.0779, - "step": 450 - }, - { - "epoch": 0.14, - "learning_rate": 0.0002923649342888545, - "loss": 6.6536, - "step": 451 - }, - { - "epoch": 0.14, - "learning_rate": 0.00029302193414118896, - "loss": 5.6577, - "step": 452 - }, - { - "epoch": 0.14, - "learning_rate": 0.0002936789339935235, - "loss": 5.1422, - "step": 453 - }, - { - "epoch": 0.14, - "learning_rate": 0.00029433593384585796, - "loss": 5.2125, - "step": 454 - }, - { - "epoch": 0.14, - "learning_rate": 0.0002949929336981925, - "loss": 4.9079, - "step": 455 - }, - { - "epoch": 0.14, - "learning_rate": 0.000295649933550527, - "loss": 4.9266, - "step": 456 - }, - { - "epoch": 0.14, - "learning_rate": 0.0002963069334028615, - "loss": 4.7769, - "step": 457 - }, - { - "epoch": 0.14, - "learning_rate": 0.000296963933255196, - "loss": 4.7966, - "step": 458 - }, - { - "epoch": 0.14, - "learning_rate": 0.0002976209331075305, - "loss": 4.7017, - "step": 459 - }, - { - "epoch": 0.14, - "learning_rate": 0.000298277932959865, - "loss": 4.6102, - "step": 460 - }, - { - "epoch": 0.14, - "learning_rate": 0.00029893493281219955, - "loss": 4.6471, - "step": 461 - }, - { - "epoch": 0.14, - "learning_rate": 0.000299591932664534, - "loss": 4.2794, - "step": 462 - }, - { - "epoch": 0.14, - "learning_rate": 0.00030024893251686855, - "loss": 4.8004, - "step": 463 - }, - { - "epoch": 0.14, - "learning_rate": 0.000300905932369203, - "loss": 4.6793, - "step": 464 - }, - { - "epoch": 0.14, - "learning_rate": 0.00030156293222153755, - "loss": 4.4926, - "step": 465 - }, - { - "epoch": 0.14, - "learning_rate": 0.000302219932073872, - "loss": 4.2163, - "step": 466 - }, - { - "epoch": 0.14, - "learning_rate": 0.00030287693192620655, - "loss": 4.5306, - "step": 467 - }, - { - "epoch": 0.14, - "learning_rate": 0.0003035339317785411, - "loss": 4.4579, - "step": 468 - }, - { - "epoch": 0.14, - "learning_rate": 0.00030419093163087555, - "loss": 4.4814, - "step": 469 - }, - { - "epoch": 0.15, - 
"learning_rate": 0.0003048479314832101, - "loss": 4.578, - "step": 470 - }, - { - "epoch": 0.15, - "learning_rate": 0.00030550493133554455, - "loss": 4.4622, - "step": 471 - }, - { - "epoch": 0.15, - "learning_rate": 0.0003061619311878791, - "loss": 4.4143, - "step": 472 - }, - { - "epoch": 0.15, - "learning_rate": 0.0003068189310402136, - "loss": 4.4382, - "step": 473 - }, - { - "epoch": 0.15, - "learning_rate": 0.0003074759308925481, - "loss": 4.4204, - "step": 474 - }, - { - "epoch": 0.15, - "learning_rate": 0.00030813293074488256, - "loss": 4.3629, - "step": 475 - }, - { - "epoch": 0.15, - "learning_rate": 0.0003087899305972171, - "loss": 4.4076, - "step": 476 - }, - { - "epoch": 0.15, - "learning_rate": 0.00030944693044955156, - "loss": 4.279, - "step": 477 - }, - { - "epoch": 0.15, - "learning_rate": 0.0003101039303018861, - "loss": 4.4649, - "step": 478 - }, - { - "epoch": 0.15, - "learning_rate": 0.00031076093015422056, - "loss": 4.2617, - "step": 479 - }, - { - "epoch": 0.15, - "learning_rate": 0.0003114179300065551, - "loss": 4.1989, - "step": 480 - }, - { - "epoch": 0.15, - "learning_rate": 0.00031207492985888956, - "loss": 4.1309, - "step": 481 - }, - { - "epoch": 0.15, - "learning_rate": 0.0003127319297112241, - "loss": 4.254, - "step": 482 - }, - { - "epoch": 0.15, - "learning_rate": 0.0003133889295635586, - "loss": 4.1839, - "step": 483 - }, - { - "epoch": 0.15, - "learning_rate": 0.0003140459294158931, - "loss": 4.1503, - "step": 484 - }, - { - "epoch": 0.15, - "learning_rate": 0.0003147029292682276, - "loss": 4.1915, - "step": 485 - }, - { - "epoch": 0.15, - "learning_rate": 0.0003153599291205621, - "loss": 4.2792, - "step": 486 - }, - { - "epoch": 0.15, - "learning_rate": 0.0003160169289728966, - "loss": 4.1449, - "step": 487 - }, - { - "epoch": 0.15, - "learning_rate": 0.00031667392882523115, - "loss": 4.0662, - "step": 488 - }, - { - "epoch": 0.15, - "learning_rate": 0.0003173309286775656, - "loss": 4.2186, - "step": 489 - }, - { - "epoch": 0.15, - "learning_rate": 0.00031798792852990015, - "loss": 4.0839, - "step": 490 - }, - { - "epoch": 0.15, - "learning_rate": 0.0003186449283822346, - "loss": 4.0271, - "step": 491 - }, - { - "epoch": 0.15, - "learning_rate": 0.00031930192823456915, - "loss": 3.5625, - "step": 492 - }, - { - "epoch": 0.15, - "learning_rate": 0.0003199589280869036, - "loss": 3.7838, - "step": 493 - }, - { - "epoch": 0.15, - "learning_rate": 0.00032061592793923815, - "loss": 3.8771, - "step": 494 - }, - { - "epoch": 0.15, - "learning_rate": 0.0003212729277915727, - "loss": 3.8744, - "step": 495 - }, - { - "epoch": 0.15, - "learning_rate": 0.00032192992764390715, - "loss": 3.3559, - "step": 496 - }, - { - "epoch": 0.15, - "learning_rate": 0.0003225869274962417, - "loss": 3.3418, - "step": 497 - }, - { - "epoch": 0.15, - "learning_rate": 0.00032324392734857615, - "loss": 3.363, - "step": 498 - }, - { - "epoch": 0.15, - "learning_rate": 0.0003239009272009107, - "loss": 3.0908, - "step": 499 - }, - { - "epoch": 0.15, - "learning_rate": 0.0003245579270532452, - "loss": 3.1046, - "step": 500 - }, - { - "epoch": 0.15, - "eval_bleu": 0.0, - "eval_loss": 5.527187824249268, - "eval_runtime": 1371.8331, - "eval_samples_per_second": 10.759, - "eval_steps_per_second": 1.345, - "step": 500 - }, - { - "epoch": 0.15, - "learning_rate": 0.0003252149269055797, - "loss": 6.4631, - "step": 501 - }, - { - "epoch": 0.15, - "learning_rate": 0.0003258719267579142, - "loss": 5.9971, - "step": 502 - }, - { - "epoch": 0.16, - "learning_rate": 0.0003265289266102487, - "loss": 
5.3966, - "step": 503 - }, - { - "epoch": 0.16, - "learning_rate": 0.0003271859264625832, - "loss": 5.2348, - "step": 504 - }, - { - "epoch": 0.16, - "learning_rate": 0.0003278429263149177, - "loss": 4.9756, - "step": 505 - }, - { - "epoch": 0.16, - "learning_rate": 0.0003284999261672522, - "loss": 4.8721, - "step": 506 - }, - { - "epoch": 0.16, - "learning_rate": 0.000328464297108015, - "loss": 5.1726, - "step": 507 - }, - { - "epoch": 0.16, - "learning_rate": 0.0003284286680487778, - "loss": 4.8913, - "step": 508 - }, - { - "epoch": 0.16, - "learning_rate": 0.0003283930389895405, - "loss": 4.8133, - "step": 509 - }, - { - "epoch": 0.16, - "learning_rate": 0.00032835740993030334, - "loss": 4.5986, - "step": 510 - }, - { - "epoch": 0.16, - "learning_rate": 0.00032832178087106607, - "loss": 4.7957, - "step": 511 - }, - { - "epoch": 0.16, - "learning_rate": 0.00032828615181182885, - "loss": 4.6052, - "step": 512 - }, - { - "epoch": 0.16, - "learning_rate": 0.00032825052275259163, - "loss": 4.611, - "step": 513 - }, - { - "epoch": 0.16, - "learning_rate": 0.00032821489369335436, - "loss": 4.6585, - "step": 514 - }, - { - "epoch": 0.16, - "learning_rate": 0.00032817926463411714, - "loss": 4.2312, - "step": 515 - }, - { - "epoch": 0.16, - "learning_rate": 0.0003281436355748799, - "loss": 4.5522, - "step": 516 - }, - { - "epoch": 0.16, - "learning_rate": 0.0003281080065156427, - "loss": 4.6756, - "step": 517 - }, - { - "epoch": 0.16, - "learning_rate": 0.0003280723774564055, - "loss": 4.3275, - "step": 518 - }, - { - "epoch": 0.16, - "learning_rate": 0.0003280367483971682, - "loss": 4.5071, - "step": 519 - }, - { - "epoch": 0.16, - "learning_rate": 0.000328001119337931, - "loss": 4.4637, - "step": 520 - }, - { - "epoch": 0.16, - "learning_rate": 0.0003279654902786938, - "loss": 4.5248, - "step": 521 - }, - { - "epoch": 0.16, - "learning_rate": 0.00032792986121945656, - "loss": 4.482, - "step": 522 - }, - { - "epoch": 0.16, - "learning_rate": 0.00032789423216021934, - "loss": 4.5743, - "step": 523 - }, - { - "epoch": 0.16, - "learning_rate": 0.00032785860310098207, - "loss": 4.7417, - "step": 524 - }, - { - "epoch": 0.16, - "learning_rate": 0.00032782297404174485, - "loss": 4.6575, - "step": 525 - }, - { - "epoch": 0.16, - "learning_rate": 0.00032778734498250763, - "loss": 4.5024, - "step": 526 - }, - { - "epoch": 0.16, - "learning_rate": 0.0003277517159232704, - "loss": 4.5208, - "step": 527 - }, - { - "epoch": 0.16, - "learning_rate": 0.00032771608686403314, - "loss": 4.4044, - "step": 528 - }, - { - "epoch": 0.16, - "learning_rate": 0.0003276804578047959, - "loss": 4.2717, - "step": 529 - }, - { - "epoch": 0.16, - "learning_rate": 0.0003276448287455587, - "loss": 4.3591, - "step": 530 - }, - { - "epoch": 0.16, - "learning_rate": 0.0003276091996863215, - "loss": 4.2841, - "step": 531 - }, - { - "epoch": 0.16, - "learning_rate": 0.00032757357062708427, - "loss": 4.0601, - "step": 532 - }, - { - "epoch": 0.16, - "learning_rate": 0.000327537941567847, - "loss": 4.2274, - "step": 533 - }, - { - "epoch": 0.16, - "learning_rate": 0.00032750231250860983, - "loss": 4.2085, - "step": 534 - }, - { - "epoch": 0.17, - "learning_rate": 0.00032746668344937256, - "loss": 3.9464, - "step": 535 - }, - { - "epoch": 0.17, - "learning_rate": 0.00032743105439013534, - "loss": 4.2287, - "step": 536 - }, - { - "epoch": 0.17, - "learning_rate": 0.0003273954253308981, - "loss": 4.4058, - "step": 537 - }, - { - "epoch": 0.17, - "learning_rate": 0.00032735979627166085, - "loss": 4.0904, - "step": 538 - }, - { - "epoch": 
0.17, - "learning_rate": 0.0003273241672124237, - "loss": 3.8756, - "step": 539 - }, - { - "epoch": 0.17, - "learning_rate": 0.0003272885381531864, - "loss": 4.1717, - "step": 540 - }, - { - "epoch": 0.17, - "learning_rate": 0.0003272529090939492, - "loss": 3.8294, - "step": 541 - }, - { - "epoch": 0.17, - "learning_rate": 0.000327217280034712, - "loss": 3.8332, - "step": 542 - }, - { - "epoch": 0.17, - "learning_rate": 0.0003271816509754747, - "loss": 3.6963, - "step": 543 - }, - { - "epoch": 0.17, - "learning_rate": 0.00032714602191623754, - "loss": 3.752, - "step": 544 - }, - { - "epoch": 0.17, - "learning_rate": 0.00032711039285700027, - "loss": 3.6952, - "step": 545 - }, - { - "epoch": 0.17, - "learning_rate": 0.00032707476379776305, - "loss": 3.6828, - "step": 546 - }, - { - "epoch": 0.17, - "learning_rate": 0.00032703913473852584, - "loss": 3.2342, - "step": 547 - }, - { - "epoch": 0.17, - "learning_rate": 0.00032700350567928856, - "loss": 3.1144, - "step": 548 - }, - { - "epoch": 0.17, - "learning_rate": 0.00032696787662005134, - "loss": 3.1742, - "step": 549 - }, - { - "epoch": 0.17, - "learning_rate": 0.0003269322475608141, - "loss": 3.0925, - "step": 550 - }, - { - "epoch": 0.17, - "learning_rate": 0.0003268966185015769, - "loss": 6.8604, - "step": 551 - }, - { - "epoch": 0.17, - "learning_rate": 0.0003268609894423397, - "loss": 5.9804, - "step": 552 - }, - { - "epoch": 0.17, - "learning_rate": 0.00032682536038310247, - "loss": 5.3944, - "step": 553 - }, - { - "epoch": 0.17, - "learning_rate": 0.0003267897313238652, - "loss": 5.124, - "step": 554 - }, - { - "epoch": 0.17, - "learning_rate": 0.000326754102264628, - "loss": 4.8878, - "step": 555 - }, - { - "epoch": 0.17, - "learning_rate": 0.00032671847320539076, - "loss": 4.6555, - "step": 556 - }, - { - "epoch": 0.17, - "learning_rate": 0.00032668284414615355, - "loss": 4.8814, - "step": 557 - }, - { - "epoch": 0.17, - "learning_rate": 0.00032664721508691633, - "loss": 4.7107, - "step": 558 - }, - { - "epoch": 0.17, - "learning_rate": 0.00032661158602767905, - "loss": 4.7047, - "step": 559 - }, - { - "epoch": 0.17, - "learning_rate": 0.00032657595696844184, - "loss": 4.5857, - "step": 560 - }, - { - "epoch": 0.17, - "learning_rate": 0.0003265403279092046, - "loss": 4.7638, - "step": 561 - }, - { - "epoch": 0.17, - "learning_rate": 0.00032650469884996735, - "loss": 4.5761, - "step": 562 - }, - { - "epoch": 0.17, - "learning_rate": 0.0003264690697907302, - "loss": 4.6643, - "step": 563 - }, - { - "epoch": 0.17, - "learning_rate": 0.0003264334407314929, - "loss": 4.6397, - "step": 564 - }, - { - "epoch": 0.17, - "learning_rate": 0.0003263978116722557, - "loss": 4.5706, - "step": 565 - }, - { - "epoch": 0.17, - "learning_rate": 0.0003263621826130185, - "loss": 4.3672, - "step": 566 - }, - { - "epoch": 0.17, - "learning_rate": 0.0003263265535537812, - "loss": 4.5335, - "step": 567 - }, - { - "epoch": 0.18, - "learning_rate": 0.00032629092449454404, - "loss": 4.5029, - "step": 568 - }, - { - "epoch": 0.18, - "learning_rate": 0.00032625529543530677, - "loss": 4.679, - "step": 569 - }, - { - "epoch": 0.18, - "learning_rate": 0.00032621966637606955, - "loss": 4.6395, - "step": 570 - }, - { - "epoch": 0.18, - "learning_rate": 0.00032618403731683233, - "loss": 4.439, - "step": 571 - }, - { - "epoch": 0.18, - "learning_rate": 0.00032614840825759506, - "loss": 4.3253, - "step": 572 - }, - { - "epoch": 0.18, - "learning_rate": 0.0003261127791983579, - "loss": 4.4057, - "step": 573 - }, - { - "epoch": 0.18, - "learning_rate": 
0.0003260771501391206, - "loss": 4.4629, - "step": 574 - }, - { - "epoch": 0.18, - "learning_rate": 0.0003260415210798834, - "loss": 4.286, - "step": 575 - }, - { - "epoch": 0.18, - "learning_rate": 0.0003260058920206462, - "loss": 4.2038, - "step": 576 - }, - { - "epoch": 0.18, - "learning_rate": 0.00032597026296140897, - "loss": 4.3508, - "step": 577 - }, - { - "epoch": 0.18, - "learning_rate": 0.00032593463390217175, - "loss": 4.4798, - "step": 578 - }, - { - "epoch": 0.18, - "learning_rate": 0.0003258990048429345, - "loss": 4.281, - "step": 579 - }, - { - "epoch": 0.18, - "learning_rate": 0.00032586337578369726, - "loss": 4.2855, - "step": 580 - }, - { - "epoch": 0.18, - "learning_rate": 0.00032582774672446004, - "loss": 4.3629, - "step": 581 - }, - { - "epoch": 0.18, - "learning_rate": 0.0003257921176652228, - "loss": 4.095, - "step": 582 - }, - { - "epoch": 0.18, - "learning_rate": 0.00032575648860598555, - "loss": 4.2636, - "step": 583 - }, - { - "epoch": 0.18, - "learning_rate": 0.00032572085954674833, - "loss": 4.3922, - "step": 584 - }, - { - "epoch": 0.18, - "learning_rate": 0.0003256852304875111, - "loss": 4.4354, - "step": 585 - }, - { - "epoch": 0.18, - "learning_rate": 0.0003256496014282739, - "loss": 4.2108, - "step": 586 - }, - { - "epoch": 0.18, - "learning_rate": 0.0003256139723690367, - "loss": 4.1252, - "step": 587 - }, - { - "epoch": 0.18, - "learning_rate": 0.0003255783433097994, - "loss": 4.2144, - "step": 588 - }, - { - "epoch": 0.18, - "learning_rate": 0.0003255427142505622, - "loss": 4.1965, - "step": 589 - }, - { - "epoch": 0.18, - "learning_rate": 0.00032550708519132497, - "loss": 3.8201, - "step": 590 - }, - { - "epoch": 0.18, - "learning_rate": 0.00032547145613208775, - "loss": 4.0785, - "step": 591 - }, - { - "epoch": 0.18, - "learning_rate": 0.00032543582707285053, - "loss": 3.647, - "step": 592 - }, - { - "epoch": 0.18, - "learning_rate": 0.00032540019801361326, - "loss": 3.8259, - "step": 593 - }, - { - "epoch": 0.18, - "learning_rate": 0.00032536456895437604, - "loss": 3.3992, - "step": 594 - }, - { - "epoch": 0.18, - "learning_rate": 0.0003253289398951388, - "loss": 3.8117, - "step": 595 - }, - { - "epoch": 0.18, - "learning_rate": 0.0003252933108359016, - "loss": 3.5423, - "step": 596 - }, - { - "epoch": 0.18, - "learning_rate": 0.0003252576817766644, - "loss": 3.2909, - "step": 597 - }, - { - "epoch": 0.18, - "learning_rate": 0.0003252220527174271, - "loss": 3.2371, - "step": 598 - }, - { - "epoch": 0.18, - "learning_rate": 0.0003251864236581899, - "loss": 3.1458, - "step": 599 - }, - { - "epoch": 0.19, - "learning_rate": 0.0003251507945989527, - "loss": 2.8437, - "step": 600 - }, - { - "epoch": 0.19, - "learning_rate": 0.00032511516553971546, - "loss": 6.1913, - "step": 601 - }, - { - "epoch": 0.19, - "learning_rate": 0.00032507953648047824, - "loss": 5.8923, - "step": 602 - }, - { - "epoch": 0.19, - "learning_rate": 0.00032504390742124097, - "loss": 5.2444, - "step": 603 - }, - { - "epoch": 0.19, - "learning_rate": 0.00032500827836200375, - "loss": 4.9218, - "step": 604 - }, - { - "epoch": 0.19, - "learning_rate": 0.00032497264930276653, - "loss": 4.584, - "step": 605 - }, - { - "epoch": 0.19, - "learning_rate": 0.0003249370202435293, - "loss": 4.8353, - "step": 606 - }, - { - "epoch": 0.19, - "learning_rate": 0.0003249013911842921, - "loss": 4.5922, - "step": 607 - }, - { - "epoch": 0.19, - "learning_rate": 0.0003248657621250548, - "loss": 4.8958, - "step": 608 - }, - { - "epoch": 0.19, - "learning_rate": 0.0003248301330658176, - "loss": 4.5731, - 
"step": 609 - }, - { - "epoch": 0.19, - "learning_rate": 0.0003247945040065804, - "loss": 4.4427, - "step": 610 - }, - { - "epoch": 0.19, - "learning_rate": 0.00032475887494734317, - "loss": 4.6072, - "step": 611 - }, - { - "epoch": 0.19, - "learning_rate": 0.00032472324588810595, - "loss": 4.644, - "step": 612 - }, - { - "epoch": 0.19, - "learning_rate": 0.0003246876168288687, - "loss": 4.4587, - "step": 613 - }, - { - "epoch": 0.19, - "learning_rate": 0.00032465198776963146, - "loss": 4.6023, - "step": 614 - }, - { - "epoch": 0.19, - "learning_rate": 0.00032461635871039424, - "loss": 4.522, - "step": 615 - }, - { - "epoch": 0.19, - "learning_rate": 0.000324580729651157, - "loss": 4.7412, - "step": 616 - }, - { - "epoch": 0.19, - "learning_rate": 0.00032454510059191975, - "loss": 4.4786, - "step": 617 - }, - { - "epoch": 0.19, - "learning_rate": 0.00032450947153268254, - "loss": 4.3927, - "step": 618 - }, - { - "epoch": 0.19, - "learning_rate": 0.0003244738424734453, - "loss": 4.3706, - "step": 619 - }, - { - "epoch": 0.19, - "learning_rate": 0.0003244382134142081, - "loss": 4.4966, - "step": 620 - }, - { - "epoch": 0.19, - "learning_rate": 0.0003244025843549709, - "loss": 4.6049, - "step": 621 - }, - { - "epoch": 0.19, - "learning_rate": 0.0003243669552957336, - "loss": 4.4932, - "step": 622 - }, - { - "epoch": 0.19, - "learning_rate": 0.0003243313262364964, - "loss": 4.3992, - "step": 623 - }, - { - "epoch": 0.19, - "learning_rate": 0.00032429569717725917, - "loss": 4.4008, - "step": 624 - }, - { - "epoch": 0.19, - "learning_rate": 0.00032426006811802195, - "loss": 4.5263, - "step": 625 - }, - { - "epoch": 0.19, - "learning_rate": 0.00032422443905878474, - "loss": 4.1794, - "step": 626 - }, - { - "epoch": 0.19, - "learning_rate": 0.00032418880999954746, - "loss": 4.5676, - "step": 627 - }, - { - "epoch": 0.19, - "learning_rate": 0.0003241531809403103, - "loss": 4.1694, - "step": 628 - }, - { - "epoch": 0.19, - "learning_rate": 0.00032411755188107303, - "loss": 4.3338, - "step": 629 - }, - { - "epoch": 0.19, - "learning_rate": 0.0003240819228218358, - "loss": 4.7834, - "step": 630 - }, - { - "epoch": 0.19, - "learning_rate": 0.0003240462937625986, - "loss": 4.6943, - "step": 631 - }, - { - "epoch": 0.2, - "learning_rate": 0.0003240106647033613, - "loss": 4.3402, - "step": 632 - }, - { - "epoch": 0.2, - "learning_rate": 0.00032397503564412415, - "loss": 4.2537, - "step": 633 - }, - { - "epoch": 0.2, - "learning_rate": 0.0003239394065848869, - "loss": 4.1278, - "step": 634 - }, - { - "epoch": 0.2, - "learning_rate": 0.00032390377752564966, - "loss": 3.9416, - "step": 635 - }, - { - "epoch": 0.2, - "learning_rate": 0.00032386814846641245, - "loss": 4.2324, - "step": 636 - }, - { - "epoch": 0.2, - "learning_rate": 0.0003238325194071752, - "loss": 4.0345, - "step": 637 - }, - { - "epoch": 0.2, - "learning_rate": 0.000323796890347938, - "loss": 4.0859, - "step": 638 - }, - { - "epoch": 0.2, - "learning_rate": 0.00032376126128870074, - "loss": 3.9759, - "step": 639 - }, - { - "epoch": 0.2, - "learning_rate": 0.0003237256322294635, - "loss": 3.9151, - "step": 640 - }, - { - "epoch": 0.2, - "learning_rate": 0.0003236900031702263, - "loss": 3.6031, - "step": 641 - }, - { - "epoch": 0.2, - "learning_rate": 0.00032365437411098903, - "loss": 3.9811, - "step": 642 - }, - { - "epoch": 0.2, - "learning_rate": 0.0003236187450517518, - "loss": 3.7555, - "step": 643 - }, - { - "epoch": 0.2, - "learning_rate": 0.0003235831159925146, - "loss": 3.6389, - "step": 644 - }, - { - "epoch": 0.2, - "learning_rate": 
0.0003235474869332774, - "loss": 3.5065, - "step": 645 - }, - { - "epoch": 0.2, - "learning_rate": 0.00032351185787404016, - "loss": 3.4343, - "step": 646 - }, - { - "epoch": 0.2, - "learning_rate": 0.00032347622881480294, - "loss": 3.3052, - "step": 647 - }, - { - "epoch": 0.2, - "learning_rate": 0.00032344059975556567, - "loss": 3.3348, - "step": 648 - }, - { - "epoch": 0.2, - "learning_rate": 0.00032340497069632845, - "loss": 3.173, - "step": 649 - }, - { - "epoch": 0.2, - "learning_rate": 0.00032336934163709123, - "loss": 2.7573, - "step": 650 - }, - { - "epoch": 0.2, - "learning_rate": 0.00032333371257785396, - "loss": 6.2311, - "step": 651 - }, - { - "epoch": 0.2, - "learning_rate": 0.0003232980835186168, - "loss": 5.9324, - "step": 652 - }, - { - "epoch": 0.2, - "learning_rate": 0.0003232624544593795, - "loss": 5.4267, - "step": 653 - }, - { - "epoch": 0.2, - "learning_rate": 0.0003232268254001423, - "loss": 5.29, - "step": 654 - }, - { - "epoch": 0.2, - "learning_rate": 0.0003231911963409051, - "loss": 5.1996, - "step": 655 - }, - { - "epoch": 0.2, - "learning_rate": 0.0003231555672816678, - "loss": 4.7653, - "step": 656 - }, - { - "epoch": 0.2, - "learning_rate": 0.00032311993822243065, - "loss": 4.8765, - "step": 657 - }, - { - "epoch": 0.2, - "learning_rate": 0.0003230843091631934, - "loss": 4.8486, - "step": 658 - }, - { - "epoch": 0.2, - "learning_rate": 0.00032304868010395616, - "loss": 4.5627, - "step": 659 - }, - { - "epoch": 0.2, - "learning_rate": 0.00032301305104471894, - "loss": 4.7988, - "step": 660 - }, - { - "epoch": 0.2, - "learning_rate": 0.00032297742198548167, - "loss": 4.6356, - "step": 661 - }, - { - "epoch": 0.2, - "learning_rate": 0.0003229417929262445, - "loss": 4.5247, - "step": 662 - }, - { - "epoch": 0.2, - "learning_rate": 0.00032290616386700723, - "loss": 4.8718, - "step": 663 - }, - { - "epoch": 0.2, - "learning_rate": 0.00032287053480777, - "loss": 4.5409, - "step": 664 - }, - { - "epoch": 0.21, - "learning_rate": 0.0003228349057485328, - "loss": 4.4832, - "step": 665 - }, - { - "epoch": 0.21, - "learning_rate": 0.0003227992766892955, - "loss": 4.5016, - "step": 666 - }, - { - "epoch": 0.21, - "learning_rate": 0.00032276364763005836, - "loss": 4.7042, - "step": 667 - }, - { - "epoch": 0.21, - "learning_rate": 0.0003227280185708211, - "loss": 4.4653, - "step": 668 - }, - { - "epoch": 0.21, - "learning_rate": 0.00032269238951158387, - "loss": 4.3768, - "step": 669 - }, - { - "epoch": 0.21, - "learning_rate": 0.00032265676045234665, - "loss": 4.6229, - "step": 670 - }, - { - "epoch": 0.21, - "learning_rate": 0.00032262113139310943, - "loss": 4.5951, - "step": 671 - }, - { - "epoch": 0.21, - "learning_rate": 0.0003225855023338722, - "loss": 4.5205, - "step": 672 - }, - { - "epoch": 0.21, - "learning_rate": 0.00032254987327463494, - "loss": 4.4447, - "step": 673 - }, - { - "epoch": 0.21, - "learning_rate": 0.0003225142442153977, - "loss": 4.739, - "step": 674 - }, - { - "epoch": 0.21, - "learning_rate": 0.0003224786151561605, - "loss": 4.4509, - "step": 675 - }, - { - "epoch": 0.21, - "learning_rate": 0.0003224429860969233, - "loss": 4.7115, - "step": 676 - }, - { - "epoch": 0.21, - "learning_rate": 0.000322407357037686, - "loss": 4.3424, - "step": 677 - }, - { - "epoch": 0.21, - "learning_rate": 0.0003223717279784488, - "loss": 4.4603, - "step": 678 - }, - { - "epoch": 0.21, - "learning_rate": 0.0003223360989192116, - "loss": 4.614, - "step": 679 - }, - { - "epoch": 0.21, - "learning_rate": 0.00032230046985997436, - "loss": 4.4052, - "step": 680 - }, - { 
- "epoch": 0.21, - "learning_rate": 0.00032226484080073714, - "loss": 4.7224, - "step": 681 - }, - { - "epoch": 0.21, - "learning_rate": 0.00032222921174149987, - "loss": 4.4315, - "step": 682 - }, - { - "epoch": 0.21, - "learning_rate": 0.00032219358268226265, - "loss": 4.798, - "step": 683 - }, - { - "epoch": 0.21, - "learning_rate": 0.00032215795362302543, - "loss": 4.1718, - "step": 684 - }, - { - "epoch": 0.21, - "learning_rate": 0.00032212232456378816, - "loss": 4.2697, - "step": 685 - }, - { - "epoch": 0.21, - "learning_rate": 0.000322086695504551, - "loss": 4.4247, - "step": 686 - }, - { - "epoch": 0.21, - "learning_rate": 0.0003220510664453137, - "loss": 4.3495, - "step": 687 - }, - { - "epoch": 0.21, - "learning_rate": 0.0003220154373860765, - "loss": 4.084, - "step": 688 - }, - { - "epoch": 0.21, - "learning_rate": 0.0003219798083268393, - "loss": 4.0288, - "step": 689 - }, - { - "epoch": 0.21, - "learning_rate": 0.00032194417926760207, - "loss": 4.0901, - "step": 690 - }, - { - "epoch": 0.21, - "learning_rate": 0.00032190855020836485, - "loss": 4.0617, - "step": 691 - }, - { - "epoch": 0.21, - "learning_rate": 0.0003218729211491276, - "loss": 4.0171, - "step": 692 - }, - { - "epoch": 0.21, - "learning_rate": 0.00032183729208989036, - "loss": 4.438, - "step": 693 - }, - { - "epoch": 0.21, - "learning_rate": 0.00032180166303065314, - "loss": 3.9928, - "step": 694 - }, - { - "epoch": 0.21, - "learning_rate": 0.0003217660339714159, - "loss": 3.9308, - "step": 695 - }, - { - "epoch": 0.21, - "learning_rate": 0.0003217304049121787, - "loss": 3.8863, - "step": 696 - }, - { - "epoch": 0.22, - "learning_rate": 0.00032169477585294144, - "loss": 3.7087, - "step": 697 - }, - { - "epoch": 0.22, - "learning_rate": 0.0003216591467937042, - "loss": 3.6466, - "step": 698 - }, - { - "epoch": 0.22, - "learning_rate": 0.000321623517734467, - "loss": 3.4242, - "step": 699 - }, - { - "epoch": 0.22, - "learning_rate": 0.0003215878886752298, - "loss": 3.3662, - "step": 700 - }, - { - "epoch": 0.22, - "learning_rate": 0.00032155225961599256, - "loss": 6.47, - "step": 701 - }, - { - "epoch": 0.22, - "learning_rate": 0.0003215166305567553, - "loss": 6.4183, - "step": 702 - }, - { - "epoch": 0.22, - "learning_rate": 0.0003214810014975181, - "loss": 6.0522, - "step": 703 - }, - { - "epoch": 0.22, - "learning_rate": 0.00032144537243828085, - "loss": 5.6719, - "step": 704 - }, - { - "epoch": 0.22, - "learning_rate": 0.00032140974337904364, - "loss": 5.5593, - "step": 705 - }, - { - "epoch": 0.22, - "learning_rate": 0.0003213741143198064, - "loss": 5.3021, - "step": 706 - }, - { - "epoch": 0.22, - "learning_rate": 0.00032133848526056915, - "loss": 5.6657, - "step": 707 - }, - { - "epoch": 0.22, - "learning_rate": 0.00032130285620133193, - "loss": 4.9884, - "step": 708 - }, - { - "epoch": 0.22, - "learning_rate": 0.0003212672271420947, - "loss": 4.9615, - "step": 709 - }, - { - "epoch": 0.22, - "learning_rate": 0.0003212315980828575, - "loss": 4.8835, - "step": 710 - }, - { - "epoch": 0.22, - "learning_rate": 0.0003211959690236202, - "loss": 5.0906, - "step": 711 - }, - { - "epoch": 0.22, - "learning_rate": 0.000321160339964383, - "loss": 4.816, - "step": 712 - }, - { - "epoch": 0.22, - "learning_rate": 0.0003211247109051458, - "loss": 4.7756, - "step": 713 - }, - { - "epoch": 0.22, - "learning_rate": 0.00032108908184590857, - "loss": 5.1242, - "step": 714 - }, - { - "epoch": 0.22, - "learning_rate": 0.00032105345278667135, - "loss": 5.1941, - "step": 715 - }, - { - "epoch": 0.22, - "learning_rate": 
0.0003210178237274341, - "loss": 4.8952, - "step": 716 - }, - { - "epoch": 0.22, - "learning_rate": 0.00032098219466819686, - "loss": 4.5833, - "step": 717 - }, - { - "epoch": 0.22, - "learning_rate": 0.00032094656560895964, - "loss": 4.9627, - "step": 718 - }, - { - "epoch": 0.22, - "learning_rate": 0.0003209109365497224, - "loss": 4.6861, - "step": 719 - }, - { - "epoch": 0.22, - "learning_rate": 0.0003208753074904852, - "loss": 4.806, - "step": 720 - }, - { - "epoch": 0.22, - "learning_rate": 0.00032083967843124793, - "loss": 4.6327, - "step": 721 - }, - { - "epoch": 0.22, - "learning_rate": 0.00032080404937201077, - "loss": 4.7109, - "step": 722 - }, - { - "epoch": 0.22, - "learning_rate": 0.0003207684203127735, - "loss": 4.7094, - "step": 723 - }, - { - "epoch": 0.22, - "learning_rate": 0.0003207327912535363, - "loss": 4.6903, - "step": 724 - }, - { - "epoch": 0.22, - "learning_rate": 0.00032069716219429906, - "loss": 4.7569, - "step": 725 - }, - { - "epoch": 0.22, - "learning_rate": 0.0003206615331350618, - "loss": 4.649, - "step": 726 - }, - { - "epoch": 0.22, - "learning_rate": 0.0003206259040758246, - "loss": 4.5057, - "step": 727 - }, - { - "epoch": 0.22, - "learning_rate": 0.00032059027501658735, - "loss": 4.5998, - "step": 728 - }, - { - "epoch": 0.22, - "learning_rate": 0.00032055464595735013, - "loss": 4.453, - "step": 729 - }, - { - "epoch": 0.23, - "learning_rate": 0.0003205190168981129, - "loss": 4.6019, - "step": 730 - }, - { - "epoch": 0.23, - "learning_rate": 0.00032048338783887564, - "loss": 4.4339, - "step": 731 - }, - { - "epoch": 0.23, - "learning_rate": 0.0003204477587796384, - "loss": 4.7953, - "step": 732 - }, - { - "epoch": 0.23, - "learning_rate": 0.0003204121297204012, - "loss": 4.2888, - "step": 733 - }, - { - "epoch": 0.23, - "learning_rate": 0.000320376500661164, - "loss": 4.5335, - "step": 734 - }, - { - "epoch": 0.23, - "learning_rate": 0.00032034087160192677, - "loss": 4.2448, - "step": 735 - }, - { - "epoch": 0.23, - "learning_rate": 0.0003203052425426895, - "loss": 4.3432, - "step": 736 - }, - { - "epoch": 0.23, - "learning_rate": 0.0003202696134834523, - "loss": 4.2657, - "step": 737 - }, - { - "epoch": 0.23, - "learning_rate": 0.00032023398442421506, - "loss": 4.2923, - "step": 738 - }, - { - "epoch": 0.23, - "learning_rate": 0.00032019835536497784, - "loss": 4.0112, - "step": 739 - }, - { - "epoch": 0.23, - "learning_rate": 0.0003201627263057406, - "loss": 4.4751, - "step": 740 - }, - { - "epoch": 0.23, - "learning_rate": 0.0003201270972465034, - "loss": 4.2462, - "step": 741 - }, - { - "epoch": 0.23, - "learning_rate": 0.00032009146818726613, - "loss": 4.0629, - "step": 742 - }, - { - "epoch": 0.23, - "learning_rate": 0.0003200558391280289, - "loss": 3.9804, - "step": 743 - }, - { - "epoch": 0.23, - "learning_rate": 0.0003200202100687917, - "loss": 3.7141, - "step": 744 - }, - { - "epoch": 0.23, - "learning_rate": 0.0003199845810095544, - "loss": 3.7757, - "step": 745 - }, - { - "epoch": 0.23, - "learning_rate": 0.00031994895195031726, - "loss": 3.5529, - "step": 746 - }, - { - "epoch": 0.23, - "learning_rate": 0.00031991332289108, - "loss": 3.6752, - "step": 747 - }, - { - "epoch": 0.23, - "learning_rate": 0.00031987769383184277, - "loss": 3.8655, - "step": 748 - }, - { - "epoch": 0.23, - "learning_rate": 0.00031984206477260555, - "loss": 3.4067, - "step": 749 - }, - { - "epoch": 0.23, - "learning_rate": 0.0003198064357133683, - "loss": 3.1073, - "step": 750 - }, - { - "epoch": 0.23, - "learning_rate": 0.0003197708066541311, - "loss": 6.4978, - 
"step": 751 - }, - { - "epoch": 0.23, - "learning_rate": 0.00031973517759489384, - "loss": 5.5928, - "step": 752 - }, - { - "epoch": 0.23, - "learning_rate": 0.0003196995485356566, - "loss": 5.4286, - "step": 753 - }, - { - "epoch": 0.23, - "learning_rate": 0.0003196639194764194, - "loss": 5.0086, - "step": 754 - }, - { - "epoch": 0.23, - "learning_rate": 0.00031962829041718213, - "loss": 5.1504, - "step": 755 - }, - { - "epoch": 0.23, - "learning_rate": 0.00031959266135794497, - "loss": 5.0985, - "step": 756 - }, - { - "epoch": 0.23, - "learning_rate": 0.0003195570322987077, - "loss": 5.0739, - "step": 757 - }, - { - "epoch": 0.23, - "learning_rate": 0.0003195214032394705, - "loss": 4.6801, - "step": 758 - }, - { - "epoch": 0.23, - "learning_rate": 0.00031948577418023326, - "loss": 4.8236, - "step": 759 - }, - { - "epoch": 0.23, - "learning_rate": 0.000319450145120996, - "loss": 4.9626, - "step": 760 - }, - { - "epoch": 0.23, - "learning_rate": 0.0003194145160617588, - "loss": 4.7853, - "step": 761 - }, - { - "epoch": 0.24, - "learning_rate": 0.00031937888700252155, - "loss": 4.7102, - "step": 762 - }, - { - "epoch": 0.24, - "learning_rate": 0.00031934325794328433, - "loss": 4.9914, - "step": 763 - }, - { - "epoch": 0.24, - "learning_rate": 0.0003193076288840471, - "loss": 4.9741, - "step": 764 - }, - { - "epoch": 0.24, - "learning_rate": 0.0003192719998248099, - "loss": 4.5619, - "step": 765 - }, - { - "epoch": 0.24, - "learning_rate": 0.0003192363707655726, - "loss": 4.8508, - "step": 766 - }, - { - "epoch": 0.24, - "learning_rate": 0.0003192007417063354, - "loss": 4.9575, - "step": 767 - }, - { - "epoch": 0.24, - "learning_rate": 0.0003191651126470982, - "loss": 4.8022, - "step": 768 - }, - { - "epoch": 0.24, - "learning_rate": 0.00031912948358786097, - "loss": 4.7418, - "step": 769 - }, - { - "epoch": 0.24, - "learning_rate": 0.00031909385452862375, - "loss": 4.8474, - "step": 770 - }, - { - "epoch": 0.24, - "learning_rate": 0.0003190582254693865, - "loss": 4.8716, - "step": 771 - }, - { - "epoch": 0.24, - "learning_rate": 0.00031902259641014926, - "loss": 4.6858, - "step": 772 - }, - { - "epoch": 0.24, - "learning_rate": 0.00031898696735091205, - "loss": 4.5442, - "step": 773 - }, - { - "epoch": 0.24, - "learning_rate": 0.00031895133829167483, - "loss": 4.4317, - "step": 774 - }, - { - "epoch": 0.24, - "learning_rate": 0.0003189157092324376, - "loss": 4.7171, - "step": 775 - }, - { - "epoch": 0.24, - "learning_rate": 0.00031888008017320034, - "loss": 4.7172, - "step": 776 - }, - { - "epoch": 0.24, - "learning_rate": 0.0003188444511139631, - "loss": 4.8377, - "step": 777 - }, - { - "epoch": 0.24, - "learning_rate": 0.0003188088220547259, - "loss": 4.6487, - "step": 778 - }, - { - "epoch": 0.24, - "learning_rate": 0.00031877319299548863, - "loss": 4.5178, - "step": 779 - }, - { - "epoch": 0.24, - "learning_rate": 0.00031873756393625146, - "loss": 4.541, - "step": 780 - }, - { - "epoch": 0.24, - "learning_rate": 0.0003187019348770142, - "loss": 4.5854, - "step": 781 - }, - { - "epoch": 0.24, - "learning_rate": 0.000318666305817777, - "loss": 4.6579, - "step": 782 - }, - { - "epoch": 0.24, - "learning_rate": 0.00031863067675853976, - "loss": 4.4028, - "step": 783 - }, - { - "epoch": 0.24, - "learning_rate": 0.00031859504769930254, - "loss": 4.3095, - "step": 784 - }, - { - "epoch": 0.24, - "learning_rate": 0.0003185594186400653, - "loss": 4.5231, - "step": 785 - }, - { - "epoch": 0.24, - "learning_rate": 0.00031852378958082805, - "loss": 4.2833, - "step": 786 - }, - { - "epoch": 0.24, - 
"learning_rate": 0.00031848816052159083, - "loss": 4.274, - "step": 787 - }, - { - "epoch": 0.24, - "learning_rate": 0.0003184525314623536, - "loss": 4.5985, - "step": 788 - }, - { - "epoch": 0.24, - "learning_rate": 0.0003184169024031164, - "loss": 5.0541, - "step": 789 - }, - { - "epoch": 0.24, - "learning_rate": 0.0003183812733438792, - "loss": 4.1923, - "step": 790 - }, - { - "epoch": 0.24, - "learning_rate": 0.0003183456442846419, - "loss": 4.1639, - "step": 791 - }, - { - "epoch": 0.24, - "learning_rate": 0.0003183100152254047, - "loss": 3.9991, - "step": 792 - }, - { - "epoch": 0.24, - "learning_rate": 0.00031827438616616747, - "loss": 4.0063, - "step": 793 - }, - { - "epoch": 0.25, - "learning_rate": 0.00031823875710693025, - "loss": 3.8333, - "step": 794 - }, - { - "epoch": 0.25, - "learning_rate": 0.00031820312804769303, - "loss": 4.1733, - "step": 795 - }, - { - "epoch": 0.25, - "learning_rate": 0.00031816749898845576, - "loss": 3.7356, - "step": 796 - }, - { - "epoch": 0.25, - "learning_rate": 0.00031813186992921854, - "loss": 3.8003, - "step": 797 - }, - { - "epoch": 0.25, - "learning_rate": 0.0003180962408699813, - "loss": 3.4117, - "step": 798 - }, - { - "epoch": 0.25, - "learning_rate": 0.0003180606118107441, - "loss": 3.2608, - "step": 799 - }, - { - "epoch": 0.25, - "learning_rate": 0.00031802498275150683, - "loss": 3.2906, - "step": 800 - }, - { - "epoch": 0.25, - "learning_rate": 0.0003179893536922696, - "loss": 6.6787, - "step": 801 - }, - { - "epoch": 0.25, - "learning_rate": 0.0003179537246330324, - "loss": 6.4735, - "step": 802 - }, - { - "epoch": 0.25, - "learning_rate": 0.0003179180955737952, - "loss": 5.8129, - "step": 803 - }, - { - "epoch": 0.25, - "learning_rate": 0.00031788246651455796, - "loss": 5.2917, - "step": 804 - }, - { - "epoch": 0.25, - "learning_rate": 0.0003178468374553207, - "loss": 5.4524, - "step": 805 - }, - { - "epoch": 0.25, - "learning_rate": 0.00031781120839608347, - "loss": 5.2113, - "step": 806 - }, - { - "epoch": 0.25, - "learning_rate": 0.00031777557933684625, - "loss": 5.1347, - "step": 807 - }, - { - "epoch": 0.25, - "learning_rate": 0.00031773995027760903, - "loss": 5.0582, - "step": 808 - }, - { - "epoch": 0.25, - "learning_rate": 0.0003177043212183718, - "loss": 5.0439, - "step": 809 - }, - { - "epoch": 0.25, - "learning_rate": 0.00031766869215913454, - "loss": 4.8616, - "step": 810 - }, - { - "epoch": 0.25, - "learning_rate": 0.0003176330630998974, - "loss": 4.4569, - "step": 811 - }, - { - "epoch": 0.25, - "learning_rate": 0.0003175974340406601, - "loss": 4.7055, - "step": 812 - }, - { - "epoch": 0.25, - "learning_rate": 0.0003175618049814229, - "loss": 4.9301, - "step": 813 - }, - { - "epoch": 0.25, - "learning_rate": 0.00031752617592218567, - "loss": 4.6795, - "step": 814 - }, - { - "epoch": 0.25, - "learning_rate": 0.0003174905468629484, - "loss": 4.866, - "step": 815 - }, - { - "epoch": 0.25, - "learning_rate": 0.00031745491780371123, - "loss": 4.6602, - "step": 816 - }, - { - "epoch": 0.25, - "learning_rate": 0.00031741928874447396, - "loss": 4.9522, - "step": 817 - }, - { - "epoch": 0.25, - "learning_rate": 0.00031738365968523674, - "loss": 5.0115, - "step": 818 - }, - { - "epoch": 0.25, - "learning_rate": 0.0003173480306259995, - "loss": 4.5117, - "step": 819 - }, - { - "epoch": 0.25, - "learning_rate": 0.00031731240156676225, - "loss": 4.6974, - "step": 820 - }, - { - "epoch": 0.25, - "learning_rate": 0.0003172767725075251, - "loss": 4.9883, - "step": 821 - }, - { - "epoch": 0.25, - "learning_rate": 0.0003172411434482878, 
- "loss": 4.4035, - "step": 822 - }, - { - "epoch": 0.25, - "learning_rate": 0.0003172055143890506, - "loss": 4.636, - "step": 823 - }, - { - "epoch": 0.25, - "learning_rate": 0.0003171698853298134, - "loss": 4.878, - "step": 824 - }, - { - "epoch": 0.25, - "learning_rate": 0.0003171342562705761, - "loss": 4.5198, - "step": 825 - }, - { - "epoch": 0.25, - "learning_rate": 0.0003170986272113389, - "loss": 4.4156, - "step": 826 - }, - { - "epoch": 0.26, - "learning_rate": 0.00031706299815210167, - "loss": 4.6267, - "step": 827 - }, - { - "epoch": 0.26, - "learning_rate": 0.00031702736909286445, - "loss": 4.778, - "step": 828 - }, - { - "epoch": 0.26, - "learning_rate": 0.00031699174003362723, - "loss": 4.5432, - "step": 829 - }, - { - "epoch": 0.26, - "learning_rate": 0.00031695611097438996, - "loss": 4.756, - "step": 830 - }, - { - "epoch": 0.26, - "learning_rate": 0.00031692048191515274, - "loss": 4.5705, - "step": 831 - }, - { - "epoch": 0.26, - "learning_rate": 0.0003168848528559155, - "loss": 4.5189, - "step": 832 - }, - { - "epoch": 0.26, - "learning_rate": 0.0003168492237966783, - "loss": 4.4518, - "step": 833 - }, - { - "epoch": 0.26, - "learning_rate": 0.0003168135947374411, - "loss": 4.33, - "step": 834 - }, - { - "epoch": 0.26, - "learning_rate": 0.00031677796567820387, - "loss": 4.3499, - "step": 835 - }, - { - "epoch": 0.26, - "learning_rate": 0.0003167423366189666, - "loss": 4.2432, - "step": 836 - }, - { - "epoch": 0.26, - "learning_rate": 0.0003167067075597294, - "loss": 4.5634, - "step": 837 - }, - { - "epoch": 0.26, - "learning_rate": 0.00031667107850049216, - "loss": 4.1849, - "step": 838 - }, - { - "epoch": 0.26, - "learning_rate": 0.0003166354494412549, - "loss": 4.1699, - "step": 839 - }, - { - "epoch": 0.26, - "learning_rate": 0.0003165998203820177, - "loss": 4.2506, - "step": 840 - }, - { - "epoch": 0.26, - "learning_rate": 0.00031656419132278045, - "loss": 4.2697, - "step": 841 - }, - { - "epoch": 0.26, - "learning_rate": 0.00031652856226354324, - "loss": 4.0719, - "step": 842 - }, - { - "epoch": 0.26, - "learning_rate": 0.000316492933204306, - "loss": 3.6326, - "step": 843 - }, - { - "epoch": 0.26, - "learning_rate": 0.00031645730414506875, - "loss": 3.5361, - "step": 844 - }, - { - "epoch": 0.26, - "learning_rate": 0.0003164216750858316, - "loss": 3.6797, - "step": 845 - }, - { - "epoch": 0.26, - "learning_rate": 0.0003163860460265943, - "loss": 3.7435, - "step": 846 - }, - { - "epoch": 0.26, - "learning_rate": 0.0003163504169673571, - "loss": 3.6399, - "step": 847 - }, - { - "epoch": 0.26, - "learning_rate": 0.00031631478790811987, - "loss": 3.6595, - "step": 848 - }, - { - "epoch": 0.26, - "learning_rate": 0.0003162791588488826, - "loss": 3.2975, - "step": 849 - }, - { - "epoch": 0.26, - "learning_rate": 0.00031624352978964544, - "loss": 3.052, - "step": 850 - }, - { - "epoch": 0.26, - "learning_rate": 0.00031620790073040816, - "loss": 6.6104, - "step": 851 - }, - { - "epoch": 0.26, - "learning_rate": 0.00031617227167117095, - "loss": 5.9987, - "step": 852 - }, - { - "epoch": 0.26, - "learning_rate": 0.00031613664261193373, - "loss": 5.9487, - "step": 853 - }, - { - "epoch": 0.26, - "learning_rate": 0.00031610101355269646, - "loss": 5.4157, - "step": 854 - }, - { - "epoch": 0.26, - "learning_rate": 0.0003160653844934593, - "loss": 5.2302, - "step": 855 - }, - { - "epoch": 0.26, - "learning_rate": 0.000316029755434222, - "loss": 5.1533, - "step": 856 - }, - { - "epoch": 0.26, - "learning_rate": 0.0003159941263749848, - "loss": 5.0236, - "step": 857 - }, - { - 
"epoch": 0.26, - "learning_rate": 0.0003159584973157476, - "loss": 4.9179, - "step": 858 - }, - { - "epoch": 0.27, - "learning_rate": 0.00031592286825651036, - "loss": 4.9565, - "step": 859 - }, - { - "epoch": 0.27, - "learning_rate": 0.0003158872391972731, - "loss": 4.7953, - "step": 860 - }, - { - "epoch": 0.27, - "learning_rate": 0.0003158516101380359, - "loss": 4.6231, - "step": 861 - }, - { - "epoch": 0.27, - "learning_rate": 0.00031581598107879866, - "loss": 4.9661, - "step": 862 - }, - { - "epoch": 0.27, - "learning_rate": 0.00031578035201956144, - "loss": 4.7249, - "step": 863 - }, - { - "epoch": 0.27, - "learning_rate": 0.0003157447229603242, - "loss": 4.8068, - "step": 864 - }, - { - "epoch": 0.27, - "learning_rate": 0.00031570909390108695, - "loss": 4.8051, - "step": 865 - }, - { - "epoch": 0.27, - "learning_rate": 0.00031567346484184973, - "loss": 4.8017, - "step": 866 - }, - { - "epoch": 0.27, - "learning_rate": 0.0003156378357826125, - "loss": 4.6119, - "step": 867 - }, - { - "epoch": 0.27, - "learning_rate": 0.0003156022067233753, - "loss": 4.7729, - "step": 868 - }, - { - "epoch": 0.27, - "learning_rate": 0.0003155665776641381, - "loss": 5.0054, - "step": 869 - }, - { - "epoch": 0.27, - "learning_rate": 0.0003155309486049008, - "loss": 4.8891, - "step": 870 - }, - { - "epoch": 0.27, - "learning_rate": 0.0003154953195456636, - "loss": 4.4381, - "step": 871 - }, - { - "epoch": 0.27, - "learning_rate": 0.00031545969048642637, - "loss": 4.5233, - "step": 872 - }, - { - "epoch": 0.27, - "learning_rate": 0.0003154240614271891, - "loss": 4.5199, - "step": 873 - }, - { - "epoch": 0.27, - "learning_rate": 0.00031538843236795193, - "loss": 5.0761, - "step": 874 - }, - { - "epoch": 0.27, - "learning_rate": 0.00031535280330871466, - "loss": 4.5819, - "step": 875 - }, - { - "epoch": 0.27, - "learning_rate": 0.00031531717424947744, - "loss": 4.7146, - "step": 876 - }, - { - "epoch": 0.27, - "learning_rate": 0.0003152815451902402, - "loss": 4.5888, - "step": 877 - }, - { - "epoch": 0.27, - "learning_rate": 0.000315245916131003, - "loss": 4.4594, - "step": 878 - }, - { - "epoch": 0.27, - "learning_rate": 0.0003152102870717658, - "loss": 4.5146, - "step": 879 - }, - { - "epoch": 0.27, - "learning_rate": 0.0003151746580125285, - "loss": 4.2043, - "step": 880 - }, - { - "epoch": 0.27, - "learning_rate": 0.0003151390289532913, - "loss": 4.3887, - "step": 881 - }, - { - "epoch": 0.27, - "learning_rate": 0.0003151033998940541, - "loss": 4.3032, - "step": 882 - }, - { - "epoch": 0.27, - "learning_rate": 0.00031506777083481686, - "loss": 4.313, - "step": 883 - }, - { - "epoch": 0.27, - "learning_rate": 0.00031503214177557964, - "loss": 4.2478, - "step": 884 - }, - { - "epoch": 0.27, - "learning_rate": 0.00031499651271634237, - "loss": 4.2393, - "step": 885 - }, - { - "epoch": 0.27, - "learning_rate": 0.00031496088365710515, - "loss": 3.9922, - "step": 886 - }, - { - "epoch": 0.27, - "learning_rate": 0.00031492525459786793, - "loss": 4.4679, - "step": 887 - }, - { - "epoch": 0.27, - "learning_rate": 0.0003148896255386307, - "loss": 4.2588, - "step": 888 - }, - { - "epoch": 0.27, - "learning_rate": 0.0003148539964793935, - "loss": 4.3304, - "step": 889 - }, - { - "epoch": 0.27, - "learning_rate": 0.0003148183674201562, - "loss": 4.0801, - "step": 890 - }, - { - "epoch": 0.27, - "learning_rate": 0.000314782738360919, - "loss": 4.0099, - "step": 891 - }, - { - "epoch": 0.28, - "learning_rate": 0.0003147471093016818, - "loss": 3.8702, - "step": 892 - }, - { - "epoch": 0.28, - "learning_rate": 
0.00031471148024244457, - "loss": 4.2203, - "step": 893 - }, - { - "epoch": 0.28, - "learning_rate": 0.0003146758511832073, - "loss": 3.7047, - "step": 894 - }, - { - "epoch": 0.28, - "learning_rate": 0.0003146402221239701, - "loss": 3.8448, - "step": 895 - }, - { - "epoch": 0.28, - "learning_rate": 0.00031460459306473286, - "loss": 3.4714, - "step": 896 - }, - { - "epoch": 0.28, - "learning_rate": 0.00031456896400549564, - "loss": 3.2731, - "step": 897 - }, - { - "epoch": 0.28, - "learning_rate": 0.0003145333349462584, - "loss": 3.4538, - "step": 898 - }, - { - "epoch": 0.28, - "learning_rate": 0.00031449770588702115, - "loss": 3.4279, - "step": 899 - }, - { - "epoch": 0.28, - "learning_rate": 0.00031446207682778393, - "loss": 3.1156, - "step": 900 - }, - { - "epoch": 0.28, - "learning_rate": 0.0003144264477685467, - "loss": 6.2662, - "step": 901 - }, - { - "epoch": 0.28, - "learning_rate": 0.0003143908187093095, - "loss": 5.6305, - "step": 902 - }, - { - "epoch": 0.28, - "learning_rate": 0.0003143551896500723, - "loss": 5.3684, - "step": 903 - }, - { - "epoch": 0.28, - "learning_rate": 0.000314319560590835, - "loss": 5.3337, - "step": 904 - }, - { - "epoch": 0.28, - "learning_rate": 0.00031428393153159784, - "loss": 5.0379, - "step": 905 - }, - { - "epoch": 0.28, - "learning_rate": 0.00031424830247236057, - "loss": 4.8474, - "step": 906 - }, - { - "epoch": 0.28, - "learning_rate": 0.00031421267341312335, - "loss": 5.2066, - "step": 907 - }, - { - "epoch": 0.28, - "learning_rate": 0.00031417704435388613, - "loss": 4.9112, - "step": 908 - }, - { - "epoch": 0.28, - "learning_rate": 0.00031414141529464886, - "loss": 4.9829, - "step": 909 - }, - { - "epoch": 0.28, - "learning_rate": 0.0003141057862354117, - "loss": 4.7379, - "step": 910 - }, - { - "epoch": 0.28, - "learning_rate": 0.0003140701571761744, - "loss": 4.7751, - "step": 911 - }, - { - "epoch": 0.28, - "learning_rate": 0.0003140345281169372, - "loss": 4.7586, - "step": 912 - }, - { - "epoch": 0.28, - "learning_rate": 0.0003139988990577, - "loss": 4.8118, - "step": 913 - }, - { - "epoch": 0.28, - "learning_rate": 0.0003139632699984627, - "loss": 4.9603, - "step": 914 - }, - { - "epoch": 0.28, - "learning_rate": 0.00031392764093922555, - "loss": 5.0008, - "step": 915 - }, - { - "epoch": 0.28, - "learning_rate": 0.0003138920118799883, - "loss": 4.9402, - "step": 916 - }, - { - "epoch": 0.28, - "learning_rate": 0.00031385638282075106, - "loss": 4.6815, - "step": 917 - }, - { - "epoch": 0.28, - "learning_rate": 0.00031382075376151385, - "loss": 4.7787, - "step": 918 - }, - { - "epoch": 0.28, - "learning_rate": 0.00031378512470227657, - "loss": 4.7789, - "step": 919 - }, - { - "epoch": 0.28, - "learning_rate": 0.00031374949564303935, - "loss": 4.9569, - "step": 920 - }, - { - "epoch": 0.28, - "learning_rate": 0.00031371386658380214, - "loss": 4.8704, - "step": 921 - }, - { - "epoch": 0.28, - "learning_rate": 0.0003136782375245649, - "loss": 4.6449, - "step": 922 - }, - { - "epoch": 0.28, - "learning_rate": 0.0003136426084653277, - "loss": 4.6035, - "step": 923 - }, - { - "epoch": 0.29, - "learning_rate": 0.00031360697940609043, - "loss": 4.8499, - "step": 924 - }, - { - "epoch": 0.29, - "learning_rate": 0.0003135713503468532, - "loss": 4.6556, - "step": 925 - }, - { - "epoch": 0.29, - "learning_rate": 0.000313535721287616, - "loss": 4.73, - "step": 926 - }, - { - "epoch": 0.29, - "learning_rate": 0.0003135000922283788, - "loss": 4.7274, - "step": 927 - }, - { - "epoch": 0.29, - "learning_rate": 0.0003134644631691415, - "loss": 4.9487, - 
"step": 928 - }, - { - "epoch": 0.29, - "learning_rate": 0.00031342883410990434, - "loss": 4.3901, - "step": 929 - }, - { - "epoch": 0.29, - "learning_rate": 0.00031339320505066706, - "loss": 4.5403, - "step": 930 - }, - { - "epoch": 0.29, - "learning_rate": 0.00031335757599142985, - "loss": 4.2667, - "step": 931 - }, - { - "epoch": 0.29, - "learning_rate": 0.00031332194693219263, - "loss": 4.6732, - "step": 932 - }, - { - "epoch": 0.29, - "learning_rate": 0.00031328631787295536, - "loss": 4.5, - "step": 933 - }, - { - "epoch": 0.29, - "learning_rate": 0.0003132506888137182, - "loss": 4.7049, - "step": 934 - }, - { - "epoch": 0.29, - "learning_rate": 0.0003132150597544809, - "loss": 4.2075, - "step": 935 - }, - { - "epoch": 0.29, - "learning_rate": 0.0003131794306952437, - "loss": 4.3334, - "step": 936 - }, - { - "epoch": 0.29, - "learning_rate": 0.0003131438016360065, - "loss": 4.2938, - "step": 937 - }, - { - "epoch": 0.29, - "learning_rate": 0.0003131081725767692, - "loss": 4.3269, - "step": 938 - }, - { - "epoch": 0.29, - "learning_rate": 0.00031307254351753205, - "loss": 4.1757, - "step": 939 - }, - { - "epoch": 0.29, - "learning_rate": 0.0003130369144582948, - "loss": 4.0869, - "step": 940 - }, - { - "epoch": 0.29, - "learning_rate": 0.00031300128539905756, - "loss": 4.16, - "step": 941 - }, - { - "epoch": 0.29, - "learning_rate": 0.00031296565633982034, - "loss": 4.0857, - "step": 942 - }, - { - "epoch": 0.29, - "learning_rate": 0.00031293002728058307, - "loss": 3.875, - "step": 943 - }, - { - "epoch": 0.29, - "learning_rate": 0.0003128943982213459, - "loss": 4.0039, - "step": 944 - }, - { - "epoch": 0.29, - "learning_rate": 0.00031285876916210863, - "loss": 3.905, - "step": 945 - }, - { - "epoch": 0.29, - "learning_rate": 0.0003128231401028714, - "loss": 3.5325, - "step": 946 - }, - { - "epoch": 0.29, - "learning_rate": 0.0003127875110436342, - "loss": 3.5509, - "step": 947 - }, - { - "epoch": 0.29, - "learning_rate": 0.000312751881984397, - "loss": 3.6116, - "step": 948 - }, - { - "epoch": 0.29, - "learning_rate": 0.00031271625292515976, - "loss": 3.2935, - "step": 949 - }, - { - "epoch": 0.29, - "learning_rate": 0.0003126806238659225, - "loss": 3.1373, - "step": 950 - }, - { - "epoch": 0.29, - "learning_rate": 0.00031264499480668527, - "loss": 6.1515, - "step": 951 - }, - { - "epoch": 0.29, - "learning_rate": 0.00031260936574744805, - "loss": 5.8838, - "step": 952 - }, - { - "epoch": 0.29, - "learning_rate": 0.00031257373668821083, - "loss": 5.7219, - "step": 953 - }, - { - "epoch": 0.29, - "learning_rate": 0.00031253810762897356, - "loss": 5.3645, - "step": 954 - }, - { - "epoch": 0.29, - "learning_rate": 0.00031250247856973634, - "loss": 5.3646, - "step": 955 - }, - { - "epoch": 0.3, - "learning_rate": 0.0003124668495104991, - "loss": 5.2483, - "step": 956 - }, - { - "epoch": 0.3, - "learning_rate": 0.0003124312204512619, - "loss": 5.0023, - "step": 957 - }, - { - "epoch": 0.3, - "learning_rate": 0.0003123955913920247, - "loss": 5.2127, - "step": 958 - }, - { - "epoch": 0.3, - "learning_rate": 0.0003123599623327874, - "loss": 4.9758, - "step": 959 - }, - { - "epoch": 0.3, - "learning_rate": 0.0003123243332735502, - "loss": 4.6177, - "step": 960 - }, - { - "epoch": 0.3, - "learning_rate": 0.000312288704214313, - "loss": 4.5279, - "step": 961 - }, - { - "epoch": 0.3, - "learning_rate": 0.0003122530751550757, - "loss": 4.9342, - "step": 962 - }, - { - "epoch": 0.3, - "learning_rate": 0.00031221744609583854, - "loss": 4.8577, - "step": 963 - }, - { - "epoch": 0.3, - 
"learning_rate": 0.00031218181703660127, - "loss": 4.7622, - "step": 964 - }, - { - "epoch": 0.3, - "learning_rate": 0.00031214618797736405, - "loss": 4.8761, - "step": 965 - }, - { - "epoch": 0.3, - "learning_rate": 0.00031211055891812683, - "loss": 4.7664, - "step": 966 - }, - { - "epoch": 0.3, - "learning_rate": 0.00031207492985888956, - "loss": 4.9797, - "step": 967 - }, - { - "epoch": 0.3, - "learning_rate": 0.0003120393007996524, - "loss": 4.7389, - "step": 968 - }, - { - "epoch": 0.3, - "learning_rate": 0.0003120036717404151, - "loss": 4.7964, - "step": 969 - }, - { - "epoch": 0.3, - "learning_rate": 0.0003119680426811779, - "loss": 4.4982, - "step": 970 - }, - { - "epoch": 0.3, - "learning_rate": 0.0003119324136219407, - "loss": 4.4666, - "step": 971 - }, - { - "epoch": 0.3, - "learning_rate": 0.00031189678456270347, - "loss": 4.9018, - "step": 972 - }, - { - "epoch": 0.3, - "learning_rate": 0.00031186115550346625, - "loss": 4.6384, - "step": 973 - }, - { - "epoch": 0.3, - "learning_rate": 0.000311825526444229, - "loss": 4.6017, - "step": 974 - }, - { - "epoch": 0.3, - "learning_rate": 0.00031178989738499176, - "loss": 4.6073, - "step": 975 - }, - { - "epoch": 0.3, - "learning_rate": 0.00031175426832575454, - "loss": 4.5389, - "step": 976 - }, - { - "epoch": 0.3, - "learning_rate": 0.0003117186392665173, - "loss": 4.2345, - "step": 977 - }, - { - "epoch": 0.3, - "learning_rate": 0.0003116830102072801, - "loss": 4.5278, - "step": 978 - }, - { - "epoch": 0.3, - "learning_rate": 0.00031164738114804283, - "loss": 4.4742, - "step": 979 - }, - { - "epoch": 0.3, - "learning_rate": 0.0003116117520888056, - "loss": 4.583, - "step": 980 - }, - { - "epoch": 0.3, - "learning_rate": 0.0003115761230295684, - "loss": 4.5069, - "step": 981 - }, - { - "epoch": 0.3, - "learning_rate": 0.0003115404939703312, - "loss": 4.4788, - "step": 982 - }, - { - "epoch": 0.3, - "learning_rate": 0.00031150486491109396, - "loss": 4.5548, - "step": 983 - }, - { - "epoch": 0.3, - "learning_rate": 0.0003114692358518567, - "loss": 4.071, - "step": 984 - }, - { - "epoch": 0.3, - "learning_rate": 0.00031143360679261947, - "loss": 4.3909, - "step": 985 - }, - { - "epoch": 0.3, - "learning_rate": 0.00031139797773338225, - "loss": 4.1309, - "step": 986 - }, - { - "epoch": 0.3, - "learning_rate": 0.00031136234867414504, - "loss": 4.0651, - "step": 987 - }, - { - "epoch": 0.3, - "learning_rate": 0.00031132671961490776, - "loss": 3.8988, - "step": 988 - }, - { - "epoch": 0.31, - "learning_rate": 0.00031129109055567055, - "loss": 4.2998, - "step": 989 - }, - { - "epoch": 0.31, - "learning_rate": 0.0003112554614964333, - "loss": 3.8281, - "step": 990 - }, - { - "epoch": 0.31, - "learning_rate": 0.0003112198324371961, - "loss": 4.1739, - "step": 991 - }, - { - "epoch": 0.31, - "learning_rate": 0.0003111842033779589, - "loss": 4.2605, - "step": 992 - }, - { - "epoch": 0.31, - "learning_rate": 0.0003111485743187216, - "loss": 3.9964, - "step": 993 - }, - { - "epoch": 0.31, - "learning_rate": 0.0003111129452594844, - "loss": 3.8471, - "step": 994 - }, - { - "epoch": 0.31, - "learning_rate": 0.0003110773162002472, - "loss": 3.853, - "step": 995 - }, - { - "epoch": 0.31, - "learning_rate": 0.00031104168714100996, - "loss": 3.6185, - "step": 996 - }, - { - "epoch": 0.31, - "learning_rate": 0.00031100605808177275, - "loss": 3.3647, - "step": 997 - }, - { - "epoch": 0.31, - "learning_rate": 0.0003109704290225355, - "loss": 3.5623, - "step": 998 - }, - { - "epoch": 0.31, - "learning_rate": 0.0003109347999632983, - "loss": 3.0593, - 
"step": 999 - }, - { - "epoch": 0.31, - "learning_rate": 0.00031089917090406104, - "loss": 3.0735, - "step": 1000 - }, - { - "epoch": 0.31, - "eval_bleu": 2.4961869532998874e-13, - "eval_loss": 5.599357604980469, - "eval_runtime": 2585.0388, - "eval_samples_per_second": 5.71, - "eval_steps_per_second": 0.714, - "step": 1000 - }, - { - "epoch": 0.31, - "learning_rate": 0.0003108635418448238, - "loss": 6.6653, - "step": 1001 - }, - { - "epoch": 0.31, - "learning_rate": 0.0003108279127855866, - "loss": 5.993, - "step": 1002 - }, - { - "epoch": 0.31, - "learning_rate": 0.00031079228372634933, - "loss": 5.2619, - "step": 1003 - }, - { - "epoch": 0.31, - "learning_rate": 0.00031075665466711216, - "loss": 5.315, - "step": 1004 - }, - { - "epoch": 0.31, - "learning_rate": 0.0003107210256078749, - "loss": 5.2509, - "step": 1005 - }, - { - "epoch": 0.31, - "learning_rate": 0.0003106853965486377, - "loss": 5.0751, - "step": 1006 - }, - { - "epoch": 0.31, - "learning_rate": 0.00031064976748940046, - "loss": 5.0152, - "step": 1007 - }, - { - "epoch": 0.31, - "learning_rate": 0.0003106141384301632, - "loss": 4.8883, - "step": 1008 - }, - { - "epoch": 0.31, - "learning_rate": 0.00031057850937092597, - "loss": 4.9054, - "step": 1009 - }, - { - "epoch": 0.31, - "learning_rate": 0.00031054288031168875, - "loss": 5.1369, - "step": 1010 - }, - { - "epoch": 0.31, - "learning_rate": 0.00031050725125245153, - "loss": 4.8128, - "step": 1011 - }, - { - "epoch": 0.31, - "learning_rate": 0.0003104716221932143, - "loss": 4.5203, - "step": 1012 - }, - { - "epoch": 0.31, - "learning_rate": 0.00031043599313397704, - "loss": 4.4711, - "step": 1013 - }, - { - "epoch": 0.31, - "learning_rate": 0.0003104003640747398, - "loss": 4.6929, - "step": 1014 - }, - { - "epoch": 0.31, - "learning_rate": 0.0003103647350155026, - "loss": 4.7625, - "step": 1015 - }, - { - "epoch": 0.31, - "learning_rate": 0.0003103291059562654, - "loss": 4.8906, - "step": 1016 - }, - { - "epoch": 0.31, - "learning_rate": 0.00031029347689702817, - "loss": 4.8319, - "step": 1017 - }, - { - "epoch": 0.31, - "learning_rate": 0.0003102578478377909, - "loss": 4.3231, - "step": 1018 - }, - { - "epoch": 0.31, - "learning_rate": 0.0003102222187785537, - "loss": 4.5903, - "step": 1019 - }, - { - "epoch": 0.31, - "learning_rate": 0.00031018658971931646, - "loss": 4.6172, - "step": 1020 - }, - { - "epoch": 0.32, - "learning_rate": 0.00031015096066007924, - "loss": 5.0053, - "step": 1021 - }, - { - "epoch": 0.32, - "learning_rate": 0.00031011533160084197, - "loss": 4.7422, - "step": 1022 - }, - { - "epoch": 0.32, - "learning_rate": 0.0003100797025416048, - "loss": 4.5341, - "step": 1023 - }, - { - "epoch": 0.32, - "learning_rate": 0.00031004407348236753, - "loss": 4.6354, - "step": 1024 - }, - { - "epoch": 0.32, - "learning_rate": 0.0003100084444231303, - "loss": 4.5176, - "step": 1025 - }, - { - "epoch": 0.32, - "learning_rate": 0.0003099728153638931, - "loss": 4.3475, - "step": 1026 - }, - { - "epoch": 0.32, - "learning_rate": 0.0003099371863046558, - "loss": 4.4205, - "step": 1027 - }, - { - "epoch": 0.32, - "learning_rate": 0.00030990155724541866, - "loss": 4.645, - "step": 1028 - }, - { - "epoch": 0.32, - "learning_rate": 0.0003098659281861814, - "loss": 4.6165, - "step": 1029 - }, - { - "epoch": 0.32, - "learning_rate": 0.00030983029912694417, - "loss": 4.5766, - "step": 1030 - }, - { - "epoch": 0.32, - "learning_rate": 0.00030979467006770695, - "loss": 4.4004, - "step": 1031 - }, - { - "epoch": 0.32, - "learning_rate": 0.0003097590410084697, - "loss": 4.6851, 
- "step": 1032 - }, - { - "epoch": 0.32, - "learning_rate": 0.0003097234119492325, - "loss": 4.4411, - "step": 1033 - }, - { - "epoch": 0.32, - "learning_rate": 0.00030968778288999524, - "loss": 4.406, - "step": 1034 - }, - { - "epoch": 0.32, - "learning_rate": 0.000309652153830758, - "loss": 4.3268, - "step": 1035 - }, - { - "epoch": 0.32, - "learning_rate": 0.0003096165247715208, - "loss": 5.0019, - "step": 1036 - }, - { - "epoch": 0.32, - "learning_rate": 0.00030958089571228353, - "loss": 4.3991, - "step": 1037 - }, - { - "epoch": 0.32, - "learning_rate": 0.00030954526665304637, - "loss": 4.2463, - "step": 1038 - }, - { - "epoch": 0.32, - "learning_rate": 0.0003095096375938091, - "loss": 4.3169, - "step": 1039 - }, - { - "epoch": 0.32, - "learning_rate": 0.0003094740085345719, - "loss": 4.1203, - "step": 1040 - }, - { - "epoch": 0.32, - "learning_rate": 0.00030943837947533466, - "loss": 4.3825, - "step": 1041 - }, - { - "epoch": 0.32, - "learning_rate": 0.00030940275041609744, - "loss": 4.0861, - "step": 1042 - }, - { - "epoch": 0.32, - "learning_rate": 0.00030936712135686017, - "loss": 4.1061, - "step": 1043 - }, - { - "epoch": 0.32, - "learning_rate": 0.00030933149229762295, - "loss": 3.6647, - "step": 1044 - }, - { - "epoch": 0.32, - "learning_rate": 0.00030929586323838573, - "loss": 3.6897, - "step": 1045 - }, - { - "epoch": 0.32, - "learning_rate": 0.0003092602341791485, - "loss": 3.6912, - "step": 1046 - }, - { - "epoch": 0.32, - "learning_rate": 0.0003092246051199113, - "loss": 3.6683, - "step": 1047 - }, - { - "epoch": 0.32, - "learning_rate": 0.000309188976060674, - "loss": 3.2097, - "step": 1048 - }, - { - "epoch": 0.32, - "learning_rate": 0.0003091533470014368, - "loss": 3.4005, - "step": 1049 - }, - { - "epoch": 0.32, - "learning_rate": 0.0003091177179421996, - "loss": 3.1207, - "step": 1050 - }, - { - "epoch": 0.32, - "learning_rate": 0.00030908208888296237, - "loss": 6.6301, - "step": 1051 - }, - { - "epoch": 0.32, - "learning_rate": 0.00030904645982372515, - "loss": 5.6808, - "step": 1052 - }, - { - "epoch": 0.32, - "learning_rate": 0.0003090108307644879, - "loss": 5.4759, - "step": 1053 - }, - { - "epoch": 0.33, - "learning_rate": 0.00030897520170525066, - "loss": 5.703, - "step": 1054 - }, - { - "epoch": 0.33, - "learning_rate": 0.00030893957264601344, - "loss": 5.3103, - "step": 1055 - }, - { - "epoch": 0.33, - "learning_rate": 0.00030890394358677617, - "loss": 5.0595, - "step": 1056 - }, - { - "epoch": 0.33, - "learning_rate": 0.000308868314527539, - "loss": 4.7962, - "step": 1057 - }, - { - "epoch": 0.33, - "learning_rate": 0.00030883268546830174, - "loss": 5.0354, - "step": 1058 - }, - { - "epoch": 0.33, - "learning_rate": 0.0003087970564090645, - "loss": 4.854, - "step": 1059 - }, - { - "epoch": 0.33, - "learning_rate": 0.0003087614273498273, - "loss": 4.6391, - "step": 1060 - }, - { - "epoch": 0.33, - "learning_rate": 0.00030872579829059, - "loss": 5.0223, - "step": 1061 - }, - { - "epoch": 0.33, - "learning_rate": 0.00030869016923135286, - "loss": 4.8927, - "step": 1062 - }, - { - "epoch": 0.33, - "learning_rate": 0.0003086545401721156, - "loss": 4.4353, - "step": 1063 - }, - { - "epoch": 0.33, - "learning_rate": 0.00030861891111287837, - "loss": 5.0053, - "step": 1064 - }, - { - "epoch": 0.33, - "learning_rate": 0.00030858328205364115, - "loss": 4.6891, - "step": 1065 - }, - { - "epoch": 0.33, - "learning_rate": 0.00030854765299440394, - "loss": 4.7675, - "step": 1066 - }, - { - "epoch": 0.33, - "learning_rate": 0.0003085120239351667, - "loss": 4.5319, - "step": 
1067 - }, - { - "epoch": 0.33, - "learning_rate": 0.00030847639487592945, - "loss": 4.7498, - "step": 1068 - }, - { - "epoch": 0.33, - "learning_rate": 0.00030844076581669223, - "loss": 4.6845, - "step": 1069 - }, - { - "epoch": 0.33, - "learning_rate": 0.000308405136757455, - "loss": 4.4324, - "step": 1070 - }, - { - "epoch": 0.33, - "learning_rate": 0.0003083695076982178, - "loss": 4.9639, - "step": 1071 - }, - { - "epoch": 0.33, - "learning_rate": 0.0003083338786389806, - "loss": 4.58, - "step": 1072 - }, - { - "epoch": 0.33, - "learning_rate": 0.0003082982495797433, - "loss": 4.7139, - "step": 1073 - }, - { - "epoch": 0.33, - "learning_rate": 0.0003082626205205061, - "loss": 4.5709, - "step": 1074 - }, - { - "epoch": 0.33, - "learning_rate": 0.00030822699146126886, - "loss": 4.4785, - "step": 1075 - }, - { - "epoch": 0.33, - "learning_rate": 0.00030819136240203165, - "loss": 4.531, - "step": 1076 - }, - { - "epoch": 0.33, - "learning_rate": 0.0003081557333427944, - "loss": 4.3016, - "step": 1077 - }, - { - "epoch": 0.33, - "learning_rate": 0.00030812010428355716, - "loss": 4.5966, - "step": 1078 - }, - { - "epoch": 0.33, - "learning_rate": 0.00030808447522431994, - "loss": 4.2484, - "step": 1079 - }, - { - "epoch": 0.33, - "learning_rate": 0.0003080488461650827, - "loss": 4.741, - "step": 1080 - }, - { - "epoch": 0.33, - "learning_rate": 0.0003080132171058455, - "loss": 4.3773, - "step": 1081 - }, - { - "epoch": 0.33, - "learning_rate": 0.00030797758804660823, - "loss": 4.6006, - "step": 1082 - }, - { - "epoch": 0.33, - "learning_rate": 0.000307941958987371, - "loss": 4.3654, - "step": 1083 - }, - { - "epoch": 0.33, - "learning_rate": 0.0003079063299281338, - "loss": 4.1866, - "step": 1084 - }, - { - "epoch": 0.33, - "learning_rate": 0.0003078707008688966, - "loss": 4.2877, - "step": 1085 - }, - { - "epoch": 0.34, - "learning_rate": 0.00030783507180965936, - "loss": 4.6805, - "step": 1086 - }, - { - "epoch": 0.34, - "learning_rate": 0.0003077994427504221, - "loss": 4.3094, - "step": 1087 - }, - { - "epoch": 0.34, - "learning_rate": 0.00030776381369118487, - "loss": 4.1768, - "step": 1088 - }, - { - "epoch": 0.34, - "learning_rate": 0.00030772818463194765, - "loss": 4.1984, - "step": 1089 - }, - { - "epoch": 0.34, - "learning_rate": 0.00030769255557271043, - "loss": 4.3514, - "step": 1090 - }, - { - "epoch": 0.34, - "learning_rate": 0.0003076569265134732, - "loss": 4.148, - "step": 1091 - }, - { - "epoch": 0.34, - "learning_rate": 0.00030762129745423594, - "loss": 3.7951, - "step": 1092 - }, - { - "epoch": 0.34, - "learning_rate": 0.0003075856683949988, - "loss": 4.2785, - "step": 1093 - }, - { - "epoch": 0.34, - "learning_rate": 0.0003075500393357615, - "loss": 3.8842, - "step": 1094 - }, - { - "epoch": 0.34, - "learning_rate": 0.0003075144102765243, - "loss": 3.6878, - "step": 1095 - }, - { - "epoch": 0.34, - "learning_rate": 0.00030747878121728707, - "loss": 3.4125, - "step": 1096 - }, - { - "epoch": 0.34, - "learning_rate": 0.0003074431521580498, - "loss": 3.4207, - "step": 1097 - }, - { - "epoch": 0.34, - "learning_rate": 0.00030740752309881263, - "loss": 3.3905, - "step": 1098 - }, - { - "epoch": 0.34, - "learning_rate": 0.00030737189403957536, - "loss": 3.3195, - "step": 1099 - }, - { - "epoch": 0.34, - "learning_rate": 0.00030733626498033814, - "loss": 3.5056, - "step": 1100 - }, - { - "epoch": 0.34, - "learning_rate": 0.0003073006359211009, - "loss": 6.0964, - "step": 1101 - }, - { - "epoch": 0.34, - "learning_rate": 0.00030726500686186365, - "loss": 5.6064, - "step": 1102 - }, 
- { - "epoch": 0.34, - "learning_rate": 0.00030722937780262643, - "loss": 5.8648, - "step": 1103 - }, - { - "epoch": 0.34, - "learning_rate": 0.0003071937487433892, - "loss": 5.3246, - "step": 1104 - }, - { - "epoch": 0.34, - "learning_rate": 0.000307158119684152, - "loss": 4.8642, - "step": 1105 - }, - { - "epoch": 0.34, - "learning_rate": 0.0003071224906249148, - "loss": 5.1456, - "step": 1106 - }, - { - "epoch": 0.34, - "learning_rate": 0.0003070868615656775, - "loss": 5.1185, - "step": 1107 - }, - { - "epoch": 0.34, - "learning_rate": 0.0003070512325064403, - "loss": 5.0108, - "step": 1108 - }, - { - "epoch": 0.34, - "learning_rate": 0.00030701560344720307, - "loss": 4.745, - "step": 1109 - }, - { - "epoch": 0.34, - "learning_rate": 0.00030697997438796585, - "loss": 5.1636, - "step": 1110 - }, - { - "epoch": 0.34, - "learning_rate": 0.00030694434532872863, - "loss": 4.728, - "step": 1111 - }, - { - "epoch": 0.34, - "learning_rate": 0.00030690871626949136, - "loss": 4.7272, - "step": 1112 - }, - { - "epoch": 0.34, - "learning_rate": 0.00030687308721025414, - "loss": 4.8844, - "step": 1113 - }, - { - "epoch": 0.34, - "learning_rate": 0.0003068374581510169, - "loss": 4.8765, - "step": 1114 - }, - { - "epoch": 0.34, - "learning_rate": 0.0003068018290917797, - "loss": 4.8006, - "step": 1115 - }, - { - "epoch": 0.34, - "learning_rate": 0.00030676620003254243, - "loss": 4.6507, - "step": 1116 - }, - { - "epoch": 0.34, - "learning_rate": 0.00030673057097330527, - "loss": 4.4997, - "step": 1117 - }, - { - "epoch": 0.35, - "learning_rate": 0.000306694941914068, - "loss": 4.7253, - "step": 1118 - }, - { - "epoch": 0.35, - "learning_rate": 0.0003066593128548308, - "loss": 4.6503, - "step": 1119 - }, - { - "epoch": 0.35, - "learning_rate": 0.00030662368379559356, - "loss": 4.6227, - "step": 1120 - }, - { - "epoch": 0.35, - "learning_rate": 0.0003065880547363563, - "loss": 4.7162, - "step": 1121 - }, - { - "epoch": 0.35, - "learning_rate": 0.0003065524256771191, - "loss": 4.6373, - "step": 1122 - }, - { - "epoch": 0.35, - "learning_rate": 0.00030651679661788185, - "loss": 4.5818, - "step": 1123 - }, - { - "epoch": 0.35, - "learning_rate": 0.00030648116755864463, - "loss": 4.5671, - "step": 1124 - }, - { - "epoch": 0.35, - "learning_rate": 0.0003064455384994074, - "loss": 4.4879, - "step": 1125 - }, - { - "epoch": 0.35, - "learning_rate": 0.00030640990944017014, - "loss": 4.2044, - "step": 1126 - }, - { - "epoch": 0.35, - "learning_rate": 0.000306374280380933, - "loss": 4.6453, - "step": 1127 - }, - { - "epoch": 0.35, - "learning_rate": 0.0003063386513216957, - "loss": 4.4925, - "step": 1128 - }, - { - "epoch": 0.35, - "learning_rate": 0.0003063030222624585, - "loss": 4.4086, - "step": 1129 - }, - { - "epoch": 0.35, - "learning_rate": 0.00030626739320322127, - "loss": 4.7093, - "step": 1130 - }, - { - "epoch": 0.35, - "learning_rate": 0.000306231764143984, - "loss": 4.5648, - "step": 1131 - }, - { - "epoch": 0.35, - "learning_rate": 0.00030619613508474684, - "loss": 4.3671, - "step": 1132 - }, - { - "epoch": 0.35, - "learning_rate": 0.00030616050602550956, - "loss": 4.4897, - "step": 1133 - }, - { - "epoch": 0.35, - "learning_rate": 0.00030612487696627234, - "loss": 4.3607, - "step": 1134 - }, - { - "epoch": 0.35, - "learning_rate": 0.0003060892479070351, - "loss": 4.259, - "step": 1135 - }, - { - "epoch": 0.35, - "learning_rate": 0.0003060536188477979, - "loss": 4.4788, - "step": 1136 - }, - { - "epoch": 0.35, - "learning_rate": 0.00030601798978856064, - "loss": 3.9928, - "step": 1137 - }, - { - 
"epoch": 0.35, - "learning_rate": 0.0003059823607293234, - "loss": 3.9577, - "step": 1138 - }, - { - "epoch": 0.35, - "learning_rate": 0.0003059467316700862, - "loss": 4.0182, - "step": 1139 - }, - { - "epoch": 0.35, - "learning_rate": 0.000305911102610849, - "loss": 4.0482, - "step": 1140 - }, - { - "epoch": 0.35, - "learning_rate": 0.00030587547355161176, - "loss": 4.2836, - "step": 1141 - }, - { - "epoch": 0.35, - "learning_rate": 0.0003058398444923745, - "loss": 3.9542, - "step": 1142 - }, - { - "epoch": 0.35, - "learning_rate": 0.0003058042154331373, - "loss": 3.9558, - "step": 1143 - }, - { - "epoch": 0.35, - "learning_rate": 0.00030576858637390006, - "loss": 4.2283, - "step": 1144 - }, - { - "epoch": 0.35, - "learning_rate": 0.00030573295731466284, - "loss": 4.0596, - "step": 1145 - }, - { - "epoch": 0.35, - "learning_rate": 0.0003056973282554256, - "loss": 3.9759, - "step": 1146 - }, - { - "epoch": 0.35, - "learning_rate": 0.00030566169919618835, - "loss": 3.7101, - "step": 1147 - }, - { - "epoch": 0.35, - "learning_rate": 0.00030562607013695113, - "loss": 3.4012, - "step": 1148 - }, - { - "epoch": 0.35, - "learning_rate": 0.0003055904410777139, - "loss": 3.583, - "step": 1149 - }, - { - "epoch": 0.35, - "learning_rate": 0.00030555481201847664, - "loss": 3.093, - "step": 1150 - }, - { - "epoch": 0.36, - "learning_rate": 0.0003055191829592395, - "loss": 6.2242, - "step": 1151 - }, - { - "epoch": 0.36, - "learning_rate": 0.0003054835539000022, - "loss": 6.008, - "step": 1152 - }, - { - "epoch": 0.36, - "learning_rate": 0.000305447924840765, - "loss": 5.5076, - "step": 1153 - }, - { - "epoch": 0.36, - "learning_rate": 0.00030541229578152777, - "loss": 5.2832, - "step": 1154 - }, - { - "epoch": 0.36, - "learning_rate": 0.0003053766667222905, - "loss": 4.8887, - "step": 1155 - }, - { - "epoch": 0.36, - "learning_rate": 0.00030534103766305333, - "loss": 4.8274, - "step": 1156 - }, - { - "epoch": 0.36, - "learning_rate": 0.00030530540860381606, - "loss": 4.7412, - "step": 1157 - }, - { - "epoch": 0.36, - "learning_rate": 0.00030526977954457884, - "loss": 4.9053, - "step": 1158 - }, - { - "epoch": 0.36, - "learning_rate": 0.0003052341504853416, - "loss": 4.8952, - "step": 1159 - }, - { - "epoch": 0.36, - "learning_rate": 0.0003051985214261044, - "loss": 4.9197, - "step": 1160 - }, - { - "epoch": 0.36, - "learning_rate": 0.0003051628923668672, - "loss": 4.7081, - "step": 1161 - }, - { - "epoch": 0.36, - "learning_rate": 0.0003051272633076299, - "loss": 4.6475, - "step": 1162 - }, - { - "epoch": 0.36, - "learning_rate": 0.0003050916342483927, - "loss": 4.5716, - "step": 1163 - }, - { - "epoch": 0.36, - "learning_rate": 0.0003050560051891555, - "loss": 4.692, - "step": 1164 - }, - { - "epoch": 0.36, - "learning_rate": 0.00030502037612991826, - "loss": 4.4114, - "step": 1165 - }, - { - "epoch": 0.36, - "learning_rate": 0.00030498474707068104, - "loss": 4.7366, - "step": 1166 - }, - { - "epoch": 0.36, - "learning_rate": 0.00030494911801144377, - "loss": 4.5152, - "step": 1167 - }, - { - "epoch": 0.36, - "learning_rate": 0.00030491348895220655, - "loss": 4.5177, - "step": 1168 - }, - { - "epoch": 0.36, - "learning_rate": 0.00030487785989296933, - "loss": 4.7018, - "step": 1169 - }, - { - "epoch": 0.36, - "learning_rate": 0.0003048422308337321, - "loss": 4.7174, - "step": 1170 - }, - { - "epoch": 0.36, - "learning_rate": 0.00030480660177449484, - "loss": 4.2952, - "step": 1171 - }, - { - "epoch": 0.36, - "learning_rate": 0.0003047709727152576, - "loss": 4.527, - "step": 1172 - }, - { - "epoch": 
0.36, - "learning_rate": 0.0003047353436560204, - "loss": 4.5972, - "step": 1173 - }, - { - "epoch": 0.36, - "learning_rate": 0.0003046997145967832, - "loss": 4.6169, - "step": 1174 - }, - { - "epoch": 0.36, - "learning_rate": 0.00030466408553754597, - "loss": 4.4527, - "step": 1175 - }, - { - "epoch": 0.36, - "learning_rate": 0.0003046284564783087, - "loss": 4.529, - "step": 1176 - }, - { - "epoch": 0.36, - "learning_rate": 0.0003045928274190715, - "loss": 4.5556, - "step": 1177 - }, - { - "epoch": 0.36, - "learning_rate": 0.00030455719835983426, - "loss": 4.3729, - "step": 1178 - }, - { - "epoch": 0.36, - "learning_rate": 0.00030452156930059704, - "loss": 4.4536, - "step": 1179 - }, - { - "epoch": 0.36, - "learning_rate": 0.0003044859402413598, - "loss": 4.1811, - "step": 1180 - }, - { - "epoch": 0.36, - "learning_rate": 0.00030445031118212255, - "loss": 4.7439, - "step": 1181 - }, - { - "epoch": 0.36, - "learning_rate": 0.00030441468212288533, - "loss": 4.3372, - "step": 1182 - }, - { - "epoch": 0.37, - "learning_rate": 0.0003043790530636481, - "loss": 4.0854, - "step": 1183 - }, - { - "epoch": 0.37, - "learning_rate": 0.0003043434240044109, - "loss": 4.413, - "step": 1184 - }, - { - "epoch": 0.37, - "learning_rate": 0.0003043077949451737, - "loss": 4.224, - "step": 1185 - }, - { - "epoch": 0.37, - "learning_rate": 0.0003042721658859364, - "loss": 4.1362, - "step": 1186 - }, - { - "epoch": 0.37, - "learning_rate": 0.00030423653682669924, - "loss": 4.2719, - "step": 1187 - }, - { - "epoch": 0.37, - "learning_rate": 0.00030420090776746197, - "loss": 4.0646, - "step": 1188 - }, - { - "epoch": 0.37, - "learning_rate": 0.00030416527870822475, - "loss": 4.3789, - "step": 1189 - }, - { - "epoch": 0.37, - "learning_rate": 0.00030412964964898753, - "loss": 3.9628, - "step": 1190 - }, - { - "epoch": 0.37, - "learning_rate": 0.00030409402058975026, - "loss": 4.2604, - "step": 1191 - }, - { - "epoch": 0.37, - "learning_rate": 0.00030405839153051304, - "loss": 3.632, - "step": 1192 - }, - { - "epoch": 0.37, - "learning_rate": 0.0003040227624712758, - "loss": 3.6616, - "step": 1193 - }, - { - "epoch": 0.37, - "learning_rate": 0.0003039871334120386, - "loss": 3.8303, - "step": 1194 - }, - { - "epoch": 0.37, - "learning_rate": 0.0003039515043528014, - "loss": 3.9684, - "step": 1195 - }, - { - "epoch": 0.37, - "learning_rate": 0.0003039158752935641, - "loss": 3.4594, - "step": 1196 - }, - { - "epoch": 0.37, - "learning_rate": 0.0003038802462343269, - "loss": 3.3668, - "step": 1197 - }, - { - "epoch": 0.37, - "learning_rate": 0.0003038446171750897, - "loss": 3.3288, - "step": 1198 - }, - { - "epoch": 0.37, - "learning_rate": 0.00030380898811585246, - "loss": 3.1448, - "step": 1199 - }, - { - "epoch": 0.37, - "learning_rate": 0.00030377335905661524, - "loss": 3.0514, - "step": 1200 - }, - { - "epoch": 0.37, - "learning_rate": 0.00030373772999737797, - "loss": 6.3367, - "step": 1201 - }, - { - "epoch": 0.37, - "learning_rate": 0.00030370210093814075, - "loss": 5.6882, - "step": 1202 - }, - { - "epoch": 0.37, - "learning_rate": 0.00030366647187890354, - "loss": 5.7425, - "step": 1203 - }, - { - "epoch": 0.37, - "learning_rate": 0.0003036308428196663, - "loss": 5.6542, - "step": 1204 - }, - { - "epoch": 0.37, - "learning_rate": 0.00030359521376042904, - "loss": 5.1333, - "step": 1205 - }, - { - "epoch": 0.37, - "learning_rate": 0.0003035595847011919, - "loss": 5.0806, - "step": 1206 - }, - { - "epoch": 0.37, - "learning_rate": 0.0003035239556419546, - "loss": 5.0461, - "step": 1207 - }, - { - "epoch": 0.37, 
- "learning_rate": 0.0003034883265827174, - "loss": 4.7644, - "step": 1208 - }, - { - "epoch": 0.37, - "learning_rate": 0.00030345269752348017, - "loss": 4.7942, - "step": 1209 - }, - { - "epoch": 0.37, - "learning_rate": 0.0003034170684642429, - "loss": 5.3676, - "step": 1210 - }, - { - "epoch": 0.37, - "learning_rate": 0.00030338143940500574, - "loss": 5.022, - "step": 1211 - }, - { - "epoch": 0.37, - "learning_rate": 0.00030334581034576846, - "loss": 4.7159, - "step": 1212 - }, - { - "epoch": 0.37, - "learning_rate": 0.00030331018128653125, - "loss": 4.8108, - "step": 1213 - }, - { - "epoch": 0.37, - "learning_rate": 0.00030327455222729403, - "loss": 4.8116, - "step": 1214 - }, - { - "epoch": 0.37, - "learning_rate": 0.00030323892316805676, - "loss": 4.435, - "step": 1215 - }, - { - "epoch": 0.38, - "learning_rate": 0.0003032032941088196, - "loss": 4.4862, - "step": 1216 - }, - { - "epoch": 0.38, - "learning_rate": 0.0003031676650495823, - "loss": 4.614, - "step": 1217 - }, - { - "epoch": 0.38, - "learning_rate": 0.0003031320359903451, - "loss": 4.6822, - "step": 1218 - }, - { - "epoch": 0.38, - "learning_rate": 0.0003030964069311079, - "loss": 4.4671, - "step": 1219 - }, - { - "epoch": 0.38, - "learning_rate": 0.0003030607778718706, - "loss": 4.4582, - "step": 1220 - }, - { - "epoch": 0.38, - "learning_rate": 0.00030302514881263345, - "loss": 4.7106, - "step": 1221 - }, - { - "epoch": 0.38, - "learning_rate": 0.0003029895197533962, - "loss": 4.9276, - "step": 1222 - }, - { - "epoch": 0.38, - "learning_rate": 0.00030295389069415896, - "loss": 4.485, - "step": 1223 - }, - { - "epoch": 0.38, - "learning_rate": 0.00030291826163492174, - "loss": 4.4922, - "step": 1224 - }, - { - "epoch": 0.38, - "learning_rate": 0.00030288263257568447, - "loss": 4.6685, - "step": 1225 - }, - { - "epoch": 0.38, - "learning_rate": 0.0003028470035164473, - "loss": 4.6698, - "step": 1226 - }, - { - "epoch": 0.38, - "learning_rate": 0.00030281137445721003, - "loss": 4.6098, - "step": 1227 - }, - { - "epoch": 0.38, - "learning_rate": 0.0003027757453979728, - "loss": 4.3242, - "step": 1228 - }, - { - "epoch": 0.38, - "learning_rate": 0.0003027401163387356, - "loss": 4.2578, - "step": 1229 - }, - { - "epoch": 0.38, - "learning_rate": 0.0003027044872794984, - "loss": 4.4822, - "step": 1230 - }, - { - "epoch": 0.38, - "learning_rate": 0.0003026688582202611, - "loss": 4.096, - "step": 1231 - }, - { - "epoch": 0.38, - "learning_rate": 0.0003026332291610239, - "loss": 4.5255, - "step": 1232 - }, - { - "epoch": 0.38, - "learning_rate": 0.00030259760010178667, - "loss": 4.0442, - "step": 1233 - }, - { - "epoch": 0.38, - "learning_rate": 0.00030256197104254945, - "loss": 4.2654, - "step": 1234 - }, - { - "epoch": 0.38, - "learning_rate": 0.00030252634198331223, - "loss": 4.4865, - "step": 1235 - }, - { - "epoch": 0.38, - "learning_rate": 0.00030249071292407496, - "loss": 4.1198, - "step": 1236 - }, - { - "epoch": 0.38, - "learning_rate": 0.00030245508386483774, - "loss": 4.1931, - "step": 1237 - }, - { - "epoch": 0.38, - "learning_rate": 0.0003024194548056005, - "loss": 4.5079, - "step": 1238 - }, - { - "epoch": 0.38, - "learning_rate": 0.00030238382574636325, - "loss": 3.8155, - "step": 1239 - }, - { - "epoch": 0.38, - "learning_rate": 0.0003023481966871261, - "loss": 3.8261, - "step": 1240 - }, - { - "epoch": 0.38, - "learning_rate": 0.0003023125676278888, - "loss": 4.0365, - "step": 1241 - }, - { - "epoch": 0.38, - "learning_rate": 0.0003022769385686516, - "loss": 4.0544, - "step": 1242 - }, - { - "epoch": 0.38, - 
"learning_rate": 0.0003022413095094144, - "loss": 4.0254, - "step": 1243 - }, - { - "epoch": 0.38, - "learning_rate": 0.0003022056804501771, - "loss": 3.801, - "step": 1244 - }, - { - "epoch": 0.38, - "learning_rate": 0.00030217005139093994, - "loss": 3.61, - "step": 1245 - }, - { - "epoch": 0.38, - "learning_rate": 0.00030213442233170267, - "loss": 3.4299, - "step": 1246 - }, - { - "epoch": 0.38, - "learning_rate": 0.00030209879327246545, - "loss": 3.747, - "step": 1247 - }, - { - "epoch": 0.39, - "learning_rate": 0.00030206316421322823, - "loss": 3.1419, - "step": 1248 - }, - { - "epoch": 0.39, - "learning_rate": 0.000302027535153991, - "loss": 3.211, - "step": 1249 - }, - { - "epoch": 0.39, - "learning_rate": 0.0003019919060947538, - "loss": 3.1184, - "step": 1250 - }, - { - "epoch": 0.39, - "learning_rate": 0.0003019562770355165, - "loss": 5.8578, - "step": 1251 - }, - { - "epoch": 0.39, - "learning_rate": 0.0003019206479762793, - "loss": 5.5349, - "step": 1252 - }, - { - "epoch": 0.39, - "learning_rate": 0.0003018850189170421, - "loss": 5.3148, - "step": 1253 - }, - { - "epoch": 0.39, - "learning_rate": 0.00030184938985780487, - "loss": 5.2813, - "step": 1254 - }, - { - "epoch": 0.39, - "learning_rate": 0.00030181376079856765, - "loss": 5.0839, - "step": 1255 - }, - { - "epoch": 0.39, - "learning_rate": 0.0003017781317393304, - "loss": 5.1541, - "step": 1256 - }, - { - "epoch": 0.39, - "learning_rate": 0.00030174250268009316, - "loss": 4.8838, - "step": 1257 - }, - { - "epoch": 0.39, - "learning_rate": 0.00030170687362085594, - "loss": 4.9196, - "step": 1258 - }, - { - "epoch": 0.39, - "learning_rate": 0.0003016712445616187, - "loss": 4.7085, - "step": 1259 - }, - { - "epoch": 0.39, - "learning_rate": 0.0003016356155023815, - "loss": 4.6361, - "step": 1260 - }, - { - "epoch": 0.39, - "learning_rate": 0.00030159998644314423, - "loss": 4.7814, - "step": 1261 - }, - { - "epoch": 0.39, - "learning_rate": 0.000301564357383907, - "loss": 4.7659, - "step": 1262 - }, - { - "epoch": 0.39, - "learning_rate": 0.0003015287283246698, - "loss": 4.5659, - "step": 1263 - }, - { - "epoch": 0.39, - "learning_rate": 0.0003014930992654326, - "loss": 4.5773, - "step": 1264 - }, - { - "epoch": 0.39, - "learning_rate": 0.0003014574702061953, - "loss": 4.7589, - "step": 1265 - }, - { - "epoch": 0.39, - "learning_rate": 0.0003014218411469581, - "loss": 4.8611, - "step": 1266 - }, - { - "epoch": 0.39, - "learning_rate": 0.00030138621208772087, - "loss": 4.3796, - "step": 1267 - }, - { - "epoch": 0.39, - "learning_rate": 0.00030135058302848365, - "loss": 4.6032, - "step": 1268 - }, - { - "epoch": 0.39, - "learning_rate": 0.00030131495396924643, - "loss": 4.847, - "step": 1269 - }, - { - "epoch": 0.39, - "learning_rate": 0.00030127932491000916, - "loss": 4.5729, - "step": 1270 - }, - { - "epoch": 0.39, - "learning_rate": 0.00030124369585077194, - "loss": 4.5536, - "step": 1271 - }, - { - "epoch": 0.39, - "learning_rate": 0.0003012080667915347, - "loss": 4.584, - "step": 1272 - }, - { - "epoch": 0.39, - "learning_rate": 0.0003011724377322975, - "loss": 4.7049, - "step": 1273 - }, - { - "epoch": 0.39, - "learning_rate": 0.0003011368086730603, - "loss": 4.8543, - "step": 1274 - }, - { - "epoch": 0.39, - "learning_rate": 0.000301101179613823, - "loss": 4.4413, - "step": 1275 - }, - { - "epoch": 0.39, - "learning_rate": 0.0003010655505545858, - "loss": 4.3694, - "step": 1276 - }, - { - "epoch": 0.39, - "learning_rate": 0.0003010299214953486, - "loss": 4.436, - "step": 1277 - }, - { - "epoch": 0.39, - "learning_rate": 
0.00030099429243611136, - "loss": 4.2365, - "step": 1278 - }, - { - "epoch": 0.39, - "learning_rate": 0.00030095866337687414, - "loss": 4.292, - "step": 1279 - }, - { - "epoch": 0.4, - "learning_rate": 0.00030092303431763687, - "loss": 4.407, - "step": 1280 - }, - { - "epoch": 0.4, - "learning_rate": 0.0003008874052583997, - "loss": 4.5448, - "step": 1281 - }, - { - "epoch": 0.4, - "learning_rate": 0.00030085177619916244, - "loss": 4.5506, - "step": 1282 - }, - { - "epoch": 0.4, - "learning_rate": 0.0003008161471399252, - "loss": 4.2262, - "step": 1283 - }, - { - "epoch": 0.4, - "learning_rate": 0.000300780518080688, - "loss": 4.359, - "step": 1284 - }, - { - "epoch": 0.4, - "learning_rate": 0.00030074488902145073, - "loss": 4.3358, - "step": 1285 - }, - { - "epoch": 0.4, - "learning_rate": 0.0003007092599622135, - "loss": 4.217, - "step": 1286 - }, - { - "epoch": 0.4, - "learning_rate": 0.0003006736309029763, - "loss": 4.0225, - "step": 1287 - }, - { - "epoch": 0.4, - "learning_rate": 0.0003006380018437391, - "loss": 4.2107, - "step": 1288 - }, - { - "epoch": 0.4, - "learning_rate": 0.00030060237278450186, - "loss": 4.2255, - "step": 1289 - }, - { - "epoch": 0.4, - "learning_rate": 0.0003005667437252646, - "loss": 4.0149, - "step": 1290 - }, - { - "epoch": 0.4, - "learning_rate": 0.00030053111466602736, - "loss": 3.7913, - "step": 1291 - }, - { - "epoch": 0.4, - "learning_rate": 0.00030049548560679015, - "loss": 3.9904, - "step": 1292 - }, - { - "epoch": 0.4, - "learning_rate": 0.00030045985654755293, - "loss": 3.6373, - "step": 1293 - }, - { - "epoch": 0.4, - "learning_rate": 0.0003004242274883157, - "loss": 3.7541, - "step": 1294 - }, - { - "epoch": 0.4, - "learning_rate": 0.00030038859842907844, - "loss": 3.3555, - "step": 1295 - }, - { - "epoch": 0.4, - "learning_rate": 0.0003003529693698412, - "loss": 3.2242, - "step": 1296 - }, - { - "epoch": 0.4, - "learning_rate": 0.000300317340310604, - "loss": 3.3962, - "step": 1297 - }, - { - "epoch": 0.4, - "learning_rate": 0.0003002817112513668, - "loss": 3.0672, - "step": 1298 - }, - { - "epoch": 0.4, - "learning_rate": 0.0003002460821921295, - "loss": 3.2196, - "step": 1299 - }, - { - "epoch": 0.4, - "learning_rate": 0.00030021045313289235, - "loss": 2.8454, - "step": 1300 - }, - { - "epoch": 0.4, - "learning_rate": 0.0003001748240736551, - "loss": 6.1441, - "step": 1301 - }, - { - "epoch": 0.4, - "learning_rate": 0.00030013919501441786, - "loss": 5.9909, - "step": 1302 - }, - { - "epoch": 0.4, - "learning_rate": 0.00030010356595518064, - "loss": 5.3787, - "step": 1303 - }, - { - "epoch": 0.4, - "learning_rate": 0.00030006793689594337, - "loss": 5.157, - "step": 1304 - }, - { - "epoch": 0.4, - "learning_rate": 0.0003000323078367062, - "loss": 5.518, - "step": 1305 - }, - { - "epoch": 0.4, - "learning_rate": 0.00029999667877746893, - "loss": 4.9633, - "step": 1306 - }, - { - "epoch": 0.4, - "learning_rate": 0.0002999610497182317, - "loss": 5.153, - "step": 1307 - }, - { - "epoch": 0.4, - "learning_rate": 0.0002999254206589945, - "loss": 4.9611, - "step": 1308 - }, - { - "epoch": 0.4, - "learning_rate": 0.0002998897915997572, - "loss": 4.8557, - "step": 1309 - }, - { - "epoch": 0.4, - "learning_rate": 0.00029985416254052006, - "loss": 5.0678, - "step": 1310 - }, - { - "epoch": 0.4, - "learning_rate": 0.0002998185334812828, - "loss": 4.795, - "step": 1311 - }, - { - "epoch": 0.4, - "learning_rate": 0.00029978290442204557, - "loss": 5.0049, - "step": 1312 - }, - { - "epoch": 0.41, - "learning_rate": 0.00029974727536280835, - "loss": 4.8876, - 
"step": 1313 - }, - { - "epoch": 0.41, - "learning_rate": 0.0002997116463035711, - "loss": 4.782, - "step": 1314 - }, - { - "epoch": 0.41, - "learning_rate": 0.0002996760172443339, - "loss": 4.6578, - "step": 1315 - }, - { - "epoch": 0.41, - "learning_rate": 0.00029964038818509664, - "loss": 4.6495, - "step": 1316 - }, - { - "epoch": 0.41, - "learning_rate": 0.0002996047591258594, - "loss": 4.739, - "step": 1317 - }, - { - "epoch": 0.41, - "learning_rate": 0.0002995691300666222, - "loss": 4.5213, - "step": 1318 - }, - { - "epoch": 0.41, - "learning_rate": 0.00029953350100738493, - "loss": 4.8373, - "step": 1319 - }, - { - "epoch": 0.41, - "learning_rate": 0.0002994978719481477, - "loss": 4.698, - "step": 1320 - }, - { - "epoch": 0.41, - "learning_rate": 0.0002994622428889105, - "loss": 4.5739, - "step": 1321 - }, - { - "epoch": 0.41, - "learning_rate": 0.0002994266138296733, - "loss": 4.734, - "step": 1322 - }, - { - "epoch": 0.41, - "learning_rate": 0.00029939098477043606, - "loss": 4.44, - "step": 1323 - }, - { - "epoch": 0.41, - "learning_rate": 0.00029935535571119884, - "loss": 4.6595, - "step": 1324 - }, - { - "epoch": 0.41, - "learning_rate": 0.00029931972665196157, - "loss": 4.0913, - "step": 1325 - }, - { - "epoch": 0.41, - "learning_rate": 0.00029928409759272435, - "loss": 4.6594, - "step": 1326 - }, - { - "epoch": 0.41, - "learning_rate": 0.00029924846853348713, - "loss": 4.4297, - "step": 1327 - }, - { - "epoch": 0.41, - "learning_rate": 0.0002992128394742499, - "loss": 4.3579, - "step": 1328 - }, - { - "epoch": 0.41, - "learning_rate": 0.0002991772104150127, - "loss": 4.5808, - "step": 1329 - }, - { - "epoch": 0.41, - "learning_rate": 0.0002991415813557754, - "loss": 4.5049, - "step": 1330 - }, - { - "epoch": 0.41, - "learning_rate": 0.0002991059522965382, - "loss": 4.4733, - "step": 1331 - }, - { - "epoch": 0.41, - "learning_rate": 0.000299070323237301, - "loss": 4.2266, - "step": 1332 - }, - { - "epoch": 0.41, - "learning_rate": 0.0002990346941780637, - "loss": 4.3146, - "step": 1333 - }, - { - "epoch": 0.41, - "learning_rate": 0.00029899906511882655, - "loss": 4.5649, - "step": 1334 - }, - { - "epoch": 0.41, - "learning_rate": 0.0002989634360595893, - "loss": 4.3494, - "step": 1335 - }, - { - "epoch": 0.41, - "learning_rate": 0.00029892780700035206, - "loss": 4.1564, - "step": 1336 - }, - { - "epoch": 0.41, - "learning_rate": 0.00029889217794111484, - "loss": 4.3388, - "step": 1337 - }, - { - "epoch": 0.41, - "learning_rate": 0.00029885654888187757, - "loss": 3.8988, - "step": 1338 - }, - { - "epoch": 0.41, - "learning_rate": 0.0002988209198226404, - "loss": 4.0641, - "step": 1339 - }, - { - "epoch": 0.41, - "learning_rate": 0.00029878529076340313, - "loss": 3.8808, - "step": 1340 - }, - { - "epoch": 0.41, - "learning_rate": 0.0002987496617041659, - "loss": 4.0006, - "step": 1341 - }, - { - "epoch": 0.41, - "learning_rate": 0.0002987140326449287, - "loss": 3.9012, - "step": 1342 - }, - { - "epoch": 0.41, - "learning_rate": 0.0002986784035856915, - "loss": 3.9728, - "step": 1343 - }, - { - "epoch": 0.41, - "learning_rate": 0.00029864277452645426, - "loss": 3.7354, - "step": 1344 - }, - { - "epoch": 0.42, - "learning_rate": 0.000298607145467217, - "loss": 3.3499, - "step": 1345 - }, - { - "epoch": 0.42, - "learning_rate": 0.00029857151640797977, - "loss": 3.5908, - "step": 1346 - }, - { - "epoch": 0.42, - "learning_rate": 0.00029853588734874255, - "loss": 3.2084, - "step": 1347 - }, - { - "epoch": 0.42, - "learning_rate": 0.00029850025828950534, - "loss": 3.1736, - "step": 
1348 - }, - { - "epoch": 0.42, - "learning_rate": 0.0002984646292302681, - "loss": 3.2985, - "step": 1349 - }, - { - "epoch": 0.42, - "learning_rate": 0.00029842900017103084, - "loss": 2.8575, - "step": 1350 - }, - { - "epoch": 0.42, - "learning_rate": 0.0002983933711117936, - "loss": 6.0847, - "step": 1351 - }, - { - "epoch": 0.42, - "learning_rate": 0.0002983577420525564, - "loss": 5.4779, - "step": 1352 - }, - { - "epoch": 0.42, - "learning_rate": 0.0002983221129933192, - "loss": 5.318, - "step": 1353 - }, - { - "epoch": 0.42, - "learning_rate": 0.0002982864839340819, - "loss": 5.1022, - "step": 1354 - }, - { - "epoch": 0.42, - "learning_rate": 0.0002982508548748447, - "loss": 5.3079, - "step": 1355 - }, - { - "epoch": 0.42, - "learning_rate": 0.0002982152258156075, - "loss": 5.0206, - "step": 1356 - }, - { - "epoch": 0.42, - "learning_rate": 0.00029817959675637026, - "loss": 4.474, - "step": 1357 - }, - { - "epoch": 0.42, - "learning_rate": 0.00029814396769713305, - "loss": 4.9875, - "step": 1358 - }, - { - "epoch": 0.42, - "learning_rate": 0.0002981083386378958, - "loss": 5.0414, - "step": 1359 - }, - { - "epoch": 0.42, - "learning_rate": 0.00029807270957865856, - "loss": 5.2205, - "step": 1360 - }, - { - "epoch": 0.42, - "learning_rate": 0.00029803708051942134, - "loss": 4.7262, - "step": 1361 - }, - { - "epoch": 0.42, - "learning_rate": 0.0002980014514601841, - "loss": 4.8278, - "step": 1362 - }, - { - "epoch": 0.42, - "learning_rate": 0.0002979658224009469, - "loss": 4.7, - "step": 1363 - }, - { - "epoch": 0.42, - "learning_rate": 0.00029793019334170963, - "loss": 4.7975, - "step": 1364 - }, - { - "epoch": 0.42, - "learning_rate": 0.0002978945642824724, - "loss": 4.6585, - "step": 1365 - }, - { - "epoch": 0.42, - "learning_rate": 0.0002978589352232352, - "loss": 4.4247, - "step": 1366 - }, - { - "epoch": 0.42, - "learning_rate": 0.000297823306163998, - "loss": 4.6945, - "step": 1367 - }, - { - "epoch": 0.42, - "learning_rate": 0.00029778767710476076, - "loss": 4.6252, - "step": 1368 - }, - { - "epoch": 0.42, - "learning_rate": 0.0002977520480455235, - "loss": 4.9368, - "step": 1369 - }, - { - "epoch": 0.42, - "learning_rate": 0.00029771641898628627, - "loss": 4.8987, - "step": 1370 - }, - { - "epoch": 0.42, - "learning_rate": 0.00029768078992704905, - "loss": 4.578, - "step": 1371 - }, - { - "epoch": 0.42, - "learning_rate": 0.00029764516086781183, - "loss": 4.5628, - "step": 1372 - }, - { - "epoch": 0.42, - "learning_rate": 0.0002976095318085746, - "loss": 4.6645, - "step": 1373 - }, - { - "epoch": 0.42, - "learning_rate": 0.00029757390274933734, - "loss": 4.4381, - "step": 1374 - }, - { - "epoch": 0.42, - "learning_rate": 0.0002975382736901002, - "loss": 4.2857, - "step": 1375 - }, - { - "epoch": 0.42, - "learning_rate": 0.0002975026446308629, - "loss": 4.3584, - "step": 1376 - }, - { - "epoch": 0.42, - "learning_rate": 0.0002974670155716257, - "loss": 4.4446, - "step": 1377 - }, - { - "epoch": 0.43, - "learning_rate": 0.00029743138651238847, - "loss": 4.4407, - "step": 1378 - }, - { - "epoch": 0.43, - "learning_rate": 0.0002973957574531512, - "loss": 4.2401, - "step": 1379 - }, - { - "epoch": 0.43, - "learning_rate": 0.000297360128393914, - "loss": 4.2348, - "step": 1380 - }, - { - "epoch": 0.43, - "learning_rate": 0.00029732449933467676, - "loss": 4.1892, - "step": 1381 - }, - { - "epoch": 0.43, - "learning_rate": 0.00029728887027543954, - "loss": 4.4059, - "step": 1382 - }, - { - "epoch": 0.43, - "learning_rate": 0.0002972532412162023, - "loss": 4.0896, - "step": 1383 - }, - { 
- "epoch": 0.43, - "learning_rate": 0.00029721761215696505, - "loss": 4.3106, - "step": 1384 - }, - { - "epoch": 0.43, - "learning_rate": 0.00029718198309772783, - "loss": 4.1299, - "step": 1385 - }, - { - "epoch": 0.43, - "learning_rate": 0.0002971463540384906, - "loss": 4.2921, - "step": 1386 - }, - { - "epoch": 0.43, - "learning_rate": 0.0002971107249792534, - "loss": 4.0942, - "step": 1387 - }, - { - "epoch": 0.43, - "learning_rate": 0.0002970750959200161, - "loss": 4.0125, - "step": 1388 - }, - { - "epoch": 0.43, - "learning_rate": 0.0002970394668607789, - "loss": 4.2825, - "step": 1389 - }, - { - "epoch": 0.43, - "learning_rate": 0.0002970038378015417, - "loss": 4.2438, - "step": 1390 - }, - { - "epoch": 0.43, - "learning_rate": 0.00029696820874230447, - "loss": 3.9249, - "step": 1391 - }, - { - "epoch": 0.43, - "learning_rate": 0.00029693257968306725, - "loss": 3.763, - "step": 1392 - }, - { - "epoch": 0.43, - "learning_rate": 0.00029689695062383, - "loss": 3.7987, - "step": 1393 - }, - { - "epoch": 0.43, - "learning_rate": 0.0002968613215645928, - "loss": 3.891, - "step": 1394 - }, - { - "epoch": 0.43, - "learning_rate": 0.00029682569250535554, - "loss": 3.824, - "step": 1395 - }, - { - "epoch": 0.43, - "learning_rate": 0.0002967900634461183, - "loss": 3.4456, - "step": 1396 - }, - { - "epoch": 0.43, - "learning_rate": 0.0002967544343868811, - "loss": 3.4597, - "step": 1397 - }, - { - "epoch": 0.43, - "learning_rate": 0.00029671880532764383, - "loss": 3.0946, - "step": 1398 - }, - { - "epoch": 0.43, - "learning_rate": 0.00029668317626840667, - "loss": 3.0734, - "step": 1399 - }, - { - "epoch": 0.43, - "learning_rate": 0.0002966475472091694, - "loss": 3.1359, - "step": 1400 - }, - { - "epoch": 0.43, - "learning_rate": 0.0002966119181499322, - "loss": 6.2601, - "step": 1401 - }, - { - "epoch": 0.43, - "learning_rate": 0.00029657628909069496, - "loss": 5.6439, - "step": 1402 - }, - { - "epoch": 0.43, - "learning_rate": 0.0002965406600314577, - "loss": 5.5004, - "step": 1403 - }, - { - "epoch": 0.43, - "learning_rate": 0.0002965050309722205, - "loss": 5.0289, - "step": 1404 - }, - { - "epoch": 0.43, - "learning_rate": 0.00029646940191298325, - "loss": 5.0701, - "step": 1405 - }, - { - "epoch": 0.43, - "learning_rate": 0.00029643377285374603, - "loss": 4.6878, - "step": 1406 - }, - { - "epoch": 0.43, - "learning_rate": 0.0002963981437945088, - "loss": 5.1698, - "step": 1407 - }, - { - "epoch": 0.43, - "learning_rate": 0.00029636251473527154, - "loss": 4.5904, - "step": 1408 - }, - { - "epoch": 0.43, - "learning_rate": 0.0002963268856760344, - "loss": 4.8074, - "step": 1409 - }, - { - "epoch": 0.44, - "learning_rate": 0.0002962912566167971, - "loss": 4.8577, - "step": 1410 - }, - { - "epoch": 0.44, - "learning_rate": 0.0002962556275575599, - "loss": 4.9662, - "step": 1411 - }, - { - "epoch": 0.44, - "learning_rate": 0.00029621999849832267, - "loss": 4.6517, - "step": 1412 - }, - { - "epoch": 0.44, - "learning_rate": 0.0002961843694390854, - "loss": 4.643, - "step": 1413 - }, - { - "epoch": 0.44, - "learning_rate": 0.0002961487403798482, - "loss": 4.8603, - "step": 1414 - }, - { - "epoch": 0.44, - "learning_rate": 0.00029611311132061096, - "loss": 4.6163, - "step": 1415 - }, - { - "epoch": 0.44, - "learning_rate": 0.00029607748226137374, - "loss": 4.5498, - "step": 1416 - }, - { - "epoch": 0.44, - "learning_rate": 0.0002960418532021365, - "loss": 4.5429, - "step": 1417 - }, - { - "epoch": 0.44, - "learning_rate": 0.0002960062241428993, - "loss": 4.6, - "step": 1418 - }, - { - "epoch": 
0.44, - "learning_rate": 0.00029597059508366204, - "loss": 4.4403, - "step": 1419 - }, - { - "epoch": 0.44, - "learning_rate": 0.0002959349660244248, - "loss": 4.649, - "step": 1420 - }, - { - "epoch": 0.44, - "learning_rate": 0.0002958993369651876, - "loss": 4.5089, - "step": 1421 - }, - { - "epoch": 0.44, - "learning_rate": 0.0002958637079059504, - "loss": 4.7455, - "step": 1422 - }, - { - "epoch": 0.44, - "learning_rate": 0.00029582807884671316, - "loss": 4.6444, - "step": 1423 - }, - { - "epoch": 0.44, - "learning_rate": 0.0002957924497874759, - "loss": 4.5978, - "step": 1424 - }, - { - "epoch": 0.44, - "learning_rate": 0.00029575682072823867, - "loss": 4.6573, - "step": 1425 - }, - { - "epoch": 0.44, - "learning_rate": 0.00029572119166900145, - "loss": 4.6228, - "step": 1426 - }, - { - "epoch": 0.44, - "learning_rate": 0.0002956855626097642, - "loss": 4.5709, - "step": 1427 - }, - { - "epoch": 0.44, - "learning_rate": 0.000295649933550527, - "loss": 4.3087, - "step": 1428 - }, - { - "epoch": 0.44, - "learning_rate": 0.00029561430449128975, - "loss": 4.3838, - "step": 1429 - }, - { - "epoch": 0.44, - "learning_rate": 0.00029557867543205253, - "loss": 4.5056, - "step": 1430 - }, - { - "epoch": 0.44, - "learning_rate": 0.0002955430463728153, - "loss": 4.6182, - "step": 1431 - }, - { - "epoch": 0.44, - "learning_rate": 0.00029550741731357804, - "loss": 4.1732, - "step": 1432 - }, - { - "epoch": 0.44, - "learning_rate": 0.0002954717882543409, - "loss": 4.3924, - "step": 1433 - }, - { - "epoch": 0.44, - "learning_rate": 0.0002954361591951036, - "loss": 4.4977, - "step": 1434 - }, - { - "epoch": 0.44, - "learning_rate": 0.0002954005301358664, - "loss": 4.1768, - "step": 1435 - }, - { - "epoch": 0.44, - "learning_rate": 0.00029536490107662916, - "loss": 4.2573, - "step": 1436 - }, - { - "epoch": 0.44, - "learning_rate": 0.00029532927201739195, - "loss": 4.229, - "step": 1437 - }, - { - "epoch": 0.44, - "learning_rate": 0.00029529364295815473, - "loss": 4.0335, - "step": 1438 - }, - { - "epoch": 0.44, - "learning_rate": 0.00029525801389891746, - "loss": 3.9274, - "step": 1439 - }, - { - "epoch": 0.44, - "learning_rate": 0.00029522238483968024, - "loss": 4.2552, - "step": 1440 - }, - { - "epoch": 0.44, - "learning_rate": 0.000295186755780443, - "loss": 3.9059, - "step": 1441 - }, - { - "epoch": 0.45, - "learning_rate": 0.0002951511267212058, - "loss": 3.6717, - "step": 1442 - }, - { - "epoch": 0.45, - "learning_rate": 0.0002951154976619686, - "loss": 3.9661, - "step": 1443 - }, - { - "epoch": 0.45, - "learning_rate": 0.0002950798686027313, - "loss": 3.8879, - "step": 1444 - }, - { - "epoch": 0.45, - "learning_rate": 0.0002950442395434941, - "loss": 3.7356, - "step": 1445 - }, - { - "epoch": 0.45, - "learning_rate": 0.0002950086104842569, - "loss": 3.5681, - "step": 1446 - }, - { - "epoch": 0.45, - "learning_rate": 0.00029497298142501966, - "loss": 3.2617, - "step": 1447 - }, - { - "epoch": 0.45, - "learning_rate": 0.0002949373523657824, - "loss": 3.0631, - "step": 1448 - }, - { - "epoch": 0.45, - "learning_rate": 0.00029490172330654517, - "loss": 3.2543, - "step": 1449 - }, - { - "epoch": 0.45, - "learning_rate": 0.00029486609424730795, - "loss": 3.0812, - "step": 1450 - }, - { - "epoch": 0.45, - "learning_rate": 0.00029483046518807073, - "loss": 6.3088, - "step": 1451 - }, - { - "epoch": 0.45, - "learning_rate": 0.0002947948361288335, - "loss": 5.5911, - "step": 1452 - }, - { - "epoch": 0.45, - "learning_rate": 0.00029475920706959624, - "loss": 5.2747, - "step": 1453 - }, - { - "epoch": 0.45, 
- "learning_rate": 0.000294723578010359, - "loss": 5.2358, - "step": 1454 - }, - { - "epoch": 0.45, - "learning_rate": 0.0002946879489511218, - "loss": 4.873, - "step": 1455 - }, - { - "epoch": 0.45, - "learning_rate": 0.0002946523198918846, - "loss": 5.103, - "step": 1456 - }, - { - "epoch": 0.45, - "learning_rate": 0.00029461669083264737, - "loss": 5.0973, - "step": 1457 - }, - { - "epoch": 0.45, - "learning_rate": 0.0002945810617734101, - "loss": 4.6137, - "step": 1458 - }, - { - "epoch": 0.45, - "learning_rate": 0.0002945454327141729, - "loss": 4.5452, - "step": 1459 - }, - { - "epoch": 0.45, - "learning_rate": 0.00029450980365493566, - "loss": 4.8113, - "step": 1460 - }, - { - "epoch": 0.45, - "learning_rate": 0.00029447417459569844, - "loss": 4.831, - "step": 1461 - }, - { - "epoch": 0.45, - "learning_rate": 0.0002944385455364612, - "loss": 4.6025, - "step": 1462 - }, - { - "epoch": 0.45, - "learning_rate": 0.00029440291647722395, - "loss": 4.7336, - "step": 1463 - }, - { - "epoch": 0.45, - "learning_rate": 0.0002943672874179868, - "loss": 4.6847, - "step": 1464 - }, - { - "epoch": 0.45, - "learning_rate": 0.0002943316583587495, - "loss": 4.7134, - "step": 1465 - }, - { - "epoch": 0.45, - "learning_rate": 0.0002942960292995123, - "loss": 4.4742, - "step": 1466 - }, - { - "epoch": 0.45, - "learning_rate": 0.0002942604002402751, - "loss": 4.9778, - "step": 1467 - }, - { - "epoch": 0.45, - "learning_rate": 0.0002942247711810378, - "loss": 4.5412, - "step": 1468 - }, - { - "epoch": 0.45, - "learning_rate": 0.0002941891421218006, - "loss": 4.6059, - "step": 1469 - }, - { - "epoch": 0.45, - "learning_rate": 0.00029415351306256337, - "loss": 4.9422, - "step": 1470 - }, - { - "epoch": 0.45, - "learning_rate": 0.00029411788400332615, - "loss": 4.4398, - "step": 1471 - }, - { - "epoch": 0.45, - "learning_rate": 0.00029408225494408893, - "loss": 4.5057, - "step": 1472 - }, - { - "epoch": 0.45, - "learning_rate": 0.00029404662588485166, - "loss": 4.3073, - "step": 1473 - }, - { - "epoch": 0.45, - "learning_rate": 0.00029401099682561444, - "loss": 4.549, - "step": 1474 - }, - { - "epoch": 0.46, - "learning_rate": 0.0002939753677663772, - "loss": 4.6019, - "step": 1475 - }, - { - "epoch": 0.46, - "learning_rate": 0.00029393973870714, - "loss": 4.6149, - "step": 1476 - }, - { - "epoch": 0.46, - "learning_rate": 0.0002939041096479028, - "loss": 4.4029, - "step": 1477 - }, - { - "epoch": 0.46, - "learning_rate": 0.0002938684805886655, - "loss": 4.283, - "step": 1478 - }, - { - "epoch": 0.46, - "learning_rate": 0.0002938328515294283, - "loss": 4.2084, - "step": 1479 - }, - { - "epoch": 0.46, - "learning_rate": 0.0002937972224701911, - "loss": 4.1736, - "step": 1480 - }, - { - "epoch": 0.46, - "learning_rate": 0.00029376159341095386, - "loss": 4.3942, - "step": 1481 - }, - { - "epoch": 0.46, - "learning_rate": 0.0002937259643517166, - "loss": 4.6199, - "step": 1482 - }, - { - "epoch": 0.46, - "learning_rate": 0.00029369033529247937, - "loss": 4.1039, - "step": 1483 - }, - { - "epoch": 0.46, - "learning_rate": 0.00029365470623324215, - "loss": 4.3398, - "step": 1484 - }, - { - "epoch": 0.46, - "learning_rate": 0.00029361907717400493, - "loss": 4.4076, - "step": 1485 - }, - { - "epoch": 0.46, - "learning_rate": 0.0002935834481147677, - "loss": 4.0263, - "step": 1486 - }, - { - "epoch": 0.46, - "learning_rate": 0.00029354781905553044, - "loss": 4.2947, - "step": 1487 - }, - { - "epoch": 0.46, - "learning_rate": 0.0002935121899962933, - "loss": 3.9938, - "step": 1488 - }, - { - "epoch": 0.46, - 
"learning_rate": 0.000293476560937056, - "loss": 4.0327, - "step": 1489 - }, - { - "epoch": 0.46, - "learning_rate": 0.0002934409318778188, - "loss": 3.8864, - "step": 1490 - }, - { - "epoch": 0.46, - "learning_rate": 0.00029340530281858157, - "loss": 3.6926, - "step": 1491 - }, - { - "epoch": 0.46, - "learning_rate": 0.0002933696737593443, - "loss": 4.0948, - "step": 1492 - }, - { - "epoch": 0.46, - "learning_rate": 0.00029333404470010713, - "loss": 3.7913, - "step": 1493 - }, - { - "epoch": 0.46, - "learning_rate": 0.00029329841564086986, - "loss": 3.4333, - "step": 1494 - }, - { - "epoch": 0.46, - "learning_rate": 0.00029326278658163264, - "loss": 3.5553, - "step": 1495 - }, - { - "epoch": 0.46, - "learning_rate": 0.0002932271575223954, - "loss": 3.4108, - "step": 1496 - }, - { - "epoch": 0.46, - "learning_rate": 0.00029319152846315815, - "loss": 3.3711, - "step": 1497 - }, - { - "epoch": 0.46, - "learning_rate": 0.000293155899403921, - "loss": 3.1167, - "step": 1498 - }, - { - "epoch": 0.46, - "learning_rate": 0.0002931202703446837, - "loss": 3.1449, - "step": 1499 - }, - { - "epoch": 0.46, - "learning_rate": 0.0002930846412854465, - "loss": 2.9879, - "step": 1500 - }, - { - "epoch": 0.46, - "eval_bleu": 0.0, - "eval_loss": 5.218113899230957, - "eval_runtime": 1382.8233, - "eval_samples_per_second": 10.674, - "eval_steps_per_second": 1.334, - "step": 1500 - }, - { - "epoch": 0.46, - "learning_rate": 0.0002930490122262093, - "loss": 5.9271, - "step": 1501 - }, - { - "epoch": 0.46, - "learning_rate": 0.000293013383166972, - "loss": 5.8332, - "step": 1502 - }, - { - "epoch": 0.46, - "learning_rate": 0.0002929777541077348, - "loss": 5.0913, - "step": 1503 - }, - { - "epoch": 0.46, - "learning_rate": 0.0002929421250484976, - "loss": 5.1893, - "step": 1504 - }, - { - "epoch": 0.46, - "learning_rate": 0.00029290649598926035, - "loss": 4.824, - "step": 1505 - }, - { - "epoch": 0.46, - "learning_rate": 0.00029287086693002314, - "loss": 5.0932, - "step": 1506 - }, - { - "epoch": 0.47, - "learning_rate": 0.0002928352378707859, - "loss": 4.6225, - "step": 1507 - }, - { - "epoch": 0.47, - "learning_rate": 0.00029279960881154865, - "loss": 5.0119, - "step": 1508 - }, - { - "epoch": 0.47, - "learning_rate": 0.00029276397975231143, - "loss": 4.5293, - "step": 1509 - }, - { - "epoch": 0.47, - "learning_rate": 0.0002927283506930742, - "loss": 4.4856, - "step": 1510 - }, - { - "epoch": 0.47, - "learning_rate": 0.000292692721633837, - "loss": 4.6485, - "step": 1511 - }, - { - "epoch": 0.47, - "learning_rate": 0.0002926570925745998, - "loss": 4.7876, - "step": 1512 - }, - { - "epoch": 0.47, - "learning_rate": 0.0002926214635153625, - "loss": 4.6034, - "step": 1513 - }, - { - "epoch": 0.47, - "learning_rate": 0.0002925858344561253, - "loss": 4.7034, - "step": 1514 - }, - { - "epoch": 0.47, - "learning_rate": 0.00029255020539688807, - "loss": 4.8002, - "step": 1515 - }, - { - "epoch": 0.47, - "learning_rate": 0.0002925145763376508, - "loss": 4.6793, - "step": 1516 - }, - { - "epoch": 0.47, - "learning_rate": 0.00029247894727841363, - "loss": 4.6064, - "step": 1517 - }, - { - "epoch": 0.47, - "learning_rate": 0.00029244331821917636, - "loss": 4.6033, - "step": 1518 - }, - { - "epoch": 0.47, - "learning_rate": 0.00029240768915993914, - "loss": 4.5903, - "step": 1519 - }, - { - "epoch": 0.47, - "learning_rate": 0.0002923720601007019, - "loss": 4.5567, - "step": 1520 - }, - { - "epoch": 0.47, - "learning_rate": 0.00029233643104146465, - "loss": 3.9217, - "step": 1521 - }, - { - "epoch": 0.47, - "learning_rate": 
0.0002923008019822275, - "loss": 4.3524, - "step": 1522 - }, - { - "epoch": 0.47, - "learning_rate": 0.0002922651729229902, - "loss": 4.4129, - "step": 1523 - }, - { - "epoch": 0.47, - "learning_rate": 0.000292229543863753, - "loss": 4.4007, - "step": 1524 - }, - { - "epoch": 0.47, - "learning_rate": 0.0002921939148045158, - "loss": 4.3499, - "step": 1525 - }, - { - "epoch": 0.47, - "learning_rate": 0.0002921582857452785, - "loss": 4.3925, - "step": 1526 - }, - { - "epoch": 0.47, - "learning_rate": 0.00029212265668604134, - "loss": 4.5935, - "step": 1527 - }, - { - "epoch": 0.47, - "learning_rate": 0.00029208702762680407, - "loss": 4.3094, - "step": 1528 - }, - { - "epoch": 0.47, - "learning_rate": 0.00029205139856756685, - "loss": 4.3955, - "step": 1529 - }, - { - "epoch": 0.47, - "learning_rate": 0.00029201576950832963, - "loss": 4.1156, - "step": 1530 - }, - { - "epoch": 0.47, - "learning_rate": 0.0002919801404490924, - "loss": 4.2628, - "step": 1531 - }, - { - "epoch": 0.47, - "learning_rate": 0.0002919445113898552, - "loss": 4.333, - "step": 1532 - }, - { - "epoch": 0.47, - "learning_rate": 0.0002919088823306179, - "loss": 4.1443, - "step": 1533 - }, - { - "epoch": 0.47, - "learning_rate": 0.0002918732532713807, - "loss": 4.47, - "step": 1534 - }, - { - "epoch": 0.47, - "learning_rate": 0.0002918376242121435, - "loss": 4.2293, - "step": 1535 - }, - { - "epoch": 0.47, - "learning_rate": 0.00029180199515290627, - "loss": 3.9859, - "step": 1536 - }, - { - "epoch": 0.47, - "learning_rate": 0.00029176636609366905, - "loss": 4.0588, - "step": 1537 - }, - { - "epoch": 0.47, - "learning_rate": 0.0002917307370344318, - "loss": 3.9031, - "step": 1538 - }, - { - "epoch": 0.47, - "learning_rate": 0.00029169510797519456, - "loss": 4.1155, - "step": 1539 - }, - { - "epoch": 0.48, - "learning_rate": 0.00029165947891595734, - "loss": 3.6605, - "step": 1540 - }, - { - "epoch": 0.48, - "learning_rate": 0.0002916238498567201, - "loss": 3.9866, - "step": 1541 - }, - { - "epoch": 0.48, - "learning_rate": 0.00029158822079748285, - "loss": 4.113, - "step": 1542 - }, - { - "epoch": 0.48, - "learning_rate": 0.00029155259173824563, - "loss": 3.7653, - "step": 1543 - }, - { - "epoch": 0.48, - "learning_rate": 0.0002915169626790084, - "loss": 3.6642, - "step": 1544 - }, - { - "epoch": 0.48, - "learning_rate": 0.0002914813336197712, - "loss": 3.566, - "step": 1545 - }, - { - "epoch": 0.48, - "learning_rate": 0.000291445704560534, - "loss": 3.7636, - "step": 1546 - }, - { - "epoch": 0.48, - "learning_rate": 0.0002914100755012967, - "loss": 3.2914, - "step": 1547 - }, - { - "epoch": 0.48, - "learning_rate": 0.0002913744464420595, - "loss": 3.3236, - "step": 1548 - }, - { - "epoch": 0.48, - "learning_rate": 0.00029133881738282227, - "loss": 3.2093, - "step": 1549 - }, - { - "epoch": 0.48, - "learning_rate": 0.000291303188323585, - "loss": 2.8138, - "step": 1550 - }, - { - "epoch": 0.48, - "learning_rate": 0.00029126755926434783, - "loss": 6.0967, - "step": 1551 - }, - { - "epoch": 0.48, - "learning_rate": 0.00029123193020511056, - "loss": 5.6046, - "step": 1552 - }, - { - "epoch": 0.48, - "learning_rate": 0.00029119630114587334, - "loss": 5.2391, - "step": 1553 - }, - { - "epoch": 0.48, - "learning_rate": 0.0002911606720866361, - "loss": 5.0387, - "step": 1554 - }, - { - "epoch": 0.48, - "learning_rate": 0.0002911250430273989, - "loss": 4.8765, - "step": 1555 - }, - { - "epoch": 0.48, - "learning_rate": 0.0002910894139681617, - "loss": 5.0187, - "step": 1556 - }, - { - "epoch": 0.48, - "learning_rate": 
0.0002910537849089244, - "loss": 4.8688, - "step": 1557 - }, - { - "epoch": 0.48, - "learning_rate": 0.00029101815584968725, - "loss": 4.7368, - "step": 1558 - }, - { - "epoch": 0.48, - "learning_rate": 0.00029098252679045, - "loss": 4.6912, - "step": 1559 - }, - { - "epoch": 0.48, - "learning_rate": 0.00029094689773121276, - "loss": 4.6317, - "step": 1560 - }, - { - "epoch": 0.48, - "learning_rate": 0.00029091126867197554, - "loss": 4.5266, - "step": 1561 - }, - { - "epoch": 0.48, - "learning_rate": 0.00029087563961273827, - "loss": 4.8036, - "step": 1562 - }, - { - "epoch": 0.48, - "learning_rate": 0.00029084001055350105, - "loss": 4.3639, - "step": 1563 - }, - { - "epoch": 0.48, - "learning_rate": 0.00029080438149426383, - "loss": 4.8885, - "step": 1564 - }, - { - "epoch": 0.48, - "learning_rate": 0.0002907687524350266, - "loss": 4.8304, - "step": 1565 - }, - { - "epoch": 0.48, - "learning_rate": 0.0002907331233757894, - "loss": 4.4184, - "step": 1566 - }, - { - "epoch": 0.48, - "learning_rate": 0.0002906974943165521, - "loss": 4.7267, - "step": 1567 - }, - { - "epoch": 0.48, - "learning_rate": 0.0002906618652573149, - "loss": 4.4361, - "step": 1568 - }, - { - "epoch": 0.48, - "learning_rate": 0.0002906262361980777, - "loss": 4.85, - "step": 1569 - }, - { - "epoch": 0.48, - "learning_rate": 0.00029059060713884047, - "loss": 4.6357, - "step": 1570 - }, - { - "epoch": 0.48, - "learning_rate": 0.00029055497807960325, - "loss": 4.4048, - "step": 1571 - }, - { - "epoch": 0.49, - "learning_rate": 0.000290519349020366, - "loss": 4.3945, - "step": 1572 - }, - { - "epoch": 0.49, - "learning_rate": 0.00029048371996112876, - "loss": 4.4739, - "step": 1573 - }, - { - "epoch": 0.49, - "learning_rate": 0.00029044809090189155, - "loss": 4.4761, - "step": 1574 - }, - { - "epoch": 0.49, - "learning_rate": 0.00029041246184265433, - "loss": 4.2886, - "step": 1575 - }, - { - "epoch": 0.49, - "learning_rate": 0.00029037683278341705, - "loss": 4.2753, - "step": 1576 - }, - { - "epoch": 0.49, - "learning_rate": 0.00029034120372417984, - "loss": 4.3819, - "step": 1577 - }, - { - "epoch": 0.49, - "learning_rate": 0.0002903055746649426, - "loss": 4.2209, - "step": 1578 - }, - { - "epoch": 0.49, - "learning_rate": 0.0002902699456057054, - "loss": 4.4748, - "step": 1579 - }, - { - "epoch": 0.49, - "learning_rate": 0.0002902343165464682, - "loss": 4.6274, - "step": 1580 - }, - { - "epoch": 0.49, - "learning_rate": 0.0002901986874872309, - "loss": 4.6299, - "step": 1581 - }, - { - "epoch": 0.49, - "learning_rate": 0.00029016305842799375, - "loss": 4.2846, - "step": 1582 - }, - { - "epoch": 0.49, - "learning_rate": 0.0002901274293687565, - "loss": 4.3461, - "step": 1583 - }, - { - "epoch": 0.49, - "learning_rate": 0.00029009180030951926, - "loss": 4.05, - "step": 1584 - }, - { - "epoch": 0.49, - "learning_rate": 0.00029005617125028204, - "loss": 4.0239, - "step": 1585 - }, - { - "epoch": 0.49, - "learning_rate": 0.00029002054219104477, - "loss": 4.0345, - "step": 1586 - }, - { - "epoch": 0.49, - "learning_rate": 0.0002899849131318076, - "loss": 3.9146, - "step": 1587 - }, - { - "epoch": 0.49, - "learning_rate": 0.00028994928407257033, - "loss": 4.1882, - "step": 1588 - }, - { - "epoch": 0.49, - "learning_rate": 0.0002899136550133331, - "loss": 3.9867, - "step": 1589 - }, - { - "epoch": 0.49, - "learning_rate": 0.0002898780259540959, - "loss": 4.0643, - "step": 1590 - }, - { - "epoch": 0.49, - "learning_rate": 0.0002898423968948586, - "loss": 3.8283, - "step": 1591 - }, - { - "epoch": 0.49, - "learning_rate": 
0.00028980676783562146, - "loss": 3.8347, - "step": 1592 - }, - { - "epoch": 0.49, - "learning_rate": 0.0002897711387763842, - "loss": 3.8835, - "step": 1593 - }, - { - "epoch": 0.49, - "learning_rate": 0.00028973550971714697, - "loss": 3.613, - "step": 1594 - }, - { - "epoch": 0.49, - "learning_rate": 0.00028969988065790975, - "loss": 3.7456, - "step": 1595 - }, - { - "epoch": 0.49, - "learning_rate": 0.0002896642515986725, - "loss": 3.4203, - "step": 1596 - }, - { - "epoch": 0.49, - "learning_rate": 0.00028962862253943526, - "loss": 3.4784, - "step": 1597 - }, - { - "epoch": 0.49, - "learning_rate": 0.00028959299348019804, - "loss": 3.2668, - "step": 1598 - }, - { - "epoch": 0.49, - "learning_rate": 0.0002895573644209608, - "loss": 3.1252, - "step": 1599 - }, - { - "epoch": 0.49, - "learning_rate": 0.0002895217353617236, - "loss": 2.9936, - "step": 1600 - }, - { - "epoch": 0.49, - "learning_rate": 0.0002894861063024864, - "loss": 6.226, - "step": 1601 - }, - { - "epoch": 0.49, - "learning_rate": 0.0002894504772432491, - "loss": 5.4487, - "step": 1602 - }, - { - "epoch": 0.49, - "learning_rate": 0.0002894148481840119, - "loss": 5.1382, - "step": 1603 - }, - { - "epoch": 0.5, - "learning_rate": 0.0002893792191247747, - "loss": 5.2172, - "step": 1604 - }, - { - "epoch": 0.5, - "learning_rate": 0.00028934359006553746, - "loss": 5.1825, - "step": 1605 - }, - { - "epoch": 0.5, - "learning_rate": 0.00028930796100630024, - "loss": 4.8012, - "step": 1606 - }, - { - "epoch": 0.5, - "learning_rate": 0.00028927233194706297, - "loss": 4.7798, - "step": 1607 - }, - { - "epoch": 0.5, - "learning_rate": 0.00028923670288782575, - "loss": 4.6503, - "step": 1608 - }, - { - "epoch": 0.5, - "learning_rate": 0.00028920107382858853, - "loss": 4.6209, - "step": 1609 - }, - { - "epoch": 0.5, - "learning_rate": 0.00028916544476935126, - "loss": 4.8553, - "step": 1610 - }, - { - "epoch": 0.5, - "learning_rate": 0.0002891298157101141, - "loss": 4.4706, - "step": 1611 - }, - { - "epoch": 0.5, - "learning_rate": 0.0002890941866508768, - "loss": 4.6565, - "step": 1612 - }, - { - "epoch": 0.5, - "learning_rate": 0.0002890585575916396, - "loss": 4.6162, - "step": 1613 - }, - { - "epoch": 0.5, - "learning_rate": 0.0002890229285324024, - "loss": 4.7839, - "step": 1614 - }, - { - "epoch": 0.5, - "learning_rate": 0.0002889872994731651, - "loss": 4.62, - "step": 1615 - }, - { - "epoch": 0.5, - "learning_rate": 0.00028895167041392795, - "loss": 4.6899, - "step": 1616 - }, - { - "epoch": 0.5, - "learning_rate": 0.0002889160413546907, - "loss": 4.6458, - "step": 1617 - }, - { - "epoch": 0.5, - "learning_rate": 0.00028888041229545346, - "loss": 4.2545, - "step": 1618 - }, - { - "epoch": 0.5, - "learning_rate": 0.00028884478323621624, - "loss": 4.5741, - "step": 1619 - }, - { - "epoch": 0.5, - "learning_rate": 0.00028880915417697897, - "loss": 4.7165, - "step": 1620 - }, - { - "epoch": 0.5, - "learning_rate": 0.0002887735251177418, - "loss": 4.5225, - "step": 1621 - }, - { - "epoch": 0.5, - "learning_rate": 0.00028873789605850453, - "loss": 4.6099, - "step": 1622 - }, - { - "epoch": 0.5, - "learning_rate": 0.0002887022669992673, - "loss": 4.3643, - "step": 1623 - }, - { - "epoch": 0.5, - "learning_rate": 0.0002886666379400301, - "loss": 4.7348, - "step": 1624 - }, - { - "epoch": 0.5, - "learning_rate": 0.0002886310088807929, - "loss": 4.4335, - "step": 1625 - }, - { - "epoch": 0.5, - "learning_rate": 0.00028859537982155566, - "loss": 4.3101, - "step": 1626 - }, - { - "epoch": 0.5, - "learning_rate": 0.0002885597507623184, - 
"loss": 4.5908, - "step": 1627 - }, - { - "epoch": 0.5, - "learning_rate": 0.00028852412170308117, - "loss": 4.6177, - "step": 1628 - }, - { - "epoch": 0.5, - "learning_rate": 0.00028848849264384395, - "loss": 4.4698, - "step": 1629 - }, - { - "epoch": 0.5, - "learning_rate": 0.00028845286358460673, - "loss": 4.4572, - "step": 1630 - }, - { - "epoch": 0.5, - "learning_rate": 0.00028841723452536946, - "loss": 4.3128, - "step": 1631 - }, - { - "epoch": 0.5, - "learning_rate": 0.00028838160546613224, - "loss": 4.3521, - "step": 1632 - }, - { - "epoch": 0.5, - "learning_rate": 0.000288345976406895, - "loss": 4.317, - "step": 1633 - }, - { - "epoch": 0.5, - "learning_rate": 0.0002883103473476578, - "loss": 4.3642, - "step": 1634 - }, - { - "epoch": 0.5, - "learning_rate": 0.0002882747182884206, - "loss": 4.1254, - "step": 1635 - }, - { - "epoch": 0.5, - "learning_rate": 0.0002882390892291833, - "loss": 4.0972, - "step": 1636 - }, - { - "epoch": 0.51, - "learning_rate": 0.0002882034601699461, - "loss": 4.0258, - "step": 1637 - }, - { - "epoch": 0.51, - "learning_rate": 0.0002881678311107089, - "loss": 3.9691, - "step": 1638 - }, - { - "epoch": 0.51, - "learning_rate": 0.00028813220205147166, - "loss": 4.1063, - "step": 1639 - }, - { - "epoch": 0.51, - "learning_rate": 0.00028809657299223444, - "loss": 3.9501, - "step": 1640 - }, - { - "epoch": 0.51, - "learning_rate": 0.00028806094393299717, - "loss": 4.1064, - "step": 1641 - }, - { - "epoch": 0.51, - "learning_rate": 0.00028802531487375995, - "loss": 3.8949, - "step": 1642 - }, - { - "epoch": 0.51, - "learning_rate": 0.00028798968581452274, - "loss": 3.9081, - "step": 1643 - }, - { - "epoch": 0.51, - "learning_rate": 0.0002879540567552855, - "loss": 3.7898, - "step": 1644 - }, - { - "epoch": 0.51, - "learning_rate": 0.0002879184276960483, - "loss": 3.4469, - "step": 1645 - }, - { - "epoch": 0.51, - "learning_rate": 0.00028788279863681103, - "loss": 3.3873, - "step": 1646 - }, - { - "epoch": 0.51, - "learning_rate": 0.0002878471695775738, - "loss": 3.2614, - "step": 1647 - }, - { - "epoch": 0.51, - "learning_rate": 0.0002878115405183366, - "loss": 3.0555, - "step": 1648 - }, - { - "epoch": 0.51, - "learning_rate": 0.00028777591145909937, - "loss": 2.9158, - "step": 1649 - }, - { - "epoch": 0.51, - "learning_rate": 0.00028774028239986215, - "loss": 2.9319, - "step": 1650 - }, - { - "epoch": 0.51, - "learning_rate": 0.0002877046533406249, - "loss": 5.8169, - "step": 1651 - }, - { - "epoch": 0.51, - "learning_rate": 0.0002876690242813877, - "loss": 5.9246, - "step": 1652 - }, - { - "epoch": 0.51, - "learning_rate": 0.00028763339522215045, - "loss": 5.3571, - "step": 1653 - }, - { - "epoch": 0.51, - "learning_rate": 0.00028759776616291323, - "loss": 5.0096, - "step": 1654 - }, - { - "epoch": 0.51, - "learning_rate": 0.000287562137103676, - "loss": 5.0218, - "step": 1655 - }, - { - "epoch": 0.51, - "learning_rate": 0.00028752650804443874, - "loss": 4.7095, - "step": 1656 - }, - { - "epoch": 0.51, - "learning_rate": 0.0002874908789852015, - "loss": 4.9994, - "step": 1657 - }, - { - "epoch": 0.51, - "learning_rate": 0.0002874552499259643, - "loss": 4.7598, - "step": 1658 - }, - { - "epoch": 0.51, - "learning_rate": 0.0002874196208667271, - "loss": 4.6655, - "step": 1659 - }, - { - "epoch": 0.51, - "learning_rate": 0.00028738399180748987, - "loss": 4.5321, - "step": 1660 - }, - { - "epoch": 0.51, - "learning_rate": 0.0002873483627482526, - "loss": 4.6492, - "step": 1661 - }, - { - "epoch": 0.51, - "learning_rate": 0.0002873127336890154, - "loss": 
4.8681, - "step": 1662 - }, - { - "epoch": 0.51, - "learning_rate": 0.00028727710462977816, - "loss": 4.5626, - "step": 1663 - }, - { - "epoch": 0.51, - "learning_rate": 0.00028724147557054094, - "loss": 4.5711, - "step": 1664 - }, - { - "epoch": 0.51, - "learning_rate": 0.00028720584651130367, - "loss": 4.7182, - "step": 1665 - }, - { - "epoch": 0.51, - "learning_rate": 0.00028717021745206645, - "loss": 4.6934, - "step": 1666 - }, - { - "epoch": 0.51, - "learning_rate": 0.00028713458839282923, - "loss": 4.875, - "step": 1667 - }, - { - "epoch": 0.51, - "learning_rate": 0.000287098959333592, - "loss": 4.2952, - "step": 1668 - }, - { - "epoch": 0.52, - "learning_rate": 0.0002870633302743548, - "loss": 4.6768, - "step": 1669 - }, - { - "epoch": 0.52, - "learning_rate": 0.0002870277012151175, - "loss": 4.4892, - "step": 1670 - }, - { - "epoch": 0.52, - "learning_rate": 0.0002869920721558803, - "loss": 4.7037, - "step": 1671 - }, - { - "epoch": 0.52, - "learning_rate": 0.0002869564430966431, - "loss": 4.2983, - "step": 1672 - }, - { - "epoch": 0.52, - "learning_rate": 0.00028692081403740587, - "loss": 4.5209, - "step": 1673 - }, - { - "epoch": 0.52, - "learning_rate": 0.00028688518497816865, - "loss": 4.6206, - "step": 1674 - }, - { - "epoch": 0.52, - "learning_rate": 0.0002868495559189314, - "loss": 4.2462, - "step": 1675 - }, - { - "epoch": 0.52, - "learning_rate": 0.0002868139268596942, - "loss": 4.2332, - "step": 1676 - }, - { - "epoch": 0.52, - "learning_rate": 0.00028677829780045694, - "loss": 4.5599, - "step": 1677 - }, - { - "epoch": 0.52, - "learning_rate": 0.0002867426687412197, - "loss": 4.6862, - "step": 1678 - }, - { - "epoch": 0.52, - "learning_rate": 0.0002867070396819825, - "loss": 4.1977, - "step": 1679 - }, - { - "epoch": 0.52, - "learning_rate": 0.00028667141062274523, - "loss": 4.4284, - "step": 1680 - }, - { - "epoch": 0.52, - "learning_rate": 0.00028663578156350807, - "loss": 4.7382, - "step": 1681 - }, - { - "epoch": 0.52, - "learning_rate": 0.0002866001525042708, - "loss": 4.3225, - "step": 1682 - }, - { - "epoch": 0.52, - "learning_rate": 0.0002865645234450336, - "loss": 4.1831, - "step": 1683 - }, - { - "epoch": 0.52, - "learning_rate": 0.00028652889438579636, - "loss": 4.2036, - "step": 1684 - }, - { - "epoch": 0.52, - "learning_rate": 0.0002864932653265591, - "loss": 4.1341, - "step": 1685 - }, - { - "epoch": 0.52, - "learning_rate": 0.0002864576362673219, - "loss": 4.1981, - "step": 1686 - }, - { - "epoch": 0.52, - "learning_rate": 0.00028642200720808465, - "loss": 4.2529, - "step": 1687 - }, - { - "epoch": 0.52, - "learning_rate": 0.00028638637814884743, - "loss": 3.9305, - "step": 1688 - }, - { - "epoch": 0.52, - "learning_rate": 0.0002863507490896102, - "loss": 3.8363, - "step": 1689 - }, - { - "epoch": 0.52, - "learning_rate": 0.00028631512003037294, - "loss": 3.8006, - "step": 1690 - }, - { - "epoch": 0.52, - "learning_rate": 0.0002862794909711357, - "loss": 3.9113, - "step": 1691 - }, - { - "epoch": 0.52, - "learning_rate": 0.0002862438619118985, - "loss": 3.8559, - "step": 1692 - }, - { - "epoch": 0.52, - "learning_rate": 0.0002862082328526613, - "loss": 3.8625, - "step": 1693 - }, - { - "epoch": 0.52, - "learning_rate": 0.00028617260379342407, - "loss": 3.671, - "step": 1694 - }, - { - "epoch": 0.52, - "learning_rate": 0.00028613697473418685, - "loss": 3.5207, - "step": 1695 - }, - { - "epoch": 0.52, - "learning_rate": 0.0002861013456749496, - "loss": 3.4143, - "step": 1696 - }, - { - "epoch": 0.52, - "learning_rate": 0.00028606571661571236, - "loss": 
3.3294, - "step": 1697 - }, - { - "epoch": 0.52, - "learning_rate": 0.00028603008755647514, - "loss": 3.2698, - "step": 1698 - }, - { - "epoch": 0.52, - "learning_rate": 0.00028599445849723787, - "loss": 2.8946, - "step": 1699 - }, - { - "epoch": 0.52, - "learning_rate": 0.0002859588294380007, - "loss": 2.8252, - "step": 1700 - }, - { - "epoch": 0.52, - "learning_rate": 0.00028592320037876343, - "loss": 5.9559, - "step": 1701 - }, - { - "epoch": 0.53, - "learning_rate": 0.0002858875713195262, - "loss": 5.4915, - "step": 1702 - }, - { - "epoch": 0.53, - "learning_rate": 0.000285851942260289, - "loss": 5.3747, - "step": 1703 - }, - { - "epoch": 0.53, - "learning_rate": 0.0002858163132010517, - "loss": 5.3121, - "step": 1704 - }, - { - "epoch": 0.53, - "learning_rate": 0.00028578068414181456, - "loss": 4.8444, - "step": 1705 - }, - { - "epoch": 0.53, - "learning_rate": 0.0002857450550825773, - "loss": 5.0988, - "step": 1706 - }, - { - "epoch": 0.53, - "learning_rate": 0.00028570942602334007, - "loss": 4.8321, - "step": 1707 - }, - { - "epoch": 0.53, - "learning_rate": 0.00028567379696410285, - "loss": 4.4809, - "step": 1708 - }, - { - "epoch": 0.53, - "learning_rate": 0.0002856381679048656, - "loss": 4.5086, - "step": 1709 - }, - { - "epoch": 0.53, - "learning_rate": 0.0002856025388456284, - "loss": 4.7842, - "step": 1710 - }, - { - "epoch": 0.53, - "learning_rate": 0.00028556690978639114, - "loss": 4.7684, - "step": 1711 - }, - { - "epoch": 0.53, - "learning_rate": 0.0002855312807271539, - "loss": 4.5887, - "step": 1712 - }, - { - "epoch": 0.53, - "learning_rate": 0.0002854956516679167, - "loss": 4.672, - "step": 1713 - }, - { - "epoch": 0.53, - "learning_rate": 0.00028546002260867944, - "loss": 4.4023, - "step": 1714 - }, - { - "epoch": 0.53, - "learning_rate": 0.00028542439354944227, - "loss": 4.4104, - "step": 1715 - }, - { - "epoch": 0.53, - "learning_rate": 0.000285388764490205, - "loss": 4.6319, - "step": 1716 - }, - { - "epoch": 0.53, - "learning_rate": 0.0002853531354309678, - "loss": 4.5523, - "step": 1717 - }, - { - "epoch": 0.53, - "learning_rate": 0.00028531750637173056, - "loss": 4.7205, - "step": 1718 - }, - { - "epoch": 0.53, - "learning_rate": 0.00028528187731249335, - "loss": 4.4206, - "step": 1719 - }, - { - "epoch": 0.53, - "learning_rate": 0.0002852462482532561, - "loss": 4.7251, - "step": 1720 - }, - { - "epoch": 0.53, - "learning_rate": 0.00028521061919401885, - "loss": 4.491, - "step": 1721 - }, - { - "epoch": 0.53, - "learning_rate": 0.00028517499013478164, - "loss": 4.2801, - "step": 1722 - }, - { - "epoch": 0.53, - "learning_rate": 0.0002851393610755444, - "loss": 4.3607, - "step": 1723 - }, - { - "epoch": 0.53, - "learning_rate": 0.0002851037320163072, - "loss": 4.2176, - "step": 1724 - }, - { - "epoch": 0.53, - "learning_rate": 0.00028506810295706993, - "loss": 4.4153, - "step": 1725 - }, - { - "epoch": 0.53, - "learning_rate": 0.0002850324738978327, - "loss": 4.3146, - "step": 1726 - }, - { - "epoch": 0.53, - "learning_rate": 0.0002849968448385955, - "loss": 4.4574, - "step": 1727 - }, - { - "epoch": 0.53, - "learning_rate": 0.0002849612157793583, - "loss": 4.2592, - "step": 1728 - }, - { - "epoch": 0.53, - "learning_rate": 0.00028492558672012106, - "loss": 4.4472, - "step": 1729 - }, - { - "epoch": 0.53, - "learning_rate": 0.0002848899576608838, - "loss": 4.4364, - "step": 1730 - }, - { - "epoch": 0.53, - "learning_rate": 0.00028485432860164656, - "loss": 4.5569, - "step": 1731 - }, - { - "epoch": 0.53, - "learning_rate": 0.00028481869954240935, - "loss": 
4.3802, - "step": 1732 - }, - { - "epoch": 0.53, - "learning_rate": 0.00028478307048317213, - "loss": 4.2796, - "step": 1733 - }, - { - "epoch": 0.54, - "learning_rate": 0.0002847474414239349, - "loss": 4.095, - "step": 1734 - }, - { - "epoch": 0.54, - "learning_rate": 0.00028471181236469764, - "loss": 4.1858, - "step": 1735 - }, - { - "epoch": 0.54, - "learning_rate": 0.0002846761833054604, - "loss": 4.1695, - "step": 1736 - }, - { - "epoch": 0.54, - "learning_rate": 0.0002846405542462232, - "loss": 4.0288, - "step": 1737 - }, - { - "epoch": 0.54, - "learning_rate": 0.000284604925186986, - "loss": 4.1276, - "step": 1738 - }, - { - "epoch": 0.54, - "learning_rate": 0.00028456929612774877, - "loss": 3.8841, - "step": 1739 - }, - { - "epoch": 0.54, - "learning_rate": 0.0002845336670685115, - "loss": 3.8218, - "step": 1740 - }, - { - "epoch": 0.54, - "learning_rate": 0.0002844980380092743, - "loss": 3.7556, - "step": 1741 - }, - { - "epoch": 0.54, - "learning_rate": 0.00028446240895003706, - "loss": 3.864, - "step": 1742 - }, - { - "epoch": 0.54, - "learning_rate": 0.00028442677989079984, - "loss": 3.846, - "step": 1743 - }, - { - "epoch": 0.54, - "learning_rate": 0.0002843911508315626, - "loss": 3.6336, - "step": 1744 - }, - { - "epoch": 0.54, - "learning_rate": 0.00028435552177232535, - "loss": 3.4695, - "step": 1745 - }, - { - "epoch": 0.54, - "learning_rate": 0.00028431989271308813, - "loss": 3.632, - "step": 1746 - }, - { - "epoch": 0.54, - "learning_rate": 0.0002842842636538509, - "loss": 3.4329, - "step": 1747 - }, - { - "epoch": 0.54, - "learning_rate": 0.0002842486345946137, - "loss": 3.6854, - "step": 1748 - }, - { - "epoch": 0.54, - "learning_rate": 0.0002842130055353765, - "loss": 3.0649, - "step": 1749 - }, - { - "epoch": 0.54, - "learning_rate": 0.0002841773764761392, - "loss": 2.9265, - "step": 1750 - }, - { - "epoch": 0.54, - "learning_rate": 0.000284141747416902, - "loss": 5.4338, - "step": 1751 - }, - { - "epoch": 0.54, - "learning_rate": 0.00028410611835766477, - "loss": 5.2903, - "step": 1752 - }, - { - "epoch": 0.54, - "learning_rate": 0.00028407048929842755, - "loss": 4.8795, - "step": 1753 - }, - { - "epoch": 0.54, - "learning_rate": 0.00028403486023919033, - "loss": 4.7638, - "step": 1754 - }, - { - "epoch": 0.54, - "learning_rate": 0.00028399923117995306, - "loss": 5.0412, - "step": 1755 - }, - { - "epoch": 0.54, - "learning_rate": 0.00028396360212071584, - "loss": 4.8736, - "step": 1756 - }, - { - "epoch": 0.54, - "learning_rate": 0.0002839279730614786, - "loss": 4.8018, - "step": 1757 - }, - { - "epoch": 0.54, - "learning_rate": 0.0002838923440022414, - "loss": 5.0769, - "step": 1758 - }, - { - "epoch": 0.54, - "learning_rate": 0.00028385671494300413, - "loss": 4.4701, - "step": 1759 - }, - { - "epoch": 0.54, - "learning_rate": 0.0002838210858837669, - "loss": 4.4708, - "step": 1760 - }, - { - "epoch": 0.54, - "learning_rate": 0.0002837854568245297, - "loss": 4.6064, - "step": 1761 - }, - { - "epoch": 0.54, - "learning_rate": 0.0002837498277652925, - "loss": 4.8933, - "step": 1762 - }, - { - "epoch": 0.54, - "learning_rate": 0.00028371419870605526, - "loss": 4.5085, - "step": 1763 - }, - { - "epoch": 0.54, - "learning_rate": 0.000283678569646818, - "loss": 4.4442, - "step": 1764 - }, - { - "epoch": 0.54, - "learning_rate": 0.00028364294058758077, - "loss": 4.8048, - "step": 1765 - }, - { - "epoch": 0.55, - "learning_rate": 0.00028360731152834355, - "loss": 4.7817, - "step": 1766 - }, - { - "epoch": 0.55, - "learning_rate": 0.00028357168246910633, - "loss": 4.5038, - 
"step": 1767 - }, - { - "epoch": 0.55, - "learning_rate": 0.0002835360534098691, - "loss": 4.5987, - "step": 1768 - }, - { - "epoch": 0.55, - "learning_rate": 0.00028350042435063184, - "loss": 4.8134, - "step": 1769 - }, - { - "epoch": 0.55, - "learning_rate": 0.0002834647952913947, - "loss": 4.5126, - "step": 1770 - }, - { - "epoch": 0.55, - "learning_rate": 0.0002834291662321574, - "loss": 4.2741, - "step": 1771 - }, - { - "epoch": 0.55, - "learning_rate": 0.0002833935371729202, - "loss": 4.3485, - "step": 1772 - }, - { - "epoch": 0.55, - "learning_rate": 0.00028335790811368297, - "loss": 4.3584, - "step": 1773 - }, - { - "epoch": 0.55, - "learning_rate": 0.0002833222790544457, - "loss": 4.4953, - "step": 1774 - }, - { - "epoch": 0.55, - "learning_rate": 0.00028328664999520853, - "loss": 4.3292, - "step": 1775 - }, - { - "epoch": 0.55, - "learning_rate": 0.00028325102093597126, - "loss": 4.5157, - "step": 1776 - }, - { - "epoch": 0.55, - "learning_rate": 0.00028321539187673404, - "loss": 4.122, - "step": 1777 - }, - { - "epoch": 0.55, - "learning_rate": 0.0002831797628174968, - "loss": 4.4656, - "step": 1778 - }, - { - "epoch": 0.55, - "learning_rate": 0.00028314413375825955, - "loss": 4.3855, - "step": 1779 - }, - { - "epoch": 0.55, - "learning_rate": 0.00028310850469902233, - "loss": 4.2659, - "step": 1780 - }, - { - "epoch": 0.55, - "learning_rate": 0.0002830728756397851, - "loss": 4.1749, - "step": 1781 - }, - { - "epoch": 0.55, - "learning_rate": 0.0002830372465805479, - "loss": 4.2162, - "step": 1782 - }, - { - "epoch": 0.55, - "learning_rate": 0.0002830016175213107, - "loss": 4.0078, - "step": 1783 - }, - { - "epoch": 0.55, - "learning_rate": 0.0002829659884620734, - "loss": 4.3881, - "step": 1784 - }, - { - "epoch": 0.55, - "learning_rate": 0.0002829303594028362, - "loss": 3.9709, - "step": 1785 - }, - { - "epoch": 0.55, - "learning_rate": 0.00028289473034359897, - "loss": 4.1335, - "step": 1786 - }, - { - "epoch": 0.55, - "learning_rate": 0.00028285910128436175, - "loss": 4.0052, - "step": 1787 - }, - { - "epoch": 0.55, - "learning_rate": 0.00028282347222512454, - "loss": 4.0536, - "step": 1788 - }, - { - "epoch": 0.55, - "learning_rate": 0.0002827878431658873, - "loss": 3.8297, - "step": 1789 - }, - { - "epoch": 0.55, - "learning_rate": 0.00028275221410665005, - "loss": 3.857, - "step": 1790 - }, - { - "epoch": 0.55, - "learning_rate": 0.0002827165850474128, - "loss": 3.471, - "step": 1791 - }, - { - "epoch": 0.55, - "learning_rate": 0.0002826809559881756, - "loss": 3.8361, - "step": 1792 - }, - { - "epoch": 0.55, - "learning_rate": 0.00028264532692893834, - "loss": 3.8958, - "step": 1793 - }, - { - "epoch": 0.55, - "learning_rate": 0.00028260969786970117, - "loss": 4.051, - "step": 1794 - }, - { - "epoch": 0.55, - "learning_rate": 0.0002825740688104639, - "loss": 3.5728, - "step": 1795 - }, - { - "epoch": 0.55, - "learning_rate": 0.0002825384397512267, - "loss": 3.3148, - "step": 1796 - }, - { - "epoch": 0.55, - "learning_rate": 0.00028250281069198946, - "loss": 3.6328, - "step": 1797 - }, - { - "epoch": 0.55, - "learning_rate": 0.0002824671816327522, - "loss": 3.0205, - "step": 1798 - }, - { - "epoch": 0.56, - "learning_rate": 0.00028243155257351503, - "loss": 3.0506, - "step": 1799 - }, - { - "epoch": 0.56, - "learning_rate": 0.00028239592351427776, - "loss": 3.0353, - "step": 1800 - }, - { - "epoch": 0.56, - "learning_rate": 0.00028236029445504054, - "loss": 6.1927, - "step": 1801 - }, - { - "epoch": 0.56, - "learning_rate": 0.0002823246653958033, - "loss": 5.62, - "step": 
1802 - }, - { - "epoch": 0.56, - "learning_rate": 0.00028228903633656605, - "loss": 5.6133, - "step": 1803 - }, - { - "epoch": 0.56, - "learning_rate": 0.0002822534072773289, - "loss": 5.0318, - "step": 1804 - }, - { - "epoch": 0.56, - "learning_rate": 0.0002822177782180916, - "loss": 5.2845, - "step": 1805 - }, - { - "epoch": 0.56, - "learning_rate": 0.0002821821491588544, - "loss": 4.7732, - "step": 1806 - }, - { - "epoch": 0.56, - "learning_rate": 0.0002821465200996172, - "loss": 4.4498, - "step": 1807 - }, - { - "epoch": 0.56, - "learning_rate": 0.0002821108910403799, - "loss": 4.3548, - "step": 1808 - }, - { - "epoch": 0.56, - "learning_rate": 0.00028207526198114274, - "loss": 4.7838, - "step": 1809 - }, - { - "epoch": 0.56, - "learning_rate": 0.00028203963292190547, - "loss": 4.577, - "step": 1810 - }, - { - "epoch": 0.56, - "learning_rate": 0.00028200400386266825, - "loss": 4.9805, - "step": 1811 - }, - { - "epoch": 0.56, - "learning_rate": 0.00028196837480343103, - "loss": 4.6141, - "step": 1812 - }, - { - "epoch": 0.56, - "learning_rate": 0.0002819327457441938, - "loss": 4.6616, - "step": 1813 - }, - { - "epoch": 0.56, - "learning_rate": 0.0002818971166849566, - "loss": 4.3279, - "step": 1814 - }, - { - "epoch": 0.56, - "learning_rate": 0.0002818614876257193, - "loss": 4.5409, - "step": 1815 - }, - { - "epoch": 0.56, - "learning_rate": 0.0002818258585664821, - "loss": 4.384, - "step": 1816 - }, - { - "epoch": 0.56, - "learning_rate": 0.0002817902295072449, - "loss": 4.2928, - "step": 1817 - }, - { - "epoch": 0.56, - "learning_rate": 0.00028175460044800767, - "loss": 4.5664, - "step": 1818 - }, - { - "epoch": 0.56, - "learning_rate": 0.0002817189713887704, - "loss": 4.3505, - "step": 1819 - }, - { - "epoch": 0.56, - "learning_rate": 0.0002816833423295332, - "loss": 4.4975, - "step": 1820 - }, - { - "epoch": 0.56, - "learning_rate": 0.00028164771327029596, - "loss": 4.2909, - "step": 1821 - }, - { - "epoch": 0.56, - "learning_rate": 0.00028161208421105874, - "loss": 4.7649, - "step": 1822 - }, - { - "epoch": 0.56, - "learning_rate": 0.0002815764551518215, - "loss": 4.5508, - "step": 1823 - }, - { - "epoch": 0.56, - "learning_rate": 0.00028154082609258425, - "loss": 4.4209, - "step": 1824 - }, - { - "epoch": 0.56, - "learning_rate": 0.00028150519703334703, - "loss": 4.1213, - "step": 1825 - }, - { - "epoch": 0.56, - "learning_rate": 0.0002814695679741098, - "loss": 4.3874, - "step": 1826 - }, - { - "epoch": 0.56, - "learning_rate": 0.00028143393891487254, - "loss": 4.3513, - "step": 1827 - }, - { - "epoch": 0.56, - "learning_rate": 0.0002813983098556354, - "loss": 4.367, - "step": 1828 - }, - { - "epoch": 0.56, - "learning_rate": 0.0002813626807963981, - "loss": 4.416, - "step": 1829 - }, - { - "epoch": 0.56, - "learning_rate": 0.0002813270517371609, - "loss": 4.1103, - "step": 1830 - }, - { - "epoch": 0.57, - "learning_rate": 0.00028129142267792367, - "loss": 4.1808, - "step": 1831 - }, - { - "epoch": 0.57, - "learning_rate": 0.00028125579361868645, - "loss": 4.1928, - "step": 1832 - }, - { - "epoch": 0.57, - "learning_rate": 0.00028122016455944923, - "loss": 4.3737, - "step": 1833 - }, - { - "epoch": 0.57, - "learning_rate": 0.00028118453550021196, - "loss": 3.8465, - "step": 1834 - }, - { - "epoch": 0.57, - "learning_rate": 0.00028114890644097474, - "loss": 4.0205, - "step": 1835 - }, - { - "epoch": 0.57, - "learning_rate": 0.0002811132773817375, - "loss": 4.0079, - "step": 1836 - }, - { - "epoch": 0.57, - "learning_rate": 0.0002810776483225003, - "loss": 3.8324, - "step": 1837 - 
}, - { - "epoch": 0.57, - "learning_rate": 0.0002810420192632631, - "loss": 4.1645, - "step": 1838 - }, - { - "epoch": 0.57, - "learning_rate": 0.0002810063902040258, - "loss": 4.1409, - "step": 1839 - }, - { - "epoch": 0.57, - "learning_rate": 0.0002809707611447886, - "loss": 4.0916, - "step": 1840 - }, - { - "epoch": 0.57, - "learning_rate": 0.0002809351320855514, - "loss": 3.8341, - "step": 1841 - }, - { - "epoch": 0.57, - "learning_rate": 0.00028089950302631416, - "loss": 3.6373, - "step": 1842 - }, - { - "epoch": 0.57, - "learning_rate": 0.00028086387396707694, - "loss": 3.7619, - "step": 1843 - }, - { - "epoch": 0.57, - "learning_rate": 0.00028082824490783967, - "loss": 3.3177, - "step": 1844 - }, - { - "epoch": 0.57, - "learning_rate": 0.00028079261584860245, - "loss": 3.3486, - "step": 1845 - }, - { - "epoch": 0.57, - "learning_rate": 0.00028075698678936523, - "loss": 3.4016, - "step": 1846 - }, - { - "epoch": 0.57, - "learning_rate": 0.000280721357730128, - "loss": 3.4434, - "step": 1847 - }, - { - "epoch": 0.57, - "learning_rate": 0.0002806857286708908, - "loss": 3.2736, - "step": 1848 - }, - { - "epoch": 0.57, - "learning_rate": 0.0002806500996116535, - "loss": 2.9675, - "step": 1849 - }, - { - "epoch": 0.57, - "learning_rate": 0.0002806144705524163, - "loss": 2.4712, - "step": 1850 - }, - { - "epoch": 0.57, - "learning_rate": 0.0002805788414931791, - "loss": 5.931, - "step": 1851 - }, - { - "epoch": 0.57, - "learning_rate": 0.00028054321243394187, - "loss": 5.1531, - "step": 1852 - }, - { - "epoch": 0.57, - "learning_rate": 0.0002805075833747046, - "loss": 5.1907, - "step": 1853 - }, - { - "epoch": 0.57, - "learning_rate": 0.0002804719543154674, - "loss": 5.3324, - "step": 1854 - }, - { - "epoch": 0.57, - "learning_rate": 0.00028043632525623016, - "loss": 5.3411, - "step": 1855 - }, - { - "epoch": 0.57, - "learning_rate": 0.00028040069619699294, - "loss": 4.6327, - "step": 1856 - }, - { - "epoch": 0.57, - "learning_rate": 0.0002803650671377557, - "loss": 5.0004, - "step": 1857 - }, - { - "epoch": 0.57, - "learning_rate": 0.00028032943807851845, - "loss": 4.5061, - "step": 1858 - }, - { - "epoch": 0.57, - "learning_rate": 0.0002802938090192813, - "loss": 4.753, - "step": 1859 - }, - { - "epoch": 0.57, - "learning_rate": 0.000280258179960044, - "loss": 4.6296, - "step": 1860 - }, - { - "epoch": 0.57, - "learning_rate": 0.0002802225509008068, - "loss": 4.8117, - "step": 1861 - }, - { - "epoch": 0.57, - "learning_rate": 0.0002801869218415696, - "loss": 4.6888, - "step": 1862 - }, - { - "epoch": 0.57, - "learning_rate": 0.0002801512927823323, - "loss": 4.8442, - "step": 1863 - }, - { - "epoch": 0.58, - "learning_rate": 0.00028011566372309514, - "loss": 4.6244, - "step": 1864 - }, - { - "epoch": 0.58, - "learning_rate": 0.00028008003466385787, - "loss": 4.5149, - "step": 1865 - }, - { - "epoch": 0.58, - "learning_rate": 0.00028004440560462065, - "loss": 4.4641, - "step": 1866 - }, - { - "epoch": 0.58, - "learning_rate": 0.00028000877654538344, - "loss": 4.4827, - "step": 1867 - }, - { - "epoch": 0.58, - "learning_rate": 0.00027997314748614616, - "loss": 4.465, - "step": 1868 - }, - { - "epoch": 0.58, - "learning_rate": 0.000279937518426909, - "loss": 4.6763, - "step": 1869 - }, - { - "epoch": 0.58, - "learning_rate": 0.00027990188936767173, - "loss": 4.2837, - "step": 1870 - }, - { - "epoch": 0.58, - "learning_rate": 0.0002798662603084345, - "loss": 4.3393, - "step": 1871 - }, - { - "epoch": 0.58, - "learning_rate": 0.0002798306312491973, - "loss": 4.1916, - "step": 1872 - }, - { - 
"epoch": 0.58, - "learning_rate": 0.00027979500218996, - "loss": 4.1749, - "step": 1873 - }, - { - "epoch": 0.58, - "learning_rate": 0.0002797593731307228, - "loss": 4.2497, - "step": 1874 - }, - { - "epoch": 0.58, - "learning_rate": 0.0002797237440714856, - "loss": 4.1868, - "step": 1875 - }, - { - "epoch": 0.58, - "learning_rate": 0.00027968811501224836, - "loss": 4.3625, - "step": 1876 - }, - { - "epoch": 0.58, - "learning_rate": 0.00027965248595301115, - "loss": 4.5243, - "step": 1877 - }, - { - "epoch": 0.58, - "learning_rate": 0.0002796168568937739, - "loss": 4.2954, - "step": 1878 - }, - { - "epoch": 0.58, - "learning_rate": 0.00027958122783453666, - "loss": 4.2173, - "step": 1879 - }, - { - "epoch": 0.58, - "learning_rate": 0.00027954559877529944, - "loss": 4.4197, - "step": 1880 - }, - { - "epoch": 0.58, - "learning_rate": 0.0002795099697160622, - "loss": 4.3507, - "step": 1881 - }, - { - "epoch": 0.58, - "learning_rate": 0.000279474340656825, - "loss": 4.2776, - "step": 1882 - }, - { - "epoch": 0.58, - "learning_rate": 0.0002794387115975878, - "loss": 3.9773, - "step": 1883 - }, - { - "epoch": 0.58, - "learning_rate": 0.0002794030825383505, - "loss": 4.2877, - "step": 1884 - }, - { - "epoch": 0.58, - "learning_rate": 0.0002793674534791133, - "loss": 4.0587, - "step": 1885 - }, - { - "epoch": 0.58, - "learning_rate": 0.0002793318244198761, - "loss": 4.091, - "step": 1886 - }, - { - "epoch": 0.58, - "learning_rate": 0.0002792961953606388, - "loss": 3.9697, - "step": 1887 - }, - { - "epoch": 0.58, - "learning_rate": 0.00027926056630140164, - "loss": 3.8383, - "step": 1888 - }, - { - "epoch": 0.58, - "learning_rate": 0.00027922493724216437, - "loss": 4.0129, - "step": 1889 - }, - { - "epoch": 0.58, - "learning_rate": 0.00027918930818292715, - "loss": 4.0093, - "step": 1890 - }, - { - "epoch": 0.58, - "learning_rate": 0.00027915367912368993, - "loss": 3.5314, - "step": 1891 - }, - { - "epoch": 0.58, - "learning_rate": 0.00027911805006445266, - "loss": 3.5054, - "step": 1892 - }, - { - "epoch": 0.58, - "learning_rate": 0.0002790824210052155, - "loss": 3.6333, - "step": 1893 - }, - { - "epoch": 0.58, - "learning_rate": 0.0002790467919459782, - "loss": 3.7856, - "step": 1894 - }, - { - "epoch": 0.58, - "learning_rate": 0.000279011162886741, - "loss": 3.656, - "step": 1895 - }, - { - "epoch": 0.59, - "learning_rate": 0.0002789755338275038, - "loss": 3.6999, - "step": 1896 - }, - { - "epoch": 0.59, - "learning_rate": 0.0002789399047682665, - "loss": 3.6861, - "step": 1897 - }, - { - "epoch": 0.59, - "learning_rate": 0.00027890427570902935, - "loss": 3.0799, - "step": 1898 - }, - { - "epoch": 0.59, - "learning_rate": 0.0002788686466497921, - "loss": 2.9319, - "step": 1899 - }, - { - "epoch": 0.59, - "learning_rate": 0.00027883301759055486, - "loss": 2.8406, - "step": 1900 - }, - { - "epoch": 0.59, - "learning_rate": 0.00027879738853131764, - "loss": 6.0849, - "step": 1901 - }, - { - "epoch": 0.59, - "learning_rate": 0.0002787617594720804, - "loss": 5.3402, - "step": 1902 - }, - { - "epoch": 0.59, - "learning_rate": 0.0002787261304128432, - "loss": 5.1385, - "step": 1903 - }, - { - "epoch": 0.59, - "learning_rate": 0.00027869050135360593, - "loss": 5.1564, - "step": 1904 - }, - { - "epoch": 0.59, - "learning_rate": 0.0002786548722943687, - "loss": 4.997, - "step": 1905 - }, - { - "epoch": 0.59, - "learning_rate": 0.0002786192432351315, - "loss": 4.7909, - "step": 1906 - }, - { - "epoch": 0.59, - "learning_rate": 0.0002785836141758943, - "loss": 4.7124, - "step": 1907 - }, - { - "epoch": 
0.59, - "learning_rate": 0.000278547985116657, - "loss": 4.9631, - "step": 1908 - }, - { - "epoch": 0.59, - "learning_rate": 0.0002785123560574198, - "loss": 4.9777, - "step": 1909 - }, - { - "epoch": 0.59, - "learning_rate": 0.00027847672699818257, - "loss": 4.6659, - "step": 1910 - }, - { - "epoch": 0.59, - "learning_rate": 0.00027844109793894535, - "loss": 4.7956, - "step": 1911 - }, - { - "epoch": 0.59, - "learning_rate": 0.00027840546887970813, - "loss": 4.7263, - "step": 1912 - }, - { - "epoch": 0.59, - "learning_rate": 0.00027836983982047086, - "loss": 4.4392, - "step": 1913 - }, - { - "epoch": 0.59, - "learning_rate": 0.00027833421076123364, - "loss": 4.6713, - "step": 1914 - }, - { - "epoch": 0.59, - "learning_rate": 0.0002782985817019964, - "loss": 4.5464, - "step": 1915 - }, - { - "epoch": 0.59, - "learning_rate": 0.0002782629526427592, - "loss": 4.6236, - "step": 1916 - }, - { - "epoch": 0.59, - "learning_rate": 0.000278227323583522, - "loss": 4.6247, - "step": 1917 - }, - { - "epoch": 0.59, - "learning_rate": 0.0002781916945242847, - "loss": 4.5863, - "step": 1918 - }, - { - "epoch": 0.59, - "learning_rate": 0.0002781560654650475, - "loss": 4.3881, - "step": 1919 - }, - { - "epoch": 0.59, - "learning_rate": 0.0002781204364058103, - "loss": 4.2146, - "step": 1920 - }, - { - "epoch": 0.59, - "learning_rate": 0.000278084807346573, - "loss": 4.3224, - "step": 1921 - }, - { - "epoch": 0.59, - "learning_rate": 0.00027804917828733584, - "loss": 4.2245, - "step": 1922 - }, - { - "epoch": 0.59, - "learning_rate": 0.00027801354922809857, - "loss": 4.3457, - "step": 1923 - }, - { - "epoch": 0.59, - "learning_rate": 0.00027797792016886135, - "loss": 4.0761, - "step": 1924 - }, - { - "epoch": 0.59, - "learning_rate": 0.00027794229110962413, - "loss": 4.5551, - "step": 1925 - }, - { - "epoch": 0.59, - "learning_rate": 0.0002779066620503869, - "loss": 4.2646, - "step": 1926 - }, - { - "epoch": 0.59, - "learning_rate": 0.0002778710329911497, - "loss": 4.3472, - "step": 1927 - }, - { - "epoch": 0.6, - "learning_rate": 0.0002778354039319124, - "loss": 4.4012, - "step": 1928 - }, - { - "epoch": 0.6, - "learning_rate": 0.0002777997748726752, - "loss": 4.2185, - "step": 1929 - }, - { - "epoch": 0.6, - "learning_rate": 0.000277764145813438, - "loss": 4.0906, - "step": 1930 - }, - { - "epoch": 0.6, - "learning_rate": 0.00027772851675420077, - "loss": 4.1106, - "step": 1931 - }, - { - "epoch": 0.6, - "learning_rate": 0.00027769288769496355, - "loss": 4.5432, - "step": 1932 - }, - { - "epoch": 0.6, - "learning_rate": 0.0002776572586357263, - "loss": 4.1548, - "step": 1933 - }, - { - "epoch": 0.6, - "learning_rate": 0.00027762162957648906, - "loss": 3.9916, - "step": 1934 - }, - { - "epoch": 0.6, - "learning_rate": 0.00027758600051725184, - "loss": 4.2435, - "step": 1935 - }, - { - "epoch": 0.6, - "learning_rate": 0.0002775503714580146, - "loss": 3.9455, - "step": 1936 - }, - { - "epoch": 0.6, - "learning_rate": 0.0002775147423987774, - "loss": 3.9697, - "step": 1937 - }, - { - "epoch": 0.6, - "learning_rate": 0.00027747911333954014, - "loss": 4.0098, - "step": 1938 - }, - { - "epoch": 0.6, - "learning_rate": 0.0002774434842803029, - "loss": 3.97, - "step": 1939 - }, - { - "epoch": 0.6, - "learning_rate": 0.0002774078552210657, - "loss": 3.9678, - "step": 1940 - }, - { - "epoch": 0.6, - "learning_rate": 0.0002773722261618285, - "loss": 3.6627, - "step": 1941 - }, - { - "epoch": 0.6, - "learning_rate": 0.0002773365971025912, - "loss": 3.7294, - "step": 1942 - }, - { - "epoch": 0.6, - "learning_rate": 
0.000277300968043354, - "loss": 3.4384, - "step": 1943 - }, - { - "epoch": 0.6, - "learning_rate": 0.0002772653389841168, - "loss": 3.3939, - "step": 1944 - }, - { - "epoch": 0.6, - "learning_rate": 0.00027722970992487956, - "loss": 3.3806, - "step": 1945 - }, - { - "epoch": 0.6, - "learning_rate": 0.00027719408086564234, - "loss": 3.3158, - "step": 1946 - }, - { - "epoch": 0.6, - "learning_rate": 0.00027715845180640506, - "loss": 3.6413, - "step": 1947 - }, - { - "epoch": 0.6, - "learning_rate": 0.00027712282274716785, - "loss": 3.1799, - "step": 1948 - }, - { - "epoch": 0.6, - "learning_rate": 0.00027708719368793063, - "loss": 2.8909, - "step": 1949 - }, - { - "epoch": 0.6, - "learning_rate": 0.0002770515646286934, - "loss": 2.7417, - "step": 1950 - }, - { - "epoch": 0.6, - "learning_rate": 0.0002770159355694562, - "loss": 5.8252, - "step": 1951 - }, - { - "epoch": 0.6, - "learning_rate": 0.0002769803065102189, - "loss": 5.8696, - "step": 1952 - }, - { - "epoch": 0.6, - "learning_rate": 0.00027694467745098176, - "loss": 5.3137, - "step": 1953 - }, - { - "epoch": 0.6, - "learning_rate": 0.0002769090483917445, - "loss": 5.3433, - "step": 1954 - }, - { - "epoch": 0.6, - "learning_rate": 0.00027687341933250727, - "loss": 5.0567, - "step": 1955 - }, - { - "epoch": 0.6, - "learning_rate": 0.00027683779027327005, - "loss": 4.6707, - "step": 1956 - }, - { - "epoch": 0.6, - "learning_rate": 0.0002768021612140328, - "loss": 4.7928, - "step": 1957 - }, - { - "epoch": 0.6, - "learning_rate": 0.0002767665321547956, - "loss": 4.685, - "step": 1958 - }, - { - "epoch": 0.6, - "learning_rate": 0.00027673090309555834, - "loss": 4.8176, - "step": 1959 - }, - { - "epoch": 0.6, - "learning_rate": 0.0002766952740363211, - "loss": 4.766, - "step": 1960 - }, - { - "epoch": 0.61, - "learning_rate": 0.0002766596449770839, - "loss": 4.9201, - "step": 1961 - }, - { - "epoch": 0.61, - "learning_rate": 0.00027662401591784663, - "loss": 4.605, - "step": 1962 - }, - { - "epoch": 0.61, - "learning_rate": 0.00027658838685860947, - "loss": 4.8998, - "step": 1963 - }, - { - "epoch": 0.61, - "learning_rate": 0.0002765527577993722, - "loss": 4.6064, - "step": 1964 - }, - { - "epoch": 0.61, - "learning_rate": 0.000276517128740135, - "loss": 4.2127, - "step": 1965 - }, - { - "epoch": 0.61, - "learning_rate": 0.00027648149968089776, - "loss": 4.6782, - "step": 1966 - }, - { - "epoch": 0.61, - "learning_rate": 0.0002764458706216605, - "loss": 4.3776, - "step": 1967 - }, - { - "epoch": 0.61, - "learning_rate": 0.00027641024156242327, - "loss": 4.5418, - "step": 1968 - }, - { - "epoch": 0.61, - "learning_rate": 0.00027637461250318605, - "loss": 4.4247, - "step": 1969 - }, - { - "epoch": 0.61, - "learning_rate": 0.00027633898344394883, - "loss": 4.3267, - "step": 1970 - }, - { - "epoch": 0.61, - "learning_rate": 0.0002763033543847116, - "loss": 4.3468, - "step": 1971 - }, - { - "epoch": 0.61, - "learning_rate": 0.00027626772532547434, - "loss": 4.1767, - "step": 1972 - }, - { - "epoch": 0.61, - "learning_rate": 0.0002762320962662371, - "loss": 4.4511, - "step": 1973 - }, - { - "epoch": 0.61, - "learning_rate": 0.0002761964672069999, - "loss": 4.2063, - "step": 1974 - }, - { - "epoch": 0.61, - "learning_rate": 0.0002761608381477627, - "loss": 4.3169, - "step": 1975 - }, - { - "epoch": 0.61, - "learning_rate": 0.0002761252090885254, - "loss": 4.2372, - "step": 1976 - }, - { - "epoch": 0.61, - "learning_rate": 0.00027608958002928825, - "loss": 4.6001, - "step": 1977 - }, - { - "epoch": 0.61, - "learning_rate": 0.000276053950970051, - 
"loss": 4.5791, - "step": 1978 - }, - { - "epoch": 0.61, - "learning_rate": 0.00027601832191081376, - "loss": 4.2773, - "step": 1979 - }, - { - "epoch": 0.61, - "learning_rate": 0.00027598269285157654, - "loss": 4.4102, - "step": 1980 - }, - { - "epoch": 0.61, - "learning_rate": 0.00027594706379233927, - "loss": 4.3947, - "step": 1981 - }, - { - "epoch": 0.61, - "learning_rate": 0.0002759114347331021, - "loss": 4.3096, - "step": 1982 - }, - { - "epoch": 0.61, - "learning_rate": 0.00027587580567386483, - "loss": 4.0216, - "step": 1983 - }, - { - "epoch": 0.61, - "learning_rate": 0.0002758401766146276, - "loss": 4.1195, - "step": 1984 - }, - { - "epoch": 0.61, - "learning_rate": 0.0002758045475553904, - "loss": 3.9684, - "step": 1985 - }, - { - "epoch": 0.61, - "learning_rate": 0.0002757689184961531, - "loss": 3.7992, - "step": 1986 - }, - { - "epoch": 0.61, - "learning_rate": 0.00027573328943691596, - "loss": 3.9759, - "step": 1987 - }, - { - "epoch": 0.61, - "learning_rate": 0.0002756976603776787, - "loss": 4.179, - "step": 1988 - }, - { - "epoch": 0.61, - "learning_rate": 0.00027566203131844147, - "loss": 4.1401, - "step": 1989 - }, - { - "epoch": 0.61, - "learning_rate": 0.00027562640225920425, - "loss": 3.7779, - "step": 1990 - }, - { - "epoch": 0.61, - "learning_rate": 0.000275590773199967, - "loss": 3.8421, - "step": 1991 - }, - { - "epoch": 0.61, - "learning_rate": 0.0002755551441407298, - "loss": 3.5797, - "step": 1992 - }, - { - "epoch": 0.62, - "learning_rate": 0.00027551951508149254, - "loss": 3.5298, - "step": 1993 - }, - { - "epoch": 0.62, - "learning_rate": 0.0002754838860222553, - "loss": 3.4437, - "step": 1994 - }, - { - "epoch": 0.62, - "learning_rate": 0.0002754482569630181, - "loss": 3.303, - "step": 1995 - }, - { - "epoch": 0.62, - "learning_rate": 0.0002754126279037809, - "loss": 3.3195, - "step": 1996 - }, - { - "epoch": 0.62, - "learning_rate": 0.00027537699884454367, - "loss": 3.074, - "step": 1997 - }, - { - "epoch": 0.62, - "learning_rate": 0.0002753413697853064, - "loss": 3.0603, - "step": 1998 - }, - { - "epoch": 0.62, - "learning_rate": 0.0002753057407260692, - "loss": 3.0066, - "step": 1999 - }, - { - "epoch": 0.62, - "learning_rate": 0.00027527011166683196, - "loss": 2.9617, - "step": 2000 - }, - { - "epoch": 0.62, - "eval_bleu": 0.0, - "eval_loss": 4.966036796569824, - "eval_runtime": 2574.9105, - "eval_samples_per_second": 5.732, - "eval_steps_per_second": 0.717, - "step": 2000 - }, - { - "epoch": 0.62, - "learning_rate": 0.00027523448260759474, - "loss": 6.1769, - "step": 2001 - }, - { - "epoch": 0.62, - "learning_rate": 0.00027519885354835747, - "loss": 5.6556, - "step": 2002 - }, - { - "epoch": 0.62, - "learning_rate": 0.00027516322448912025, - "loss": 5.3767, - "step": 2003 - }, - { - "epoch": 0.62, - "learning_rate": 0.00027512759542988304, - "loss": 4.9736, - "step": 2004 - }, - { - "epoch": 0.62, - "learning_rate": 0.0002750919663706458, - "loss": 4.7481, - "step": 2005 - }, - { - "epoch": 0.62, - "learning_rate": 0.0002750563373114086, - "loss": 4.8907, - "step": 2006 - }, - { - "epoch": 0.62, - "learning_rate": 0.0002750207082521713, - "loss": 4.5035, - "step": 2007 - }, - { - "epoch": 0.62, - "learning_rate": 0.0002749850791929341, - "loss": 4.7699, - "step": 2008 - }, - { - "epoch": 0.62, - "learning_rate": 0.0002749494501336969, - "loss": 4.631, - "step": 2009 - }, - { - "epoch": 0.62, - "learning_rate": 0.00027491382107445967, - "loss": 4.6193, - "step": 2010 - }, - { - "epoch": 0.62, - "learning_rate": 0.00027487819201522245, - "loss": 4.6941, 
- "step": 2011 - }, - { - "epoch": 0.62, - "learning_rate": 0.0002748425629559852, - "loss": 4.3448, - "step": 2012 - }, - { - "epoch": 0.62, - "learning_rate": 0.00027480693389674796, - "loss": 4.6623, - "step": 2013 - }, - { - "epoch": 0.62, - "learning_rate": 0.00027477130483751075, - "loss": 4.5536, - "step": 2014 - }, - { - "epoch": 0.62, - "learning_rate": 0.0002747356757782735, - "loss": 4.6681, - "step": 2015 - }, - { - "epoch": 0.62, - "learning_rate": 0.0002747000467190363, - "loss": 4.418, - "step": 2016 - }, - { - "epoch": 0.62, - "learning_rate": 0.00027466441765979904, - "loss": 4.3259, - "step": 2017 - }, - { - "epoch": 0.62, - "learning_rate": 0.0002746287886005618, - "loss": 4.3832, - "step": 2018 - }, - { - "epoch": 0.62, - "learning_rate": 0.0002745931595413246, - "loss": 4.3156, - "step": 2019 - }, - { - "epoch": 0.62, - "learning_rate": 0.0002745575304820874, - "loss": 4.4399, - "step": 2020 - }, - { - "epoch": 0.62, - "learning_rate": 0.00027452190142285016, - "loss": 4.6134, - "step": 2021 - }, - { - "epoch": 0.62, - "learning_rate": 0.0002744862723636129, - "loss": 4.3347, - "step": 2022 - }, - { - "epoch": 0.62, - "learning_rate": 0.0002744506433043757, - "loss": 4.5988, - "step": 2023 - }, - { - "epoch": 0.62, - "learning_rate": 0.00027441501424513846, - "loss": 4.4484, - "step": 2024 - }, - { - "epoch": 0.62, - "learning_rate": 0.00027437938518590124, - "loss": 4.3457, - "step": 2025 - }, - { - "epoch": 0.63, - "learning_rate": 0.000274343756126664, - "loss": 4.0325, - "step": 2026 - }, - { - "epoch": 0.63, - "learning_rate": 0.00027430812706742675, - "loss": 4.015, - "step": 2027 - }, - { - "epoch": 0.63, - "learning_rate": 0.00027427249800818953, - "loss": 4.2661, - "step": 2028 - }, - { - "epoch": 0.63, - "learning_rate": 0.0002742368689489523, - "loss": 4.2043, - "step": 2029 - }, - { - "epoch": 0.63, - "learning_rate": 0.0002742012398897151, - "loss": 4.1988, - "step": 2030 - }, - { - "epoch": 0.63, - "learning_rate": 0.0002741656108304779, - "loss": 4.0476, - "step": 2031 - }, - { - "epoch": 0.63, - "learning_rate": 0.0002741299817712406, - "loss": 4.0984, - "step": 2032 - }, - { - "epoch": 0.63, - "learning_rate": 0.0002740943527120034, - "loss": 4.0974, - "step": 2033 - }, - { - "epoch": 0.63, - "learning_rate": 0.00027405872365276617, - "loss": 4.0471, - "step": 2034 - }, - { - "epoch": 0.63, - "learning_rate": 0.00027402309459352895, - "loss": 3.9909, - "step": 2035 - }, - { - "epoch": 0.63, - "learning_rate": 0.0002739874655342917, - "loss": 4.0386, - "step": 2036 - }, - { - "epoch": 0.63, - "learning_rate": 0.00027395183647505446, - "loss": 3.9185, - "step": 2037 - }, - { - "epoch": 0.63, - "learning_rate": 0.00027391620741581724, - "loss": 3.7998, - "step": 2038 - }, - { - "epoch": 0.63, - "learning_rate": 0.00027388057835658, - "loss": 3.9185, - "step": 2039 - }, - { - "epoch": 0.63, - "learning_rate": 0.0002738449492973428, - "loss": 3.7441, - "step": 2040 - }, - { - "epoch": 0.63, - "learning_rate": 0.00027380932023810553, - "loss": 3.5783, - "step": 2041 - }, - { - "epoch": 0.63, - "learning_rate": 0.0002737736911788683, - "loss": 3.6455, - "step": 2042 - }, - { - "epoch": 0.63, - "learning_rate": 0.0002737380621196311, - "loss": 3.6768, - "step": 2043 - }, - { - "epoch": 0.63, - "learning_rate": 0.0002737024330603939, - "loss": 3.5912, - "step": 2044 - }, - { - "epoch": 0.63, - "learning_rate": 0.00027366680400115666, - "loss": 3.7213, - "step": 2045 - }, - { - "epoch": 0.63, - "learning_rate": 0.0002736311749419194, - "loss": 3.1424, - "step": 
2046 - }, - { - "epoch": 0.63, - "learning_rate": 0.0002735955458826822, - "loss": 3.5562, - "step": 2047 - }, - { - "epoch": 0.63, - "learning_rate": 0.00027355991682344495, - "loss": 3.27, - "step": 2048 - }, - { - "epoch": 0.63, - "learning_rate": 0.00027352428776420773, - "loss": 2.9097, - "step": 2049 - }, - { - "epoch": 0.63, - "learning_rate": 0.0002734886587049705, - "loss": 2.9704, - "step": 2050 - }, - { - "epoch": 0.63, - "learning_rate": 0.00027345302964573324, - "loss": 5.5976, - "step": 2051 - }, - { - "epoch": 0.63, - "learning_rate": 0.0002734174005864961, - "loss": 5.1167, - "step": 2052 - }, - { - "epoch": 0.63, - "learning_rate": 0.0002733817715272588, - "loss": 5.0992, - "step": 2053 - }, - { - "epoch": 0.63, - "learning_rate": 0.0002733461424680216, - "loss": 4.8379, - "step": 2054 - }, - { - "epoch": 0.63, - "learning_rate": 0.00027331051340878437, - "loss": 4.8975, - "step": 2055 - }, - { - "epoch": 0.63, - "learning_rate": 0.0002732748843495471, - "loss": 4.4934, - "step": 2056 - }, - { - "epoch": 0.63, - "learning_rate": 0.0002732392552903099, - "loss": 4.6695, - "step": 2057 - }, - { - "epoch": 0.64, - "learning_rate": 0.00027320362623107266, - "loss": 4.893, - "step": 2058 - }, - { - "epoch": 0.64, - "learning_rate": 0.00027316799717183544, - "loss": 4.4798, - "step": 2059 - }, - { - "epoch": 0.64, - "learning_rate": 0.0002731323681125982, - "loss": 4.5974, - "step": 2060 - }, - { - "epoch": 0.64, - "learning_rate": 0.00027309673905336095, - "loss": 4.4924, - "step": 2061 - }, - { - "epoch": 0.64, - "learning_rate": 0.00027306110999412373, - "loss": 4.3781, - "step": 2062 - }, - { - "epoch": 0.64, - "learning_rate": 0.0002730254809348865, - "loss": 4.2143, - "step": 2063 - }, - { - "epoch": 0.64, - "learning_rate": 0.0002729898518756493, - "loss": 4.5611, - "step": 2064 - }, - { - "epoch": 0.64, - "learning_rate": 0.0002729542228164121, - "loss": 4.8, - "step": 2065 - }, - { - "epoch": 0.64, - "learning_rate": 0.0002729185937571748, - "loss": 4.4736, - "step": 2066 - }, - { - "epoch": 0.64, - "learning_rate": 0.0002728829646979376, - "loss": 4.7248, - "step": 2067 - }, - { - "epoch": 0.64, - "learning_rate": 0.00027284733563870037, - "loss": 4.694, - "step": 2068 - }, - { - "epoch": 0.64, - "learning_rate": 0.00027281170657946315, - "loss": 4.6907, - "step": 2069 - }, - { - "epoch": 0.64, - "learning_rate": 0.0002727760775202259, - "loss": 4.3891, - "step": 2070 - }, - { - "epoch": 0.64, - "learning_rate": 0.0002727404484609887, - "loss": 4.5203, - "step": 2071 - }, - { - "epoch": 0.64, - "learning_rate": 0.00027270481940175144, - "loss": 4.2891, - "step": 2072 - }, - { - "epoch": 0.64, - "learning_rate": 0.0002726691903425142, - "loss": 4.407, - "step": 2073 - }, - { - "epoch": 0.64, - "learning_rate": 0.000272633561283277, - "loss": 4.1381, - "step": 2074 - }, - { - "epoch": 0.64, - "learning_rate": 0.00027259793222403974, - "loss": 4.4153, - "step": 2075 - }, - { - "epoch": 0.64, - "learning_rate": 0.00027256230316480257, - "loss": 4.4035, - "step": 2076 - }, - { - "epoch": 0.64, - "learning_rate": 0.0002725266741055653, - "loss": 4.3938, - "step": 2077 - }, - { - "epoch": 0.64, - "learning_rate": 0.0002724910450463281, - "loss": 4.2848, - "step": 2078 - }, - { - "epoch": 0.64, - "learning_rate": 0.00027245541598709086, - "loss": 4.5552, - "step": 2079 - }, - { - "epoch": 0.64, - "learning_rate": 0.0002724197869278536, - "loss": 4.2738, - "step": 2080 - }, - { - "epoch": 0.64, - "learning_rate": 0.0002723841578686164, - "loss": 4.3799, - "step": 2081 - }, - { 
- "epoch": 0.64, - "learning_rate": 0.00027234852880937915, - "loss": 4.2782, - "step": 2082 - }, - { - "epoch": 0.64, - "learning_rate": 0.00027231289975014194, - "loss": 4.0154, - "step": 2083 - }, - { - "epoch": 0.64, - "learning_rate": 0.0002722772706909047, - "loss": 4.1351, - "step": 2084 - }, - { - "epoch": 0.64, - "learning_rate": 0.00027224164163166745, - "loss": 4.1901, - "step": 2085 - }, - { - "epoch": 0.64, - "learning_rate": 0.0002722060125724303, - "loss": 4.1203, - "step": 2086 - }, - { - "epoch": 0.64, - "learning_rate": 0.000272170383513193, - "loss": 3.8366, - "step": 2087 - }, - { - "epoch": 0.64, - "learning_rate": 0.0002721347544539558, - "loss": 3.8378, - "step": 2088 - }, - { - "epoch": 0.64, - "learning_rate": 0.0002720991253947186, - "loss": 4.0113, - "step": 2089 - }, - { - "epoch": 0.65, - "learning_rate": 0.00027206349633548136, - "loss": 4.1015, - "step": 2090 - }, - { - "epoch": 0.65, - "learning_rate": 0.0002720278672762441, - "loss": 3.9593, - "step": 2091 - }, - { - "epoch": 0.65, - "learning_rate": 0.00027199223821700686, - "loss": 3.5824, - "step": 2092 - }, - { - "epoch": 0.65, - "learning_rate": 0.00027195660915776965, - "loss": 3.7761, - "step": 2093 - }, - { - "epoch": 0.65, - "learning_rate": 0.00027192098009853243, - "loss": 3.7241, - "step": 2094 - }, - { - "epoch": 0.65, - "learning_rate": 0.0002718853510392952, - "loss": 3.4441, - "step": 2095 - }, - { - "epoch": 0.65, - "learning_rate": 0.00027184972198005794, - "loss": 3.165, - "step": 2096 - }, - { - "epoch": 0.65, - "learning_rate": 0.0002718140929208207, - "loss": 3.0505, - "step": 2097 - }, - { - "epoch": 0.65, - "learning_rate": 0.0002717784638615835, - "loss": 3.194, - "step": 2098 - }, - { - "epoch": 0.65, - "learning_rate": 0.0002717428348023463, - "loss": 2.8755, - "step": 2099 - }, - { - "epoch": 0.65, - "learning_rate": 0.00027170720574310907, - "loss": 2.7077, - "step": 2100 - }, - { - "epoch": 0.65, - "learning_rate": 0.0002716715766838718, - "loss": 5.6802, - "step": 2101 - }, - { - "epoch": 0.65, - "learning_rate": 0.0002716359476246346, - "loss": 5.4096, - "step": 2102 - }, - { - "epoch": 0.65, - "learning_rate": 0.00027160031856539736, - "loss": 5.338, - "step": 2103 - }, - { - "epoch": 0.65, - "learning_rate": 0.0002715646895061601, - "loss": 5.325, - "step": 2104 - }, - { - "epoch": 0.65, - "learning_rate": 0.0002715290604469229, - "loss": 5.099, - "step": 2105 - }, - { - "epoch": 0.65, - "learning_rate": 0.00027149343138768565, - "loss": 4.8985, - "step": 2106 - }, - { - "epoch": 0.65, - "learning_rate": 0.00027145780232844843, - "loss": 4.9076, - "step": 2107 - }, - { - "epoch": 0.65, - "learning_rate": 0.0002714221732692112, - "loss": 4.5141, - "step": 2108 - }, - { - "epoch": 0.65, - "learning_rate": 0.00027138654420997394, - "loss": 4.7797, - "step": 2109 - }, - { - "epoch": 0.65, - "learning_rate": 0.0002713509151507368, - "loss": 4.4077, - "step": 2110 - }, - { - "epoch": 0.65, - "learning_rate": 0.0002713152860914995, - "loss": 4.6801, - "step": 2111 - }, - { - "epoch": 0.65, - "learning_rate": 0.0002712796570322623, - "loss": 4.7318, - "step": 2112 - }, - { - "epoch": 0.65, - "learning_rate": 0.00027124402797302507, - "loss": 4.5695, - "step": 2113 - }, - { - "epoch": 0.65, - "learning_rate": 0.00027120839891378785, - "loss": 4.4002, - "step": 2114 - }, - { - "epoch": 0.65, - "learning_rate": 0.00027117276985455063, - "loss": 4.5413, - "step": 2115 - }, - { - "epoch": 0.65, - "learning_rate": 0.00027113714079531336, - "loss": 4.4605, - "step": 2116 - }, - { - 
"epoch": 0.65, - "learning_rate": 0.00027110151173607614, - "loss": 4.5996, - "step": 2117 - }, - { - "epoch": 0.65, - "learning_rate": 0.0002710658826768389, - "loss": 4.4574, - "step": 2118 - }, - { - "epoch": 0.65, - "learning_rate": 0.0002710302536176017, - "loss": 4.5211, - "step": 2119 - }, - { - "epoch": 0.65, - "learning_rate": 0.0002709946245583645, - "loss": 4.6674, - "step": 2120 - }, - { - "epoch": 0.65, - "learning_rate": 0.0002709589954991272, - "loss": 4.5588, - "step": 2121 - }, - { - "epoch": 0.65, - "learning_rate": 0.00027092336643989, - "loss": 4.7214, - "step": 2122 - }, - { - "epoch": 0.66, - "learning_rate": 0.0002708877373806528, - "loss": 4.336, - "step": 2123 - }, - { - "epoch": 0.66, - "learning_rate": 0.00027085210832141556, - "loss": 4.4832, - "step": 2124 - }, - { - "epoch": 0.66, - "learning_rate": 0.00027081647926217834, - "loss": 4.2008, - "step": 2125 - }, - { - "epoch": 0.66, - "learning_rate": 0.00027078085020294107, - "loss": 4.2849, - "step": 2126 - }, - { - "epoch": 0.66, - "learning_rate": 0.00027074522114370385, - "loss": 4.3505, - "step": 2127 - }, - { - "epoch": 0.66, - "learning_rate": 0.00027070959208446663, - "loss": 4.2519, - "step": 2128 - }, - { - "epoch": 0.66, - "learning_rate": 0.0002706739630252294, - "loss": 4.0948, - "step": 2129 - }, - { - "epoch": 0.66, - "learning_rate": 0.00027063833396599214, - "loss": 4.1339, - "step": 2130 - }, - { - "epoch": 0.66, - "learning_rate": 0.0002706027049067549, - "loss": 4.3383, - "step": 2131 - }, - { - "epoch": 0.66, - "learning_rate": 0.0002705670758475177, - "loss": 4.2567, - "step": 2132 - }, - { - "epoch": 0.66, - "learning_rate": 0.0002705314467882805, - "loss": 4.3303, - "step": 2133 - }, - { - "epoch": 0.66, - "learning_rate": 0.00027049581772904327, - "loss": 4.1213, - "step": 2134 - }, - { - "epoch": 0.66, - "learning_rate": 0.000270460188669806, - "loss": 3.9718, - "step": 2135 - }, - { - "epoch": 0.66, - "learning_rate": 0.0002704245596105688, - "loss": 4.0548, - "step": 2136 - }, - { - "epoch": 0.66, - "learning_rate": 0.00027038893055133156, - "loss": 4.0638, - "step": 2137 - }, - { - "epoch": 0.66, - "learning_rate": 0.00027035330149209434, - "loss": 3.8733, - "step": 2138 - }, - { - "epoch": 0.66, - "learning_rate": 0.0002703176724328571, - "loss": 3.7628, - "step": 2139 - }, - { - "epoch": 0.66, - "learning_rate": 0.00027028204337361985, - "loss": 3.8237, - "step": 2140 - }, - { - "epoch": 0.66, - "learning_rate": 0.0002702464143143827, - "loss": 3.6339, - "step": 2141 - }, - { - "epoch": 0.66, - "learning_rate": 0.0002702107852551454, - "loss": 3.7266, - "step": 2142 - }, - { - "epoch": 0.66, - "learning_rate": 0.0002701751561959082, - "loss": 3.5038, - "step": 2143 - }, - { - "epoch": 0.66, - "learning_rate": 0.000270139527136671, - "loss": 3.8761, - "step": 2144 - }, - { - "epoch": 0.66, - "learning_rate": 0.0002701038980774337, - "loss": 3.3854, - "step": 2145 - }, - { - "epoch": 0.66, - "learning_rate": 0.00027006826901819654, - "loss": 2.971, - "step": 2146 - }, - { - "epoch": 0.66, - "learning_rate": 0.00027003263995895927, - "loss": 3.2835, - "step": 2147 - }, - { - "epoch": 0.66, - "learning_rate": 0.00026999701089972205, - "loss": 3.215, - "step": 2148 - }, - { - "epoch": 0.66, - "learning_rate": 0.00026996138184048484, - "loss": 2.8101, - "step": 2149 - }, - { - "epoch": 0.66, - "learning_rate": 0.00026992575278124756, - "loss": 2.6127, - "step": 2150 - }, - { - "epoch": 0.66, - "learning_rate": 0.00026989012372201034, - "loss": 5.4214, - "step": 2151 - }, - { - "epoch": 
0.66, - "learning_rate": 0.0002698544946627731, - "loss": 5.1861, - "step": 2152 - }, - { - "epoch": 0.66, - "learning_rate": 0.0002698188656035359, - "loss": 5.1411, - "step": 2153 - }, - { - "epoch": 0.66, - "learning_rate": 0.0002697832365442987, - "loss": 4.9494, - "step": 2154 - }, - { - "epoch": 0.67, - "learning_rate": 0.0002697476074850614, - "loss": 4.8036, - "step": 2155 - }, - { - "epoch": 0.67, - "learning_rate": 0.0002697119784258242, - "loss": 4.7427, - "step": 2156 - }, - { - "epoch": 0.67, - "learning_rate": 0.000269676349366587, - "loss": 5.1107, - "step": 2157 - }, - { - "epoch": 0.67, - "learning_rate": 0.00026964072030734976, - "loss": 5.0204, - "step": 2158 - }, - { - "epoch": 0.67, - "learning_rate": 0.00026960509124811255, - "loss": 4.4267, - "step": 2159 - }, - { - "epoch": 0.67, - "learning_rate": 0.00026956946218887533, - "loss": 4.8836, - "step": 2160 - }, - { - "epoch": 0.67, - "learning_rate": 0.00026953383312963806, - "loss": 4.404, - "step": 2161 - }, - { - "epoch": 0.67, - "learning_rate": 0.00026949820407040084, - "loss": 4.6778, - "step": 2162 - }, - { - "epoch": 0.67, - "learning_rate": 0.0002694625750111636, - "loss": 4.4312, - "step": 2163 - }, - { - "epoch": 0.67, - "learning_rate": 0.00026942694595192635, - "loss": 4.7257, - "step": 2164 - }, - { - "epoch": 0.67, - "learning_rate": 0.0002693913168926892, - "loss": 4.5319, - "step": 2165 - }, - { - "epoch": 0.67, - "learning_rate": 0.0002693556878334519, - "loss": 4.5441, - "step": 2166 - }, - { - "epoch": 0.67, - "learning_rate": 0.0002693200587742147, - "loss": 4.4688, - "step": 2167 - }, - { - "epoch": 0.67, - "learning_rate": 0.0002692844297149775, - "loss": 4.2892, - "step": 2168 - }, - { - "epoch": 0.67, - "learning_rate": 0.0002692488006557402, - "loss": 4.3047, - "step": 2169 - }, - { - "epoch": 0.67, - "learning_rate": 0.00026921317159650304, - "loss": 4.7218, - "step": 2170 - }, - { - "epoch": 0.67, - "learning_rate": 0.00026917754253726577, - "loss": 4.3161, - "step": 2171 - }, - { - "epoch": 0.67, - "learning_rate": 0.00026914191347802855, - "loss": 4.4661, - "step": 2172 - }, - { - "epoch": 0.67, - "learning_rate": 0.00026910628441879133, - "loss": 4.4211, - "step": 2173 - }, - { - "epoch": 0.67, - "learning_rate": 0.00026907065535955406, - "loss": 4.5731, - "step": 2174 - }, - { - "epoch": 0.67, - "learning_rate": 0.0002690350263003169, - "loss": 4.2675, - "step": 2175 - }, - { - "epoch": 0.67, - "learning_rate": 0.0002689993972410796, - "loss": 4.3119, - "step": 2176 - }, - { - "epoch": 0.67, - "learning_rate": 0.0002689637681818424, - "loss": 4.4024, - "step": 2177 - }, - { - "epoch": 0.67, - "learning_rate": 0.0002689281391226052, - "loss": 4.2476, - "step": 2178 - }, - { - "epoch": 0.67, - "learning_rate": 0.0002688925100633679, - "loss": 4.1693, - "step": 2179 - }, - { - "epoch": 0.67, - "learning_rate": 0.00026885688100413075, - "loss": 4.1901, - "step": 2180 - }, - { - "epoch": 0.67, - "learning_rate": 0.0002688212519448935, - "loss": 4.2276, - "step": 2181 - }, - { - "epoch": 0.67, - "learning_rate": 0.00026878562288565626, - "loss": 3.9802, - "step": 2182 - }, - { - "epoch": 0.67, - "learning_rate": 0.00026874999382641904, - "loss": 3.8448, - "step": 2183 - }, - { - "epoch": 0.67, - "learning_rate": 0.0002687143647671818, - "loss": 3.92, - "step": 2184 - }, - { - "epoch": 0.67, - "learning_rate": 0.00026867873570794455, - "loss": 4.1696, - "step": 2185 - }, - { - "epoch": 0.67, - "learning_rate": 0.00026864310664870733, - "loss": 4.2616, - "step": 2186 - }, - { - "epoch": 0.67, - 
"learning_rate": 0.0002686074775894701, - "loss": 4.0214, - "step": 2187 - }, - { - "epoch": 0.68, - "learning_rate": 0.0002685718485302329, - "loss": 4.1242, - "step": 2188 - }, - { - "epoch": 0.68, - "learning_rate": 0.0002685362194709957, - "loss": 3.8292, - "step": 2189 - }, - { - "epoch": 0.68, - "learning_rate": 0.0002685005904117584, - "loss": 3.8482, - "step": 2190 - }, - { - "epoch": 0.68, - "learning_rate": 0.0002684649613525212, - "loss": 3.6833, - "step": 2191 - }, - { - "epoch": 0.68, - "learning_rate": 0.00026842933229328397, - "loss": 3.6177, - "step": 2192 - }, - { - "epoch": 0.68, - "learning_rate": 0.00026839370323404675, - "loss": 3.1868, - "step": 2193 - }, - { - "epoch": 0.68, - "learning_rate": 0.00026835807417480953, - "loss": 3.5675, - "step": 2194 - }, - { - "epoch": 0.68, - "learning_rate": 0.00026832244511557226, - "loss": 3.4567, - "step": 2195 - }, - { - "epoch": 0.68, - "learning_rate": 0.00026828681605633504, - "loss": 3.2616, - "step": 2196 - }, - { - "epoch": 0.68, - "learning_rate": 0.0002682511869970978, - "loss": 3.1958, - "step": 2197 - }, - { - "epoch": 0.68, - "learning_rate": 0.00026821555793786055, - "loss": 3.01, - "step": 2198 - }, - { - "epoch": 0.68, - "learning_rate": 0.0002681799288786234, - "loss": 2.9395, - "step": 2199 - }, - { - "epoch": 0.68, - "learning_rate": 0.0002681442998193861, - "loss": 2.8252, - "step": 2200 - }, - { - "epoch": 0.68, - "learning_rate": 0.0002681086707601489, - "loss": 5.7191, - "step": 2201 - }, - { - "epoch": 0.68, - "learning_rate": 0.0002680730417009117, - "loss": 5.8459, - "step": 2202 - }, - { - "epoch": 0.68, - "learning_rate": 0.0002680374126416744, - "loss": 5.5128, - "step": 2203 - }, - { - "epoch": 0.68, - "learning_rate": 0.00026800178358243724, - "loss": 5.4134, - "step": 2204 - }, - { - "epoch": 0.68, - "learning_rate": 0.00026796615452319997, - "loss": 5.1334, - "step": 2205 - }, - { - "epoch": 0.68, - "learning_rate": 0.00026793052546396275, - "loss": 4.7954, - "step": 2206 - }, - { - "epoch": 0.68, - "learning_rate": 0.00026789489640472553, - "loss": 4.5874, - "step": 2207 - }, - { - "epoch": 0.68, - "learning_rate": 0.0002678592673454883, - "loss": 4.8741, - "step": 2208 - }, - { - "epoch": 0.68, - "learning_rate": 0.0002678236382862511, - "loss": 4.8996, - "step": 2209 - }, - { - "epoch": 0.68, - "learning_rate": 0.0002677880092270138, - "loss": 4.6359, - "step": 2210 - }, - { - "epoch": 0.68, - "learning_rate": 0.0002677523801677766, - "loss": 4.669, - "step": 2211 - }, - { - "epoch": 0.68, - "learning_rate": 0.0002677167511085394, - "loss": 4.497, - "step": 2212 - }, - { - "epoch": 0.68, - "learning_rate": 0.00026768112204930217, - "loss": 4.7719, - "step": 2213 - }, - { - "epoch": 0.68, - "learning_rate": 0.00026764549299006495, - "loss": 4.7294, - "step": 2214 - }, - { - "epoch": 0.68, - "learning_rate": 0.0002676098639308277, - "loss": 4.7212, - "step": 2215 - }, - { - "epoch": 0.68, - "learning_rate": 0.00026757423487159046, - "loss": 4.5129, - "step": 2216 - }, - { - "epoch": 0.68, - "learning_rate": 0.00026753860581235324, - "loss": 4.5215, - "step": 2217 - }, - { - "epoch": 0.68, - "learning_rate": 0.000267502976753116, - "loss": 4.4539, - "step": 2218 - }, - { - "epoch": 0.68, - "learning_rate": 0.00026746734769387875, - "loss": 4.1164, - "step": 2219 - }, - { - "epoch": 0.69, - "learning_rate": 0.00026743171863464154, - "loss": 4.2726, - "step": 2220 - }, - { - "epoch": 0.69, - "learning_rate": 0.0002673960895754043, - "loss": 4.3926, - "step": 2221 - }, - { - "epoch": 0.69, - 
"learning_rate": 0.0002673604605161671, - "loss": 4.5402, - "step": 2222 - }, - { - "epoch": 0.69, - "learning_rate": 0.0002673248314569299, - "loss": 4.6661, - "step": 2223 - }, - { - "epoch": 0.69, - "learning_rate": 0.0002672892023976926, - "loss": 4.3806, - "step": 2224 - }, - { - "epoch": 0.69, - "learning_rate": 0.0002672535733384554, - "loss": 4.3723, - "step": 2225 - }, - { - "epoch": 0.69, - "learning_rate": 0.00026721794427921817, - "loss": 4.3477, - "step": 2226 - }, - { - "epoch": 0.69, - "learning_rate": 0.00026718231521998095, - "loss": 4.3915, - "step": 2227 - }, - { - "epoch": 0.69, - "learning_rate": 0.00026714668616074374, - "loss": 4.4253, - "step": 2228 - }, - { - "epoch": 0.69, - "learning_rate": 0.00026711105710150646, - "loss": 4.0907, - "step": 2229 - }, - { - "epoch": 0.69, - "learning_rate": 0.00026707542804226925, - "loss": 4.1971, - "step": 2230 - }, - { - "epoch": 0.69, - "learning_rate": 0.00026703979898303203, - "loss": 4.3432, - "step": 2231 - }, - { - "epoch": 0.69, - "learning_rate": 0.0002670041699237948, - "loss": 4.2279, - "step": 2232 - }, - { - "epoch": 0.69, - "learning_rate": 0.0002669685408645576, - "loss": 4.2818, - "step": 2233 - }, - { - "epoch": 0.69, - "learning_rate": 0.0002669329118053203, - "loss": 4.1163, - "step": 2234 - }, - { - "epoch": 0.69, - "learning_rate": 0.00026689728274608315, - "loss": 4.1745, - "step": 2235 - }, - { - "epoch": 0.69, - "learning_rate": 0.0002668616536868459, - "loss": 4.1069, - "step": 2236 - }, - { - "epoch": 0.69, - "learning_rate": 0.00026682602462760866, - "loss": 3.875, - "step": 2237 - }, - { - "epoch": 0.69, - "learning_rate": 0.00026679039556837145, - "loss": 3.6572, - "step": 2238 - }, - { - "epoch": 0.69, - "learning_rate": 0.0002667547665091342, - "loss": 3.9336, - "step": 2239 - }, - { - "epoch": 0.69, - "learning_rate": 0.000266719137449897, - "loss": 4.0039, - "step": 2240 - }, - { - "epoch": 0.69, - "learning_rate": 0.00026668350839065974, - "loss": 4.0218, - "step": 2241 - }, - { - "epoch": 0.69, - "learning_rate": 0.0002666478793314225, - "loss": 3.8118, - "step": 2242 - }, - { - "epoch": 0.69, - "learning_rate": 0.0002666122502721853, - "loss": 3.5175, - "step": 2243 - }, - { - "epoch": 0.69, - "learning_rate": 0.00026657662121294803, - "loss": 3.8809, - "step": 2244 - }, - { - "epoch": 0.69, - "learning_rate": 0.0002665409921537108, - "loss": 3.3924, - "step": 2245 - }, - { - "epoch": 0.69, - "learning_rate": 0.0002665053630944736, - "loss": 3.1747, - "step": 2246 - }, - { - "epoch": 0.69, - "learning_rate": 0.0002664697340352364, - "loss": 3.3114, - "step": 2247 - }, - { - "epoch": 0.69, - "learning_rate": 0.00026643410497599916, - "loss": 3.09, - "step": 2248 - }, - { - "epoch": 0.69, - "learning_rate": 0.0002663984759167619, - "loss": 2.9028, - "step": 2249 - }, - { - "epoch": 0.69, - "learning_rate": 0.00026636284685752467, - "loss": 2.7627, - "step": 2250 - }, - { - "epoch": 0.69, - "learning_rate": 0.00026632721779828745, - "loss": 5.8523, - "step": 2251 - }, - { - "epoch": 0.7, - "learning_rate": 0.00026629158873905023, - "loss": 5.5851, - "step": 2252 - }, - { - "epoch": 0.7, - "learning_rate": 0.00026625595967981296, - "loss": 5.082, - "step": 2253 - }, - { - "epoch": 0.7, - "learning_rate": 0.0002662203306205758, - "loss": 5.3091, - "step": 2254 - }, - { - "epoch": 0.7, - "learning_rate": 0.0002661847015613385, - "loss": 4.8769, - "step": 2255 - }, - { - "epoch": 0.7, - "learning_rate": 0.0002661490725021013, - "loss": 4.5123, - "step": 2256 - }, - { - "epoch": 0.7, - 
"learning_rate": 0.0002661134434428641, - "loss": 4.4974, - "step": 2257 - }, - { - "epoch": 0.7, - "learning_rate": 0.0002660778143836268, - "loss": 4.5921, - "step": 2258 - }, - { - "epoch": 0.7, - "learning_rate": 0.00026604218532438965, - "loss": 4.5983, - "step": 2259 - }, - { - "epoch": 0.7, - "learning_rate": 0.0002660065562651524, - "loss": 4.7629, - "step": 2260 - }, - { - "epoch": 0.7, - "learning_rate": 0.00026597092720591516, - "loss": 4.587, - "step": 2261 - }, - { - "epoch": 0.7, - "learning_rate": 0.00026593529814667794, - "loss": 4.5702, - "step": 2262 - }, - { - "epoch": 0.7, - "learning_rate": 0.00026589966908744067, - "loss": 4.2578, - "step": 2263 - }, - { - "epoch": 0.7, - "learning_rate": 0.0002658640400282035, - "loss": 4.5776, - "step": 2264 - }, - { - "epoch": 0.7, - "learning_rate": 0.00026582841096896623, - "loss": 4.3071, - "step": 2265 - }, - { - "epoch": 0.7, - "learning_rate": 0.000265792781909729, - "loss": 4.5268, - "step": 2266 - }, - { - "epoch": 0.7, - "learning_rate": 0.0002657571528504918, - "loss": 4.6491, - "step": 2267 - }, - { - "epoch": 0.7, - "learning_rate": 0.0002657215237912545, - "loss": 4.4708, - "step": 2268 - }, - { - "epoch": 0.7, - "learning_rate": 0.00026568589473201736, - "loss": 4.331, - "step": 2269 - }, - { - "epoch": 0.7, - "learning_rate": 0.0002656502656727801, - "loss": 4.7472, - "step": 2270 - }, - { - "epoch": 0.7, - "learning_rate": 0.00026561463661354287, - "loss": 4.4351, - "step": 2271 - }, - { - "epoch": 0.7, - "learning_rate": 0.00026557900755430565, - "loss": 4.1651, - "step": 2272 - }, - { - "epoch": 0.7, - "learning_rate": 0.0002655433784950684, - "loss": 4.0908, - "step": 2273 - }, - { - "epoch": 0.7, - "learning_rate": 0.0002655077494358312, - "loss": 4.3298, - "step": 2274 - }, - { - "epoch": 0.7, - "learning_rate": 0.00026547212037659394, - "loss": 4.4526, - "step": 2275 - }, - { - "epoch": 0.7, - "learning_rate": 0.0002654364913173567, - "loss": 4.4924, - "step": 2276 - }, - { - "epoch": 0.7, - "learning_rate": 0.0002654008622581195, - "loss": 4.2038, - "step": 2277 - }, - { - "epoch": 0.7, - "learning_rate": 0.0002653652331988823, - "loss": 4.375, - "step": 2278 - }, - { - "epoch": 0.7, - "learning_rate": 0.000265329604139645, - "loss": 4.2648, - "step": 2279 - }, - { - "epoch": 0.7, - "learning_rate": 0.0002652939750804078, - "loss": 4.2919, - "step": 2280 - }, - { - "epoch": 0.7, - "learning_rate": 0.0002652583460211706, - "loss": 3.9786, - "step": 2281 - }, - { - "epoch": 0.7, - "learning_rate": 0.00026522271696193336, - "loss": 4.1125, - "step": 2282 - }, - { - "epoch": 0.7, - "learning_rate": 0.00026518708790269614, - "loss": 4.0368, - "step": 2283 - }, - { - "epoch": 0.7, - "learning_rate": 0.00026515145884345887, - "loss": 4.0796, - "step": 2284 - }, - { - "epoch": 0.71, - "learning_rate": 0.00026511582978422165, - "loss": 4.2385, - "step": 2285 - }, - { - "epoch": 0.71, - "learning_rate": 0.00026508020072498443, - "loss": 3.9758, - "step": 2286 - }, - { - "epoch": 0.71, - "learning_rate": 0.00026504457166574716, - "loss": 4.1498, - "step": 2287 - }, - { - "epoch": 0.71, - "learning_rate": 0.00026500894260651, - "loss": 4.0593, - "step": 2288 - }, - { - "epoch": 0.71, - "learning_rate": 0.0002649733135472727, - "loss": 3.5912, - "step": 2289 - }, - { - "epoch": 0.71, - "learning_rate": 0.0002649376844880355, - "loss": 3.7663, - "step": 2290 - }, - { - "epoch": 0.71, - "learning_rate": 0.0002649020554287983, - "loss": 3.6628, - "step": 2291 - }, - { - "epoch": 0.71, - "learning_rate": 0.000264866426369561, 
- "loss": 3.6855, - "step": 2292 - }, - { - "epoch": 0.71, - "learning_rate": 0.00026483079731032385, - "loss": 3.9472, - "step": 2293 - }, - { - "epoch": 0.71, - "learning_rate": 0.0002647951682510866, - "loss": 3.5427, - "step": 2294 - }, - { - "epoch": 0.71, - "learning_rate": 0.00026475953919184936, - "loss": 3.2918, - "step": 2295 - }, - { - "epoch": 0.71, - "learning_rate": 0.00026472391013261214, - "loss": 3.4156, - "step": 2296 - }, - { - "epoch": 0.71, - "learning_rate": 0.0002646882810733749, - "loss": 3.2335, - "step": 2297 - }, - { - "epoch": 0.71, - "learning_rate": 0.0002646526520141377, - "loss": 3.1201, - "step": 2298 - }, - { - "epoch": 0.71, - "learning_rate": 0.00026461702295490044, - "loss": 2.8926, - "step": 2299 - }, - { - "epoch": 0.71, - "learning_rate": 0.0002645813938956632, - "loss": 2.8963, - "step": 2300 - }, - { - "epoch": 0.71, - "learning_rate": 0.000264545764836426, - "loss": 5.5156, - "step": 2301 - }, - { - "epoch": 0.71, - "learning_rate": 0.0002645101357771888, - "loss": 5.4828, - "step": 2302 - }, - { - "epoch": 0.71, - "learning_rate": 0.00026447450671795156, - "loss": 4.8972, - "step": 2303 - }, - { - "epoch": 0.71, - "learning_rate": 0.0002644388776587143, - "loss": 4.7764, - "step": 2304 - }, - { - "epoch": 0.71, - "learning_rate": 0.0002644032485994771, - "loss": 4.6781, - "step": 2305 - }, - { - "epoch": 0.71, - "learning_rate": 0.00026436761954023985, - "loss": 4.5116, - "step": 2306 - }, - { - "epoch": 0.71, - "learning_rate": 0.00026433199048100264, - "loss": 4.7391, - "step": 2307 - }, - { - "epoch": 0.71, - "learning_rate": 0.0002642963614217654, - "loss": 4.7079, - "step": 2308 - }, - { - "epoch": 0.71, - "learning_rate": 0.00026426073236252815, - "loss": 4.6104, - "step": 2309 - }, - { - "epoch": 0.71, - "learning_rate": 0.00026422510330329093, - "loss": 4.5504, - "step": 2310 - }, - { - "epoch": 0.71, - "learning_rate": 0.0002641894742440537, - "loss": 4.7064, - "step": 2311 - }, - { - "epoch": 0.71, - "learning_rate": 0.0002641538451848165, - "loss": 4.544, - "step": 2312 - }, - { - "epoch": 0.71, - "learning_rate": 0.0002641182161255792, - "loss": 4.3515, - "step": 2313 - }, - { - "epoch": 0.71, - "learning_rate": 0.000264082587066342, - "loss": 4.3783, - "step": 2314 - }, - { - "epoch": 0.71, - "learning_rate": 0.0002640469580071048, - "loss": 4.6333, - "step": 2315 - }, - { - "epoch": 0.71, - "learning_rate": 0.00026401132894786757, - "loss": 4.5166, - "step": 2316 - }, - { - "epoch": 0.72, - "learning_rate": 0.00026397569988863035, - "loss": 4.4685, - "step": 2317 - }, - { - "epoch": 0.72, - "learning_rate": 0.0002639400708293931, - "loss": 4.5128, - "step": 2318 - }, - { - "epoch": 0.72, - "learning_rate": 0.00026390444177015586, - "loss": 4.6736, - "step": 2319 - }, - { - "epoch": 0.72, - "learning_rate": 0.00026386881271091864, - "loss": 4.3192, - "step": 2320 - }, - { - "epoch": 0.72, - "learning_rate": 0.0002638331836516814, - "loss": 4.5259, - "step": 2321 - }, - { - "epoch": 0.72, - "learning_rate": 0.0002637975545924442, - "loss": 4.4803, - "step": 2322 - }, - { - "epoch": 0.72, - "learning_rate": 0.00026376192553320693, - "loss": 4.4336, - "step": 2323 - }, - { - "epoch": 0.72, - "learning_rate": 0.0002637262964739697, - "loss": 4.2775, - "step": 2324 - }, - { - "epoch": 0.72, - "learning_rate": 0.0002636906674147325, - "loss": 4.4003, - "step": 2325 - }, - { - "epoch": 0.72, - "learning_rate": 0.0002636550383554953, - "loss": 4.3128, - "step": 2326 - }, - { - "epoch": 0.72, - "learning_rate": 0.00026361940929625806, - 
"loss": 4.1117, - "step": 2327 - }, - { - "epoch": 0.72, - "learning_rate": 0.0002635837802370208, - "loss": 4.1381, - "step": 2328 - }, - { - "epoch": 0.72, - "learning_rate": 0.0002635481511777836, - "loss": 4.188, - "step": 2329 - }, - { - "epoch": 0.72, - "learning_rate": 0.00026351252211854635, - "loss": 4.2737, - "step": 2330 - }, - { - "epoch": 0.72, - "learning_rate": 0.00026347689305930913, - "loss": 3.9289, - "step": 2331 - }, - { - "epoch": 0.72, - "learning_rate": 0.0002634412640000719, - "loss": 4.5693, - "step": 2332 - }, - { - "epoch": 0.72, - "learning_rate": 0.00026340563494083464, - "loss": 4.2732, - "step": 2333 - }, - { - "epoch": 0.72, - "learning_rate": 0.0002633700058815974, - "loss": 4.089, - "step": 2334 - }, - { - "epoch": 0.72, - "learning_rate": 0.0002633343768223602, - "loss": 4.2653, - "step": 2335 - }, - { - "epoch": 0.72, - "learning_rate": 0.000263298747763123, - "loss": 4.293, - "step": 2336 - }, - { - "epoch": 0.72, - "learning_rate": 0.00026326311870388577, - "loss": 4.1921, - "step": 2337 - }, - { - "epoch": 0.72, - "learning_rate": 0.0002632274896446485, - "loss": 3.8543, - "step": 2338 - }, - { - "epoch": 0.72, - "learning_rate": 0.0002631918605854113, - "loss": 3.7271, - "step": 2339 - }, - { - "epoch": 0.72, - "learning_rate": 0.00026315623152617406, - "loss": 4.192, - "step": 2340 - }, - { - "epoch": 0.72, - "learning_rate": 0.00026312060246693684, - "loss": 3.6994, - "step": 2341 - }, - { - "epoch": 0.72, - "learning_rate": 0.0002630849734076996, - "loss": 3.5377, - "step": 2342 - }, - { - "epoch": 0.72, - "learning_rate": 0.00026304934434846235, - "loss": 3.7842, - "step": 2343 - }, - { - "epoch": 0.72, - "learning_rate": 0.00026301371528922513, - "loss": 3.7985, - "step": 2344 - }, - { - "epoch": 0.72, - "learning_rate": 0.0002629780862299879, - "loss": 3.482, - "step": 2345 - }, - { - "epoch": 0.72, - "learning_rate": 0.0002629424571707507, - "loss": 3.3045, - "step": 2346 - }, - { - "epoch": 0.72, - "learning_rate": 0.0002629068281115134, - "loss": 3.4809, - "step": 2347 - }, - { - "epoch": 0.72, - "learning_rate": 0.00026287119905227626, - "loss": 3.0863, - "step": 2348 - }, - { - "epoch": 0.72, - "learning_rate": 0.000262835569993039, - "loss": 3.2275, - "step": 2349 - }, - { - "epoch": 0.73, - "learning_rate": 0.00026279994093380177, - "loss": 2.9093, - "step": 2350 - }, - { - "epoch": 0.73, - "learning_rate": 0.00026276431187456455, - "loss": 5.4127, - "step": 2351 - }, - { - "epoch": 0.73, - "learning_rate": 0.0002627286828153273, - "loss": 5.5323, - "step": 2352 - }, - { - "epoch": 0.73, - "learning_rate": 0.0002626930537560901, - "loss": 5.1589, - "step": 2353 - }, - { - "epoch": 0.73, - "learning_rate": 0.00026265742469685284, - "loss": 4.807, - "step": 2354 - }, - { - "epoch": 0.73, - "learning_rate": 0.0002626217956376156, - "loss": 4.8033, - "step": 2355 - }, - { - "epoch": 0.73, - "learning_rate": 0.0002625861665783784, - "loss": 4.5973, - "step": 2356 - }, - { - "epoch": 0.73, - "learning_rate": 0.00026255053751914113, - "loss": 4.8542, - "step": 2357 - }, - { - "epoch": 0.73, - "learning_rate": 0.00026251490845990397, - "loss": 4.3649, - "step": 2358 - }, - { - "epoch": 0.73, - "learning_rate": 0.0002624792794006667, - "loss": 4.8726, - "step": 2359 - }, - { - "epoch": 0.73, - "learning_rate": 0.0002624436503414295, - "loss": 4.5285, - "step": 2360 - }, - { - "epoch": 0.73, - "learning_rate": 0.00026240802128219226, - "loss": 4.6399, - "step": 2361 - }, - { - "epoch": 0.73, - "learning_rate": 0.000262372392222955, - "loss": 
4.4844, - "step": 2362 - }, - { - "epoch": 0.73, - "learning_rate": 0.0002623367631637178, - "loss": 4.4909, - "step": 2363 - }, - { - "epoch": 0.73, - "learning_rate": 0.00026230113410448055, - "loss": 4.6307, - "step": 2364 - }, - { - "epoch": 0.73, - "learning_rate": 0.00026226550504524334, - "loss": 4.2898, - "step": 2365 - }, - { - "epoch": 0.73, - "learning_rate": 0.0002622298759860061, - "loss": 4.683, - "step": 2366 - }, - { - "epoch": 0.73, - "learning_rate": 0.00026219424692676884, - "loss": 4.4114, - "step": 2367 - }, - { - "epoch": 0.73, - "learning_rate": 0.0002621586178675316, - "loss": 4.5216, - "step": 2368 - }, - { - "epoch": 0.73, - "learning_rate": 0.0002621229888082944, - "loss": 4.3767, - "step": 2369 - }, - { - "epoch": 0.73, - "learning_rate": 0.0002620873597490572, - "loss": 4.1361, - "step": 2370 - }, - { - "epoch": 0.73, - "learning_rate": 0.00026205173068981997, - "loss": 4.6045, - "step": 2371 - }, - { - "epoch": 0.73, - "learning_rate": 0.00026201610163058275, - "loss": 4.31, - "step": 2372 - }, - { - "epoch": 0.73, - "learning_rate": 0.0002619804725713455, - "loss": 4.4309, - "step": 2373 - }, - { - "epoch": 0.73, - "learning_rate": 0.00026194484351210826, - "loss": 4.1245, - "step": 2374 - }, - { - "epoch": 0.73, - "learning_rate": 0.00026190921445287105, - "loss": 4.061, - "step": 2375 - }, - { - "epoch": 0.73, - "learning_rate": 0.00026187358539363383, - "loss": 4.1325, - "step": 2376 - }, - { - "epoch": 0.73, - "learning_rate": 0.0002618379563343966, - "loss": 4.4997, - "step": 2377 - }, - { - "epoch": 0.73, - "learning_rate": 0.00026180232727515934, - "loss": 4.3111, - "step": 2378 - }, - { - "epoch": 0.73, - "learning_rate": 0.0002617666982159221, - "loss": 4.225, - "step": 2379 - }, - { - "epoch": 0.73, - "learning_rate": 0.0002617310691566849, - "loss": 4.1639, - "step": 2380 - }, - { - "epoch": 0.73, - "learning_rate": 0.00026169544009744763, - "loss": 4.3238, - "step": 2381 - }, - { - "epoch": 0.74, - "learning_rate": 0.00026165981103821046, - "loss": 4.0697, - "step": 2382 - }, - { - "epoch": 0.74, - "learning_rate": 0.0002616241819789732, - "loss": 4.0639, - "step": 2383 - }, - { - "epoch": 0.74, - "learning_rate": 0.000261588552919736, - "loss": 4.0016, - "step": 2384 - }, - { - "epoch": 0.74, - "learning_rate": 0.00026155292386049876, - "loss": 4.0495, - "step": 2385 - }, - { - "epoch": 0.74, - "learning_rate": 0.0002615172948012615, - "loss": 3.8519, - "step": 2386 - }, - { - "epoch": 0.74, - "learning_rate": 0.0002614816657420243, - "loss": 3.7634, - "step": 2387 - }, - { - "epoch": 0.74, - "learning_rate": 0.00026144603668278705, - "loss": 4.1468, - "step": 2388 - }, - { - "epoch": 0.74, - "learning_rate": 0.00026141040762354983, - "loss": 3.5792, - "step": 2389 - }, - { - "epoch": 0.74, - "learning_rate": 0.0002613747785643126, - "loss": 3.6677, - "step": 2390 - }, - { - "epoch": 0.74, - "learning_rate": 0.0002613391495050754, - "loss": 3.9524, - "step": 2391 - }, - { - "epoch": 0.74, - "learning_rate": 0.0002613035204458382, - "loss": 3.5922, - "step": 2392 - }, - { - "epoch": 0.74, - "learning_rate": 0.0002612678913866009, - "loss": 3.6583, - "step": 2393 - }, - { - "epoch": 0.74, - "learning_rate": 0.0002612322623273637, - "loss": 3.3502, - "step": 2394 - }, - { - "epoch": 0.74, - "learning_rate": 0.00026119663326812647, - "loss": 3.4239, - "step": 2395 - }, - { - "epoch": 0.74, - "learning_rate": 0.00026116100420888925, - "loss": 3.2053, - "step": 2396 - }, - { - "epoch": 0.74, - "learning_rate": 0.00026112537514965203, - "loss": 3.2898, 
- "step": 2397 - }, - { - "epoch": 0.74, - "learning_rate": 0.00026108974609041476, - "loss": 3.1544, - "step": 2398 - }, - { - "epoch": 0.74, - "learning_rate": 0.00026105411703117754, - "loss": 2.9352, - "step": 2399 - }, - { - "epoch": 0.74, - "learning_rate": 0.0002610184879719403, - "loss": 2.8481, - "step": 2400 - }, - { - "epoch": 0.74, - "learning_rate": 0.0002609828589127031, - "loss": 5.6219, - "step": 2401 - }, - { - "epoch": 0.74, - "learning_rate": 0.00026094722985346583, - "loss": 5.5077, - "step": 2402 - }, - { - "epoch": 0.74, - "learning_rate": 0.0002609116007942286, - "loss": 4.78, - "step": 2403 - }, - { - "epoch": 0.74, - "learning_rate": 0.0002608759717349914, - "loss": 4.8887, - "step": 2404 - }, - { - "epoch": 0.74, - "learning_rate": 0.0002608403426757542, - "loss": 4.7734, - "step": 2405 - }, - { - "epoch": 0.74, - "learning_rate": 0.00026080471361651696, - "loss": 4.4328, - "step": 2406 - }, - { - "epoch": 0.74, - "learning_rate": 0.0002607690845572797, - "loss": 4.9363, - "step": 2407 - }, - { - "epoch": 0.74, - "learning_rate": 0.00026073345549804247, - "loss": 4.3731, - "step": 2408 - }, - { - "epoch": 0.74, - "learning_rate": 0.00026069782643880525, - "loss": 4.7249, - "step": 2409 - }, - { - "epoch": 0.74, - "learning_rate": 0.00026066219737956803, - "loss": 4.8923, - "step": 2410 - }, - { - "epoch": 0.74, - "learning_rate": 0.0002606265683203308, - "loss": 4.5366, - "step": 2411 - }, - { - "epoch": 0.74, - "learning_rate": 0.00026059093926109354, - "loss": 4.4079, - "step": 2412 - }, - { - "epoch": 0.74, - "learning_rate": 0.0002605553102018563, - "loss": 4.2715, - "step": 2413 - }, - { - "epoch": 0.75, - "learning_rate": 0.0002605196811426191, - "loss": 4.7195, - "step": 2414 - }, - { - "epoch": 0.75, - "learning_rate": 0.0002604840520833819, - "loss": 4.5711, - "step": 2415 - }, - { - "epoch": 0.75, - "learning_rate": 0.00026044842302414467, - "loss": 4.6974, - "step": 2416 - }, - { - "epoch": 0.75, - "learning_rate": 0.0002604127939649074, - "loss": 4.4431, - "step": 2417 - }, - { - "epoch": 0.75, - "learning_rate": 0.00026037716490567023, - "loss": 4.5794, - "step": 2418 - }, - { - "epoch": 0.75, - "learning_rate": 0.00026034153584643296, - "loss": 4.2757, - "step": 2419 - }, - { - "epoch": 0.75, - "learning_rate": 0.00026030590678719574, - "loss": 4.1613, - "step": 2420 - }, - { - "epoch": 0.75, - "learning_rate": 0.0002602702777279585, - "loss": 4.4725, - "step": 2421 - }, - { - "epoch": 0.75, - "learning_rate": 0.00026023464866872125, - "loss": 4.1682, - "step": 2422 - }, - { - "epoch": 0.75, - "learning_rate": 0.0002601990196094841, - "loss": 4.5955, - "step": 2423 - }, - { - "epoch": 0.75, - "learning_rate": 0.0002601633905502468, - "loss": 4.2187, - "step": 2424 - }, - { - "epoch": 0.75, - "learning_rate": 0.0002601277614910096, - "loss": 4.3871, - "step": 2425 - }, - { - "epoch": 0.75, - "learning_rate": 0.0002600921324317724, - "loss": 4.1902, - "step": 2426 - }, - { - "epoch": 0.75, - "learning_rate": 0.0002600565033725351, - "loss": 4.5625, - "step": 2427 - }, - { - "epoch": 0.75, - "learning_rate": 0.0002600208743132979, - "loss": 4.3426, - "step": 2428 - }, - { - "epoch": 0.75, - "learning_rate": 0.00025998524525406067, - "loss": 4.1461, - "step": 2429 - }, - { - "epoch": 0.75, - "learning_rate": 0.00025994961619482345, - "loss": 4.3465, - "step": 2430 - }, - { - "epoch": 0.75, - "learning_rate": 0.00025991398713558623, - "loss": 4.4043, - "step": 2431 - }, - { - "epoch": 0.75, - "learning_rate": 0.00025987835807634896, - "loss": 4.1648, - 
"step": 2432 - }, - { - "epoch": 0.75, - "learning_rate": 0.00025984272901711174, - "loss": 4.0564, - "step": 2433 - }, - { - "epoch": 0.75, - "learning_rate": 0.0002598070999578745, - "loss": 3.9525, - "step": 2434 - }, - { - "epoch": 0.75, - "learning_rate": 0.0002597714708986373, - "loss": 4.1194, - "step": 2435 - }, - { - "epoch": 0.75, - "learning_rate": 0.0002597358418394001, - "loss": 4.1319, - "step": 2436 - }, - { - "epoch": 0.75, - "learning_rate": 0.0002597002127801628, - "loss": 3.736, - "step": 2437 - }, - { - "epoch": 0.75, - "learning_rate": 0.0002596645837209256, - "loss": 3.9117, - "step": 2438 - }, - { - "epoch": 0.75, - "learning_rate": 0.0002596289546616884, - "loss": 3.7947, - "step": 2439 - }, - { - "epoch": 0.75, - "learning_rate": 0.00025959332560245116, - "loss": 3.6747, - "step": 2440 - }, - { - "epoch": 0.75, - "learning_rate": 0.0002595576965432139, - "loss": 3.6864, - "step": 2441 - }, - { - "epoch": 0.75, - "learning_rate": 0.0002595220674839767, - "loss": 3.696, - "step": 2442 - }, - { - "epoch": 0.75, - "learning_rate": 0.00025948643842473945, - "loss": 3.9071, - "step": 2443 - }, - { - "epoch": 0.75, - "learning_rate": 0.00025945080936550224, - "loss": 3.1786, - "step": 2444 - }, - { - "epoch": 0.75, - "learning_rate": 0.000259415180306265, - "loss": 3.7053, - "step": 2445 - }, - { - "epoch": 0.75, - "learning_rate": 0.00025937955124702775, - "loss": 3.2856, - "step": 2446 - }, - { - "epoch": 0.76, - "learning_rate": 0.0002593439221877906, - "loss": 3.3361, - "step": 2447 - }, - { - "epoch": 0.76, - "learning_rate": 0.0002593082931285533, - "loss": 2.7972, - "step": 2448 - }, - { - "epoch": 0.76, - "learning_rate": 0.0002592726640693161, - "loss": 2.8921, - "step": 2449 - }, - { - "epoch": 0.76, - "learning_rate": 0.00025923703501007887, - "loss": 2.8109, - "step": 2450 - }, - { - "epoch": 0.76, - "learning_rate": 0.0002592014059508416, - "loss": 5.58, - "step": 2451 - }, - { - "epoch": 0.76, - "learning_rate": 0.00025916577689160444, - "loss": 5.2312, - "step": 2452 - }, - { - "epoch": 0.76, - "learning_rate": 0.00025913014783236716, - "loss": 5.116, - "step": 2453 - }, - { - "epoch": 0.76, - "learning_rate": 0.00025909451877312995, - "loss": 4.7117, - "step": 2454 - }, - { - "epoch": 0.76, - "learning_rate": 0.00025905888971389273, - "loss": 4.8747, - "step": 2455 - }, - { - "epoch": 0.76, - "learning_rate": 0.00025902326065465546, - "loss": 4.5118, - "step": 2456 - }, - { - "epoch": 0.76, - "learning_rate": 0.0002589876315954183, - "loss": 4.5341, - "step": 2457 - }, - { - "epoch": 0.76, - "learning_rate": 0.000258952002536181, - "loss": 4.8384, - "step": 2458 - }, - { - "epoch": 0.76, - "learning_rate": 0.0002589163734769438, - "loss": 4.6472, - "step": 2459 - }, - { - "epoch": 0.76, - "learning_rate": 0.0002588807444177066, - "loss": 4.4225, - "step": 2460 - }, - { - "epoch": 0.76, - "learning_rate": 0.0002588451153584693, - "loss": 4.7712, - "step": 2461 - }, - { - "epoch": 0.76, - "learning_rate": 0.0002588094862992321, - "loss": 4.5142, - "step": 2462 - }, - { - "epoch": 0.76, - "learning_rate": 0.0002587738572399949, - "loss": 4.3298, - "step": 2463 - }, - { - "epoch": 0.76, - "learning_rate": 0.00025873822818075766, - "loss": 4.2856, - "step": 2464 - }, - { - "epoch": 0.76, - "learning_rate": 0.00025870259912152044, - "loss": 4.5749, - "step": 2465 - }, - { - "epoch": 0.76, - "learning_rate": 0.0002586669700622832, - "loss": 4.1792, - "step": 2466 - }, - { - "epoch": 0.76, - "learning_rate": 0.00025863134100304595, - "loss": 4.3868, - "step": 2467 
- }, - { - "epoch": 0.76, - "learning_rate": 0.00025859571194380873, - "loss": 4.3901, - "step": 2468 - }, - { - "epoch": 0.76, - "learning_rate": 0.0002585600828845715, - "loss": 4.3862, - "step": 2469 - }, - { - "epoch": 0.76, - "learning_rate": 0.0002585244538253343, - "loss": 4.2981, - "step": 2470 - }, - { - "epoch": 0.76, - "learning_rate": 0.0002584888247660971, - "loss": 4.4714, - "step": 2471 - }, - { - "epoch": 0.76, - "learning_rate": 0.0002584531957068598, - "loss": 4.19, - "step": 2472 - }, - { - "epoch": 0.76, - "learning_rate": 0.0002584175666476226, - "loss": 4.2361, - "step": 2473 - }, - { - "epoch": 0.76, - "learning_rate": 0.00025838193758838537, - "loss": 4.4165, - "step": 2474 - }, - { - "epoch": 0.76, - "learning_rate": 0.0002583463085291481, - "loss": 4.0266, - "step": 2475 - }, - { - "epoch": 0.76, - "learning_rate": 0.00025831067946991093, - "loss": 4.2148, - "step": 2476 - }, - { - "epoch": 0.76, - "learning_rate": 0.00025827505041067366, - "loss": 4.0271, - "step": 2477 - }, - { - "epoch": 0.76, - "learning_rate": 0.00025823942135143644, - "loss": 4.1752, - "step": 2478 - }, - { - "epoch": 0.77, - "learning_rate": 0.0002582037922921992, - "loss": 4.2301, - "step": 2479 - }, - { - "epoch": 0.77, - "learning_rate": 0.00025816816323296195, - "loss": 3.8347, - "step": 2480 - }, - { - "epoch": 0.77, - "learning_rate": 0.0002581325341737248, - "loss": 3.8505, - "step": 2481 - }, - { - "epoch": 0.77, - "learning_rate": 0.0002580969051144875, - "loss": 4.0187, - "step": 2482 - }, - { - "epoch": 0.77, - "learning_rate": 0.0002580612760552503, - "loss": 4.025, - "step": 2483 - }, - { - "epoch": 0.77, - "learning_rate": 0.0002580256469960131, - "loss": 4.0884, - "step": 2484 - }, - { - "epoch": 0.77, - "learning_rate": 0.00025799001793677586, - "loss": 3.6315, - "step": 2485 - }, - { - "epoch": 0.77, - "learning_rate": 0.00025795438887753864, - "loss": 4.0414, - "step": 2486 - }, - { - "epoch": 0.77, - "learning_rate": 0.00025791875981830137, - "loss": 3.9985, - "step": 2487 - }, - { - "epoch": 0.77, - "learning_rate": 0.00025788313075906415, - "loss": 3.9457, - "step": 2488 - }, - { - "epoch": 0.77, - "learning_rate": 0.00025784750169982693, - "loss": 3.7088, - "step": 2489 - }, - { - "epoch": 0.77, - "learning_rate": 0.0002578118726405897, - "loss": 3.7256, - "step": 2490 - }, - { - "epoch": 0.77, - "learning_rate": 0.0002577762435813525, - "loss": 3.623, - "step": 2491 - }, - { - "epoch": 0.77, - "learning_rate": 0.0002577406145221152, - "loss": 3.5128, - "step": 2492 - }, - { - "epoch": 0.77, - "learning_rate": 0.000257704985462878, - "loss": 3.2699, - "step": 2493 - }, - { - "epoch": 0.77, - "learning_rate": 0.0002576693564036408, - "loss": 3.4199, - "step": 2494 - }, - { - "epoch": 0.77, - "learning_rate": 0.00025763372734440357, - "loss": 3.4769, - "step": 2495 - }, - { - "epoch": 0.77, - "learning_rate": 0.0002575980982851663, - "loss": 3.3512, - "step": 2496 - }, - { - "epoch": 0.77, - "learning_rate": 0.0002575624692259291, - "loss": 3.396, - "step": 2497 - }, - { - "epoch": 0.77, - "learning_rate": 0.00025752684016669186, - "loss": 3.0352, - "step": 2498 - }, - { - "epoch": 0.77, - "learning_rate": 0.00025749121110745464, - "loss": 2.7582, - "step": 2499 - }, - { - "epoch": 0.77, - "learning_rate": 0.0002574555820482174, - "loss": 2.8816, - "step": 2500 - }, - { - "epoch": 0.77, - "eval_bleu": 0.0, - "eval_loss": 4.629215240478516, - "eval_runtime": 2570.0611, - "eval_samples_per_second": 5.743, - "eval_steps_per_second": 0.718, - "step": 2500 - }, - { - "epoch": 
0.77, - "learning_rate": 0.00025741995298898015, - "loss": 5.5757, - "step": 2501 - }, - { - "epoch": 0.77, - "learning_rate": 0.00025738432392974293, - "loss": 5.2847, - "step": 2502 - }, - { - "epoch": 0.77, - "learning_rate": 0.0002573486948705057, - "loss": 4.9658, - "step": 2503 - }, - { - "epoch": 0.77, - "learning_rate": 0.0002573130658112685, - "loss": 4.9746, - "step": 2504 - }, - { - "epoch": 0.77, - "learning_rate": 0.0002572774367520313, - "loss": 4.713, - "step": 2505 - }, - { - "epoch": 0.77, - "learning_rate": 0.000257241807692794, - "loss": 4.5189, - "step": 2506 - }, - { - "epoch": 0.77, - "learning_rate": 0.0002572061786335568, - "loss": 4.5148, - "step": 2507 - }, - { - "epoch": 0.77, - "learning_rate": 0.00025717054957431957, - "loss": 4.728, - "step": 2508 - }, - { - "epoch": 0.77, - "learning_rate": 0.00025713492051508235, - "loss": 4.5271, - "step": 2509 - }, - { - "epoch": 0.77, - "learning_rate": 0.00025709929145584513, - "loss": 4.5733, - "step": 2510 - }, - { - "epoch": 0.77, - "learning_rate": 0.00025706366239660786, - "loss": 4.4387, - "step": 2511 - }, - { - "epoch": 0.78, - "learning_rate": 0.0002570280333373707, - "loss": 4.4914, - "step": 2512 - }, - { - "epoch": 0.78, - "learning_rate": 0.0002570280333373707, - "loss": 4.5071, - "step": 2513 - }, - { - "epoch": 0.78, - "learning_rate": 0.0002569924042781334, - "loss": 4.4377, - "step": 2514 - }, - { - "epoch": 0.78, - "learning_rate": 0.0002569567752188962, - "loss": 4.6628, - "step": 2515 - }, - { - "epoch": 0.78, - "learning_rate": 0.000256921146159659, - "loss": 4.5694, - "step": 2516 - }, - { - "epoch": 0.78, - "learning_rate": 0.0002568855171004217, - "loss": 4.5888, - "step": 2517 - }, - { - "epoch": 0.78, - "learning_rate": 0.00025684988804118455, - "loss": 4.0517, - "step": 2518 - }, - { - "epoch": 0.78, - "learning_rate": 0.0002568142589819473, - "loss": 4.5035, - "step": 2519 - }, - { - "epoch": 0.78, - "learning_rate": 0.00025677862992271006, - "loss": 4.4972, - "step": 2520 - }, - { - "epoch": 0.78, - "learning_rate": 0.00025674300086347285, - "loss": 4.1945, - "step": 2521 - }, - { - "epoch": 0.78, - "learning_rate": 0.00025670737180423557, - "loss": 4.0409, - "step": 2522 - }, - { - "epoch": 0.78, - "learning_rate": 0.00025667174274499835, - "loss": 4.4237, - "step": 2523 - }, - { - "epoch": 0.78, - "learning_rate": 0.00025663611368576114, - "loss": 4.3724, - "step": 2524 - }, - { - "epoch": 0.78, - "learning_rate": 0.0002566004846265239, - "loss": 4.3978, - "step": 2525 - }, - { - "epoch": 0.78, - "learning_rate": 0.0002565648555672867, - "loss": 4.2491, - "step": 2526 - }, - { - "epoch": 0.78, - "learning_rate": 0.00025652922650804943, - "loss": 4.2489, - "step": 2527 - }, - { - "epoch": 0.78, - "learning_rate": 0.0002564935974488122, - "loss": 4.2097, - "step": 2528 - }, - { - "epoch": 0.78, - "learning_rate": 0.000256457968389575, - "loss": 4.4035, - "step": 2529 - }, - { - "epoch": 0.78, - "learning_rate": 0.0002564223393303378, - "loss": 4.0259, - "step": 2530 - }, - { - "epoch": 0.78, - "learning_rate": 0.0002563867102711005, - "loss": 4.1706, - "step": 2531 - }, - { - "epoch": 0.78, - "learning_rate": 0.0002563510812118633, - "loss": 4.3027, - "step": 2532 - }, - { - "epoch": 0.78, - "learning_rate": 0.00025631545215262607, - "loss": 4.2495, - "step": 2533 - }, - { - "epoch": 0.78, - "learning_rate": 0.00025627982309338885, - "loss": 3.9484, - "step": 2534 - }, - { - "epoch": 0.78, - "learning_rate": 0.00025624419403415163, - "loss": 3.9936, - "step": 2535 - }, - { - "epoch": 0.78, - 
"learning_rate": 0.00025620856497491436, - "loss": 4.3274, - "step": 2536 - }, - { - "epoch": 0.78, - "learning_rate": 0.0002561729359156772, - "loss": 3.7381, - "step": 2537 - }, - { - "epoch": 0.78, - "learning_rate": 0.0002561373068564399, - "loss": 3.6598, - "step": 2538 - }, - { - "epoch": 0.78, - "learning_rate": 0.0002561016777972027, - "loss": 4.0983, - "step": 2539 - }, - { - "epoch": 0.78, - "learning_rate": 0.0002560660487379655, - "loss": 3.8014, - "step": 2540 - }, - { - "epoch": 0.78, - "learning_rate": 0.0002560304196787282, - "loss": 3.591, - "step": 2541 - }, - { - "epoch": 0.78, - "learning_rate": 0.00025599479061949105, - "loss": 3.5541, - "step": 2542 - }, - { - "epoch": 0.78, - "learning_rate": 0.0002559591615602538, - "loss": 3.2301, - "step": 2543 - }, - { - "epoch": 0.79, - "learning_rate": 0.00025592353250101656, - "loss": 3.415, - "step": 2544 - }, - { - "epoch": 0.79, - "learning_rate": 0.00025588790344177934, - "loss": 3.2344, - "step": 2545 - }, - { - "epoch": 0.79, - "learning_rate": 0.00025585227438254207, - "loss": 3.2152, - "step": 2546 - }, - { - "epoch": 0.79, - "learning_rate": 0.0002558166453233049, - "loss": 3.0609, - "step": 2547 - }, - { - "epoch": 0.79, - "learning_rate": 0.00025578101626406763, - "loss": 2.7601, - "step": 2548 - }, - { - "epoch": 0.79, - "learning_rate": 0.0002557453872048304, - "loss": 2.7428, - "step": 2549 - }, - { - "epoch": 0.79, - "learning_rate": 0.0002557097581455932, - "loss": 2.6817, - "step": 2550 - }, - { - "epoch": 0.79, - "learning_rate": 0.0002556741290863559, - "loss": 5.4404, - "step": 2551 - }, - { - "epoch": 0.79, - "learning_rate": 0.00025563850002711876, - "loss": 5.3606, - "step": 2552 - }, - { - "epoch": 0.79, - "learning_rate": 0.0002556028709678815, - "loss": 5.0698, - "step": 2553 - }, - { - "epoch": 0.79, - "learning_rate": 0.00025556724190864427, - "loss": 4.89, - "step": 2554 - }, - { - "epoch": 0.79, - "learning_rate": 0.00025553161284940705, - "loss": 4.6995, - "step": 2555 - }, - { - "epoch": 0.79, - "learning_rate": 0.00025549598379016983, - "loss": 4.6063, - "step": 2556 - }, - { - "epoch": 0.79, - "learning_rate": 0.00025546035473093256, - "loss": 4.436, - "step": 2557 - }, - { - "epoch": 0.79, - "learning_rate": 0.00025542472567169534, - "loss": 4.3352, - "step": 2558 - }, - { - "epoch": 0.79, - "learning_rate": 0.0002553890966124581, - "loss": 4.6661, - "step": 2559 - }, - { - "epoch": 0.79, - "learning_rate": 0.0002553534675532209, - "loss": 4.4041, - "step": 2560 - }, - { - "epoch": 0.79, - "learning_rate": 0.0002553178384939837, - "loss": 4.4643, - "step": 2561 - }, - { - "epoch": 0.79, - "learning_rate": 0.0002552822094347464, - "loss": 4.3703, - "step": 2562 - }, - { - "epoch": 0.79, - "learning_rate": 0.0002552465803755092, - "loss": 4.1509, - "step": 2563 - }, - { - "epoch": 0.79, - "learning_rate": 0.000255210951316272, - "loss": 4.3058, - "step": 2564 - }, - { - "epoch": 0.79, - "learning_rate": 0.0002551753222570347, - "loss": 4.3526, - "step": 2565 - }, - { - "epoch": 0.79, - "learning_rate": 0.00025513969319779754, - "loss": 4.3695, - "step": 2566 - }, - { - "epoch": 0.79, - "learning_rate": 0.00025510406413856027, - "loss": 4.2624, - "step": 2567 - }, - { - "epoch": 0.79, - "learning_rate": 0.00025506843507932305, - "loss": 4.2271, - "step": 2568 - }, - { - "epoch": 0.79, - "learning_rate": 0.00025503280602008583, - "loss": 4.1177, - "step": 2569 - }, - { - "epoch": 0.79, - "learning_rate": 0.00025499717696084856, - "loss": 4.1103, - "step": 2570 - }, - { - "epoch": 0.79, - 
"learning_rate": 0.0002549615479016114, - "loss": 4.1808, - "step": 2571 - }, - { - "epoch": 0.79, - "learning_rate": 0.0002549259188423741, - "loss": 4.1405, - "step": 2572 - }, - { - "epoch": 0.79, - "learning_rate": 0.0002548902897831369, - "loss": 4.2116, - "step": 2573 - }, - { - "epoch": 0.79, - "learning_rate": 0.0002548546607238997, - "loss": 4.1636, - "step": 2574 - }, - { - "epoch": 0.79, - "learning_rate": 0.0002548190316646624, - "loss": 3.8042, - "step": 2575 - }, - { - "epoch": 0.8, - "learning_rate": 0.00025478340260542525, - "loss": 3.956, - "step": 2576 - }, - { - "epoch": 0.8, - "learning_rate": 0.000254747773546188, - "loss": 3.878, - "step": 2577 - }, - { - "epoch": 0.8, - "learning_rate": 0.00025471214448695076, - "loss": 3.8237, - "step": 2578 - }, - { - "epoch": 0.8, - "learning_rate": 0.00025467651542771354, - "loss": 4.0869, - "step": 2579 - }, - { - "epoch": 0.8, - "learning_rate": 0.0002546408863684763, - "loss": 4.0401, - "step": 2580 - }, - { - "epoch": 0.8, - "learning_rate": 0.0002546052573092391, - "loss": 4.086, - "step": 2581 - }, - { - "epoch": 0.8, - "learning_rate": 0.00025456962825000183, - "loss": 3.8983, - "step": 2582 - }, - { - "epoch": 0.8, - "learning_rate": 0.0002545339991907646, - "loss": 3.7856, - "step": 2583 - }, - { - "epoch": 0.8, - "learning_rate": 0.0002544983701315274, - "loss": 3.7518, - "step": 2584 - }, - { - "epoch": 0.8, - "learning_rate": 0.0002544627410722902, - "loss": 3.6817, - "step": 2585 - }, - { - "epoch": 0.8, - "learning_rate": 0.00025442711201305296, - "loss": 3.6225, - "step": 2586 - }, - { - "epoch": 0.8, - "learning_rate": 0.0002543914829538157, - "loss": 3.6017, - "step": 2587 - }, - { - "epoch": 0.8, - "learning_rate": 0.00025435585389457847, - "loss": 3.4799, - "step": 2588 - }, - { - "epoch": 0.8, - "learning_rate": 0.00025432022483534125, - "loss": 3.5111, - "step": 2589 - }, - { - "epoch": 0.8, - "learning_rate": 0.00025428459577610404, - "loss": 3.4776, - "step": 2590 - }, - { - "epoch": 0.8, - "learning_rate": 0.00025424896671686676, - "loss": 3.2313, - "step": 2591 - }, - { - "epoch": 0.8, - "learning_rate": 0.00025421333765762955, - "loss": 3.6537, - "step": 2592 - }, - { - "epoch": 0.8, - "learning_rate": 0.0002541777085983923, - "loss": 3.3233, - "step": 2593 - }, - { - "epoch": 0.8, - "learning_rate": 0.0002541420795391551, - "loss": 3.2158, - "step": 2594 - }, - { - "epoch": 0.8, - "learning_rate": 0.0002541064504799179, - "loss": 3.0938, - "step": 2595 - }, - { - "epoch": 0.8, - "learning_rate": 0.0002540708214206806, - "loss": 3.0766, - "step": 2596 - }, - { - "epoch": 0.8, - "learning_rate": 0.0002540351923614434, - "loss": 2.8116, - "step": 2597 - }, - { - "epoch": 0.8, - "learning_rate": 0.0002539995633022062, - "loss": 2.9469, - "step": 2598 - }, - { - "epoch": 0.8, - "learning_rate": 0.00025396393424296896, - "loss": 2.6102, - "step": 2599 - }, - { - "epoch": 0.8, - "learning_rate": 0.00025392830518373175, - "loss": 2.4305, - "step": 2600 - }, - { - "epoch": 0.8, - "learning_rate": 0.0002538926761244945, - "loss": 5.0191, - "step": 2601 - }, - { - "epoch": 0.8, - "learning_rate": 0.00025385704706525726, - "loss": 4.8744, - "step": 2602 - }, - { - "epoch": 0.8, - "learning_rate": 0.00025382141800602004, - "loss": 4.711, - "step": 2603 - }, - { - "epoch": 0.8, - "learning_rate": 0.0002537857889467828, - "loss": 4.6278, - "step": 2604 - }, - { - "epoch": 0.8, - "learning_rate": 0.0002537501598875456, - "loss": 4.5719, - "step": 2605 - }, - { - "epoch": 0.8, - "learning_rate": 0.00025371453082830833, 
- "loss": 4.4946, - "step": 2606 - }, - { - "epoch": 0.8, - "learning_rate": 0.00025367890176907116, - "loss": 4.3692, - "step": 2607 - }, - { - "epoch": 0.8, - "learning_rate": 0.0002536432727098339, - "loss": 4.4509, - "step": 2608 - }, - { - "epoch": 0.81, - "learning_rate": 0.0002536076436505967, - "loss": 4.4362, - "step": 2609 - }, - { - "epoch": 0.81, - "learning_rate": 0.00025357201459135946, - "loss": 4.216, - "step": 2610 - }, - { - "epoch": 0.81, - "learning_rate": 0.0002535363855321222, - "loss": 4.0902, - "step": 2611 - }, - { - "epoch": 0.81, - "learning_rate": 0.00025350075647288497, - "loss": 4.3517, - "step": 2612 - }, - { - "epoch": 0.81, - "learning_rate": 0.00025346512741364775, - "loss": 4.3637, - "step": 2613 - }, - { - "epoch": 0.81, - "learning_rate": 0.00025342949835441053, - "loss": 4.1776, - "step": 2614 - }, - { - "epoch": 0.81, - "learning_rate": 0.0002533938692951733, - "loss": 4.0667, - "step": 2615 - }, - { - "epoch": 0.81, - "learning_rate": 0.00025335824023593604, - "loss": 4.2753, - "step": 2616 - }, - { - "epoch": 0.81, - "learning_rate": 0.0002533226111766988, - "loss": 4.0334, - "step": 2617 - }, - { - "epoch": 0.81, - "learning_rate": 0.0002532869821174616, - "loss": 3.9675, - "step": 2618 - }, - { - "epoch": 0.81, - "learning_rate": 0.0002532513530582244, - "loss": 4.0972, - "step": 2619 - }, - { - "epoch": 0.81, - "learning_rate": 0.00025321572399898717, - "loss": 4.1103, - "step": 2620 - }, - { - "epoch": 0.81, - "learning_rate": 0.0002531800949397499, - "loss": 4.2297, - "step": 2621 - }, - { - "epoch": 0.81, - "learning_rate": 0.0002531444658805127, - "loss": 4.0127, - "step": 2622 - }, - { - "epoch": 0.81, - "learning_rate": 0.00025310883682127546, - "loss": 4.1252, - "step": 2623 - }, - { - "epoch": 0.81, - "learning_rate": 0.00025307320776203824, - "loss": 3.9704, - "step": 2624 - }, - { - "epoch": 0.81, - "learning_rate": 0.00025303757870280097, - "loss": 3.9824, - "step": 2625 - }, - { - "epoch": 0.81, - "learning_rate": 0.00025300194964356375, - "loss": 4.1631, - "step": 2626 - }, - { - "epoch": 0.81, - "learning_rate": 0.00025296632058432653, - "loss": 3.9456, - "step": 2627 - }, - { - "epoch": 0.81, - "learning_rate": 0.0002529306915250893, - "loss": 3.9519, - "step": 2628 - }, - { - "epoch": 0.81, - "learning_rate": 0.0002528950624658521, - "loss": 3.8485, - "step": 2629 - }, - { - "epoch": 0.81, - "learning_rate": 0.0002528594334066148, - "loss": 3.9384, - "step": 2630 - }, - { - "epoch": 0.81, - "learning_rate": 0.00025282380434737766, - "loss": 3.6721, - "step": 2631 - }, - { - "epoch": 0.81, - "learning_rate": 0.0002527881752881404, - "loss": 3.9117, - "step": 2632 - }, - { - "epoch": 0.81, - "learning_rate": 0.00025275254622890317, - "loss": 4.018, - "step": 2633 - }, - { - "epoch": 0.81, - "learning_rate": 0.00025271691716966595, - "loss": 3.6935, - "step": 2634 - }, - { - "epoch": 0.81, - "learning_rate": 0.0002526812881104287, - "loss": 3.7908, - "step": 2635 - }, - { - "epoch": 0.81, - "learning_rate": 0.0002526456590511915, - "loss": 3.8382, - "step": 2636 - }, - { - "epoch": 0.81, - "learning_rate": 0.00025261002999195424, - "loss": 3.4631, - "step": 2637 - }, - { - "epoch": 0.81, - "learning_rate": 0.000252574400932717, - "loss": 3.4809, - "step": 2638 - }, - { - "epoch": 0.81, - "learning_rate": 0.0002525387718734798, - "loss": 3.549, - "step": 2639 - }, - { - "epoch": 0.81, - "learning_rate": 0.00025250314281424253, - "loss": 3.6252, - "step": 2640 - }, - { - "epoch": 0.82, - "learning_rate": 0.00025246751375500537, - 
"loss": 3.4157, - "step": 2641 - }, - { - "epoch": 0.82, - "learning_rate": 0.0002524318846957681, - "loss": 3.3236, - "step": 2642 - }, - { - "epoch": 0.82, - "learning_rate": 0.0002523962556365309, - "loss": 3.3568, - "step": 2643 - }, - { - "epoch": 0.82, - "learning_rate": 0.00025236062657729366, - "loss": 3.1308, - "step": 2644 - }, - { - "epoch": 0.82, - "learning_rate": 0.0002523249975180564, - "loss": 3.2226, - "step": 2645 - }, - { - "epoch": 0.82, - "learning_rate": 0.00025228936845881917, - "loss": 2.9759, - "step": 2646 - }, - { - "epoch": 0.82, - "learning_rate": 0.00025225373939958195, - "loss": 3.1991, - "step": 2647 - }, - { - "epoch": 0.82, - "learning_rate": 0.00025221811034034473, - "loss": 2.8763, - "step": 2648 - }, - { - "epoch": 0.82, - "learning_rate": 0.0002521824812811075, - "loss": 2.4978, - "step": 2649 - }, - { - "epoch": 0.82, - "learning_rate": 0.0002521468522218703, - "loss": 2.4495, - "step": 2650 - }, - { - "epoch": 0.82, - "learning_rate": 0.000252111223162633, - "loss": 5.3153, - "step": 2651 - }, - { - "epoch": 0.82, - "learning_rate": 0.0002520755941033958, - "loss": 4.8939, - "step": 2652 - }, - { - "epoch": 0.82, - "learning_rate": 0.0002520399650441586, - "loss": 4.7659, - "step": 2653 - }, - { - "epoch": 0.82, - "learning_rate": 0.00025200433598492137, - "loss": 4.7056, - "step": 2654 - }, - { - "epoch": 0.82, - "learning_rate": 0.00025196870692568415, - "loss": 4.5684, - "step": 2655 - }, - { - "epoch": 0.82, - "learning_rate": 0.0002519330778664469, - "loss": 4.3812, - "step": 2656 - }, - { - "epoch": 0.82, - "learning_rate": 0.00025189744880720966, - "loss": 4.4827, - "step": 2657 - }, - { - "epoch": 0.82, - "learning_rate": 0.00025186181974797244, - "loss": 4.2564, - "step": 2658 - }, - { - "epoch": 0.82, - "learning_rate": 0.00025182619068873517, - "loss": 4.5515, - "step": 2659 - }, - { - "epoch": 0.82, - "learning_rate": 0.000251790561629498, - "loss": 4.3191, - "step": 2660 - }, - { - "epoch": 0.82, - "learning_rate": 0.00025175493257026074, - "loss": 4.2825, - "step": 2661 - }, - { - "epoch": 0.82, - "learning_rate": 0.0002517193035110235, - "loss": 4.1944, - "step": 2662 - }, - { - "epoch": 0.82, - "learning_rate": 0.0002516836744517863, - "loss": 4.293, - "step": 2663 - }, - { - "epoch": 0.82, - "learning_rate": 0.000251648045392549, - "loss": 4.2297, - "step": 2664 - }, - { - "epoch": 0.82, - "learning_rate": 0.00025161241633331186, - "loss": 4.1846, - "step": 2665 - }, - { - "epoch": 0.82, - "learning_rate": 0.0002515767872740746, - "loss": 4.063, - "step": 2666 - }, - { - "epoch": 0.82, - "learning_rate": 0.00025154115821483737, - "loss": 4.3292, - "step": 2667 - }, - { - "epoch": 0.82, - "learning_rate": 0.00025150552915560015, - "loss": 4.0888, - "step": 2668 - }, - { - "epoch": 0.82, - "learning_rate": 0.0002514699000963629, - "loss": 4.1236, - "step": 2669 - }, - { - "epoch": 0.82, - "learning_rate": 0.0002514342710371257, - "loss": 4.0076, - "step": 2670 - }, - { - "epoch": 0.82, - "learning_rate": 0.00025139864197788845, - "loss": 3.8963, - "step": 2671 - }, - { - "epoch": 0.82, - "learning_rate": 0.00025136301291865123, - "loss": 3.9127, - "step": 2672 - }, - { - "epoch": 0.82, - "learning_rate": 0.000251327383859414, - "loss": 3.9635, - "step": 2673 - }, - { - "epoch": 0.83, - "learning_rate": 0.0002512917548001768, - "loss": 3.8568, - "step": 2674 - }, - { - "epoch": 0.83, - "learning_rate": 0.0002512561257409396, - "loss": 4.0404, - "step": 2675 - }, - { - "epoch": 0.83, - "learning_rate": 0.0002512204966817023, - "loss": 
3.9645, - "step": 2676 - }, - { - "epoch": 0.83, - "learning_rate": 0.0002511848676224651, - "loss": 3.9574, - "step": 2677 - }, - { - "epoch": 0.83, - "learning_rate": 0.00025114923856322786, - "loss": 3.8694, - "step": 2678 - }, - { - "epoch": 0.83, - "learning_rate": 0.00025111360950399065, - "loss": 3.7471, - "step": 2679 - }, - { - "epoch": 0.83, - "learning_rate": 0.0002510779804447534, - "loss": 3.7963, - "step": 2680 - }, - { - "epoch": 0.83, - "learning_rate": 0.00025104235138551616, - "loss": 3.7481, - "step": 2681 - }, - { - "epoch": 0.83, - "learning_rate": 0.00025100672232627894, - "loss": 3.8706, - "step": 2682 - }, - { - "epoch": 0.83, - "learning_rate": 0.0002509710932670417, - "loss": 3.8604, - "step": 2683 - }, - { - "epoch": 0.83, - "learning_rate": 0.0002509354642078045, - "loss": 3.7611, - "step": 2684 - }, - { - "epoch": 0.83, - "learning_rate": 0.00025089983514856723, - "loss": 3.5409, - "step": 2685 - }, - { - "epoch": 0.83, - "learning_rate": 0.00025086420608933, - "loss": 3.7339, - "step": 2686 - }, - { - "epoch": 0.83, - "learning_rate": 0.0002508285770300928, - "loss": 3.6357, - "step": 2687 - }, - { - "epoch": 0.83, - "learning_rate": 0.0002507929479708556, - "loss": 3.605, - "step": 2688 - }, - { - "epoch": 0.83, - "learning_rate": 0.00025075731891161836, - "loss": 3.6572, - "step": 2689 - }, - { - "epoch": 0.83, - "learning_rate": 0.0002507216898523811, - "loss": 3.6468, - "step": 2690 - }, - { - "epoch": 0.83, - "learning_rate": 0.00025068606079314387, - "loss": 3.3461, - "step": 2691 - }, - { - "epoch": 0.83, - "learning_rate": 0.00025065043173390665, - "loss": 3.4089, - "step": 2692 - }, - { - "epoch": 0.83, - "learning_rate": 0.00025061480267466943, - "loss": 3.3006, - "step": 2693 - }, - { - "epoch": 0.83, - "learning_rate": 0.0002505791736154322, - "loss": 3.3784, - "step": 2694 - }, - { - "epoch": 0.83, - "learning_rate": 0.00025054354455619494, - "loss": 3.0976, - "step": 2695 - }, - { - "epoch": 0.83, - "learning_rate": 0.0002505079154969577, - "loss": 2.9745, - "step": 2696 - }, - { - "epoch": 0.83, - "learning_rate": 0.0002504722864377205, - "loss": 2.8536, - "step": 2697 - }, - { - "epoch": 0.83, - "learning_rate": 0.0002504366573784833, - "loss": 2.8292, - "step": 2698 - }, - { - "epoch": 0.83, - "learning_rate": 0.00025040102831924607, - "loss": 2.8176, - "step": 2699 - }, - { - "epoch": 0.83, - "learning_rate": 0.0002503653992600088, - "loss": 2.658, - "step": 2700 - }, - { - "epoch": 0.83, - "learning_rate": 0.00025032977020077163, - "loss": 5.4987, - "step": 2701 - }, - { - "epoch": 0.83, - "learning_rate": 0.00025029414114153436, - "loss": 5.2087, - "step": 2702 - }, - { - "epoch": 0.83, - "learning_rate": 0.00025025851208229714, - "loss": 4.7573, - "step": 2703 - }, - { - "epoch": 0.83, - "learning_rate": 0.0002502228830230599, - "loss": 4.4529, - "step": 2704 - }, - { - "epoch": 0.83, - "learning_rate": 0.00025018725396382265, - "loss": 4.5227, - "step": 2705 - }, - { - "epoch": 0.84, - "learning_rate": 0.00025015162490458543, - "loss": 4.4291, - "step": 2706 - }, - { - "epoch": 0.84, - "learning_rate": 0.0002501159958453482, - "loss": 4.3855, - "step": 2707 - }, - { - "epoch": 0.84, - "learning_rate": 0.000250080366786111, - "loss": 4.2062, - "step": 2708 - }, - { - "epoch": 0.84, - "learning_rate": 0.0002500447377268738, - "loss": 4.2823, - "step": 2709 - }, - { - "epoch": 0.84, - "learning_rate": 0.0002500091086676365, - "loss": 4.3326, - "step": 2710 - }, - { - "epoch": 0.84, - "learning_rate": 0.0002499734796083993, - "loss": 4.4894, 
- "step": 2711 - }, - { - "epoch": 0.84, - "learning_rate": 0.00024993785054916207, - "loss": 4.2602, - "step": 2712 - }, - { - "epoch": 0.84, - "learning_rate": 0.00024990222148992485, - "loss": 4.2127, - "step": 2713 - }, - { - "epoch": 0.84, - "learning_rate": 0.00024986659243068763, - "loss": 4.4705, - "step": 2714 - }, - { - "epoch": 0.84, - "learning_rate": 0.00024983096337145036, - "loss": 4.2217, - "step": 2715 - }, - { - "epoch": 0.84, - "learning_rate": 0.00024979533431221314, - "loss": 4.1255, - "step": 2716 - }, - { - "epoch": 0.84, - "learning_rate": 0.0002497597052529759, - "loss": 4.0255, - "step": 2717 - }, - { - "epoch": 0.84, - "learning_rate": 0.0002497240761937387, - "loss": 4.1824, - "step": 2718 - }, - { - "epoch": 0.84, - "learning_rate": 0.00024968844713450143, - "loss": 4.1604, - "step": 2719 - }, - { - "epoch": 0.84, - "learning_rate": 0.0002496528180752642, - "loss": 4.0386, - "step": 2720 - }, - { - "epoch": 0.84, - "learning_rate": 0.000249617189016027, - "loss": 4.2167, - "step": 2721 - }, - { - "epoch": 0.84, - "learning_rate": 0.0002495815599567898, - "loss": 3.8919, - "step": 2722 - }, - { - "epoch": 0.84, - "learning_rate": 0.00024954593089755256, - "loss": 4.1581, - "step": 2723 - }, - { - "epoch": 0.84, - "learning_rate": 0.0002495103018383153, - "loss": 4.0393, - "step": 2724 - }, - { - "epoch": 0.84, - "learning_rate": 0.0002494746727790781, - "loss": 3.9959, - "step": 2725 - }, - { - "epoch": 0.84, - "learning_rate": 0.00024943904371984085, - "loss": 3.947, - "step": 2726 - }, - { - "epoch": 0.84, - "learning_rate": 0.00024940341466060363, - "loss": 3.7699, - "step": 2727 - }, - { - "epoch": 0.84, - "learning_rate": 0.0002493677856013664, - "loss": 3.9608, - "step": 2728 - }, - { - "epoch": 0.84, - "learning_rate": 0.00024933215654212914, - "loss": 3.757, - "step": 2729 - }, - { - "epoch": 0.84, - "learning_rate": 0.000249296527482892, - "loss": 3.8792, - "step": 2730 - }, - { - "epoch": 0.84, - "learning_rate": 0.0002492608984236547, - "loss": 3.8096, - "step": 2731 - }, - { - "epoch": 0.84, - "learning_rate": 0.0002492252693644175, - "loss": 3.6003, - "step": 2732 - }, - { - "epoch": 0.84, - "learning_rate": 0.00024918964030518027, - "loss": 3.6477, - "step": 2733 - }, - { - "epoch": 0.84, - "learning_rate": 0.000249154011245943, - "loss": 3.6283, - "step": 2734 - }, - { - "epoch": 0.84, - "learning_rate": 0.00024911838218670584, - "loss": 3.656, - "step": 2735 - }, - { - "epoch": 0.84, - "learning_rate": 0.00024908275312746856, - "loss": 3.4448, - "step": 2736 - }, - { - "epoch": 0.84, - "learning_rate": 0.00024904712406823134, - "loss": 3.4105, - "step": 2737 - }, - { - "epoch": 0.84, - "learning_rate": 0.0002490114950089941, - "loss": 3.6094, - "step": 2738 - }, - { - "epoch": 0.85, - "learning_rate": 0.00024897586594975685, - "loss": 3.5526, - "step": 2739 - }, - { - "epoch": 0.85, - "learning_rate": 0.00024894023689051964, - "loss": 3.4898, - "step": 2740 - }, - { - "epoch": 0.85, - "learning_rate": 0.0002489046078312824, - "loss": 3.5501, - "step": 2741 - }, - { - "epoch": 0.85, - "learning_rate": 0.0002488689787720452, - "loss": 3.2843, - "step": 2742 - }, - { - "epoch": 0.85, - "learning_rate": 0.000248833349712808, - "loss": 3.2016, - "step": 2743 - }, - { - "epoch": 0.85, - "learning_rate": 0.00024879772065357076, - "loss": 3.3859, - "step": 2744 - }, - { - "epoch": 0.85, - "learning_rate": 0.0002487620915943335, - "loss": 3.0138, - "step": 2745 - }, - { - "epoch": 0.85, - "learning_rate": 0.0002487264625350963, - "loss": 3.0063, - "step": 
2746 - }, - { - "epoch": 0.85, - "learning_rate": 0.00024869083347585906, - "loss": 2.8542, - "step": 2747 - }, - { - "epoch": 0.85, - "learning_rate": 0.00024865520441662184, - "loss": 2.5747, - "step": 2748 - }, - { - "epoch": 0.85, - "learning_rate": 0.0002486195753573846, - "loss": 2.7381, - "step": 2749 - }, - { - "epoch": 0.85, - "learning_rate": 0.00024858394629814735, - "loss": 2.2629, - "step": 2750 - }, - { - "epoch": 0.85, - "learning_rate": 0.00024854831723891013, - "loss": 5.754, - "step": 2751 - }, - { - "epoch": 0.85, - "learning_rate": 0.0002485126881796729, - "loss": 5.1719, - "step": 2752 - }, - { - "epoch": 0.85, - "learning_rate": 0.00024847705912043564, - "loss": 4.9049, - "step": 2753 - }, - { - "epoch": 0.85, - "learning_rate": 0.0002484414300611985, - "loss": 4.7782, - "step": 2754 - }, - { - "epoch": 0.85, - "learning_rate": 0.0002484058010019612, - "loss": 4.1999, - "step": 2755 - }, - { - "epoch": 0.85, - "learning_rate": 0.000248370171942724, - "loss": 4.5041, - "step": 2756 - }, - { - "epoch": 0.85, - "learning_rate": 0.00024833454288348677, - "loss": 4.2243, - "step": 2757 - }, - { - "epoch": 0.85, - "learning_rate": 0.0002482989138242495, - "loss": 4.5833, - "step": 2758 - }, - { - "epoch": 0.85, - "learning_rate": 0.00024826328476501233, - "loss": 4.3533, - "step": 2759 - }, - { - "epoch": 0.85, - "learning_rate": 0.00024822765570577506, - "loss": 4.2367, - "step": 2760 - }, - { - "epoch": 0.85, - "learning_rate": 0.00024819202664653784, - "loss": 4.1026, - "step": 2761 - }, - { - "epoch": 0.85, - "learning_rate": 0.0002481563975873006, - "loss": 4.1035, - "step": 2762 - }, - { - "epoch": 0.85, - "learning_rate": 0.00024812076852806335, - "loss": 4.2734, - "step": 2763 - }, - { - "epoch": 0.85, - "learning_rate": 0.0002480851394688262, - "loss": 4.1579, - "step": 2764 - }, - { - "epoch": 0.85, - "learning_rate": 0.0002480495104095889, - "loss": 4.272, - "step": 2765 - }, - { - "epoch": 0.85, - "learning_rate": 0.0002480138813503517, - "loss": 4.1484, - "step": 2766 - }, - { - "epoch": 0.85, - "learning_rate": 0.0002479782522911145, - "loss": 4.1021, - "step": 2767 - }, - { - "epoch": 0.85, - "learning_rate": 0.00024794262323187726, - "loss": 4.1433, - "step": 2768 - }, - { - "epoch": 0.85, - "learning_rate": 0.00024790699417264004, - "loss": 3.9402, - "step": 2769 - }, - { - "epoch": 0.85, - "learning_rate": 0.00024787136511340277, - "loss": 3.8692, - "step": 2770 - }, - { - "epoch": 0.86, - "learning_rate": 0.00024783573605416555, - "loss": 4.1394, - "step": 2771 - }, - { - "epoch": 0.86, - "learning_rate": 0.00024780010699492833, - "loss": 4.1031, - "step": 2772 - }, - { - "epoch": 0.86, - "learning_rate": 0.0002477644779356911, - "loss": 4.0076, - "step": 2773 - }, - { - "epoch": 0.86, - "learning_rate": 0.00024772884887645384, - "loss": 4.038, - "step": 2774 - }, - { - "epoch": 0.86, - "learning_rate": 0.0002476932198172166, - "loss": 3.9126, - "step": 2775 - }, - { - "epoch": 0.86, - "learning_rate": 0.0002476575907579794, - "loss": 4.2113, - "step": 2776 - }, - { - "epoch": 0.86, - "learning_rate": 0.0002476219616987422, - "loss": 3.83, - "step": 2777 - }, - { - "epoch": 0.86, - "learning_rate": 0.00024758633263950497, - "loss": 3.8967, - "step": 2778 - }, - { - "epoch": 0.86, - "learning_rate": 0.0002475507035802677, - "loss": 3.7935, - "step": 2779 - }, - { - "epoch": 0.86, - "learning_rate": 0.0002475150745210305, - "loss": 3.8635, - "step": 2780 - }, - { - "epoch": 0.86, - "learning_rate": 0.00024747944546179326, - "loss": 3.9863, - "step": 2781 - 
}, - { - "epoch": 0.86, - "learning_rate": 0.00024744381640255604, - "loss": 3.8237, - "step": 2782 - }, - { - "epoch": 0.86, - "learning_rate": 0.0002474081873433188, - "loss": 3.8387, - "step": 2783 - }, - { - "epoch": 0.86, - "learning_rate": 0.00024737255828408155, - "loss": 3.5427, - "step": 2784 - }, - { - "epoch": 0.86, - "learning_rate": 0.00024733692922484433, - "loss": 3.7273, - "step": 2785 - }, - { - "epoch": 0.86, - "learning_rate": 0.0002473013001656071, - "loss": 3.6756, - "step": 2786 - }, - { - "epoch": 0.86, - "learning_rate": 0.0002472656711063699, - "loss": 3.5526, - "step": 2787 - }, - { - "epoch": 0.86, - "learning_rate": 0.0002472300420471327, - "loss": 3.4313, - "step": 2788 - }, - { - "epoch": 0.86, - "learning_rate": 0.0002471944129878954, - "loss": 3.4677, - "step": 2789 - }, - { - "epoch": 0.86, - "learning_rate": 0.0002471587839286582, - "loss": 3.7668, - "step": 2790 - }, - { - "epoch": 0.86, - "learning_rate": 0.00024712315486942097, - "loss": 3.5568, - "step": 2791 - }, - { - "epoch": 0.86, - "learning_rate": 0.00024708752581018375, - "loss": 3.366, - "step": 2792 - }, - { - "epoch": 0.86, - "learning_rate": 0.00024705189675094653, - "loss": 3.2278, - "step": 2793 - }, - { - "epoch": 0.86, - "learning_rate": 0.00024701626769170926, - "loss": 3.282, - "step": 2794 - }, - { - "epoch": 0.86, - "learning_rate": 0.00024698063863247204, - "loss": 3.0045, - "step": 2795 - }, - { - "epoch": 0.86, - "learning_rate": 0.0002469450095732348, - "loss": 3.1054, - "step": 2796 - }, - { - "epoch": 0.86, - "learning_rate": 0.0002469093805139976, - "loss": 2.8257, - "step": 2797 - }, - { - "epoch": 0.86, - "learning_rate": 0.0002468737514547604, - "loss": 2.7304, - "step": 2798 - }, - { - "epoch": 0.86, - "learning_rate": 0.0002468381223955231, - "loss": 2.7284, - "step": 2799 - }, - { - "epoch": 0.86, - "learning_rate": 0.0002468024933362859, - "loss": 2.5429, - "step": 2800 - }, - { - "epoch": 0.86, - "learning_rate": 0.0002467668642770487, - "loss": 5.268, - "step": 2801 - }, - { - "epoch": 0.86, - "learning_rate": 0.00024673123521781146, - "loss": 5.0074, - "step": 2802 - }, - { - "epoch": 0.87, - "learning_rate": 0.00024669560615857424, - "loss": 4.8031, - "step": 2803 - }, - { - "epoch": 0.87, - "learning_rate": 0.00024665997709933697, - "loss": 4.6615, - "step": 2804 - }, - { - "epoch": 0.87, - "learning_rate": 0.00024662434804009975, - "loss": 4.4112, - "step": 2805 - }, - { - "epoch": 0.87, - "learning_rate": 0.00024658871898086254, - "loss": 4.3456, - "step": 2806 - }, - { - "epoch": 0.87, - "learning_rate": 0.0002465530899216253, - "loss": 4.6209, - "step": 2807 - }, - { - "epoch": 0.87, - "learning_rate": 0.00024651746086238804, - "loss": 4.3433, - "step": 2808 - }, - { - "epoch": 0.87, - "learning_rate": 0.0002464818318031508, - "loss": 4.3165, - "step": 2809 - }, - { - "epoch": 0.87, - "learning_rate": 0.0002464462027439136, - "loss": 4.2259, - "step": 2810 - }, - { - "epoch": 0.87, - "learning_rate": 0.0002464105736846764, - "loss": 4.3759, - "step": 2811 - }, - { - "epoch": 0.87, - "learning_rate": 0.00024637494462543917, - "loss": 4.3307, - "step": 2812 - }, - { - "epoch": 0.87, - "learning_rate": 0.0002463393155662019, - "loss": 4.2332, - "step": 2813 - }, - { - "epoch": 0.87, - "learning_rate": 0.00024630368650696474, - "loss": 3.9422, - "step": 2814 - }, - { - "epoch": 0.87, - "learning_rate": 0.00024626805744772746, - "loss": 4.4347, - "step": 2815 - }, - { - "epoch": 0.87, - "learning_rate": 0.00024623242838849025, - "loss": 4.3249, - "step": 2816 - }, - 
{ - "epoch": 0.87, - "learning_rate": 0.00024619679932925303, - "loss": 4.3051, - "step": 2817 - }, - { - "epoch": 0.87, - "learning_rate": 0.00024616117027001576, - "loss": 4.4575, - "step": 2818 - }, - { - "epoch": 0.87, - "learning_rate": 0.0002461255412107786, - "loss": 4.0797, - "step": 2819 - }, - { - "epoch": 0.87, - "learning_rate": 0.0002460899121515413, - "loss": 4.2489, - "step": 2820 - }, - { - "epoch": 0.87, - "learning_rate": 0.0002460542830923041, - "loss": 3.9804, - "step": 2821 - }, - { - "epoch": 0.87, - "learning_rate": 0.0002460186540330669, - "loss": 4.0563, - "step": 2822 - }, - { - "epoch": 0.87, - "learning_rate": 0.0002459830249738296, - "loss": 4.2551, - "step": 2823 - }, - { - "epoch": 0.87, - "learning_rate": 0.00024594739591459245, - "loss": 3.922, - "step": 2824 - }, - { - "epoch": 0.87, - "learning_rate": 0.0002459117668553552, - "loss": 3.9128, - "step": 2825 - }, - { - "epoch": 0.87, - "learning_rate": 0.00024587613779611796, - "loss": 4.0141, - "step": 2826 - }, - { - "epoch": 0.87, - "learning_rate": 0.00024584050873688074, - "loss": 4.0312, - "step": 2827 - }, - { - "epoch": 0.87, - "learning_rate": 0.00024580487967764347, - "loss": 3.8548, - "step": 2828 - }, - { - "epoch": 0.87, - "learning_rate": 0.0002457692506184063, - "loss": 4.1262, - "step": 2829 - }, - { - "epoch": 0.87, - "learning_rate": 0.00024573362155916903, - "loss": 4.0388, - "step": 2830 - }, - { - "epoch": 0.87, - "learning_rate": 0.0002456979924999318, - "loss": 4.0011, - "step": 2831 - }, - { - "epoch": 0.87, - "learning_rate": 0.0002456623634406946, - "loss": 3.7304, - "step": 2832 - }, - { - "epoch": 0.87, - "learning_rate": 0.0002456267343814573, - "loss": 3.7532, - "step": 2833 - }, - { - "epoch": 0.87, - "learning_rate": 0.0002455911053222201, - "loss": 3.958, - "step": 2834 - }, - { - "epoch": 0.87, - "learning_rate": 0.0002455554762629829, - "loss": 3.7444, - "step": 2835 - }, - { - "epoch": 0.88, - "learning_rate": 0.00024551984720374567, - "loss": 3.6189, - "step": 2836 - }, - { - "epoch": 0.88, - "learning_rate": 0.00024548421814450845, - "loss": 3.5782, - "step": 2837 - }, - { - "epoch": 0.88, - "learning_rate": 0.00024544858908527123, - "loss": 3.6067, - "step": 2838 - }, - { - "epoch": 0.88, - "learning_rate": 0.00024541296002603396, - "loss": 3.692, - "step": 2839 - }, - { - "epoch": 0.88, - "learning_rate": 0.00024537733096679674, - "loss": 3.195, - "step": 2840 - }, - { - "epoch": 0.88, - "learning_rate": 0.0002453417019075595, - "loss": 3.5531, - "step": 2841 - }, - { - "epoch": 0.88, - "learning_rate": 0.00024530607284832225, - "loss": 3.3082, - "step": 2842 - }, - { - "epoch": 0.88, - "learning_rate": 0.0002452704437890851, - "loss": 3.3136, - "step": 2843 - }, - { - "epoch": 0.88, - "learning_rate": 0.0002452348147298478, - "loss": 3.0921, - "step": 2844 - }, - { - "epoch": 0.88, - "learning_rate": 0.0002451991856706106, - "loss": 3.2452, - "step": 2845 - }, - { - "epoch": 0.88, - "learning_rate": 0.0002451635566113734, - "loss": 2.9009, - "step": 2846 - }, - { - "epoch": 0.88, - "learning_rate": 0.0002451279275521361, - "loss": 2.8125, - "step": 2847 - }, - { - "epoch": 0.88, - "learning_rate": 0.00024509229849289894, - "loss": 2.7623, - "step": 2848 - }, - { - "epoch": 0.88, - "learning_rate": 0.00024505666943366167, - "loss": 2.6036, - "step": 2849 - }, - { - "epoch": 0.88, - "learning_rate": 0.00024502104037442445, - "loss": 2.5381, - "step": 2850 - }, - { - "epoch": 0.88, - "learning_rate": 0.00024498541131518723, - "loss": 5.3456, - "step": 2851 - }, - { - 
"epoch": 0.88, - "learning_rate": 0.00024494978225594996, - "loss": 4.9337, - "step": 2852 - }, - { - "epoch": 0.88, - "learning_rate": 0.0002449141531967128, - "loss": 5.0114, - "step": 2853 - }, - { - "epoch": 0.88, - "learning_rate": 0.0002448785241374755, - "loss": 4.6392, - "step": 2854 - }, - { - "epoch": 0.88, - "learning_rate": 0.0002448428950782383, - "loss": 4.422, - "step": 2855 - }, - { - "epoch": 0.88, - "learning_rate": 0.0002448072660190011, - "loss": 4.3948, - "step": 2856 - }, - { - "epoch": 0.88, - "learning_rate": 0.00024477163695976387, - "loss": 4.1018, - "step": 2857 - }, - { - "epoch": 0.88, - "learning_rate": 0.00024473600790052665, - "loss": 4.2018, - "step": 2858 - }, - { - "epoch": 0.88, - "learning_rate": 0.0002447003788412894, - "loss": 4.2994, - "step": 2859 - }, - { - "epoch": 0.88, - "learning_rate": 0.00024466474978205216, - "loss": 4.0564, - "step": 2860 - }, - { - "epoch": 0.88, - "learning_rate": 0.00024462912072281494, - "loss": 4.2178, - "step": 2861 - }, - { - "epoch": 0.88, - "learning_rate": 0.0002445934916635777, - "loss": 4.033, - "step": 2862 - }, - { - "epoch": 0.88, - "learning_rate": 0.0002445578626043405, - "loss": 4.2237, - "step": 2863 - }, - { - "epoch": 0.88, - "learning_rate": 0.00024452223354510323, - "loss": 4.0525, - "step": 2864 - }, - { - "epoch": 0.88, - "learning_rate": 0.000244486604485866, - "loss": 4.025, - "step": 2865 - }, - { - "epoch": 0.88, - "learning_rate": 0.0002444509754266288, - "loss": 4.192, - "step": 2866 - }, - { - "epoch": 0.88, - "learning_rate": 0.0002444153463673916, - "loss": 4.2727, - "step": 2867 - }, - { - "epoch": 0.89, - "learning_rate": 0.0002443797173081543, - "loss": 4.108, - "step": 2868 - }, - { - "epoch": 0.89, - "learning_rate": 0.0002443440882489171, - "loss": 4.0206, - "step": 2869 - }, - { - "epoch": 0.89, - "learning_rate": 0.00024430845918967987, - "loss": 3.9575, - "step": 2870 - }, - { - "epoch": 0.89, - "learning_rate": 0.00024427283013044265, - "loss": 3.7781, - "step": 2871 - }, - { - "epoch": 0.89, - "learning_rate": 0.00024423720107120543, - "loss": 3.9284, - "step": 2872 - }, - { - "epoch": 0.89, - "learning_rate": 0.00024420157201196816, - "loss": 3.9372, - "step": 2873 - }, - { - "epoch": 0.89, - "learning_rate": 0.00024416594295273094, - "loss": 4.002, - "step": 2874 - }, - { - "epoch": 0.89, - "learning_rate": 0.00024413031389349373, - "loss": 4.0255, - "step": 2875 - }, - { - "epoch": 0.89, - "learning_rate": 0.00024409468483425648, - "loss": 3.8838, - "step": 2876 - }, - { - "epoch": 0.89, - "learning_rate": 0.00024405905577501926, - "loss": 3.9523, - "step": 2877 - }, - { - "epoch": 0.89, - "learning_rate": 0.00024402342671578202, - "loss": 4.0318, - "step": 2878 - }, - { - "epoch": 0.89, - "learning_rate": 0.00024398779765654483, - "loss": 3.7941, - "step": 2879 - }, - { - "epoch": 0.89, - "learning_rate": 0.00024395216859730758, - "loss": 3.8414, - "step": 2880 - }, - { - "epoch": 0.89, - "learning_rate": 0.00024391653953807034, - "loss": 3.7683, - "step": 2881 - }, - { - "epoch": 0.89, - "learning_rate": 0.00024388091047883312, - "loss": 3.6342, - "step": 2882 - }, - { - "epoch": 0.89, - "learning_rate": 0.0002438452814195959, - "loss": 3.7049, - "step": 2883 - }, - { - "epoch": 0.89, - "learning_rate": 0.00024380965236035868, - "loss": 4.0264, - "step": 2884 - }, - { - "epoch": 0.89, - "learning_rate": 0.00024377402330112144, - "loss": 3.8285, - "step": 2885 - }, - { - "epoch": 0.89, - "learning_rate": 0.0002437383942418842, - "loss": 3.6939, - "step": 2886 - }, - { - 
"epoch": 0.89, - "learning_rate": 0.000243702765182647, - "loss": 3.4416, - "step": 2887 - }, - { - "epoch": 0.89, - "learning_rate": 0.00024366713612340975, - "loss": 3.5991, - "step": 2888 - }, - { - "epoch": 0.89, - "learning_rate": 0.0002436315070641725, - "loss": 3.426, - "step": 2889 - }, - { - "epoch": 0.89, - "learning_rate": 0.0002435958780049353, - "loss": 3.3307, - "step": 2890 - }, - { - "epoch": 0.89, - "learning_rate": 0.00024356024894569805, - "loss": 3.3516, - "step": 2891 - }, - { - "epoch": 0.89, - "learning_rate": 0.00024352461988646086, - "loss": 3.2901, - "step": 2892 - }, - { - "epoch": 0.89, - "learning_rate": 0.0002434889908272236, - "loss": 3.1694, - "step": 2893 - }, - { - "epoch": 0.89, - "learning_rate": 0.00024345336176798636, - "loss": 3.0758, - "step": 2894 - }, - { - "epoch": 0.89, - "learning_rate": 0.00024341773270874915, - "loss": 3.2264, - "step": 2895 - }, - { - "epoch": 0.89, - "learning_rate": 0.0002433821036495119, - "loss": 2.8788, - "step": 2896 - }, - { - "epoch": 0.89, - "learning_rate": 0.0002433464745902747, - "loss": 2.651, - "step": 2897 - }, - { - "epoch": 0.89, - "learning_rate": 0.00024331084553103747, - "loss": 2.6848, - "step": 2898 - }, - { - "epoch": 0.89, - "learning_rate": 0.00024327521647180022, - "loss": 2.4058, - "step": 2899 - }, - { - "epoch": 0.89, - "learning_rate": 0.000243239587412563, - "loss": 2.3926, - "step": 2900 - }, - { - "epoch": 0.9, - "learning_rate": 0.00024320395835332576, - "loss": 5.1485, - "step": 2901 - }, - { - "epoch": 0.9, - "learning_rate": 0.00024316832929408854, - "loss": 4.9246, - "step": 2902 - }, - { - "epoch": 0.9, - "learning_rate": 0.00024313270023485132, - "loss": 4.928, - "step": 2903 - }, - { - "epoch": 0.9, - "learning_rate": 0.00024309707117561408, - "loss": 4.4354, - "step": 2904 - }, - { - "epoch": 0.9, - "learning_rate": 0.00024306144211637686, - "loss": 4.2858, - "step": 2905 - }, - { - "epoch": 0.9, - "learning_rate": 0.00024302581305713964, - "loss": 4.5205, - "step": 2906 - }, - { - "epoch": 0.9, - "learning_rate": 0.0002429901839979024, - "loss": 4.2951, - "step": 2907 - }, - { - "epoch": 0.9, - "learning_rate": 0.00024295455493866518, - "loss": 4.4189, - "step": 2908 - }, - { - "epoch": 0.9, - "learning_rate": 0.00024291892587942793, - "loss": 4.3068, - "step": 2909 - }, - { - "epoch": 0.9, - "learning_rate": 0.00024288329682019068, - "loss": 4.3974, - "step": 2910 - }, - { - "epoch": 0.9, - "learning_rate": 0.0002428476677609535, - "loss": 4.1441, - "step": 2911 - }, - { - "epoch": 0.9, - "learning_rate": 0.00024281203870171625, - "loss": 4.2917, - "step": 2912 - }, - { - "epoch": 0.9, - "learning_rate": 0.00024277640964247903, - "loss": 4.1983, - "step": 2913 - }, - { - "epoch": 0.9, - "learning_rate": 0.00024274078058324179, - "loss": 3.9089, - "step": 2914 - }, - { - "epoch": 0.9, - "learning_rate": 0.00024270515152400454, - "loss": 4.1418, - "step": 2915 - }, - { - "epoch": 0.9, - "learning_rate": 0.00024266952246476735, - "loss": 4.0739, - "step": 2916 - }, - { - "epoch": 0.9, - "learning_rate": 0.0002426338934055301, - "loss": 4.0071, - "step": 2917 - }, - { - "epoch": 0.9, - "learning_rate": 0.00024259826434629289, - "loss": 4.0607, - "step": 2918 - }, - { - "epoch": 0.9, - "learning_rate": 0.00024256263528705564, - "loss": 4.0089, - "step": 2919 - }, - { - "epoch": 0.9, - "learning_rate": 0.0002425270062278184, - "loss": 4.0633, - "step": 2920 - }, - { - "epoch": 0.9, - "learning_rate": 0.0002424913771685812, - "loss": 3.916, - "step": 2921 - }, - { - "epoch": 0.9, - 
"learning_rate": 0.00024245574810934396, - "loss": 4.0718, - "step": 2922 - }, - { - "epoch": 0.9, - "learning_rate": 0.00024242011905010671, - "loss": 3.8374, - "step": 2923 - }, - { - "epoch": 0.9, - "learning_rate": 0.0002423844899908695, - "loss": 3.9778, - "step": 2924 - }, - { - "epoch": 0.9, - "learning_rate": 0.00024234886093163225, - "loss": 3.9684, - "step": 2925 - }, - { - "epoch": 0.9, - "learning_rate": 0.00024231323187239506, - "loss": 3.8807, - "step": 2926 - }, - { - "epoch": 0.9, - "learning_rate": 0.00024227760281315781, - "loss": 4.0067, - "step": 2927 - }, - { - "epoch": 0.9, - "learning_rate": 0.00024224197375392057, - "loss": 3.9332, - "step": 2928 - }, - { - "epoch": 0.9, - "learning_rate": 0.00024220634469468335, - "loss": 3.8465, - "step": 2929 - }, - { - "epoch": 0.9, - "learning_rate": 0.00024217071563544613, - "loss": 3.9522, - "step": 2930 - }, - { - "epoch": 0.9, - "learning_rate": 0.00024213508657620891, - "loss": 3.6859, - "step": 2931 - }, - { - "epoch": 0.9, - "learning_rate": 0.00024209945751697167, - "loss": 3.9702, - "step": 2932 - }, - { - "epoch": 0.91, - "learning_rate": 0.00024206382845773442, - "loss": 3.6527, - "step": 2933 - }, - { - "epoch": 0.91, - "learning_rate": 0.00024202819939849723, - "loss": 3.4784, - "step": 2934 - }, - { - "epoch": 0.91, - "learning_rate": 0.00024199257033926, - "loss": 3.6105, - "step": 2935 - }, - { - "epoch": 0.91, - "learning_rate": 0.00024195694128002274, - "loss": 3.6, - "step": 2936 - }, - { - "epoch": 0.91, - "learning_rate": 0.00024192131222078552, - "loss": 3.4741, - "step": 2937 - }, - { - "epoch": 0.91, - "learning_rate": 0.00024188568316154828, - "loss": 3.4962, - "step": 2938 - }, - { - "epoch": 0.91, - "learning_rate": 0.0002418500541023111, - "loss": 3.2246, - "step": 2939 - }, - { - "epoch": 0.91, - "learning_rate": 0.00024181442504307384, - "loss": 3.5243, - "step": 2940 - }, - { - "epoch": 0.91, - "learning_rate": 0.0002417787959838366, - "loss": 3.4914, - "step": 2941 - }, - { - "epoch": 0.91, - "learning_rate": 0.00024174316692459938, - "loss": 3.217, - "step": 2942 - }, - { - "epoch": 0.91, - "learning_rate": 0.00024170753786536213, - "loss": 3.3969, - "step": 2943 - }, - { - "epoch": 0.91, - "learning_rate": 0.00024167190880612494, - "loss": 3.1569, - "step": 2944 - }, - { - "epoch": 0.91, - "learning_rate": 0.0002416362797468877, - "loss": 3.0429, - "step": 2945 - }, - { - "epoch": 0.91, - "learning_rate": 0.00024160065068765045, - "loss": 2.9023, - "step": 2946 - }, - { - "epoch": 0.91, - "learning_rate": 0.00024156502162841323, - "loss": 2.8648, - "step": 2947 - }, - { - "epoch": 0.91, - "learning_rate": 0.000241529392569176, - "loss": 2.6455, - "step": 2948 - }, - { - "epoch": 0.91, - "learning_rate": 0.00024149376350993877, - "loss": 2.4866, - "step": 2949 - }, - { - "epoch": 0.91, - "learning_rate": 0.00024145813445070155, - "loss": 2.3031, - "step": 2950 - }, - { - "epoch": 0.91, - "learning_rate": 0.0002414225053914643, - "loss": 5.1586, - "step": 2951 - }, - { - "epoch": 0.91, - "learning_rate": 0.0002413868763322271, - "loss": 4.8623, - "step": 2952 - }, - { - "epoch": 0.91, - "learning_rate": 0.00024135124727298987, - "loss": 4.5682, - "step": 2953 - }, - { - "epoch": 0.91, - "learning_rate": 0.00024131561821375263, - "loss": 4.5075, - "step": 2954 - }, - { - "epoch": 0.91, - "learning_rate": 0.0002412799891545154, - "loss": 4.2716, - "step": 2955 - }, - { - "epoch": 0.91, - "learning_rate": 0.00024124436009527816, - "loss": 4.3472, - "step": 2956 - }, - { - "epoch": 0.91, - 
"learning_rate": 0.00024120873103604092, - "loss": 4.2731, - "step": 2957 - }, - { - "epoch": 0.91, - "learning_rate": 0.00024117310197680373, - "loss": 4.435, - "step": 2958 - }, - { - "epoch": 0.91, - "learning_rate": 0.00024113747291756648, - "loss": 4.1926, - "step": 2959 - }, - { - "epoch": 0.91, - "learning_rate": 0.00024110184385832926, - "loss": 4.2232, - "step": 2960 - }, - { - "epoch": 0.91, - "learning_rate": 0.00024106621479909202, - "loss": 4.2754, - "step": 2961 - }, - { - "epoch": 0.91, - "learning_rate": 0.00024103058573985477, - "loss": 4.2501, - "step": 2962 - }, - { - "epoch": 0.91, - "learning_rate": 0.00024099495668061758, - "loss": 3.9383, - "step": 2963 - }, - { - "epoch": 0.91, - "learning_rate": 0.00024095932762138034, - "loss": 3.8235, - "step": 2964 - }, - { - "epoch": 0.92, - "learning_rate": 0.00024092369856214312, - "loss": 4.0403, - "step": 2965 - }, - { - "epoch": 0.92, - "learning_rate": 0.00024088806950290587, - "loss": 4.1881, - "step": 2966 - }, - { - "epoch": 0.92, - "learning_rate": 0.00024085244044366863, - "loss": 3.8996, - "step": 2967 - }, - { - "epoch": 0.92, - "learning_rate": 0.00024081681138443144, - "loss": 4.2, - "step": 2968 - }, - { - "epoch": 0.92, - "learning_rate": 0.0002407811823251942, - "loss": 4.1397, - "step": 2969 - }, - { - "epoch": 0.92, - "learning_rate": 0.00024074555326595695, - "loss": 4.059, - "step": 2970 - }, - { - "epoch": 0.92, - "learning_rate": 0.00024070992420671973, - "loss": 4.17, - "step": 2971 - }, - { - "epoch": 0.92, - "learning_rate": 0.00024067429514748248, - "loss": 4.1393, - "step": 2972 - }, - { - "epoch": 0.92, - "learning_rate": 0.0002406386660882453, - "loss": 4.0688, - "step": 2973 - }, - { - "epoch": 0.92, - "learning_rate": 0.00024060303702900805, - "loss": 3.9898, - "step": 2974 - }, - { - "epoch": 0.92, - "learning_rate": 0.0002405674079697708, - "loss": 3.8472, - "step": 2975 - }, - { - "epoch": 0.92, - "learning_rate": 0.00024053177891053358, - "loss": 3.8999, - "step": 2976 - }, - { - "epoch": 0.92, - "learning_rate": 0.00024049614985129637, - "loss": 3.7652, - "step": 2977 - }, - { - "epoch": 0.92, - "learning_rate": 0.00024046052079205915, - "loss": 3.9337, - "step": 2978 - }, - { - "epoch": 0.92, - "learning_rate": 0.0002404248917328219, - "loss": 3.7308, - "step": 2979 - }, - { - "epoch": 0.92, - "learning_rate": 0.00024038926267358466, - "loss": 3.5359, - "step": 2980 - }, - { - "epoch": 0.92, - "learning_rate": 0.00024035363361434747, - "loss": 3.5994, - "step": 2981 - }, - { - "epoch": 0.92, - "learning_rate": 0.00024031800455511022, - "loss": 3.77, - "step": 2982 - }, - { - "epoch": 0.92, - "learning_rate": 0.00024028237549587298, - "loss": 3.5036, - "step": 2983 - }, - { - "epoch": 0.92, - "learning_rate": 0.00024024674643663576, - "loss": 3.8905, - "step": 2984 - }, - { - "epoch": 0.92, - "learning_rate": 0.0002402111173773985, - "loss": 3.5129, - "step": 2985 - }, - { - "epoch": 0.92, - "learning_rate": 0.00024017548831816132, - "loss": 3.6096, - "step": 2986 - }, - { - "epoch": 0.92, - "learning_rate": 0.00024013985925892408, - "loss": 3.4196, - "step": 2987 - }, - { - "epoch": 0.92, - "learning_rate": 0.00024010423019968683, - "loss": 3.5883, - "step": 2988 - }, - { - "epoch": 0.92, - "learning_rate": 0.0002400686011404496, - "loss": 3.3895, - "step": 2989 - }, - { - "epoch": 0.92, - "learning_rate": 0.00024003297208121237, - "loss": 3.1491, - "step": 2990 - }, - { - "epoch": 0.92, - "learning_rate": 0.00023999734302197512, - "loss": 3.0395, - "step": 2991 - }, - { - "epoch": 0.92, - 
"learning_rate": 0.00023996171396273793, - "loss": 3.311, - "step": 2992 - }, - { - "epoch": 0.92, - "learning_rate": 0.00023992608490350069, - "loss": 3.1326, - "step": 2993 - }, - { - "epoch": 0.92, - "learning_rate": 0.00023989045584426347, - "loss": 3.2458, - "step": 2994 - }, - { - "epoch": 0.92, - "learning_rate": 0.00023985482678502622, - "loss": 3.1501, - "step": 2995 - }, - { - "epoch": 0.92, - "learning_rate": 0.000239819197725789, - "loss": 3.0105, - "step": 2996 - }, - { - "epoch": 0.92, - "learning_rate": 0.0002397835686665518, - "loss": 2.6193, - "step": 2997 - }, - { - "epoch": 0.93, - "learning_rate": 0.00023974793960731454, - "loss": 2.7533, - "step": 2998 - }, - { - "epoch": 0.93, - "learning_rate": 0.00023971231054807732, - "loss": 2.8436, - "step": 2999 - }, - { - "epoch": 0.93, - "learning_rate": 0.0002396766814888401, - "loss": 2.5174, - "step": 3000 - }, - { - "epoch": 0.93, - "eval_bleu": 0.0, - "eval_loss": 4.649660110473633, - "eval_runtime": 2573.2938, - "eval_samples_per_second": 5.736, - "eval_steps_per_second": 0.717, - "step": 3000 - }, - { - "epoch": 0.93, - "learning_rate": 0.00023964105242960286, - "loss": 5.1895, - "step": 3001 - }, - { - "epoch": 0.93, - "learning_rate": 0.00023960542337036564, - "loss": 5.0417, - "step": 3002 - }, - { - "epoch": 0.93, - "learning_rate": 0.0002395697943111284, - "loss": 4.9672, - "step": 3003 - }, - { - "epoch": 0.93, - "learning_rate": 0.00023953416525189115, - "loss": 4.8772, - "step": 3004 - }, - { - "epoch": 0.93, - "learning_rate": 0.00023949853619265396, - "loss": 4.5442, - "step": 3005 - }, - { - "epoch": 0.93, - "learning_rate": 0.00023946290713341671, - "loss": 4.2894, - "step": 3006 - }, - { - "epoch": 0.93, - "learning_rate": 0.0002394272780741795, - "loss": 4.2901, - "step": 3007 - }, - { - "epoch": 0.93, - "learning_rate": 0.00023939164901494225, - "loss": 4.3114, - "step": 3008 - }, - { - "epoch": 0.93, - "learning_rate": 0.000239356019955705, - "loss": 4.3435, - "step": 3009 - }, - { - "epoch": 0.93, - "learning_rate": 0.00023932039089646782, - "loss": 4.1441, - "step": 3010 - }, - { - "epoch": 0.93, - "learning_rate": 0.00023928476183723057, - "loss": 4.2567, - "step": 3011 - }, - { - "epoch": 0.93, - "learning_rate": 0.00023924913277799335, - "loss": 4.0212, - "step": 3012 - }, - { - "epoch": 0.93, - "learning_rate": 0.0002392135037187561, - "loss": 4.1907, - "step": 3013 - }, - { - "epoch": 0.93, - "learning_rate": 0.00023917787465951886, - "loss": 4.0466, - "step": 3014 - }, - { - "epoch": 0.93, - "learning_rate": 0.00023914224560028167, - "loss": 3.9375, - "step": 3015 - }, - { - "epoch": 0.93, - "learning_rate": 0.00023910661654104443, - "loss": 4.3079, - "step": 3016 - }, - { - "epoch": 0.93, - "learning_rate": 0.00023907098748180718, - "loss": 3.9988, - "step": 3017 - }, - { - "epoch": 0.93, - "learning_rate": 0.00023903535842256996, - "loss": 4.2982, - "step": 3018 - }, - { - "epoch": 0.93, - "learning_rate": 0.00023899972936333272, - "loss": 4.203, - "step": 3019 - }, - { - "epoch": 0.93, - "learning_rate": 0.00023896410030409553, - "loss": 3.9713, - "step": 3020 - }, - { - "epoch": 0.93, - "learning_rate": 0.00023892847124485828, - "loss": 3.9173, - "step": 3021 - }, - { - "epoch": 0.93, - "learning_rate": 0.00023889284218562104, - "loss": 4.0286, - "step": 3022 - }, - { - "epoch": 0.93, - "learning_rate": 0.00023885721312638382, - "loss": 3.9564, - "step": 3023 - }, - { - "epoch": 0.93, - "learning_rate": 0.0002388215840671466, - "loss": 4.1124, - "step": 3024 - }, - { - "epoch": 0.93, - 
"learning_rate": 0.00023878595500790938, - "loss": 3.9557, - "step": 3025 - }, - { - "epoch": 0.93, - "learning_rate": 0.00023875032594867214, - "loss": 4.0339, - "step": 3026 - }, - { - "epoch": 0.93, - "learning_rate": 0.0002387146968894349, - "loss": 3.8846, - "step": 3027 - }, - { - "epoch": 0.93, - "learning_rate": 0.0002386790678301977, - "loss": 3.9659, - "step": 3028 - }, - { - "epoch": 0.93, - "learning_rate": 0.00023864343877096045, - "loss": 4.0218, - "step": 3029 - }, - { - "epoch": 0.94, - "learning_rate": 0.0002386078097117232, - "loss": 3.7102, - "step": 3030 - }, - { - "epoch": 0.94, - "learning_rate": 0.000238572180652486, - "loss": 3.7506, - "step": 3031 - }, - { - "epoch": 0.94, - "learning_rate": 0.00023853655159324875, - "loss": 3.8144, - "step": 3032 - }, - { - "epoch": 0.94, - "learning_rate": 0.00023850092253401155, - "loss": 3.4837, - "step": 3033 - }, - { - "epoch": 0.94, - "learning_rate": 0.0002384652934747743, - "loss": 3.513, - "step": 3034 - }, - { - "epoch": 0.94, - "learning_rate": 0.00023842966441553706, - "loss": 3.5956, - "step": 3035 - }, - { - "epoch": 0.94, - "learning_rate": 0.00023839403535629985, - "loss": 3.565, - "step": 3036 - }, - { - "epoch": 0.94, - "learning_rate": 0.0002383584062970626, - "loss": 3.7052, - "step": 3037 - }, - { - "epoch": 0.94, - "learning_rate": 0.00023832277723782536, - "loss": 3.7947, - "step": 3038 - }, - { - "epoch": 0.94, - "learning_rate": 0.00023828714817858816, - "loss": 3.261, - "step": 3039 - }, - { - "epoch": 0.94, - "learning_rate": 0.00023825151911935092, - "loss": 3.2484, - "step": 3040 - }, - { - "epoch": 0.94, - "learning_rate": 0.0002382158900601137, - "loss": 3.5019, - "step": 3041 - }, - { - "epoch": 0.94, - "learning_rate": 0.00023818026100087646, - "loss": 3.2125, - "step": 3042 - }, - { - "epoch": 0.94, - "learning_rate": 0.00023814463194163924, - "loss": 3.0068, - "step": 3043 - }, - { - "epoch": 0.94, - "learning_rate": 0.00023810900288240202, - "loss": 2.9614, - "step": 3044 - }, - { - "epoch": 0.94, - "learning_rate": 0.00023807337382316477, - "loss": 3.0928, - "step": 3045 - }, - { - "epoch": 0.94, - "learning_rate": 0.00023803774476392756, - "loss": 2.995, - "step": 3046 - }, - { - "epoch": 0.94, - "learning_rate": 0.00023800211570469034, - "loss": 2.7878, - "step": 3047 - }, - { - "epoch": 0.94, - "learning_rate": 0.0002379664866454531, - "loss": 2.6381, - "step": 3048 - }, - { - "epoch": 0.94, - "learning_rate": 0.00023793085758621587, - "loss": 2.4016, - "step": 3049 - }, - { - "epoch": 0.94, - "learning_rate": 0.00023789522852697863, - "loss": 2.4339, - "step": 3050 - }, - { - "epoch": 0.94, - "learning_rate": 0.00023785959946774138, - "loss": 5.1597, - "step": 3051 - }, - { - "epoch": 0.94, - "learning_rate": 0.0002378239704085042, - "loss": 5.0598, - "step": 3052 - }, - { - "epoch": 0.94, - "learning_rate": 0.00023778834134926695, - "loss": 4.6554, - "step": 3053 - }, - { - "epoch": 0.94, - "learning_rate": 0.00023775271229002973, - "loss": 4.4459, - "step": 3054 - }, - { - "epoch": 0.94, - "learning_rate": 0.00023771708323079248, - "loss": 4.3191, - "step": 3055 - }, - { - "epoch": 0.94, - "learning_rate": 0.00023768145417155524, - "loss": 4.2706, - "step": 3056 - }, - { - "epoch": 0.94, - "learning_rate": 0.00023764582511231805, - "loss": 4.4143, - "step": 3057 - }, - { - "epoch": 0.94, - "learning_rate": 0.0002376101960530808, - "loss": 4.5541, - "step": 3058 - }, - { - "epoch": 0.94, - "learning_rate": 0.00023757456699384359, - "loss": 4.1362, - "step": 3059 - }, - { - "epoch": 0.94, - 
"learning_rate": 0.00023753893793460634, - "loss": 4.4401, - "step": 3060 - }, - { - "epoch": 0.94, - "learning_rate": 0.0002375033088753691, - "loss": 4.2569, - "step": 3061 - }, - { - "epoch": 0.94, - "learning_rate": 0.0002374676798161319, - "loss": 4.2379, - "step": 3062 - }, - { - "epoch": 0.95, - "learning_rate": 0.00023743205075689466, - "loss": 4.1407, - "step": 3063 - }, - { - "epoch": 0.95, - "learning_rate": 0.0002373964216976574, - "loss": 4.0396, - "step": 3064 - }, - { - "epoch": 0.95, - "learning_rate": 0.0002373607926384202, - "loss": 3.9111, - "step": 3065 - }, - { - "epoch": 0.95, - "learning_rate": 0.00023732516357918295, - "loss": 3.85, - "step": 3066 - }, - { - "epoch": 0.95, - "learning_rate": 0.00023728953451994576, - "loss": 3.9206, - "step": 3067 - }, - { - "epoch": 0.95, - "learning_rate": 0.00023725390546070851, - "loss": 4.0397, - "step": 3068 - }, - { - "epoch": 0.95, - "learning_rate": 0.00023721827640147127, - "loss": 3.9172, - "step": 3069 - }, - { - "epoch": 0.95, - "learning_rate": 0.00023718264734223405, - "loss": 4.1089, - "step": 3070 - }, - { - "epoch": 0.95, - "learning_rate": 0.00023714701828299683, - "loss": 4.2583, - "step": 3071 - }, - { - "epoch": 0.95, - "learning_rate": 0.0002371113892237596, - "loss": 4.0153, - "step": 3072 - }, - { - "epoch": 0.95, - "learning_rate": 0.00023707576016452237, - "loss": 3.9331, - "step": 3073 - }, - { - "epoch": 0.95, - "learning_rate": 0.00023704013110528512, - "loss": 3.9203, - "step": 3074 - }, - { - "epoch": 0.95, - "learning_rate": 0.00023700450204604793, - "loss": 4.0776, - "step": 3075 - }, - { - "epoch": 0.95, - "learning_rate": 0.0002369688729868107, - "loss": 3.8417, - "step": 3076 - }, - { - "epoch": 0.95, - "learning_rate": 0.00023693324392757344, - "loss": 3.9044, - "step": 3077 - }, - { - "epoch": 0.95, - "learning_rate": 0.00023689761486833622, - "loss": 3.8752, - "step": 3078 - }, - { - "epoch": 0.95, - "learning_rate": 0.00023686198580909898, - "loss": 3.7261, - "step": 3079 - }, - { - "epoch": 0.95, - "learning_rate": 0.0002368263567498618, - "loss": 3.9201, - "step": 3080 - }, - { - "epoch": 0.95, - "learning_rate": 0.00023679072769062454, - "loss": 3.8859, - "step": 3081 - }, - { - "epoch": 0.95, - "learning_rate": 0.0002367550986313873, - "loss": 3.8958, - "step": 3082 - }, - { - "epoch": 0.95, - "learning_rate": 0.00023671946957215008, - "loss": 3.8624, - "step": 3083 - }, - { - "epoch": 0.95, - "learning_rate": 0.00023668384051291283, - "loss": 3.7008, - "step": 3084 - }, - { - "epoch": 0.95, - "learning_rate": 0.0002366482114536756, - "loss": 3.4738, - "step": 3085 - }, - { - "epoch": 0.95, - "learning_rate": 0.0002366125823944384, - "loss": 3.6188, - "step": 3086 - }, - { - "epoch": 0.95, - "learning_rate": 0.00023657695333520115, - "loss": 3.5793, - "step": 3087 - }, - { - "epoch": 0.95, - "learning_rate": 0.00023654132427596393, - "loss": 3.4939, - "step": 3088 - }, - { - "epoch": 0.95, - "learning_rate": 0.0002365056952167267, - "loss": 3.1608, - "step": 3089 - }, - { - "epoch": 0.95, - "learning_rate": 0.00023647006615748947, - "loss": 3.2663, - "step": 3090 - }, - { - "epoch": 0.95, - "learning_rate": 0.00023643443709825225, - "loss": 3.3071, - "step": 3091 - }, - { - "epoch": 0.95, - "learning_rate": 0.000236398808039015, - "loss": 3.2175, - "step": 3092 - }, - { - "epoch": 0.95, - "learning_rate": 0.0002363631789797778, - "loss": 3.1249, - "step": 3093 - }, - { - "epoch": 0.95, - "learning_rate": 0.00023632754992054057, - "loss": 3.1137, - "step": 3094 - }, - { - "epoch": 0.96, - 
"learning_rate": 0.00023629192086130333, - "loss": 3.0907, - "step": 3095 - }, - { - "epoch": 0.96, - "learning_rate": 0.0002362562918020661, - "loss": 3.0262, - "step": 3096 - }, - { - "epoch": 0.96, - "learning_rate": 0.00023622066274282886, - "loss": 2.6885, - "step": 3097 - }, - { - "epoch": 0.96, - "learning_rate": 0.00023618503368359162, - "loss": 2.6656, - "step": 3098 - }, - { - "epoch": 0.96, - "learning_rate": 0.00023614940462435443, - "loss": 2.5574, - "step": 3099 - }, - { - "epoch": 0.96, - "learning_rate": 0.00023611377556511718, - "loss": 2.3727, - "step": 3100 - }, - { - "epoch": 0.96, - "learning_rate": 0.00023607814650587996, - "loss": 5.2261, - "step": 3101 - }, - { - "epoch": 0.96, - "learning_rate": 0.00023604251744664272, - "loss": 4.9689, - "step": 3102 - }, - { - "epoch": 0.96, - "learning_rate": 0.00023600688838740547, - "loss": 4.798, - "step": 3103 - }, - { - "epoch": 0.96, - "learning_rate": 0.00023597125932816828, - "loss": 4.779, - "step": 3104 - }, - { - "epoch": 0.96, - "learning_rate": 0.00023593563026893104, - "loss": 4.7496, - "step": 3105 - }, - { - "epoch": 0.96, - "learning_rate": 0.0002359000012096938, - "loss": 4.5507, - "step": 3106 - }, - { - "epoch": 0.96, - "learning_rate": 0.00023586437215045657, - "loss": 4.372, - "step": 3107 - }, - { - "epoch": 0.96, - "learning_rate": 0.00023582874309121933, - "loss": 4.1717, - "step": 3108 - }, - { - "epoch": 0.96, - "learning_rate": 0.00023579311403198214, - "loss": 4.2175, - "step": 3109 - }, - { - "epoch": 0.96, - "learning_rate": 0.0002357574849727449, - "loss": 3.9289, - "step": 3110 - }, - { - "epoch": 0.96, - "learning_rate": 0.00023572185591350765, - "loss": 4.1815, - "step": 3111 - }, - { - "epoch": 0.96, - "learning_rate": 0.00023568622685427043, - "loss": 4.0538, - "step": 3112 - }, - { - "epoch": 0.96, - "learning_rate": 0.00023565059779503318, - "loss": 4.1981, - "step": 3113 - }, - { - "epoch": 0.96, - "learning_rate": 0.000235614968735796, - "loss": 4.1601, - "step": 3114 - }, - { - "epoch": 0.96, - "learning_rate": 0.00023557933967655875, - "loss": 3.969, - "step": 3115 - }, - { - "epoch": 0.96, - "learning_rate": 0.0002355437106173215, - "loss": 4.1585, - "step": 3116 - }, - { - "epoch": 0.96, - "learning_rate": 0.0002355080815580843, - "loss": 4.1056, - "step": 3117 - }, - { - "epoch": 0.96, - "learning_rate": 0.00023547245249884707, - "loss": 4.1191, - "step": 3118 - }, - { - "epoch": 0.96, - "learning_rate": 0.00023543682343960982, - "loss": 3.9015, - "step": 3119 - }, - { - "epoch": 0.96, - "learning_rate": 0.0002354011943803726, - "loss": 4.2587, - "step": 3120 - }, - { - "epoch": 0.96, - "learning_rate": 0.00023536556532113536, - "loss": 3.9025, - "step": 3121 - }, - { - "epoch": 0.96, - "learning_rate": 0.00023532993626189817, - "loss": 3.8839, - "step": 3122 - }, - { - "epoch": 0.96, - "learning_rate": 0.00023529430720266092, - "loss": 3.9939, - "step": 3123 - }, - { - "epoch": 0.96, - "learning_rate": 0.00023525867814342368, - "loss": 3.9057, - "step": 3124 - }, - { - "epoch": 0.96, - "learning_rate": 0.00023522304908418646, - "loss": 3.8479, - "step": 3125 - }, - { - "epoch": 0.96, - "learning_rate": 0.0002351874200249492, - "loss": 4.1093, - "step": 3126 - }, - { - "epoch": 0.97, - "learning_rate": 0.00023515179096571202, - "loss": 3.9502, - "step": 3127 - }, - { - "epoch": 0.97, - "learning_rate": 0.00023511616190647478, - "loss": 3.8979, - "step": 3128 - }, - { - "epoch": 0.97, - "learning_rate": 0.00023508053284723753, - "loss": 3.9489, - "step": 3129 - }, - { - "epoch": 0.97, 
- "learning_rate": 0.0002350449037880003, - "loss": 3.6262, - "step": 3130 - }, - { - "epoch": 0.97, - "learning_rate": 0.00023500927472876307, - "loss": 3.7309, - "step": 3131 - }, - { - "epoch": 0.97, - "learning_rate": 0.00023497364566952582, - "loss": 3.6783, - "step": 3132 - }, - { - "epoch": 0.97, - "learning_rate": 0.00023493801661028863, - "loss": 3.5578, - "step": 3133 - }, - { - "epoch": 0.97, - "learning_rate": 0.00023490238755105139, - "loss": 3.8483, - "step": 3134 - }, - { - "epoch": 0.97, - "learning_rate": 0.00023486675849181417, - "loss": 3.5235, - "step": 3135 - }, - { - "epoch": 0.97, - "learning_rate": 0.00023483112943257692, - "loss": 3.4825, - "step": 3136 - }, - { - "epoch": 0.97, - "learning_rate": 0.0002347955003733397, - "loss": 3.5824, - "step": 3137 - }, - { - "epoch": 0.97, - "learning_rate": 0.00023475987131410249, - "loss": 3.3131, - "step": 3138 - }, - { - "epoch": 0.97, - "learning_rate": 0.00023472424225486524, - "loss": 3.5068, - "step": 3139 - }, - { - "epoch": 0.97, - "learning_rate": 0.00023468861319562802, - "loss": 3.4612, - "step": 3140 - }, - { - "epoch": 0.97, - "learning_rate": 0.0002346529841363908, - "loss": 3.4827, - "step": 3141 - }, - { - "epoch": 0.97, - "learning_rate": 0.00023461735507715356, - "loss": 3.2186, - "step": 3142 - }, - { - "epoch": 0.97, - "learning_rate": 0.00023458172601791634, - "loss": 3.1641, - "step": 3143 - }, - { - "epoch": 0.97, - "learning_rate": 0.0002345460969586791, - "loss": 3.1822, - "step": 3144 - }, - { - "epoch": 0.97, - "learning_rate": 0.00023451046789944185, - "loss": 2.5353, - "step": 3145 - }, - { - "epoch": 0.97, - "learning_rate": 0.00023447483884020466, - "loss": 2.9143, - "step": 3146 - }, - { - "epoch": 0.97, - "learning_rate": 0.00023443920978096741, - "loss": 2.6803, - "step": 3147 - }, - { - "epoch": 0.97, - "learning_rate": 0.0002344035807217302, - "loss": 2.5106, - "step": 3148 - }, - { - "epoch": 0.97, - "learning_rate": 0.00023436795166249295, - "loss": 2.3812, - "step": 3149 - }, - { - "epoch": 0.97, - "learning_rate": 0.0002343323226032557, - "loss": 2.4964, - "step": 3150 - }, - { - "epoch": 0.97, - "learning_rate": 0.00023429669354401851, - "loss": 5.3176, - "step": 3151 - }, - { - "epoch": 0.97, - "learning_rate": 0.00023426106448478127, - "loss": 4.8833, - "step": 3152 - }, - { - "epoch": 0.97, - "learning_rate": 0.00023422543542554402, - "loss": 4.6634, - "step": 3153 - }, - { - "epoch": 0.97, - "learning_rate": 0.0002341898063663068, - "loss": 4.4617, - "step": 3154 - }, - { - "epoch": 0.97, - "learning_rate": 0.00023415417730706956, - "loss": 4.2766, - "step": 3155 - }, - { - "epoch": 0.97, - "learning_rate": 0.00023411854824783237, - "loss": 4.5698, - "step": 3156 - }, - { - "epoch": 0.97, - "learning_rate": 0.00023408291918859512, - "loss": 4.1772, - "step": 3157 - }, - { - "epoch": 0.97, - "learning_rate": 0.00023404729012935788, - "loss": 4.3714, - "step": 3158 - }, - { - "epoch": 0.97, - "learning_rate": 0.00023401166107012066, - "loss": 4.4489, - "step": 3159 - }, - { - "epoch": 0.98, - "learning_rate": 0.00023397603201088342, - "loss": 4.196, - "step": 3160 - }, - { - "epoch": 0.98, - "learning_rate": 0.00023394040295164623, - "loss": 4.5322, - "step": 3161 - }, - { - "epoch": 0.98, - "learning_rate": 0.00023390477389240898, - "loss": 4.2698, - "step": 3162 - }, - { - "epoch": 0.98, - "learning_rate": 0.00023386914483317173, - "loss": 4.2684, - "step": 3163 - }, - { - "epoch": 0.98, - "learning_rate": 0.00023383351577393454, - "loss": 4.0842, - "step": 3164 - }, - { - 
"epoch": 0.98, - "learning_rate": 0.0002337978867146973, - "loss": 4.2798, - "step": 3165 - }, - { - "epoch": 0.98, - "learning_rate": 0.00023376225765546005, - "loss": 3.998, - "step": 3166 - }, - { - "epoch": 0.98, - "learning_rate": 0.00023372662859622284, - "loss": 4.0314, - "step": 3167 - }, - { - "epoch": 0.98, - "learning_rate": 0.0002336909995369856, - "loss": 3.9714, - "step": 3168 - }, - { - "epoch": 0.98, - "learning_rate": 0.0002336553704777484, - "loss": 3.9638, - "step": 3169 - }, - { - "epoch": 0.98, - "learning_rate": 0.00023361974141851115, - "loss": 4.1141, - "step": 3170 - }, - { - "epoch": 0.98, - "learning_rate": 0.0002335841123592739, - "loss": 3.9792, - "step": 3171 - }, - { - "epoch": 0.98, - "learning_rate": 0.0002335484833000367, - "loss": 3.7714, - "step": 3172 - }, - { - "epoch": 0.98, - "learning_rate": 0.00023351285424079945, - "loss": 3.8314, - "step": 3173 - }, - { - "epoch": 0.98, - "learning_rate": 0.00023347722518156225, - "loss": 3.8603, - "step": 3174 - }, - { - "epoch": 0.98, - "learning_rate": 0.000233441596122325, - "loss": 3.8265, - "step": 3175 - }, - { - "epoch": 0.98, - "learning_rate": 0.00023340596706308776, - "loss": 3.8483, - "step": 3176 - }, - { - "epoch": 0.98, - "learning_rate": 0.00023337033800385055, - "loss": 3.7276, - "step": 3177 - }, - { - "epoch": 0.98, - "learning_rate": 0.0002333347089446133, - "loss": 3.8043, - "step": 3178 - }, - { - "epoch": 0.98, - "learning_rate": 0.00023329907988537605, - "loss": 3.6952, - "step": 3179 - }, - { - "epoch": 0.98, - "learning_rate": 0.00023326345082613886, - "loss": 3.4846, - "step": 3180 - }, - { - "epoch": 0.98, - "learning_rate": 0.00023322782176690162, - "loss": 3.553, - "step": 3181 - }, - { - "epoch": 0.98, - "learning_rate": 0.0002331921927076644, - "loss": 3.6315, - "step": 3182 - }, - { - "epoch": 0.98, - "learning_rate": 0.00023315656364842716, - "loss": 3.5619, - "step": 3183 - }, - { - "epoch": 0.98, - "learning_rate": 0.00023312093458918994, - "loss": 3.771, - "step": 3184 - }, - { - "epoch": 0.98, - "learning_rate": 0.00023308530552995272, - "loss": 3.3283, - "step": 3185 - }, - { - "epoch": 0.98, - "learning_rate": 0.00023304967647071547, - "loss": 3.6697, - "step": 3186 - }, - { - "epoch": 0.98, - "learning_rate": 0.00023301404741147823, - "loss": 3.4642, - "step": 3187 - }, - { - "epoch": 0.98, - "learning_rate": 0.00023297841835224104, - "loss": 3.4893, - "step": 3188 - }, - { - "epoch": 0.98, - "learning_rate": 0.0002329427892930038, - "loss": 3.3099, - "step": 3189 - }, - { - "epoch": 0.98, - "learning_rate": 0.00023290716023376657, - "loss": 3.2631, - "step": 3190 - }, - { - "epoch": 0.98, - "learning_rate": 0.00023287153117452933, - "loss": 3.2937, - "step": 3191 - }, - { - "epoch": 0.99, - "learning_rate": 0.00023283590211529208, - "loss": 3.3957, - "step": 3192 - }, - { - "epoch": 0.99, - "learning_rate": 0.0002328002730560549, - "loss": 3.3246, - "step": 3193 - }, - { - "epoch": 0.99, - "learning_rate": 0.00023276464399681765, - "loss": 3.1247, - "step": 3194 - }, - { - "epoch": 0.99, - "learning_rate": 0.00023272901493758043, - "loss": 3.1037, - "step": 3195 - }, - { - "epoch": 0.99, - "learning_rate": 0.00023269338587834318, - "loss": 2.5896, - "step": 3196 - }, - { - "epoch": 0.99, - "learning_rate": 0.00023265775681910594, - "loss": 2.7472, - "step": 3197 - }, - { - "epoch": 0.99, - "learning_rate": 0.00023262212775986875, - "loss": 2.6087, - "step": 3198 - }, - { - "epoch": 0.99, - "learning_rate": 0.0002325864987006315, - "loss": 2.4268, - "step": 3199 - }, - { - 
"epoch": 0.99, - "learning_rate": 0.00023255086964139426, - "loss": 2.5054, - "step": 3200 - }, - { - "epoch": 0.99, - "learning_rate": 0.00023251524058215704, - "loss": 5.2924, - "step": 3201 - }, - { - "epoch": 0.99, - "learning_rate": 0.0002324796115229198, - "loss": 5.0185, - "step": 3202 - }, - { - "epoch": 0.99, - "learning_rate": 0.0002324439824636826, - "loss": 4.5099, - "step": 3203 - }, - { - "epoch": 0.99, - "learning_rate": 0.00023240835340444536, - "loss": 4.4024, - "step": 3204 - }, - { - "epoch": 0.99, - "learning_rate": 0.0002323727243452081, - "loss": 4.2863, - "step": 3205 - }, - { - "epoch": 0.99, - "learning_rate": 0.0002323370952859709, - "loss": 4.4508, - "step": 3206 - }, - { - "epoch": 0.99, - "learning_rate": 0.00023230146622673368, - "loss": 4.2674, - "step": 3207 - }, - { - "epoch": 0.99, - "learning_rate": 0.00023226583716749646, - "loss": 4.3007, - "step": 3208 - }, - { - "epoch": 0.99, - "learning_rate": 0.0002322302081082592, - "loss": 4.2449, - "step": 3209 - }, - { - "epoch": 0.99, - "learning_rate": 0.00023219457904902197, - "loss": 4.1617, - "step": 3210 - }, - { - "epoch": 0.99, - "learning_rate": 0.00023215894998978478, - "loss": 4.285, - "step": 3211 - }, - { - "epoch": 0.99, - "learning_rate": 0.00023212332093054753, - "loss": 3.9879, - "step": 3212 - }, - { - "epoch": 0.99, - "learning_rate": 0.00023208769187131029, - "loss": 3.9902, - "step": 3213 - }, - { - "epoch": 0.99, - "learning_rate": 0.00023205206281207307, - "loss": 3.9019, - "step": 3214 - }, - { - "epoch": 0.99, - "learning_rate": 0.00023201643375283582, - "loss": 3.9673, - "step": 3215 - }, - { - "epoch": 0.99, - "learning_rate": 0.00023198080469359863, - "loss": 4.1152, - "step": 3216 - }, - { - "epoch": 0.99, - "learning_rate": 0.0002319451756343614, - "loss": 3.9981, - "step": 3217 - }, - { - "epoch": 0.99, - "learning_rate": 0.00023190954657512414, - "loss": 4.0401, - "step": 3218 - }, - { - "epoch": 0.99, - "learning_rate": 0.00023187391751588692, - "loss": 3.907, - "step": 3219 - }, - { - "epoch": 0.99, - "learning_rate": 0.00023183828845664968, - "loss": 3.9421, - "step": 3220 - }, - { - "epoch": 0.99, - "learning_rate": 0.0002318026593974125, - "loss": 3.6716, - "step": 3221 - }, - { - "epoch": 0.99, - "learning_rate": 0.00023176703033817524, - "loss": 3.973, - "step": 3222 - }, - { - "epoch": 0.99, - "learning_rate": 0.000231731401278938, - "loss": 4.0262, - "step": 3223 - }, - { - "epoch": 0.99, - "learning_rate": 0.00023169577221970078, - "loss": 3.8582, - "step": 3224 - }, - { - "epoch": 1.0, - "learning_rate": 0.00023166014316046353, - "loss": 3.7747, - "step": 3225 - }, - { - "epoch": 1.0, - "learning_rate": 0.0002316245141012263, - "loss": 3.6881, - "step": 3226 - }, - { - "epoch": 1.0, - "learning_rate": 0.0002315888850419891, - "loss": 3.6077, - "step": 3227 - }, - { - "epoch": 1.0, - "learning_rate": 0.00023155325598275185, - "loss": 3.6868, - "step": 3228 - }, - { - "epoch": 1.0, - "learning_rate": 0.00023151762692351463, - "loss": 3.5107, - "step": 3229 - }, - { - "epoch": 1.0, - "learning_rate": 0.0002314819978642774, - "loss": 3.5508, - "step": 3230 - }, - { - "epoch": 1.0, - "learning_rate": 0.00023144636880504017, - "loss": 3.4857, - "step": 3231 - }, - { - "epoch": 1.0, - "learning_rate": 0.00023141073974580295, - "loss": 3.3828, - "step": 3232 - }, - { - "epoch": 1.0, - "learning_rate": 0.0002313751106865657, - "loss": 3.2626, - "step": 3233 - }, - { - "epoch": 1.0, - "learning_rate": 0.00023133948162732846, - "loss": 3.1632, - "step": 3234 - }, - { - "epoch": 
1.0, - "learning_rate": 0.00023130385256809127, - "loss": 3.2648, - "step": 3235 - }, - { - "epoch": 1.0, - "learning_rate": 0.00023126822350885403, - "loss": 3.0365, - "step": 3236 - }, - { - "epoch": 1.0, - "learning_rate": 0.0002312325944496168, - "loss": 2.6895, - "step": 3237 - }, - { - "epoch": 1.0, - "learning_rate": 0.00023119696539037956, - "loss": 2.5704, - "step": 3238 - }, - { - "epoch": 1.0, - "learning_rate": 0.00023116133633114232, - "loss": 2.684, - "step": 3239 - }, - { - "epoch": 1.0, - "learning_rate": 0.00023112570727190513, - "loss": 2.3827, - "step": 3240 - }, - { - "epoch": 1.0, - "learning_rate": 0.00023109007821266788, - "loss": 5.4764, - "step": 3241 - }, - { - "epoch": 1.0, - "learning_rate": 0.00023105444915343066, - "loss": 4.8597, - "step": 3242 - }, - { - "epoch": 1.0, - "learning_rate": 0.00023101882009419342, - "loss": 4.5493, - "step": 3243 - }, - { - "epoch": 1.0, - "learning_rate": 0.00023098319103495617, - "loss": 4.6225, - "step": 3244 - }, - { - "epoch": 1.0, - "learning_rate": 0.00023094756197571898, - "loss": 4.2237, - "step": 3245 - }, - { - "epoch": 1.0, - "learning_rate": 0.00023091193291648174, - "loss": 4.1768, - "step": 3246 - }, - { - "epoch": 1.0, - "learning_rate": 0.0002308763038572445, - "loss": 3.7428, - "step": 3247 - }, - { - "epoch": 1.0, - "learning_rate": 0.00023084067479800727, - "loss": 4.2356, - "step": 3248 - }, - { - "epoch": 1.0, - "learning_rate": 0.00023080504573877003, - "loss": 3.9862, - "step": 3249 - }, - { - "epoch": 1.0, - "learning_rate": 0.00023076941667953284, - "loss": 3.7712, - "step": 3250 - }, - { - "epoch": 1.0, - "learning_rate": 0.0002307337876202956, - "loss": 3.8686, - "step": 3251 - }, - { - "epoch": 1.0, - "learning_rate": 0.00023069815856105835, - "loss": 3.8503, - "step": 3252 - }, - { - "epoch": 1.0, - "learning_rate": 0.00023066252950182113, - "loss": 3.9793, - "step": 3253 - }, - { - "epoch": 1.0, - "learning_rate": 0.0002306269004425839, - "loss": 3.7719, - "step": 3254 - }, - { - "epoch": 1.0, - "learning_rate": 0.0002305912713833467, - "loss": 3.7653, - "step": 3255 - }, - { - "epoch": 1.0, - "learning_rate": 0.00023055564232410945, - "loss": 3.7087, - "step": 3256 - }, - { - "epoch": 1.01, - "learning_rate": 0.0002305200132648722, - "loss": 3.7529, - "step": 3257 - }, - { - "epoch": 1.01, - "learning_rate": 0.000230484384205635, - "loss": 3.7963, - "step": 3258 - }, - { - "epoch": 1.01, - "learning_rate": 0.00023044875514639776, - "loss": 3.4675, - "step": 3259 - }, - { - "epoch": 1.01, - "learning_rate": 0.00023041312608716052, - "loss": 3.8207, - "step": 3260 - }, - { - "epoch": 1.01, - "learning_rate": 0.0002303774970279233, - "loss": 3.6712, - "step": 3261 - }, - { - "epoch": 1.01, - "learning_rate": 0.00023034186796868606, - "loss": 3.705, - "step": 3262 - }, - { - "epoch": 1.01, - "learning_rate": 0.00023030623890944887, - "loss": 3.7285, - "step": 3263 - }, - { - "epoch": 1.01, - "learning_rate": 0.00023027060985021162, - "loss": 3.8801, - "step": 3264 - }, - { - "epoch": 1.01, - "learning_rate": 0.00023023498079097437, - "loss": 3.5659, - "step": 3265 - }, - { - "epoch": 1.01, - "learning_rate": 0.00023019935173173716, - "loss": 3.8416, - "step": 3266 - }, - { - "epoch": 1.01, - "learning_rate": 0.0002301637226724999, - "loss": 3.5377, - "step": 3267 - }, - { - "epoch": 1.01, - "learning_rate": 0.00023012809361326267, - "loss": 3.5871, - "step": 3268 - }, - { - "epoch": 1.01, - "learning_rate": 0.00023009246455402548, - "loss": 3.7477, - "step": 3269 - }, - { - "epoch": 1.01, - 
"learning_rate": 0.00023005683549478823, - "loss": 3.7505, - "step": 3270 - }, - { - "epoch": 1.01, - "learning_rate": 0.000230021206435551, - "loss": 3.2975, - "step": 3271 - }, - { - "epoch": 1.01, - "learning_rate": 0.00022998557737631377, - "loss": 3.4778, - "step": 3272 - }, - { - "epoch": 1.01, - "learning_rate": 0.00022994994831707652, - "loss": 3.5677, - "step": 3273 - }, - { - "epoch": 1.01, - "learning_rate": 0.00022991431925783933, - "loss": 3.306, - "step": 3274 - }, - { - "epoch": 1.01, - "learning_rate": 0.00022987869019860208, - "loss": 3.3762, - "step": 3275 - }, - { - "epoch": 1.01, - "learning_rate": 0.00022984306113936487, - "loss": 3.231, - "step": 3276 - }, - { - "epoch": 1.01, - "learning_rate": 0.00022980743208012762, - "loss": 3.4824, - "step": 3277 - }, - { - "epoch": 1.01, - "learning_rate": 0.0002297718030208904, - "loss": 3.1722, - "step": 3278 - }, - { - "epoch": 1.01, - "learning_rate": 0.00022973617396165319, - "loss": 3.2316, - "step": 3279 - }, - { - "epoch": 1.01, - "learning_rate": 0.00022970054490241594, - "loss": 3.0759, - "step": 3280 - }, - { - "epoch": 1.01, - "learning_rate": 0.0002296649158431787, - "loss": 3.1859, - "step": 3281 - }, - { - "epoch": 1.01, - "learning_rate": 0.0002296292867839415, - "loss": 3.1602, - "step": 3282 - }, - { - "epoch": 1.01, - "learning_rate": 0.00022959365772470426, - "loss": 2.8832, - "step": 3283 - }, - { - "epoch": 1.01, - "learning_rate": 0.00022955802866546704, - "loss": 2.7656, - "step": 3284 - }, - { - "epoch": 1.01, - "learning_rate": 0.0002295223996062298, - "loss": 2.623, - "step": 3285 - }, - { - "epoch": 1.01, - "learning_rate": 0.00022948677054699255, - "loss": 2.897, - "step": 3286 - }, - { - "epoch": 1.01, - "learning_rate": 0.00022945114148775536, - "loss": 2.4938, - "step": 3287 - }, - { - "epoch": 1.01, - "learning_rate": 0.00022941551242851811, - "loss": 2.5806, - "step": 3288 - }, - { - "epoch": 1.02, - "learning_rate": 0.0002293798833692809, - "loss": 2.435, - "step": 3289 - }, - { - "epoch": 1.02, - "learning_rate": 0.00022934425431004365, - "loss": 2.1989, - "step": 3290 - }, - { - "epoch": 1.02, - "learning_rate": 0.0002293086252508064, - "loss": 5.2817, - "step": 3291 - }, - { - "epoch": 1.02, - "learning_rate": 0.00022927299619156921, - "loss": 4.8864, - "step": 3292 - }, - { - "epoch": 1.02, - "learning_rate": 0.00022923736713233197, - "loss": 4.7242, - "step": 3293 - }, - { - "epoch": 1.02, - "learning_rate": 0.00022920173807309472, - "loss": 4.5526, - "step": 3294 - }, - { - "epoch": 1.02, - "learning_rate": 0.0002291661090138575, - "loss": 4.1508, - "step": 3295 - }, - { - "epoch": 1.02, - "learning_rate": 0.00022913047995462026, - "loss": 3.9627, - "step": 3296 - }, - { - "epoch": 1.02, - "learning_rate": 0.00022909485089538307, - "loss": 4.1625, - "step": 3297 - }, - { - "epoch": 1.02, - "learning_rate": 0.00022905922183614582, - "loss": 3.95, - "step": 3298 - }, - { - "epoch": 1.02, - "learning_rate": 0.00022902359277690858, - "loss": 4.1361, - "step": 3299 - }, - { - "epoch": 1.02, - "learning_rate": 0.00022898796371767136, - "loss": 4.1307, - "step": 3300 - }, - { - "epoch": 1.02, - "learning_rate": 0.00022895233465843414, - "loss": 3.8352, - "step": 3301 - }, - { - "epoch": 1.02, - "learning_rate": 0.0002289167055991969, - "loss": 3.7877, - "step": 3302 - }, - { - "epoch": 1.02, - "learning_rate": 0.00022888107653995968, - "loss": 3.9865, - "step": 3303 - }, - { - "epoch": 1.02, - "learning_rate": 0.00022884544748072243, - "loss": 3.9345, - "step": 3304 - }, - { - "epoch": 1.02, - 
"learning_rate": 0.00022880981842148524, - "loss": 3.8008, - "step": 3305 - }, - { - "epoch": 1.02, - "learning_rate": 0.000228774189362248, - "loss": 3.6873, - "step": 3306 - }, - { - "epoch": 1.02, - "learning_rate": 0.00022873856030301075, - "loss": 3.9142, - "step": 3307 - }, - { - "epoch": 1.02, - "learning_rate": 0.00022870293124377353, - "loss": 3.6214, - "step": 3308 - }, - { - "epoch": 1.02, - "learning_rate": 0.0002286673021845363, - "loss": 3.6497, - "step": 3309 - }, - { - "epoch": 1.02, - "learning_rate": 0.0002286316731252991, - "loss": 3.7443, - "step": 3310 - }, - { - "epoch": 1.02, - "learning_rate": 0.00022859604406606185, - "loss": 3.6966, - "step": 3311 - }, - { - "epoch": 1.02, - "learning_rate": 0.0002285604150068246, - "loss": 3.7438, - "step": 3312 - }, - { - "epoch": 1.02, - "learning_rate": 0.0002285247859475874, - "loss": 3.5737, - "step": 3313 - }, - { - "epoch": 1.02, - "learning_rate": 0.00022848915688835014, - "loss": 3.9687, - "step": 3314 - }, - { - "epoch": 1.02, - "learning_rate": 0.0002284535278291129, - "loss": 3.4963, - "step": 3315 - }, - { - "epoch": 1.02, - "learning_rate": 0.0002284178987698757, - "loss": 3.6447, - "step": 3316 - }, - { - "epoch": 1.02, - "learning_rate": 0.00022838226971063846, - "loss": 3.5669, - "step": 3317 - }, - { - "epoch": 1.02, - "learning_rate": 0.00022834664065140124, - "loss": 3.4375, - "step": 3318 - }, - { - "epoch": 1.02, - "learning_rate": 0.000228311011592164, - "loss": 3.3905, - "step": 3319 - }, - { - "epoch": 1.02, - "learning_rate": 0.00022827538253292675, - "loss": 3.6282, - "step": 3320 - }, - { - "epoch": 1.02, - "learning_rate": 0.00022823975347368956, - "loss": 3.7218, - "step": 3321 - }, - { - "epoch": 1.03, - "learning_rate": 0.00022820412441445232, - "loss": 3.5181, - "step": 3322 - }, - { - "epoch": 1.03, - "learning_rate": 0.0002281684953552151, - "loss": 3.2657, - "step": 3323 - }, - { - "epoch": 1.03, - "learning_rate": 0.00022813286629597785, - "loss": 3.3909, - "step": 3324 - }, - { - "epoch": 1.03, - "learning_rate": 0.00022809723723674064, - "loss": 3.2919, - "step": 3325 - }, - { - "epoch": 1.03, - "learning_rate": 0.00022806160817750342, - "loss": 3.3143, - "step": 3326 - }, - { - "epoch": 1.03, - "learning_rate": 0.00022802597911826617, - "loss": 3.1186, - "step": 3327 - }, - { - "epoch": 1.03, - "learning_rate": 0.00022799035005902893, - "loss": 3.3313, - "step": 3328 - }, - { - "epoch": 1.03, - "learning_rate": 0.00022795472099979174, - "loss": 3.1917, - "step": 3329 - }, - { - "epoch": 1.03, - "learning_rate": 0.0002279190919405545, - "loss": 3.3493, - "step": 3330 - }, - { - "epoch": 1.03, - "learning_rate": 0.00022788346288131727, - "loss": 3.0516, - "step": 3331 - }, - { - "epoch": 1.03, - "learning_rate": 0.00022784783382208003, - "loss": 3.0342, - "step": 3332 - }, - { - "epoch": 1.03, - "learning_rate": 0.00022781220476284278, - "loss": 2.9794, - "step": 3333 - }, - { - "epoch": 1.03, - "learning_rate": 0.0002277765757036056, - "loss": 2.7809, - "step": 3334 - }, - { - "epoch": 1.03, - "learning_rate": 0.00022774094664436835, - "loss": 2.7251, - "step": 3335 - }, - { - "epoch": 1.03, - "learning_rate": 0.00022770531758513113, - "loss": 2.6732, - "step": 3336 - }, - { - "epoch": 1.03, - "learning_rate": 0.00022766968852589388, - "loss": 2.6377, - "step": 3337 - }, - { - "epoch": 1.03, - "learning_rate": 0.00022763405946665664, - "loss": 2.4325, - "step": 3338 - }, - { - "epoch": 1.03, - "learning_rate": 0.00022759843040741945, - "loss": 2.2867, - "step": 3339 - }, - { - "epoch": 1.03, 
- "learning_rate": 0.0002275628013481822, - "loss": 2.4385, - "step": 3340 - }, - { - "epoch": 1.03, - "learning_rate": 0.00022752717228894496, - "loss": 5.2534, - "step": 3341 - }, - { - "epoch": 1.03, - "learning_rate": 0.00022749154322970774, - "loss": 4.881, - "step": 3342 - }, - { - "epoch": 1.03, - "learning_rate": 0.0002274559141704705, - "loss": 4.6975, - "step": 3343 - }, - { - "epoch": 1.03, - "learning_rate": 0.0002274202851112333, - "loss": 4.5906, - "step": 3344 - }, - { - "epoch": 1.03, - "learning_rate": 0.00022738465605199606, - "loss": 4.2472, - "step": 3345 - }, - { - "epoch": 1.03, - "learning_rate": 0.0002273490269927588, - "loss": 4.2546, - "step": 3346 - }, - { - "epoch": 1.03, - "learning_rate": 0.0002273133979335216, - "loss": 3.9274, - "step": 3347 - }, - { - "epoch": 1.03, - "learning_rate": 0.00022727776887428438, - "loss": 4.1792, - "step": 3348 - }, - { - "epoch": 1.03, - "learning_rate": 0.00022724213981504713, - "loss": 4.0565, - "step": 3349 - }, - { - "epoch": 1.03, - "learning_rate": 0.0002272065107558099, - "loss": 3.8837, - "step": 3350 - }, - { - "epoch": 1.03, - "learning_rate": 0.00022717088169657267, - "loss": 4.0692, - "step": 3351 - }, - { - "epoch": 1.03, - "learning_rate": 0.00022713525263733548, - "loss": 3.8501, - "step": 3352 - }, - { - "epoch": 1.03, - "learning_rate": 0.00022709962357809823, - "loss": 3.8368, - "step": 3353 - }, - { - "epoch": 1.04, - "learning_rate": 0.00022706399451886099, - "loss": 4.0414, - "step": 3354 - }, - { - "epoch": 1.04, - "learning_rate": 0.00022702836545962377, - "loss": 3.9174, - "step": 3355 - }, - { - "epoch": 1.04, - "learning_rate": 0.00022699273640038652, - "loss": 3.7743, - "step": 3356 - }, - { - "epoch": 1.04, - "learning_rate": 0.00022695710734114933, - "loss": 3.6508, - "step": 3357 - }, - { - "epoch": 1.04, - "learning_rate": 0.00022692147828191209, - "loss": 4.017, - "step": 3358 - }, - { - "epoch": 1.04, - "learning_rate": 0.00022688584922267484, - "loss": 3.8742, - "step": 3359 - }, - { - "epoch": 1.04, - "learning_rate": 0.00022685022016343762, - "loss": 3.6234, - "step": 3360 - }, - { - "epoch": 1.04, - "learning_rate": 0.00022681459110420038, - "loss": 3.7314, - "step": 3361 - }, - { - "epoch": 1.04, - "learning_rate": 0.00022677896204496313, - "loss": 3.6142, - "step": 3362 - }, - { - "epoch": 1.04, - "learning_rate": 0.00022674333298572594, - "loss": 3.7584, - "step": 3363 - }, - { - "epoch": 1.04, - "learning_rate": 0.0002267077039264887, - "loss": 3.7109, - "step": 3364 - }, - { - "epoch": 1.04, - "learning_rate": 0.00022667207486725148, - "loss": 3.3795, - "step": 3365 - }, - { - "epoch": 1.04, - "learning_rate": 0.00022663644580801423, - "loss": 3.6356, - "step": 3366 - }, - { - "epoch": 1.04, - "learning_rate": 0.000226600816748777, - "loss": 3.5525, - "step": 3367 - }, - { - "epoch": 1.04, - "learning_rate": 0.0002265651876895398, - "loss": 3.5048, - "step": 3368 - }, - { - "epoch": 1.04, - "learning_rate": 0.00022652955863030255, - "loss": 3.3858, - "step": 3369 - }, - { - "epoch": 1.04, - "learning_rate": 0.00022649392957106533, - "loss": 3.589, - "step": 3370 - }, - { - "epoch": 1.04, - "learning_rate": 0.0002264583005118281, - "loss": 3.4631, - "step": 3371 - }, - { - "epoch": 1.04, - "learning_rate": 0.00022642267145259087, - "loss": 3.2303, - "step": 3372 - }, - { - "epoch": 1.04, - "learning_rate": 0.00022638704239335365, - "loss": 3.4989, - "step": 3373 - }, - { - "epoch": 1.04, - "learning_rate": 0.0002263514133341164, - "loss": 3.2227, - "step": 3374 - }, - { - "epoch": 1.04, 
- "learning_rate": 0.00022631578427487916, - "loss": 3.224, - "step": 3375 - }, - { - "epoch": 1.04, - "learning_rate": 0.00022628015521564197, - "loss": 3.4244, - "step": 3376 - }, - { - "epoch": 1.04, - "learning_rate": 0.00022624452615640472, - "loss": 3.1714, - "step": 3377 - }, - { - "epoch": 1.04, - "learning_rate": 0.0002262088970971675, - "loss": 3.1927, - "step": 3378 - }, - { - "epoch": 1.04, - "learning_rate": 0.00022617326803793026, - "loss": 3.2285, - "step": 3379 - }, - { - "epoch": 1.04, - "learning_rate": 0.00022613763897869302, - "loss": 3.211, - "step": 3380 - }, - { - "epoch": 1.04, - "learning_rate": 0.00022610200991945583, - "loss": 2.8586, - "step": 3381 - }, - { - "epoch": 1.04, - "learning_rate": 0.00022606638086021858, - "loss": 2.8186, - "step": 3382 - }, - { - "epoch": 1.04, - "learning_rate": 0.00022603075180098133, - "loss": 3.0257, - "step": 3383 - }, - { - "epoch": 1.04, - "learning_rate": 0.00022599512274174412, - "loss": 2.9242, - "step": 3384 - }, - { - "epoch": 1.04, - "learning_rate": 0.00022595949368250687, - "loss": 2.6077, - "step": 3385 - }, - { - "epoch": 1.05, - "learning_rate": 0.00022592386462326968, - "loss": 2.675, - "step": 3386 - }, - { - "epoch": 1.05, - "learning_rate": 0.00022588823556403244, - "loss": 2.3658, - "step": 3387 - }, - { - "epoch": 1.05, - "learning_rate": 0.0002258526065047952, - "loss": 2.507, - "step": 3388 - }, - { - "epoch": 1.05, - "learning_rate": 0.00022581697744555797, - "loss": 2.5374, - "step": 3389 - }, - { - "epoch": 1.05, - "learning_rate": 0.00022578134838632073, - "loss": 2.2848, - "step": 3390 - }, - { - "epoch": 1.05, - "learning_rate": 0.00022574571932708354, - "loss": 4.8547, - "step": 3391 - }, - { - "epoch": 1.05, - "learning_rate": 0.0002257100902678463, - "loss": 4.6589, - "step": 3392 - }, - { - "epoch": 1.05, - "learning_rate": 0.00022567446120860905, - "loss": 4.6273, - "step": 3393 - }, - { - "epoch": 1.05, - "learning_rate": 0.00022563883214937183, - "loss": 4.3349, - "step": 3394 - }, - { - "epoch": 1.05, - "learning_rate": 0.0002256032030901346, - "loss": 4.5368, - "step": 3395 - }, - { - "epoch": 1.05, - "learning_rate": 0.00022556757403089736, - "loss": 4.1337, - "step": 3396 - }, - { - "epoch": 1.05, - "learning_rate": 0.00022553194497166015, - "loss": 3.8984, - "step": 3397 - }, - { - "epoch": 1.05, - "learning_rate": 0.0002254963159124229, - "loss": 4.2195, - "step": 3398 - }, - { - "epoch": 1.05, - "learning_rate": 0.0002254606868531857, - "loss": 4.0098, - "step": 3399 - }, - { - "epoch": 1.05, - "learning_rate": 0.00022542505779394846, - "loss": 3.9825, - "step": 3400 - }, - { - "epoch": 1.05, - "learning_rate": 0.00022538942873471122, - "loss": 3.8282, - "step": 3401 - }, - { - "epoch": 1.05, - "learning_rate": 0.000225353799675474, - "loss": 3.9848, - "step": 3402 - }, - { - "epoch": 1.05, - "learning_rate": 0.00022531817061623676, - "loss": 3.7879, - "step": 3403 - }, - { - "epoch": 1.05, - "learning_rate": 0.00022528254155699956, - "loss": 4.1537, - "step": 3404 - }, - { - "epoch": 1.05, - "learning_rate": 0.00022524691249776232, - "loss": 3.9608, - "step": 3405 - }, - { - "epoch": 1.05, - "learning_rate": 0.00022521128343852507, - "loss": 3.8603, - "step": 3406 - }, - { - "epoch": 1.05, - "learning_rate": 0.00022517565437928786, - "loss": 3.8071, - "step": 3407 - }, - { - "epoch": 1.05, - "learning_rate": 0.0002251400253200506, - "loss": 3.566, - "step": 3408 - }, - { - "epoch": 1.05, - "learning_rate": 0.00022510439626081337, - "loss": 3.8155, - "step": 3409 - }, - { - "epoch": 
1.05, - "learning_rate": 0.00022506876720157617, - "loss": 3.7392, - "step": 3410 - }, - { - "epoch": 1.05, - "learning_rate": 0.00022503313814233893, - "loss": 3.8503, - "step": 3411 - }, - { - "epoch": 1.05, - "learning_rate": 0.0002249975090831017, - "loss": 3.5335, - "step": 3412 - }, - { - "epoch": 1.05, - "learning_rate": 0.00022496188002386447, - "loss": 3.7815, - "step": 3413 - }, - { - "epoch": 1.05, - "learning_rate": 0.00022492625096462722, - "loss": 3.5028, - "step": 3414 - }, - { - "epoch": 1.05, - "learning_rate": 0.00022489062190539003, - "loss": 3.7152, - "step": 3415 - }, - { - "epoch": 1.05, - "learning_rate": 0.00022485499284615278, - "loss": 3.7448, - "step": 3416 - }, - { - "epoch": 1.05, - "learning_rate": 0.00022481936378691557, - "loss": 3.6014, - "step": 3417 - }, - { - "epoch": 1.05, - "learning_rate": 0.00022478373472767832, - "loss": 3.4361, - "step": 3418 - }, - { - "epoch": 1.06, - "learning_rate": 0.0002247481056684411, - "loss": 3.5516, - "step": 3419 - }, - { - "epoch": 1.06, - "learning_rate": 0.00022471247660920388, - "loss": 3.8486, - "step": 3420 - }, - { - "epoch": 1.06, - "learning_rate": 0.00022467684754996664, - "loss": 3.6489, - "step": 3421 - }, - { - "epoch": 1.06, - "learning_rate": 0.0002246412184907294, - "loss": 3.3495, - "step": 3422 - }, - { - "epoch": 1.06, - "learning_rate": 0.0002246055894314922, - "loss": 3.3214, - "step": 3423 - }, - { - "epoch": 1.06, - "learning_rate": 0.00022456996037225496, - "loss": 3.3547, - "step": 3424 - }, - { - "epoch": 1.06, - "learning_rate": 0.00022453433131301774, - "loss": 3.3462, - "step": 3425 - }, - { - "epoch": 1.06, - "learning_rate": 0.0002244987022537805, - "loss": 3.0726, - "step": 3426 - }, - { - "epoch": 1.06, - "learning_rate": 0.00022446307319454325, - "loss": 3.0833, - "step": 3427 - }, - { - "epoch": 1.06, - "learning_rate": 0.00022442744413530606, - "loss": 3.0179, - "step": 3428 - }, - { - "epoch": 1.06, - "learning_rate": 0.0002243918150760688, - "loss": 2.9077, - "step": 3429 - }, - { - "epoch": 1.06, - "learning_rate": 0.00022435618601683157, - "loss": 3.1785, - "step": 3430 - }, - { - "epoch": 1.06, - "learning_rate": 0.00022432055695759435, - "loss": 2.9056, - "step": 3431 - }, - { - "epoch": 1.06, - "learning_rate": 0.0002242849278983571, - "loss": 2.8467, - "step": 3432 - }, - { - "epoch": 1.06, - "learning_rate": 0.00022424929883911991, - "loss": 2.8522, - "step": 3433 - }, - { - "epoch": 1.06, - "learning_rate": 0.00022421366977988267, - "loss": 2.8692, - "step": 3434 - }, - { - "epoch": 1.06, - "learning_rate": 0.00022417804072064542, - "loss": 2.7796, - "step": 3435 - }, - { - "epoch": 1.06, - "learning_rate": 0.0002241424116614082, - "loss": 2.5265, - "step": 3436 - }, - { - "epoch": 1.06, - "learning_rate": 0.00022410678260217096, - "loss": 2.5337, - "step": 3437 - }, - { - "epoch": 1.06, - "learning_rate": 0.00022407115354293377, - "loss": 2.5023, - "step": 3438 - }, - { - "epoch": 1.06, - "learning_rate": 0.00022403552448369652, - "loss": 2.1777, - "step": 3439 - }, - { - "epoch": 1.06, - "learning_rate": 0.00022399989542445928, - "loss": 2.3044, - "step": 3440 - }, - { - "epoch": 1.06, - "learning_rate": 0.00022396426636522206, - "loss": 5.2354, - "step": 3441 - }, - { - "epoch": 1.06, - "learning_rate": 0.00022392863730598484, - "loss": 4.7865, - "step": 3442 - }, - { - "epoch": 1.06, - "learning_rate": 0.0002238930082467476, - "loss": 4.6137, - "step": 3443 - }, - { - "epoch": 1.06, - "learning_rate": 0.00022385737918751038, - "loss": 4.1235, - "step": 3444 - }, - { - 
"epoch": 1.06, - "learning_rate": 0.00022382175012827313, - "loss": 4.3777, - "step": 3445 - }, - { - "epoch": 1.06, - "learning_rate": 0.00022378612106903594, - "loss": 4.1836, - "step": 3446 - }, - { - "epoch": 1.06, - "learning_rate": 0.0002237504920097987, - "loss": 4.1246, - "step": 3447 - }, - { - "epoch": 1.06, - "learning_rate": 0.00022371486295056145, - "loss": 4.3068, - "step": 3448 - }, - { - "epoch": 1.06, - "learning_rate": 0.00022367923389132423, - "loss": 3.7803, - "step": 3449 - }, - { - "epoch": 1.06, - "learning_rate": 0.000223643604832087, - "loss": 4.0443, - "step": 3450 - }, - { - "epoch": 1.07, - "learning_rate": 0.0002236079757728498, - "loss": 3.7497, - "step": 3451 - }, - { - "epoch": 1.07, - "learning_rate": 0.00022357234671361255, - "loss": 3.8822, - "step": 3452 - }, - { - "epoch": 1.07, - "learning_rate": 0.0002235367176543753, - "loss": 3.8341, - "step": 3453 - }, - { - "epoch": 1.07, - "learning_rate": 0.0002235010885951381, - "loss": 3.8673, - "step": 3454 - }, - { - "epoch": 1.07, - "learning_rate": 0.00022346545953590084, - "loss": 3.9177, - "step": 3455 - }, - { - "epoch": 1.07, - "learning_rate": 0.0002234298304766636, - "loss": 3.7889, - "step": 3456 - }, - { - "epoch": 1.07, - "learning_rate": 0.0002233942014174264, - "loss": 3.7346, - "step": 3457 - }, - { - "epoch": 1.07, - "learning_rate": 0.00022335857235818916, - "loss": 3.8982, - "step": 3458 - }, - { - "epoch": 1.07, - "learning_rate": 0.00022332294329895194, - "loss": 3.8405, - "step": 3459 - }, - { - "epoch": 1.07, - "learning_rate": 0.0002232873142397147, - "loss": 3.8455, - "step": 3460 - }, - { - "epoch": 1.07, - "learning_rate": 0.00022325168518047745, - "loss": 3.5226, - "step": 3461 - }, - { - "epoch": 1.07, - "learning_rate": 0.00022321605612124026, - "loss": 3.571, - "step": 3462 - }, - { - "epoch": 1.07, - "learning_rate": 0.00022318042706200302, - "loss": 3.6178, - "step": 3463 - }, - { - "epoch": 1.07, - "learning_rate": 0.00022314479800276577, - "loss": 3.5389, - "step": 3464 - }, - { - "epoch": 1.07, - "learning_rate": 0.00022310916894352858, - "loss": 3.5795, - "step": 3465 - }, - { - "epoch": 1.07, - "learning_rate": 0.00022307353988429134, - "loss": 3.4307, - "step": 3466 - }, - { - "epoch": 1.07, - "learning_rate": 0.00022303791082505412, - "loss": 3.5843, - "step": 3467 - }, - { - "epoch": 1.07, - "learning_rate": 0.00022300228176581687, - "loss": 3.6868, - "step": 3468 - }, - { - "epoch": 1.07, - "learning_rate": 0.00022296665270657963, - "loss": 3.6785, - "step": 3469 - }, - { - "epoch": 1.07, - "learning_rate": 0.00022293102364734244, - "loss": 3.3745, - "step": 3470 - }, - { - "epoch": 1.07, - "learning_rate": 0.0002228953945881052, - "loss": 3.4477, - "step": 3471 - }, - { - "epoch": 1.07, - "learning_rate": 0.00022285976552886797, - "loss": 3.5856, - "step": 3472 - }, - { - "epoch": 1.07, - "learning_rate": 0.00022282413646963073, - "loss": 3.4279, - "step": 3473 - }, - { - "epoch": 1.07, - "learning_rate": 0.00022278850741039348, - "loss": 3.2228, - "step": 3474 - }, - { - "epoch": 1.07, - "learning_rate": 0.0002227528783511563, - "loss": 3.2735, - "step": 3475 - }, - { - "epoch": 1.07, - "learning_rate": 0.00022271724929191905, - "loss": 3.1802, - "step": 3476 - }, - { - "epoch": 1.07, - "learning_rate": 0.0002226816202326818, - "loss": 3.4058, - "step": 3477 - }, - { - "epoch": 1.07, - "learning_rate": 0.00022264599117344458, - "loss": 3.263, - "step": 3478 - }, - { - "epoch": 1.07, - "learning_rate": 0.00022261036211420734, - "loss": 3.035, - "step": 3479 - }, - { - 
"epoch": 1.07, - "learning_rate": 0.00022257473305497015, - "loss": 2.7861, - "step": 3480 - }, - { - "epoch": 1.07, - "learning_rate": 0.0002225391039957329, - "loss": 3.047, - "step": 3481 - }, - { - "epoch": 1.07, - "learning_rate": 0.00022250347493649566, - "loss": 2.8772, - "step": 3482 - }, - { - "epoch": 1.07, - "learning_rate": 0.00022246784587725844, - "loss": 3.0736, - "step": 3483 - }, - { - "epoch": 1.08, - "learning_rate": 0.0002224322168180212, - "loss": 2.5762, - "step": 3484 - }, - { - "epoch": 1.08, - "learning_rate": 0.000222396587758784, - "loss": 2.6045, - "step": 3485 - }, - { - "epoch": 1.08, - "learning_rate": 0.00022236095869954676, - "loss": 2.6508, - "step": 3486 - }, - { - "epoch": 1.08, - "learning_rate": 0.0002223253296403095, - "loss": 2.634, - "step": 3487 - }, - { - "epoch": 1.08, - "learning_rate": 0.0002222897005810723, - "loss": 2.4015, - "step": 3488 - }, - { - "epoch": 1.08, - "learning_rate": 0.00022225407152183508, - "loss": 2.2374, - "step": 3489 - }, - { - "epoch": 1.08, - "learning_rate": 0.00022221844246259783, - "loss": 2.1403, - "step": 3490 - }, - { - "epoch": 1.08, - "learning_rate": 0.0002221828134033606, - "loss": 4.9263, - "step": 3491 - }, - { - "epoch": 1.08, - "learning_rate": 0.00022214718434412337, - "loss": 4.5969, - "step": 3492 - }, - { - "epoch": 1.08, - "learning_rate": 0.00022211155528488618, - "loss": 4.6296, - "step": 3493 - }, - { - "epoch": 1.08, - "learning_rate": 0.00022207592622564893, - "loss": 4.4994, - "step": 3494 - }, - { - "epoch": 1.08, - "learning_rate": 0.00022204029716641169, - "loss": 4.1733, - "step": 3495 - }, - { - "epoch": 1.08, - "learning_rate": 0.00022200466810717447, - "loss": 4.2153, - "step": 3496 - }, - { - "epoch": 1.08, - "learning_rate": 0.00022196903904793722, - "loss": 4.0729, - "step": 3497 - }, - { - "epoch": 1.08, - "learning_rate": 0.00022193340998869998, - "loss": 4.1431, - "step": 3498 - }, - { - "epoch": 1.08, - "learning_rate": 0.00022189778092946279, - "loss": 4.0883, - "step": 3499 - }, - { - "epoch": 1.08, - "learning_rate": 0.00022186215187022554, - "loss": 4.1027, - "step": 3500 - }, - { - "epoch": 1.08, - "eval_bleu": 0.0, - "eval_loss": 4.329113960266113, - "eval_runtime": 2566.7512, - "eval_samples_per_second": 5.75, - "eval_steps_per_second": 0.719, - "step": 3500 - }, - { - "epoch": 1.08, - "learning_rate": 0.00022182652281098832, - "loss": 3.7445, - "step": 3501 - }, - { - "epoch": 1.08, - "learning_rate": 0.00022179089375175108, - "loss": 3.8809, - "step": 3502 - }, - { - "epoch": 1.08, - "learning_rate": 0.00022175526469251383, - "loss": 4.0919, - "step": 3503 - }, - { - "epoch": 1.08, - "learning_rate": 0.00022171963563327664, - "loss": 3.8528, - "step": 3504 - }, - { - "epoch": 1.08, - "learning_rate": 0.0002216840065740394, - "loss": 3.934, - "step": 3505 - }, - { - "epoch": 1.08, - "learning_rate": 0.00022164837751480218, - "loss": 3.6634, - "step": 3506 - }, - { - "epoch": 1.08, - "learning_rate": 0.00022161274845556493, - "loss": 3.6396, - "step": 3507 - }, - { - "epoch": 1.08, - "learning_rate": 0.0002215771193963277, - "loss": 4.0179, - "step": 3508 - }, - { - "epoch": 1.08, - "learning_rate": 0.0002215414903370905, - "loss": 3.6498, - "step": 3509 - }, - { - "epoch": 1.08, - "learning_rate": 0.00022150586127785325, - "loss": 3.6283, - "step": 3510 - }, - { - "epoch": 1.08, - "learning_rate": 0.000221470232218616, - "loss": 3.5379, - "step": 3511 - }, - { - "epoch": 1.08, - "learning_rate": 0.00022143460315937881, - "loss": 3.5872, - "step": 3512 - }, - { - "epoch": 
1.08, - "learning_rate": 0.00022139897410014157, - "loss": 3.5759, - "step": 3513 - }, - { - "epoch": 1.08, - "learning_rate": 0.00022136334504090435, - "loss": 3.5041, - "step": 3514 - }, - { - "epoch": 1.08, - "learning_rate": 0.0002213277159816671, - "loss": 3.3848, - "step": 3515 - }, - { - "epoch": 1.09, - "learning_rate": 0.00022129208692242986, - "loss": 3.5209, - "step": 3516 - }, - { - "epoch": 1.09, - "learning_rate": 0.00022125645786319267, - "loss": 3.6466, - "step": 3517 - }, - { - "epoch": 1.09, - "learning_rate": 0.00022122082880395542, - "loss": 3.3708, - "step": 3518 - }, - { - "epoch": 1.09, - "learning_rate": 0.0002211851997447182, - "loss": 3.3692, - "step": 3519 - }, - { - "epoch": 1.09, - "learning_rate": 0.00022114957068548096, - "loss": 3.7085, - "step": 3520 - }, - { - "epoch": 1.09, - "learning_rate": 0.00022111394162624372, - "loss": 3.418, - "step": 3521 - }, - { - "epoch": 1.09, - "learning_rate": 0.00022107831256700652, - "loss": 3.3103, - "step": 3522 - }, - { - "epoch": 1.09, - "learning_rate": 0.00022104268350776928, - "loss": 3.4353, - "step": 3523 - }, - { - "epoch": 1.09, - "learning_rate": 0.00022100705444853203, - "loss": 3.2997, - "step": 3524 - }, - { - "epoch": 1.09, - "learning_rate": 0.00022097142538929482, - "loss": 3.1203, - "step": 3525 - }, - { - "epoch": 1.09, - "learning_rate": 0.00022093579633005757, - "loss": 3.2134, - "step": 3526 - }, - { - "epoch": 1.09, - "learning_rate": 0.00022090016727082038, - "loss": 3.173, - "step": 3527 - }, - { - "epoch": 1.09, - "learning_rate": 0.00022086453821158313, - "loss": 3.2338, - "step": 3528 - }, - { - "epoch": 1.09, - "learning_rate": 0.0002208289091523459, - "loss": 3.1562, - "step": 3529 - }, - { - "epoch": 1.09, - "learning_rate": 0.00022079328009310867, - "loss": 3.417, - "step": 3530 - }, - { - "epoch": 1.09, - "learning_rate": 0.00022075765103387143, - "loss": 3.117, - "step": 3531 - }, - { - "epoch": 1.09, - "learning_rate": 0.00022072202197463424, - "loss": 2.6628, - "step": 3532 - }, - { - "epoch": 1.09, - "learning_rate": 0.000220686392915397, - "loss": 2.8636, - "step": 3533 - }, - { - "epoch": 1.09, - "learning_rate": 0.00022065076385615974, - "loss": 2.8624, - "step": 3534 - }, - { - "epoch": 1.09, - "learning_rate": 0.00022061513479692253, - "loss": 2.7263, - "step": 3535 - }, - { - "epoch": 1.09, - "learning_rate": 0.0002205795057376853, - "loss": 2.7002, - "step": 3536 - }, - { - "epoch": 1.09, - "learning_rate": 0.00022054387667844806, - "loss": 2.6403, - "step": 3537 - }, - { - "epoch": 1.09, - "learning_rate": 0.00022050824761921085, - "loss": 2.5853, - "step": 3538 - }, - { - "epoch": 1.09, - "learning_rate": 0.0002204726185599736, - "loss": 2.3399, - "step": 3539 - }, - { - "epoch": 1.09, - "learning_rate": 0.0002204369895007364, - "loss": 2.195, - "step": 3540 - }, - { - "epoch": 1.09, - "learning_rate": 0.00022040136044149916, - "loss": 5.1365, - "step": 3541 - }, - { - "epoch": 1.09, - "learning_rate": 0.00022036573138226192, - "loss": 4.8125, - "step": 3542 - }, - { - "epoch": 1.09, - "learning_rate": 0.0002203301023230247, - "loss": 4.2869, - "step": 3543 - }, - { - "epoch": 1.09, - "learning_rate": 0.00022029447326378745, - "loss": 4.5352, - "step": 3544 - }, - { - "epoch": 1.09, - "learning_rate": 0.0002202588442045502, - "loss": 4.0569, - "step": 3545 - }, - { - "epoch": 1.09, - "learning_rate": 0.00022022321514531302, - "loss": 4.0005, - "step": 3546 - }, - { - "epoch": 1.09, - "learning_rate": 0.00022018758608607577, - "loss": 4.0507, - "step": 3547 - }, - { - "epoch": 
1.1, - "learning_rate": 0.00022015195702683856, - "loss": 3.8679, - "step": 3548 - }, - { - "epoch": 1.1, - "learning_rate": 0.0002201163279676013, - "loss": 3.9171, - "step": 3549 - }, - { - "epoch": 1.1, - "learning_rate": 0.00022008069890836406, - "loss": 3.9393, - "step": 3550 - }, - { - "epoch": 1.1, - "learning_rate": 0.00022004506984912687, - "loss": 4.0229, - "step": 3551 - }, - { - "epoch": 1.1, - "learning_rate": 0.00022000944078988963, - "loss": 3.8507, - "step": 3552 - }, - { - "epoch": 1.1, - "learning_rate": 0.0002199738117306524, - "loss": 3.8655, - "step": 3553 - }, - { - "epoch": 1.1, - "learning_rate": 0.00021993818267141517, - "loss": 3.7772, - "step": 3554 - }, - { - "epoch": 1.1, - "learning_rate": 0.00021990255361217795, - "loss": 3.7759, - "step": 3555 - }, - { - "epoch": 1.1, - "learning_rate": 0.00021986692455294073, - "loss": 3.8893, - "step": 3556 - }, - { - "epoch": 1.1, - "learning_rate": 0.00021983129549370348, - "loss": 3.6709, - "step": 3557 - }, - { - "epoch": 1.1, - "learning_rate": 0.00021979566643446624, - "loss": 3.9443, - "step": 3558 - }, - { - "epoch": 1.1, - "learning_rate": 0.00021976003737522905, - "loss": 3.8262, - "step": 3559 - }, - { - "epoch": 1.1, - "learning_rate": 0.0002197244083159918, - "loss": 3.87, - "step": 3560 - }, - { - "epoch": 1.1, - "learning_rate": 0.00021968877925675458, - "loss": 3.8376, - "step": 3561 - }, - { - "epoch": 1.1, - "learning_rate": 0.00021965315019751734, - "loss": 3.6727, - "step": 3562 - }, - { - "epoch": 1.1, - "learning_rate": 0.0002196175211382801, - "loss": 3.5641, - "step": 3563 - }, - { - "epoch": 1.1, - "learning_rate": 0.0002195818920790429, - "loss": 3.5567, - "step": 3564 - }, - { - "epoch": 1.1, - "learning_rate": 0.00021954626301980566, - "loss": 3.5487, - "step": 3565 - }, - { - "epoch": 1.1, - "learning_rate": 0.00021951063396056844, - "loss": 3.2647, - "step": 3566 - }, - { - "epoch": 1.1, - "learning_rate": 0.0002194750049013312, - "loss": 3.4869, - "step": 3567 - }, - { - "epoch": 1.1, - "learning_rate": 0.00021943937584209395, - "loss": 3.5559, - "step": 3568 - }, - { - "epoch": 1.1, - "learning_rate": 0.00021940374678285676, - "loss": 3.5276, - "step": 3569 - }, - { - "epoch": 1.1, - "learning_rate": 0.0002193681177236195, - "loss": 3.3603, - "step": 3570 - }, - { - "epoch": 1.1, - "learning_rate": 0.00021933248866438227, - "loss": 3.6102, - "step": 3571 - }, - { - "epoch": 1.1, - "learning_rate": 0.00021929685960514505, - "loss": 3.4474, - "step": 3572 - }, - { - "epoch": 1.1, - "learning_rate": 0.0002192612305459078, - "loss": 3.4479, - "step": 3573 - }, - { - "epoch": 1.1, - "learning_rate": 0.0002192256014866706, - "loss": 3.1307, - "step": 3574 - }, - { - "epoch": 1.1, - "learning_rate": 0.00021918997242743337, - "loss": 3.118, - "step": 3575 - }, - { - "epoch": 1.1, - "learning_rate": 0.00021915434336819612, - "loss": 3.3475, - "step": 3576 - }, - { - "epoch": 1.1, - "learning_rate": 0.0002191187143089589, - "loss": 3.2515, - "step": 3577 - }, - { - "epoch": 1.1, - "learning_rate": 0.00021908308524972166, - "loss": 3.197, - "step": 3578 - }, - { - "epoch": 1.1, - "learning_rate": 0.00021904745619048444, - "loss": 3.0067, - "step": 3579 - }, - { - "epoch": 1.1, - "learning_rate": 0.00021901182713124722, - "loss": 3.0633, - "step": 3580 - }, - { - "epoch": 1.11, - "learning_rate": 0.00021897619807200998, - "loss": 3.3697, - "step": 3581 - }, - { - "epoch": 1.11, - "learning_rate": 0.00021894056901277276, - "loss": 2.9279, - "step": 3582 - }, - { - "epoch": 1.11, - "learning_rate": 
0.00021890493995353554, - "loss": 2.8304, - "step": 3583 - }, - { - "epoch": 1.11, - "learning_rate": 0.0002188693108942983, - "loss": 2.7378, - "step": 3584 - }, - { - "epoch": 1.11, - "learning_rate": 0.00021883368183506108, - "loss": 2.6224, - "step": 3585 - }, - { - "epoch": 1.11, - "learning_rate": 0.00021879805277582383, - "loss": 2.5158, - "step": 3586 - }, - { - "epoch": 1.11, - "learning_rate": 0.00021876242371658664, - "loss": 2.309, - "step": 3587 - }, - { - "epoch": 1.11, - "learning_rate": 0.0002187267946573494, - "loss": 2.3903, - "step": 3588 - }, - { - "epoch": 1.11, - "learning_rate": 0.00021869116559811215, - "loss": 2.3519, - "step": 3589 - }, - { - "epoch": 1.11, - "learning_rate": 0.00021865553653887493, - "loss": 2.3764, - "step": 3590 - }, - { - "epoch": 1.11, - "learning_rate": 0.0002186199074796377, - "loss": 4.9679, - "step": 3591 - }, - { - "epoch": 1.11, - "learning_rate": 0.00021858427842040044, - "loss": 4.4186, - "step": 3592 - }, - { - "epoch": 1.11, - "learning_rate": 0.00021854864936116325, - "loss": 4.5821, - "step": 3593 - }, - { - "epoch": 1.11, - "learning_rate": 0.000218513020301926, - "loss": 4.3527, - "step": 3594 - }, - { - "epoch": 1.11, - "learning_rate": 0.0002184773912426888, - "loss": 4.0442, - "step": 3595 - }, - { - "epoch": 1.11, - "learning_rate": 0.00021844176218345154, - "loss": 4.4095, - "step": 3596 - }, - { - "epoch": 1.11, - "learning_rate": 0.0002184061331242143, - "loss": 3.9542, - "step": 3597 - }, - { - "epoch": 1.11, - "learning_rate": 0.0002183705040649771, - "loss": 4.0053, - "step": 3598 - }, - { - "epoch": 1.11, - "learning_rate": 0.00021833487500573986, - "loss": 3.9912, - "step": 3599 - }, - { - "epoch": 1.11, - "learning_rate": 0.00021829924594650264, - "loss": 3.6964, - "step": 3600 - }, - { - "epoch": 1.11, - "learning_rate": 0.0002182636168872654, - "loss": 3.9628, - "step": 3601 - }, - { - "epoch": 1.11, - "learning_rate": 0.00021822798782802818, - "loss": 3.508, - "step": 3602 - }, - { - "epoch": 1.11, - "learning_rate": 0.00021819235876879096, - "loss": 3.7829, - "step": 3603 - }, - { - "epoch": 1.11, - "learning_rate": 0.00021815672970955372, - "loss": 3.8636, - "step": 3604 - }, - { - "epoch": 1.11, - "learning_rate": 0.00021812110065031647, - "loss": 3.5438, - "step": 3605 - }, - { - "epoch": 1.11, - "learning_rate": 0.00021808547159107928, - "loss": 3.9929, - "step": 3606 - }, - { - "epoch": 1.11, - "learning_rate": 0.00021804984253184204, - "loss": 3.6298, - "step": 3607 - }, - { - "epoch": 1.11, - "learning_rate": 0.00021801421347260482, - "loss": 4.0625, - "step": 3608 - }, - { - "epoch": 1.11, - "learning_rate": 0.00021797858441336757, - "loss": 3.8902, - "step": 3609 - }, - { - "epoch": 1.11, - "learning_rate": 0.00021794295535413033, - "loss": 3.854, - "step": 3610 - }, - { - "epoch": 1.11, - "learning_rate": 0.00021790732629489314, - "loss": 3.3604, - "step": 3611 - }, - { - "epoch": 1.11, - "learning_rate": 0.0002178716972356559, - "loss": 3.7369, - "step": 3612 - }, - { - "epoch": 1.12, - "learning_rate": 0.00021783606817641865, - "loss": 3.7086, - "step": 3613 - }, - { - "epoch": 1.12, - "learning_rate": 0.00021780043911718143, - "loss": 3.665, - "step": 3614 - }, - { - "epoch": 1.12, - "learning_rate": 0.00021776481005794418, - "loss": 3.7161, - "step": 3615 - }, - { - "epoch": 1.12, - "learning_rate": 0.000217729180998707, - "loss": 3.5851, - "step": 3616 - }, - { - "epoch": 1.12, - "learning_rate": 0.00021769355193946975, - "loss": 3.8927, - "step": 3617 - }, - { - "epoch": 1.12, - "learning_rate": 
0.0002176579228802325, - "loss": 3.5339, - "step": 3618 - }, - { - "epoch": 1.12, - "learning_rate": 0.00021762229382099528, - "loss": 3.6542, - "step": 3619 - }, - { - "epoch": 1.12, - "learning_rate": 0.00021758666476175804, - "loss": 3.3814, - "step": 3620 - }, - { - "epoch": 1.12, - "learning_rate": 0.00021755103570252085, - "loss": 3.2715, - "step": 3621 - }, - { - "epoch": 1.12, - "learning_rate": 0.0002175154066432836, - "loss": 3.5746, - "step": 3622 - }, - { - "epoch": 1.12, - "learning_rate": 0.00021747977758404636, - "loss": 3.4965, - "step": 3623 - }, - { - "epoch": 1.12, - "learning_rate": 0.00021744414852480914, - "loss": 3.3232, - "step": 3624 - }, - { - "epoch": 1.12, - "learning_rate": 0.0002174085194655719, - "loss": 3.5453, - "step": 3625 - }, - { - "epoch": 1.12, - "learning_rate": 0.00021737289040633467, - "loss": 3.1705, - "step": 3626 - }, - { - "epoch": 1.12, - "learning_rate": 0.00021733726134709746, - "loss": 3.1225, - "step": 3627 - }, - { - "epoch": 1.12, - "learning_rate": 0.0002173016322878602, - "loss": 3.0805, - "step": 3628 - }, - { - "epoch": 1.12, - "learning_rate": 0.000217266003228623, - "loss": 3.0205, - "step": 3629 - }, - { - "epoch": 1.12, - "learning_rate": 0.00021723037416938577, - "loss": 3.0141, - "step": 3630 - }, - { - "epoch": 1.12, - "learning_rate": 0.00021719474511014853, - "loss": 3.0227, - "step": 3631 - }, - { - "epoch": 1.12, - "learning_rate": 0.0002171591160509113, - "loss": 3.1335, - "step": 3632 - }, - { - "epoch": 1.12, - "learning_rate": 0.00021712348699167407, - "loss": 2.8616, - "step": 3633 - }, - { - "epoch": 1.12, - "learning_rate": 0.00021708785793243688, - "loss": 2.7484, - "step": 3634 - }, - { - "epoch": 1.12, - "learning_rate": 0.00021705222887319963, - "loss": 2.7113, - "step": 3635 - }, - { - "epoch": 1.12, - "learning_rate": 0.00021701659981396238, - "loss": 2.4392, - "step": 3636 - }, - { - "epoch": 1.12, - "learning_rate": 0.00021698097075472517, - "loss": 2.5949, - "step": 3637 - }, - { - "epoch": 1.12, - "learning_rate": 0.00021694534169548792, - "loss": 2.4943, - "step": 3638 - }, - { - "epoch": 1.12, - "learning_rate": 0.00021690971263625068, - "loss": 2.2826, - "step": 3639 - }, - { - "epoch": 1.12, - "learning_rate": 0.00021687408357701349, - "loss": 2.2249, - "step": 3640 - }, - { - "epoch": 1.12, - "learning_rate": 0.00021683845451777624, - "loss": 5.0556, - "step": 3641 - }, - { - "epoch": 1.12, - "learning_rate": 0.00021680282545853902, - "loss": 4.9701, - "step": 3642 - }, - { - "epoch": 1.12, - "learning_rate": 0.00021676719639930178, - "loss": 4.3252, - "step": 3643 - }, - { - "epoch": 1.12, - "learning_rate": 0.00021673156734006453, - "loss": 4.265, - "step": 3644 - }, - { - "epoch": 1.12, - "learning_rate": 0.00021669593828082734, - "loss": 4.347, - "step": 3645 - }, - { - "epoch": 1.13, - "learning_rate": 0.0002166603092215901, - "loss": 4.2215, - "step": 3646 - }, - { - "epoch": 1.13, - "learning_rate": 0.00021662468016235288, - "loss": 4.1503, - "step": 3647 - }, - { - "epoch": 1.13, - "learning_rate": 0.00021658905110311563, - "loss": 4.1576, - "step": 3648 - }, - { - "epoch": 1.13, - "learning_rate": 0.0002165534220438784, - "loss": 4.1181, - "step": 3649 - }, - { - "epoch": 1.13, - "learning_rate": 0.0002165177929846412, - "loss": 3.8666, - "step": 3650 - }, - { - "epoch": 1.13, - "learning_rate": 0.00021648216392540395, - "loss": 4.0027, - "step": 3651 - }, - { - "epoch": 1.13, - "learning_rate": 0.0002164465348661667, - "loss": 3.9986, - "step": 3652 - }, - { - "epoch": 1.13, - 
"learning_rate": 0.00021641090580692951, - "loss": 3.8255, - "step": 3653 - }, - { - "epoch": 1.13, - "learning_rate": 0.00021637527674769227, - "loss": 3.8742, - "step": 3654 - }, - { - "epoch": 1.13, - "learning_rate": 0.00021633964768845505, - "loss": 3.7998, - "step": 3655 - }, - { - "epoch": 1.13, - "learning_rate": 0.0002163040186292178, - "loss": 3.9956, - "step": 3656 - }, - { - "epoch": 1.13, - "learning_rate": 0.00021626838956998056, - "loss": 3.7248, - "step": 3657 - }, - { - "epoch": 1.13, - "learning_rate": 0.00021623276051074337, - "loss": 3.7784, - "step": 3658 - }, - { - "epoch": 1.13, - "learning_rate": 0.00021619713145150612, - "loss": 3.9118, - "step": 3659 - }, - { - "epoch": 1.13, - "learning_rate": 0.00021616150239226888, - "loss": 3.6909, - "step": 3660 - }, - { - "epoch": 1.13, - "learning_rate": 0.00021612587333303166, - "loss": 3.6122, - "step": 3661 - }, - { - "epoch": 1.13, - "learning_rate": 0.00021609024427379442, - "loss": 3.4199, - "step": 3662 - }, - { - "epoch": 1.13, - "learning_rate": 0.00021605461521455722, - "loss": 3.8526, - "step": 3663 - }, - { - "epoch": 1.13, - "learning_rate": 0.00021601898615531998, - "loss": 3.5979, - "step": 3664 - }, - { - "epoch": 1.13, - "learning_rate": 0.00021598335709608273, - "loss": 3.5794, - "step": 3665 - }, - { - "epoch": 1.13, - "learning_rate": 0.00021594772803684552, - "loss": 3.5851, - "step": 3666 - }, - { - "epoch": 1.13, - "learning_rate": 0.00021591209897760827, - "loss": 3.5038, - "step": 3667 - }, - { - "epoch": 1.13, - "learning_rate": 0.00021587646991837108, - "loss": 3.7951, - "step": 3668 - }, - { - "epoch": 1.13, - "learning_rate": 0.00021584084085913383, - "loss": 3.3288, - "step": 3669 - }, - { - "epoch": 1.13, - "learning_rate": 0.0002158052117998966, - "loss": 3.475, - "step": 3670 - }, - { - "epoch": 1.13, - "learning_rate": 0.00021576958274065937, - "loss": 3.0766, - "step": 3671 - }, - { - "epoch": 1.13, - "learning_rate": 0.00021573395368142213, - "loss": 3.7131, - "step": 3672 - }, - { - "epoch": 1.13, - "learning_rate": 0.0002156983246221849, - "loss": 3.503, - "step": 3673 - }, - { - "epoch": 1.13, - "learning_rate": 0.0002156626955629477, - "loss": 3.4958, - "step": 3674 - }, - { - "epoch": 1.13, - "learning_rate": 0.00021562706650371044, - "loss": 3.3921, - "step": 3675 - }, - { - "epoch": 1.13, - "learning_rate": 0.00021559143744447323, - "loss": 3.2112, - "step": 3676 - }, - { - "epoch": 1.13, - "learning_rate": 0.000215555808385236, - "loss": 2.9011, - "step": 3677 - }, - { - "epoch": 1.14, - "learning_rate": 0.00021552017932599876, - "loss": 3.044, - "step": 3678 - }, - { - "epoch": 1.14, - "learning_rate": 0.00021548455026676154, - "loss": 2.8073, - "step": 3679 - }, - { - "epoch": 1.14, - "learning_rate": 0.0002154489212075243, - "loss": 2.99, - "step": 3680 - }, - { - "epoch": 1.14, - "learning_rate": 0.0002154132921482871, - "loss": 2.8782, - "step": 3681 - }, - { - "epoch": 1.14, - "learning_rate": 0.00021537766308904986, - "loss": 2.7559, - "step": 3682 - }, - { - "epoch": 1.14, - "learning_rate": 0.00021534203402981262, - "loss": 2.8149, - "step": 3683 - }, - { - "epoch": 1.14, - "learning_rate": 0.0002153064049705754, - "loss": 2.7567, - "step": 3684 - }, - { - "epoch": 1.14, - "learning_rate": 0.00021527077591133815, - "loss": 2.542, - "step": 3685 - }, - { - "epoch": 1.14, - "learning_rate": 0.0002152351468521009, - "loss": 2.6179, - "step": 3686 - }, - { - "epoch": 1.14, - "learning_rate": 0.00021519951779286372, - "loss": 2.5535, - "step": 3687 - }, - { - "epoch": 1.14, - 
"learning_rate": 0.00021516388873362647, - "loss": 2.3951, - "step": 3688 - }, - { - "epoch": 1.14, - "learning_rate": 0.00021512825967438925, - "loss": 2.3171, - "step": 3689 - }, - { - "epoch": 1.14, - "learning_rate": 0.000215092630615152, - "loss": 2.3259, - "step": 3690 - }, - { - "epoch": 1.14, - "learning_rate": 0.00021505700155591476, - "loss": 5.1075, - "step": 3691 - }, - { - "epoch": 1.14, - "learning_rate": 0.00021502137249667757, - "loss": 4.8097, - "step": 3692 - }, - { - "epoch": 1.14, - "learning_rate": 0.00021498574343744033, - "loss": 4.2654, - "step": 3693 - }, - { - "epoch": 1.14, - "learning_rate": 0.00021495011437820308, - "loss": 4.2753, - "step": 3694 - }, - { - "epoch": 1.14, - "learning_rate": 0.00021491448531896586, - "loss": 4.0222, - "step": 3695 - }, - { - "epoch": 1.14, - "learning_rate": 0.00021487885625972865, - "loss": 3.9405, - "step": 3696 - }, - { - "epoch": 1.14, - "learning_rate": 0.00021484322720049143, - "loss": 4.1678, - "step": 3697 - }, - { - "epoch": 1.14, - "learning_rate": 0.00021480759814125418, - "loss": 4.1229, - "step": 3698 - }, - { - "epoch": 1.14, - "learning_rate": 0.00021477196908201694, - "loss": 3.9476, - "step": 3699 - }, - { - "epoch": 1.14, - "learning_rate": 0.00021473634002277975, - "loss": 4.0646, - "step": 3700 - }, - { - "epoch": 1.14, - "learning_rate": 0.0002147007109635425, - "loss": 4.1456, - "step": 3701 - }, - { - "epoch": 1.14, - "learning_rate": 0.00021466508190430528, - "loss": 3.6841, - "step": 3702 - }, - { - "epoch": 1.14, - "learning_rate": 0.00021462945284506804, - "loss": 4.0765, - "step": 3703 - }, - { - "epoch": 1.14, - "learning_rate": 0.0002145938237858308, - "loss": 3.8705, - "step": 3704 - }, - { - "epoch": 1.14, - "learning_rate": 0.0002145581947265936, - "loss": 3.9964, - "step": 3705 - }, - { - "epoch": 1.14, - "learning_rate": 0.00021452256566735636, - "loss": 3.6951, - "step": 3706 - }, - { - "epoch": 1.14, - "learning_rate": 0.0002144869366081191, - "loss": 3.8308, - "step": 3707 - }, - { - "epoch": 1.14, - "learning_rate": 0.0002144513075488819, - "loss": 3.8429, - "step": 3708 - }, - { - "epoch": 1.14, - "learning_rate": 0.00021441567848964465, - "loss": 3.7294, - "step": 3709 - }, - { - "epoch": 1.15, - "learning_rate": 0.00021438004943040746, - "loss": 3.6664, - "step": 3710 - }, - { - "epoch": 1.15, - "learning_rate": 0.0002143444203711702, - "loss": 3.8397, - "step": 3711 - }, - { - "epoch": 1.15, - "learning_rate": 0.00021430879131193297, - "loss": 3.4751, - "step": 3712 - }, - { - "epoch": 1.15, - "learning_rate": 0.00021427316225269575, - "loss": 3.6625, - "step": 3713 - }, - { - "epoch": 1.15, - "learning_rate": 0.0002142375331934585, - "loss": 3.6257, - "step": 3714 - }, - { - "epoch": 1.15, - "learning_rate": 0.0002142019041342213, - "loss": 3.4867, - "step": 3715 - }, - { - "epoch": 1.15, - "learning_rate": 0.00021416627507498407, - "loss": 3.5768, - "step": 3716 - }, - { - "epoch": 1.15, - "learning_rate": 0.00021413064601574682, - "loss": 3.5201, - "step": 3717 - }, - { - "epoch": 1.15, - "learning_rate": 0.0002140950169565096, - "loss": 3.3912, - "step": 3718 - }, - { - "epoch": 1.15, - "learning_rate": 0.00021405938789727236, - "loss": 3.4237, - "step": 3719 - }, - { - "epoch": 1.15, - "learning_rate": 0.00021402375883803514, - "loss": 3.277, - "step": 3720 - }, - { - "epoch": 1.15, - "learning_rate": 0.00021398812977879792, - "loss": 3.4923, - "step": 3721 - }, - { - "epoch": 1.15, - "learning_rate": 0.00021395250071956068, - "loss": 3.2683, - "step": 3722 - }, - { - "epoch": 
1.15, - "learning_rate": 0.00021391687166032349, - "loss": 3.5315, - "step": 3723 - }, - { - "epoch": 1.15, - "learning_rate": 0.00021388124260108624, - "loss": 3.4321, - "step": 3724 - }, - { - "epoch": 1.15, - "learning_rate": 0.000213845613541849, - "loss": 3.0989, - "step": 3725 - }, - { - "epoch": 1.15, - "learning_rate": 0.00021380998448261178, - "loss": 3.4685, - "step": 3726 - }, - { - "epoch": 1.15, - "learning_rate": 0.00021377435542337453, - "loss": 2.9722, - "step": 3727 - }, - { - "epoch": 1.15, - "learning_rate": 0.00021373872636413734, - "loss": 3.0785, - "step": 3728 - }, - { - "epoch": 1.15, - "learning_rate": 0.0002137030973049001, - "loss": 2.8634, - "step": 3729 - }, - { - "epoch": 1.15, - "learning_rate": 0.00021366746824566285, - "loss": 3.0073, - "step": 3730 - }, - { - "epoch": 1.15, - "learning_rate": 0.00021363183918642563, - "loss": 2.8957, - "step": 3731 - }, - { - "epoch": 1.15, - "learning_rate": 0.0002135962101271884, - "loss": 2.8998, - "step": 3732 - }, - { - "epoch": 1.15, - "learning_rate": 0.00021356058106795114, - "loss": 2.6544, - "step": 3733 - }, - { - "epoch": 1.15, - "learning_rate": 0.00021352495200871395, - "loss": 2.8827, - "step": 3734 - }, - { - "epoch": 1.15, - "learning_rate": 0.0002134893229494767, - "loss": 2.6104, - "step": 3735 - }, - { - "epoch": 1.15, - "learning_rate": 0.0002134536938902395, - "loss": 2.4351, - "step": 3736 - }, - { - "epoch": 1.15, - "learning_rate": 0.00021341806483100224, - "loss": 2.5664, - "step": 3737 - }, - { - "epoch": 1.15, - "learning_rate": 0.000213382435771765, - "loss": 2.3942, - "step": 3738 - }, - { - "epoch": 1.15, - "learning_rate": 0.0002133468067125278, - "loss": 2.3263, - "step": 3739 - }, - { - "epoch": 1.15, - "learning_rate": 0.00021331117765329056, - "loss": 2.1375, - "step": 3740 - }, - { - "epoch": 1.15, - "learning_rate": 0.00021327554859405332, - "loss": 4.9413, - "step": 3741 - }, - { - "epoch": 1.15, - "learning_rate": 0.0002132399195348161, - "loss": 4.6035, - "step": 3742 - }, - { - "epoch": 1.16, - "learning_rate": 0.00021320429047557888, - "loss": 4.5939, - "step": 3743 - }, - { - "epoch": 1.16, - "learning_rate": 0.00021316866141634166, - "loss": 4.3925, - "step": 3744 - }, - { - "epoch": 1.16, - "learning_rate": 0.00021313303235710442, - "loss": 4.4223, - "step": 3745 - }, - { - "epoch": 1.16, - "learning_rate": 0.00021309740329786717, - "loss": 3.9904, - "step": 3746 - }, - { - "epoch": 1.16, - "learning_rate": 0.00021306177423862998, - "loss": 4.1845, - "step": 3747 - }, - { - "epoch": 1.16, - "learning_rate": 0.00021302614517939273, - "loss": 4.065, - "step": 3748 - }, - { - "epoch": 1.16, - "learning_rate": 0.00021299051612015552, - "loss": 4.0944, - "step": 3749 - }, - { - "epoch": 1.16, - "learning_rate": 0.00021295488706091827, - "loss": 3.8699, - "step": 3750 - }, - { - "epoch": 1.16, - "learning_rate": 0.00021291925800168103, - "loss": 3.9307, - "step": 3751 - }, - { - "epoch": 1.16, - "learning_rate": 0.00021288362894244384, - "loss": 3.8036, - "step": 3752 - }, - { - "epoch": 1.16, - "learning_rate": 0.0002128479998832066, - "loss": 3.9143, - "step": 3753 - }, - { - "epoch": 1.16, - "learning_rate": 0.00021281237082396934, - "loss": 3.8288, - "step": 3754 - }, - { - "epoch": 1.16, - "learning_rate": 0.00021277674176473213, - "loss": 4.1583, - "step": 3755 - }, - { - "epoch": 1.16, - "learning_rate": 0.00021274111270549488, - "loss": 3.6806, - "step": 3756 - }, - { - "epoch": 1.16, - "learning_rate": 0.0002127054836462577, - "loss": 3.8103, - "step": 3757 - }, - { - 
"epoch": 1.16, - "learning_rate": 0.00021266985458702045, - "loss": 3.8431, - "step": 3758 - }, - { - "epoch": 1.16, - "learning_rate": 0.0002126342255277832, - "loss": 3.9103, - "step": 3759 - }, - { - "epoch": 1.16, - "learning_rate": 0.00021259859646854598, - "loss": 3.7451, - "step": 3760 - }, - { - "epoch": 1.16, - "learning_rate": 0.00021256296740930874, - "loss": 3.8477, - "step": 3761 - }, - { - "epoch": 1.16, - "learning_rate": 0.00021252733835007155, - "loss": 3.9205, - "step": 3762 - }, - { - "epoch": 1.16, - "learning_rate": 0.0002124917092908343, - "loss": 3.6388, - "step": 3763 - }, - { - "epoch": 1.16, - "learning_rate": 0.00021245608023159706, - "loss": 3.6408, - "step": 3764 - }, - { - "epoch": 1.16, - "learning_rate": 0.00021242045117235984, - "loss": 3.6903, - "step": 3765 - }, - { - "epoch": 1.16, - "learning_rate": 0.0002123848221131226, - "loss": 3.5758, - "step": 3766 - }, - { - "epoch": 1.16, - "learning_rate": 0.00021234919305388537, - "loss": 3.468, - "step": 3767 - }, - { - "epoch": 1.16, - "learning_rate": 0.00021231356399464816, - "loss": 3.546, - "step": 3768 - }, - { - "epoch": 1.16, - "learning_rate": 0.0002122779349354109, - "loss": 3.443, - "step": 3769 - }, - { - "epoch": 1.16, - "learning_rate": 0.00021224230587617372, - "loss": 3.4192, - "step": 3770 - }, - { - "epoch": 1.16, - "learning_rate": 0.00021220667681693647, - "loss": 3.3478, - "step": 3771 - }, - { - "epoch": 1.16, - "learning_rate": 0.00021217104775769923, - "loss": 3.3836, - "step": 3772 - }, - { - "epoch": 1.16, - "learning_rate": 0.000212135418698462, - "loss": 3.4257, - "step": 3773 - }, - { - "epoch": 1.16, - "learning_rate": 0.00021209978963922477, - "loss": 3.5594, - "step": 3774 - }, - { - "epoch": 1.17, - "learning_rate": 0.00021206416057998752, - "loss": 3.2849, - "step": 3775 - }, - { - "epoch": 1.17, - "learning_rate": 0.00021202853152075033, - "loss": 3.1643, - "step": 3776 - }, - { - "epoch": 1.17, - "learning_rate": 0.00021199290246151308, - "loss": 3.2061, - "step": 3777 - }, - { - "epoch": 1.17, - "learning_rate": 0.00021195727340227587, - "loss": 3.1215, - "step": 3778 - }, - { - "epoch": 1.17, - "learning_rate": 0.00021192164434303862, - "loss": 2.9831, - "step": 3779 - }, - { - "epoch": 1.17, - "learning_rate": 0.00021188601528380138, - "loss": 2.9483, - "step": 3780 - }, - { - "epoch": 1.17, - "learning_rate": 0.00021185038622456418, - "loss": 3.0607, - "step": 3781 - }, - { - "epoch": 1.17, - "learning_rate": 0.00021181475716532694, - "loss": 2.958, - "step": 3782 - }, - { - "epoch": 1.17, - "learning_rate": 0.00021177912810608972, - "loss": 2.9157, - "step": 3783 - }, - { - "epoch": 1.17, - "learning_rate": 0.00021174349904685248, - "loss": 2.597, - "step": 3784 - }, - { - "epoch": 1.17, - "learning_rate": 0.00021170786998761523, - "loss": 2.7457, - "step": 3785 - }, - { - "epoch": 1.17, - "learning_rate": 0.00021167224092837804, - "loss": 2.4981, - "step": 3786 - }, - { - "epoch": 1.17, - "learning_rate": 0.0002116366118691408, - "loss": 2.4557, - "step": 3787 - }, - { - "epoch": 1.17, - "learning_rate": 0.00021160098280990355, - "loss": 2.5203, - "step": 3788 - }, - { - "epoch": 1.17, - "learning_rate": 0.00021156535375066633, - "loss": 2.3898, - "step": 3789 - }, - { - "epoch": 1.17, - "learning_rate": 0.0002115297246914291, - "loss": 2.2686, - "step": 3790 - }, - { - "epoch": 1.17, - "learning_rate": 0.0002114940956321919, - "loss": 4.8405, - "step": 3791 - }, - { - "epoch": 1.17, - "learning_rate": 0.00021145846657295465, - "loss": 4.7778, - "step": 3792 - }, - { 
- "epoch": 1.17, - "learning_rate": 0.0002114228375137174, - "loss": 4.5258, - "step": 3793 - }, - { - "epoch": 1.17, - "learning_rate": 0.0002113872084544802, - "loss": 4.1723, - "step": 3794 - }, - { - "epoch": 1.17, - "learning_rate": 0.00021135157939524297, - "loss": 4.0683, - "step": 3795 - }, - { - "epoch": 1.17, - "learning_rate": 0.00021131595033600575, - "loss": 4.0587, - "step": 3796 - }, - { - "epoch": 1.17, - "learning_rate": 0.0002112803212767685, - "loss": 3.9326, - "step": 3797 - }, - { - "epoch": 1.17, - "learning_rate": 0.00021124469221753126, - "loss": 3.9666, - "step": 3798 - }, - { - "epoch": 1.17, - "learning_rate": 0.00021120906315829407, - "loss": 3.7685, - "step": 3799 - }, - { - "epoch": 1.17, - "learning_rate": 0.00021117343409905682, - "loss": 3.6005, - "step": 3800 - }, - { - "epoch": 1.17, - "learning_rate": 0.00021113780503981958, - "loss": 4.0159, - "step": 3801 - }, - { - "epoch": 1.17, - "learning_rate": 0.00021110217598058236, - "loss": 3.7804, - "step": 3802 - }, - { - "epoch": 1.17, - "learning_rate": 0.00021106654692134511, - "loss": 3.6372, - "step": 3803 - }, - { - "epoch": 1.17, - "learning_rate": 0.00021103091786210792, - "loss": 3.793, - "step": 3804 - }, - { - "epoch": 1.17, - "learning_rate": 0.00021099528880287068, - "loss": 3.9444, - "step": 3805 - }, - { - "epoch": 1.17, - "learning_rate": 0.00021095965974363343, - "loss": 3.8123, - "step": 3806 - }, - { - "epoch": 1.17, - "learning_rate": 0.00021092403068439622, - "loss": 3.4506, - "step": 3807 - }, - { - "epoch": 1.18, - "learning_rate": 0.00021088840162515897, - "loss": 3.643, - "step": 3808 - }, - { - "epoch": 1.18, - "learning_rate": 0.00021085277256592172, - "loss": 3.816, - "step": 3809 - }, - { - "epoch": 1.18, - "learning_rate": 0.00021081714350668453, - "loss": 3.7531, - "step": 3810 - }, - { - "epoch": 1.18, - "learning_rate": 0.0002107815144474473, - "loss": 3.67, - "step": 3811 - }, - { - "epoch": 1.18, - "learning_rate": 0.00021074588538821007, - "loss": 3.422, - "step": 3812 - }, - { - "epoch": 1.18, - "learning_rate": 0.00021071025632897285, - "loss": 3.6633, - "step": 3813 - }, - { - "epoch": 1.18, - "learning_rate": 0.0002106746272697356, - "loss": 3.4557, - "step": 3814 - }, - { - "epoch": 1.18, - "learning_rate": 0.0002106389982104984, - "loss": 3.6747, - "step": 3815 - }, - { - "epoch": 1.18, - "learning_rate": 0.00021060336915126114, - "loss": 3.3116, - "step": 3816 - }, - { - "epoch": 1.18, - "learning_rate": 0.00021056774009202395, - "loss": 3.5444, - "step": 3817 - }, - { - "epoch": 1.18, - "learning_rate": 0.0002105321110327867, - "loss": 3.4997, - "step": 3818 - }, - { - "epoch": 1.18, - "learning_rate": 0.00021049648197354946, - "loss": 3.6271, - "step": 3819 - }, - { - "epoch": 1.18, - "learning_rate": 0.00021046085291431224, - "loss": 3.3925, - "step": 3820 - }, - { - "epoch": 1.18, - "learning_rate": 0.000210425223855075, - "loss": 3.476, - "step": 3821 - }, - { - "epoch": 1.18, - "learning_rate": 0.00021038959479583775, - "loss": 3.3939, - "step": 3822 - }, - { - "epoch": 1.18, - "learning_rate": 0.00021035396573660056, - "loss": 3.3488, - "step": 3823 - }, - { - "epoch": 1.18, - "learning_rate": 0.00021031833667736332, - "loss": 3.1716, - "step": 3824 - }, - { - "epoch": 1.18, - "learning_rate": 0.0002102827076181261, - "loss": 3.1065, - "step": 3825 - }, - { - "epoch": 1.18, - "learning_rate": 0.00021024707855888885, - "loss": 3.0623, - "step": 3826 - }, - { - "epoch": 1.18, - "learning_rate": 0.0002102114494996516, - "loss": 3.1655, - "step": 3827 - }, - { - 
"epoch": 1.18, - "learning_rate": 0.00021017582044041442, - "loss": 2.9909, - "step": 3828 - }, - { - "epoch": 1.18, - "learning_rate": 0.00021014019138117717, - "loss": 2.9355, - "step": 3829 - }, - { - "epoch": 1.18, - "learning_rate": 0.00021010456232193995, - "loss": 3.0908, - "step": 3830 - }, - { - "epoch": 1.18, - "learning_rate": 0.0002100689332627027, - "loss": 2.9964, - "step": 3831 - }, - { - "epoch": 1.18, - "learning_rate": 0.00021003330420346546, - "loss": 3.048, - "step": 3832 - }, - { - "epoch": 1.18, - "learning_rate": 0.00020999767514422827, - "loss": 2.7156, - "step": 3833 - }, - { - "epoch": 1.18, - "learning_rate": 0.00020996204608499103, - "loss": 2.7042, - "step": 3834 - }, - { - "epoch": 1.18, - "learning_rate": 0.00020992641702575378, - "loss": 2.7597, - "step": 3835 - }, - { - "epoch": 1.18, - "learning_rate": 0.00020989078796651656, - "loss": 2.4959, - "step": 3836 - }, - { - "epoch": 1.18, - "learning_rate": 0.00020985515890727935, - "loss": 2.572, - "step": 3837 - }, - { - "epoch": 1.18, - "learning_rate": 0.00020981952984804213, - "loss": 2.3327, - "step": 3838 - }, - { - "epoch": 1.18, - "learning_rate": 0.00020978390078880488, - "loss": 2.3436, - "step": 3839 - }, - { - "epoch": 1.19, - "learning_rate": 0.00020974827172956764, - "loss": 2.111, - "step": 3840 - }, - { - "epoch": 1.19, - "learning_rate": 0.00020971264267033045, - "loss": 4.9518, - "step": 3841 - }, - { - "epoch": 1.19, - "learning_rate": 0.0002096770136110932, - "loss": 4.7616, - "step": 3842 - }, - { - "epoch": 1.19, - "learning_rate": 0.00020964138455185598, - "loss": 4.2814, - "step": 3843 - }, - { - "epoch": 1.19, - "learning_rate": 0.00020960575549261874, - "loss": 4.5324, - "step": 3844 - }, - { - "epoch": 1.19, - "learning_rate": 0.0002095701264333815, - "loss": 4.1269, - "step": 3845 - }, - { - "epoch": 1.19, - "learning_rate": 0.0002095344973741443, - "loss": 4.0965, - "step": 3846 - }, - { - "epoch": 1.19, - "learning_rate": 0.00020949886831490706, - "loss": 4.0684, - "step": 3847 - }, - { - "epoch": 1.19, - "learning_rate": 0.0002094632392556698, - "loss": 4.1811, - "step": 3848 - }, - { - "epoch": 1.19, - "learning_rate": 0.0002094276101964326, - "loss": 3.8931, - "step": 3849 - }, - { - "epoch": 1.19, - "learning_rate": 0.00020939198113719535, - "loss": 3.8323, - "step": 3850 - }, - { - "epoch": 1.19, - "learning_rate": 0.00020935635207795816, - "loss": 4.1512, - "step": 3851 - }, - { - "epoch": 1.19, - "learning_rate": 0.0002093207230187209, - "loss": 3.9295, - "step": 3852 - }, - { - "epoch": 1.19, - "learning_rate": 0.00020928509395948367, - "loss": 3.6747, - "step": 3853 - }, - { - "epoch": 1.19, - "learning_rate": 0.00020924946490024645, - "loss": 3.6895, - "step": 3854 - }, - { - "epoch": 1.19, - "learning_rate": 0.0002092138358410092, - "loss": 3.9206, - "step": 3855 - }, - { - "epoch": 1.19, - "learning_rate": 0.00020917820678177196, - "loss": 3.9048, - "step": 3856 - }, - { - "epoch": 1.19, - "learning_rate": 0.00020914257772253477, - "loss": 3.7613, - "step": 3857 - }, - { - "epoch": 1.19, - "learning_rate": 0.00020910694866329752, - "loss": 3.7803, - "step": 3858 - }, - { - "epoch": 1.19, - "learning_rate": 0.0002090713196040603, - "loss": 3.5204, - "step": 3859 - }, - { - "epoch": 1.19, - "learning_rate": 0.00020903569054482309, - "loss": 3.7355, - "step": 3860 - }, - { - "epoch": 1.19, - "learning_rate": 0.00020900006148558584, - "loss": 3.6747, - "step": 3861 - }, - { - "epoch": 1.19, - "learning_rate": 0.00020896443242634862, - "loss": 3.4319, - "step": 3862 - }, - 
{ - "epoch": 1.19, - "learning_rate": 0.00020892880336711138, - "loss": 3.523, - "step": 3863 - }, - { - "epoch": 1.19, - "learning_rate": 0.00020889317430787419, - "loss": 3.4138, - "step": 3864 - }, - { - "epoch": 1.19, - "learning_rate": 0.00020885754524863694, - "loss": 3.5175, - "step": 3865 - }, - { - "epoch": 1.19, - "learning_rate": 0.0002088219161893997, - "loss": 3.5736, - "step": 3866 - }, - { - "epoch": 1.19, - "learning_rate": 0.00020878628713016248, - "loss": 3.2976, - "step": 3867 - }, - { - "epoch": 1.19, - "learning_rate": 0.00020875065807092523, - "loss": 3.6764, - "step": 3868 - }, - { - "epoch": 1.19, - "learning_rate": 0.000208715029011688, - "loss": 3.5368, - "step": 3869 - }, - { - "epoch": 1.19, - "learning_rate": 0.0002086793999524508, - "loss": 3.3701, - "step": 3870 - }, - { - "epoch": 1.19, - "learning_rate": 0.00020864377089321355, - "loss": 3.4867, - "step": 3871 - }, - { - "epoch": 1.2, - "learning_rate": 0.00020860814183397633, - "loss": 3.3768, - "step": 3872 - }, - { - "epoch": 1.2, - "learning_rate": 0.0002085725127747391, - "loss": 3.3416, - "step": 3873 - }, - { - "epoch": 1.2, - "learning_rate": 0.00020853688371550184, - "loss": 3.4187, - "step": 3874 - }, - { - "epoch": 1.2, - "learning_rate": 0.00020850125465626465, - "loss": 3.244, - "step": 3875 - }, - { - "epoch": 1.2, - "learning_rate": 0.0002084656255970274, - "loss": 3.2067, - "step": 3876 - }, - { - "epoch": 1.2, - "learning_rate": 0.0002084299965377902, - "loss": 3.2976, - "step": 3877 - }, - { - "epoch": 1.2, - "learning_rate": 0.00020839436747855294, - "loss": 3.1766, - "step": 3878 - }, - { - "epoch": 1.2, - "learning_rate": 0.0002083587384193157, - "loss": 3.2158, - "step": 3879 - }, - { - "epoch": 1.2, - "learning_rate": 0.0002083231093600785, - "loss": 2.9337, - "step": 3880 - }, - { - "epoch": 1.2, - "learning_rate": 0.00020828748030084126, - "loss": 2.9373, - "step": 3881 - }, - { - "epoch": 1.2, - "learning_rate": 0.00020825185124160402, - "loss": 2.8662, - "step": 3882 - }, - { - "epoch": 1.2, - "learning_rate": 0.0002082162221823668, - "loss": 2.8634, - "step": 3883 - }, - { - "epoch": 1.2, - "learning_rate": 0.00020818059312312958, - "loss": 2.753, - "step": 3884 - }, - { - "epoch": 1.2, - "learning_rate": 0.00020814496406389236, - "loss": 2.5606, - "step": 3885 - }, - { - "epoch": 1.2, - "learning_rate": 0.00020810933500465512, - "loss": 2.5708, - "step": 3886 - }, - { - "epoch": 1.2, - "learning_rate": 0.00020807370594541787, - "loss": 2.4448, - "step": 3887 - }, - { - "epoch": 1.2, - "learning_rate": 0.00020803807688618068, - "loss": 2.3447, - "step": 3888 - }, - { - "epoch": 1.2, - "learning_rate": 0.00020800244782694343, - "loss": 2.1252, - "step": 3889 - }, - { - "epoch": 1.2, - "learning_rate": 0.0002079668187677062, - "loss": 2.0913, - "step": 3890 - }, - { - "epoch": 1.2, - "learning_rate": 0.00020793118970846897, - "loss": 4.6698, - "step": 3891 - }, - { - "epoch": 1.2, - "learning_rate": 0.00020789556064923173, - "loss": 4.6599, - "step": 3892 - }, - { - "epoch": 1.2, - "learning_rate": 0.00020785993158999453, - "loss": 4.2264, - "step": 3893 - }, - { - "epoch": 1.2, - "learning_rate": 0.0002078243025307573, - "loss": 4.3761, - "step": 3894 - }, - { - "epoch": 1.2, - "learning_rate": 0.00020778867347152004, - "loss": 4.0745, - "step": 3895 - }, - { - "epoch": 1.2, - "learning_rate": 0.00020775304441228283, - "loss": 3.9686, - "step": 3896 - }, - { - "epoch": 1.2, - "learning_rate": 0.00020771741535304558, - "loss": 3.773, - "step": 3897 - }, - { - "epoch": 1.2, - 
"learning_rate": 0.0002076817862938084, - "loss": 3.8665, - "step": 3898 - }, - { - "epoch": 1.2, - "learning_rate": 0.00020764615723457114, - "loss": 3.7586, - "step": 3899 - }, - { - "epoch": 1.2, - "learning_rate": 0.0002076105281753339, - "loss": 3.9404, - "step": 3900 - }, - { - "epoch": 1.2, - "learning_rate": 0.00020757489911609668, - "loss": 4.0543, - "step": 3901 - }, - { - "epoch": 1.2, - "learning_rate": 0.00020753927005685944, - "loss": 4.041, - "step": 3902 - }, - { - "epoch": 1.2, - "learning_rate": 0.00020750364099762222, - "loss": 3.5808, - "step": 3903 - }, - { - "epoch": 1.2, - "learning_rate": 0.000207468011938385, - "loss": 3.5349, - "step": 3904 - }, - { - "epoch": 1.21, - "learning_rate": 0.00020743238287914775, - "loss": 3.6957, - "step": 3905 - }, - { - "epoch": 1.21, - "learning_rate": 0.00020739675381991054, - "loss": 3.6008, - "step": 3906 - }, - { - "epoch": 1.21, - "learning_rate": 0.00020736112476067332, - "loss": 3.7735, - "step": 3907 - }, - { - "epoch": 1.21, - "learning_rate": 0.00020732549570143607, - "loss": 3.5541, - "step": 3908 - }, - { - "epoch": 1.21, - "learning_rate": 0.00020728986664219885, - "loss": 3.6948, - "step": 3909 - }, - { - "epoch": 1.21, - "learning_rate": 0.0002072542375829616, - "loss": 3.6745, - "step": 3910 - }, - { - "epoch": 1.21, - "learning_rate": 0.00020721860852372442, - "loss": 3.5198, - "step": 3911 - }, - { - "epoch": 1.21, - "learning_rate": 0.00020718297946448717, - "loss": 3.6279, - "step": 3912 - }, - { - "epoch": 1.21, - "learning_rate": 0.00020714735040524993, - "loss": 3.4345, - "step": 3913 - }, - { - "epoch": 1.21, - "learning_rate": 0.0002071117213460127, - "loss": 3.4261, - "step": 3914 - }, - { - "epoch": 1.21, - "learning_rate": 0.00020707609228677546, - "loss": 3.5164, - "step": 3915 - }, - { - "epoch": 1.21, - "learning_rate": 0.00020704046322753822, - "loss": 3.5485, - "step": 3916 - }, - { - "epoch": 1.21, - "learning_rate": 0.00020700483416830103, - "loss": 3.7133, - "step": 3917 - }, - { - "epoch": 1.21, - "learning_rate": 0.00020696920510906378, - "loss": 3.5885, - "step": 3918 - }, - { - "epoch": 1.21, - "learning_rate": 0.00020693357604982657, - "loss": 3.3758, - "step": 3919 - }, - { - "epoch": 1.21, - "learning_rate": 0.00020689794699058932, - "loss": 3.2216, - "step": 3920 - }, - { - "epoch": 1.21, - "learning_rate": 0.00020686231793135207, - "loss": 3.4085, - "step": 3921 - }, - { - "epoch": 1.21, - "learning_rate": 0.00020682668887211488, - "loss": 3.4565, - "step": 3922 - }, - { - "epoch": 1.21, - "learning_rate": 0.00020679105981287764, - "loss": 3.0165, - "step": 3923 - }, - { - "epoch": 1.21, - "learning_rate": 0.00020675543075364042, - "loss": 3.3217, - "step": 3924 - }, - { - "epoch": 1.21, - "learning_rate": 0.00020671980169440318, - "loss": 3.1182, - "step": 3925 - }, - { - "epoch": 1.21, - "learning_rate": 0.00020668417263516593, - "loss": 3.234, - "step": 3926 - }, - { - "epoch": 1.21, - "learning_rate": 0.00020664854357592874, - "loss": 2.9308, - "step": 3927 - }, - { - "epoch": 1.21, - "learning_rate": 0.0002066129145166915, - "loss": 3.0146, - "step": 3928 - }, - { - "epoch": 1.21, - "learning_rate": 0.00020657728545745425, - "loss": 3.138, - "step": 3929 - }, - { - "epoch": 1.21, - "learning_rate": 0.00020654165639821703, - "loss": 2.8432, - "step": 3930 - }, - { - "epoch": 1.21, - "learning_rate": 0.0002065060273389798, - "loss": 2.9042, - "step": 3931 - }, - { - "epoch": 1.21, - "learning_rate": 0.0002064703982797426, - "loss": 2.7021, - "step": 3932 - }, - { - "epoch": 1.21, - 
"learning_rate": 0.00020643476922050535, - "loss": 2.7552, - "step": 3933 - }, - { - "epoch": 1.21, - "learning_rate": 0.0002063991401612681, - "loss": 2.5438, - "step": 3934 - }, - { - "epoch": 1.21, - "learning_rate": 0.0002063635111020309, - "loss": 2.4752, - "step": 3935 - }, - { - "epoch": 1.21, - "learning_rate": 0.00020632788204279367, - "loss": 2.5751, - "step": 3936 - }, - { - "epoch": 1.22, - "learning_rate": 0.00020629225298355642, - "loss": 2.3778, - "step": 3937 - }, - { - "epoch": 1.22, - "learning_rate": 0.0002062566239243192, - "loss": 2.3704, - "step": 3938 - }, - { - "epoch": 1.22, - "learning_rate": 0.00020622099486508196, - "loss": 2.1014, - "step": 3939 - }, - { - "epoch": 1.22, - "learning_rate": 0.00020618536580584477, - "loss": 2.3672, - "step": 3940 - }, - { - "epoch": 1.22, - "learning_rate": 0.00020614973674660752, - "loss": 4.9756, - "step": 3941 - }, - { - "epoch": 1.22, - "learning_rate": 0.00020611410768737028, - "loss": 4.7258, - "step": 3942 - }, - { - "epoch": 1.22, - "learning_rate": 0.00020607847862813306, - "loss": 4.5277, - "step": 3943 - }, - { - "epoch": 1.22, - "learning_rate": 0.00020604284956889581, - "loss": 4.2526, - "step": 3944 - }, - { - "epoch": 1.22, - "learning_rate": 0.00020600722050965862, - "loss": 4.0235, - "step": 3945 - }, - { - "epoch": 1.22, - "learning_rate": 0.00020597159145042138, - "loss": 4.1879, - "step": 3946 - }, - { - "epoch": 1.22, - "learning_rate": 0.00020593596239118413, - "loss": 3.9848, - "step": 3947 - }, - { - "epoch": 1.22, - "learning_rate": 0.00020590033333194691, - "loss": 3.7477, - "step": 3948 - }, - { - "epoch": 1.22, - "learning_rate": 0.00020586470427270967, - "loss": 3.7518, - "step": 3949 - }, - { - "epoch": 1.22, - "learning_rate": 0.00020582907521347245, - "loss": 4.0188, - "step": 3950 - }, - { - "epoch": 1.22, - "learning_rate": 0.00020579344615423523, - "loss": 4.0866, - "step": 3951 - }, - { - "epoch": 1.22, - "learning_rate": 0.000205757817094998, - "loss": 3.7769, - "step": 3952 - }, - { - "epoch": 1.22, - "learning_rate": 0.00020572218803576077, - "loss": 3.893, - "step": 3953 - }, - { - "epoch": 1.22, - "learning_rate": 0.00020568655897652355, - "loss": 3.876, - "step": 3954 - }, - { - "epoch": 1.22, - "learning_rate": 0.0002056509299172863, - "loss": 3.7795, - "step": 3955 - }, - { - "epoch": 1.22, - "learning_rate": 0.0002056153008580491, - "loss": 3.726, - "step": 3956 - }, - { - "epoch": 1.22, - "learning_rate": 0.00020557967179881184, - "loss": 3.7535, - "step": 3957 - }, - { - "epoch": 1.22, - "learning_rate": 0.00020554404273957465, - "loss": 3.7489, - "step": 3958 - }, - { - "epoch": 1.22, - "learning_rate": 0.0002055084136803374, - "loss": 3.7134, - "step": 3959 - }, - { - "epoch": 1.22, - "learning_rate": 0.00020547278462110016, - "loss": 3.4635, - "step": 3960 - }, - { - "epoch": 1.22, - "learning_rate": 0.00020543715556186294, - "loss": 3.6018, - "step": 3961 - }, - { - "epoch": 1.22, - "learning_rate": 0.0002054015265026257, - "loss": 3.4967, - "step": 3962 - }, - { - "epoch": 1.22, - "learning_rate": 0.00020536589744338845, - "loss": 3.642, - "step": 3963 - }, - { - "epoch": 1.22, - "learning_rate": 0.00020533026838415126, - "loss": 3.5785, - "step": 3964 - }, - { - "epoch": 1.22, - "learning_rate": 0.00020529463932491402, - "loss": 3.4768, - "step": 3965 - }, - { - "epoch": 1.22, - "learning_rate": 0.0002052590102656768, - "loss": 3.3572, - "step": 3966 - }, - { - "epoch": 1.22, - "learning_rate": 0.00020522338120643955, - "loss": 3.5165, - "step": 3967 - }, - { - "epoch": 1.22, 
- "learning_rate": 0.0002051877521472023, - "loss": 3.4371, - "step": 3968 - }, - { - "epoch": 1.22, - "learning_rate": 0.00020515212308796512, - "loss": 3.4396, - "step": 3969 - }, - { - "epoch": 1.23, - "learning_rate": 0.00020511649402872787, - "loss": 3.2334, - "step": 3970 - }, - { - "epoch": 1.23, - "learning_rate": 0.00020508086496949063, - "loss": 3.4137, - "step": 3971 - }, - { - "epoch": 1.23, - "learning_rate": 0.0002050452359102534, - "loss": 3.4548, - "step": 3972 - }, - { - "epoch": 1.23, - "learning_rate": 0.00020500960685101616, - "loss": 3.1184, - "step": 3973 - }, - { - "epoch": 1.23, - "learning_rate": 0.00020497397779177897, - "loss": 3.4517, - "step": 3974 - }, - { - "epoch": 1.23, - "learning_rate": 0.00020493834873254173, - "loss": 3.1708, - "step": 3975 - }, - { - "epoch": 1.23, - "learning_rate": 0.00020490271967330448, - "loss": 2.9798, - "step": 3976 - }, - { - "epoch": 1.23, - "learning_rate": 0.00020486709061406726, - "loss": 3.0701, - "step": 3977 - }, - { - "epoch": 1.23, - "learning_rate": 0.00020483146155483005, - "loss": 3.0844, - "step": 3978 - }, - { - "epoch": 1.23, - "learning_rate": 0.00020479583249559283, - "loss": 3.0765, - "step": 3979 - }, - { - "epoch": 1.23, - "learning_rate": 0.00020476020343635558, - "loss": 2.9867, - "step": 3980 - }, - { - "epoch": 1.23, - "learning_rate": 0.00020472457437711834, - "loss": 3.0396, - "step": 3981 - }, - { - "epoch": 1.23, - "learning_rate": 0.00020468894531788115, - "loss": 2.9261, - "step": 3982 - }, - { - "epoch": 1.23, - "learning_rate": 0.0002046533162586439, - "loss": 2.8977, - "step": 3983 - }, - { - "epoch": 1.23, - "learning_rate": 0.00020461768719940666, - "loss": 2.8621, - "step": 3984 - }, - { - "epoch": 1.23, - "learning_rate": 0.00020458205814016944, - "loss": 2.6248, - "step": 3985 - }, - { - "epoch": 1.23, - "learning_rate": 0.0002045464290809322, - "loss": 2.5816, - "step": 3986 - }, - { - "epoch": 1.23, - "learning_rate": 0.000204510800021695, - "loss": 2.5025, - "step": 3987 - }, - { - "epoch": 1.23, - "learning_rate": 0.00020447517096245776, - "loss": 2.2911, - "step": 3988 - }, - { - "epoch": 1.23, - "learning_rate": 0.0002044395419032205, - "loss": 2.2851, - "step": 3989 - }, - { - "epoch": 1.23, - "learning_rate": 0.0002044039128439833, - "loss": 1.9868, - "step": 3990 - }, - { - "epoch": 1.23, - "learning_rate": 0.00020436828378474605, - "loss": 4.8458, - "step": 3991 - }, - { - "epoch": 1.23, - "learning_rate": 0.00020433265472550886, - "loss": 4.4478, - "step": 3992 - }, - { - "epoch": 1.23, - "learning_rate": 0.0002042970256662716, - "loss": 4.4579, - "step": 3993 - }, - { - "epoch": 1.23, - "learning_rate": 0.00020426139660703437, - "loss": 4.2727, - "step": 3994 - }, - { - "epoch": 1.23, - "learning_rate": 0.00020422576754779715, - "loss": 4.3846, - "step": 3995 - }, - { - "epoch": 1.23, - "learning_rate": 0.0002041901384885599, - "loss": 4.1739, - "step": 3996 - }, - { - "epoch": 1.23, - "learning_rate": 0.00020415450942932268, - "loss": 4.2414, - "step": 3997 - }, - { - "epoch": 1.23, - "learning_rate": 0.00020411888037008547, - "loss": 3.9333, - "step": 3998 - }, - { - "epoch": 1.23, - "learning_rate": 0.00020408325131084822, - "loss": 3.6524, - "step": 3999 - }, - { - "epoch": 1.23, - "learning_rate": 0.000204047622251611, - "loss": 3.8403, - "step": 4000 - }, - { - "epoch": 1.23, - "eval_bleu": 4.614693226090912e-15, - "eval_loss": 4.254786491394043, - "eval_runtime": 2559.2675, - "eval_samples_per_second": 5.767, - "eval_steps_per_second": 0.721, - "step": 4000 - }, - { - 
"epoch": 1.23, - "learning_rate": 0.00020401199319237378, - "loss": 3.9412, - "step": 4001 - }, - { - "epoch": 1.24, - "learning_rate": 0.00020397636413313654, - "loss": 3.9411, - "step": 4002 - }, - { - "epoch": 1.24, - "learning_rate": 0.00020394073507389932, - "loss": 3.9369, - "step": 4003 - }, - { - "epoch": 1.24, - "learning_rate": 0.00020390510601466208, - "loss": 3.7577, - "step": 4004 - }, - { - "epoch": 1.24, - "learning_rate": 0.00020386947695542483, - "loss": 3.7133, - "step": 4005 - }, - { - "epoch": 1.24, - "learning_rate": 0.00020383384789618764, - "loss": 3.5817, - "step": 4006 - }, - { - "epoch": 1.24, - "learning_rate": 0.0002037982188369504, - "loss": 3.7704, - "step": 4007 - }, - { - "epoch": 1.24, - "learning_rate": 0.00020376258977771318, - "loss": 3.6969, - "step": 4008 - }, - { - "epoch": 1.24, - "learning_rate": 0.00020372696071847593, - "loss": 3.5987, - "step": 4009 - }, - { - "epoch": 1.24, - "learning_rate": 0.00020369133165923869, - "loss": 3.5088, - "step": 4010 - }, - { - "epoch": 1.24, - "learning_rate": 0.0002036557026000015, - "loss": 3.5038, - "step": 4011 - }, - { - "epoch": 1.24, - "learning_rate": 0.00020362007354076425, - "loss": 3.608, - "step": 4012 - }, - { - "epoch": 1.24, - "learning_rate": 0.00020358444448152703, - "loss": 3.6325, - "step": 4013 - }, - { - "epoch": 1.24, - "learning_rate": 0.00020354881542228979, - "loss": 3.4694, - "step": 4014 - }, - { - "epoch": 1.24, - "learning_rate": 0.00020351318636305254, - "loss": 3.8725, - "step": 4015 - }, - { - "epoch": 1.24, - "learning_rate": 0.00020347755730381535, - "loss": 3.5531, - "step": 4016 - }, - { - "epoch": 1.24, - "learning_rate": 0.0002034419282445781, - "loss": 3.416, - "step": 4017 - }, - { - "epoch": 1.24, - "learning_rate": 0.00020340629918534086, - "loss": 3.6539, - "step": 4018 - }, - { - "epoch": 1.24, - "learning_rate": 0.00020337067012610364, - "loss": 3.5366, - "step": 4019 - }, - { - "epoch": 1.24, - "learning_rate": 0.0002033350410668664, - "loss": 3.3454, - "step": 4020 - }, - { - "epoch": 1.24, - "learning_rate": 0.0002032994120076292, - "loss": 3.2973, - "step": 4021 - }, - { - "epoch": 1.24, - "learning_rate": 0.00020326378294839196, - "loss": 3.2673, - "step": 4022 - }, - { - "epoch": 1.24, - "learning_rate": 0.00020322815388915471, - "loss": 3.5005, - "step": 4023 - }, - { - "epoch": 1.24, - "learning_rate": 0.0002031925248299175, - "loss": 3.1922, - "step": 4024 - }, - { - "epoch": 1.24, - "learning_rate": 0.00020315689577068028, - "loss": 3.2475, - "step": 4025 - }, - { - "epoch": 1.24, - "learning_rate": 0.00020312126671144306, - "loss": 2.9176, - "step": 4026 - }, - { - "epoch": 1.24, - "learning_rate": 0.00020308563765220582, - "loss": 3.1856, - "step": 4027 - }, - { - "epoch": 1.24, - "learning_rate": 0.00020305000859296857, - "loss": 2.9013, - "step": 4028 - }, - { - "epoch": 1.24, - "learning_rate": 0.00020301437953373138, - "loss": 2.9738, - "step": 4029 - }, - { - "epoch": 1.24, - "learning_rate": 0.00020297875047449413, - "loss": 3.222, - "step": 4030 - }, - { - "epoch": 1.24, - "learning_rate": 0.0002029431214152569, - "loss": 2.8784, - "step": 4031 - }, - { - "epoch": 1.24, - "learning_rate": 0.00020290749235601967, - "loss": 2.9746, - "step": 4032 - }, - { - "epoch": 1.24, - "learning_rate": 0.00020287186329678243, - "loss": 2.8643, - "step": 4033 - }, - { - "epoch": 1.25, - "learning_rate": 0.00020283623423754523, - "loss": 2.512, - "step": 4034 - }, - { - "epoch": 1.25, - "learning_rate": 0.000202800605178308, - "loss": 2.6302, - "step": 4035 - }, - { 
- "epoch": 1.25, - "learning_rate": 0.00020276497611907074, - "loss": 2.5145, - "step": 4036 - }, - { - "epoch": 1.25, - "learning_rate": 0.00020272934705983353, - "loss": 2.3529, - "step": 4037 - }, - { - "epoch": 1.25, - "learning_rate": 0.00020269371800059628, - "loss": 2.5676, - "step": 4038 - }, - { - "epoch": 1.25, - "learning_rate": 0.0002026580889413591, - "loss": 2.267, - "step": 4039 - }, - { - "epoch": 1.25, - "learning_rate": 0.00020262245988212184, - "loss": 2.1623, - "step": 4040 - }, - { - "epoch": 1.25, - "learning_rate": 0.0002025868308228846, - "loss": 5.1702, - "step": 4041 - }, - { - "epoch": 1.25, - "learning_rate": 0.00020255120176364738, - "loss": 4.637, - "step": 4042 - }, - { - "epoch": 1.25, - "learning_rate": 0.00020251557270441014, - "loss": 4.1834, - "step": 4043 - }, - { - "epoch": 1.25, - "learning_rate": 0.00020247994364517292, - "loss": 4.2682, - "step": 4044 - }, - { - "epoch": 1.25, - "learning_rate": 0.0002024443145859357, - "loss": 3.768, - "step": 4045 - }, - { - "epoch": 1.25, - "learning_rate": 0.00020240868552669845, - "loss": 4.3185, - "step": 4046 - }, - { - "epoch": 1.25, - "learning_rate": 0.00020237305646746124, - "loss": 4.0905, - "step": 4047 - }, - { - "epoch": 1.25, - "learning_rate": 0.00020233742740822402, - "loss": 3.8557, - "step": 4048 - }, - { - "epoch": 1.25, - "learning_rate": 0.00020230179834898677, - "loss": 3.7776, - "step": 4049 - }, - { - "epoch": 1.25, - "learning_rate": 0.00020226616928974955, - "loss": 3.9932, - "step": 4050 - }, - { - "epoch": 1.25, - "learning_rate": 0.0002022305402305123, - "loss": 3.7661, - "step": 4051 - }, - { - "epoch": 1.25, - "learning_rate": 0.00020219491117127506, - "loss": 3.6129, - "step": 4052 - }, - { - "epoch": 1.25, - "learning_rate": 0.00020215928211203787, - "loss": 3.7883, - "step": 4053 - }, - { - "epoch": 1.25, - "learning_rate": 0.00020212365305280063, - "loss": 3.5686, - "step": 4054 - }, - { - "epoch": 1.25, - "learning_rate": 0.0002020880239935634, - "loss": 3.8698, - "step": 4055 - }, - { - "epoch": 1.25, - "learning_rate": 0.00020205239493432616, - "loss": 3.9632, - "step": 4056 - }, - { - "epoch": 1.25, - "learning_rate": 0.00020201676587508892, - "loss": 3.5946, - "step": 4057 - }, - { - "epoch": 1.25, - "learning_rate": 0.00020198113681585173, - "loss": 3.7324, - "step": 4058 - }, - { - "epoch": 1.25, - "learning_rate": 0.00020194550775661448, - "loss": 3.6483, - "step": 4059 - }, - { - "epoch": 1.25, - "learning_rate": 0.00020190987869737726, - "loss": 3.592, - "step": 4060 - }, - { - "epoch": 1.25, - "learning_rate": 0.00020187424963814002, - "loss": 3.8618, - "step": 4061 - }, - { - "epoch": 1.25, - "learning_rate": 0.00020183862057890277, - "loss": 3.5915, - "step": 4062 - }, - { - "epoch": 1.25, - "learning_rate": 0.00020180299151966558, - "loss": 3.5862, - "step": 4063 - }, - { - "epoch": 1.25, - "learning_rate": 0.00020176736246042834, - "loss": 3.5308, - "step": 4064 - }, - { - "epoch": 1.25, - "learning_rate": 0.0002017317334011911, - "loss": 3.5936, - "step": 4065 - }, - { - "epoch": 1.25, - "learning_rate": 0.00020169610434195387, - "loss": 3.6883, - "step": 4066 - }, - { - "epoch": 1.26, - "learning_rate": 0.00020166047528271663, - "loss": 3.2618, - "step": 4067 - }, - { - "epoch": 1.26, - "learning_rate": 0.00020162484622347944, - "loss": 3.5509, - "step": 4068 - }, - { - "epoch": 1.26, - "learning_rate": 0.0002015892171642422, - "loss": 3.5532, - "step": 4069 - }, - { - "epoch": 1.26, - "learning_rate": 0.00020155358810500495, - "loss": 3.3666, - "step": 4070 - }, 
- { - "epoch": 1.26, - "learning_rate": 0.00020151795904576773, - "loss": 3.3565, - "step": 4071 - }, - { - "epoch": 1.26, - "learning_rate": 0.0002014823299865305, - "loss": 3.191, - "step": 4072 - }, - { - "epoch": 1.26, - "learning_rate": 0.0002014467009272933, - "loss": 3.2705, - "step": 4073 - }, - { - "epoch": 1.26, - "learning_rate": 0.00020141107186805605, - "loss": 3.1202, - "step": 4074 - }, - { - "epoch": 1.26, - "learning_rate": 0.0002013754428088188, - "loss": 3.2307, - "step": 4075 - }, - { - "epoch": 1.26, - "learning_rate": 0.0002013398137495816, - "loss": 3.0596, - "step": 4076 - }, - { - "epoch": 1.26, - "learning_rate": 0.00020130418469034437, - "loss": 3.2147, - "step": 4077 - }, - { - "epoch": 1.26, - "learning_rate": 0.00020126855563110712, - "loss": 2.9279, - "step": 4078 - }, - { - "epoch": 1.26, - "learning_rate": 0.0002012329265718699, - "loss": 2.9907, - "step": 4079 - }, - { - "epoch": 1.26, - "learning_rate": 0.00020119729751263266, - "loss": 2.9992, - "step": 4080 - }, - { - "epoch": 1.26, - "learning_rate": 0.00020116166845339547, - "loss": 2.8377, - "step": 4081 - }, - { - "epoch": 1.26, - "learning_rate": 0.00020112603939415822, - "loss": 2.9127, - "step": 4082 - }, - { - "epoch": 1.26, - "learning_rate": 0.00020109041033492098, - "loss": 2.8503, - "step": 4083 - }, - { - "epoch": 1.26, - "learning_rate": 0.00020105478127568376, - "loss": 2.6834, - "step": 4084 - }, - { - "epoch": 1.26, - "learning_rate": 0.0002010191522164465, - "loss": 2.6571, - "step": 4085 - }, - { - "epoch": 1.26, - "learning_rate": 0.00020098352315720927, - "loss": 2.4462, - "step": 4086 - }, - { - "epoch": 1.26, - "learning_rate": 0.00020094789409797208, - "loss": 2.3052, - "step": 4087 - }, - { - "epoch": 1.26, - "learning_rate": 0.00020091226503873483, - "loss": 2.2622, - "step": 4088 - }, - { - "epoch": 1.26, - "learning_rate": 0.00020087663597949761, - "loss": 2.2503, - "step": 4089 - }, - { - "epoch": 1.26, - "learning_rate": 0.00020084100692026037, - "loss": 2.109, - "step": 4090 - }, - { - "epoch": 1.26, - "learning_rate": 0.00020080537786102315, - "loss": 4.9508, - "step": 4091 - }, - { - "epoch": 1.26, - "learning_rate": 0.00020076974880178593, - "loss": 4.4312, - "step": 4092 - }, - { - "epoch": 1.26, - "learning_rate": 0.0002007341197425487, - "loss": 4.6276, - "step": 4093 - }, - { - "epoch": 1.26, - "learning_rate": 0.00020069849068331147, - "loss": 4.0516, - "step": 4094 - }, - { - "epoch": 1.26, - "learning_rate": 0.00020066286162407425, - "loss": 4.3796, - "step": 4095 - }, - { - "epoch": 1.26, - "learning_rate": 0.000200627232564837, - "loss": 3.8635, - "step": 4096 - }, - { - "epoch": 1.26, - "learning_rate": 0.0002005916035055998, - "loss": 4.0627, - "step": 4097 - }, - { - "epoch": 1.26, - "learning_rate": 0.00020055597444636254, - "loss": 4.1408, - "step": 4098 - }, - { - "epoch": 1.27, - "learning_rate": 0.0002005203453871253, - "loss": 4.1166, - "step": 4099 - }, - { - "epoch": 1.27, - "learning_rate": 0.0002004847163278881, - "loss": 4.1022, - "step": 4100 - }, - { - "epoch": 1.27, - "learning_rate": 0.00020044908726865086, - "loss": 3.7815, - "step": 4101 - }, - { - "epoch": 1.27, - "learning_rate": 0.00020041345820941364, - "loss": 4.1798, - "step": 4102 - }, - { - "epoch": 1.27, - "learning_rate": 0.0002003778291501764, - "loss": 3.8032, - "step": 4103 - }, - { - "epoch": 1.27, - "learning_rate": 0.00020034220009093915, - "loss": 3.7528, - "step": 4104 - }, - { - "epoch": 1.27, - "learning_rate": 0.00020030657103170196, - "loss": 3.8882, - "step": 4105 - }, 
- { - "epoch": 1.27, - "learning_rate": 0.00020027094197246472, - "loss": 3.9117, - "step": 4106 - }, - { - "epoch": 1.27, - "learning_rate": 0.0002002353129132275, - "loss": 3.817, - "step": 4107 - }, - { - "epoch": 1.27, - "learning_rate": 0.00020019968385399025, - "loss": 3.6919, - "step": 4108 - }, - { - "epoch": 1.27, - "learning_rate": 0.000200164054794753, - "loss": 3.5369, - "step": 4109 - }, - { - "epoch": 1.27, - "learning_rate": 0.00020012842573551582, - "loss": 3.5369, - "step": 4110 - }, - { - "epoch": 1.27, - "learning_rate": 0.00020009279667627857, - "loss": 3.5223, - "step": 4111 - }, - { - "epoch": 1.27, - "learning_rate": 0.00020005716761704133, - "loss": 3.6681, - "step": 4112 - }, - { - "epoch": 1.27, - "learning_rate": 0.0002000215385578041, - "loss": 3.6064, - "step": 4113 - }, - { - "epoch": 1.27, - "learning_rate": 0.00019998590949856686, - "loss": 3.4679, - "step": 4114 - }, - { - "epoch": 1.27, - "learning_rate": 0.00019995028043932967, - "loss": 3.4889, - "step": 4115 - }, - { - "epoch": 1.27, - "learning_rate": 0.00019991465138009243, - "loss": 3.2571, - "step": 4116 - }, - { - "epoch": 1.27, - "learning_rate": 0.00019987902232085518, - "loss": 3.5201, - "step": 4117 - }, - { - "epoch": 1.27, - "learning_rate": 0.000199843393261618, - "loss": 3.5836, - "step": 4118 - }, - { - "epoch": 1.27, - "learning_rate": 0.00019980776420238074, - "loss": 3.2441, - "step": 4119 - }, - { - "epoch": 1.27, - "learning_rate": 0.00019977213514314353, - "loss": 3.4903, - "step": 4120 - }, - { - "epoch": 1.27, - "learning_rate": 0.00019973650608390628, - "loss": 3.3805, - "step": 4121 - }, - { - "epoch": 1.27, - "learning_rate": 0.00019970087702466904, - "loss": 3.4315, - "step": 4122 - }, - { - "epoch": 1.27, - "learning_rate": 0.00019966524796543185, - "loss": 3.1959, - "step": 4123 - }, - { - "epoch": 1.27, - "learning_rate": 0.0001996296189061946, - "loss": 3.0789, - "step": 4124 - }, - { - "epoch": 1.27, - "learning_rate": 0.00019959398984695735, - "loss": 2.9943, - "step": 4125 - }, - { - "epoch": 1.27, - "learning_rate": 0.00019955836078772014, - "loss": 2.7903, - "step": 4126 - }, - { - "epoch": 1.27, - "learning_rate": 0.0001995227317284829, - "loss": 3.1347, - "step": 4127 - }, - { - "epoch": 1.27, - "learning_rate": 0.0001994871026692457, - "loss": 3.3276, - "step": 4128 - }, - { - "epoch": 1.27, - "learning_rate": 0.00019945147361000846, - "loss": 2.9905, - "step": 4129 - }, - { - "epoch": 1.27, - "learning_rate": 0.0001994158445507712, - "loss": 3.1775, - "step": 4130 - }, - { - "epoch": 1.27, - "learning_rate": 0.000199380215491534, - "loss": 2.668, - "step": 4131 - }, - { - "epoch": 1.28, - "learning_rate": 0.00019934458643229675, - "loss": 2.6774, - "step": 4132 - }, - { - "epoch": 1.28, - "learning_rate": 0.0001993089573730595, - "loss": 2.5979, - "step": 4133 - }, - { - "epoch": 1.28, - "learning_rate": 0.0001992733283138223, - "loss": 2.652, - "step": 4134 - }, - { - "epoch": 1.28, - "learning_rate": 0.00019923769925458507, - "loss": 2.4616, - "step": 4135 - }, - { - "epoch": 1.28, - "learning_rate": 0.00019920207019534785, - "loss": 2.5962, - "step": 4136 - }, - { - "epoch": 1.28, - "learning_rate": 0.0001991664411361106, - "loss": 2.1622, - "step": 4137 - }, - { - "epoch": 1.28, - "learning_rate": 0.00019913081207687338, - "loss": 2.2383, - "step": 4138 - }, - { - "epoch": 1.28, - "learning_rate": 0.00019909518301763617, - "loss": 2.1423, - "step": 4139 - }, - { - "epoch": 1.28, - "learning_rate": 0.00019905955395839892, - "loss": 1.9743, - "step": 4140 - }, - 
{ - "epoch": 1.28, - "learning_rate": 0.0001990239248991617, - "loss": 4.7776, - "step": 4141 - }, - { - "epoch": 1.28, - "learning_rate": 0.00019898829583992448, - "loss": 4.7889, - "step": 4142 - }, - { - "epoch": 1.28, - "learning_rate": 0.00019895266678068724, - "loss": 4.5863, - "step": 4143 - }, - { - "epoch": 1.28, - "learning_rate": 0.00019891703772145002, - "loss": 4.2464, - "step": 4144 - }, - { - "epoch": 1.28, - "learning_rate": 0.00019888140866221278, - "loss": 4.1614, - "step": 4145 - }, - { - "epoch": 1.28, - "learning_rate": 0.00019884577960297553, - "loss": 3.9973, - "step": 4146 - }, - { - "epoch": 1.28, - "learning_rate": 0.00019881015054373834, - "loss": 4.1638, - "step": 4147 - }, - { - "epoch": 1.28, - "learning_rate": 0.0001987745214845011, - "loss": 3.9015, - "step": 4148 - }, - { - "epoch": 1.28, - "learning_rate": 0.00019873889242526388, - "loss": 4.0901, - "step": 4149 - }, - { - "epoch": 1.28, - "learning_rate": 0.00019870326336602663, - "loss": 4.0003, - "step": 4150 - }, - { - "epoch": 1.28, - "learning_rate": 0.00019866763430678939, - "loss": 3.538, - "step": 4151 - }, - { - "epoch": 1.28, - "learning_rate": 0.0001986320052475522, - "loss": 3.9631, - "step": 4152 - }, - { - "epoch": 1.28, - "learning_rate": 0.00019859637618831495, - "loss": 3.7521, - "step": 4153 - }, - { - "epoch": 1.28, - "learning_rate": 0.00019856074712907773, - "loss": 3.935, - "step": 4154 - }, - { - "epoch": 1.28, - "learning_rate": 0.00019852511806984049, - "loss": 4.082, - "step": 4155 - }, - { - "epoch": 1.28, - "learning_rate": 0.00019848948901060324, - "loss": 3.5419, - "step": 4156 - }, - { - "epoch": 1.28, - "learning_rate": 0.00019845385995136605, - "loss": 3.3202, - "step": 4157 - }, - { - "epoch": 1.28, - "learning_rate": 0.0001984182308921288, - "loss": 3.9738, - "step": 4158 - }, - { - "epoch": 1.28, - "learning_rate": 0.00019838260183289156, - "loss": 3.5651, - "step": 4159 - }, - { - "epoch": 1.28, - "learning_rate": 0.00019834697277365434, - "loss": 3.579, - "step": 4160 - }, - { - "epoch": 1.28, - "learning_rate": 0.0001983113437144171, - "loss": 3.7046, - "step": 4161 - }, - { - "epoch": 1.28, - "learning_rate": 0.0001982757146551799, - "loss": 3.6878, - "step": 4162 - }, - { - "epoch": 1.28, - "learning_rate": 0.00019824008559594266, - "loss": 3.4205, - "step": 4163 - }, - { - "epoch": 1.29, - "learning_rate": 0.00019820445653670541, - "loss": 3.6325, - "step": 4164 - }, - { - "epoch": 1.29, - "learning_rate": 0.00019816882747746822, - "loss": 3.6469, - "step": 4165 - }, - { - "epoch": 1.29, - "learning_rate": 0.00019813319841823098, - "loss": 3.6515, - "step": 4166 - }, - { - "epoch": 1.29, - "learning_rate": 0.00019809756935899373, - "loss": 3.518, - "step": 4167 - }, - { - "epoch": 1.29, - "learning_rate": 0.00019806194029975651, - "loss": 3.3475, - "step": 4168 - }, - { - "epoch": 1.29, - "learning_rate": 0.00019802631124051927, - "loss": 3.3057, - "step": 4169 - }, - { - "epoch": 1.29, - "learning_rate": 0.00019799068218128208, - "loss": 3.4445, - "step": 4170 - }, - { - "epoch": 1.29, - "learning_rate": 0.00019795505312204483, - "loss": 3.3243, - "step": 4171 - }, - { - "epoch": 1.29, - "learning_rate": 0.0001979194240628076, - "loss": 3.2831, - "step": 4172 - }, - { - "epoch": 1.29, - "learning_rate": 0.00019788379500357037, - "loss": 3.2324, - "step": 4173 - }, - { - "epoch": 1.29, - "learning_rate": 0.00019784816594433312, - "loss": 3.3951, - "step": 4174 - }, - { - "epoch": 1.29, - "learning_rate": 0.00019781253688509593, - "loss": 3.1669, - "step": 4175 - 
}, - { - "epoch": 1.29, - "learning_rate": 0.0001977769078258587, - "loss": 3.2009, - "step": 4176 - }, - { - "epoch": 1.29, - "learning_rate": 0.00019774127876662144, - "loss": 3.2058, - "step": 4177 - }, - { - "epoch": 1.29, - "learning_rate": 0.00019770564970738422, - "loss": 3.0035, - "step": 4178 - }, - { - "epoch": 1.29, - "learning_rate": 0.00019767002064814698, - "loss": 3.3009, - "step": 4179 - }, - { - "epoch": 1.29, - "learning_rate": 0.00019763439158890973, - "loss": 2.8675, - "step": 4180 - }, - { - "epoch": 1.29, - "learning_rate": 0.00019759876252967254, - "loss": 2.9386, - "step": 4181 - }, - { - "epoch": 1.29, - "learning_rate": 0.0001975631334704353, - "loss": 3.0342, - "step": 4182 - }, - { - "epoch": 1.29, - "learning_rate": 0.00019752750441119808, - "loss": 2.6496, - "step": 4183 - }, - { - "epoch": 1.29, - "learning_rate": 0.00019749187535196083, - "loss": 2.4077, - "step": 4184 - }, - { - "epoch": 1.29, - "learning_rate": 0.00019745624629272362, - "loss": 2.485, - "step": 4185 - }, - { - "epoch": 1.29, - "learning_rate": 0.0001974206172334864, - "loss": 2.3658, - "step": 4186 - }, - { - "epoch": 1.29, - "learning_rate": 0.00019738498817424915, - "loss": 2.4802, - "step": 4187 - }, - { - "epoch": 1.29, - "learning_rate": 0.00019734935911501194, - "loss": 2.336, - "step": 4188 - }, - { - "epoch": 1.29, - "learning_rate": 0.00019731373005577472, - "loss": 2.2165, - "step": 4189 - }, - { - "epoch": 1.29, - "learning_rate": 0.00019727810099653747, - "loss": 1.8735, - "step": 4190 - }, - { - "epoch": 1.29, - "learning_rate": 0.00019724247193730025, - "loss": 5.1235, - "step": 4191 - }, - { - "epoch": 1.29, - "learning_rate": 0.000197206842878063, - "loss": 4.5345, - "step": 4192 - }, - { - "epoch": 1.29, - "learning_rate": 0.00019717121381882576, - "loss": 4.2376, - "step": 4193 - }, - { - "epoch": 1.29, - "learning_rate": 0.00019713558475958857, - "loss": 4.1777, - "step": 4194 - }, - { - "epoch": 1.29, - "learning_rate": 0.00019709995570035133, - "loss": 4.0507, - "step": 4195 - }, - { - "epoch": 1.3, - "learning_rate": 0.0001970643266411141, - "loss": 3.9894, - "step": 4196 - }, - { - "epoch": 1.3, - "learning_rate": 0.00019702869758187686, - "loss": 3.9485, - "step": 4197 - }, - { - "epoch": 1.3, - "learning_rate": 0.00019699306852263962, - "loss": 4.0695, - "step": 4198 - }, - { - "epoch": 1.3, - "learning_rate": 0.00019695743946340243, - "loss": 3.974, - "step": 4199 - }, - { - "epoch": 1.3, - "learning_rate": 0.00019692181040416518, - "loss": 3.9045, - "step": 4200 - }, - { - "epoch": 1.3, - "learning_rate": 0.00019688618134492794, - "loss": 3.8913, - "step": 4201 - }, - { - "epoch": 1.3, - "learning_rate": 0.00019685055228569072, - "loss": 3.906, - "step": 4202 - }, - { - "epoch": 1.3, - "learning_rate": 0.00019681492322645347, - "loss": 3.8258, - "step": 4203 - }, - { - "epoch": 1.3, - "learning_rate": 0.00019677929416721628, - "loss": 3.7487, - "step": 4204 - }, - { - "epoch": 1.3, - "learning_rate": 0.00019674366510797904, - "loss": 3.8174, - "step": 4205 - }, - { - "epoch": 1.3, - "learning_rate": 0.0001967080360487418, - "loss": 3.6031, - "step": 4206 - }, - { - "epoch": 1.3, - "learning_rate": 0.00019667240698950457, - "loss": 3.8142, - "step": 4207 - }, - { - "epoch": 1.3, - "learning_rate": 0.00019663677793026736, - "loss": 3.56, - "step": 4208 - }, - { - "epoch": 1.3, - "learning_rate": 0.00019660114887103014, - "loss": 3.87, - "step": 4209 - }, - { - "epoch": 1.3, - "learning_rate": 0.0001965655198117929, - "loss": 3.5016, - "step": 4210 - }, - { - 
"epoch": 1.3, - "learning_rate": 0.00019652989075255565, - "loss": 3.7182, - "step": 4211 - }, - { - "epoch": 1.3, - "learning_rate": 0.00019649426169331846, - "loss": 3.5533, - "step": 4212 - }, - { - "epoch": 1.3, - "learning_rate": 0.0001964586326340812, - "loss": 3.5139, - "step": 4213 - }, - { - "epoch": 1.3, - "learning_rate": 0.00019642300357484397, - "loss": 3.468, - "step": 4214 - }, - { - "epoch": 1.3, - "learning_rate": 0.00019638737451560675, - "loss": 3.2553, - "step": 4215 - }, - { - "epoch": 1.3, - "learning_rate": 0.0001963517454563695, - "loss": 3.5977, - "step": 4216 - }, - { - "epoch": 1.3, - "learning_rate": 0.0001963161163971323, - "loss": 3.6332, - "step": 4217 - }, - { - "epoch": 1.3, - "learning_rate": 0.00019628048733789507, - "loss": 3.2914, - "step": 4218 - }, - { - "epoch": 1.3, - "learning_rate": 0.00019624485827865782, - "loss": 3.2852, - "step": 4219 - }, - { - "epoch": 1.3, - "learning_rate": 0.0001962092292194206, - "loss": 3.4892, - "step": 4220 - }, - { - "epoch": 1.3, - "learning_rate": 0.00019617360016018336, - "loss": 3.3141, - "step": 4221 - }, - { - "epoch": 1.3, - "learning_rate": 0.00019613797110094617, - "loss": 3.2893, - "step": 4222 - }, - { - "epoch": 1.3, - "learning_rate": 0.00019610234204170892, - "loss": 3.0638, - "step": 4223 - }, - { - "epoch": 1.3, - "learning_rate": 0.00019606671298247168, - "loss": 3.3924, - "step": 4224 - }, - { - "epoch": 1.3, - "learning_rate": 0.00019603108392323446, - "loss": 3.0383, - "step": 4225 - }, - { - "epoch": 1.3, - "learning_rate": 0.0001959954548639972, - "loss": 3.3122, - "step": 4226 - }, - { - "epoch": 1.3, - "learning_rate": 0.00019595982580475997, - "loss": 3.2657, - "step": 4227 - }, - { - "epoch": 1.3, - "learning_rate": 0.00019592419674552278, - "loss": 3.1933, - "step": 4228 - }, - { - "epoch": 1.31, - "learning_rate": 0.00019588856768628553, - "loss": 3.0377, - "step": 4229 - }, - { - "epoch": 1.31, - "learning_rate": 0.0001958529386270483, - "loss": 2.7775, - "step": 4230 - }, - { - "epoch": 1.31, - "learning_rate": 0.00019581730956781107, - "loss": 2.8229, - "step": 4231 - }, - { - "epoch": 1.31, - "learning_rate": 0.00019578168050857385, - "loss": 2.9715, - "step": 4232 - }, - { - "epoch": 1.31, - "learning_rate": 0.00019574605144933663, - "loss": 2.8702, - "step": 4233 - }, - { - "epoch": 1.31, - "learning_rate": 0.0001957104223900994, - "loss": 2.8079, - "step": 4234 - }, - { - "epoch": 1.31, - "learning_rate": 0.00019567479333086217, - "loss": 2.5521, - "step": 4235 - }, - { - "epoch": 1.31, - "learning_rate": 0.00019563916427162495, - "loss": 2.5671, - "step": 4236 - }, - { - "epoch": 1.31, - "learning_rate": 0.0001956035352123877, - "loss": 2.264, - "step": 4237 - }, - { - "epoch": 1.31, - "learning_rate": 0.0001955679061531505, - "loss": 2.3838, - "step": 4238 - }, - { - "epoch": 1.31, - "learning_rate": 0.00019553227709391324, - "loss": 2.0693, - "step": 4239 - }, - { - "epoch": 1.31, - "learning_rate": 0.000195496648034676, - "loss": 2.0695, - "step": 4240 - }, - { - "epoch": 1.31, - "learning_rate": 0.0001954610189754388, - "loss": 4.756, - "step": 4241 - }, - { - "epoch": 1.31, - "learning_rate": 0.00019542538991620156, - "loss": 4.3353, - "step": 4242 - }, - { - "epoch": 1.31, - "learning_rate": 0.00019538976085696434, - "loss": 4.3121, - "step": 4243 - }, - { - "epoch": 1.31, - "learning_rate": 0.0001953541317977271, - "loss": 4.0447, - "step": 4244 - }, - { - "epoch": 1.31, - "learning_rate": 0.00019531850273848985, - "loss": 4.1033, - "step": 4245 - }, - { - "epoch": 1.31, - 
"learning_rate": 0.00019528287367925266, - "loss": 3.9433, - "step": 4246 - }, - { - "epoch": 1.31, - "learning_rate": 0.00019524724462001542, - "loss": 4.0171, - "step": 4247 - }, - { - "epoch": 1.31, - "learning_rate": 0.00019521161556077817, - "loss": 3.6522, - "step": 4248 - }, - { - "epoch": 1.31, - "learning_rate": 0.00019517598650154095, - "loss": 3.839, - "step": 4249 - }, - { - "epoch": 1.31, - "learning_rate": 0.0001951403574423037, - "loss": 4.0754, - "step": 4250 - }, - { - "epoch": 1.31, - "learning_rate": 0.00019510472838306652, - "loss": 3.839, - "step": 4251 - }, - { - "epoch": 1.31, - "learning_rate": 0.00019506909932382927, - "loss": 3.8577, - "step": 4252 - }, - { - "epoch": 1.31, - "learning_rate": 0.00019503347026459203, - "loss": 3.5299, - "step": 4253 - }, - { - "epoch": 1.31, - "learning_rate": 0.0001949978412053548, - "loss": 3.8549, - "step": 4254 - }, - { - "epoch": 1.31, - "learning_rate": 0.0001949622121461176, - "loss": 3.8678, - "step": 4255 - }, - { - "epoch": 1.31, - "learning_rate": 0.00019492658308688037, - "loss": 3.7703, - "step": 4256 - }, - { - "epoch": 1.31, - "learning_rate": 0.00019489095402764313, - "loss": 3.4962, - "step": 4257 - }, - { - "epoch": 1.31, - "learning_rate": 0.00019485532496840588, - "loss": 3.6904, - "step": 4258 - }, - { - "epoch": 1.31, - "learning_rate": 0.0001948196959091687, - "loss": 3.7313, - "step": 4259 - }, - { - "epoch": 1.31, - "learning_rate": 0.00019478406684993144, - "loss": 3.6372, - "step": 4260 - }, - { - "epoch": 1.32, - "learning_rate": 0.0001947484377906942, - "loss": 3.6272, - "step": 4261 - }, - { - "epoch": 1.32, - "learning_rate": 0.00019471280873145698, - "loss": 3.4883, - "step": 4262 - }, - { - "epoch": 1.32, - "learning_rate": 0.00019467717967221974, - "loss": 3.834, - "step": 4263 - }, - { - "epoch": 1.32, - "learning_rate": 0.00019464155061298254, - "loss": 3.5524, - "step": 4264 - }, - { - "epoch": 1.32, - "learning_rate": 0.0001946059215537453, - "loss": 3.5772, - "step": 4265 - }, - { - "epoch": 1.32, - "learning_rate": 0.00019457029249450805, - "loss": 3.5734, - "step": 4266 - }, - { - "epoch": 1.32, - "learning_rate": 0.00019453466343527084, - "loss": 3.5523, - "step": 4267 - }, - { - "epoch": 1.32, - "learning_rate": 0.0001944990343760336, - "loss": 3.397, - "step": 4268 - }, - { - "epoch": 1.32, - "learning_rate": 0.0001944634053167964, - "loss": 3.4044, - "step": 4269 - }, - { - "epoch": 1.32, - "learning_rate": 0.00019442777625755915, - "loss": 3.5637, - "step": 4270 - }, - { - "epoch": 1.32, - "learning_rate": 0.0001943921471983219, - "loss": 3.3532, - "step": 4271 - }, - { - "epoch": 1.32, - "learning_rate": 0.0001943565181390847, - "loss": 3.4568, - "step": 4272 - }, - { - "epoch": 1.32, - "learning_rate": 0.00019432088907984745, - "loss": 3.1275, - "step": 4273 - }, - { - "epoch": 1.32, - "learning_rate": 0.0001942852600206102, - "loss": 3.4894, - "step": 4274 - }, - { - "epoch": 1.32, - "learning_rate": 0.000194249630961373, - "loss": 3.1959, - "step": 4275 - }, - { - "epoch": 1.32, - "learning_rate": 0.00019421400190213576, - "loss": 3.033, - "step": 4276 - }, - { - "epoch": 1.32, - "learning_rate": 0.00019417837284289855, - "loss": 3.1806, - "step": 4277 - }, - { - "epoch": 1.32, - "learning_rate": 0.0001941427437836613, - "loss": 2.8889, - "step": 4278 - }, - { - "epoch": 1.32, - "learning_rate": 0.00019410711472442408, - "loss": 3.0244, - "step": 4279 - }, - { - "epoch": 1.32, - "learning_rate": 0.00019407148566518686, - "loss": 2.7249, - "step": 4280 - }, - { - "epoch": 1.32, - 
"learning_rate": 0.00019403585660594962, - "loss": 2.9049, - "step": 4281 - }, - { - "epoch": 1.32, - "learning_rate": 0.00019400022754671237, - "loss": 3.1185, - "step": 4282 - }, - { - "epoch": 1.32, - "learning_rate": 0.00019396459848747518, - "loss": 2.83, - "step": 4283 - }, - { - "epoch": 1.32, - "learning_rate": 0.00019392896942823794, - "loss": 2.5706, - "step": 4284 - }, - { - "epoch": 1.32, - "learning_rate": 0.00019389334036900072, - "loss": 2.5303, - "step": 4285 - }, - { - "epoch": 1.32, - "learning_rate": 0.00019385771130976347, - "loss": 2.5937, - "step": 4286 - }, - { - "epoch": 1.32, - "learning_rate": 0.00019382208225052623, - "loss": 2.3114, - "step": 4287 - }, - { - "epoch": 1.32, - "learning_rate": 0.00019378645319128904, - "loss": 2.2517, - "step": 4288 - }, - { - "epoch": 1.32, - "learning_rate": 0.0001937508241320518, - "loss": 2.2919, - "step": 4289 - }, - { - "epoch": 1.32, - "learning_rate": 0.00019371519507281458, - "loss": 2.0103, - "step": 4290 - }, - { - "epoch": 1.32, - "learning_rate": 0.00019367956601357733, - "loss": 4.8257, - "step": 4291 - }, - { - "epoch": 1.32, - "learning_rate": 0.00019364393695434008, - "loss": 4.7632, - "step": 4292 - }, - { - "epoch": 1.32, - "learning_rate": 0.0001936083078951029, - "loss": 4.2425, - "step": 4293 - }, - { - "epoch": 1.33, - "learning_rate": 0.00019357267883586565, - "loss": 4.0317, - "step": 4294 - }, - { - "epoch": 1.33, - "learning_rate": 0.0001935370497766284, - "loss": 3.9432, - "step": 4295 - }, - { - "epoch": 1.33, - "learning_rate": 0.00019350142071739119, - "loss": 3.8889, - "step": 4296 - }, - { - "epoch": 1.33, - "learning_rate": 0.00019346579165815394, - "loss": 4.0436, - "step": 4297 - }, - { - "epoch": 1.33, - "learning_rate": 0.00019343016259891675, - "loss": 3.6109, - "step": 4298 - }, - { - "epoch": 1.33, - "learning_rate": 0.0001933945335396795, - "loss": 3.8944, - "step": 4299 - }, - { - "epoch": 1.33, - "learning_rate": 0.00019335890448044226, - "loss": 3.8379, - "step": 4300 - }, - { - "epoch": 1.33, - "learning_rate": 0.00019332327542120504, - "loss": 3.7243, - "step": 4301 - }, - { - "epoch": 1.33, - "learning_rate": 0.00019328764636196782, - "loss": 3.6167, - "step": 4302 - }, - { - "epoch": 1.33, - "learning_rate": 0.0001932520173027306, - "loss": 3.7305, - "step": 4303 - }, - { - "epoch": 1.33, - "learning_rate": 0.00019321638824349336, - "loss": 3.8325, - "step": 4304 - }, - { - "epoch": 1.33, - "learning_rate": 0.00019318075918425611, - "loss": 3.7357, - "step": 4305 - }, - { - "epoch": 1.33, - "learning_rate": 0.00019314513012501892, - "loss": 3.6141, - "step": 4306 - }, - { - "epoch": 1.33, - "learning_rate": 0.00019310950106578168, - "loss": 3.8405, - "step": 4307 - }, - { - "epoch": 1.33, - "learning_rate": 0.00019307387200654443, - "loss": 3.8167, - "step": 4308 - }, - { - "epoch": 1.33, - "learning_rate": 0.00019303824294730721, - "loss": 3.5087, - "step": 4309 - }, - { - "epoch": 1.33, - "learning_rate": 0.00019300261388806997, - "loss": 3.5746, - "step": 4310 - }, - { - "epoch": 1.33, - "learning_rate": 0.00019296698482883278, - "loss": 3.5271, - "step": 4311 - }, - { - "epoch": 1.33, - "learning_rate": 0.00019293135576959553, - "loss": 3.6927, - "step": 4312 - }, - { - "epoch": 1.33, - "learning_rate": 0.0001928957267103583, - "loss": 3.8038, - "step": 4313 - }, - { - "epoch": 1.33, - "learning_rate": 0.00019286009765112107, - "loss": 3.7544, - "step": 4314 - }, - { - "epoch": 1.33, - "learning_rate": 0.00019282446859188382, - "loss": 3.3028, - "step": 4315 - }, - { - "epoch": 
1.33, - "learning_rate": 0.00019278883953264663, - "loss": 3.3647, - "step": 4316 - }, - { - "epoch": 1.33, - "learning_rate": 0.0001927532104734094, - "loss": 3.7141, - "step": 4317 - }, - { - "epoch": 1.33, - "learning_rate": 0.00019271758141417214, - "loss": 3.4735, - "step": 4318 - }, - { - "epoch": 1.33, - "learning_rate": 0.00019268195235493492, - "loss": 3.5988, - "step": 4319 - }, - { - "epoch": 1.33, - "learning_rate": 0.00019264632329569768, - "loss": 3.4519, - "step": 4320 - }, - { - "epoch": 1.33, - "learning_rate": 0.00019261069423646043, - "loss": 3.3051, - "step": 4321 - }, - { - "epoch": 1.33, - "learning_rate": 0.00019257506517722324, - "loss": 3.2725, - "step": 4322 - }, - { - "epoch": 1.33, - "learning_rate": 0.000192539436117986, - "loss": 3.3351, - "step": 4323 - }, - { - "epoch": 1.33, - "learning_rate": 0.00019250380705874878, - "loss": 3.1123, - "step": 4324 - }, - { - "epoch": 1.33, - "learning_rate": 0.00019246817799951153, - "loss": 3.0785, - "step": 4325 - }, - { - "epoch": 1.34, - "learning_rate": 0.00019243254894027432, - "loss": 3.2448, - "step": 4326 - }, - { - "epoch": 1.34, - "learning_rate": 0.0001923969198810371, - "loss": 2.9339, - "step": 4327 - }, - { - "epoch": 1.34, - "learning_rate": 0.00019236129082179985, - "loss": 2.7783, - "step": 4328 - }, - { - "epoch": 1.34, - "learning_rate": 0.0001923256617625626, - "loss": 2.9941, - "step": 4329 - }, - { - "epoch": 1.34, - "learning_rate": 0.00019229003270332542, - "loss": 2.7995, - "step": 4330 - }, - { - "epoch": 1.34, - "learning_rate": 0.00019225440364408817, - "loss": 2.9015, - "step": 4331 - }, - { - "epoch": 1.34, - "learning_rate": 0.00019221877458485095, - "loss": 2.8588, - "step": 4332 - }, - { - "epoch": 1.34, - "learning_rate": 0.0001921831455256137, - "loss": 2.6093, - "step": 4333 - }, - { - "epoch": 1.34, - "learning_rate": 0.00019214751646637646, - "loss": 2.6802, - "step": 4334 - }, - { - "epoch": 1.34, - "learning_rate": 0.00019211188740713927, - "loss": 2.783, - "step": 4335 - }, - { - "epoch": 1.34, - "learning_rate": 0.00019207625834790203, - "loss": 2.3952, - "step": 4336 - }, - { - "epoch": 1.34, - "learning_rate": 0.0001920406292886648, - "loss": 2.4198, - "step": 4337 - }, - { - "epoch": 1.34, - "learning_rate": 0.00019200500022942756, - "loss": 2.3446, - "step": 4338 - }, - { - "epoch": 1.34, - "learning_rate": 0.00019196937117019032, - "loss": 2.1385, - "step": 4339 - }, - { - "epoch": 1.34, - "learning_rate": 0.00019193374211095313, - "loss": 1.8476, - "step": 4340 - }, - { - "epoch": 1.34, - "learning_rate": 0.00019189811305171588, - "loss": 4.8288, - "step": 4341 - }, - { - "epoch": 1.34, - "learning_rate": 0.00019186248399247864, - "loss": 4.5279, - "step": 4342 - }, - { - "epoch": 1.34, - "learning_rate": 0.00019182685493324142, - "loss": 4.3287, - "step": 4343 - }, - { - "epoch": 1.34, - "learning_rate": 0.00019179122587400417, - "loss": 4.0691, - "step": 4344 - }, - { - "epoch": 1.34, - "learning_rate": 0.00019175559681476698, - "loss": 4.1254, - "step": 4345 - }, - { - "epoch": 1.34, - "learning_rate": 0.00019171996775552974, - "loss": 4.1111, - "step": 4346 - }, - { - "epoch": 1.34, - "learning_rate": 0.0001916843386962925, - "loss": 3.708, - "step": 4347 - }, - { - "epoch": 1.34, - "learning_rate": 0.00019164870963705527, - "loss": 3.9224, - "step": 4348 - }, - { - "epoch": 1.34, - "learning_rate": 0.00019161308057781806, - "loss": 4.02, - "step": 4349 - }, - { - "epoch": 1.34, - "learning_rate": 0.00019157745151858084, - "loss": 3.7059, - "step": 4350 - }, - { - 
"epoch": 1.34, - "learning_rate": 0.0001915418224593436, - "loss": 3.8375, - "step": 4351 - }, - { - "epoch": 1.34, - "learning_rate": 0.00019150619340010635, - "loss": 3.5442, - "step": 4352 - }, - { - "epoch": 1.34, - "learning_rate": 0.00019147056434086916, - "loss": 3.6605, - "step": 4353 - }, - { - "epoch": 1.34, - "learning_rate": 0.0001914349352816319, - "loss": 3.6316, - "step": 4354 - }, - { - "epoch": 1.34, - "learning_rate": 0.00019139930622239467, - "loss": 3.4278, - "step": 4355 - }, - { - "epoch": 1.34, - "learning_rate": 0.00019136367716315745, - "loss": 3.7944, - "step": 4356 - }, - { - "epoch": 1.34, - "learning_rate": 0.0001913280481039202, - "loss": 3.5659, - "step": 4357 - }, - { - "epoch": 1.35, - "learning_rate": 0.000191292419044683, - "loss": 3.7439, - "step": 4358 - }, - { - "epoch": 1.35, - "learning_rate": 0.00019125678998544577, - "loss": 3.5405, - "step": 4359 - }, - { - "epoch": 1.35, - "learning_rate": 0.00019122116092620852, - "loss": 3.6779, - "step": 4360 - }, - { - "epoch": 1.35, - "learning_rate": 0.0001911855318669713, - "loss": 3.6648, - "step": 4361 - }, - { - "epoch": 1.35, - "learning_rate": 0.00019114990280773406, - "loss": 3.5249, - "step": 4362 - }, - { - "epoch": 1.35, - "learning_rate": 0.0001911142737484968, - "loss": 3.4776, - "step": 4363 - }, - { - "epoch": 1.35, - "learning_rate": 0.00019107864468925962, - "loss": 3.4766, - "step": 4364 - }, - { - "epoch": 1.35, - "learning_rate": 0.00019104301563002238, - "loss": 3.3854, - "step": 4365 - }, - { - "epoch": 1.35, - "learning_rate": 0.00019100738657078516, - "loss": 3.4335, - "step": 4366 - }, - { - "epoch": 1.35, - "learning_rate": 0.0001909717575115479, - "loss": 3.3647, - "step": 4367 - }, - { - "epoch": 1.35, - "learning_rate": 0.00019093612845231067, - "loss": 3.1917, - "step": 4368 - }, - { - "epoch": 1.35, - "learning_rate": 0.00019090049939307348, - "loss": 3.4183, - "step": 4369 - }, - { - "epoch": 1.35, - "learning_rate": 0.00019086487033383623, - "loss": 3.3175, - "step": 4370 - }, - { - "epoch": 1.35, - "learning_rate": 0.000190829241274599, - "loss": 3.1259, - "step": 4371 - }, - { - "epoch": 1.35, - "learning_rate": 0.00019079361221536177, - "loss": 3.2634, - "step": 4372 - }, - { - "epoch": 1.35, - "learning_rate": 0.00019075798315612455, - "loss": 2.988, - "step": 4373 - }, - { - "epoch": 1.35, - "learning_rate": 0.00019072235409688733, - "loss": 3.272, - "step": 4374 - }, - { - "epoch": 1.35, - "learning_rate": 0.00019068672503765009, - "loss": 3.1217, - "step": 4375 - }, - { - "epoch": 1.35, - "learning_rate": 0.00019065109597841284, - "loss": 2.9683, - "step": 4376 - }, - { - "epoch": 1.35, - "learning_rate": 0.00019061546691917565, - "loss": 2.9318, - "step": 4377 - }, - { - "epoch": 1.35, - "learning_rate": 0.0001905798378599384, - "loss": 3.2093, - "step": 4378 - }, - { - "epoch": 1.35, - "learning_rate": 0.00019054420880070119, - "loss": 2.9556, - "step": 4379 - }, - { - "epoch": 1.35, - "learning_rate": 0.00019050857974146394, - "loss": 3.105, - "step": 4380 - }, - { - "epoch": 1.35, - "learning_rate": 0.0001904729506822267, - "loss": 3.0631, - "step": 4381 - }, - { - "epoch": 1.35, - "learning_rate": 0.0001904373216229895, - "loss": 3.0865, - "step": 4382 - }, - { - "epoch": 1.35, - "learning_rate": 0.00019040169256375226, - "loss": 2.6952, - "step": 4383 - }, - { - "epoch": 1.35, - "learning_rate": 0.00019036606350451504, - "loss": 2.6302, - "step": 4384 - }, - { - "epoch": 1.35, - "learning_rate": 0.0001903304344452778, - "loss": 2.7201, - "step": 4385 - }, - { - 
"epoch": 1.35, - "learning_rate": 0.00019029480538604055, - "loss": 2.6162, - "step": 4386 - }, - { - "epoch": 1.35, - "learning_rate": 0.00019025917632680336, - "loss": 2.3992, - "step": 4387 - }, - { - "epoch": 1.35, - "learning_rate": 0.00019022354726756611, - "loss": 2.2308, - "step": 4388 - }, - { - "epoch": 1.35, - "learning_rate": 0.00019018791820832887, - "loss": 2.1123, - "step": 4389 - }, - { - "epoch": 1.35, - "learning_rate": 0.00019015228914909165, - "loss": 1.9149, - "step": 4390 - }, - { - "epoch": 1.36, - "learning_rate": 0.0001901166600898544, - "loss": 4.8282, - "step": 4391 - }, - { - "epoch": 1.36, - "learning_rate": 0.00019008103103061722, - "loss": 4.3143, - "step": 4392 - }, - { - "epoch": 1.36, - "learning_rate": 0.00019004540197137997, - "loss": 4.3973, - "step": 4393 - }, - { - "epoch": 1.36, - "learning_rate": 0.00019000977291214272, - "loss": 4.3903, - "step": 4394 - }, - { - "epoch": 1.36, - "learning_rate": 0.0001899741438529055, - "loss": 4.2074, - "step": 4395 - }, - { - "epoch": 1.36, - "learning_rate": 0.0001899385147936683, - "loss": 4.0264, - "step": 4396 - }, - { - "epoch": 1.36, - "learning_rate": 0.00018990288573443104, - "loss": 3.833, - "step": 4397 - }, - { - "epoch": 1.36, - "learning_rate": 0.00018986725667519383, - "loss": 3.8743, - "step": 4398 - }, - { - "epoch": 1.36, - "learning_rate": 0.00018983162761595658, - "loss": 3.8749, - "step": 4399 - }, - { - "epoch": 1.36, - "learning_rate": 0.0001897959985567194, - "loss": 3.836, - "step": 4400 - }, - { - "epoch": 1.36, - "learning_rate": 0.00018976036949748214, - "loss": 3.7608, - "step": 4401 - }, - { - "epoch": 1.36, - "learning_rate": 0.0001897247404382449, - "loss": 3.8409, - "step": 4402 - }, - { - "epoch": 1.36, - "learning_rate": 0.00018968911137900768, - "loss": 4.0019, - "step": 4403 - }, - { - "epoch": 1.36, - "learning_rate": 0.00018965348231977044, - "loss": 4.0407, - "step": 4404 - }, - { - "epoch": 1.36, - "learning_rate": 0.00018961785326053324, - "loss": 3.7862, - "step": 4405 - }, - { - "epoch": 1.36, - "learning_rate": 0.000189582224201296, - "loss": 3.6244, - "step": 4406 - }, - { - "epoch": 1.36, - "learning_rate": 0.00018954659514205875, - "loss": 3.8824, - "step": 4407 - }, - { - "epoch": 1.36, - "learning_rate": 0.00018951096608282154, - "loss": 3.631, - "step": 4408 - }, - { - "epoch": 1.36, - "learning_rate": 0.0001894753370235843, - "loss": 3.7132, - "step": 4409 - }, - { - "epoch": 1.36, - "learning_rate": 0.00018943970796434705, - "loss": 3.6952, - "step": 4410 - }, - { - "epoch": 1.36, - "learning_rate": 0.00018940407890510985, - "loss": 3.4235, - "step": 4411 - }, - { - "epoch": 1.36, - "learning_rate": 0.0001893684498458726, - "loss": 3.3931, - "step": 4412 - }, - { - "epoch": 1.36, - "learning_rate": 0.0001893328207866354, - "loss": 3.5529, - "step": 4413 - }, - { - "epoch": 1.36, - "learning_rate": 0.00018929719172739815, - "loss": 3.7881, - "step": 4414 - }, - { - "epoch": 1.36, - "learning_rate": 0.0001892615626681609, - "loss": 3.3116, - "step": 4415 - }, - { - "epoch": 1.36, - "learning_rate": 0.0001892259336089237, - "loss": 3.3944, - "step": 4416 - }, - { - "epoch": 1.36, - "learning_rate": 0.00018919030454968646, - "loss": 3.6446, - "step": 4417 - }, - { - "epoch": 1.36, - "learning_rate": 0.00018915467549044925, - "loss": 3.3038, - "step": 4418 - }, - { - "epoch": 1.36, - "learning_rate": 0.000189119046431212, - "loss": 3.2889, - "step": 4419 - }, - { - "epoch": 1.36, - "learning_rate": 0.00018908341737197478, - "loss": 3.3144, - "step": 4420 - }, - { - 
"epoch": 1.36, - "learning_rate": 0.00018904778831273756, - "loss": 3.1515, - "step": 4421 - }, - { - "epoch": 1.36, - "learning_rate": 0.00018901215925350032, - "loss": 3.223, - "step": 4422 - }, - { - "epoch": 1.37, - "learning_rate": 0.00018897653019426307, - "loss": 3.2184, - "step": 4423 - }, - { - "epoch": 1.37, - "learning_rate": 0.00018894090113502588, - "loss": 3.3345, - "step": 4424 - }, - { - "epoch": 1.37, - "learning_rate": 0.00018890527207578864, - "loss": 3.0234, - "step": 4425 - }, - { - "epoch": 1.37, - "learning_rate": 0.00018886964301655142, - "loss": 2.9133, - "step": 4426 - }, - { - "epoch": 1.37, - "learning_rate": 0.00018883401395731417, - "loss": 3.0385, - "step": 4427 - }, - { - "epoch": 1.37, - "learning_rate": 0.00018879838489807693, - "loss": 3.0332, - "step": 4428 - }, - { - "epoch": 1.37, - "learning_rate": 0.00018876275583883974, - "loss": 2.9399, - "step": 4429 - }, - { - "epoch": 1.37, - "learning_rate": 0.0001887271267796025, - "loss": 2.8619, - "step": 4430 - }, - { - "epoch": 1.37, - "learning_rate": 0.00018869149772036527, - "loss": 2.9731, - "step": 4431 - }, - { - "epoch": 1.37, - "learning_rate": 0.00018865586866112803, - "loss": 2.7368, - "step": 4432 - }, - { - "epoch": 1.37, - "learning_rate": 0.00018862023960189078, - "loss": 2.8557, - "step": 4433 - }, - { - "epoch": 1.37, - "learning_rate": 0.0001885846105426536, - "loss": 2.3573, - "step": 4434 - }, - { - "epoch": 1.37, - "learning_rate": 0.00018854898148341635, - "loss": 2.7865, - "step": 4435 - }, - { - "epoch": 1.37, - "learning_rate": 0.0001885133524241791, - "loss": 2.4505, - "step": 4436 - }, - { - "epoch": 1.37, - "learning_rate": 0.00018847772336494188, - "loss": 2.4733, - "step": 4437 - }, - { - "epoch": 1.37, - "learning_rate": 0.00018844209430570464, - "loss": 2.1499, - "step": 4438 - }, - { - "epoch": 1.37, - "learning_rate": 0.00018840646524646745, - "loss": 2.1309, - "step": 4439 - }, - { - "epoch": 1.37, - "learning_rate": 0.0001883708361872302, - "loss": 1.9715, - "step": 4440 - }, - { - "epoch": 1.37, - "learning_rate": 0.00018833520712799296, - "loss": 5.1261, - "step": 4441 - }, - { - "epoch": 1.37, - "learning_rate": 0.00018829957806875574, - "loss": 4.448, - "step": 4442 - }, - { - "epoch": 1.37, - "learning_rate": 0.00018826394900951852, - "loss": 4.7683, - "step": 4443 - }, - { - "epoch": 1.37, - "learning_rate": 0.00018822831995028128, - "loss": 4.2514, - "step": 4444 - }, - { - "epoch": 1.37, - "learning_rate": 0.00018819269089104406, - "loss": 4.2426, - "step": 4445 - }, - { - "epoch": 1.37, - "learning_rate": 0.0001881570618318068, - "loss": 3.9375, - "step": 4446 - }, - { - "epoch": 1.37, - "learning_rate": 0.00018812143277256962, - "loss": 3.7905, - "step": 4447 - }, - { - "epoch": 1.37, - "learning_rate": 0.00018808580371333238, - "loss": 4.0176, - "step": 4448 - }, - { - "epoch": 1.37, - "learning_rate": 0.00018805017465409513, - "loss": 3.8078, - "step": 4449 - }, - { - "epoch": 1.37, - "learning_rate": 0.0001880145455948579, - "loss": 3.6778, - "step": 4450 - }, - { - "epoch": 1.37, - "learning_rate": 0.00018797891653562067, - "loss": 3.8909, - "step": 4451 - }, - { - "epoch": 1.37, - "learning_rate": 0.00018794328747638348, - "loss": 3.9963, - "step": 4452 - }, - { - "epoch": 1.37, - "learning_rate": 0.00018790765841714623, - "loss": 3.9153, - "step": 4453 - }, - { - "epoch": 1.37, - "learning_rate": 0.000187872029357909, - "loss": 3.8126, - "step": 4454 - }, - { - "epoch": 1.37, - "learning_rate": 0.00018783640029867177, - "loss": 3.6202, - "step": 4455 - }, 
- { - "epoch": 1.38, - "learning_rate": 0.00018780077123943452, - "loss": 3.6812, - "step": 4456 - }, - { - "epoch": 1.38, - "learning_rate": 0.00018776514218019728, - "loss": 3.7388, - "step": 4457 - }, - { - "epoch": 1.38, - "learning_rate": 0.0001877295131209601, - "loss": 3.5055, - "step": 4458 - }, - { - "epoch": 1.38, - "learning_rate": 0.00018769388406172284, - "loss": 3.7038, - "step": 4459 - }, - { - "epoch": 1.38, - "learning_rate": 0.00018765825500248562, - "loss": 3.8466, - "step": 4460 - }, - { - "epoch": 1.38, - "learning_rate": 0.00018762262594324838, - "loss": 3.6387, - "step": 4461 - }, - { - "epoch": 1.38, - "learning_rate": 0.00018758699688401113, - "loss": 3.5933, - "step": 4462 - }, - { - "epoch": 1.38, - "learning_rate": 0.00018755136782477394, - "loss": 3.5309, - "step": 4463 - }, - { - "epoch": 1.38, - "learning_rate": 0.0001875157387655367, - "loss": 3.5349, - "step": 4464 - }, - { - "epoch": 1.38, - "learning_rate": 0.00018748010970629948, - "loss": 3.4066, - "step": 4465 - }, - { - "epoch": 1.38, - "learning_rate": 0.00018744448064706226, - "loss": 3.3315, - "step": 4466 - }, - { - "epoch": 1.38, - "learning_rate": 0.00018740885158782502, - "loss": 3.5529, - "step": 4467 - }, - { - "epoch": 1.38, - "learning_rate": 0.0001873732225285878, - "loss": 3.4812, - "step": 4468 - }, - { - "epoch": 1.38, - "learning_rate": 0.00018733759346935055, - "loss": 3.3587, - "step": 4469 - }, - { - "epoch": 1.38, - "learning_rate": 0.0001873019644101133, - "loss": 3.3246, - "step": 4470 - }, - { - "epoch": 1.38, - "learning_rate": 0.00018726633535087612, - "loss": 3.3587, - "step": 4471 - }, - { - "epoch": 1.38, - "learning_rate": 0.00018723070629163887, - "loss": 3.1762, - "step": 4472 - }, - { - "epoch": 1.38, - "learning_rate": 0.00018719507723240165, - "loss": 3.011, - "step": 4473 - }, - { - "epoch": 1.38, - "learning_rate": 0.0001871594481731644, - "loss": 2.9766, - "step": 4474 - }, - { - "epoch": 1.38, - "learning_rate": 0.00018712381911392716, - "loss": 3.1962, - "step": 4475 - }, - { - "epoch": 1.38, - "learning_rate": 0.00018708819005468997, - "loss": 3.1711, - "step": 4476 - }, - { - "epoch": 1.38, - "learning_rate": 0.00018705256099545273, - "loss": 2.8417, - "step": 4477 - }, - { - "epoch": 1.38, - "learning_rate": 0.00018701693193621548, - "loss": 2.8981, - "step": 4478 - }, - { - "epoch": 1.38, - "learning_rate": 0.00018698130287697826, - "loss": 2.9314, - "step": 4479 - }, - { - "epoch": 1.38, - "learning_rate": 0.00018694567381774102, - "loss": 2.8706, - "step": 4480 - }, - { - "epoch": 1.38, - "learning_rate": 0.00018691004475850383, - "loss": 3.0429, - "step": 4481 - }, - { - "epoch": 1.38, - "learning_rate": 0.00018687441569926658, - "loss": 2.757, - "step": 4482 - }, - { - "epoch": 1.38, - "learning_rate": 0.00018683878664002934, - "loss": 2.6737, - "step": 4483 - }, - { - "epoch": 1.38, - "learning_rate": 0.00018680315758079212, - "loss": 2.771, - "step": 4484 - }, - { - "epoch": 1.38, - "learning_rate": 0.00018676752852155487, - "loss": 2.5789, - "step": 4485 - }, - { - "epoch": 1.38, - "learning_rate": 0.00018673189946231768, - "loss": 2.6008, - "step": 4486 - }, - { - "epoch": 1.38, - "learning_rate": 0.00018669627040308044, - "loss": 2.5054, - "step": 4487 - }, - { - "epoch": 1.39, - "learning_rate": 0.0001866606413438432, - "loss": 2.2605, - "step": 4488 - }, - { - "epoch": 1.39, - "learning_rate": 0.00018662501228460597, - "loss": 2.2089, - "step": 4489 - }, - { - "epoch": 1.39, - "learning_rate": 0.00018658938322536875, - "loss": 1.9788, - "step": 
4490 - }, - { - "epoch": 1.39, - "learning_rate": 0.0001865537541661315, - "loss": 4.9339, - "step": 4491 - }, - { - "epoch": 1.39, - "learning_rate": 0.0001865181251068943, - "loss": 4.4801, - "step": 4492 - }, - { - "epoch": 1.39, - "learning_rate": 0.00018648249604765705, - "loss": 4.3051, - "step": 4493 - }, - { - "epoch": 1.39, - "learning_rate": 0.00018644686698841986, - "loss": 4.3707, - "step": 4494 - }, - { - "epoch": 1.39, - "learning_rate": 0.0001864112379291826, - "loss": 3.7899, - "step": 4495 - }, - { - "epoch": 1.39, - "learning_rate": 0.00018637560886994536, - "loss": 3.8678, - "step": 4496 - }, - { - "epoch": 1.39, - "learning_rate": 0.00018633997981070815, - "loss": 3.6558, - "step": 4497 - }, - { - "epoch": 1.39, - "learning_rate": 0.0001863043507514709, - "loss": 3.791, - "step": 4498 - }, - { - "epoch": 1.39, - "learning_rate": 0.0001862687216922337, - "loss": 3.891, - "step": 4499 - }, - { - "epoch": 1.39, - "learning_rate": 0.00018623309263299647, - "loss": 4.0396, - "step": 4500 - }, - { - "epoch": 1.39, - "eval_bleu": 0.0, - "eval_loss": 4.281228065490723, - "eval_runtime": 2569.7097, - "eval_samples_per_second": 5.744, - "eval_steps_per_second": 0.718, - "step": 4500 - }, - { - "epoch": 1.39, - "learning_rate": 0.00018619746357375922, - "loss": 4.1089, - "step": 4501 - }, - { - "epoch": 1.39, - "learning_rate": 0.000186161834514522, - "loss": 3.625, - "step": 4502 - }, - { - "epoch": 1.39, - "learning_rate": 0.00018612620545528476, - "loss": 3.6791, - "step": 4503 - }, - { - "epoch": 1.39, - "learning_rate": 0.0001860905763960475, - "loss": 3.718, - "step": 4504 - }, - { - "epoch": 1.39, - "learning_rate": 0.00018605494733681032, - "loss": 3.9028, - "step": 4505 - }, - { - "epoch": 1.39, - "learning_rate": 0.00018601931827757308, - "loss": 3.7382, - "step": 4506 - }, - { - "epoch": 1.39, - "learning_rate": 0.00018598368921833586, - "loss": 3.6261, - "step": 4507 - }, - { - "epoch": 1.39, - "learning_rate": 0.0001859480601590986, - "loss": 3.6259, - "step": 4508 - }, - { - "epoch": 1.39, - "learning_rate": 0.00018591243109986137, - "loss": 3.6202, - "step": 4509 - }, - { - "epoch": 1.39, - "learning_rate": 0.00018587680204062418, - "loss": 3.6358, - "step": 4510 - }, - { - "epoch": 1.39, - "learning_rate": 0.00018584117298138693, - "loss": 3.8154, - "step": 4511 - }, - { - "epoch": 1.39, - "learning_rate": 0.00018580554392214969, - "loss": 3.8293, - "step": 4512 - }, - { - "epoch": 1.39, - "learning_rate": 0.0001857699148629125, - "loss": 3.7113, - "step": 4513 - }, - { - "epoch": 1.39, - "learning_rate": 0.00018573428580367525, - "loss": 3.2368, - "step": 4514 - }, - { - "epoch": 1.39, - "learning_rate": 0.00018569865674443803, - "loss": 3.6301, - "step": 4515 - }, - { - "epoch": 1.39, - "learning_rate": 0.00018566302768520079, - "loss": 3.3745, - "step": 4516 - }, - { - "epoch": 1.39, - "learning_rate": 0.00018562739862596354, - "loss": 3.5114, - "step": 4517 - }, - { - "epoch": 1.39, - "learning_rate": 0.00018559176956672635, - "loss": 3.3647, - "step": 4518 - }, - { - "epoch": 1.39, - "learning_rate": 0.0001855561405074891, - "loss": 3.5545, - "step": 4519 - }, - { - "epoch": 1.4, - "learning_rate": 0.00018552051144825189, - "loss": 3.3762, - "step": 4520 - }, - { - "epoch": 1.4, - "learning_rate": 0.00018548488238901464, - "loss": 3.4799, - "step": 4521 - }, - { - "epoch": 1.4, - "learning_rate": 0.0001854492533297774, - "loss": 3.6488, - "step": 4522 - }, - { - "epoch": 1.4, - "learning_rate": 0.0001854136242705402, - "loss": 3.2391, - "step": 4523 - }, - { - 
"epoch": 1.4, - "learning_rate": 0.00018537799521130296, - "loss": 3.1481, - "step": 4524 - }, - { - "epoch": 1.4, - "learning_rate": 0.00018534236615206571, - "loss": 3.0651, - "step": 4525 - }, - { - "epoch": 1.4, - "learning_rate": 0.0001853067370928285, - "loss": 3.1124, - "step": 4526 - }, - { - "epoch": 1.4, - "learning_rate": 0.00018527110803359125, - "loss": 3.3026, - "step": 4527 - }, - { - "epoch": 1.4, - "learning_rate": 0.00018523547897435406, - "loss": 3.2073, - "step": 4528 - }, - { - "epoch": 1.4, - "learning_rate": 0.00018519984991511681, - "loss": 3.3296, - "step": 4529 - }, - { - "epoch": 1.4, - "learning_rate": 0.00018516422085587957, - "loss": 3.242, - "step": 4530 - }, - { - "epoch": 1.4, - "learning_rate": 0.00018512859179664235, - "loss": 2.8497, - "step": 4531 - }, - { - "epoch": 1.4, - "learning_rate": 0.0001850929627374051, - "loss": 2.7863, - "step": 4532 - }, - { - "epoch": 1.4, - "learning_rate": 0.00018505733367816791, - "loss": 3.0001, - "step": 4533 - }, - { - "epoch": 1.4, - "learning_rate": 0.00018502170461893067, - "loss": 2.8538, - "step": 4534 - }, - { - "epoch": 1.4, - "learning_rate": 0.00018498607555969342, - "loss": 2.8208, - "step": 4535 - }, - { - "epoch": 1.4, - "learning_rate": 0.0001849504465004562, - "loss": 2.4614, - "step": 4536 - }, - { - "epoch": 1.4, - "learning_rate": 0.000184914817441219, - "loss": 2.2898, - "step": 4537 - }, - { - "epoch": 1.4, - "learning_rate": 0.00018487918838198174, - "loss": 2.4263, - "step": 4538 - }, - { - "epoch": 1.4, - "learning_rate": 0.00018484355932274452, - "loss": 2.3912, - "step": 4539 - }, - { - "epoch": 1.4, - "learning_rate": 0.00018480793026350728, - "loss": 2.2211, - "step": 4540 - }, - { - "epoch": 1.4, - "learning_rate": 0.0001847723012042701, - "loss": 4.6913, - "step": 4541 - }, - { - "epoch": 1.4, - "learning_rate": 0.00018473667214503284, - "loss": 4.5808, - "step": 4542 - }, - { - "epoch": 1.4, - "learning_rate": 0.0001847010430857956, - "loss": 4.3213, - "step": 4543 - }, - { - "epoch": 1.4, - "learning_rate": 0.00018466541402655838, - "loss": 4.3237, - "step": 4544 - }, - { - "epoch": 1.4, - "learning_rate": 0.00018462978496732113, - "loss": 4.1259, - "step": 4545 - }, - { - "epoch": 1.4, - "learning_rate": 0.00018459415590808394, - "loss": 3.7563, - "step": 4546 - }, - { - "epoch": 1.4, - "learning_rate": 0.0001845585268488467, - "loss": 3.8599, - "step": 4547 - }, - { - "epoch": 1.4, - "learning_rate": 0.00018452289778960945, - "loss": 3.8666, - "step": 4548 - }, - { - "epoch": 1.4, - "learning_rate": 0.00018448726873037223, - "loss": 3.9719, - "step": 4549 - }, - { - "epoch": 1.4, - "learning_rate": 0.000184451639671135, - "loss": 3.6684, - "step": 4550 - }, - { - "epoch": 1.4, - "learning_rate": 0.00018441601061189774, - "loss": 3.8653, - "step": 4551 - }, - { - "epoch": 1.4, - "learning_rate": 0.00018438038155266055, - "loss": 3.7699, - "step": 4552 - }, - { - "epoch": 1.41, - "learning_rate": 0.0001843447524934233, - "loss": 3.3755, - "step": 4553 - }, - { - "epoch": 1.41, - "learning_rate": 0.0001843091234341861, - "loss": 3.8257, - "step": 4554 - }, - { - "epoch": 1.41, - "learning_rate": 0.00018427349437494884, - "loss": 3.9191, - "step": 4555 - }, - { - "epoch": 1.41, - "learning_rate": 0.00018423786531571163, - "loss": 3.7407, - "step": 4556 - }, - { - "epoch": 1.41, - "learning_rate": 0.0001842022362564744, - "loss": 3.7211, - "step": 4557 - }, - { - "epoch": 1.41, - "learning_rate": 0.00018416660719723716, - "loss": 3.7524, - "step": 4558 - }, - { - "epoch": 1.41, - 
"learning_rate": 0.00018413097813799992, - "loss": 3.6465, - "step": 4559 - }, - { - "epoch": 1.41, - "learning_rate": 0.00018409534907876273, - "loss": 3.4391, - "step": 4560 - }, - { - "epoch": 1.41, - "learning_rate": 0.00018405972001952548, - "loss": 3.4938, - "step": 4561 - }, - { - "epoch": 1.41, - "learning_rate": 0.00018402409096028826, - "loss": 3.5418, - "step": 4562 - }, - { - "epoch": 1.41, - "learning_rate": 0.00018398846190105102, - "loss": 3.7986, - "step": 4563 - }, - { - "epoch": 1.41, - "learning_rate": 0.00018395283284181377, - "loss": 3.5064, - "step": 4564 - }, - { - "epoch": 1.41, - "learning_rate": 0.00018391720378257658, - "loss": 3.6342, - "step": 4565 - }, - { - "epoch": 1.41, - "learning_rate": 0.00018388157472333934, - "loss": 3.5529, - "step": 4566 - }, - { - "epoch": 1.41, - "learning_rate": 0.00018384594566410212, - "loss": 3.2995, - "step": 4567 - }, - { - "epoch": 1.41, - "learning_rate": 0.00018381031660486487, - "loss": 3.4198, - "step": 4568 - }, - { - "epoch": 1.41, - "learning_rate": 0.00018377468754562763, - "loss": 3.4463, - "step": 4569 - }, - { - "epoch": 1.41, - "learning_rate": 0.00018373905848639044, - "loss": 3.4324, - "step": 4570 - }, - { - "epoch": 1.41, - "learning_rate": 0.0001837034294271532, - "loss": 3.2894, - "step": 4571 - }, - { - "epoch": 1.41, - "learning_rate": 0.00018366780036791595, - "loss": 3.1692, - "step": 4572 - }, - { - "epoch": 1.41, - "learning_rate": 0.00018363217130867873, - "loss": 3.3645, - "step": 4573 - }, - { - "epoch": 1.41, - "learning_rate": 0.00018359654224944148, - "loss": 3.101, - "step": 4574 - }, - { - "epoch": 1.41, - "learning_rate": 0.0001835609131902043, - "loss": 3.2577, - "step": 4575 - }, - { - "epoch": 1.41, - "learning_rate": 0.00018352528413096705, - "loss": 2.9987, - "step": 4576 - }, - { - "epoch": 1.41, - "learning_rate": 0.0001834896550717298, - "loss": 2.8119, - "step": 4577 - }, - { - "epoch": 1.41, - "learning_rate": 0.00018345402601249258, - "loss": 2.8331, - "step": 4578 - }, - { - "epoch": 1.41, - "learning_rate": 0.00018341839695325534, - "loss": 3.0664, - "step": 4579 - }, - { - "epoch": 1.41, - "learning_rate": 0.00018338276789401815, - "loss": 2.943, - "step": 4580 - }, - { - "epoch": 1.41, - "learning_rate": 0.0001833471388347809, - "loss": 2.7136, - "step": 4581 - }, - { - "epoch": 1.41, - "learning_rate": 0.00018331150977554366, - "loss": 2.6434, - "step": 4582 - }, - { - "epoch": 1.41, - "learning_rate": 0.00018327588071630644, - "loss": 2.6797, - "step": 4583 - }, - { - "epoch": 1.41, - "learning_rate": 0.00018324025165706922, - "loss": 2.5416, - "step": 4584 - }, - { - "epoch": 1.42, - "learning_rate": 0.00018320462259783198, - "loss": 2.4623, - "step": 4585 - }, - { - "epoch": 1.42, - "learning_rate": 0.00018316899353859476, - "loss": 2.5722, - "step": 4586 - }, - { - "epoch": 1.42, - "learning_rate": 0.0001831333644793575, - "loss": 2.3202, - "step": 4587 - }, - { - "epoch": 1.42, - "learning_rate": 0.00018309773542012032, - "loss": 2.1332, - "step": 4588 - }, - { - "epoch": 1.42, - "learning_rate": 0.00018306210636088308, - "loss": 2.1581, - "step": 4589 - }, - { - "epoch": 1.42, - "learning_rate": 0.00018302647730164583, - "loss": 2.035, - "step": 4590 - }, - { - "epoch": 1.42, - "learning_rate": 0.0001829908482424086, - "loss": 4.4287, - "step": 4591 - }, - { - "epoch": 1.42, - "learning_rate": 0.00018295521918317137, - "loss": 4.4819, - "step": 4592 - }, - { - "epoch": 1.42, - "learning_rate": 0.00018291959012393412, - "loss": 4.3897, - "step": 4593 - }, - { - "epoch": 
1.42, - "learning_rate": 0.00018288396106469693, - "loss": 4.1682, - "step": 4594 - }, - { - "epoch": 1.42, - "learning_rate": 0.00018284833200545969, - "loss": 3.9331, - "step": 4595 - }, - { - "epoch": 1.42, - "learning_rate": 0.00018281270294622247, - "loss": 4.2788, - "step": 4596 - }, - { - "epoch": 1.42, - "learning_rate": 0.00018277707388698522, - "loss": 3.7613, - "step": 4597 - }, - { - "epoch": 1.42, - "learning_rate": 0.00018274144482774798, - "loss": 4.033, - "step": 4598 - }, - { - "epoch": 1.42, - "learning_rate": 0.0001827058157685108, - "loss": 3.8745, - "step": 4599 - }, - { - "epoch": 1.42, - "learning_rate": 0.00018267018670927354, - "loss": 3.8303, - "step": 4600 - }, - { - "epoch": 1.42, - "learning_rate": 0.00018263455765003632, - "loss": 3.5588, - "step": 4601 - }, - { - "epoch": 1.42, - "learning_rate": 0.00018259892859079908, - "loss": 3.534, - "step": 4602 - }, - { - "epoch": 1.42, - "learning_rate": 0.00018256329953156186, - "loss": 3.6018, - "step": 4603 - }, - { - "epoch": 1.42, - "learning_rate": 0.00018252767047232464, - "loss": 3.6259, - "step": 4604 - }, - { - "epoch": 1.42, - "learning_rate": 0.0001824920414130874, - "loss": 3.5376, - "step": 4605 - }, - { - "epoch": 1.42, - "learning_rate": 0.00018245641235385015, - "loss": 3.4657, - "step": 4606 - }, - { - "epoch": 1.42, - "learning_rate": 0.00018242078329461296, - "loss": 3.6521, - "step": 4607 - }, - { - "epoch": 1.42, - "learning_rate": 0.00018238515423537572, - "loss": 3.5062, - "step": 4608 - }, - { - "epoch": 1.42, - "learning_rate": 0.0001823495251761385, - "loss": 3.6734, - "step": 4609 - }, - { - "epoch": 1.42, - "learning_rate": 0.00018231389611690125, - "loss": 3.5809, - "step": 4610 - }, - { - "epoch": 1.42, - "learning_rate": 0.000182278267057664, - "loss": 3.3967, - "step": 4611 - }, - { - "epoch": 1.42, - "learning_rate": 0.00018224263799842682, - "loss": 3.7291, - "step": 4612 - }, - { - "epoch": 1.42, - "learning_rate": 0.00018220700893918957, - "loss": 3.6, - "step": 4613 - }, - { - "epoch": 1.42, - "learning_rate": 0.00018217137987995235, - "loss": 3.6512, - "step": 4614 - }, - { - "epoch": 1.42, - "learning_rate": 0.0001821357508207151, - "loss": 3.4838, - "step": 4615 - }, - { - "epoch": 1.42, - "learning_rate": 0.00018210012176147786, - "loss": 3.2941, - "step": 4616 - }, - { - "epoch": 1.42, - "learning_rate": 0.00018206449270224067, - "loss": 3.4516, - "step": 4617 - }, - { - "epoch": 1.43, - "learning_rate": 0.00018202886364300343, - "loss": 3.3152, - "step": 4618 - }, - { - "epoch": 1.43, - "learning_rate": 0.00018199323458376618, - "loss": 3.6056, - "step": 4619 - }, - { - "epoch": 1.43, - "learning_rate": 0.00018195760552452896, - "loss": 3.4139, - "step": 4620 - }, - { - "epoch": 1.43, - "learning_rate": 0.00018192197646529172, - "loss": 3.3395, - "step": 4621 - }, - { - "epoch": 1.43, - "learning_rate": 0.00018188634740605453, - "loss": 3.333, - "step": 4622 - }, - { - "epoch": 1.43, - "learning_rate": 0.00018185071834681728, - "loss": 3.0048, - "step": 4623 - }, - { - "epoch": 1.43, - "learning_rate": 0.00018181508928758004, - "loss": 2.9755, - "step": 4624 - }, - { - "epoch": 1.43, - "learning_rate": 0.00018177946022834282, - "loss": 3.0569, - "step": 4625 - }, - { - "epoch": 1.43, - "learning_rate": 0.00018174383116910557, - "loss": 3.259, - "step": 4626 - }, - { - "epoch": 1.43, - "learning_rate": 0.00018170820210986838, - "loss": 2.9735, - "step": 4627 - }, - { - "epoch": 1.43, - "learning_rate": 0.00018167257305063114, - "loss": 2.9929, - "step": 4628 - }, - { - 
"epoch": 1.43, - "learning_rate": 0.0001816369439913939, - "loss": 2.9022, - "step": 4629 - }, - { - "epoch": 1.43, - "learning_rate": 0.00018160131493215667, - "loss": 2.8732, - "step": 4630 - }, - { - "epoch": 1.43, - "learning_rate": 0.00018156568587291945, - "loss": 2.6221, - "step": 4631 - }, - { - "epoch": 1.43, - "learning_rate": 0.0001815300568136822, - "loss": 2.675, - "step": 4632 - }, - { - "epoch": 1.43, - "learning_rate": 0.000181494427754445, - "loss": 2.6749, - "step": 4633 - }, - { - "epoch": 1.43, - "learning_rate": 0.00018145879869520775, - "loss": 2.6124, - "step": 4634 - }, - { - "epoch": 1.43, - "learning_rate": 0.00018142316963597055, - "loss": 2.6988, - "step": 4635 - }, - { - "epoch": 1.43, - "learning_rate": 0.0001813875405767333, - "loss": 2.6791, - "step": 4636 - }, - { - "epoch": 1.43, - "learning_rate": 0.00018135191151749606, - "loss": 2.3058, - "step": 4637 - }, - { - "epoch": 1.43, - "learning_rate": 0.00018131628245825885, - "loss": 2.1523, - "step": 4638 - }, - { - "epoch": 1.43, - "learning_rate": 0.0001812806533990216, - "loss": 1.9923, - "step": 4639 - }, - { - "epoch": 1.43, - "learning_rate": 0.00018124502433978436, - "loss": 2.1073, - "step": 4640 - }, - { - "epoch": 1.43, - "learning_rate": 0.00018120939528054716, - "loss": 4.6725, - "step": 4641 - }, - { - "epoch": 1.43, - "learning_rate": 0.00018117376622130992, - "loss": 4.3977, - "step": 4642 - }, - { - "epoch": 1.43, - "learning_rate": 0.0001811381371620727, - "loss": 4.2943, - "step": 4643 - }, - { - "epoch": 1.43, - "learning_rate": 0.00018110250810283546, - "loss": 4.1266, - "step": 4644 - }, - { - "epoch": 1.43, - "learning_rate": 0.0001810668790435982, - "loss": 4.0593, - "step": 4645 - }, - { - "epoch": 1.43, - "learning_rate": 0.00018103124998436102, - "loss": 3.9568, - "step": 4646 - }, - { - "epoch": 1.43, - "learning_rate": 0.00018099562092512377, - "loss": 3.945, - "step": 4647 - }, - { - "epoch": 1.43, - "learning_rate": 0.00018095999186588656, - "loss": 4.0293, - "step": 4648 - }, - { - "epoch": 1.43, - "learning_rate": 0.0001809243628066493, - "loss": 3.5411, - "step": 4649 - }, - { - "epoch": 1.44, - "learning_rate": 0.0001808887337474121, - "loss": 3.9185, - "step": 4650 - }, - { - "epoch": 1.44, - "learning_rate": 0.00018085310468817487, - "loss": 3.6484, - "step": 4651 - }, - { - "epoch": 1.44, - "learning_rate": 0.00018081747562893763, - "loss": 3.9226, - "step": 4652 - }, - { - "epoch": 1.44, - "learning_rate": 0.00018078184656970038, - "loss": 3.594, - "step": 4653 - }, - { - "epoch": 1.44, - "learning_rate": 0.0001807462175104632, - "loss": 3.5263, - "step": 4654 - }, - { - "epoch": 1.44, - "learning_rate": 0.00018071058845122595, - "loss": 3.9934, - "step": 4655 - }, - { - "epoch": 1.44, - "learning_rate": 0.00018067495939198873, - "loss": 3.5791, - "step": 4656 - }, - { - "epoch": 1.44, - "learning_rate": 0.00018063933033275148, - "loss": 3.592, - "step": 4657 - }, - { - "epoch": 1.44, - "learning_rate": 0.00018060370127351424, - "loss": 3.8117, - "step": 4658 - }, - { - "epoch": 1.44, - "learning_rate": 0.00018056807221427705, - "loss": 3.6178, - "step": 4659 - }, - { - "epoch": 1.44, - "learning_rate": 0.0001805324431550398, - "loss": 3.5588, - "step": 4660 - }, - { - "epoch": 1.44, - "learning_rate": 0.00018049681409580259, - "loss": 3.6615, - "step": 4661 - }, - { - "epoch": 1.44, - "learning_rate": 0.00018046118503656534, - "loss": 3.8338, - "step": 4662 - }, - { - "epoch": 1.44, - "learning_rate": 0.0001804255559773281, - "loss": 3.3399, - "step": 4663 - }, - { - 
"epoch": 1.44, - "learning_rate": 0.0001803899269180909, - "loss": 3.5183, - "step": 4664 - }, - { - "epoch": 1.44, - "learning_rate": 0.00018035429785885366, - "loss": 3.308, - "step": 4665 - }, - { - "epoch": 1.44, - "learning_rate": 0.0001803186687996164, - "loss": 3.314, - "step": 4666 - }, - { - "epoch": 1.44, - "learning_rate": 0.0001802830397403792, - "loss": 3.33, - "step": 4667 - }, - { - "epoch": 1.44, - "learning_rate": 0.00018024741068114195, - "loss": 3.2981, - "step": 4668 - }, - { - "epoch": 1.44, - "learning_rate": 0.00018021178162190476, - "loss": 3.3088, - "step": 4669 - }, - { - "epoch": 1.44, - "learning_rate": 0.00018017615256266751, - "loss": 3.4031, - "step": 4670 - }, - { - "epoch": 1.44, - "learning_rate": 0.00018014052350343027, - "loss": 3.2715, - "step": 4671 - }, - { - "epoch": 1.44, - "learning_rate": 0.00018010489444419305, - "loss": 3.4788, - "step": 4672 - }, - { - "epoch": 1.44, - "learning_rate": 0.0001800692653849558, - "loss": 3.242, - "step": 4673 - }, - { - "epoch": 1.44, - "learning_rate": 0.0001800336363257186, - "loss": 3.2025, - "step": 4674 - }, - { - "epoch": 1.44, - "learning_rate": 0.00017999800726648137, - "loss": 2.9685, - "step": 4675 - }, - { - "epoch": 1.44, - "learning_rate": 0.00017996237820724412, - "loss": 3.0026, - "step": 4676 - }, - { - "epoch": 1.44, - "learning_rate": 0.0001799267491480069, - "loss": 2.9887, - "step": 4677 - }, - { - "epoch": 1.44, - "learning_rate": 0.0001798911200887697, - "loss": 2.8831, - "step": 4678 - }, - { - "epoch": 1.44, - "learning_rate": 0.00017985549102953244, - "loss": 2.7915, - "step": 4679 - }, - { - "epoch": 1.44, - "learning_rate": 0.00017981986197029522, - "loss": 3.1384, - "step": 4680 - }, - { - "epoch": 1.44, - "learning_rate": 0.00017978423291105798, - "loss": 2.8363, - "step": 4681 - }, - { - "epoch": 1.45, - "learning_rate": 0.0001797486038518208, - "loss": 2.9632, - "step": 4682 - }, - { - "epoch": 1.45, - "learning_rate": 0.00017971297479258354, - "loss": 2.6556, - "step": 4683 - }, - { - "epoch": 1.45, - "learning_rate": 0.0001796773457333463, - "loss": 2.6813, - "step": 4684 - }, - { - "epoch": 1.45, - "learning_rate": 0.00017964171667410908, - "loss": 2.3847, - "step": 4685 - }, - { - "epoch": 1.45, - "learning_rate": 0.00017960608761487183, - "loss": 2.3975, - "step": 4686 - }, - { - "epoch": 1.45, - "learning_rate": 0.0001795704585556346, - "loss": 2.3641, - "step": 4687 - }, - { - "epoch": 1.45, - "learning_rate": 0.0001795348294963974, - "loss": 2.3751, - "step": 4688 - }, - { - "epoch": 1.45, - "learning_rate": 0.00017949920043716015, - "loss": 2.1873, - "step": 4689 - }, - { - "epoch": 1.45, - "learning_rate": 0.00017946357137792293, - "loss": 1.8901, - "step": 4690 - }, - { - "epoch": 1.45, - "learning_rate": 0.0001794279423186857, - "loss": 4.7918, - "step": 4691 - }, - { - "epoch": 1.45, - "learning_rate": 0.00017939231325944844, - "loss": 4.4306, - "step": 4692 - }, - { - "epoch": 1.45, - "learning_rate": 0.00017935668420021125, - "loss": 4.1826, - "step": 4693 - }, - { - "epoch": 1.45, - "learning_rate": 0.000179321055140974, - "loss": 4.145, - "step": 4694 - }, - { - "epoch": 1.45, - "learning_rate": 0.0001792854260817368, - "loss": 4.1038, - "step": 4695 - }, - { - "epoch": 1.45, - "learning_rate": 0.00017924979702249954, - "loss": 4.3339, - "step": 4696 - }, - { - "epoch": 1.45, - "learning_rate": 0.00017921416796326233, - "loss": 4.0556, - "step": 4697 - }, - { - "epoch": 1.45, - "learning_rate": 0.0001791785389040251, - "loss": 3.9273, - "step": 4698 - }, - { - 
"epoch": 1.45, - "learning_rate": 0.00017914290984478786, - "loss": 3.8608, - "step": 4699 - }, - { - "epoch": 1.45, - "learning_rate": 0.00017910728078555062, - "loss": 3.7859, - "step": 4700 - }, - { - "epoch": 1.45, - "learning_rate": 0.00017907165172631343, - "loss": 3.8096, - "step": 4701 - }, - { - "epoch": 1.45, - "learning_rate": 0.00017903602266707618, - "loss": 3.7621, - "step": 4702 - }, - { - "epoch": 1.45, - "learning_rate": 0.00017900039360783896, - "loss": 3.6431, - "step": 4703 - }, - { - "epoch": 1.45, - "learning_rate": 0.00017896476454860172, - "loss": 3.7445, - "step": 4704 - }, - { - "epoch": 1.45, - "learning_rate": 0.00017892913548936447, - "loss": 3.7266, - "step": 4705 - }, - { - "epoch": 1.45, - "learning_rate": 0.00017889350643012728, - "loss": 3.69, - "step": 4706 - }, - { - "epoch": 1.45, - "learning_rate": 0.00017885787737089004, - "loss": 3.4656, - "step": 4707 - }, - { - "epoch": 1.45, - "learning_rate": 0.0001788222483116528, - "loss": 3.6179, - "step": 4708 - }, - { - "epoch": 1.45, - "learning_rate": 0.00017878661925241557, - "loss": 3.3894, - "step": 4709 - }, - { - "epoch": 1.45, - "learning_rate": 0.00017875099019317833, - "loss": 3.4449, - "step": 4710 - }, - { - "epoch": 1.45, - "learning_rate": 0.00017871536113394114, - "loss": 3.7462, - "step": 4711 - }, - { - "epoch": 1.45, - "learning_rate": 0.0001786797320747039, - "loss": 3.5377, - "step": 4712 - }, - { - "epoch": 1.45, - "learning_rate": 0.00017864410301546665, - "loss": 3.4482, - "step": 4713 - }, - { - "epoch": 1.45, - "learning_rate": 0.00017860847395622943, - "loss": 3.493, - "step": 4714 - }, - { - "epoch": 1.46, - "learning_rate": 0.00017857284489699218, - "loss": 3.4481, - "step": 4715 - }, - { - "epoch": 1.46, - "learning_rate": 0.000178537215837755, - "loss": 3.4747, - "step": 4716 - }, - { - "epoch": 1.46, - "learning_rate": 0.00017850158677851775, - "loss": 3.1804, - "step": 4717 - }, - { - "epoch": 1.46, - "learning_rate": 0.0001784659577192805, - "loss": 3.475, - "step": 4718 - }, - { - "epoch": 1.46, - "learning_rate": 0.00017843032866004328, - "loss": 3.2605, - "step": 4719 - }, - { - "epoch": 1.46, - "learning_rate": 0.00017839469960080604, - "loss": 3.3107, - "step": 4720 - }, - { - "epoch": 1.46, - "learning_rate": 0.00017835907054156882, - "loss": 3.1157, - "step": 4721 - }, - { - "epoch": 1.46, - "learning_rate": 0.0001783234414823316, - "loss": 3.2522, - "step": 4722 - }, - { - "epoch": 1.46, - "learning_rate": 0.00017828781242309436, - "loss": 3.4637, - "step": 4723 - }, - { - "epoch": 1.46, - "learning_rate": 0.00017825218336385717, - "loss": 3.0153, - "step": 4724 - }, - { - "epoch": 1.46, - "learning_rate": 0.00017821655430461992, - "loss": 2.9652, - "step": 4725 - }, - { - "epoch": 1.46, - "learning_rate": 0.00017818092524538268, - "loss": 2.9703, - "step": 4726 - }, - { - "epoch": 1.46, - "learning_rate": 0.00017814529618614546, - "loss": 2.7454, - "step": 4727 - }, - { - "epoch": 1.46, - "learning_rate": 0.0001781096671269082, - "loss": 3.1581, - "step": 4728 - }, - { - "epoch": 1.46, - "learning_rate": 0.00017807403806767102, - "loss": 2.7166, - "step": 4729 - }, - { - "epoch": 1.46, - "learning_rate": 0.00017803840900843378, - "loss": 2.5271, - "step": 4730 - }, - { - "epoch": 1.46, - "learning_rate": 0.00017800277994919653, - "loss": 2.7026, - "step": 4731 - }, - { - "epoch": 1.46, - "learning_rate": 0.0001779671508899593, - "loss": 2.6339, - "step": 4732 - }, - { - "epoch": 1.46, - "learning_rate": 0.00017793152183072207, - "loss": 2.6924, - "step": 4733 - }, - 
{ - "epoch": 1.46, - "learning_rate": 0.00017789589277148482, - "loss": 2.6907, - "step": 4734 - }, - { - "epoch": 1.46, - "learning_rate": 0.00017786026371224763, - "loss": 2.3836, - "step": 4735 - }, - { - "epoch": 1.46, - "learning_rate": 0.00017782463465301039, - "loss": 2.3973, - "step": 4736 - }, - { - "epoch": 1.46, - "learning_rate": 0.00017778900559377317, - "loss": 2.2537, - "step": 4737 - }, - { - "epoch": 1.46, - "learning_rate": 0.00017775337653453592, - "loss": 2.0545, - "step": 4738 - }, - { - "epoch": 1.46, - "learning_rate": 0.00017771774747529868, - "loss": 2.0958, - "step": 4739 - }, - { - "epoch": 1.46, - "learning_rate": 0.00017768211841606149, - "loss": 1.9499, - "step": 4740 - }, - { - "epoch": 1.46, - "learning_rate": 0.00017764648935682424, - "loss": 4.6222, - "step": 4741 - }, - { - "epoch": 1.46, - "learning_rate": 0.00017761086029758702, - "loss": 4.401, - "step": 4742 - }, - { - "epoch": 1.46, - "learning_rate": 0.00017757523123834978, - "loss": 4.271, - "step": 4743 - }, - { - "epoch": 1.46, - "learning_rate": 0.00017753960217911256, - "loss": 4.027, - "step": 4744 - }, - { - "epoch": 1.46, - "learning_rate": 0.00017750397311987534, - "loss": 4.2307, - "step": 4745 - }, - { - "epoch": 1.46, - "learning_rate": 0.0001774683440606381, - "loss": 4.0846, - "step": 4746 - }, - { - "epoch": 1.47, - "learning_rate": 0.00017743271500140085, - "loss": 4.0746, - "step": 4747 - }, - { - "epoch": 1.47, - "learning_rate": 0.00017739708594216366, - "loss": 3.677, - "step": 4748 - }, - { - "epoch": 1.47, - "learning_rate": 0.00017736145688292641, - "loss": 3.6718, - "step": 4749 - }, - { - "epoch": 1.47, - "learning_rate": 0.0001773258278236892, - "loss": 3.6752, - "step": 4750 - }, - { - "epoch": 1.47, - "learning_rate": 0.00017729019876445195, - "loss": 3.9156, - "step": 4751 - }, - { - "epoch": 1.47, - "learning_rate": 0.0001772545697052147, - "loss": 3.8832, - "step": 4752 - }, - { - "epoch": 1.47, - "learning_rate": 0.00017721894064597751, - "loss": 3.5671, - "step": 4753 - }, - { - "epoch": 1.47, - "learning_rate": 0.00017718331158674027, - "loss": 3.6431, - "step": 4754 - }, - { - "epoch": 1.47, - "learning_rate": 0.00017714768252750302, - "loss": 3.7322, - "step": 4755 - }, - { - "epoch": 1.47, - "learning_rate": 0.0001771120534682658, - "loss": 3.6364, - "step": 4756 - }, - { - "epoch": 1.47, - "learning_rate": 0.00017707642440902856, - "loss": 3.5702, - "step": 4757 - }, - { - "epoch": 1.47, - "learning_rate": 0.00017704079534979137, - "loss": 3.3503, - "step": 4758 - }, - { - "epoch": 1.47, - "learning_rate": 0.00017700516629055412, - "loss": 3.6083, - "step": 4759 - }, - { - "epoch": 1.47, - "learning_rate": 0.00017696953723131688, - "loss": 3.6549, - "step": 4760 - }, - { - "epoch": 1.47, - "learning_rate": 0.00017693390817207966, - "loss": 3.8239, - "step": 4761 - }, - { - "epoch": 1.47, - "learning_rate": 0.00017689827911284242, - "loss": 3.6375, - "step": 4762 - }, - { - "epoch": 1.47, - "learning_rate": 0.00017686265005360523, - "loss": 3.4365, - "step": 4763 - }, - { - "epoch": 1.47, - "learning_rate": 0.00017682702099436798, - "loss": 3.3429, - "step": 4764 - }, - { - "epoch": 1.47, - "learning_rate": 0.00017679139193513073, - "loss": 3.3905, - "step": 4765 - }, - { - "epoch": 1.47, - "learning_rate": 0.00017675576287589352, - "loss": 3.2386, - "step": 4766 - }, - { - "epoch": 1.47, - "learning_rate": 0.00017672013381665627, - "loss": 3.3342, - "step": 4767 - }, - { - "epoch": 1.47, - "learning_rate": 0.00017668450475741905, - "loss": 3.3042, - "step": 4768 
- }, - { - "epoch": 1.47, - "learning_rate": 0.00017664887569818184, - "loss": 3.0734, - "step": 4769 - }, - { - "epoch": 1.47, - "learning_rate": 0.0001766132466389446, - "loss": 3.1943, - "step": 4770 - }, - { - "epoch": 1.47, - "learning_rate": 0.0001765776175797074, - "loss": 3.2742, - "step": 4771 - }, - { - "epoch": 1.47, - "learning_rate": 0.00017654198852047015, - "loss": 3.2435, - "step": 4772 - }, - { - "epoch": 1.47, - "learning_rate": 0.0001765063594612329, - "loss": 3.2274, - "step": 4773 - }, - { - "epoch": 1.47, - "learning_rate": 0.0001764707304019957, - "loss": 3.1006, - "step": 4774 - }, - { - "epoch": 1.47, - "learning_rate": 0.00017643510134275845, - "loss": 3.1299, - "step": 4775 - }, - { - "epoch": 1.47, - "learning_rate": 0.00017639947228352125, - "loss": 3.0492, - "step": 4776 - }, - { - "epoch": 1.47, - "learning_rate": 0.000176363843224284, - "loss": 2.9599, - "step": 4777 - }, - { - "epoch": 1.47, - "learning_rate": 0.00017632821416504676, - "loss": 2.943, - "step": 4778 - }, - { - "epoch": 1.47, - "learning_rate": 0.00017629258510580955, - "loss": 2.9126, - "step": 4779 - }, - { - "epoch": 1.48, - "learning_rate": 0.0001762569560465723, - "loss": 2.7192, - "step": 4780 - }, - { - "epoch": 1.48, - "learning_rate": 0.00017622132698733506, - "loss": 2.6527, - "step": 4781 - }, - { - "epoch": 1.48, - "learning_rate": 0.00017618569792809786, - "loss": 2.9702, - "step": 4782 - }, - { - "epoch": 1.48, - "learning_rate": 0.00017615006886886062, - "loss": 2.6954, - "step": 4783 - }, - { - "epoch": 1.48, - "learning_rate": 0.0001761144398096234, - "loss": 2.7093, - "step": 4784 - }, - { - "epoch": 1.48, - "learning_rate": 0.00017607881075038616, - "loss": 2.3574, - "step": 4785 - }, - { - "epoch": 1.48, - "learning_rate": 0.0001760431816911489, - "loss": 2.4928, - "step": 4786 - }, - { - "epoch": 1.48, - "learning_rate": 0.00017600755263191172, - "loss": 2.5376, - "step": 4787 - }, - { - "epoch": 1.48, - "learning_rate": 0.00017597192357267447, - "loss": 2.2693, - "step": 4788 - }, - { - "epoch": 1.48, - "learning_rate": 0.00017593629451343723, - "loss": 2.2265, - "step": 4789 - }, - { - "epoch": 1.48, - "learning_rate": 0.0001759006654542, - "loss": 1.8167, - "step": 4790 - }, - { - "epoch": 1.48, - "learning_rate": 0.0001758650363949628, - "loss": 4.4275, - "step": 4791 - }, - { - "epoch": 1.48, - "learning_rate": 0.00017582940733572557, - "loss": 4.6542, - "step": 4792 - }, - { - "epoch": 1.48, - "learning_rate": 0.00017579377827648833, - "loss": 4.1792, - "step": 4793 - }, - { - "epoch": 1.48, - "learning_rate": 0.00017575814921725108, - "loss": 4.284, - "step": 4794 - }, - { - "epoch": 1.48, - "learning_rate": 0.0001757225201580139, - "loss": 4.1344, - "step": 4795 - }, - { - "epoch": 1.48, - "learning_rate": 0.00017568689109877665, - "loss": 3.65, - "step": 4796 - }, - { - "epoch": 1.48, - "learning_rate": 0.00017565126203953943, - "loss": 3.8079, - "step": 4797 - }, - { - "epoch": 1.48, - "learning_rate": 0.00017561563298030218, - "loss": 3.926, - "step": 4798 - }, - { - "epoch": 1.48, - "learning_rate": 0.00017558000392106494, - "loss": 3.6262, - "step": 4799 - }, - { - "epoch": 1.48, - "learning_rate": 0.00017554437486182775, - "loss": 3.6807, - "step": 4800 - }, - { - "epoch": 1.48, - "learning_rate": 0.0001755087458025905, - "loss": 3.6051, - "step": 4801 - }, - { - "epoch": 1.48, - "learning_rate": 0.00017547311674335326, - "loss": 3.7835, - "step": 4802 - }, - { - "epoch": 1.48, - "learning_rate": 0.00017543748768411604, - "loss": 3.7049, - "step": 4803 - }, 
- { - "epoch": 1.48, - "learning_rate": 0.0001754018586248788, - "loss": 3.8645, - "step": 4804 - }, - { - "epoch": 1.48, - "learning_rate": 0.0001753662295656416, - "loss": 3.6909, - "step": 4805 - }, - { - "epoch": 1.48, - "learning_rate": 0.00017533060050640436, - "loss": 3.6918, - "step": 4806 - }, - { - "epoch": 1.48, - "learning_rate": 0.0001752949714471671, - "loss": 3.8078, - "step": 4807 - }, - { - "epoch": 1.48, - "learning_rate": 0.0001752593423879299, - "loss": 3.782, - "step": 4808 - }, - { - "epoch": 1.48, - "learning_rate": 0.00017522371332869265, - "loss": 3.5386, - "step": 4809 - }, - { - "epoch": 1.48, - "learning_rate": 0.00017518808426945546, - "loss": 3.5134, - "step": 4810 - }, - { - "epoch": 1.48, - "learning_rate": 0.0001751524552102182, - "loss": 3.6518, - "step": 4811 - }, - { - "epoch": 1.49, - "learning_rate": 0.00017511682615098097, - "loss": 3.6094, - "step": 4812 - }, - { - "epoch": 1.49, - "learning_rate": 0.00017508119709174375, - "loss": 3.345, - "step": 4813 - }, - { - "epoch": 1.49, - "learning_rate": 0.00017504556803250653, - "loss": 3.3999, - "step": 4814 - }, - { - "epoch": 1.49, - "learning_rate": 0.00017500993897326929, - "loss": 3.428, - "step": 4815 - }, - { - "epoch": 1.49, - "learning_rate": 0.00017497430991403207, - "loss": 3.6267, - "step": 4816 - }, - { - "epoch": 1.49, - "learning_rate": 0.00017493868085479482, - "loss": 3.2533, - "step": 4817 - }, - { - "epoch": 1.49, - "learning_rate": 0.00017490305179555763, - "loss": 3.4008, - "step": 4818 - }, - { - "epoch": 1.49, - "learning_rate": 0.0001748674227363204, - "loss": 3.3549, - "step": 4819 - }, - { - "epoch": 1.49, - "learning_rate": 0.00017483179367708314, - "loss": 3.4212, - "step": 4820 - }, - { - "epoch": 1.49, - "learning_rate": 0.00017479616461784592, - "loss": 3.3165, - "step": 4821 - }, - { - "epoch": 1.49, - "learning_rate": 0.00017476053555860868, - "loss": 3.1172, - "step": 4822 - }, - { - "epoch": 1.49, - "learning_rate": 0.0001747249064993715, - "loss": 3.1252, - "step": 4823 - }, - { - "epoch": 1.49, - "learning_rate": 0.00017468927744013424, - "loss": 3.1517, - "step": 4824 - }, - { - "epoch": 1.49, - "learning_rate": 0.000174653648380897, - "loss": 2.9612, - "step": 4825 - }, - { - "epoch": 1.49, - "learning_rate": 0.00017461801932165978, - "loss": 3.0524, - "step": 4826 - }, - { - "epoch": 1.49, - "learning_rate": 0.00017458239026242253, - "loss": 3.0204, - "step": 4827 - }, - { - "epoch": 1.49, - "learning_rate": 0.0001745467612031853, - "loss": 2.8512, - "step": 4828 - }, - { - "epoch": 1.49, - "learning_rate": 0.0001745111321439481, - "loss": 2.9428, - "step": 4829 - }, - { - "epoch": 1.49, - "learning_rate": 0.00017447550308471085, - "loss": 2.7993, - "step": 4830 - }, - { - "epoch": 1.49, - "learning_rate": 0.00017443987402547363, - "loss": 2.9659, - "step": 4831 - }, - { - "epoch": 1.49, - "learning_rate": 0.0001744042449662364, - "loss": 2.6619, - "step": 4832 - }, - { - "epoch": 1.49, - "learning_rate": 0.00017436861590699914, - "loss": 2.4531, - "step": 4833 - }, - { - "epoch": 1.49, - "learning_rate": 0.00017433298684776195, - "loss": 2.3437, - "step": 4834 - }, - { - "epoch": 1.49, - "learning_rate": 0.0001742973577885247, - "loss": 2.3828, - "step": 4835 - }, - { - "epoch": 1.49, - "learning_rate": 0.00017426172872928746, - "loss": 2.3757, - "step": 4836 - }, - { - "epoch": 1.49, - "learning_rate": 0.00017422609967005024, - "loss": 2.366, - "step": 4837 - }, - { - "epoch": 1.49, - "learning_rate": 0.00017419047061081303, - "loss": 2.2645, - "step": 4838 - }, - 
{ - "epoch": 1.49, - "learning_rate": 0.0001741548415515758, - "loss": 1.9537, - "step": 4839 - }, - { - "epoch": 1.49, - "learning_rate": 0.00017411921249233856, - "loss": 1.9982, - "step": 4840 - }, - { - "epoch": 1.49, - "learning_rate": 0.00017408358343310132, - "loss": 4.7958, - "step": 4841 - }, - { - "epoch": 1.49, - "learning_rate": 0.00017404795437386413, - "loss": 4.2977, - "step": 4842 - }, - { - "epoch": 1.49, - "learning_rate": 0.00017401232531462688, - "loss": 4.4349, - "step": 4843 - }, - { - "epoch": 1.5, - "learning_rate": 0.00017397669625538966, - "loss": 4.1587, - "step": 4844 - }, - { - "epoch": 1.5, - "learning_rate": 0.00017394106719615242, - "loss": 4.0518, - "step": 4845 - }, - { - "epoch": 1.5, - "learning_rate": 0.00017390543813691517, - "loss": 3.724, - "step": 4846 - }, - { - "epoch": 1.5, - "learning_rate": 0.00017386980907767798, - "loss": 3.6911, - "step": 4847 - }, - { - "epoch": 1.5, - "learning_rate": 0.00017383418001844074, - "loss": 3.7774, - "step": 4848 - }, - { - "epoch": 1.5, - "learning_rate": 0.0001737985509592035, - "loss": 3.9569, - "step": 4849 - }, - { - "epoch": 1.5, - "learning_rate": 0.00017376292189996627, - "loss": 3.8986, - "step": 4850 - }, - { - "epoch": 1.5, - "learning_rate": 0.00017372729284072903, - "loss": 3.6954, - "step": 4851 - }, - { - "epoch": 1.5, - "learning_rate": 0.00017369166378149184, - "loss": 3.4838, - "step": 4852 - }, - { - "epoch": 1.5, - "learning_rate": 0.0001736560347222546, - "loss": 3.5859, - "step": 4853 - }, - { - "epoch": 1.5, - "learning_rate": 0.00017362040566301735, - "loss": 3.5272, - "step": 4854 - }, - { - "epoch": 1.5, - "learning_rate": 0.00017358477660378013, - "loss": 3.8501, - "step": 4855 - }, - { - "epoch": 1.5, - "learning_rate": 0.00017354914754454288, - "loss": 3.8488, - "step": 4856 - }, - { - "epoch": 1.5, - "learning_rate": 0.0001735135184853057, - "loss": 3.4773, - "step": 4857 - }, - { - "epoch": 1.5, - "learning_rate": 0.00017347788942606845, - "loss": 3.0753, - "step": 4858 - }, - { - "epoch": 1.5, - "learning_rate": 0.0001734422603668312, - "loss": 3.5781, - "step": 4859 - }, - { - "epoch": 1.5, - "learning_rate": 0.00017340663130759398, - "loss": 3.6138, - "step": 4860 - }, - { - "epoch": 1.5, - "learning_rate": 0.00017337100224835676, - "loss": 3.5067, - "step": 4861 - }, - { - "epoch": 1.5, - "learning_rate": 0.00017333537318911952, - "loss": 3.3507, - "step": 4862 - }, - { - "epoch": 1.5, - "learning_rate": 0.0001732997441298823, - "loss": 3.7711, - "step": 4863 - }, - { - "epoch": 1.5, - "learning_rate": 0.00017326411507064506, - "loss": 3.5116, - "step": 4864 - }, - { - "epoch": 1.5, - "learning_rate": 0.00017322848601140787, - "loss": 3.2464, - "step": 4865 - }, - { - "epoch": 1.5, - "learning_rate": 0.00017319285695217062, - "loss": 3.6144, - "step": 4866 - }, - { - "epoch": 1.5, - "learning_rate": 0.00017315722789293337, - "loss": 3.5795, - "step": 4867 - }, - { - "epoch": 1.5, - "learning_rate": 0.00017312159883369616, - "loss": 3.1394, - "step": 4868 - }, - { - "epoch": 1.5, - "learning_rate": 0.0001730859697744589, - "loss": 3.3071, - "step": 4869 - }, - { - "epoch": 1.5, - "learning_rate": 0.00017305034071522167, - "loss": 3.1332, - "step": 4870 - }, - { - "epoch": 1.5, - "learning_rate": 0.00017301471165598448, - "loss": 3.0444, - "step": 4871 - }, - { - "epoch": 1.5, - "learning_rate": 0.00017297908259674723, - "loss": 3.1078, - "step": 4872 - }, - { - "epoch": 1.5, - "learning_rate": 0.00017294345353751, - "loss": 3.1978, - "step": 4873 - }, - { - "epoch": 1.5, - 
"learning_rate": 0.00017290782447827277, - "loss": 3.2887, - "step": 4874 - }, - { - "epoch": 1.5, - "learning_rate": 0.00017287219541903552, - "loss": 3.3596, - "step": 4875 - }, - { - "epoch": 1.5, - "learning_rate": 0.00017283656635979833, - "loss": 3.1373, - "step": 4876 - }, - { - "epoch": 1.51, - "learning_rate": 0.00017280093730056109, - "loss": 2.9264, - "step": 4877 - }, - { - "epoch": 1.51, - "learning_rate": 0.00017276530824132387, - "loss": 2.8808, - "step": 4878 - }, - { - "epoch": 1.51, - "learning_rate": 0.00017272967918208662, - "loss": 2.81, - "step": 4879 - }, - { - "epoch": 1.51, - "learning_rate": 0.00017269405012284938, - "loss": 2.5874, - "step": 4880 - }, - { - "epoch": 1.51, - "learning_rate": 0.00017265842106361219, - "loss": 2.8685, - "step": 4881 - }, - { - "epoch": 1.51, - "learning_rate": 0.00017262279200437494, - "loss": 2.642, - "step": 4882 - }, - { - "epoch": 1.51, - "learning_rate": 0.0001725871629451377, - "loss": 2.4158, - "step": 4883 - }, - { - "epoch": 1.51, - "learning_rate": 0.00017255153388590048, - "loss": 2.2786, - "step": 4884 - }, - { - "epoch": 1.51, - "learning_rate": 0.00017251590482666326, - "loss": 2.2767, - "step": 4885 - }, - { - "epoch": 1.51, - "learning_rate": 0.00017248027576742604, - "loss": 2.1929, - "step": 4886 - }, - { - "epoch": 1.51, - "learning_rate": 0.0001724446467081888, - "loss": 2.321, - "step": 4887 - }, - { - "epoch": 1.51, - "learning_rate": 0.00017240901764895155, - "loss": 1.9975, - "step": 4888 - }, - { - "epoch": 1.51, - "learning_rate": 0.00017237338858971436, - "loss": 2.1176, - "step": 4889 - }, - { - "epoch": 1.51, - "learning_rate": 0.00017233775953047711, - "loss": 2.1125, - "step": 4890 - }, - { - "epoch": 1.51, - "learning_rate": 0.0001723021304712399, - "loss": 4.5649, - "step": 4891 - }, - { - "epoch": 1.51, - "learning_rate": 0.00017226650141200265, - "loss": 4.4347, - "step": 4892 - }, - { - "epoch": 1.51, - "learning_rate": 0.0001722308723527654, - "loss": 4.1302, - "step": 4893 - }, - { - "epoch": 1.51, - "learning_rate": 0.00017219524329352821, - "loss": 4.1306, - "step": 4894 - }, - { - "epoch": 1.51, - "learning_rate": 0.00017215961423429097, - "loss": 4.1397, - "step": 4895 - }, - { - "epoch": 1.51, - "learning_rate": 0.00017212398517505372, - "loss": 3.998, - "step": 4896 - }, - { - "epoch": 1.51, - "learning_rate": 0.0001720883561158165, - "loss": 3.9344, - "step": 4897 - }, - { - "epoch": 1.51, - "learning_rate": 0.00017205272705657926, - "loss": 4.0342, - "step": 4898 - }, - { - "epoch": 1.51, - "learning_rate": 0.00017201709799734207, - "loss": 3.9455, - "step": 4899 - }, - { - "epoch": 1.51, - "learning_rate": 0.00017198146893810482, - "loss": 3.5029, - "step": 4900 - }, - { - "epoch": 1.51, - "learning_rate": 0.00017194583987886758, - "loss": 3.7697, - "step": 4901 - }, - { - "epoch": 1.51, - "learning_rate": 0.00017191021081963036, - "loss": 3.8397, - "step": 4902 - }, - { - "epoch": 1.51, - "learning_rate": 0.00017187458176039312, - "loss": 3.6819, - "step": 4903 - }, - { - "epoch": 1.51, - "learning_rate": 0.0001718389527011559, - "loss": 3.7436, - "step": 4904 - }, - { - "epoch": 1.51, - "learning_rate": 0.00017180332364191868, - "loss": 3.7442, - "step": 4905 - }, - { - "epoch": 1.51, - "learning_rate": 0.00017176769458268143, - "loss": 3.5739, - "step": 4906 - }, - { - "epoch": 1.51, - "learning_rate": 0.00017173206552344422, - "loss": 3.7032, - "step": 4907 - }, - { - "epoch": 1.51, - "learning_rate": 0.000171696436464207, - "loss": 3.5674, - "step": 4908 - }, - { - "epoch": 1.52, - 
"learning_rate": 0.00017166080740496975, - "loss": 3.3831, - "step": 4909 - }, - { - "epoch": 1.52, - "learning_rate": 0.00017162517834573253, - "loss": 3.534, - "step": 4910 - }, - { - "epoch": 1.52, - "learning_rate": 0.0001715895492864953, - "loss": 3.4172, - "step": 4911 - }, - { - "epoch": 1.52, - "learning_rate": 0.0001715539202272581, - "loss": 3.4799, - "step": 4912 - }, - { - "epoch": 1.52, - "learning_rate": 0.00017151829116802085, - "loss": 3.5243, - "step": 4913 - }, - { - "epoch": 1.52, - "learning_rate": 0.0001714826621087836, - "loss": 3.3501, - "step": 4914 - }, - { - "epoch": 1.52, - "learning_rate": 0.0001714470330495464, - "loss": 3.4357, - "step": 4915 - }, - { - "epoch": 1.52, - "learning_rate": 0.00017141140399030914, - "loss": 3.3698, - "step": 4916 - }, - { - "epoch": 1.52, - "learning_rate": 0.0001713757749310719, - "loss": 3.3821, - "step": 4917 - }, - { - "epoch": 1.52, - "learning_rate": 0.0001713401458718347, - "loss": 3.1376, - "step": 4918 - }, - { - "epoch": 1.52, - "learning_rate": 0.00017130451681259746, - "loss": 3.41, - "step": 4919 - }, - { - "epoch": 1.52, - "learning_rate": 0.00017126888775336024, - "loss": 3.3301, - "step": 4920 - }, - { - "epoch": 1.52, - "learning_rate": 0.000171233258694123, - "loss": 3.0893, - "step": 4921 - }, - { - "epoch": 1.52, - "learning_rate": 0.00017119762963488575, - "loss": 3.0963, - "step": 4922 - }, - { - "epoch": 1.52, - "learning_rate": 0.00017116200057564856, - "loss": 2.9594, - "step": 4923 - }, - { - "epoch": 1.52, - "learning_rate": 0.00017112637151641132, - "loss": 2.8193, - "step": 4924 - }, - { - "epoch": 1.52, - "learning_rate": 0.0001710907424571741, - "loss": 3.2229, - "step": 4925 - }, - { - "epoch": 1.52, - "learning_rate": 0.00017105511339793685, - "loss": 3.1508, - "step": 4926 - }, - { - "epoch": 1.52, - "learning_rate": 0.0001710194843386996, - "loss": 3.1167, - "step": 4927 - }, - { - "epoch": 1.52, - "learning_rate": 0.00017098385527946242, - "loss": 2.8698, - "step": 4928 - }, - { - "epoch": 1.52, - "learning_rate": 0.00017094822622022517, - "loss": 2.9609, - "step": 4929 - }, - { - "epoch": 1.52, - "learning_rate": 0.00017091259716098793, - "loss": 2.8707, - "step": 4930 - }, - { - "epoch": 1.52, - "learning_rate": 0.0001708769681017507, - "loss": 2.6924, - "step": 4931 - }, - { - "epoch": 1.52, - "learning_rate": 0.0001708413390425135, - "loss": 2.7957, - "step": 4932 - }, - { - "epoch": 1.52, - "learning_rate": 0.00017080570998327627, - "loss": 2.4449, - "step": 4933 - }, - { - "epoch": 1.52, - "learning_rate": 0.00017077008092403903, - "loss": 2.3071, - "step": 4934 - }, - { - "epoch": 1.52, - "learning_rate": 0.00017073445186480178, - "loss": 2.5278, - "step": 4935 - }, - { - "epoch": 1.52, - "learning_rate": 0.0001706988228055646, - "loss": 2.3266, - "step": 4936 - }, - { - "epoch": 1.52, - "learning_rate": 0.00017066319374632735, - "loss": 2.1352, - "step": 4937 - }, - { - "epoch": 1.52, - "learning_rate": 0.00017062756468709013, - "loss": 2.1713, - "step": 4938 - }, - { - "epoch": 1.52, - "learning_rate": 0.00017059193562785288, - "loss": 1.9682, - "step": 4939 - }, - { - "epoch": 1.52, - "learning_rate": 0.00017055630656861564, - "loss": 2.0487, - "step": 4940 - }, - { - "epoch": 1.52, - "learning_rate": 0.00017052067750937845, - "loss": 4.6505, - "step": 4941 - }, - { - "epoch": 1.53, - "learning_rate": 0.0001704850484501412, - "loss": 4.4596, - "step": 4942 - }, - { - "epoch": 1.53, - "learning_rate": 0.00017044941939090396, - "loss": 3.9328, - "step": 4943 - }, - { - "epoch": 1.53, - 
"learning_rate": 0.00017041379033166674, - "loss": 3.8647, - "step": 4944 - }, - { - "epoch": 1.53, - "learning_rate": 0.0001703781612724295, - "loss": 3.9741, - "step": 4945 - }, - { - "epoch": 1.53, - "learning_rate": 0.0001703425322131923, - "loss": 3.7937, - "step": 4946 - }, - { - "epoch": 1.53, - "learning_rate": 0.00017030690315395506, - "loss": 3.498, - "step": 4947 - }, - { - "epoch": 1.53, - "learning_rate": 0.0001702712740947178, - "loss": 3.6309, - "step": 4948 - }, - { - "epoch": 1.53, - "learning_rate": 0.0001702356450354806, - "loss": 3.7367, - "step": 4949 - }, - { - "epoch": 1.53, - "learning_rate": 0.00017020001597624335, - "loss": 3.757, - "step": 4950 - }, - { - "epoch": 1.53, - "learning_rate": 0.00017016438691700613, - "loss": 3.8489, - "step": 4951 - }, - { - "epoch": 1.53, - "learning_rate": 0.0001701287578577689, - "loss": 3.7928, - "step": 4952 - }, - { - "epoch": 1.53, - "learning_rate": 0.00017009312879853167, - "loss": 3.6257, - "step": 4953 - }, - { - "epoch": 1.53, - "learning_rate": 0.00017005749973929445, - "loss": 3.5836, - "step": 4954 - }, - { - "epoch": 1.53, - "learning_rate": 0.00017002187068005723, - "loss": 3.5459, - "step": 4955 - }, - { - "epoch": 1.53, - "learning_rate": 0.00016998624162081999, - "loss": 3.819, - "step": 4956 - }, - { - "epoch": 1.53, - "learning_rate": 0.00016995061256158277, - "loss": 3.5434, - "step": 4957 - }, - { - "epoch": 1.53, - "learning_rate": 0.00016991498350234552, - "loss": 3.466, - "step": 4958 - }, - { - "epoch": 1.53, - "learning_rate": 0.00016987935444310833, - "loss": 3.7567, - "step": 4959 - }, - { - "epoch": 1.53, - "learning_rate": 0.00016984372538387109, - "loss": 3.6599, - "step": 4960 - }, - { - "epoch": 1.53, - "learning_rate": 0.00016980809632463384, - "loss": 3.326, - "step": 4961 - }, - { - "epoch": 1.53, - "learning_rate": 0.00016977246726539662, - "loss": 3.5011, - "step": 4962 - }, - { - "epoch": 1.53, - "learning_rate": 0.00016973683820615938, - "loss": 3.3324, - "step": 4963 - }, - { - "epoch": 1.53, - "learning_rate": 0.00016970120914692213, - "loss": 3.3067, - "step": 4964 - }, - { - "epoch": 1.53, - "learning_rate": 0.00016966558008768494, - "loss": 3.2229, - "step": 4965 - }, - { - "epoch": 1.53, - "learning_rate": 0.0001696299510284477, - "loss": 3.2735, - "step": 4966 - }, - { - "epoch": 1.53, - "learning_rate": 0.00016959432196921048, - "loss": 3.3442, - "step": 4967 - }, - { - "epoch": 1.53, - "learning_rate": 0.00016955869290997323, - "loss": 3.376, - "step": 4968 - }, - { - "epoch": 1.53, - "learning_rate": 0.000169523063850736, - "loss": 3.1062, - "step": 4969 - }, - { - "epoch": 1.53, - "learning_rate": 0.0001694874347914988, - "loss": 3.4486, - "step": 4970 - }, - { - "epoch": 1.53, - "learning_rate": 0.00016945180573226155, - "loss": 3.055, - "step": 4971 - }, - { - "epoch": 1.53, - "learning_rate": 0.00016941617667302433, - "loss": 3.3777, - "step": 4972 - }, - { - "epoch": 1.53, - "learning_rate": 0.0001693805476137871, - "loss": 2.9605, - "step": 4973 - }, - { - "epoch": 1.54, - "learning_rate": 0.00016934491855454984, - "loss": 3.0799, - "step": 4974 - }, - { - "epoch": 1.54, - "learning_rate": 0.00016930928949531265, - "loss": 3.101, - "step": 4975 - }, - { - "epoch": 1.54, - "learning_rate": 0.0001692736604360754, - "loss": 2.8785, - "step": 4976 - }, - { - "epoch": 1.54, - "learning_rate": 0.00016923803137683816, - "loss": 3.0342, - "step": 4977 - }, - { - "epoch": 1.54, - "learning_rate": 0.00016920240231760094, - "loss": 2.9606, - "step": 4978 - }, - { - "epoch": 1.54, - 
"learning_rate": 0.00016916677325836373, - "loss": 2.8398, - "step": 4979 - }, - { - "epoch": 1.54, - "learning_rate": 0.0001691311441991265, - "loss": 2.8264, - "step": 4980 - }, - { - "epoch": 1.54, - "learning_rate": 0.00016909551513988926, - "loss": 2.7263, - "step": 4981 - }, - { - "epoch": 1.54, - "learning_rate": 0.00016905988608065202, - "loss": 2.4636, - "step": 4982 - }, - { - "epoch": 1.54, - "learning_rate": 0.00016902425702141483, - "loss": 2.6144, - "step": 4983 - }, - { - "epoch": 1.54, - "learning_rate": 0.00016898862796217758, - "loss": 2.4867, - "step": 4984 - }, - { - "epoch": 1.54, - "learning_rate": 0.00016895299890294033, - "loss": 2.4298, - "step": 4985 - }, - { - "epoch": 1.54, - "learning_rate": 0.00016891736984370312, - "loss": 2.2732, - "step": 4986 - }, - { - "epoch": 1.54, - "learning_rate": 0.00016888174078446587, - "loss": 2.2657, - "step": 4987 - }, - { - "epoch": 1.54, - "learning_rate": 0.00016884611172522868, - "loss": 2.0809, - "step": 4988 - }, - { - "epoch": 1.54, - "learning_rate": 0.00016881048266599144, - "loss": 2.1756, - "step": 4989 - }, - { - "epoch": 1.54, - "learning_rate": 0.0001687748536067542, - "loss": 1.9941, - "step": 4990 - }, - { - "epoch": 1.54, - "learning_rate": 0.00016873922454751697, - "loss": 4.7083, - "step": 4991 - }, - { - "epoch": 1.54, - "learning_rate": 0.00016870359548827973, - "loss": 4.4278, - "step": 4992 - }, - { - "epoch": 1.54, - "learning_rate": 0.00016866796642904254, - "loss": 4.4681, - "step": 4993 - }, - { - "epoch": 1.54, - "learning_rate": 0.0001686323373698053, - "loss": 3.9955, - "step": 4994 - }, - { - "epoch": 1.54, - "learning_rate": 0.00016859670831056805, - "loss": 4.0113, - "step": 4995 - }, - { - "epoch": 1.54, - "learning_rate": 0.00016856107925133083, - "loss": 3.8935, - "step": 4996 - }, - { - "epoch": 1.54, - "learning_rate": 0.00016852545019209358, - "loss": 4.1576, - "step": 4997 - }, - { - "epoch": 1.54, - "learning_rate": 0.00016848982113285636, - "loss": 3.8884, - "step": 4998 - }, - { - "epoch": 1.54, - "learning_rate": 0.00016845419207361915, - "loss": 3.7954, - "step": 4999 - }, - { - "epoch": 1.54, - "learning_rate": 0.0001684185630143819, - "loss": 3.856, - "step": 5000 - }, - { - "epoch": 1.54, - "eval_bleu": 2.2622740788922234e-13, - "eval_loss": 4.170206069946289, - "eval_runtime": 2580.4115, - "eval_samples_per_second": 5.72, - "eval_steps_per_second": 0.715, - "step": 5000 - }, - { - "epoch": 1.54, - "learning_rate": 0.00016838293395514468, - "loss": 3.5963, - "step": 5001 - }, - { - "epoch": 1.54, - "learning_rate": 0.00016834730489590746, - "loss": 3.9321, - "step": 5002 - }, - { - "epoch": 1.54, - "learning_rate": 0.00016831167583667022, - "loss": 3.8039, - "step": 5003 - }, - { - "epoch": 1.54, - "learning_rate": 0.000168276046777433, - "loss": 3.654, - "step": 5004 - }, - { - "epoch": 1.54, - "learning_rate": 0.00016824041771819576, - "loss": 3.6374, - "step": 5005 - }, - { - "epoch": 1.55, - "learning_rate": 0.00016820478865895856, - "loss": 3.4612, - "step": 5006 - }, - { - "epoch": 1.55, - "learning_rate": 0.00016816915959972132, - "loss": 3.3838, - "step": 5007 - }, - { - "epoch": 1.55, - "learning_rate": 0.00016813353054048407, - "loss": 3.5514, - "step": 5008 - }, - { - "epoch": 1.55, - "learning_rate": 0.00016809790148124686, - "loss": 3.5937, - "step": 5009 - }, - { - "epoch": 1.55, - "learning_rate": 0.0001680622724220096, - "loss": 3.7692, - "step": 5010 - }, - { - "epoch": 1.55, - "learning_rate": 0.00016802664336277237, - "loss": 3.8193, - "step": 5011 - }, - { - 
"epoch": 1.55, - "learning_rate": 0.00016799101430353517, - "loss": 3.7518, - "step": 5012 - }, - { - "epoch": 1.55, - "learning_rate": 0.00016795538524429793, - "loss": 3.4959, - "step": 5013 - }, - { - "epoch": 1.55, - "learning_rate": 0.0001679197561850607, - "loss": 3.6865, - "step": 5014 - }, - { - "epoch": 1.55, - "learning_rate": 0.00016788412712582347, - "loss": 3.5194, - "step": 5015 - }, - { - "epoch": 1.55, - "learning_rate": 0.00016784849806658622, - "loss": 3.6062, - "step": 5016 - }, - { - "epoch": 1.55, - "learning_rate": 0.00016781286900734903, - "loss": 3.0849, - "step": 5017 - }, - { - "epoch": 1.55, - "learning_rate": 0.00016777723994811178, - "loss": 3.3316, - "step": 5018 - }, - { - "epoch": 1.55, - "learning_rate": 0.00016774161088887457, - "loss": 3.5343, - "step": 5019 - }, - { - "epoch": 1.55, - "learning_rate": 0.00016770598182963732, - "loss": 3.1095, - "step": 5020 - }, - { - "epoch": 1.55, - "learning_rate": 0.00016767035277040008, - "loss": 3.2945, - "step": 5021 - }, - { - "epoch": 1.55, - "learning_rate": 0.00016763472371116288, - "loss": 3.0817, - "step": 5022 - }, - { - "epoch": 1.55, - "learning_rate": 0.00016759909465192564, - "loss": 3.0471, - "step": 5023 - }, - { - "epoch": 1.55, - "learning_rate": 0.0001675634655926884, - "loss": 2.9613, - "step": 5024 - }, - { - "epoch": 1.55, - "learning_rate": 0.00016752783653345118, - "loss": 3.1743, - "step": 5025 - }, - { - "epoch": 1.55, - "learning_rate": 0.00016749220747421396, - "loss": 3.0343, - "step": 5026 - }, - { - "epoch": 1.55, - "learning_rate": 0.00016745657841497674, - "loss": 2.9221, - "step": 5027 - }, - { - "epoch": 1.55, - "learning_rate": 0.0001674209493557395, - "loss": 2.9922, - "step": 5028 - }, - { - "epoch": 1.55, - "learning_rate": 0.00016738532029650225, - "loss": 2.8047, - "step": 5029 - }, - { - "epoch": 1.55, - "learning_rate": 0.00016734969123726506, - "loss": 2.7419, - "step": 5030 - }, - { - "epoch": 1.55, - "learning_rate": 0.0001673140621780278, - "loss": 2.9277, - "step": 5031 - }, - { - "epoch": 1.55, - "learning_rate": 0.00016727843311879057, - "loss": 3.0666, - "step": 5032 - }, - { - "epoch": 1.55, - "learning_rate": 0.00016724280405955335, - "loss": 2.6994, - "step": 5033 - }, - { - "epoch": 1.55, - "learning_rate": 0.0001672071750003161, - "loss": 2.4348, - "step": 5034 - }, - { - "epoch": 1.55, - "learning_rate": 0.00016717154594107891, - "loss": 2.4378, - "step": 5035 - }, - { - "epoch": 1.55, - "learning_rate": 0.00016713591688184167, - "loss": 2.4857, - "step": 5036 - }, - { - "epoch": 1.55, - "learning_rate": 0.00016710028782260442, - "loss": 2.2687, - "step": 5037 - }, - { - "epoch": 1.55, - "learning_rate": 0.0001670646587633672, - "loss": 2.0499, - "step": 5038 - }, - { - "epoch": 1.56, - "learning_rate": 0.00016702902970412996, - "loss": 2.1065, - "step": 5039 - }, - { - "epoch": 1.56, - "learning_rate": 0.00016699340064489277, - "loss": 2.0045, - "step": 5040 - }, - { - "epoch": 1.56, - "learning_rate": 0.00016695777158565552, - "loss": 4.7496, - "step": 5041 - }, - { - "epoch": 1.56, - "learning_rate": 0.00016692214252641828, - "loss": 4.3594, - "step": 5042 - }, - { - "epoch": 1.56, - "learning_rate": 0.00016688651346718106, - "loss": 4.255, - "step": 5043 - }, - { - "epoch": 1.56, - "learning_rate": 0.00016685088440794382, - "loss": 4.0346, - "step": 5044 - }, - { - "epoch": 1.56, - "learning_rate": 0.0001668152553487066, - "loss": 4.2133, - "step": 5045 - }, - { - "epoch": 1.56, - "learning_rate": 0.00016677962628946938, - "loss": 4.1195, - "step": 5046 - 
}, - { - "epoch": 1.56, - "learning_rate": 0.00016674399723023213, - "loss": 3.9856, - "step": 5047 - }, - { - "epoch": 1.56, - "learning_rate": 0.00016670836817099492, - "loss": 3.796, - "step": 5048 - }, - { - "epoch": 1.56, - "learning_rate": 0.0001666727391117577, - "loss": 3.9051, - "step": 5049 - }, - { - "epoch": 1.56, - "learning_rate": 0.00016663711005252045, - "loss": 3.7122, - "step": 5050 - }, - { - "epoch": 1.56, - "learning_rate": 0.00016660148099328323, - "loss": 3.6531, - "step": 5051 - }, - { - "epoch": 1.56, - "learning_rate": 0.000166565851934046, - "loss": 3.6407, - "step": 5052 - }, - { - "epoch": 1.56, - "learning_rate": 0.0001665302228748088, - "loss": 3.7328, - "step": 5053 - }, - { - "epoch": 1.56, - "learning_rate": 0.00016649459381557155, - "loss": 3.6517, - "step": 5054 - }, - { - "epoch": 1.56, - "learning_rate": 0.0001664589647563343, - "loss": 3.6565, - "step": 5055 - }, - { - "epoch": 1.56, - "learning_rate": 0.0001664233356970971, - "loss": 3.7824, - "step": 5056 - }, - { - "epoch": 1.56, - "learning_rate": 0.00016638770663785984, - "loss": 3.6188, - "step": 5057 - }, - { - "epoch": 1.56, - "learning_rate": 0.0001663520775786226, - "loss": 3.722, - "step": 5058 - }, - { - "epoch": 1.56, - "learning_rate": 0.0001663164485193854, - "loss": 3.4814, - "step": 5059 - }, - { - "epoch": 1.56, - "learning_rate": 0.00016628081946014816, - "loss": 3.7033, - "step": 5060 - }, - { - "epoch": 1.56, - "learning_rate": 0.00016624519040091094, - "loss": 3.5496, - "step": 5061 - }, - { - "epoch": 1.56, - "learning_rate": 0.0001662095613416737, - "loss": 3.5623, - "step": 5062 - }, - { - "epoch": 1.56, - "learning_rate": 0.00016617393228243645, - "loss": 3.5102, - "step": 5063 - }, - { - "epoch": 1.56, - "learning_rate": 0.00016613830322319926, - "loss": 3.5526, - "step": 5064 - }, - { - "epoch": 1.56, - "learning_rate": 0.00016610267416396202, - "loss": 3.3809, - "step": 5065 - }, - { - "epoch": 1.56, - "learning_rate": 0.00016606704510472477, - "loss": 3.2297, - "step": 5066 - }, - { - "epoch": 1.56, - "learning_rate": 0.00016603141604548755, - "loss": 3.1222, - "step": 5067 - }, - { - "epoch": 1.56, - "learning_rate": 0.0001659957869862503, - "loss": 3.159, - "step": 5068 - }, - { - "epoch": 1.56, - "learning_rate": 0.00016596015792701312, - "loss": 2.9245, - "step": 5069 - }, - { - "epoch": 1.56, - "learning_rate": 0.00016592452886777587, - "loss": 3.0366, - "step": 5070 - }, - { - "epoch": 1.57, - "learning_rate": 0.00016588889980853863, - "loss": 3.0104, - "step": 5071 - }, - { - "epoch": 1.57, - "learning_rate": 0.0001658532707493014, - "loss": 3.1461, - "step": 5072 - }, - { - "epoch": 1.57, - "learning_rate": 0.0001658176416900642, - "loss": 3.1299, - "step": 5073 - }, - { - "epoch": 1.57, - "learning_rate": 0.00016578201263082697, - "loss": 2.9052, - "step": 5074 - }, - { - "epoch": 1.57, - "learning_rate": 0.00016574638357158973, - "loss": 3.0537, - "step": 5075 - }, - { - "epoch": 1.57, - "learning_rate": 0.00016571075451235248, - "loss": 3.0296, - "step": 5076 - }, - { - "epoch": 1.57, - "learning_rate": 0.0001656751254531153, - "loss": 2.9194, - "step": 5077 - }, - { - "epoch": 1.57, - "learning_rate": 0.00016563949639387805, - "loss": 2.9633, - "step": 5078 - }, - { - "epoch": 1.57, - "learning_rate": 0.0001656038673346408, - "loss": 2.855, - "step": 5079 - }, - { - "epoch": 1.57, - "learning_rate": 0.00016556823827540358, - "loss": 2.708, - "step": 5080 - }, - { - "epoch": 1.57, - "learning_rate": 0.00016553260921616634, - "loss": 2.7977, - "step": 5081 - }, 
- { - "epoch": 1.57, - "learning_rate": 0.00016549698015692915, - "loss": 2.4751, - "step": 5082 - }, - { - "epoch": 1.57, - "learning_rate": 0.0001654613510976919, - "loss": 2.6037, - "step": 5083 - }, - { - "epoch": 1.57, - "learning_rate": 0.00016542572203845466, - "loss": 2.7135, - "step": 5084 - }, - { - "epoch": 1.57, - "learning_rate": 0.00016539009297921744, - "loss": 2.4936, - "step": 5085 - }, - { - "epoch": 1.57, - "learning_rate": 0.0001653544639199802, - "loss": 2.2641, - "step": 5086 - }, - { - "epoch": 1.57, - "learning_rate": 0.000165318834860743, - "loss": 2.2471, - "step": 5087 - }, - { - "epoch": 1.57, - "learning_rate": 0.00016528320580150576, - "loss": 2.1959, - "step": 5088 - }, - { - "epoch": 1.57, - "learning_rate": 0.0001652475767422685, - "loss": 1.9294, - "step": 5089 - }, - { - "epoch": 1.57, - "learning_rate": 0.0001652119476830313, - "loss": 2.0809, - "step": 5090 - }, - { - "epoch": 1.57, - "learning_rate": 0.00016517631862379405, - "loss": 4.5385, - "step": 5091 - }, - { - "epoch": 1.57, - "learning_rate": 0.00016514068956455683, - "loss": 4.1514, - "step": 5092 - }, - { - "epoch": 1.57, - "learning_rate": 0.0001651050605053196, - "loss": 4.2218, - "step": 5093 - }, - { - "epoch": 1.57, - "learning_rate": 0.00016506943144608237, - "loss": 4.0605, - "step": 5094 - }, - { - "epoch": 1.57, - "learning_rate": 0.00016503380238684515, - "loss": 4.1388, - "step": 5095 - }, - { - "epoch": 1.57, - "learning_rate": 0.00016499817332760793, - "loss": 3.6764, - "step": 5096 - }, - { - "epoch": 1.57, - "learning_rate": 0.00016496254426837069, - "loss": 4.0067, - "step": 5097 - }, - { - "epoch": 1.57, - "learning_rate": 0.00016492691520913347, - "loss": 3.8095, - "step": 5098 - }, - { - "epoch": 1.57, - "learning_rate": 0.00016489128614989622, - "loss": 3.9261, - "step": 5099 - }, - { - "epoch": 1.57, - "learning_rate": 0.00016485565709065898, - "loss": 3.6464, - "step": 5100 - }, - { - "epoch": 1.57, - "learning_rate": 0.00016482002803142179, - "loss": 3.7204, - "step": 5101 - }, - { - "epoch": 1.57, - "learning_rate": 0.00016478439897218454, - "loss": 3.4486, - "step": 5102 - }, - { - "epoch": 1.57, - "learning_rate": 0.00016474876991294732, - "loss": 3.8439, - "step": 5103 - }, - { - "epoch": 1.58, - "learning_rate": 0.00016471314085371008, - "loss": 3.8556, - "step": 5104 - }, - { - "epoch": 1.58, - "learning_rate": 0.00016467751179447283, - "loss": 3.6765, - "step": 5105 - }, - { - "epoch": 1.58, - "learning_rate": 0.00016464188273523564, - "loss": 3.7793, - "step": 5106 - }, - { - "epoch": 1.58, - "learning_rate": 0.0001646062536759984, - "loss": 3.6761, - "step": 5107 - }, - { - "epoch": 1.58, - "learning_rate": 0.00016457062461676118, - "loss": 3.4965, - "step": 5108 - }, - { - "epoch": 1.58, - "learning_rate": 0.00016453499555752393, - "loss": 3.4632, - "step": 5109 - }, - { - "epoch": 1.58, - "learning_rate": 0.0001644993664982867, - "loss": 3.5923, - "step": 5110 - }, - { - "epoch": 1.58, - "learning_rate": 0.0001644637374390495, - "loss": 3.5198, - "step": 5111 - }, - { - "epoch": 1.58, - "learning_rate": 0.00016442810837981225, - "loss": 3.6408, - "step": 5112 - }, - { - "epoch": 1.58, - "learning_rate": 0.000164392479320575, - "loss": 3.4747, - "step": 5113 - }, - { - "epoch": 1.58, - "learning_rate": 0.0001643568502613378, - "loss": 3.4336, - "step": 5114 - }, - { - "epoch": 1.58, - "learning_rate": 0.00016432122120210054, - "loss": 3.2525, - "step": 5115 - }, - { - "epoch": 1.58, - "learning_rate": 0.00016428559214286335, - "loss": 3.4363, - "step": 5116 - 
}, - { - "epoch": 1.58, - "learning_rate": 0.0001642499630836261, - "loss": 3.4483, - "step": 5117 - }, - { - "epoch": 1.58, - "learning_rate": 0.0001642143340243889, - "loss": 3.4264, - "step": 5118 - }, - { - "epoch": 1.58, - "learning_rate": 0.00016417870496515167, - "loss": 3.1427, - "step": 5119 - }, - { - "epoch": 1.58, - "learning_rate": 0.00016414307590591442, - "loss": 3.2216, - "step": 5120 - }, - { - "epoch": 1.58, - "learning_rate": 0.00016410744684667718, - "loss": 3.0903, - "step": 5121 - }, - { - "epoch": 1.58, - "learning_rate": 0.00016407181778743996, - "loss": 2.8815, - "step": 5122 - }, - { - "epoch": 1.58, - "learning_rate": 0.00016403618872820274, - "loss": 3.3242, - "step": 5123 - }, - { - "epoch": 1.58, - "learning_rate": 0.0001640005596689655, - "loss": 3.1474, - "step": 5124 - }, - { - "epoch": 1.58, - "learning_rate": 0.00016396493060972828, - "loss": 3.3035, - "step": 5125 - }, - { - "epoch": 1.58, - "learning_rate": 0.00016392930155049103, - "loss": 2.9555, - "step": 5126 - }, - { - "epoch": 1.58, - "learning_rate": 0.00016389367249125382, - "loss": 2.7503, - "step": 5127 - }, - { - "epoch": 1.58, - "learning_rate": 0.00016385804343201657, - "loss": 2.9028, - "step": 5128 - }, - { - "epoch": 1.58, - "learning_rate": 0.00016382241437277935, - "loss": 2.6267, - "step": 5129 - }, - { - "epoch": 1.58, - "learning_rate": 0.00016378678531354213, - "loss": 2.553, - "step": 5130 - }, - { - "epoch": 1.58, - "learning_rate": 0.00016375115625430492, - "loss": 2.7375, - "step": 5131 - }, - { - "epoch": 1.58, - "learning_rate": 0.00016371552719506767, - "loss": 2.5582, - "step": 5132 - }, - { - "epoch": 1.58, - "learning_rate": 0.00016367989813583043, - "loss": 2.5497, - "step": 5133 - }, - { - "epoch": 1.58, - "learning_rate": 0.0001636442690765932, - "loss": 2.927, - "step": 5134 - }, - { - "epoch": 1.58, - "learning_rate": 0.000163608640017356, - "loss": 2.5739, - "step": 5135 - }, - { - "epoch": 1.59, - "learning_rate": 0.00016357301095811877, - "loss": 2.7608, - "step": 5136 - }, - { - "epoch": 1.59, - "learning_rate": 0.00016353738189888153, - "loss": 2.3265, - "step": 5137 - }, - { - "epoch": 1.59, - "learning_rate": 0.00016350175283964428, - "loss": 2.4279, - "step": 5138 - }, - { - "epoch": 1.59, - "learning_rate": 0.00016346612378040706, - "loss": 2.7313, - "step": 5139 - }, - { - "epoch": 1.59, - "learning_rate": 0.00016343049472116985, - "loss": 2.083, - "step": 5140 - }, - { - "epoch": 1.59, - "learning_rate": 0.0001633948656619326, - "loss": 4.6043, - "step": 5141 - }, - { - "epoch": 1.59, - "learning_rate": 0.00016335923660269538, - "loss": 4.2703, - "step": 5142 - }, - { - "epoch": 1.59, - "learning_rate": 0.00016332360754345816, - "loss": 4.5211, - "step": 5143 - }, - { - "epoch": 1.59, - "learning_rate": 0.00016328797848422092, - "loss": 4.322, - "step": 5144 - }, - { - "epoch": 1.59, - "learning_rate": 0.00016325234942498367, - "loss": 4.1857, - "step": 5145 - }, - { - "epoch": 1.59, - "learning_rate": 0.00016321672036574646, - "loss": 4.2685, - "step": 5146 - }, - { - "epoch": 1.59, - "learning_rate": 0.00016318109130650924, - "loss": 3.942, - "step": 5147 - }, - { - "epoch": 1.59, - "learning_rate": 0.00016314546224727202, - "loss": 4.027, - "step": 5148 - }, - { - "epoch": 1.59, - "learning_rate": 0.00016310983318803477, - "loss": 4.0084, - "step": 5149 - }, - { - "epoch": 1.59, - "learning_rate": 0.00016307420412879753, - "loss": 3.8896, - "step": 5150 - }, - { - "epoch": 1.59, - "learning_rate": 0.0001630385750695603, - "loss": 3.9545, - "step": 5151 
- }, - { - "epoch": 1.59, - "learning_rate": 0.0001630029460103231, - "loss": 4.0051, - "step": 5152 - }, - { - "epoch": 1.59, - "learning_rate": 0.00016296731695108587, - "loss": 4.0763, - "step": 5153 - }, - { - "epoch": 1.59, - "learning_rate": 0.00016293168789184863, - "loss": 3.9928, - "step": 5154 - }, - { - "epoch": 1.59, - "learning_rate": 0.0001628960588326114, - "loss": 4.1413, - "step": 5155 - }, - { - "epoch": 1.59, - "learning_rate": 0.00016286042977337417, - "loss": 3.6431, - "step": 5156 - }, - { - "epoch": 1.59, - "learning_rate": 0.00016282480071413695, - "loss": 3.6869, - "step": 5157 - }, - { - "epoch": 1.59, - "learning_rate": 0.0001627891716548997, - "loss": 3.6262, - "step": 5158 - }, - { - "epoch": 1.59, - "learning_rate": 0.00016275354259566248, - "loss": 3.5168, - "step": 5159 - }, - { - "epoch": 1.59, - "learning_rate": 0.00016271791353642527, - "loss": 3.5706, - "step": 5160 - }, - { - "epoch": 1.59, - "learning_rate": 0.00016268228447718802, - "loss": 3.466, - "step": 5161 - }, - { - "epoch": 1.59, - "learning_rate": 0.0001626466554179508, - "loss": 3.6046, - "step": 5162 - }, - { - "epoch": 1.59, - "learning_rate": 0.00016261102635871356, - "loss": 3.4636, - "step": 5163 - }, - { - "epoch": 1.59, - "learning_rate": 0.00016257539729947634, - "loss": 3.5983, - "step": 5164 - }, - { - "epoch": 1.59, - "learning_rate": 0.00016253976824023912, - "loss": 3.4472, - "step": 5165 - }, - { - "epoch": 1.59, - "learning_rate": 0.00016250413918100188, - "loss": 3.3257, - "step": 5166 - }, - { - "epoch": 1.59, - "learning_rate": 0.00016246851012176466, - "loss": 3.3518, - "step": 5167 - }, - { - "epoch": 1.6, - "learning_rate": 0.0001624328810625274, - "loss": 3.5599, - "step": 5168 - }, - { - "epoch": 1.6, - "learning_rate": 0.0001623972520032902, - "loss": 3.3584, - "step": 5169 - }, - { - "epoch": 1.6, - "learning_rate": 0.00016236162294405298, - "loss": 3.3488, - "step": 5170 - }, - { - "epoch": 1.6, - "learning_rate": 0.00016232599388481573, - "loss": 3.2606, - "step": 5171 - }, - { - "epoch": 1.6, - "learning_rate": 0.0001622903648255785, - "loss": 3.3869, - "step": 5172 - }, - { - "epoch": 1.6, - "learning_rate": 0.00016225473576634127, - "loss": 2.9113, - "step": 5173 - }, - { - "epoch": 1.6, - "learning_rate": 0.00016221910670710405, - "loss": 3.1682, - "step": 5174 - }, - { - "epoch": 1.6, - "learning_rate": 0.0001621834776478668, - "loss": 3.0919, - "step": 5175 - }, - { - "epoch": 1.6, - "learning_rate": 0.00016214784858862959, - "loss": 2.8764, - "step": 5176 - }, - { - "epoch": 1.6, - "learning_rate": 0.00016211221952939237, - "loss": 2.9131, - "step": 5177 - }, - { - "epoch": 1.6, - "learning_rate": 0.00016207659047015515, - "loss": 2.8004, - "step": 5178 - }, - { - "epoch": 1.6, - "learning_rate": 0.0001620409614109179, - "loss": 2.9883, - "step": 5179 - }, - { - "epoch": 1.6, - "learning_rate": 0.00016200533235168066, - "loss": 2.7768, - "step": 5180 - }, - { - "epoch": 1.6, - "learning_rate": 0.00016196970329244344, - "loss": 2.818, - "step": 5181 - }, - { - "epoch": 1.6, - "learning_rate": 0.00016193407423320622, - "loss": 2.2615, - "step": 5182 - }, - { - "epoch": 1.6, - "learning_rate": 0.000161898445173969, - "loss": 2.6329, - "step": 5183 - }, - { - "epoch": 1.6, - "learning_rate": 0.00016186281611473176, - "loss": 2.6819, - "step": 5184 - }, - { - "epoch": 1.6, - "learning_rate": 0.00016182718705549451, - "loss": 2.3662, - "step": 5185 - }, - { - "epoch": 1.6, - "learning_rate": 0.0001617915579962573, - "loss": 2.3583, - "step": 5186 - }, - { - 
"epoch": 1.6, - "learning_rate": 0.00016175592893702008, - "loss": 2.0906, - "step": 5187 - }, - { - "epoch": 1.6, - "learning_rate": 0.00016172029987778283, - "loss": 2.1285, - "step": 5188 - }, - { - "epoch": 1.6, - "learning_rate": 0.00016168467081854561, - "loss": 2.0296, - "step": 5189 - }, - { - "epoch": 1.6, - "learning_rate": 0.0001616490417593084, - "loss": 1.8773, - "step": 5190 - }, - { - "epoch": 1.6, - "learning_rate": 0.00016161341270007115, - "loss": 4.7715, - "step": 5191 - }, - { - "epoch": 1.6, - "learning_rate": 0.0001615777836408339, - "loss": 4.2783, - "step": 5192 - }, - { - "epoch": 1.6, - "learning_rate": 0.0001615421545815967, - "loss": 4.2123, - "step": 5193 - }, - { - "epoch": 1.6, - "learning_rate": 0.00016150652552235947, - "loss": 4.026, - "step": 5194 - }, - { - "epoch": 1.6, - "learning_rate": 0.00016147089646312225, - "loss": 3.7333, - "step": 5195 - }, - { - "epoch": 1.6, - "learning_rate": 0.000161435267403885, - "loss": 4.0494, - "step": 5196 - }, - { - "epoch": 1.6, - "learning_rate": 0.00016139963834464776, - "loss": 3.8554, - "step": 5197 - }, - { - "epoch": 1.6, - "learning_rate": 0.00016136400928541054, - "loss": 3.8574, - "step": 5198 - }, - { - "epoch": 1.6, - "learning_rate": 0.00016132838022617333, - "loss": 3.8806, - "step": 5199 - }, - { - "epoch": 1.6, - "learning_rate": 0.0001612927511669361, - "loss": 3.471, - "step": 5200 - }, - { - "epoch": 1.61, - "learning_rate": 0.00016125712210769886, - "loss": 3.7846, - "step": 5201 - }, - { - "epoch": 1.61, - "learning_rate": 0.00016122149304846164, - "loss": 3.7306, - "step": 5202 - }, - { - "epoch": 1.61, - "learning_rate": 0.0001611858639892244, - "loss": 3.554, - "step": 5203 - }, - { - "epoch": 1.61, - "learning_rate": 0.00016115023492998718, - "loss": 3.6427, - "step": 5204 - }, - { - "epoch": 1.61, - "learning_rate": 0.00016111460587074994, - "loss": 3.6213, - "step": 5205 - }, - { - "epoch": 1.61, - "learning_rate": 0.00016107897681151272, - "loss": 3.8517, - "step": 5206 - }, - { - "epoch": 1.61, - "learning_rate": 0.0001610433477522755, - "loss": 3.6802, - "step": 5207 - }, - { - "epoch": 1.61, - "learning_rate": 0.00016100771869303825, - "loss": 3.8714, - "step": 5208 - }, - { - "epoch": 1.61, - "learning_rate": 0.00016097208963380104, - "loss": 3.5008, - "step": 5209 - }, - { - "epoch": 1.61, - "learning_rate": 0.0001609364605745638, - "loss": 3.6019, - "step": 5210 - }, - { - "epoch": 1.61, - "learning_rate": 0.00016090083151532657, - "loss": 3.5131, - "step": 5211 - }, - { - "epoch": 1.61, - "learning_rate": 0.00016086520245608935, - "loss": 3.4922, - "step": 5212 - }, - { - "epoch": 1.61, - "learning_rate": 0.0001608295733968521, - "loss": 3.4151, - "step": 5213 - }, - { - "epoch": 1.61, - "learning_rate": 0.0001607939443376149, - "loss": 3.3909, - "step": 5214 - }, - { - "epoch": 1.61, - "learning_rate": 0.00016075831527837765, - "loss": 3.2531, - "step": 5215 - }, - { - "epoch": 1.61, - "learning_rate": 0.00016072268621914043, - "loss": 3.4263, - "step": 5216 - }, - { - "epoch": 1.61, - "learning_rate": 0.0001606870571599032, - "loss": 3.2036, - "step": 5217 - }, - { - "epoch": 1.61, - "learning_rate": 0.00016065142810066596, - "loss": 3.1855, - "step": 5218 - }, - { - "epoch": 1.61, - "learning_rate": 0.00016061579904142875, - "loss": 3.2582, - "step": 5219 - }, - { - "epoch": 1.61, - "learning_rate": 0.0001605801699821915, - "loss": 3.3939, - "step": 5220 - }, - { - "epoch": 1.61, - "learning_rate": 0.00016054454092295428, - "loss": 3.1356, - "step": 5221 - }, - { - "epoch": 1.61, 
- "learning_rate": 0.00016050891186371704, - "loss": 3.1504, - "step": 5222 - }, - { - "epoch": 1.61, - "learning_rate": 0.00016047328280447982, - "loss": 2.9691, - "step": 5223 - }, - { - "epoch": 1.61, - "learning_rate": 0.0001604376537452426, - "loss": 3.0669, - "step": 5224 - }, - { - "epoch": 1.61, - "learning_rate": 0.00016040202468600538, - "loss": 2.7848, - "step": 5225 - }, - { - "epoch": 1.61, - "learning_rate": 0.00016036639562676814, - "loss": 3.0582, - "step": 5226 - }, - { - "epoch": 1.61, - "learning_rate": 0.0001603307665675309, - "loss": 2.896, - "step": 5227 - }, - { - "epoch": 1.61, - "learning_rate": 0.00016029513750829367, - "loss": 2.9343, - "step": 5228 - }, - { - "epoch": 1.61, - "learning_rate": 0.00016025950844905646, - "loss": 2.8872, - "step": 5229 - }, - { - "epoch": 1.61, - "learning_rate": 0.0001602238793898192, - "loss": 2.664, - "step": 5230 - }, - { - "epoch": 1.61, - "learning_rate": 0.000160188250330582, - "loss": 2.779, - "step": 5231 - }, - { - "epoch": 1.61, - "learning_rate": 0.00016015262127134475, - "loss": 2.5402, - "step": 5232 - }, - { - "epoch": 1.62, - "learning_rate": 0.00016011699221210753, - "loss": 2.3321, - "step": 5233 - }, - { - "epoch": 1.62, - "learning_rate": 0.0001600813631528703, - "loss": 2.5587, - "step": 5234 - }, - { - "epoch": 1.62, - "learning_rate": 0.00016004573409363307, - "loss": 2.5617, - "step": 5235 - }, - { - "epoch": 1.62, - "learning_rate": 0.00016001010503439585, - "loss": 2.3759, - "step": 5236 - }, - { - "epoch": 1.62, - "learning_rate": 0.00015997447597515863, - "loss": 2.1283, - "step": 5237 - }, - { - "epoch": 1.62, - "learning_rate": 0.00015993884691592138, - "loss": 2.0047, - "step": 5238 - }, - { - "epoch": 1.62, - "learning_rate": 0.00015990321785668414, - "loss": 2.1114, - "step": 5239 - }, - { - "epoch": 1.62, - "learning_rate": 0.00015986758879744692, - "loss": 1.6458, - "step": 5240 - }, - { - "epoch": 1.62, - "learning_rate": 0.0001598319597382097, - "loss": 4.4028, - "step": 5241 - }, - { - "epoch": 1.62, - "learning_rate": 0.00015979633067897249, - "loss": 4.1807, - "step": 5242 - }, - { - "epoch": 1.62, - "learning_rate": 0.00015976070161973524, - "loss": 4.0226, - "step": 5243 - }, - { - "epoch": 1.62, - "learning_rate": 0.000159725072560498, - "loss": 4.0885, - "step": 5244 - }, - { - "epoch": 1.62, - "learning_rate": 0.00015968944350126078, - "loss": 4.0597, - "step": 5245 - }, - { - "epoch": 1.62, - "learning_rate": 0.00015965381444202356, - "loss": 3.9328, - "step": 5246 - }, - { - "epoch": 1.62, - "learning_rate": 0.0001596181853827863, - "loss": 3.9196, - "step": 5247 - }, - { - "epoch": 1.62, - "learning_rate": 0.0001595825563235491, - "loss": 3.7107, - "step": 5248 - }, - { - "epoch": 1.62, - "learning_rate": 0.00015954692726431188, - "loss": 3.7358, - "step": 5249 - }, - { - "epoch": 1.62, - "learning_rate": 0.00015951129820507463, - "loss": 3.5633, - "step": 5250 - }, - { - "epoch": 1.62, - "learning_rate": 0.00015947566914583741, - "loss": 3.6134, - "step": 5251 - }, - { - "epoch": 1.62, - "learning_rate": 0.00015944004008660017, - "loss": 3.8023, - "step": 5252 - }, - { - "epoch": 1.62, - "learning_rate": 0.00015940441102736295, - "loss": 3.564, - "step": 5253 - }, - { - "epoch": 1.62, - "learning_rate": 0.00015936878196812573, - "loss": 3.6418, - "step": 5254 - }, - { - "epoch": 1.62, - "learning_rate": 0.0001593331529088885, - "loss": 3.501, - "step": 5255 - }, - { - "epoch": 1.62, - "learning_rate": 0.00015929752384965127, - "loss": 3.7053, - "step": 5256 - }, - { - "epoch": 1.62, - 
"learning_rate": 0.00015926189479041402, - "loss": 3.7097, - "step": 5257 - }, - { - "epoch": 1.62, - "learning_rate": 0.0001592262657311768, - "loss": 3.5674, - "step": 5258 - }, - { - "epoch": 1.62, - "learning_rate": 0.0001591906366719396, - "loss": 3.4934, - "step": 5259 - }, - { - "epoch": 1.62, - "learning_rate": 0.00015915500761270234, - "loss": 3.4788, - "step": 5260 - }, - { - "epoch": 1.62, - "learning_rate": 0.00015911937855346512, - "loss": 3.512, - "step": 5261 - }, - { - "epoch": 1.62, - "learning_rate": 0.00015908374949422788, - "loss": 3.4634, - "step": 5262 - }, - { - "epoch": 1.62, - "learning_rate": 0.00015904812043499066, - "loss": 3.3822, - "step": 5263 - }, - { - "epoch": 1.62, - "learning_rate": 0.00015901249137575342, - "loss": 3.402, - "step": 5264 - }, - { - "epoch": 1.62, - "learning_rate": 0.0001589768623165162, - "loss": 3.4611, - "step": 5265 - }, - { - "epoch": 1.63, - "learning_rate": 0.00015894123325727898, - "loss": 3.3815, - "step": 5266 - }, - { - "epoch": 1.63, - "learning_rate": 0.00015890560419804173, - "loss": 3.4136, - "step": 5267 - }, - { - "epoch": 1.63, - "learning_rate": 0.00015886997513880452, - "loss": 3.422, - "step": 5268 - }, - { - "epoch": 1.63, - "learning_rate": 0.00015883434607956727, - "loss": 3.0335, - "step": 5269 - }, - { - "epoch": 1.63, - "learning_rate": 0.00015879871702033005, - "loss": 3.1037, - "step": 5270 - }, - { - "epoch": 1.63, - "learning_rate": 0.00015876308796109283, - "loss": 3.2943, - "step": 5271 - }, - { - "epoch": 1.63, - "learning_rate": 0.00015872745890185562, - "loss": 2.9649, - "step": 5272 - }, - { - "epoch": 1.63, - "learning_rate": 0.00015869182984261837, - "loss": 3.2643, - "step": 5273 - }, - { - "epoch": 1.63, - "learning_rate": 0.00015865620078338113, - "loss": 3.2008, - "step": 5274 - }, - { - "epoch": 1.63, - "learning_rate": 0.0001586205717241439, - "loss": 3.1336, - "step": 5275 - }, - { - "epoch": 1.63, - "learning_rate": 0.0001585849426649067, - "loss": 3.0109, - "step": 5276 - }, - { - "epoch": 1.63, - "learning_rate": 0.00015854931360566944, - "loss": 2.9037, - "step": 5277 - }, - { - "epoch": 1.63, - "learning_rate": 0.00015851368454643223, - "loss": 2.9982, - "step": 5278 - }, - { - "epoch": 1.63, - "learning_rate": 0.00015847805548719498, - "loss": 2.7704, - "step": 5279 - }, - { - "epoch": 1.63, - "learning_rate": 0.00015844242642795776, - "loss": 2.7598, - "step": 5280 - }, - { - "epoch": 1.63, - "learning_rate": 0.00015840679736872054, - "loss": 2.6462, - "step": 5281 - }, - { - "epoch": 1.63, - "learning_rate": 0.0001583711683094833, - "loss": 2.484, - "step": 5282 - }, - { - "epoch": 1.63, - "learning_rate": 0.00015833553925024608, - "loss": 2.3663, - "step": 5283 - }, - { - "epoch": 1.63, - "learning_rate": 0.00015829991019100886, - "loss": 2.4378, - "step": 5284 - }, - { - "epoch": 1.63, - "learning_rate": 0.00015826428113177162, - "loss": 2.4262, - "step": 5285 - }, - { - "epoch": 1.63, - "learning_rate": 0.00015822865207253437, - "loss": 2.0486, - "step": 5286 - }, - { - "epoch": 1.63, - "learning_rate": 0.00015819302301329715, - "loss": 2.0989, - "step": 5287 - }, - { - "epoch": 1.63, - "learning_rate": 0.00015815739395405994, - "loss": 2.0386, - "step": 5288 - }, - { - "epoch": 1.63, - "learning_rate": 0.00015812176489482272, - "loss": 2.0589, - "step": 5289 - }, - { - "epoch": 1.63, - "learning_rate": 0.00015808613583558547, - "loss": 1.8574, - "step": 5290 - }, - { - "epoch": 1.63, - "learning_rate": 0.00015805050677634823, - "loss": 4.5484, - "step": 5291 - }, - { - "epoch": 
1.63, - "learning_rate": 0.000158014877717111, - "loss": 4.3412, - "step": 5292 - }, - { - "epoch": 1.63, - "learning_rate": 0.0001579792486578738, - "loss": 4.0992, - "step": 5293 - }, - { - "epoch": 1.63, - "learning_rate": 0.00015794361959863655, - "loss": 3.865, - "step": 5294 - }, - { - "epoch": 1.63, - "learning_rate": 0.00015790799053939933, - "loss": 4.1363, - "step": 5295 - }, - { - "epoch": 1.63, - "learning_rate": 0.0001578723614801621, - "loss": 3.836, - "step": 5296 - }, - { - "epoch": 1.63, - "learning_rate": 0.00015783673242092486, - "loss": 3.6615, - "step": 5297 - }, - { - "epoch": 1.64, - "learning_rate": 0.00015780110336168765, - "loss": 3.9645, - "step": 5298 - }, - { - "epoch": 1.64, - "learning_rate": 0.0001577654743024504, - "loss": 3.9112, - "step": 5299 - }, - { - "epoch": 1.64, - "learning_rate": 0.00015772984524321318, - "loss": 3.9274, - "step": 5300 - }, - { - "epoch": 1.64, - "learning_rate": 0.00015769421618397597, - "loss": 3.5643, - "step": 5301 - }, - { - "epoch": 1.64, - "learning_rate": 0.00015765858712473872, - "loss": 3.7087, - "step": 5302 - }, - { - "epoch": 1.64, - "learning_rate": 0.0001576229580655015, - "loss": 3.5157, - "step": 5303 - }, - { - "epoch": 1.64, - "learning_rate": 0.00015758732900626426, - "loss": 3.6347, - "step": 5304 - }, - { - "epoch": 1.64, - "learning_rate": 0.00015755169994702704, - "loss": 3.6884, - "step": 5305 - }, - { - "epoch": 1.64, - "learning_rate": 0.00015751607088778982, - "loss": 3.6845, - "step": 5306 - }, - { - "epoch": 1.64, - "learning_rate": 0.00015748044182855258, - "loss": 3.8435, - "step": 5307 - }, - { - "epoch": 1.64, - "learning_rate": 0.00015744481276931536, - "loss": 3.3399, - "step": 5308 - }, - { - "epoch": 1.64, - "learning_rate": 0.0001574091837100781, - "loss": 3.6724, - "step": 5309 - }, - { - "epoch": 1.64, - "learning_rate": 0.0001573735546508409, - "loss": 3.4578, - "step": 5310 - }, - { - "epoch": 1.64, - "learning_rate": 0.00015733792559160365, - "loss": 3.335, - "step": 5311 - }, - { - "epoch": 1.64, - "learning_rate": 0.00015730229653236643, - "loss": 3.4325, - "step": 5312 - }, - { - "epoch": 1.64, - "learning_rate": 0.0001572666674731292, - "loss": 3.5355, - "step": 5313 - }, - { - "epoch": 1.64, - "learning_rate": 0.00015723103841389197, - "loss": 3.3331, - "step": 5314 - }, - { - "epoch": 1.64, - "learning_rate": 0.00015719540935465475, - "loss": 3.4525, - "step": 5315 - }, - { - "epoch": 1.64, - "learning_rate": 0.0001571597802954175, - "loss": 3.4096, - "step": 5316 - }, - { - "epoch": 1.64, - "learning_rate": 0.00015712415123618029, - "loss": 3.4189, - "step": 5317 - }, - { - "epoch": 1.64, - "learning_rate": 0.00015708852217694307, - "loss": 3.3634, - "step": 5318 - }, - { - "epoch": 1.64, - "learning_rate": 0.00015705289311770585, - "loss": 3.2232, - "step": 5319 - }, - { - "epoch": 1.64, - "learning_rate": 0.0001570172640584686, - "loss": 3.1236, - "step": 5320 - }, - { - "epoch": 1.64, - "learning_rate": 0.00015698163499923136, - "loss": 2.9329, - "step": 5321 - }, - { - "epoch": 1.64, - "learning_rate": 0.00015694600593999414, - "loss": 2.8678, - "step": 5322 - }, - { - "epoch": 1.64, - "learning_rate": 0.00015691037688075692, - "loss": 2.9798, - "step": 5323 - }, - { - "epoch": 1.64, - "learning_rate": 0.00015687474782151968, - "loss": 3.1061, - "step": 5324 - }, - { - "epoch": 1.64, - "learning_rate": 0.00015683911876228246, - "loss": 2.9691, - "step": 5325 - }, - { - "epoch": 1.64, - "learning_rate": 0.00015680348970304521, - "loss": 3.0512, - "step": 5326 - }, - { - 
"epoch": 1.64, - "learning_rate": 0.000156767860643808, - "loss": 2.8258, - "step": 5327 - }, - { - "epoch": 1.64, - "learning_rate": 0.00015673223158457075, - "loss": 2.7082, - "step": 5328 - }, - { - "epoch": 1.64, - "learning_rate": 0.00015669660252533353, - "loss": 2.6516, - "step": 5329 - }, - { - "epoch": 1.65, - "learning_rate": 0.00015666097346609631, - "loss": 2.7729, - "step": 5330 - }, - { - "epoch": 1.65, - "learning_rate": 0.0001566253444068591, - "loss": 2.487, - "step": 5331 - }, - { - "epoch": 1.65, - "learning_rate": 0.00015658971534762185, - "loss": 2.8002, - "step": 5332 - }, - { - "epoch": 1.65, - "learning_rate": 0.0001565540862883846, - "loss": 2.4237, - "step": 5333 - }, - { - "epoch": 1.65, - "learning_rate": 0.0001565184572291474, - "loss": 2.5043, - "step": 5334 - }, - { - "epoch": 1.65, - "learning_rate": 0.00015648282816991017, - "loss": 2.248, - "step": 5335 - }, - { - "epoch": 1.65, - "learning_rate": 0.00015644719911067295, - "loss": 2.3324, - "step": 5336 - }, - { - "epoch": 1.65, - "learning_rate": 0.0001564115700514357, - "loss": 2.2492, - "step": 5337 - }, - { - "epoch": 1.65, - "learning_rate": 0.0001563759409921985, - "loss": 2.0222, - "step": 5338 - }, - { - "epoch": 1.65, - "learning_rate": 0.00015634031193296124, - "loss": 1.8288, - "step": 5339 - }, - { - "epoch": 1.65, - "learning_rate": 0.00015630468287372402, - "loss": 2.0684, - "step": 5340 - }, - { - "epoch": 1.65, - "learning_rate": 0.00015626905381448678, - "loss": 4.6978, - "step": 5341 - }, - { - "epoch": 1.65, - "learning_rate": 0.00015623342475524956, - "loss": 4.4149, - "step": 5342 - }, - { - "epoch": 1.65, - "learning_rate": 0.00015619779569601234, - "loss": 4.1851, - "step": 5343 - }, - { - "epoch": 1.65, - "learning_rate": 0.0001561621666367751, - "loss": 4.0687, - "step": 5344 - }, - { - "epoch": 1.65, - "learning_rate": 0.00015612653757753785, - "loss": 3.7786, - "step": 5345 - }, - { - "epoch": 1.65, - "learning_rate": 0.00015609090851830063, - "loss": 3.9241, - "step": 5346 - }, - { - "epoch": 1.65, - "learning_rate": 0.00015605527945906342, - "loss": 3.9158, - "step": 5347 - }, - { - "epoch": 1.65, - "learning_rate": 0.0001560196503998262, - "loss": 3.67, - "step": 5348 - }, - { - "epoch": 1.65, - "learning_rate": 0.00015598402134058895, - "loss": 3.9437, - "step": 5349 - }, - { - "epoch": 1.65, - "learning_rate": 0.00015594839228135174, - "loss": 3.6236, - "step": 5350 - }, - { - "epoch": 1.65, - "learning_rate": 0.0001559127632221145, - "loss": 3.4508, - "step": 5351 - }, - { - "epoch": 1.65, - "learning_rate": 0.00015587713416287727, - "loss": 3.7527, - "step": 5352 - }, - { - "epoch": 1.65, - "learning_rate": 0.00015584150510364005, - "loss": 3.5063, - "step": 5353 - }, - { - "epoch": 1.65, - "learning_rate": 0.0001558058760444028, - "loss": 3.5275, - "step": 5354 - }, - { - "epoch": 1.65, - "learning_rate": 0.0001557702469851656, - "loss": 3.5498, - "step": 5355 - }, - { - "epoch": 1.65, - "learning_rate": 0.00015573461792592834, - "loss": 3.6228, - "step": 5356 - }, - { - "epoch": 1.65, - "learning_rate": 0.00015569898886669113, - "loss": 3.3563, - "step": 5357 - }, - { - "epoch": 1.65, - "learning_rate": 0.00015566335980745388, - "loss": 3.612, - "step": 5358 - }, - { - "epoch": 1.65, - "learning_rate": 0.00015562773074821666, - "loss": 3.262, - "step": 5359 - }, - { - "epoch": 1.65, - "learning_rate": 0.00015559210168897945, - "loss": 3.4583, - "step": 5360 - }, - { - "epoch": 1.65, - "learning_rate": 0.0001555564726297422, - "loss": 3.5562, - "step": 5361 - }, - { - 
"epoch": 1.65, - "learning_rate": 0.00015552084357050498, - "loss": 3.5809, - "step": 5362 - }, - { - "epoch": 1.66, - "learning_rate": 0.00015548521451126774, - "loss": 3.4615, - "step": 5363 - }, - { - "epoch": 1.66, - "learning_rate": 0.00015544958545203052, - "loss": 3.3333, - "step": 5364 - }, - { - "epoch": 1.66, - "learning_rate": 0.0001554139563927933, - "loss": 3.6466, - "step": 5365 - }, - { - "epoch": 1.66, - "learning_rate": 0.00015537832733355608, - "loss": 3.3663, - "step": 5366 - }, - { - "epoch": 1.66, - "learning_rate": 0.00015534269827431884, - "loss": 3.2724, - "step": 5367 - }, - { - "epoch": 1.66, - "learning_rate": 0.0001553070692150816, - "loss": 3.3544, - "step": 5368 - }, - { - "epoch": 1.66, - "learning_rate": 0.00015527144015584437, - "loss": 3.1482, - "step": 5369 - }, - { - "epoch": 1.66, - "learning_rate": 0.00015523581109660716, - "loss": 3.4169, - "step": 5370 - }, - { - "epoch": 1.66, - "learning_rate": 0.0001552001820373699, - "loss": 3.0173, - "step": 5371 - }, - { - "epoch": 1.66, - "learning_rate": 0.0001551645529781327, - "loss": 3.2318, - "step": 5372 - }, - { - "epoch": 1.66, - "learning_rate": 0.00015512892391889545, - "loss": 2.9219, - "step": 5373 - }, - { - "epoch": 1.66, - "learning_rate": 0.00015509329485965823, - "loss": 3.0258, - "step": 5374 - }, - { - "epoch": 1.66, - "learning_rate": 0.00015505766580042098, - "loss": 2.98, - "step": 5375 - }, - { - "epoch": 1.66, - "learning_rate": 0.00015502203674118377, - "loss": 3.0908, - "step": 5376 - }, - { - "epoch": 1.66, - "learning_rate": 0.00015498640768194655, - "loss": 2.9656, - "step": 5377 - }, - { - "epoch": 1.66, - "learning_rate": 0.00015495077862270933, - "loss": 2.6355, - "step": 5378 - }, - { - "epoch": 1.66, - "learning_rate": 0.00015491514956347208, - "loss": 2.666, - "step": 5379 - }, - { - "epoch": 1.66, - "learning_rate": 0.00015487952050423484, - "loss": 2.6063, - "step": 5380 - }, - { - "epoch": 1.66, - "learning_rate": 0.00015484389144499762, - "loss": 2.6801, - "step": 5381 - }, - { - "epoch": 1.66, - "learning_rate": 0.0001548082623857604, - "loss": 2.7131, - "step": 5382 - }, - { - "epoch": 1.66, - "learning_rate": 0.00015477263332652318, - "loss": 2.4428, - "step": 5383 - }, - { - "epoch": 1.66, - "learning_rate": 0.00015473700426728594, - "loss": 2.4866, - "step": 5384 - }, - { - "epoch": 1.66, - "learning_rate": 0.00015470137520804872, - "loss": 2.3281, - "step": 5385 - }, - { - "epoch": 1.66, - "learning_rate": 0.00015466574614881148, - "loss": 2.2687, - "step": 5386 - }, - { - "epoch": 1.66, - "learning_rate": 0.00015463011708957426, - "loss": 2.1568, - "step": 5387 - }, - { - "epoch": 1.66, - "learning_rate": 0.000154594488030337, - "loss": 1.9512, - "step": 5388 - }, - { - "epoch": 1.66, - "learning_rate": 0.0001545588589710998, - "loss": 1.9971, - "step": 5389 - }, - { - "epoch": 1.66, - "learning_rate": 0.00015452322991186258, - "loss": 1.8034, - "step": 5390 - }, - { - "epoch": 1.66, - "learning_rate": 0.00015448760085262533, - "loss": 4.5996, - "step": 5391 - }, - { - "epoch": 1.66, - "learning_rate": 0.00015445197179338809, - "loss": 4.3325, - "step": 5392 - }, - { - "epoch": 1.66, - "learning_rate": 0.00015441634273415087, - "loss": 4.3082, - "step": 5393 - }, - { - "epoch": 1.66, - "learning_rate": 0.00015438071367491365, - "loss": 4.0709, - "step": 5394 - }, - { - "epoch": 1.67, - "learning_rate": 0.00015434508461567643, - "loss": 3.9002, - "step": 5395 - }, - { - "epoch": 1.67, - "learning_rate": 0.00015430945555643919, - "loss": 3.8123, - "step": 5396 - }, - 
{ - "epoch": 1.67, - "learning_rate": 0.00015427382649720197, - "loss": 4.0367, - "step": 5397 - }, - { - "epoch": 1.67, - "learning_rate": 0.00015423819743796472, - "loss": 3.9584, - "step": 5398 - }, - { - "epoch": 1.67, - "learning_rate": 0.0001542025683787275, - "loss": 3.981, - "step": 5399 - }, - { - "epoch": 1.67, - "learning_rate": 0.0001541669393194903, - "loss": 3.7749, - "step": 5400 - }, - { - "epoch": 1.67, - "learning_rate": 0.00015413131026025304, - "loss": 3.6046, - "step": 5401 - }, - { - "epoch": 1.67, - "learning_rate": 0.00015409568120101582, - "loss": 3.6083, - "step": 5402 - }, - { - "epoch": 1.67, - "learning_rate": 0.00015406005214177858, - "loss": 3.6249, - "step": 5403 - }, - { - "epoch": 1.67, - "learning_rate": 0.00015402442308254136, - "loss": 3.6604, - "step": 5404 - }, - { - "epoch": 1.67, - "learning_rate": 0.00015398879402330411, - "loss": 3.6986, - "step": 5405 - }, - { - "epoch": 1.67, - "learning_rate": 0.0001539531649640669, - "loss": 3.5999, - "step": 5406 - }, - { - "epoch": 1.67, - "learning_rate": 0.00015391753590482968, - "loss": 3.3804, - "step": 5407 - }, - { - "epoch": 1.67, - "learning_rate": 0.00015388190684559243, - "loss": 3.8272, - "step": 5408 - }, - { - "epoch": 1.67, - "learning_rate": 0.00015384627778635522, - "loss": 3.3957, - "step": 5409 - }, - { - "epoch": 1.67, - "learning_rate": 0.00015381064872711797, - "loss": 3.4414, - "step": 5410 - }, - { - "epoch": 1.67, - "learning_rate": 0.00015377501966788075, - "loss": 3.393, - "step": 5411 - }, - { - "epoch": 1.67, - "learning_rate": 0.00015373939060864353, - "loss": 3.4137, - "step": 5412 - }, - { - "epoch": 1.67, - "learning_rate": 0.00015370376154940632, - "loss": 3.2973, - "step": 5413 - }, - { - "epoch": 1.67, - "learning_rate": 0.00015366813249016907, - "loss": 3.4753, - "step": 5414 - }, - { - "epoch": 1.67, - "learning_rate": 0.00015363250343093183, - "loss": 3.3918, - "step": 5415 - }, - { - "epoch": 1.67, - "learning_rate": 0.0001535968743716946, - "loss": 3.2406, - "step": 5416 - }, - { - "epoch": 1.67, - "learning_rate": 0.0001535612453124574, - "loss": 3.0821, - "step": 5417 - }, - { - "epoch": 1.67, - "learning_rate": 0.00015352561625322014, - "loss": 2.9332, - "step": 5418 - }, - { - "epoch": 1.67, - "learning_rate": 0.00015348998719398293, - "loss": 3.1528, - "step": 5419 - }, - { - "epoch": 1.67, - "learning_rate": 0.00015345435813474568, - "loss": 2.9657, - "step": 5420 - }, - { - "epoch": 1.67, - "learning_rate": 0.00015341872907550846, - "loss": 3.1445, - "step": 5421 - }, - { - "epoch": 1.67, - "learning_rate": 0.00015338310001627122, - "loss": 2.7626, - "step": 5422 - }, - { - "epoch": 1.67, - "learning_rate": 0.000153347470957034, - "loss": 3.0536, - "step": 5423 - }, - { - "epoch": 1.67, - "learning_rate": 0.00015331184189779678, - "loss": 2.7758, - "step": 5424 - }, - { - "epoch": 1.67, - "learning_rate": 0.00015327621283855956, - "loss": 2.69, - "step": 5425 - }, - { - "epoch": 1.67, - "learning_rate": 0.00015324058377932232, - "loss": 2.9515, - "step": 5426 - }, - { - "epoch": 1.67, - "learning_rate": 0.00015320495472008507, - "loss": 2.7783, - "step": 5427 - }, - { - "epoch": 1.68, - "learning_rate": 0.00015316932566084785, - "loss": 2.7437, - "step": 5428 - }, - { - "epoch": 1.68, - "learning_rate": 0.00015313369660161064, - "loss": 2.6291, - "step": 5429 - }, - { - "epoch": 1.68, - "learning_rate": 0.00015309806754237342, - "loss": 2.7039, - "step": 5430 - }, - { - "epoch": 1.68, - "learning_rate": 0.00015306243848313617, - "loss": 2.6126, - "step": 5431 - 
}, - { - "epoch": 1.68, - "learning_rate": 0.00015302680942389895, - "loss": 2.8178, - "step": 5432 - }, - { - "epoch": 1.68, - "learning_rate": 0.0001529911803646617, - "loss": 2.6131, - "step": 5433 - }, - { - "epoch": 1.68, - "learning_rate": 0.0001529555513054245, - "loss": 2.562, - "step": 5434 - }, - { - "epoch": 1.68, - "learning_rate": 0.00015291992224618725, - "loss": 2.3232, - "step": 5435 - }, - { - "epoch": 1.68, - "learning_rate": 0.00015288429318695003, - "loss": 2.191, - "step": 5436 - }, - { - "epoch": 1.68, - "learning_rate": 0.0001528486641277128, - "loss": 1.9938, - "step": 5437 - }, - { - "epoch": 1.68, - "learning_rate": 0.00015281303506847556, - "loss": 1.8411, - "step": 5438 - }, - { - "epoch": 1.68, - "learning_rate": 0.00015277740600923832, - "loss": 1.913, - "step": 5439 - }, - { - "epoch": 1.68, - "learning_rate": 0.0001527417769500011, - "loss": 1.8718, - "step": 5440 - }, - { - "epoch": 1.68, - "learning_rate": 0.00015270614789076388, - "loss": 4.6979, - "step": 5441 - }, - { - "epoch": 1.68, - "learning_rate": 0.00015267051883152666, - "loss": 4.1634, - "step": 5442 - }, - { - "epoch": 1.68, - "learning_rate": 0.00015263488977228942, - "loss": 4.0241, - "step": 5443 - }, - { - "epoch": 1.68, - "learning_rate": 0.0001525992607130522, - "loss": 3.9266, - "step": 5444 - }, - { - "epoch": 1.68, - "learning_rate": 0.00015256363165381496, - "loss": 3.9151, - "step": 5445 - }, - { - "epoch": 1.68, - "learning_rate": 0.00015252800259457774, - "loss": 3.7785, - "step": 5446 - }, - { - "epoch": 1.68, - "learning_rate": 0.00015249237353534052, - "loss": 3.7194, - "step": 5447 - }, - { - "epoch": 1.68, - "learning_rate": 0.00015245674447610327, - "loss": 3.811, - "step": 5448 - }, - { - "epoch": 1.68, - "learning_rate": 0.00015242111541686606, - "loss": 4.0665, - "step": 5449 - }, - { - "epoch": 1.68, - "learning_rate": 0.0001523854863576288, - "loss": 3.6027, - "step": 5450 - }, - { - "epoch": 1.68, - "learning_rate": 0.0001523498572983916, - "loss": 3.8156, - "step": 5451 - }, - { - "epoch": 1.68, - "learning_rate": 0.00015231422823915435, - "loss": 3.7478, - "step": 5452 - }, - { - "epoch": 1.68, - "learning_rate": 0.00015227859917991713, - "loss": 3.5301, - "step": 5453 - }, - { - "epoch": 1.68, - "learning_rate": 0.0001522429701206799, - "loss": 3.602, - "step": 5454 - }, - { - "epoch": 1.68, - "learning_rate": 0.00015220734106144267, - "loss": 3.6644, - "step": 5455 - }, - { - "epoch": 1.68, - "learning_rate": 0.00015217171200220545, - "loss": 3.751, - "step": 5456 - }, - { - "epoch": 1.68, - "learning_rate": 0.0001521360829429682, - "loss": 3.5989, - "step": 5457 - }, - { - "epoch": 1.68, - "learning_rate": 0.00015210045388373098, - "loss": 3.5263, - "step": 5458 - }, - { - "epoch": 1.68, - "learning_rate": 0.00015206482482449377, - "loss": 3.541, - "step": 5459 - }, - { - "epoch": 1.69, - "learning_rate": 0.00015202919576525652, - "loss": 3.3655, - "step": 5460 - }, - { - "epoch": 1.69, - "learning_rate": 0.0001519935667060193, - "loss": 3.2226, - "step": 5461 - }, - { - "epoch": 1.69, - "learning_rate": 0.00015195793764678206, - "loss": 3.4594, - "step": 5462 - }, - { - "epoch": 1.69, - "learning_rate": 0.00015192230858754484, - "loss": 3.3427, - "step": 5463 - }, - { - "epoch": 1.69, - "learning_rate": 0.00015188667952830762, - "loss": 3.4615, - "step": 5464 - }, - { - "epoch": 1.69, - "learning_rate": 0.00015185105046907038, - "loss": 3.2545, - "step": 5465 - }, - { - "epoch": 1.69, - "learning_rate": 0.00015181542140983316, - "loss": 3.311, - "step": 5466 - }, 
- { - "epoch": 1.69, - "learning_rate": 0.00015177979235059594, - "loss": 3.2136, - "step": 5467 - }, - { - "epoch": 1.69, - "learning_rate": 0.0001517441632913587, - "loss": 3.1311, - "step": 5468 - }, - { - "epoch": 1.69, - "learning_rate": 0.00015170853423212145, - "loss": 3.2447, - "step": 5469 - }, - { - "epoch": 1.69, - "learning_rate": 0.00015167290517288423, - "loss": 3.1598, - "step": 5470 - }, - { - "epoch": 1.69, - "learning_rate": 0.00015163727611364701, - "loss": 3.0128, - "step": 5471 - }, - { - "epoch": 1.69, - "learning_rate": 0.0001516016470544098, - "loss": 3.1044, - "step": 5472 - }, - { - "epoch": 1.69, - "learning_rate": 0.00015156601799517255, - "loss": 3.1648, - "step": 5473 - }, - { - "epoch": 1.69, - "learning_rate": 0.0001515303889359353, - "loss": 2.8662, - "step": 5474 - }, - { - "epoch": 1.69, - "learning_rate": 0.0001514947598766981, - "loss": 3.0589, - "step": 5475 - }, - { - "epoch": 1.69, - "learning_rate": 0.00015145913081746087, - "loss": 2.9643, - "step": 5476 - }, - { - "epoch": 1.69, - "learning_rate": 0.00015142350175822365, - "loss": 2.8947, - "step": 5477 - }, - { - "epoch": 1.69, - "learning_rate": 0.0001513878726989864, - "loss": 2.7112, - "step": 5478 - }, - { - "epoch": 1.69, - "learning_rate": 0.0001513522436397492, - "loss": 2.6601, - "step": 5479 - }, - { - "epoch": 1.69, - "learning_rate": 0.00015131661458051194, - "loss": 2.7519, - "step": 5480 - }, - { - "epoch": 1.69, - "learning_rate": 0.00015128098552127472, - "loss": 2.6266, - "step": 5481 - }, - { - "epoch": 1.69, - "learning_rate": 0.00015124535646203748, - "loss": 2.6025, - "step": 5482 - }, - { - "epoch": 1.69, - "learning_rate": 0.00015120972740280026, - "loss": 2.6708, - "step": 5483 - }, - { - "epoch": 1.69, - "learning_rate": 0.00015117409834356304, - "loss": 2.541, - "step": 5484 - }, - { - "epoch": 1.69, - "learning_rate": 0.0001511384692843258, - "loss": 2.1244, - "step": 5485 - }, - { - "epoch": 1.69, - "learning_rate": 0.00015110284022508855, - "loss": 2.1802, - "step": 5486 - }, - { - "epoch": 1.69, - "learning_rate": 0.00015106721116585133, - "loss": 2.3398, - "step": 5487 - }, - { - "epoch": 1.69, - "learning_rate": 0.00015103158210661412, - "loss": 2.3417, - "step": 5488 - }, - { - "epoch": 1.69, - "learning_rate": 0.0001509959530473769, - "loss": 1.9809, - "step": 5489 - }, - { - "epoch": 1.69, - "learning_rate": 0.00015096032398813965, - "loss": 1.8383, - "step": 5490 - }, - { - "epoch": 1.69, - "learning_rate": 0.00015092469492890243, - "loss": 4.4891, - "step": 5491 - }, - { - "epoch": 1.7, - "learning_rate": 0.0001508890658696652, - "loss": 4.4108, - "step": 5492 - }, - { - "epoch": 1.7, - "learning_rate": 0.00015085343681042797, - "loss": 4.0917, - "step": 5493 - }, - { - "epoch": 1.7, - "learning_rate": 0.00015081780775119075, - "loss": 3.9461, - "step": 5494 - }, - { - "epoch": 1.7, - "learning_rate": 0.0001507821786919535, - "loss": 3.8929, - "step": 5495 - }, - { - "epoch": 1.7, - "learning_rate": 0.0001507465496327163, - "loss": 3.8981, - "step": 5496 - }, - { - "epoch": 1.7, - "learning_rate": 0.00015071092057347904, - "loss": 3.6854, - "step": 5497 - }, - { - "epoch": 1.7, - "learning_rate": 0.00015067529151424183, - "loss": 3.6973, - "step": 5498 - }, - { - "epoch": 1.7, - "learning_rate": 0.00015063966245500458, - "loss": 3.8432, - "step": 5499 - }, - { - "epoch": 1.7, - "learning_rate": 0.00015060403339576736, - "loss": 3.6918, - "step": 5500 - }, - { - "epoch": 1.7, - "eval_bleu": 4.030183230582677e-18, - "eval_loss": 4.157788276672363, - 
"eval_runtime": 2571.6238, - "eval_samples_per_second": 5.74, - "eval_steps_per_second": 0.717, - "step": 5500 - }, - { - "epoch": 1.7, - "learning_rate": 0.00015056840433653014, - "loss": 3.658, - "step": 5501 - }, - { - "epoch": 1.7, - "learning_rate": 0.0001505327752772929, - "loss": 3.8132, - "step": 5502 - }, - { - "epoch": 1.7, - "learning_rate": 0.00015049714621805568, - "loss": 3.8641, - "step": 5503 - }, - { - "epoch": 1.7, - "learning_rate": 0.00015046151715881844, - "loss": 3.5749, - "step": 5504 - }, - { - "epoch": 1.7, - "learning_rate": 0.00015042588809958122, - "loss": 3.663, - "step": 5505 - }, - { - "epoch": 1.7, - "learning_rate": 0.000150390259040344, - "loss": 3.4505, - "step": 5506 - }, - { - "epoch": 1.7, - "learning_rate": 0.00015035462998110675, - "loss": 3.6202, - "step": 5507 - }, - { - "epoch": 1.7, - "learning_rate": 0.00015031900092186954, - "loss": 3.5025, - "step": 5508 - }, - { - "epoch": 1.7, - "learning_rate": 0.0001502833718626323, - "loss": 3.5353, - "step": 5509 - }, - { - "epoch": 1.7, - "learning_rate": 0.00015024774280339507, - "loss": 3.6737, - "step": 5510 - }, - { - "epoch": 1.7, - "learning_rate": 0.00015021211374415786, - "loss": 3.267, - "step": 5511 - }, - { - "epoch": 1.7, - "learning_rate": 0.0001501764846849206, - "loss": 3.524, - "step": 5512 - }, - { - "epoch": 1.7, - "learning_rate": 0.0001501408556256834, - "loss": 3.4764, - "step": 5513 - }, - { - "epoch": 1.7, - "learning_rate": 0.00015010522656644617, - "loss": 3.3606, - "step": 5514 - }, - { - "epoch": 1.7, - "learning_rate": 0.00015006959750720893, - "loss": 3.147, - "step": 5515 - }, - { - "epoch": 1.7, - "learning_rate": 0.00015003396844797168, - "loss": 3.2406, - "step": 5516 - }, - { - "epoch": 1.7, - "learning_rate": 0.00014999833938873447, - "loss": 3.2133, - "step": 5517 - }, - { - "epoch": 1.7, - "learning_rate": 0.00014996271032949725, - "loss": 3.5627, - "step": 5518 - }, - { - "epoch": 1.7, - "learning_rate": 0.00014992708127026003, - "loss": 3.0251, - "step": 5519 - }, - { - "epoch": 1.7, - "learning_rate": 0.00014989145221102278, - "loss": 3.3646, - "step": 5520 - }, - { - "epoch": 1.7, - "learning_rate": 0.00014985582315178554, - "loss": 2.9825, - "step": 5521 - }, - { - "epoch": 1.7, - "learning_rate": 0.00014982019409254832, - "loss": 3.1127, - "step": 5522 - }, - { - "epoch": 1.7, - "learning_rate": 0.0001497845650333111, - "loss": 3.1705, - "step": 5523 - }, - { - "epoch": 1.7, - "learning_rate": 0.00014974893597407386, - "loss": 2.7239, - "step": 5524 - }, - { - "epoch": 1.71, - "learning_rate": 0.00014971330691483664, - "loss": 3.009, - "step": 5525 - }, - { - "epoch": 1.71, - "learning_rate": 0.00014967767785559942, - "loss": 2.9662, - "step": 5526 - }, - { - "epoch": 1.71, - "learning_rate": 0.00014964204879636218, - "loss": 2.8263, - "step": 5527 - }, - { - "epoch": 1.71, - "learning_rate": 0.00014960641973712496, - "loss": 2.8388, - "step": 5528 - }, - { - "epoch": 1.71, - "learning_rate": 0.0001495707906778877, - "loss": 2.529, - "step": 5529 - }, - { - "epoch": 1.71, - "learning_rate": 0.0001495351616186505, - "loss": 2.5943, - "step": 5530 - }, - { - "epoch": 1.71, - "learning_rate": 0.00014949953255941328, - "loss": 2.5313, - "step": 5531 - }, - { - "epoch": 1.71, - "learning_rate": 0.00014946390350017603, - "loss": 2.6059, - "step": 5532 - }, - { - "epoch": 1.71, - "learning_rate": 0.00014942827444093879, - "loss": 2.3711, - "step": 5533 - }, - { - "epoch": 1.71, - "learning_rate": 0.00014939264538170157, - "loss": 2.4221, - "step": 5534 - }, - { - 
"epoch": 1.71, - "learning_rate": 0.00014935701632246435, - "loss": 2.2722, - "step": 5535 - }, - { - "epoch": 1.71, - "learning_rate": 0.00014932138726322713, - "loss": 2.0282, - "step": 5536 - }, - { - "epoch": 1.71, - "learning_rate": 0.00014928575820398989, - "loss": 2.3052, - "step": 5537 - }, - { - "epoch": 1.71, - "learning_rate": 0.00014925012914475267, - "loss": 2.1528, - "step": 5538 - }, - { - "epoch": 1.71, - "learning_rate": 0.00014921450008551542, - "loss": 1.9045, - "step": 5539 - }, - { - "epoch": 1.71, - "learning_rate": 0.0001491788710262782, - "loss": 1.6845, - "step": 5540 - }, - { - "epoch": 1.71, - "learning_rate": 0.00014914324196704096, - "loss": 4.6296, - "step": 5541 - }, - { - "epoch": 1.71, - "learning_rate": 0.00014910761290780374, - "loss": 4.1702, - "step": 5542 - }, - { - "epoch": 1.71, - "learning_rate": 0.00014907198384856652, - "loss": 4.1163, - "step": 5543 - }, - { - "epoch": 1.71, - "learning_rate": 0.00014903635478932928, - "loss": 3.6316, - "step": 5544 - }, - { - "epoch": 1.71, - "learning_rate": 0.00014900072573009206, - "loss": 4.0896, - "step": 5545 - }, - { - "epoch": 1.71, - "learning_rate": 0.00014896509667085481, - "loss": 3.6407, - "step": 5546 - }, - { - "epoch": 1.71, - "learning_rate": 0.0001489294676116176, - "loss": 3.6821, - "step": 5547 - }, - { - "epoch": 1.71, - "learning_rate": 0.00014889383855238038, - "loss": 3.5323, - "step": 5548 - }, - { - "epoch": 1.71, - "learning_rate": 0.00014885820949314313, - "loss": 3.5287, - "step": 5549 - }, - { - "epoch": 1.71, - "learning_rate": 0.00014882258043390591, - "loss": 3.4893, - "step": 5550 - }, - { - "epoch": 1.71, - "learning_rate": 0.00014878695137466867, - "loss": 3.6535, - "step": 5551 - }, - { - "epoch": 1.71, - "learning_rate": 0.00014875132231543145, - "loss": 3.5942, - "step": 5552 - }, - { - "epoch": 1.71, - "learning_rate": 0.00014871569325619423, - "loss": 3.5964, - "step": 5553 - }, - { - "epoch": 1.71, - "learning_rate": 0.000148680064196957, - "loss": 3.5881, - "step": 5554 - }, - { - "epoch": 1.71, - "learning_rate": 0.00014864443513771977, - "loss": 3.5855, - "step": 5555 - }, - { - "epoch": 1.71, - "learning_rate": 0.00014860880607848252, - "loss": 3.4316, - "step": 5556 - }, - { - "epoch": 1.72, - "learning_rate": 0.0001485731770192453, - "loss": 3.3362, - "step": 5557 - }, - { - "epoch": 1.72, - "learning_rate": 0.00014853754796000806, - "loss": 3.7759, - "step": 5558 - }, - { - "epoch": 1.72, - "learning_rate": 0.00014850191890077084, - "loss": 3.7938, - "step": 5559 - }, - { - "epoch": 1.72, - "learning_rate": 0.00014846628984153362, - "loss": 3.365, - "step": 5560 - }, - { - "epoch": 1.72, - "learning_rate": 0.0001484306607822964, - "loss": 3.2844, - "step": 5561 - }, - { - "epoch": 1.72, - "learning_rate": 0.00014839503172305916, - "loss": 3.186, - "step": 5562 - }, - { - "epoch": 1.72, - "learning_rate": 0.00014835940266382192, - "loss": 3.3805, - "step": 5563 - }, - { - "epoch": 1.72, - "learning_rate": 0.0001483237736045847, - "loss": 3.4512, - "step": 5564 - }, - { - "epoch": 1.72, - "learning_rate": 0.00014828814454534748, - "loss": 3.5641, - "step": 5565 - }, - { - "epoch": 1.72, - "learning_rate": 0.00014825251548611026, - "loss": 3.2269, - "step": 5566 - }, - { - "epoch": 1.72, - "learning_rate": 0.00014821688642687302, - "loss": 3.0515, - "step": 5567 - }, - { - "epoch": 1.72, - "learning_rate": 0.00014818125736763577, - "loss": 3.4683, - "step": 5568 - }, - { - "epoch": 1.72, - "learning_rate": 0.00014814562830839855, - "loss": 3.1459, - "step": 5569 - }, 
- { - "epoch": 1.72, - "learning_rate": 0.00014810999924916134, - "loss": 3.0918, - "step": 5570 - }, - { - "epoch": 1.72, - "learning_rate": 0.0001480743701899241, - "loss": 2.9646, - "step": 5571 - }, - { - "epoch": 1.72, - "learning_rate": 0.00014803874113068687, - "loss": 3.2952, - "step": 5572 - }, - { - "epoch": 1.72, - "learning_rate": 0.00014800311207144965, - "loss": 3.0044, - "step": 5573 - }, - { - "epoch": 1.72, - "learning_rate": 0.0001479674830122124, - "loss": 2.9747, - "step": 5574 - }, - { - "epoch": 1.72, - "learning_rate": 0.0001479318539529752, - "loss": 2.8545, - "step": 5575 - }, - { - "epoch": 1.72, - "learning_rate": 0.00014789622489373795, - "loss": 2.8722, - "step": 5576 - }, - { - "epoch": 1.72, - "learning_rate": 0.00014786059583450073, - "loss": 2.6834, - "step": 5577 - }, - { - "epoch": 1.72, - "learning_rate": 0.0001478249667752635, - "loss": 2.6967, - "step": 5578 - }, - { - "epoch": 1.72, - "learning_rate": 0.00014778933771602626, - "loss": 2.6918, - "step": 5579 - }, - { - "epoch": 1.72, - "learning_rate": 0.00014775370865678902, - "loss": 2.7595, - "step": 5580 - }, - { - "epoch": 1.72, - "learning_rate": 0.0001477180795975518, - "loss": 2.6591, - "step": 5581 - }, - { - "epoch": 1.72, - "learning_rate": 0.00014768245053831458, - "loss": 2.6631, - "step": 5582 - }, - { - "epoch": 1.72, - "learning_rate": 0.00014764682147907736, - "loss": 2.2898, - "step": 5583 - }, - { - "epoch": 1.72, - "learning_rate": 0.00014761119241984012, - "loss": 2.4094, - "step": 5584 - }, - { - "epoch": 1.72, - "learning_rate": 0.0001475755633606029, - "loss": 2.1231, - "step": 5585 - }, - { - "epoch": 1.72, - "learning_rate": 0.00014753993430136566, - "loss": 2.0845, - "step": 5586 - }, - { - "epoch": 1.72, - "learning_rate": 0.00014750430524212844, - "loss": 2.0964, - "step": 5587 - }, - { - "epoch": 1.72, - "learning_rate": 0.0001474686761828912, - "loss": 1.9888, - "step": 5588 - }, - { - "epoch": 1.72, - "learning_rate": 0.00014743304712365397, - "loss": 1.837, - "step": 5589 - }, - { - "epoch": 1.73, - "learning_rate": 0.00014739741806441676, - "loss": 1.8832, - "step": 5590 - }, - { - "epoch": 1.73, - "learning_rate": 0.0001473617890051795, - "loss": 4.2848, - "step": 5591 - }, - { - "epoch": 1.73, - "learning_rate": 0.0001473261599459423, - "loss": 4.1003, - "step": 5592 - }, - { - "epoch": 1.73, - "learning_rate": 0.00014729053088670505, - "loss": 4.1547, - "step": 5593 - }, - { - "epoch": 1.73, - "learning_rate": 0.00014725490182746783, - "loss": 4.0653, - "step": 5594 - }, - { - "epoch": 1.73, - "learning_rate": 0.0001472192727682306, - "loss": 3.7567, - "step": 5595 - }, - { - "epoch": 1.73, - "learning_rate": 0.0001471836437089934, - "loss": 3.6083, - "step": 5596 - }, - { - "epoch": 1.73, - "learning_rate": 0.00014714801464975615, - "loss": 3.6028, - "step": 5597 - }, - { - "epoch": 1.73, - "learning_rate": 0.0001471123855905189, - "loss": 3.5621, - "step": 5598 - }, - { - "epoch": 1.73, - "learning_rate": 0.00014707675653128168, - "loss": 3.6638, - "step": 5599 - }, - { - "epoch": 1.73, - "learning_rate": 0.00014704112747204447, - "loss": 3.676, - "step": 5600 - }, - { - "epoch": 1.73, - "learning_rate": 0.00014700549841280722, - "loss": 3.6808, - "step": 5601 - }, - { - "epoch": 1.73, - "learning_rate": 0.00014696986935357, - "loss": 3.6699, - "step": 5602 - }, - { - "epoch": 1.73, - "learning_rate": 0.00014693424029433276, - "loss": 3.3798, - "step": 5603 - }, - { - "epoch": 1.73, - "learning_rate": 0.00014689861123509554, - "loss": 3.6955, - "step": 5604 - }, - 
{ - "epoch": 1.73, - "learning_rate": 0.0001468629821758583, - "loss": 3.6643, - "step": 5605 - }, - { - "epoch": 1.73, - "learning_rate": 0.00014682735311662108, - "loss": 3.4743, - "step": 5606 - }, - { - "epoch": 1.73, - "learning_rate": 0.00014679172405738386, - "loss": 3.5675, - "step": 5607 - }, - { - "epoch": 1.73, - "learning_rate": 0.00014675609499814664, - "loss": 3.3213, - "step": 5608 - }, - { - "epoch": 1.73, - "learning_rate": 0.0001467204659389094, - "loss": 3.2201, - "step": 5609 - }, - { - "epoch": 1.73, - "learning_rate": 0.00014668483687967215, - "loss": 3.5433, - "step": 5610 - }, - { - "epoch": 1.73, - "learning_rate": 0.00014664920782043493, - "loss": 3.3921, - "step": 5611 - }, - { - "epoch": 1.73, - "learning_rate": 0.0001466135787611977, - "loss": 3.3963, - "step": 5612 - }, - { - "epoch": 1.73, - "learning_rate": 0.0001465779497019605, - "loss": 3.4627, - "step": 5613 - }, - { - "epoch": 1.73, - "learning_rate": 0.00014654232064272325, - "loss": 3.2099, - "step": 5614 - }, - { - "epoch": 1.73, - "learning_rate": 0.000146506691583486, - "loss": 3.3638, - "step": 5615 - }, - { - "epoch": 1.73, - "learning_rate": 0.0001464710625242488, - "loss": 3.3504, - "step": 5616 - }, - { - "epoch": 1.73, - "learning_rate": 0.00014643543346501157, - "loss": 3.3775, - "step": 5617 - }, - { - "epoch": 1.73, - "learning_rate": 0.00014639980440577432, - "loss": 3.2816, - "step": 5618 - }, - { - "epoch": 1.73, - "learning_rate": 0.0001463641753465371, - "loss": 3.1509, - "step": 5619 - }, - { - "epoch": 1.73, - "learning_rate": 0.0001463285462872999, - "loss": 2.9796, - "step": 5620 - }, - { - "epoch": 1.73, - "learning_rate": 0.00014629291722806264, - "loss": 3.0476, - "step": 5621 - }, - { - "epoch": 1.74, - "learning_rate": 0.0001462572881688254, - "loss": 2.975, - "step": 5622 - }, - { - "epoch": 1.74, - "learning_rate": 0.00014622165910958818, - "loss": 2.9154, - "step": 5623 - }, - { - "epoch": 1.74, - "learning_rate": 0.00014618603005035096, - "loss": 2.8408, - "step": 5624 - }, - { - "epoch": 1.74, - "learning_rate": 0.00014615040099111374, - "loss": 2.8701, - "step": 5625 - }, - { - "epoch": 1.74, - "learning_rate": 0.0001461147719318765, - "loss": 2.8103, - "step": 5626 - }, - { - "epoch": 1.74, - "learning_rate": 0.00014607914287263925, - "loss": 2.784, - "step": 5627 - }, - { - "epoch": 1.74, - "learning_rate": 0.00014604351381340203, - "loss": 2.7294, - "step": 5628 - }, - { - "epoch": 1.74, - "learning_rate": 0.00014600788475416482, - "loss": 2.7379, - "step": 5629 - }, - { - "epoch": 1.74, - "learning_rate": 0.0001459722556949276, - "loss": 2.3428, - "step": 5630 - }, - { - "epoch": 1.74, - "learning_rate": 0.00014593662663569035, - "loss": 2.4326, - "step": 5631 - }, - { - "epoch": 1.74, - "learning_rate": 0.00014590099757645313, - "loss": 2.3228, - "step": 5632 - }, - { - "epoch": 1.74, - "learning_rate": 0.0001458653685172159, - "loss": 2.4725, - "step": 5633 - }, - { - "epoch": 1.74, - "learning_rate": 0.00014582973945797867, - "loss": 2.2962, - "step": 5634 - }, - { - "epoch": 1.74, - "learning_rate": 0.00014579411039874143, - "loss": 2.4484, - "step": 5635 - }, - { - "epoch": 1.74, - "learning_rate": 0.0001457584813395042, - "loss": 2.2544, - "step": 5636 - }, - { - "epoch": 1.74, - "learning_rate": 0.000145722852280267, - "loss": 2.0403, - "step": 5637 - }, - { - "epoch": 1.74, - "learning_rate": 0.00014568722322102974, - "loss": 1.9084, - "step": 5638 - }, - { - "epoch": 1.74, - "learning_rate": 0.0001456515941617925, - "loss": 1.7714, - "step": 5639 - }, - { - 
"epoch": 1.74, - "learning_rate": 0.00014561596510255528, - "loss": 2.093, - "step": 5640 - }, - { - "epoch": 1.74, - "learning_rate": 0.00014558033604331806, - "loss": 4.4372, - "step": 5641 - }, - { - "epoch": 1.74, - "learning_rate": 0.00014554470698408084, - "loss": 4.3955, - "step": 5642 - }, - { - "epoch": 1.74, - "learning_rate": 0.00014550907792484363, - "loss": 4.0727, - "step": 5643 - }, - { - "epoch": 1.74, - "learning_rate": 0.00014547344886560638, - "loss": 4.1392, - "step": 5644 - }, - { - "epoch": 1.74, - "learning_rate": 0.00014543781980636914, - "loss": 3.7589, - "step": 5645 - }, - { - "epoch": 1.74, - "learning_rate": 0.00014540219074713192, - "loss": 3.8234, - "step": 5646 - }, - { - "epoch": 1.74, - "learning_rate": 0.0001453665616878947, - "loss": 3.7734, - "step": 5647 - }, - { - "epoch": 1.74, - "learning_rate": 0.00014533093262865745, - "loss": 3.6684, - "step": 5648 - }, - { - "epoch": 1.74, - "learning_rate": 0.00014529530356942024, - "loss": 3.6131, - "step": 5649 - }, - { - "epoch": 1.74, - "learning_rate": 0.000145259674510183, - "loss": 3.5179, - "step": 5650 - }, - { - "epoch": 1.74, - "learning_rate": 0.00014522404545094577, - "loss": 3.6042, - "step": 5651 - }, - { - "epoch": 1.74, - "learning_rate": 0.00014518841639170853, - "loss": 3.6474, - "step": 5652 - }, - { - "epoch": 1.74, - "learning_rate": 0.0001451527873324713, - "loss": 3.4946, - "step": 5653 - }, - { - "epoch": 1.75, - "learning_rate": 0.0001451171582732341, - "loss": 3.2698, - "step": 5654 - }, - { - "epoch": 1.75, - "learning_rate": 0.00014508152921399687, - "loss": 3.43, - "step": 5655 - }, - { - "epoch": 1.75, - "learning_rate": 0.00014504590015475963, - "loss": 3.4751, - "step": 5656 - }, - { - "epoch": 1.75, - "learning_rate": 0.00014501027109552238, - "loss": 3.5235, - "step": 5657 - }, - { - "epoch": 1.75, - "learning_rate": 0.00014497464203628516, - "loss": 3.4055, - "step": 5658 - }, - { - "epoch": 1.75, - "learning_rate": 0.00014493901297704795, - "loss": 3.3193, - "step": 5659 - }, - { - "epoch": 1.75, - "learning_rate": 0.00014490338391781073, - "loss": 3.3369, - "step": 5660 - }, - { - "epoch": 1.75, - "learning_rate": 0.00014486775485857348, - "loss": 3.2913, - "step": 5661 - }, - { - "epoch": 1.75, - "learning_rate": 0.00014483212579933624, - "loss": 3.2146, - "step": 5662 - }, - { - "epoch": 1.75, - "learning_rate": 0.00014479649674009902, - "loss": 3.5114, - "step": 5663 - }, - { - "epoch": 1.75, - "learning_rate": 0.0001447608676808618, - "loss": 3.2997, - "step": 5664 - }, - { - "epoch": 1.75, - "learning_rate": 0.00014472523862162456, - "loss": 3.1016, - "step": 5665 - }, - { - "epoch": 1.75, - "learning_rate": 0.00014468960956238734, - "loss": 3.4805, - "step": 5666 - }, - { - "epoch": 1.75, - "learning_rate": 0.00014465398050315012, - "loss": 3.4458, - "step": 5667 - }, - { - "epoch": 1.75, - "learning_rate": 0.00014461835144391287, - "loss": 2.9711, - "step": 5668 - }, - { - "epoch": 1.75, - "learning_rate": 0.00014458272238467563, - "loss": 3.1976, - "step": 5669 - }, - { - "epoch": 1.75, - "learning_rate": 0.0001445470933254384, - "loss": 3.0139, - "step": 5670 - }, - { - "epoch": 1.75, - "learning_rate": 0.0001445114642662012, - "loss": 3.0189, - "step": 5671 - }, - { - "epoch": 1.75, - "learning_rate": 0.00014447583520696398, - "loss": 2.8537, - "step": 5672 - }, - { - "epoch": 1.75, - "learning_rate": 0.00014444020614772673, - "loss": 2.9053, - "step": 5673 - }, - { - "epoch": 1.75, - "learning_rate": 0.00014440457708848948, - "loss": 2.8189, - "step": 5674 - }, - 
{ - "epoch": 1.75, - "learning_rate": 0.00014436894802925227, - "loss": 2.8961, - "step": 5675 - }, - { - "epoch": 1.75, - "learning_rate": 0.00014433331897001505, - "loss": 3.1378, - "step": 5676 - }, - { - "epoch": 1.75, - "learning_rate": 0.00014429768991077783, - "loss": 2.8621, - "step": 5677 - }, - { - "epoch": 1.75, - "learning_rate": 0.00014426206085154059, - "loss": 2.6243, - "step": 5678 - }, - { - "epoch": 1.75, - "learning_rate": 0.00014422643179230337, - "loss": 2.9517, - "step": 5679 - }, - { - "epoch": 1.75, - "learning_rate": 0.00014419080273306612, - "loss": 2.736, - "step": 5680 - }, - { - "epoch": 1.75, - "learning_rate": 0.0001441551736738289, - "loss": 2.4369, - "step": 5681 - }, - { - "epoch": 1.75, - "learning_rate": 0.00014411954461459166, - "loss": 2.3669, - "step": 5682 - }, - { - "epoch": 1.75, - "learning_rate": 0.00014408391555535444, - "loss": 2.4263, - "step": 5683 - }, - { - "epoch": 1.75, - "learning_rate": 0.00014404828649611722, - "loss": 2.6093, - "step": 5684 - }, - { - "epoch": 1.75, - "learning_rate": 0.00014401265743687998, - "loss": 2.145, - "step": 5685 - }, - { - "epoch": 1.75, - "learning_rate": 0.00014397702837764276, - "loss": 2.211, - "step": 5686 - }, - { - "epoch": 1.76, - "learning_rate": 0.00014394139931840551, - "loss": 2.1429, - "step": 5687 - }, - { - "epoch": 1.76, - "learning_rate": 0.0001439057702591683, - "loss": 1.8354, - "step": 5688 - }, - { - "epoch": 1.76, - "learning_rate": 0.00014387014119993108, - "loss": 1.7859, - "step": 5689 - }, - { - "epoch": 1.76, - "learning_rate": 0.00014383451214069386, - "loss": 1.9359, - "step": 5690 - }, - { - "epoch": 1.76, - "learning_rate": 0.00014379888308145661, - "loss": 4.3598, - "step": 5691 - }, - { - "epoch": 1.76, - "learning_rate": 0.00014376325402221937, - "loss": 4.1434, - "step": 5692 - }, - { - "epoch": 1.76, - "learning_rate": 0.00014372762496298215, - "loss": 4.1539, - "step": 5693 - }, - { - "epoch": 1.76, - "learning_rate": 0.00014369199590374493, - "loss": 4.1399, - "step": 5694 - }, - { - "epoch": 1.76, - "learning_rate": 0.0001436563668445077, - "loss": 3.6838, - "step": 5695 - }, - { - "epoch": 1.76, - "learning_rate": 0.00014362073778527047, - "loss": 3.9645, - "step": 5696 - }, - { - "epoch": 1.76, - "learning_rate": 0.00014358510872603322, - "loss": 3.7597, - "step": 5697 - }, - { - "epoch": 1.76, - "learning_rate": 0.000143549479666796, - "loss": 3.6812, - "step": 5698 - }, - { - "epoch": 1.76, - "learning_rate": 0.00014351385060755876, - "loss": 3.8042, - "step": 5699 - }, - { - "epoch": 1.76, - "learning_rate": 0.00014347822154832154, - "loss": 3.4445, - "step": 5700 - }, - { - "epoch": 1.76, - "learning_rate": 0.00014344259248908432, - "loss": 3.6026, - "step": 5701 - }, - { - "epoch": 1.76, - "learning_rate": 0.0001434069634298471, - "loss": 3.5622, - "step": 5702 - }, - { - "epoch": 1.76, - "learning_rate": 0.00014337133437060986, - "loss": 3.6085, - "step": 5703 - }, - { - "epoch": 1.76, - "learning_rate": 0.00014333570531137262, - "loss": 3.2756, - "step": 5704 - }, - { - "epoch": 1.76, - "learning_rate": 0.0001433000762521354, - "loss": 3.4594, - "step": 5705 - }, - { - "epoch": 1.76, - "learning_rate": 0.00014326444719289818, - "loss": 3.6386, - "step": 5706 - }, - { - "epoch": 1.76, - "learning_rate": 0.00014322881813366096, - "loss": 3.5918, - "step": 5707 - }, - { - "epoch": 1.76, - "learning_rate": 0.00014319318907442372, - "loss": 3.3417, - "step": 5708 - }, - { - "epoch": 1.76, - "learning_rate": 0.00014315756001518647, - "loss": 3.6192, - "step": 5709 - 
}, - { - "epoch": 1.76, - "learning_rate": 0.00014312193095594925, - "loss": 3.4743, - "step": 5710 - }, - { - "epoch": 1.76, - "learning_rate": 0.00014308630189671203, - "loss": 3.3527, - "step": 5711 - }, - { - "epoch": 1.76, - "learning_rate": 0.0001430506728374748, - "loss": 3.3838, - "step": 5712 - }, - { - "epoch": 1.76, - "learning_rate": 0.00014301504377823757, - "loss": 3.3154, - "step": 5713 - }, - { - "epoch": 1.76, - "learning_rate": 0.00014297941471900035, - "loss": 3.3733, - "step": 5714 - }, - { - "epoch": 1.76, - "learning_rate": 0.0001429437856597631, - "loss": 3.4356, - "step": 5715 - }, - { - "epoch": 1.76, - "learning_rate": 0.00014290815660052586, - "loss": 3.2536, - "step": 5716 - }, - { - "epoch": 1.76, - "learning_rate": 0.00014287252754128864, - "loss": 3.2536, - "step": 5717 - }, - { - "epoch": 1.76, - "learning_rate": 0.00014283689848205143, - "loss": 3.2298, - "step": 5718 - }, - { - "epoch": 1.77, - "learning_rate": 0.0001428012694228142, - "loss": 3.0417, - "step": 5719 - }, - { - "epoch": 1.77, - "learning_rate": 0.00014276564036357696, - "loss": 3.17, - "step": 5720 - }, - { - "epoch": 1.77, - "learning_rate": 0.00014273001130433972, - "loss": 3.173, - "step": 5721 - }, - { - "epoch": 1.77, - "learning_rate": 0.0001426943822451025, - "loss": 3.0711, - "step": 5722 - }, - { - "epoch": 1.77, - "learning_rate": 0.00014265875318586528, - "loss": 3.0671, - "step": 5723 - }, - { - "epoch": 1.77, - "learning_rate": 0.00014262312412662806, - "loss": 2.843, - "step": 5724 - }, - { - "epoch": 1.77, - "learning_rate": 0.00014258749506739082, - "loss": 3.0564, - "step": 5725 - }, - { - "epoch": 1.77, - "learning_rate": 0.0001425518660081536, - "loss": 2.8392, - "step": 5726 - }, - { - "epoch": 1.77, - "learning_rate": 0.00014251623694891635, - "loss": 2.6445, - "step": 5727 - }, - { - "epoch": 1.77, - "learning_rate": 0.00014248060788967914, - "loss": 2.5914, - "step": 5728 - }, - { - "epoch": 1.77, - "learning_rate": 0.0001424449788304419, - "loss": 2.5418, - "step": 5729 - }, - { - "epoch": 1.77, - "learning_rate": 0.00014240934977120467, - "loss": 2.4745, - "step": 5730 - }, - { - "epoch": 1.77, - "learning_rate": 0.00014237372071196746, - "loss": 2.6439, - "step": 5731 - }, - { - "epoch": 1.77, - "learning_rate": 0.0001423380916527302, - "loss": 2.5404, - "step": 5732 - }, - { - "epoch": 1.77, - "learning_rate": 0.000142302462593493, - "loss": 2.3958, - "step": 5733 - }, - { - "epoch": 1.77, - "learning_rate": 0.00014226683353425575, - "loss": 2.4156, - "step": 5734 - }, - { - "epoch": 1.77, - "learning_rate": 0.00014223120447501853, - "loss": 2.3506, - "step": 5735 - }, - { - "epoch": 1.77, - "learning_rate": 0.0001421955754157813, - "loss": 2.2906, - "step": 5736 - }, - { - "epoch": 1.77, - "learning_rate": 0.00014215994635654407, - "loss": 2.1339, - "step": 5737 - }, - { - "epoch": 1.77, - "learning_rate": 0.00014212431729730685, - "loss": 2.0886, - "step": 5738 - }, - { - "epoch": 1.77, - "learning_rate": 0.0001420886882380696, - "loss": 1.9057, - "step": 5739 - }, - { - "epoch": 1.77, - "learning_rate": 0.00014205305917883238, - "loss": 1.8557, - "step": 5740 - }, - { - "epoch": 1.77, - "learning_rate": 0.00014201743011959517, - "loss": 4.5447, - "step": 5741 - }, - { - "epoch": 1.77, - "learning_rate": 0.00014198180106035792, - "loss": 4.255, - "step": 5742 - }, - { - "epoch": 1.77, - "learning_rate": 0.0001419461720011207, - "loss": 4.0219, - "step": 5743 - }, - { - "epoch": 1.77, - "learning_rate": 0.00014191054294188346, - "loss": 3.7892, - "step": 5744 - 
}, - { - "epoch": 1.77, - "learning_rate": 0.00014187491388264624, - "loss": 3.7616, - "step": 5745 - }, - [≈695 further per-step log_history records removed in this hunk, covering steps 5746–6440 (epochs 1.77–1.99); each records "epoch", "learning_rate" (decreasing linearly from ~1.42e-4 to ~1.17e-4), "loss", and "step"; the evaluation record logged at step 6000 and the closing entries of the span are kept below] - { - "epoch": 1.85, - "eval_bleu": 2.4864461373906615e-13, - "eval_loss": 4.131202220916748, - "eval_runtime": 2568.1952, - "eval_samples_per_second": 5.747, - "eval_steps_per_second": 0.718, - "step": 6000 - }, - { - "epoch": 1.99, - "learning_rate": 0.00011707708865353478, - "loss": 4.2796, - "step": 6441 - }, - { - "epoch": 1.99, - "learning_rate": 0.00011704145959429756, -
"loss": 4.29, - "step": 6442 - }, - { - "epoch": 1.99, - "learning_rate": 0.00011700583053506033, - "loss": 3.6304, - "step": 6443 - }, - { - "epoch": 1.99, - "learning_rate": 0.00011697020147582311, - "loss": 3.9167, - "step": 6444 - }, - { - "epoch": 1.99, - "learning_rate": 0.00011693457241658587, - "loss": 3.799, - "step": 6445 - }, - { - "epoch": 1.99, - "learning_rate": 0.00011689894335734865, - "loss": 3.4665, - "step": 6446 - }, - { - "epoch": 1.99, - "learning_rate": 0.00011686331429811142, - "loss": 3.2171, - "step": 6447 - }, - { - "epoch": 1.99, - "learning_rate": 0.0001168276852388742, - "loss": 3.6258, - "step": 6448 - }, - { - "epoch": 1.99, - "learning_rate": 0.00011679205617963695, - "loss": 3.6364, - "step": 6449 - }, - { - "epoch": 1.99, - "learning_rate": 0.00011675642712039972, - "loss": 3.463, - "step": 6450 - }, - { - "epoch": 1.99, - "learning_rate": 0.0001167207980611625, - "loss": 3.452, - "step": 6451 - }, - { - "epoch": 1.99, - "learning_rate": 0.00011668516900192527, - "loss": 3.1833, - "step": 6452 - }, - { - "epoch": 1.99, - "learning_rate": 0.00011664953994268803, - "loss": 3.4622, - "step": 6453 - }, - { - "epoch": 1.99, - "learning_rate": 0.00011661391088345081, - "loss": 3.4034, - "step": 6454 - }, - { - "epoch": 1.99, - "learning_rate": 0.00011657828182421358, - "loss": 3.5632, - "step": 6455 - }, - { - "epoch": 1.99, - "learning_rate": 0.00011654265276497636, - "loss": 3.2367, - "step": 6456 - }, - { - "epoch": 1.99, - "learning_rate": 0.00011650702370573911, - "loss": 3.208, - "step": 6457 - }, - { - "epoch": 1.99, - "learning_rate": 0.0001164713946465019, - "loss": 3.3104, - "step": 6458 - }, - { - "epoch": 1.99, - "learning_rate": 0.00011643576558726466, - "loss": 3.1702, - "step": 6459 - }, - { - "epoch": 1.99, - "learning_rate": 0.00011640013652802745, - "loss": 2.971, - "step": 6460 - }, - { - "epoch": 1.99, - "learning_rate": 0.00011636450746879021, - "loss": 3.0789, - "step": 6461 - }, - { - "epoch": 1.99, - "learning_rate": 0.00011632887840955297, - "loss": 3.0441, - "step": 6462 - }, - { - "epoch": 1.99, - "learning_rate": 0.00011629324935031575, - "loss": 2.9161, - "step": 6463 - }, - { - "epoch": 1.99, - "learning_rate": 0.00011625762029107852, - "loss": 3.1718, - "step": 6464 - }, - { - "epoch": 2.0, - "learning_rate": 0.0001162219912318413, - "loss": 2.9247, - "step": 6465 - }, - { - "epoch": 2.0, - "learning_rate": 0.00011618636217260406, - "loss": 3.0687, - "step": 6466 - }, - { - "epoch": 2.0, - "learning_rate": 0.00011615073311336684, - "loss": 2.7461, - "step": 6467 - }, - { - "epoch": 2.0, - "learning_rate": 0.0001161151040541296, - "loss": 2.7759, - "step": 6468 - }, - { - "epoch": 2.0, - "learning_rate": 0.00011607947499489239, - "loss": 2.814, - "step": 6469 - }, - { - "epoch": 2.0, - "learning_rate": 0.00011604384593565514, - "loss": 2.5327, - "step": 6470 - }, - { - "epoch": 2.0, - "learning_rate": 0.00011600821687641791, - "loss": 2.4799, - "step": 6471 - }, - { - "epoch": 2.0, - "learning_rate": 0.0001159725878171807, - "loss": 2.3533, - "step": 6472 - }, - { - "epoch": 2.0, - "learning_rate": 0.00011593695875794346, - "loss": 2.6088, - "step": 6473 - }, - { - "epoch": 2.0, - "learning_rate": 0.00011590132969870624, - "loss": 2.453, - "step": 6474 - }, - { - "epoch": 2.0, - "learning_rate": 0.000115865700639469, - "loss": 2.2397, - "step": 6475 - }, - { - "epoch": 2.0, - "learning_rate": 0.00011583007158023177, - "loss": 1.9031, - "step": 6476 - }, - { - "epoch": 2.0, - "learning_rate": 0.00011579444252099455, - "loss": 
1.9986, - "step": 6477 - }, - { - "epoch": 2.0, - "learning_rate": 0.00011575881346175732, - "loss": 2.0547, - "step": 6478 - }, - { - "epoch": 2.0, - "learning_rate": 0.00011572318440252009, - "loss": 1.6775, - "step": 6479 - }, - { - "epoch": 2.0, - "learning_rate": 0.00011568755534328285, - "loss": 1.6999, - "step": 6480 - }, - { - "epoch": 2.0, - "learning_rate": 0.00011565192628404564, - "loss": 4.3112, - "step": 6481 - }, - { - "epoch": 2.0, - "learning_rate": 0.0001156162972248084, - "loss": 3.8602, - "step": 6482 - }, - { - "epoch": 2.0, - "learning_rate": 0.00011558066816557116, - "loss": 3.4041, - "step": 6483 - }, - { - "epoch": 2.0, - "learning_rate": 0.00011554503910633394, - "loss": 3.5769, - "step": 6484 - }, - { - "epoch": 2.0, - "learning_rate": 0.00011550941004709671, - "loss": 3.4436, - "step": 6485 - }, - { - "epoch": 2.0, - "learning_rate": 0.00011547378098785949, - "loss": 3.1102, - "step": 6486 - }, - { - "epoch": 2.0, - "learning_rate": 0.00011543815192862225, - "loss": 3.1508, - "step": 6487 - }, - { - "epoch": 2.0, - "learning_rate": 0.00011540252286938501, - "loss": 3.0366, - "step": 6488 - }, - { - "epoch": 2.0, - "learning_rate": 0.0001153668938101478, - "loss": 3.2328, - "step": 6489 - }, - { - "epoch": 2.0, - "learning_rate": 0.00011533126475091056, - "loss": 2.8792, - "step": 6490 - }, - { - "epoch": 2.0, - "learning_rate": 0.00011529563569167335, - "loss": 2.9948, - "step": 6491 - }, - { - "epoch": 2.0, - "learning_rate": 0.0001152600066324361, - "loss": 3.0081, - "step": 6492 - }, - { - "epoch": 2.0, - "learning_rate": 0.00011522437757319888, - "loss": 3.0525, - "step": 6493 - }, - { - "epoch": 2.0, - "learning_rate": 0.00011518874851396165, - "loss": 2.9947, - "step": 6494 - }, - { - "epoch": 2.0, - "learning_rate": 0.00011515311945472443, - "loss": 2.9529, - "step": 6495 - }, - { - "epoch": 2.0, - "learning_rate": 0.00011511749039548719, - "loss": 3.1256, - "step": 6496 - }, - { - "epoch": 2.01, - "learning_rate": 0.00011508186133624996, - "loss": 3.0644, - "step": 6497 - }, - { - "epoch": 2.01, - "learning_rate": 0.00011504623227701274, - "loss": 3.0221, - "step": 6498 - }, - { - "epoch": 2.01, - "learning_rate": 0.0001150106032177755, - "loss": 3.0206, - "step": 6499 - }, - { - "epoch": 2.01, - "learning_rate": 0.00011497497415853826, - "loss": 2.9148, - "step": 6500 - }, - { - "epoch": 2.01, - "eval_bleu": 3.042772821821628e-14, - "eval_loss": 4.028158187866211, - "eval_runtime": 2574.1053, - "eval_samples_per_second": 5.734, - "eval_steps_per_second": 0.717, - "step": 6500 - }, - { - "epoch": 2.01, - "learning_rate": 0.00011493934509930104, - "loss": 2.8993, - "step": 6501 - }, - { - "epoch": 2.01, - "learning_rate": 0.00011490371604006381, - "loss": 3.095, - "step": 6502 - }, - { - "epoch": 2.01, - "learning_rate": 0.00011486808698082659, - "loss": 2.9025, - "step": 6503 - }, - { - "epoch": 2.01, - "learning_rate": 0.00011483245792158935, - "loss": 2.8447, - "step": 6504 - }, - { - "epoch": 2.01, - "learning_rate": 0.00011479682886235213, - "loss": 2.7538, - "step": 6505 - }, - { - "epoch": 2.01, - "learning_rate": 0.0001147611998031149, - "loss": 2.903, - "step": 6506 - }, - { - "epoch": 2.01, - "learning_rate": 0.00011472557074387768, - "loss": 2.5932, - "step": 6507 - }, - { - "epoch": 2.01, - "learning_rate": 0.00011468994168464045, - "loss": 2.5023, - "step": 6508 - }, - { - "epoch": 2.01, - "learning_rate": 0.0001146543126254032, - "loss": 2.524, - "step": 6509 - }, - { - "epoch": 2.01, - "learning_rate": 0.00011461868356616598, - "loss": 
2.6866, - "step": 6510 - }, - { - "epoch": 2.01, - "learning_rate": 0.00011458305450692875, - "loss": 2.3969, - "step": 6511 - }, - { - "epoch": 2.01, - "learning_rate": 0.00011454742544769153, - "loss": 2.386, - "step": 6512 - }, - { - "epoch": 2.01, - "learning_rate": 0.00011451179638845429, - "loss": 2.5887, - "step": 6513 - }, - { - "epoch": 2.01, - "learning_rate": 0.00011447616732921707, - "loss": 2.1673, - "step": 6514 - }, - { - "epoch": 2.01, - "learning_rate": 0.00011444053826997984, - "loss": 2.2874, - "step": 6515 - }, - { - "epoch": 2.01, - "learning_rate": 0.00011440490921074262, - "loss": 2.2893, - "step": 6516 - }, - { - "epoch": 2.01, - "learning_rate": 0.00011436928015150538, - "loss": 2.3741, - "step": 6517 - }, - { - "epoch": 2.01, - "learning_rate": 0.00011433365109226814, - "loss": 2.1559, - "step": 6518 - }, - { - "epoch": 2.01, - "learning_rate": 0.00011429802203303093, - "loss": 2.0367, - "step": 6519 - }, - { - "epoch": 2.01, - "learning_rate": 0.0001142623929737937, - "loss": 2.0505, - "step": 6520 - }, - { - "epoch": 2.01, - "learning_rate": 0.00011422676391455645, - "loss": 1.7658, - "step": 6521 - }, - { - "epoch": 2.01, - "learning_rate": 0.00011419113485531923, - "loss": 1.963, - "step": 6522 - }, - { - "epoch": 2.01, - "learning_rate": 0.000114155505796082, - "loss": 2.0053, - "step": 6523 - }, - { - "epoch": 2.01, - "learning_rate": 0.00011411987673684478, - "loss": 1.7896, - "step": 6524 - }, - { - "epoch": 2.01, - "learning_rate": 0.00011408424767760755, - "loss": 1.8566, - "step": 6525 - }, - { - "epoch": 2.01, - "learning_rate": 0.00011404861861837032, - "loss": 1.6627, - "step": 6526 - }, - { - "epoch": 2.01, - "learning_rate": 0.00011401298955913309, - "loss": 1.5814, - "step": 6527 - }, - { - "epoch": 2.01, - "learning_rate": 0.00011397736049989587, - "loss": 1.5535, - "step": 6528 - }, - { - "epoch": 2.02, - "learning_rate": 0.00011394173144065864, - "loss": 1.5668, - "step": 6529 - }, - { - "epoch": 2.02, - "learning_rate": 0.00011390610238142139, - "loss": 1.5186, - "step": 6530 - }, - { - "epoch": 2.02, - "learning_rate": 0.00011387047332218417, - "loss": 4.0115, - "step": 6531 - }, - { - "epoch": 2.02, - "learning_rate": 0.00011383484426294694, - "loss": 3.9812, - "step": 6532 - }, - { - "epoch": 2.02, - "learning_rate": 0.00011379921520370972, - "loss": 3.6604, - "step": 6533 - }, - { - "epoch": 2.02, - "learning_rate": 0.00011376358614447248, - "loss": 3.5509, - "step": 6534 - }, - { - "epoch": 2.02, - "learning_rate": 0.00011372795708523525, - "loss": 3.4303, - "step": 6535 - }, - { - "epoch": 2.02, - "learning_rate": 0.00011369232802599803, - "loss": 3.4668, - "step": 6536 - }, - { - "epoch": 2.02, - "learning_rate": 0.0001136566989667608, - "loss": 3.3841, - "step": 6537 - }, - { - "epoch": 2.02, - "learning_rate": 0.00011362106990752357, - "loss": 3.0714, - "step": 6538 - }, - { - "epoch": 2.02, - "learning_rate": 0.00011358544084828633, - "loss": 3.2686, - "step": 6539 - }, - { - "epoch": 2.02, - "learning_rate": 0.00011354981178904912, - "loss": 3.3071, - "step": 6540 - }, - { - "epoch": 2.02, - "learning_rate": 0.00011351418272981188, - "loss": 3.1565, - "step": 6541 - }, - { - "epoch": 2.02, - "learning_rate": 0.00011347855367057467, - "loss": 3.0182, - "step": 6542 - }, - { - "epoch": 2.02, - "learning_rate": 0.00011344292461133742, - "loss": 3.2453, - "step": 6543 - }, - { - "epoch": 2.02, - "learning_rate": 0.00011340729555210019, - "loss": 2.9619, - "step": 6544 - }, - { - "epoch": 2.02, - "learning_rate": 0.00011337166649286297, 
- "loss": 3.0996, - "step": 6545 - }, - { - "epoch": 2.02, - "learning_rate": 0.00011333603743362574, - "loss": 2.9627, - "step": 6546 - }, - { - "epoch": 2.02, - "learning_rate": 0.0001133004083743885, - "loss": 2.9814, - "step": 6547 - }, - { - "epoch": 2.02, - "learning_rate": 0.00011326477931515128, - "loss": 3.0741, - "step": 6548 - }, - { - "epoch": 2.02, - "learning_rate": 0.00011322915025591404, - "loss": 2.9039, - "step": 6549 - }, - { - "epoch": 2.02, - "learning_rate": 0.00011319352119667683, - "loss": 2.9234, - "step": 6550 - }, - { - "epoch": 2.02, - "learning_rate": 0.00011315789213743958, - "loss": 2.7293, - "step": 6551 - }, - { - "epoch": 2.02, - "learning_rate": 0.00011312226307820236, - "loss": 2.8402, - "step": 6552 - }, - { - "epoch": 2.02, - "learning_rate": 0.00011308663401896513, - "loss": 2.7189, - "step": 6553 - }, - { - "epoch": 2.02, - "learning_rate": 0.00011305100495972791, - "loss": 2.723, - "step": 6554 - }, - { - "epoch": 2.02, - "learning_rate": 0.00011301537590049067, - "loss": 2.468, - "step": 6555 - }, - { - "epoch": 2.02, - "learning_rate": 0.00011297974684125344, - "loss": 2.6912, - "step": 6556 - }, - { - "epoch": 2.02, - "learning_rate": 0.00011294411778201622, - "loss": 2.645, - "step": 6557 - }, - { - "epoch": 2.02, - "learning_rate": 0.00011290848872277899, - "loss": 2.475, - "step": 6558 - }, - { - "epoch": 2.02, - "learning_rate": 0.00011287285966354177, - "loss": 2.5526, - "step": 6559 - }, - { - "epoch": 2.02, - "learning_rate": 0.00011283723060430452, - "loss": 2.6966, - "step": 6560 - }, - { - "epoch": 2.02, - "learning_rate": 0.0001128016015450673, - "loss": 2.4613, - "step": 6561 - }, - { - "epoch": 2.03, - "learning_rate": 0.00011276597248583007, - "loss": 2.5327, - "step": 6562 - }, - { - "epoch": 2.03, - "learning_rate": 0.00011273034342659285, - "loss": 2.4223, - "step": 6563 - }, - { - "epoch": 2.03, - "learning_rate": 0.00011269471436735561, - "loss": 2.2428, - "step": 6564 - }, - { - "epoch": 2.03, - "learning_rate": 0.00011265908530811838, - "loss": 2.5619, - "step": 6565 - }, - { - "epoch": 2.03, - "learning_rate": 0.00011262345624888116, - "loss": 2.1988, - "step": 6566 - }, - { - "epoch": 2.03, - "learning_rate": 0.00011258782718964393, - "loss": 2.1883, - "step": 6567 - }, - { - "epoch": 2.03, - "learning_rate": 0.00011255219813040668, - "loss": 2.0917, - "step": 6568 - }, - { - "epoch": 2.03, - "learning_rate": 0.00011251656907116946, - "loss": 2.0602, - "step": 6569 - }, - { - "epoch": 2.03, - "learning_rate": 0.00011248094001193223, - "loss": 2.2042, - "step": 6570 - }, - { - "epoch": 2.03, - "learning_rate": 0.00011244531095269501, - "loss": 1.9537, - "step": 6571 - }, - { - "epoch": 2.03, - "learning_rate": 0.00011240968189345778, - "loss": 1.8875, - "step": 6572 - }, - { - "epoch": 2.03, - "learning_rate": 0.00011237405283422055, - "loss": 1.9476, - "step": 6573 - }, - { - "epoch": 2.03, - "learning_rate": 0.00011233842377498332, - "loss": 1.7358, - "step": 6574 - }, - { - "epoch": 2.03, - "learning_rate": 0.0001123027947157461, - "loss": 1.8412, - "step": 6575 - }, - { - "epoch": 2.03, - "learning_rate": 0.00011226716565650887, - "loss": 1.7834, - "step": 6576 - }, - { - "epoch": 2.03, - "learning_rate": 0.00011223153659727162, - "loss": 1.7059, - "step": 6577 - }, - { - "epoch": 2.03, - "learning_rate": 0.0001121959075380344, - "loss": 1.6407, - "step": 6578 - }, - { - "epoch": 2.03, - "learning_rate": 0.00011216027847879717, - "loss": 1.3587, - "step": 6579 - }, - { - "epoch": 2.03, - "learning_rate": 
0.00011212464941955996, - "loss": 1.4483, - "step": 6580 - }, - { - "epoch": 2.03, - "learning_rate": 0.00011208902036032271, - "loss": 4.1891, - "step": 6581 - }, - { - "epoch": 2.03, - "learning_rate": 0.00011205339130108548, - "loss": 3.6822, - "step": 6582 - }, - { - "epoch": 2.03, - "learning_rate": 0.00011201776224184826, - "loss": 3.8089, - "step": 6583 - }, - { - "epoch": 2.03, - "learning_rate": 0.00011198213318261103, - "loss": 3.3374, - "step": 6584 - }, - { - "epoch": 2.03, - "learning_rate": 0.0001119465041233738, - "loss": 3.2984, - "step": 6585 - }, - { - "epoch": 2.03, - "learning_rate": 0.00011191087506413657, - "loss": 3.1327, - "step": 6586 - }, - { - "epoch": 2.03, - "learning_rate": 0.00011187524600489935, - "loss": 3.3811, - "step": 6587 - }, - { - "epoch": 2.03, - "learning_rate": 0.00011183961694566212, - "loss": 3.2754, - "step": 6588 - }, - { - "epoch": 2.03, - "learning_rate": 0.0001118039878864249, - "loss": 3.4036, - "step": 6589 - }, - { - "epoch": 2.03, - "learning_rate": 0.00011176835882718765, - "loss": 3.2466, - "step": 6590 - }, - { - "epoch": 2.03, - "learning_rate": 0.00011173272976795042, - "loss": 3.0652, - "step": 6591 - }, - { - "epoch": 2.03, - "learning_rate": 0.0001116971007087132, - "loss": 2.925, - "step": 6592 - }, - { - "epoch": 2.03, - "learning_rate": 0.00011166147164947597, - "loss": 2.8919, - "step": 6593 - }, - { - "epoch": 2.04, - "learning_rate": 0.00011162584259023873, - "loss": 3.1966, - "step": 6594 - }, - { - "epoch": 2.04, - "learning_rate": 0.00011159021353100151, - "loss": 3.0383, - "step": 6595 - }, - { - "epoch": 2.04, - "learning_rate": 0.00011155458447176429, - "loss": 3.1049, - "step": 6596 - }, - { - "epoch": 2.04, - "learning_rate": 0.00011151895541252706, - "loss": 2.8841, - "step": 6597 - }, - { - "epoch": 2.04, - "learning_rate": 0.00011148332635328981, - "loss": 3.1001, - "step": 6598 - }, - { - "epoch": 2.04, - "learning_rate": 0.0001114476972940526, - "loss": 2.8866, - "step": 6599 - }, - { - "epoch": 2.04, - "learning_rate": 0.00011141206823481536, - "loss": 3.0198, - "step": 6600 - }, - { - "epoch": 2.04, - "learning_rate": 0.00011137643917557815, - "loss": 2.833, - "step": 6601 - }, - { - "epoch": 2.04, - "learning_rate": 0.0001113408101163409, - "loss": 2.7192, - "step": 6602 - }, - { - "epoch": 2.04, - "learning_rate": 0.00011130518105710367, - "loss": 2.8455, - "step": 6603 - }, - { - "epoch": 2.04, - "learning_rate": 0.00011126955199786645, - "loss": 2.5502, - "step": 6604 - }, - { - "epoch": 2.04, - "learning_rate": 0.00011123392293862922, - "loss": 2.9362, - "step": 6605 - }, - { - "epoch": 2.04, - "learning_rate": 0.000111198293879392, - "loss": 2.7703, - "step": 6606 - }, - { - "epoch": 2.04, - "learning_rate": 0.00011116266482015476, - "loss": 2.7916, - "step": 6607 - }, - { - "epoch": 2.04, - "learning_rate": 0.00011112703576091754, - "loss": 2.6947, - "step": 6608 - }, - { - "epoch": 2.04, - "learning_rate": 0.0001110914067016803, - "loss": 2.6207, - "step": 6609 - }, - { - "epoch": 2.04, - "learning_rate": 0.00011105577764244309, - "loss": 2.5449, - "step": 6610 - }, - { - "epoch": 2.04, - "learning_rate": 0.00011102014858320584, - "loss": 2.3841, - "step": 6611 - }, - { - "epoch": 2.04, - "learning_rate": 0.00011098451952396861, - "loss": 2.5461, - "step": 6612 - }, - { - "epoch": 2.04, - "learning_rate": 0.00011094889046473139, - "loss": 2.5012, - "step": 6613 - }, - { - "epoch": 2.04, - "learning_rate": 0.00011091326140549416, - "loss": 2.2376, - "step": 6614 - }, - { - "epoch": 2.04, - 
"learning_rate": 0.00011087763234625692, - "loss": 2.3564, - "step": 6615 - }, - { - "epoch": 2.04, - "learning_rate": 0.0001108420032870197, - "loss": 2.3199, - "step": 6616 - }, - { - "epoch": 2.04, - "learning_rate": 0.00011080637422778247, - "loss": 2.1498, - "step": 6617 - }, - { - "epoch": 2.04, - "learning_rate": 0.00011077074516854525, - "loss": 2.2694, - "step": 6618 - }, - { - "epoch": 2.04, - "learning_rate": 0.000110735116109308, - "loss": 2.1036, - "step": 6619 - }, - { - "epoch": 2.04, - "learning_rate": 0.00011069948705007078, - "loss": 1.9732, - "step": 6620 - }, - { - "epoch": 2.04, - "learning_rate": 0.00011066385799083355, - "loss": 2.1049, - "step": 6621 - }, - { - "epoch": 2.04, - "learning_rate": 0.00011062822893159633, - "loss": 1.8685, - "step": 6622 - }, - { - "epoch": 2.04, - "learning_rate": 0.0001105925998723591, - "loss": 1.585, - "step": 6623 - }, - { - "epoch": 2.04, - "learning_rate": 0.00011055697081312186, - "loss": 1.9859, - "step": 6624 - }, - { - "epoch": 2.04, - "learning_rate": 0.00011052134175388464, - "loss": 1.7531, - "step": 6625 - }, - { - "epoch": 2.05, - "learning_rate": 0.00011048571269464741, - "loss": 1.5564, - "step": 6626 - }, - { - "epoch": 2.05, - "learning_rate": 0.00011045008363541019, - "loss": 1.5823, - "step": 6627 - }, - { - "epoch": 2.05, - "learning_rate": 0.00011041445457617294, - "loss": 1.4735, - "step": 6628 - }, - { - "epoch": 2.05, - "learning_rate": 0.00011037882551693571, - "loss": 1.5498, - "step": 6629 - }, - { - "epoch": 2.05, - "learning_rate": 0.0001103431964576985, - "loss": 1.4144, - "step": 6630 - }, - { - "epoch": 2.05, - "learning_rate": 0.00011030756739846126, - "loss": 3.9211, - "step": 6631 - }, - { - "epoch": 2.05, - "learning_rate": 0.00011027193833922403, - "loss": 4.0421, - "step": 6632 - }, - { - "epoch": 2.05, - "learning_rate": 0.0001102363092799868, - "loss": 3.4149, - "step": 6633 - }, - { - "epoch": 2.05, - "learning_rate": 0.00011020068022074958, - "loss": 3.46, - "step": 6634 - }, - { - "epoch": 2.05, - "learning_rate": 0.00011016505116151235, - "loss": 3.4306, - "step": 6635 - }, - { - "epoch": 2.05, - "learning_rate": 0.0001101294221022751, - "loss": 3.3964, - "step": 6636 - }, - { - "epoch": 2.05, - "learning_rate": 0.00011009379304303789, - "loss": 3.133, - "step": 6637 - }, - { - "epoch": 2.05, - "learning_rate": 0.00011005816398380066, - "loss": 3.4635, - "step": 6638 - }, - { - "epoch": 2.05, - "learning_rate": 0.00011002253492456344, - "loss": 3.2331, - "step": 6639 - }, - { - "epoch": 2.05, - "learning_rate": 0.0001099869058653262, - "loss": 3.042, - "step": 6640 - }, - { - "epoch": 2.05, - "learning_rate": 0.00010995127680608897, - "loss": 2.8603, - "step": 6641 - }, - { - "epoch": 2.05, - "learning_rate": 0.00010991564774685174, - "loss": 3.3571, - "step": 6642 - }, - { - "epoch": 2.05, - "learning_rate": 0.00010988001868761452, - "loss": 3.0635, - "step": 6643 - }, - { - "epoch": 2.05, - "learning_rate": 0.00010984438962837729, - "loss": 2.9387, - "step": 6644 - }, - { - "epoch": 2.05, - "learning_rate": 0.00010980876056914005, - "loss": 3.0088, - "step": 6645 - }, - { - "epoch": 2.05, - "learning_rate": 0.00010977313150990283, - "loss": 3.2358, - "step": 6646 - }, - { - "epoch": 2.05, - "learning_rate": 0.0001097375024506656, - "loss": 2.7372, - "step": 6647 - }, - { - "epoch": 2.05, - "learning_rate": 0.00010970187339142838, - "loss": 2.8706, - "step": 6648 - }, - { - "epoch": 2.05, - "learning_rate": 0.00010966624433219113, - "loss": 2.9393, - "step": 6649 - }, - { - "epoch": 2.05, 
- "learning_rate": 0.0001096306152729539, - "loss": 2.9428, - "step": 6650 - }, - { - "epoch": 2.05, - "learning_rate": 0.00010959498621371668, - "loss": 2.706, - "step": 6651 - }, - { - "epoch": 2.05, - "learning_rate": 0.00010955935715447945, - "loss": 2.7872, - "step": 6652 - }, - { - "epoch": 2.05, - "learning_rate": 0.00010952372809524222, - "loss": 2.7953, - "step": 6653 - }, - { - "epoch": 2.05, - "learning_rate": 0.00010948809903600499, - "loss": 2.7283, - "step": 6654 - }, - { - "epoch": 2.05, - "learning_rate": 0.00010945246997676777, - "loss": 2.9031, - "step": 6655 - }, - { - "epoch": 2.05, - "learning_rate": 0.00010941684091753054, - "loss": 2.3934, - "step": 6656 - }, - { - "epoch": 2.05, - "learning_rate": 0.00010938121185829332, - "loss": 2.6522, - "step": 6657 - }, - { - "epoch": 2.05, - "learning_rate": 0.00010934558279905608, - "loss": 2.4981, - "step": 6658 - }, - { - "epoch": 2.06, - "learning_rate": 0.00010930995373981884, - "loss": 2.6997, - "step": 6659 - }, - { - "epoch": 2.06, - "learning_rate": 0.00010927432468058163, - "loss": 2.5212, - "step": 6660 - }, - { - "epoch": 2.06, - "learning_rate": 0.0001092386956213444, - "loss": 2.5728, - "step": 6661 - }, - { - "epoch": 2.06, - "learning_rate": 0.00010920306656210715, - "loss": 2.2189, - "step": 6662 - }, - { - "epoch": 2.06, - "learning_rate": 0.00010916743750286993, - "loss": 2.3778, - "step": 6663 - }, - { - "epoch": 2.06, - "learning_rate": 0.0001091318084436327, - "loss": 2.2217, - "step": 6664 - }, - { - "epoch": 2.06, - "learning_rate": 0.00010909617938439548, - "loss": 2.3179, - "step": 6665 - }, - { - "epoch": 2.06, - "learning_rate": 0.00010906055032515824, - "loss": 2.2392, - "step": 6666 - }, - { - "epoch": 2.06, - "learning_rate": 0.00010902492126592102, - "loss": 2.0508, - "step": 6667 - }, - { - "epoch": 2.06, - "learning_rate": 0.00010898929220668379, - "loss": 2.1421, - "step": 6668 - }, - { - "epoch": 2.06, - "learning_rate": 0.00010895366314744657, - "loss": 1.9896, - "step": 6669 - }, - { - "epoch": 2.06, - "learning_rate": 0.00010891803408820932, - "loss": 2.2937, - "step": 6670 - }, - { - "epoch": 2.06, - "learning_rate": 0.00010888240502897209, - "loss": 2.0295, - "step": 6671 - }, - { - "epoch": 2.06, - "learning_rate": 0.00010884677596973487, - "loss": 1.9067, - "step": 6672 - }, - { - "epoch": 2.06, - "learning_rate": 0.00010881114691049764, - "loss": 1.6967, - "step": 6673 - }, - { - "epoch": 2.06, - "learning_rate": 0.00010877551785126042, - "loss": 1.7408, - "step": 6674 - }, - { - "epoch": 2.06, - "learning_rate": 0.00010873988879202318, - "loss": 1.7335, - "step": 6675 - }, - { - "epoch": 2.06, - "learning_rate": 0.00010870425973278595, - "loss": 1.4996, - "step": 6676 - }, - { - "epoch": 2.06, - "learning_rate": 0.00010866863067354873, - "loss": 1.693, - "step": 6677 - }, - { - "epoch": 2.06, - "learning_rate": 0.0001086330016143115, - "loss": 1.4793, - "step": 6678 - }, - { - "epoch": 2.06, - "learning_rate": 0.00010859737255507426, - "loss": 1.5471, - "step": 6679 - }, - { - "epoch": 2.06, - "learning_rate": 0.00010856174349583703, - "loss": 1.4735, - "step": 6680 - }, - { - "epoch": 2.06, - "learning_rate": 0.00010852611443659981, - "loss": 4.2272, - "step": 6681 - }, - { - "epoch": 2.06, - "learning_rate": 0.00010849048537736258, - "loss": 3.7386, - "step": 6682 - }, - { - "epoch": 2.06, - "learning_rate": 0.00010845485631812534, - "loss": 3.7507, - "step": 6683 - }, - { - "epoch": 2.06, - "learning_rate": 0.00010841922725888812, - "loss": 3.4383, - "step": 6684 - }, - { - 
"epoch": 2.06, - "learning_rate": 0.00010838359819965089, - "loss": 3.4366, - "step": 6685 - }, - { - "epoch": 2.06, - "learning_rate": 0.00010834796914041367, - "loss": 3.2472, - "step": 6686 - }, - { - "epoch": 2.06, - "learning_rate": 0.00010831234008117644, - "loss": 3.3389, - "step": 6687 - }, - { - "epoch": 2.06, - "learning_rate": 0.0001082767110219392, - "loss": 3.3233, - "step": 6688 - }, - { - "epoch": 2.06, - "learning_rate": 0.00010824108196270198, - "loss": 3.594, - "step": 6689 - }, - { - "epoch": 2.06, - "learning_rate": 0.00010820545290346476, - "loss": 3.1586, - "step": 6690 - }, - { - "epoch": 2.07, - "learning_rate": 0.00010816982384422753, - "loss": 3.1905, - "step": 6691 - }, - { - "epoch": 2.07, - "learning_rate": 0.00010813419478499028, - "loss": 3.1702, - "step": 6692 - }, - { - "epoch": 2.07, - "learning_rate": 0.00010809856572575306, - "loss": 3.0376, - "step": 6693 - }, - { - "epoch": 2.07, - "learning_rate": 0.00010806293666651583, - "loss": 3.3847, - "step": 6694 - }, - { - "epoch": 2.07, - "learning_rate": 0.00010802730760727861, - "loss": 3.0483, - "step": 6695 - }, - { - "epoch": 2.07, - "learning_rate": 0.00010799167854804137, - "loss": 3.0372, - "step": 6696 - }, - { - "epoch": 2.07, - "learning_rate": 0.00010795604948880414, - "loss": 2.9742, - "step": 6697 - }, - { - "epoch": 2.07, - "learning_rate": 0.00010792042042956692, - "loss": 2.8438, - "step": 6698 - }, - { - "epoch": 2.07, - "learning_rate": 0.00010788479137032969, - "loss": 2.839, - "step": 6699 - }, - { - "epoch": 2.07, - "learning_rate": 0.00010784916231109245, - "loss": 2.82, - "step": 6700 - }, - { - "epoch": 2.07, - "learning_rate": 0.00010781353325185522, - "loss": 2.8822, - "step": 6701 - }, - { - "epoch": 2.07, - "learning_rate": 0.000107777904192618, - "loss": 2.9541, - "step": 6702 - }, - { - "epoch": 2.07, - "learning_rate": 0.00010774227513338077, - "loss": 2.8304, - "step": 6703 - }, - { - "epoch": 2.07, - "learning_rate": 0.00010770664607414355, - "loss": 2.699, - "step": 6704 - }, - { - "epoch": 2.07, - "learning_rate": 0.00010767101701490631, - "loss": 2.6638, - "step": 6705 - }, - { - "epoch": 2.07, - "learning_rate": 0.00010763538795566908, - "loss": 2.8597, - "step": 6706 - }, - { - "epoch": 2.07, - "learning_rate": 0.00010759975889643186, - "loss": 2.6487, - "step": 6707 - }, - { - "epoch": 2.07, - "learning_rate": 0.00010756412983719463, - "loss": 2.6656, - "step": 6708 - }, - { - "epoch": 2.07, - "learning_rate": 0.00010752850077795738, - "loss": 2.4464, - "step": 6709 - }, - { - "epoch": 2.07, - "learning_rate": 0.00010749287171872016, - "loss": 2.5298, - "step": 6710 - }, - { - "epoch": 2.07, - "learning_rate": 0.00010745724265948293, - "loss": 2.2329, - "step": 6711 - }, - { - "epoch": 2.07, - "learning_rate": 0.00010742161360024571, - "loss": 2.6153, - "step": 6712 - }, - { - "epoch": 2.07, - "learning_rate": 0.00010738598454100847, - "loss": 2.3679, - "step": 6713 - }, - { - "epoch": 2.07, - "learning_rate": 0.00010735035548177125, - "loss": 2.3106, - "step": 6714 - }, - { - "epoch": 2.07, - "learning_rate": 0.00010731472642253402, - "loss": 2.3148, - "step": 6715 - }, - { - "epoch": 2.07, - "learning_rate": 0.0001072790973632968, - "loss": 2.2709, - "step": 6716 - }, - { - "epoch": 2.07, - "learning_rate": 0.00010724346830405956, - "loss": 2.0408, - "step": 6717 - }, - { - "epoch": 2.07, - "learning_rate": 0.00010720783924482232, - "loss": 2.1912, - "step": 6718 - }, - { - "epoch": 2.07, - "learning_rate": 0.0001071722101855851, - "loss": 2.255, - "step": 6719 - }, - 
{ - "epoch": 2.07, - "learning_rate": 0.00010713658112634787, - "loss": 2.0005, - "step": 6720 - }, - { - "epoch": 2.07, - "learning_rate": 0.00010710095206711066, - "loss": 2.1383, - "step": 6721 - }, - { - "epoch": 2.07, - "learning_rate": 0.00010706532300787341, - "loss": 1.9746, - "step": 6722 - }, - { - "epoch": 2.07, - "learning_rate": 0.00010702969394863618, - "loss": 1.8708, - "step": 6723 - }, - { - "epoch": 2.08, - "learning_rate": 0.00010699406488939896, - "loss": 2.0502, - "step": 6724 - }, - { - "epoch": 2.08, - "learning_rate": 0.00010695843583016174, - "loss": 1.8066, - "step": 6725 - }, - { - "epoch": 2.08, - "learning_rate": 0.0001069228067709245, - "loss": 1.7719, - "step": 6726 - }, - { - "epoch": 2.08, - "learning_rate": 0.00010688717771168727, - "loss": 1.6049, - "step": 6727 - }, - { - "epoch": 2.08, - "learning_rate": 0.00010685154865245005, - "loss": 1.676, - "step": 6728 - }, - { - "epoch": 2.08, - "learning_rate": 0.00010681591959321282, - "loss": 1.653, - "step": 6729 - }, - { - "epoch": 2.08, - "learning_rate": 0.00010678029053397557, - "loss": 1.6481, - "step": 6730 - }, - { - "epoch": 2.08, - "learning_rate": 0.00010674466147473835, - "loss": 4.2176, - "step": 6731 - }, - { - "epoch": 2.08, - "learning_rate": 0.00010670903241550112, - "loss": 3.9395, - "step": 6732 - }, - { - "epoch": 2.08, - "learning_rate": 0.0001066734033562639, - "loss": 3.414, - "step": 6733 - }, - { - "epoch": 2.08, - "learning_rate": 0.00010663777429702666, - "loss": 3.3841, - "step": 6734 - }, - { - "epoch": 2.08, - "learning_rate": 0.00010660214523778944, - "loss": 3.3388, - "step": 6735 - }, - { - "epoch": 2.08, - "learning_rate": 0.00010656651617855221, - "loss": 3.1991, - "step": 6736 - }, - { - "epoch": 2.08, - "learning_rate": 0.00010653088711931499, - "loss": 3.4822, - "step": 6737 - }, - { - "epoch": 2.08, - "learning_rate": 0.00010649525806007776, - "loss": 3.2995, - "step": 6738 - }, - { - "epoch": 2.08, - "learning_rate": 0.00010645962900084051, - "loss": 3.0478, - "step": 6739 - }, - { - "epoch": 2.08, - "learning_rate": 0.0001064239999416033, - "loss": 3.131, - "step": 6740 - }, - { - "epoch": 2.08, - "learning_rate": 0.00010638837088236606, - "loss": 3.1576, - "step": 6741 - }, - { - "epoch": 2.08, - "learning_rate": 0.00010635274182312885, - "loss": 2.9552, - "step": 6742 - }, - { - "epoch": 2.08, - "learning_rate": 0.0001063171127638916, - "loss": 2.9475, - "step": 6743 - }, - { - "epoch": 2.08, - "learning_rate": 0.00010628148370465437, - "loss": 3.049, - "step": 6744 - }, - { - "epoch": 2.08, - "learning_rate": 0.00010624585464541715, - "loss": 3.119, - "step": 6745 - }, - { - "epoch": 2.08, - "learning_rate": 0.00010621022558617992, - "loss": 3.0643, - "step": 6746 - }, - { - "epoch": 2.08, - "learning_rate": 0.00010617459652694269, - "loss": 3.1145, - "step": 6747 - }, - { - "epoch": 2.08, - "learning_rate": 0.00010613896746770546, - "loss": 3.0809, - "step": 6748 - }, - { - "epoch": 2.08, - "learning_rate": 0.00010610333840846824, - "loss": 3.2176, - "step": 6749 - }, - { - "epoch": 2.08, - "learning_rate": 0.000106067709349231, - "loss": 2.9402, - "step": 6750 - }, - { - "epoch": 2.08, - "learning_rate": 0.00010603208028999376, - "loss": 2.877, - "step": 6751 - }, - { - "epoch": 2.08, - "learning_rate": 0.00010599645123075654, - "loss": 2.6399, - "step": 6752 - }, - { - "epoch": 2.08, - "learning_rate": 0.00010596082217151931, - "loss": 2.6628, - "step": 6753 - }, - { - "epoch": 2.08, - "learning_rate": 0.00010592519311228209, - "loss": 2.9589, - "step": 6754 - }, 
- { - "epoch": 2.08, - "learning_rate": 0.00010588956405304486, - "loss": 2.4684, - "step": 6755 - }, - { - "epoch": 2.09, - "learning_rate": 0.00010585393499380762, - "loss": 2.5053, - "step": 6756 - }, - { - "epoch": 2.09, - "learning_rate": 0.0001058183059345704, - "loss": 2.7898, - "step": 6757 - }, - { - "epoch": 2.09, - "learning_rate": 0.00010578267687533317, - "loss": 2.6135, - "step": 6758 - }, - { - "epoch": 2.09, - "learning_rate": 0.00010574704781609595, - "loss": 2.3924, - "step": 6759 - }, - { - "epoch": 2.09, - "learning_rate": 0.0001057114187568587, - "loss": 2.5524, - "step": 6760 - }, - { - "epoch": 2.09, - "learning_rate": 0.00010567578969762148, - "loss": 2.3589, - "step": 6761 - }, - { - "epoch": 2.09, - "learning_rate": 0.00010564016063838425, - "loss": 2.5457, - "step": 6762 - }, - { - "epoch": 2.09, - "learning_rate": 0.00010560453157914703, - "loss": 2.3449, - "step": 6763 - }, - { - "epoch": 2.09, - "learning_rate": 0.00010556890251990979, - "loss": 2.2094, - "step": 6764 - }, - { - "epoch": 2.09, - "learning_rate": 0.00010553327346067256, - "loss": 2.3364, - "step": 6765 - }, - { - "epoch": 2.09, - "learning_rate": 0.00010549764440143534, - "loss": 2.1815, - "step": 6766 - }, - { - "epoch": 2.09, - "learning_rate": 0.00010546201534219811, - "loss": 2.1512, - "step": 6767 - }, - { - "epoch": 2.09, - "learning_rate": 0.00010542638628296086, - "loss": 2.0824, - "step": 6768 - }, - { - "epoch": 2.09, - "learning_rate": 0.00010539075722372364, - "loss": 1.8254, - "step": 6769 - }, - { - "epoch": 2.09, - "learning_rate": 0.00010535512816448643, - "loss": 1.986, - "step": 6770 - }, - { - "epoch": 2.09, - "learning_rate": 0.0001053194991052492, - "loss": 1.833, - "step": 6771 - }, - { - "epoch": 2.09, - "learning_rate": 0.00010528387004601198, - "loss": 1.8148, - "step": 6772 - }, - { - "epoch": 2.09, - "learning_rate": 0.00010524824098677473, - "loss": 1.813, - "step": 6773 - }, - { - "epoch": 2.09, - "learning_rate": 0.0001052126119275375, - "loss": 1.7249, - "step": 6774 - }, - { - "epoch": 2.09, - "learning_rate": 0.00010517698286830028, - "loss": 1.8822, - "step": 6775 - }, - { - "epoch": 2.09, - "learning_rate": 0.00010514135380906305, - "loss": 1.7373, - "step": 6776 - }, - { - "epoch": 2.09, - "learning_rate": 0.0001051057247498258, - "loss": 1.5375, - "step": 6777 - }, - { - "epoch": 2.09, - "learning_rate": 0.00010507009569058859, - "loss": 1.5699, - "step": 6778 - }, - { - "epoch": 2.09, - "learning_rate": 0.00010503446663135135, - "loss": 1.5355, - "step": 6779 - }, - { - "epoch": 2.09, - "learning_rate": 0.00010499883757211414, - "loss": 1.601, - "step": 6780 - }, - { - "epoch": 2.09, - "learning_rate": 0.00010496320851287689, - "loss": 4.027, - "step": 6781 - }, - { - "epoch": 2.09, - "learning_rate": 0.00010492757945363967, - "loss": 3.7195, - "step": 6782 - }, - { - "epoch": 2.09, - "learning_rate": 0.00010489195039440244, - "loss": 3.5488, - "step": 6783 - }, - { - "epoch": 2.09, - "learning_rate": 0.00010485632133516522, - "loss": 3.7329, - "step": 6784 - }, - { - "epoch": 2.09, - "learning_rate": 0.00010482069227592799, - "loss": 3.4978, - "step": 6785 - }, - { - "epoch": 2.09, - "learning_rate": 0.00010478506321669075, - "loss": 3.3975, - "step": 6786 - }, - { - "epoch": 2.09, - "learning_rate": 0.00010474943415745353, - "loss": 3.3864, - "step": 6787 - }, - { - "epoch": 2.1, - "learning_rate": 0.0001047138050982163, - "loss": 3.2962, - "step": 6788 - }, - { - "epoch": 2.1, - "learning_rate": 0.00010467817603897908, - "loss": 3.4337, - "step": 6789 - 
}, - { - "epoch": 2.1, - "learning_rate": 0.00010464254697974183, - "loss": 3.0863, - "step": 6790 - }, - { - "epoch": 2.1, - "learning_rate": 0.0001046069179205046, - "loss": 3.0575, - "step": 6791 - }, - { - "epoch": 2.1, - "learning_rate": 0.00010457128886126738, - "loss": 3.1208, - "step": 6792 - }, - { - "epoch": 2.1, - "learning_rate": 0.00010453565980203015, - "loss": 2.9817, - "step": 6793 - }, - { - "epoch": 2.1, - "learning_rate": 0.00010450003074279292, - "loss": 3.1266, - "step": 6794 - }, - { - "epoch": 2.1, - "learning_rate": 0.00010446440168355569, - "loss": 2.9504, - "step": 6795 - }, - { - "epoch": 2.1, - "learning_rate": 0.00010442877262431847, - "loss": 2.9294, - "step": 6796 - }, - { - "epoch": 2.1, - "learning_rate": 0.00010439314356508124, - "loss": 3.0287, - "step": 6797 - }, - { - "epoch": 2.1, - "learning_rate": 0.000104357514505844, - "loss": 3.0241, - "step": 6798 - }, - { - "epoch": 2.1, - "learning_rate": 0.00010432188544660678, - "loss": 2.9447, - "step": 6799 - }, - { - "epoch": 2.1, - "learning_rate": 0.00010428625638736954, - "loss": 2.8845, - "step": 6800 - }, - { - "epoch": 2.1, - "learning_rate": 0.00010425062732813233, - "loss": 2.8644, - "step": 6801 - }, - { - "epoch": 2.1, - "learning_rate": 0.0001042149982688951, - "loss": 2.526, - "step": 6802 - }, - { - "epoch": 2.1, - "learning_rate": 0.00010417936920965785, - "loss": 2.8357, - "step": 6803 - }, - { - "epoch": 2.1, - "learning_rate": 0.00010414374015042063, - "loss": 2.7722, - "step": 6804 - }, - { - "epoch": 2.1, - "learning_rate": 0.0001041081110911834, - "loss": 2.5179, - "step": 6805 - }, - { - "epoch": 2.1, - "learning_rate": 0.00010407248203194618, - "loss": 2.8025, - "step": 6806 - }, - { - "epoch": 2.1, - "learning_rate": 0.00010403685297270894, - "loss": 2.6135, - "step": 6807 - }, - { - "epoch": 2.1, - "learning_rate": 0.00010400122391347172, - "loss": 2.4911, - "step": 6808 - }, - { - "epoch": 2.1, - "learning_rate": 0.00010396559485423449, - "loss": 2.5968, - "step": 6809 - }, - { - "epoch": 2.1, - "learning_rate": 0.00010392996579499727, - "loss": 2.6267, - "step": 6810 - }, - { - "epoch": 2.1, - "learning_rate": 0.00010389433673576002, - "loss": 2.4699, - "step": 6811 - }, - { - "epoch": 2.1, - "learning_rate": 0.00010385870767652279, - "loss": 2.2511, - "step": 6812 - }, - { - "epoch": 2.1, - "learning_rate": 0.00010382307861728557, - "loss": 2.1994, - "step": 6813 - }, - { - "epoch": 2.1, - "learning_rate": 0.00010378744955804834, - "loss": 2.5834, - "step": 6814 - }, - { - "epoch": 2.1, - "learning_rate": 0.00010375182049881111, - "loss": 2.4132, - "step": 6815 - }, - { - "epoch": 2.1, - "learning_rate": 0.00010371619143957388, - "loss": 2.2885, - "step": 6816 - }, - { - "epoch": 2.1, - "learning_rate": 0.00010368056238033666, - "loss": 2.2937, - "step": 6817 - }, - { - "epoch": 2.1, - "learning_rate": 0.00010364493332109943, - "loss": 2.1916, - "step": 6818 - }, - { - "epoch": 2.1, - "learning_rate": 0.00010360930426186221, - "loss": 2.1889, - "step": 6819 - }, - { - "epoch": 2.1, - "learning_rate": 0.00010357367520262496, - "loss": 2.1748, - "step": 6820 - }, - { - "epoch": 2.11, - "learning_rate": 0.00010353804614338773, - "loss": 2.1198, - "step": 6821 - }, - { - "epoch": 2.11, - "learning_rate": 0.00010350241708415051, - "loss": 1.8255, - "step": 6822 - }, - { - "epoch": 2.11, - "learning_rate": 0.00010346678802491328, - "loss": 1.9655, - "step": 6823 - }, - { - "epoch": 2.11, - "learning_rate": 0.00010343115896567604, - "loss": 1.5755, - "step": 6824 - }, - { - "epoch": 
2.11, - "learning_rate": 0.00010339552990643882, - "loss": 1.7794, - "step": 6825 - }, - { - "epoch": 2.11, - "learning_rate": 0.00010335990084720159, - "loss": 1.5916, - "step": 6826 - }, - { - "epoch": 2.11, - "learning_rate": 0.00010332427178796437, - "loss": 1.9121, - "step": 6827 - }, - { - "epoch": 2.11, - "learning_rate": 0.00010328864272872712, - "loss": 1.5924, - "step": 6828 - }, - { - "epoch": 2.11, - "learning_rate": 0.0001032530136694899, - "loss": 1.5807, - "step": 6829 - }, - { - "epoch": 2.11, - "learning_rate": 0.00010321738461025267, - "loss": 1.509, - "step": 6830 - }, - { - "epoch": 2.11, - "learning_rate": 0.00010318175555101546, - "loss": 4.0181, - "step": 6831 - }, - { - "epoch": 2.11, - "learning_rate": 0.00010314612649177821, - "loss": 3.7851, - "step": 6832 - }, - { - "epoch": 2.11, - "learning_rate": 0.00010311049743254098, - "loss": 3.8218, - "step": 6833 - }, - { - "epoch": 2.11, - "learning_rate": 0.00010307486837330376, - "loss": 3.3311, - "step": 6834 - }, - { - "epoch": 2.11, - "learning_rate": 0.00010303923931406653, - "loss": 3.4001, - "step": 6835 - }, - { - "epoch": 2.11, - "learning_rate": 0.00010300361025482931, - "loss": 3.1248, - "step": 6836 - }, - { - "epoch": 2.11, - "learning_rate": 0.00010296798119559207, - "loss": 3.3355, - "step": 6837 - }, - { - "epoch": 2.11, - "learning_rate": 0.00010293235213635483, - "loss": 3.2488, - "step": 6838 - }, - { - "epoch": 2.11, - "learning_rate": 0.00010289672307711762, - "loss": 3.2512, - "step": 6839 - }, - { - "epoch": 2.11, - "learning_rate": 0.00010286109401788038, - "loss": 3.1948, - "step": 6840 - }, - { - "epoch": 2.11, - "learning_rate": 0.00010282546495864315, - "loss": 2.8782, - "step": 6841 - }, - { - "epoch": 2.11, - "learning_rate": 0.00010278983589940592, - "loss": 3.1984, - "step": 6842 - }, - { - "epoch": 2.11, - "learning_rate": 0.0001027542068401687, - "loss": 3.305, - "step": 6843 - }, - { - "epoch": 2.11, - "learning_rate": 0.00010271857778093147, - "loss": 3.1312, - "step": 6844 - }, - { - "epoch": 2.11, - "learning_rate": 0.00010268294872169423, - "loss": 3.2467, - "step": 6845 - }, - { - "epoch": 2.11, - "learning_rate": 0.00010264731966245701, - "loss": 2.9008, - "step": 6846 - }, - { - "epoch": 2.11, - "learning_rate": 0.00010261169060321978, - "loss": 3.0515, - "step": 6847 - }, - { - "epoch": 2.11, - "learning_rate": 0.00010257606154398256, - "loss": 2.7849, - "step": 6848 - }, - { - "epoch": 2.11, - "learning_rate": 0.00010254043248474531, - "loss": 3.0263, - "step": 6849 - }, - { - "epoch": 2.11, - "learning_rate": 0.00010250480342550808, - "loss": 2.7956, - "step": 6850 - }, - { - "epoch": 2.11, - "learning_rate": 0.00010246917436627086, - "loss": 2.8083, - "step": 6851 - }, - { - "epoch": 2.11, - "learning_rate": 0.00010243354530703363, - "loss": 2.7364, - "step": 6852 - }, - { - "epoch": 2.12, - "learning_rate": 0.00010239791624779641, - "loss": 2.6416, - "step": 6853 - }, - { - "epoch": 2.12, - "learning_rate": 0.00010236228718855917, - "loss": 2.7854, - "step": 6854 - }, - { - "epoch": 2.12, - "learning_rate": 0.00010232665812932195, - "loss": 2.5164, - "step": 6855 - }, - { - "epoch": 2.12, - "learning_rate": 0.00010229102907008472, - "loss": 2.9212, - "step": 6856 - }, - { - "epoch": 2.12, - "learning_rate": 0.0001022554000108475, - "loss": 2.5381, - "step": 6857 - }, - { - "epoch": 2.12, - "learning_rate": 0.00010221977095161026, - "loss": 2.591, - "step": 6858 - }, - { - "epoch": 2.12, - "learning_rate": 0.00010218414189237302, - "loss": 2.487, - "step": 6859 - }, - { - 
"epoch": 2.12, - "learning_rate": 0.0001021485128331358, - "loss": 2.3172, - "step": 6860 - }, - { - "epoch": 2.12, - "learning_rate": 0.00010211288377389857, - "loss": 2.2826, - "step": 6861 - }, - { - "epoch": 2.12, - "learning_rate": 0.00010207725471466134, - "loss": 2.2676, - "step": 6862 - }, - { - "epoch": 2.12, - "learning_rate": 0.00010204162565542411, - "loss": 2.4021, - "step": 6863 - }, - { - "epoch": 2.12, - "learning_rate": 0.00010200599659618689, - "loss": 2.5505, - "step": 6864 - }, - { - "epoch": 2.12, - "learning_rate": 0.00010197036753694966, - "loss": 2.4505, - "step": 6865 - }, - { - "epoch": 2.12, - "learning_rate": 0.00010193473847771242, - "loss": 2.3745, - "step": 6866 - }, - { - "epoch": 2.12, - "learning_rate": 0.0001018991094184752, - "loss": 2.3471, - "step": 6867 - }, - { - "epoch": 2.12, - "learning_rate": 0.00010186348035923797, - "loss": 2.234, - "step": 6868 - }, - { - "epoch": 2.12, - "learning_rate": 0.00010182785130000075, - "loss": 2.0486, - "step": 6869 - }, - { - "epoch": 2.12, - "learning_rate": 0.00010179222224076352, - "loss": 1.8601, - "step": 6870 - }, - { - "epoch": 2.12, - "learning_rate": 0.00010175659318152627, - "loss": 1.9034, - "step": 6871 - }, - { - "epoch": 2.12, - "learning_rate": 0.00010172096412228905, - "loss": 2.0267, - "step": 6872 - }, - { - "epoch": 2.12, - "learning_rate": 0.00010168533506305182, - "loss": 1.793, - "step": 6873 - }, - { - "epoch": 2.12, - "learning_rate": 0.0001016497060038146, - "loss": 1.5908, - "step": 6874 - }, - { - "epoch": 2.12, - "learning_rate": 0.00010161407694457736, - "loss": 1.6571, - "step": 6875 - }, - { - "epoch": 2.12, - "learning_rate": 0.00010157844788534014, - "loss": 1.7025, - "step": 6876 - }, - { - "epoch": 2.12, - "learning_rate": 0.00010154281882610291, - "loss": 1.6723, - "step": 6877 - }, - { - "epoch": 2.12, - "learning_rate": 0.00010150718976686569, - "loss": 1.4563, - "step": 6878 - }, - { - "epoch": 2.12, - "learning_rate": 0.00010147156070762844, - "loss": 1.4884, - "step": 6879 - }, - { - "epoch": 2.12, - "learning_rate": 0.00010143593164839121, - "loss": 1.4144, - "step": 6880 - }, - { - "epoch": 2.12, - "learning_rate": 0.000101400302589154, - "loss": 4.1636, - "step": 6881 - }, - { - "epoch": 2.12, - "learning_rate": 0.00010136467352991676, - "loss": 3.9834, - "step": 6882 - }, - { - "epoch": 2.12, - "learning_rate": 0.00010132904447067954, - "loss": 3.6091, - "step": 6883 - }, - { - "epoch": 2.12, - "learning_rate": 0.0001012934154114423, - "loss": 3.3629, - "step": 6884 - }, - { - "epoch": 2.12, - "learning_rate": 0.00010125778635220507, - "loss": 3.2727, - "step": 6885 - }, - { - "epoch": 2.13, - "learning_rate": 0.00010122215729296785, - "loss": 3.1479, - "step": 6886 - }, - { - "epoch": 2.13, - "learning_rate": 0.00010118652823373062, - "loss": 3.3427, - "step": 6887 - }, - { - "epoch": 2.13, - "learning_rate": 0.00010115089917449339, - "loss": 3.1874, - "step": 6888 - }, - { - "epoch": 2.13, - "learning_rate": 0.00010111527011525615, - "loss": 3.0249, - "step": 6889 - }, - { - "epoch": 2.13, - "learning_rate": 0.00010107964105601894, - "loss": 3.2617, - "step": 6890 - }, - { - "epoch": 2.13, - "learning_rate": 0.0001010440119967817, - "loss": 3.2121, - "step": 6891 - }, - { - "epoch": 2.13, - "learning_rate": 0.00010100838293754446, - "loss": 3.2809, - "step": 6892 - }, - { - "epoch": 2.13, - "learning_rate": 0.00010097275387830724, - "loss": 3.024, - "step": 6893 - }, - { - "epoch": 2.13, - "learning_rate": 0.00010093712481907001, - "loss": 2.9289, - "step": 6894 - }, 
- { - "epoch": 2.13, - "learning_rate": 0.00010090149575983279, - "loss": 2.8105, - "step": 6895 - }, - { - "epoch": 2.13, - "learning_rate": 0.00010086586670059555, - "loss": 2.839, - "step": 6896 - }, - { - "epoch": 2.13, - "learning_rate": 0.00010083023764135831, - "loss": 2.9934, - "step": 6897 - }, - { - "epoch": 2.13, - "learning_rate": 0.0001007946085821211, - "loss": 2.8787, - "step": 6898 - }, - { - "epoch": 2.13, - "learning_rate": 0.00010075897952288386, - "loss": 2.992, - "step": 6899 - }, - { - "epoch": 2.13, - "learning_rate": 0.00010072335046364665, - "loss": 2.9649, - "step": 6900 - }, - { - "epoch": 2.13, - "learning_rate": 0.0001006877214044094, - "loss": 2.6845, - "step": 6901 - }, - { - "epoch": 2.13, - "learning_rate": 0.00010065209234517218, - "loss": 2.946, - "step": 6902 - }, - { - "epoch": 2.13, - "learning_rate": 0.00010061646328593495, - "loss": 2.7703, - "step": 6903 - }, - { - "epoch": 2.13, - "learning_rate": 0.00010058083422669773, - "loss": 2.7239, - "step": 6904 - }, - { - "epoch": 2.13, - "learning_rate": 0.00010054520516746049, - "loss": 2.6666, - "step": 6905 - }, - { - "epoch": 2.13, - "learning_rate": 0.00010050957610822326, - "loss": 2.5851, - "step": 6906 - }, - { - "epoch": 2.13, - "learning_rate": 0.00010047394704898604, - "loss": 2.704, - "step": 6907 - }, - { - "epoch": 2.13, - "learning_rate": 0.00010043831798974881, - "loss": 2.7292, - "step": 6908 - }, - { - "epoch": 2.13, - "learning_rate": 0.00010040268893051158, - "loss": 2.4195, - "step": 6909 - }, - { - "epoch": 2.13, - "learning_rate": 0.00010036705987127434, - "loss": 2.5844, - "step": 6910 - }, - { - "epoch": 2.13, - "learning_rate": 0.00010033143081203713, - "loss": 2.3988, - "step": 6911 - }, - { - "epoch": 2.13, - "learning_rate": 0.0001002958017527999, - "loss": 2.4287, - "step": 6912 - }, - { - "epoch": 2.13, - "learning_rate": 0.00010026017269356265, - "loss": 2.4358, - "step": 6913 - }, - { - "epoch": 2.13, - "learning_rate": 0.00010022454363432543, - "loss": 2.1797, - "step": 6914 - }, - { - "epoch": 2.13, - "learning_rate": 0.0001001889145750882, - "loss": 2.2145, - "step": 6915 - }, - { - "epoch": 2.13, - "learning_rate": 0.00010015328551585098, - "loss": 2.1095, - "step": 6916 - }, - { - "epoch": 2.13, - "learning_rate": 0.00010011765645661375, - "loss": 2.226, - "step": 6917 - }, - { - "epoch": 2.14, - "learning_rate": 0.0001000820273973765, - "loss": 2.0479, - "step": 6918 - }, - { - "epoch": 2.14, - "learning_rate": 0.00010004639833813929, - "loss": 2.0911, - "step": 6919 - }, - { - "epoch": 2.14, - "learning_rate": 0.00010001076927890205, - "loss": 2.132, - "step": 6920 - }, - { - "epoch": 2.14, - "learning_rate": 9.997514021966484e-05, - "loss": 1.938, - "step": 6921 - }, - { - "epoch": 2.14, - "learning_rate": 9.993951116042759e-05, - "loss": 1.904, - "step": 6922 - }, - { - "epoch": 2.14, - "learning_rate": 9.990388210119037e-05, - "loss": 1.7755, - "step": 6923 - }, - { - "epoch": 2.14, - "learning_rate": 9.986825304195314e-05, - "loss": 1.78, - "step": 6924 - }, - { - "epoch": 2.14, - "learning_rate": 9.983262398271592e-05, - "loss": 1.7125, - "step": 6925 - }, - { - "epoch": 2.14, - "learning_rate": 9.979699492347868e-05, - "loss": 1.6069, - "step": 6926 - }, - { - "epoch": 2.14, - "learning_rate": 9.976136586424145e-05, - "loss": 1.571, - "step": 6927 - }, - { - "epoch": 2.14, - "learning_rate": 9.972573680500423e-05, - "loss": 1.469, - "step": 6928 - }, - { - "epoch": 2.14, - "learning_rate": 9.9690107745767e-05, - "loss": 1.5851, - "step": 6929 - }, - { - 
"epoch": 2.14, - "learning_rate": 9.965447868652975e-05, - "loss": 1.4929, - "step": 6930 - }, - { - "epoch": 2.14, - "learning_rate": 9.961884962729253e-05, - "loss": 3.8439, - "step": 6931 - }, - { - "epoch": 2.14, - "learning_rate": 9.95832205680553e-05, - "loss": 3.9943, - "step": 6932 - }, - { - "epoch": 2.14, - "learning_rate": 9.954759150881808e-05, - "loss": 3.6159, - "step": 6933 - }, - { - "epoch": 2.14, - "learning_rate": 9.951196244958085e-05, - "loss": 3.079, - "step": 6934 - }, - { - "epoch": 2.14, - "learning_rate": 9.947633339034362e-05, - "loss": 3.3909, - "step": 6935 - }, - { - "epoch": 2.14, - "learning_rate": 9.944070433110639e-05, - "loss": 3.1971, - "step": 6936 - }, - { - "epoch": 2.14, - "learning_rate": 9.940507527186917e-05, - "loss": 2.9335, - "step": 6937 - }, - { - "epoch": 2.14, - "learning_rate": 9.936944621263194e-05, - "loss": 3.2814, - "step": 6938 - }, - { - "epoch": 2.14, - "learning_rate": 9.933381715339469e-05, - "loss": 3.1233, - "step": 6939 - }, - { - "epoch": 2.14, - "learning_rate": 9.929818809415747e-05, - "loss": 3.0175, - "step": 6940 - }, - { - "epoch": 2.14, - "learning_rate": 9.926255903492024e-05, - "loss": 3.1412, - "step": 6941 - }, - { - "epoch": 2.14, - "learning_rate": 9.922692997568302e-05, - "loss": 3.1114, - "step": 6942 - }, - { - "epoch": 2.14, - "learning_rate": 9.919130091644578e-05, - "loss": 3.1215, - "step": 6943 - }, - { - "epoch": 2.14, - "learning_rate": 9.915567185720855e-05, - "loss": 2.8719, - "step": 6944 - }, - { - "epoch": 2.14, - "learning_rate": 9.912004279797133e-05, - "loss": 2.8315, - "step": 6945 - }, - { - "epoch": 2.14, - "learning_rate": 9.908441373873411e-05, - "loss": 2.8352, - "step": 6946 - }, - { - "epoch": 2.14, - "learning_rate": 9.904878467949687e-05, - "loss": 2.8364, - "step": 6947 - }, - { - "epoch": 2.14, - "learning_rate": 9.901315562025963e-05, - "loss": 2.7274, - "step": 6948 - }, - { - "epoch": 2.14, - "learning_rate": 9.897752656102242e-05, - "loss": 3.0258, - "step": 6949 - }, - { - "epoch": 2.15, - "learning_rate": 9.894189750178518e-05, - "loss": 2.6869, - "step": 6950 - }, - { - "epoch": 2.15, - "learning_rate": 9.890626844254797e-05, - "loss": 2.7359, - "step": 6951 - }, - { - "epoch": 2.15, - "learning_rate": 9.887063938331072e-05, - "loss": 2.6575, - "step": 6952 - }, - { - "epoch": 2.15, - "learning_rate": 9.883501032407349e-05, - "loss": 2.5025, - "step": 6953 - }, - { - "epoch": 2.15, - "learning_rate": 9.879938126483627e-05, - "loss": 2.8654, - "step": 6954 - }, - { - "epoch": 2.15, - "learning_rate": 9.876375220559904e-05, - "loss": 2.8307, - "step": 6955 - }, - { - "epoch": 2.15, - "learning_rate": 9.872812314636181e-05, - "loss": 2.5592, - "step": 6956 - }, - { - "epoch": 2.15, - "learning_rate": 9.869249408712458e-05, - "loss": 2.4106, - "step": 6957 - }, - { - "epoch": 2.15, - "learning_rate": 9.865686502788736e-05, - "loss": 2.4343, - "step": 6958 - }, - { - "epoch": 2.15, - "learning_rate": 9.862123596865013e-05, - "loss": 2.782, - "step": 6959 - }, - { - "epoch": 2.15, - "learning_rate": 9.858560690941288e-05, - "loss": 2.5913, - "step": 6960 - }, - { - "epoch": 2.15, - "learning_rate": 9.854997785017566e-05, - "loss": 2.2276, - "step": 6961 - }, - { - "epoch": 2.15, - "learning_rate": 9.851434879093843e-05, - "loss": 2.5371, - "step": 6962 - }, - { - "epoch": 2.15, - "learning_rate": 9.847871973170121e-05, - "loss": 2.3627, - "step": 6963 - }, - { - "epoch": 2.15, - "learning_rate": 9.844309067246397e-05, - "loss": 2.1625, - "step": 6964 - }, - { - "epoch": 2.15, - 
"learning_rate": 9.840746161322674e-05, - "loss": 2.04, - "step": 6965 - }, - { - "epoch": 2.15, - "learning_rate": 9.837183255398952e-05, - "loss": 1.9796, - "step": 6966 - }, - { - "epoch": 2.15, - "learning_rate": 9.833620349475229e-05, - "loss": 2.169, - "step": 6967 - }, - { - "epoch": 2.15, - "learning_rate": 9.830057443551507e-05, - "loss": 2.2094, - "step": 6968 - }, - { - "epoch": 2.15, - "learning_rate": 9.826494537627782e-05, - "loss": 2.1764, - "step": 6969 - }, - { - "epoch": 2.15, - "learning_rate": 9.82293163170406e-05, - "loss": 2.2942, - "step": 6970 - }, - { - "epoch": 2.15, - "learning_rate": 9.819368725780337e-05, - "loss": 2.171, - "step": 6971 - }, - { - "epoch": 2.15, - "learning_rate": 9.815805819856616e-05, - "loss": 2.0024, - "step": 6972 - }, - { - "epoch": 2.15, - "learning_rate": 9.812242913932891e-05, - "loss": 1.7423, - "step": 6973 - }, - { - "epoch": 2.15, - "learning_rate": 9.808680008009168e-05, - "loss": 1.5334, - "step": 6974 - }, - { - "epoch": 2.15, - "learning_rate": 9.805117102085446e-05, - "loss": 1.7629, - "step": 6975 - }, - { - "epoch": 2.15, - "learning_rate": 9.801554196161723e-05, - "loss": 1.5446, - "step": 6976 - }, - { - "epoch": 2.15, - "learning_rate": 9.797991290237998e-05, - "loss": 1.855, - "step": 6977 - }, - { - "epoch": 2.15, - "learning_rate": 9.794428384314277e-05, - "loss": 1.4562, - "step": 6978 - }, - { - "epoch": 2.15, - "learning_rate": 9.790865478390553e-05, - "loss": 1.4098, - "step": 6979 - }, - { - "epoch": 2.15, - "learning_rate": 9.787302572466832e-05, - "loss": 1.4501, - "step": 6980 - }, - { - "epoch": 2.15, - "learning_rate": 9.783739666543108e-05, - "loss": 3.8114, - "step": 6981 - }, - { - "epoch": 2.15, - "learning_rate": 9.780176760619385e-05, - "loss": 3.8206, - "step": 6982 - }, - { - "epoch": 2.16, - "learning_rate": 9.776613854695662e-05, - "loss": 3.4749, - "step": 6983 - }, - { - "epoch": 2.16, - "learning_rate": 9.77305094877194e-05, - "loss": 3.2544, - "step": 6984 - }, - { - "epoch": 2.16, - "learning_rate": 9.769488042848217e-05, - "loss": 3.2258, - "step": 6985 - }, - { - "epoch": 2.16, - "learning_rate": 9.765925136924493e-05, - "loss": 3.2236, - "step": 6986 - }, - { - "epoch": 2.16, - "learning_rate": 9.762362231000771e-05, - "loss": 3.2548, - "step": 6987 - }, - { - "epoch": 2.16, - "learning_rate": 9.758799325077048e-05, - "loss": 3.1096, - "step": 6988 - }, - { - "epoch": 2.16, - "learning_rate": 9.755236419153326e-05, - "loss": 2.954, - "step": 6989 - }, - { - "epoch": 2.16, - "learning_rate": 9.751673513229601e-05, - "loss": 3.1953, - "step": 6990 - }, - { - "epoch": 2.16, - "learning_rate": 9.74811060730588e-05, - "loss": 3.1113, - "step": 6991 - }, - { - "epoch": 2.16, - "learning_rate": 9.744547701382156e-05, - "loss": 3.0404, - "step": 6992 - }, - { - "epoch": 2.16, - "learning_rate": 9.740984795458434e-05, - "loss": 3.0546, - "step": 6993 - }, - { - "epoch": 2.16, - "learning_rate": 9.73742188953471e-05, - "loss": 3.1705, - "step": 6994 - }, - { - "epoch": 2.16, - "learning_rate": 9.733858983610987e-05, - "loss": 2.8259, - "step": 6995 - }, - { - "epoch": 2.16, - "learning_rate": 9.730296077687265e-05, - "loss": 2.8109, - "step": 6996 - }, - { - "epoch": 2.16, - "learning_rate": 9.726733171763542e-05, - "loss": 3.0071, - "step": 6997 - }, - { - "epoch": 2.16, - "learning_rate": 9.72317026583982e-05, - "loss": 2.7021, - "step": 6998 - }, - { - "epoch": 2.16, - "learning_rate": 9.719607359916095e-05, - "loss": 2.801, - "step": 6999 - }, - { - "epoch": 2.16, - "learning_rate": 
9.716044453992372e-05, - "loss": 2.6245, - "step": 7000 - }, - { - "epoch": 2.16, - "eval_bleu": 2.8620835163254036e-20, - "eval_loss": 4.03611946105957, - "eval_runtime": 2538.6544, - "eval_samples_per_second": 5.814, - "eval_steps_per_second": 0.727, - "step": 7000 - }, - { - "epoch": 2.16, - "learning_rate": 9.71248154806865e-05, - "loss": 2.574, - "step": 7001 - }, - { - "epoch": 2.16, - "learning_rate": 9.708918642144927e-05, - "loss": 2.9205, - "step": 7002 - }, - { - "epoch": 2.16, - "learning_rate": 9.705355736221204e-05, - "loss": 2.6834, - "step": 7003 - }, - { - "epoch": 2.16, - "learning_rate": 9.701792830297481e-05, - "loss": 2.5536, - "step": 7004 - }, - { - "epoch": 2.16, - "learning_rate": 9.698229924373759e-05, - "loss": 2.4832, - "step": 7005 - }, - { - "epoch": 2.16, - "learning_rate": 9.694667018450036e-05, - "loss": 2.5602, - "step": 7006 - }, - { - "epoch": 2.16, - "learning_rate": 9.691104112526311e-05, - "loss": 2.6101, - "step": 7007 - }, - { - "epoch": 2.16, - "learning_rate": 9.68754120660259e-05, - "loss": 2.4503, - "step": 7008 - }, - { - "epoch": 2.16, - "learning_rate": 9.683978300678867e-05, - "loss": 2.3585, - "step": 7009 - }, - { - "epoch": 2.16, - "learning_rate": 9.680415394755145e-05, - "loss": 2.4902, - "step": 7010 - }, - { - "epoch": 2.16, - "learning_rate": 9.67685248883142e-05, - "loss": 2.2664, - "step": 7011 - }, - { - "epoch": 2.16, - "learning_rate": 9.673289582907697e-05, - "loss": 2.5169, - "step": 7012 - }, - { - "epoch": 2.16, - "learning_rate": 9.669726676983975e-05, - "loss": 2.1318, - "step": 7013 - }, - { - "epoch": 2.16, - "learning_rate": 9.666163771060252e-05, - "loss": 2.2824, - "step": 7014 - }, - { - "epoch": 2.17, - "learning_rate": 9.66260086513653e-05, - "loss": 2.3317, - "step": 7015 - }, - { - "epoch": 2.17, - "learning_rate": 9.659037959212806e-05, - "loss": 2.2483, - "step": 7016 - }, - { - "epoch": 2.17, - "learning_rate": 9.655475053289084e-05, - "loss": 1.7396, - "step": 7017 - }, - { - "epoch": 2.17, - "learning_rate": 9.651912147365361e-05, - "loss": 2.0274, - "step": 7018 - }, - { - "epoch": 2.17, - "learning_rate": 9.648349241441639e-05, - "loss": 1.9994, - "step": 7019 - }, - { - "epoch": 2.17, - "learning_rate": 9.644786335517914e-05, - "loss": 1.9376, - "step": 7020 - }, - { - "epoch": 2.17, - "learning_rate": 9.641223429594191e-05, - "loss": 1.8724, - "step": 7021 - }, - { - "epoch": 2.17, - "learning_rate": 9.63766052367047e-05, - "loss": 1.885, - "step": 7022 - }, - { - "epoch": 2.17, - "learning_rate": 9.634097617746746e-05, - "loss": 1.7585, - "step": 7023 - }, - { - "epoch": 2.17, - "learning_rate": 9.630534711823022e-05, - "loss": 1.6454, - "step": 7024 - }, - { - "epoch": 2.17, - "learning_rate": 9.6269718058993e-05, - "loss": 1.5186, - "step": 7025 - }, - { - "epoch": 2.17, - "learning_rate": 9.623408899975577e-05, - "loss": 1.6844, - "step": 7026 - }, - { - "epoch": 2.17, - "learning_rate": 9.619845994051855e-05, - "loss": 1.6363, - "step": 7027 - }, - { - "epoch": 2.17, - "learning_rate": 9.61628308812813e-05, - "loss": 1.4135, - "step": 7028 - }, - { - "epoch": 2.17, - "learning_rate": 9.612720182204409e-05, - "loss": 1.3912, - "step": 7029 - }, - { - "epoch": 2.17, - "learning_rate": 9.609157276280685e-05, - "loss": 1.5031, - "step": 7030 - }, - { - "epoch": 2.17, - "learning_rate": 9.605594370356964e-05, - "loss": 4.1439, - "step": 7031 - }, - { - "epoch": 2.17, - "learning_rate": 9.60203146443324e-05, - "loss": 4.0924, - "step": 7032 - }, - { - "epoch": 2.17, - "learning_rate": 
9.598468558509516e-05, - "loss": 3.574, - "step": 7033 - }, - { - "epoch": 2.17, - "learning_rate": 9.594905652585794e-05, - "loss": 3.42, - "step": 7034 - }, - { - "epoch": 2.17, - "learning_rate": 9.591342746662071e-05, - "loss": 3.3048, - "step": 7035 - }, - { - "epoch": 2.17, - "learning_rate": 9.587779840738349e-05, - "loss": 3.0996, - "step": 7036 - }, - { - "epoch": 2.17, - "learning_rate": 9.584216934814625e-05, - "loss": 3.3798, - "step": 7037 - }, - { - "epoch": 2.17, - "learning_rate": 9.580654028890903e-05, - "loss": 3.1394, - "step": 7038 - }, - { - "epoch": 2.17, - "learning_rate": 9.57709112296718e-05, - "loss": 3.0501, - "step": 7039 - }, - { - "epoch": 2.17, - "learning_rate": 9.573528217043458e-05, - "loss": 2.9482, - "step": 7040 - }, - { - "epoch": 2.17, - "learning_rate": 9.569965311119733e-05, - "loss": 3.1753, - "step": 7041 - }, - { - "epoch": 2.17, - "learning_rate": 9.56640240519601e-05, - "loss": 3.0658, - "step": 7042 - }, - { - "epoch": 2.17, - "learning_rate": 9.562839499272288e-05, - "loss": 3.309, - "step": 7043 - }, - { - "epoch": 2.17, - "learning_rate": 9.559276593348565e-05, - "loss": 2.9948, - "step": 7044 - }, - { - "epoch": 2.17, - "learning_rate": 9.55571368742484e-05, - "loss": 2.9465, - "step": 7045 - }, - { - "epoch": 2.17, - "learning_rate": 9.552150781501119e-05, - "loss": 3.063, - "step": 7046 - }, - { - "epoch": 2.17, - "learning_rate": 9.548587875577396e-05, - "loss": 2.5341, - "step": 7047 - }, - { - "epoch": 2.18, - "learning_rate": 9.545024969653674e-05, - "loss": 2.8882, - "step": 7048 - }, - { - "epoch": 2.18, - "learning_rate": 9.54146206372995e-05, - "loss": 3.1003, - "step": 7049 - }, - { - "epoch": 2.18, - "learning_rate": 9.537899157806227e-05, - "loss": 2.7254, - "step": 7050 - }, - { - "epoch": 2.18, - "learning_rate": 9.534336251882504e-05, - "loss": 2.6725, - "step": 7051 - }, - { - "epoch": 2.18, - "learning_rate": 9.530773345958782e-05, - "loss": 2.5556, - "step": 7052 - }, - { - "epoch": 2.18, - "learning_rate": 9.527210440035059e-05, - "loss": 2.9176, - "step": 7053 - }, - { - "epoch": 2.18, - "learning_rate": 9.523647534111335e-05, - "loss": 2.6866, - "step": 7054 - }, - { - "epoch": 2.18, - "learning_rate": 9.520084628187613e-05, - "loss": 2.7528, - "step": 7055 - }, - { - "epoch": 2.18, - "learning_rate": 9.51652172226389e-05, - "loss": 2.7326, - "step": 7056 - }, - { - "epoch": 2.18, - "learning_rate": 9.512958816340168e-05, - "loss": 2.8559, - "step": 7057 - }, - { - "epoch": 2.18, - "learning_rate": 9.509395910416443e-05, - "loss": 2.437, - "step": 7058 - }, - { - "epoch": 2.18, - "learning_rate": 9.50583300449272e-05, - "loss": 2.3919, - "step": 7059 - }, - { - "epoch": 2.18, - "learning_rate": 9.502270098568999e-05, - "loss": 2.5903, - "step": 7060 - }, - { - "epoch": 2.18, - "learning_rate": 9.498707192645275e-05, - "loss": 2.2275, - "step": 7061 - }, - { - "epoch": 2.18, - "learning_rate": 9.495144286721552e-05, - "loss": 2.4596, - "step": 7062 - }, - { - "epoch": 2.18, - "learning_rate": 9.491581380797829e-05, - "loss": 2.2534, - "step": 7063 - }, - { - "epoch": 2.18, - "learning_rate": 9.488018474874107e-05, - "loss": 2.1276, - "step": 7064 - }, - { - "epoch": 2.18, - "learning_rate": 9.484455568950384e-05, - "loss": 2.2766, - "step": 7065 - }, - { - "epoch": 2.18, - "learning_rate": 9.480892663026662e-05, - "loss": 1.99, - "step": 7066 - }, - { - "epoch": 2.18, - "learning_rate": 9.477329757102938e-05, - "loss": 2.0476, - "step": 7067 - }, - { - "epoch": 2.18, - "learning_rate": 9.473766851179215e-05, - "loss": 
1.9067, - "step": 7068 - }, - { - "epoch": 2.18, - "learning_rate": 9.470203945255493e-05, - "loss": 2.0632, - "step": 7069 - }, - { - "epoch": 2.18, - "learning_rate": 9.46664103933177e-05, - "loss": 2.0635, - "step": 7070 - }, - { - "epoch": 2.18, - "learning_rate": 9.463078133408045e-05, - "loss": 1.9559, - "step": 7071 - }, - { - "epoch": 2.18, - "learning_rate": 9.459515227484323e-05, - "loss": 1.9201, - "step": 7072 - }, - { - "epoch": 2.18, - "learning_rate": 9.4559523215606e-05, - "loss": 1.8794, - "step": 7073 - }, - { - "epoch": 2.18, - "learning_rate": 9.452389415636878e-05, - "loss": 1.8192, - "step": 7074 - }, - { - "epoch": 2.18, - "learning_rate": 9.448826509713154e-05, - "loss": 1.5874, - "step": 7075 - }, - { - "epoch": 2.18, - "learning_rate": 9.445263603789432e-05, - "loss": 1.7251, - "step": 7076 - }, - { - "epoch": 2.18, - "learning_rate": 9.441700697865709e-05, - "loss": 1.6698, - "step": 7077 - }, - { - "epoch": 2.18, - "learning_rate": 9.438137791941987e-05, - "loss": 1.4697, - "step": 7078 - }, - { - "epoch": 2.18, - "learning_rate": 9.434574886018264e-05, - "loss": 1.5252, - "step": 7079 - }, - { - "epoch": 2.19, - "learning_rate": 9.431011980094539e-05, - "loss": 1.3799, - "step": 7080 - }, - { - "epoch": 2.19, - "learning_rate": 9.427449074170817e-05, - "loss": 4.0793, - "step": 7081 - }, - { - "epoch": 2.19, - "learning_rate": 9.423886168247094e-05, - "loss": 3.9396, - "step": 7082 - }, - { - "epoch": 2.19, - "learning_rate": 9.420323262323372e-05, - "loss": 3.4548, - "step": 7083 - }, - { - "epoch": 2.19, - "learning_rate": 9.416760356399648e-05, - "loss": 3.5485, - "step": 7084 - }, - { - "epoch": 2.19, - "learning_rate": 9.413197450475926e-05, - "loss": 3.3483, - "step": 7085 - }, - { - "epoch": 2.19, - "learning_rate": 9.409634544552203e-05, - "loss": 3.2709, - "step": 7086 - }, - { - "epoch": 2.19, - "learning_rate": 9.406071638628481e-05, - "loss": 3.2314, - "step": 7087 - }, - { - "epoch": 2.19, - "learning_rate": 9.402508732704757e-05, - "loss": 3.0133, - "step": 7088 - }, - { - "epoch": 2.19, - "learning_rate": 9.398945826781033e-05, - "loss": 3.2818, - "step": 7089 - }, - { - "epoch": 2.19, - "learning_rate": 9.395382920857312e-05, - "loss": 3.3336, - "step": 7090 - }, - { - "epoch": 2.19, - "learning_rate": 9.391820014933588e-05, - "loss": 3.2512, - "step": 7091 - }, - { - "epoch": 2.19, - "learning_rate": 9.388257109009864e-05, - "loss": 2.8849, - "step": 7092 - }, - { - "epoch": 2.19, - "learning_rate": 9.384694203086142e-05, - "loss": 3.2229, - "step": 7093 - }, - { - "epoch": 2.19, - "learning_rate": 9.381131297162419e-05, - "loss": 2.6233, - "step": 7094 - }, - { - "epoch": 2.19, - "learning_rate": 9.377568391238697e-05, - "loss": 2.509, - "step": 7095 - }, - { - "epoch": 2.19, - "learning_rate": 9.374005485314974e-05, - "loss": 2.8394, - "step": 7096 - }, - { - "epoch": 2.19, - "learning_rate": 9.370442579391251e-05, - "loss": 2.847, - "step": 7097 - }, - { - "epoch": 2.19, - "learning_rate": 9.366879673467528e-05, - "loss": 2.8392, - "step": 7098 - }, - { - "epoch": 2.19, - "learning_rate": 9.363316767543806e-05, - "loss": 2.665, - "step": 7099 - }, - { - "epoch": 2.19, - "learning_rate": 9.359753861620083e-05, - "loss": 2.9042, - "step": 7100 - }, - { - "epoch": 2.19, - "learning_rate": 9.356190955696358e-05, - "loss": 2.772, - "step": 7101 - }, - { - "epoch": 2.19, - "learning_rate": 9.352628049772636e-05, - "loss": 2.9732, - "step": 7102 - }, - { - "epoch": 2.19, - "learning_rate": 9.349065143848913e-05, - "loss": 2.6224, - "step": 7103 - 
}, - { - "epoch": 2.19, - "learning_rate": 9.345502237925191e-05, - "loss": 2.9007, - "step": 7104 - }, - { - "epoch": 2.19, - "learning_rate": 9.341939332001467e-05, - "loss": 2.5735, - "step": 7105 - }, - { - "epoch": 2.19, - "learning_rate": 9.338376426077744e-05, - "loss": 2.5649, - "step": 7106 - }, - { - "epoch": 2.19, - "learning_rate": 9.334813520154022e-05, - "loss": 2.4071, - "step": 7107 - }, - { - "epoch": 2.19, - "learning_rate": 9.331250614230299e-05, - "loss": 2.4105, - "step": 7108 - }, - { - "epoch": 2.19, - "learning_rate": 9.327687708306575e-05, - "loss": 2.3269, - "step": 7109 - }, - { - "epoch": 2.19, - "learning_rate": 9.324124802382852e-05, - "loss": 2.0712, - "step": 7110 - }, - { - "epoch": 2.19, - "learning_rate": 9.32056189645913e-05, - "loss": 2.2462, - "step": 7111 - }, - { - "epoch": 2.2, - "learning_rate": 9.316998990535407e-05, - "loss": 2.1748, - "step": 7112 - }, - { - "epoch": 2.2, - "learning_rate": 9.313436084611686e-05, - "loss": 2.5728, - "step": 7113 - }, - { - "epoch": 2.2, - "learning_rate": 9.309873178687961e-05, - "loss": 2.3369, - "step": 7114 - }, - { - "epoch": 2.2, - "learning_rate": 9.306310272764238e-05, - "loss": 2.046, - "step": 7115 - }, - { - "epoch": 2.2, - "learning_rate": 9.302747366840516e-05, - "loss": 2.3248, - "step": 7116 - }, - { - "epoch": 2.2, - "learning_rate": 9.299184460916793e-05, - "loss": 2.0643, - "step": 7117 - }, - { - "epoch": 2.2, - "learning_rate": 9.295621554993068e-05, - "loss": 2.1423, - "step": 7118 - }, - { - "epoch": 2.2, - "learning_rate": 9.292058649069347e-05, - "loss": 2.1726, - "step": 7119 - }, - { - "epoch": 2.2, - "learning_rate": 9.288495743145625e-05, - "loss": 1.9508, - "step": 7120 - }, - { - "epoch": 2.2, - "learning_rate": 9.284932837221902e-05, - "loss": 2.0883, - "step": 7121 - }, - { - "epoch": 2.2, - "learning_rate": 9.281369931298177e-05, - "loss": 1.8155, - "step": 7122 - }, - { - "epoch": 2.2, - "learning_rate": 9.277807025374455e-05, - "loss": 1.9422, - "step": 7123 - }, - { - "epoch": 2.2, - "learning_rate": 9.274244119450732e-05, - "loss": 1.9246, - "step": 7124 - }, - { - "epoch": 2.2, - "learning_rate": 9.27068121352701e-05, - "loss": 1.8587, - "step": 7125 - }, - { - "epoch": 2.2, - "learning_rate": 9.267118307603286e-05, - "loss": 1.6912, - "step": 7126 - }, - { - "epoch": 2.2, - "learning_rate": 9.263555401679563e-05, - "loss": 1.5325, - "step": 7127 - }, - { - "epoch": 2.2, - "learning_rate": 9.259992495755841e-05, - "loss": 1.5973, - "step": 7128 - }, - { - "epoch": 2.2, - "learning_rate": 9.256429589832118e-05, - "loss": 1.3618, - "step": 7129 - }, - { - "epoch": 2.2, - "learning_rate": 9.252866683908396e-05, - "loss": 1.4086, - "step": 7130 - }, - { - "epoch": 2.2, - "learning_rate": 9.249303777984671e-05, - "loss": 4.0022, - "step": 7131 - }, - { - "epoch": 2.2, - "learning_rate": 9.24574087206095e-05, - "loss": 3.7553, - "step": 7132 - }, - { - "epoch": 2.2, - "learning_rate": 9.242177966137226e-05, - "loss": 3.1806, - "step": 7133 - }, - { - "epoch": 2.2, - "learning_rate": 9.238615060213504e-05, - "loss": 3.5333, - "step": 7134 - }, - { - "epoch": 2.2, - "learning_rate": 9.23505215428978e-05, - "loss": 3.2706, - "step": 7135 - }, - { - "epoch": 2.2, - "learning_rate": 9.231489248366057e-05, - "loss": 3.2139, - "step": 7136 - }, - { - "epoch": 2.2, - "learning_rate": 9.227926342442335e-05, - "loss": 3.103, - "step": 7137 - }, - { - "epoch": 2.2, - "learning_rate": 9.224363436518612e-05, - "loss": 3.3027, - "step": 7138 - }, - { - "epoch": 2.2, - "learning_rate": 
9.220800530594887e-05, - "loss": 3.1915, - "step": 7139 - }, - { - "epoch": 2.2, - "learning_rate": 9.217237624671165e-05, - "loss": 3.0811, - "step": 7140 - }, - { - "epoch": 2.2, - "learning_rate": 9.213674718747442e-05, - "loss": 3.1187, - "step": 7141 - }, - { - "epoch": 2.2, - "learning_rate": 9.21011181282372e-05, - "loss": 2.8269, - "step": 7142 - }, - { - "epoch": 2.2, - "learning_rate": 9.206548906899996e-05, - "loss": 2.9461, - "step": 7143 - }, - { - "epoch": 2.2, - "learning_rate": 9.202986000976274e-05, - "loss": 3.0701, - "step": 7144 - }, - { - "epoch": 2.21, - "learning_rate": 9.199423095052551e-05, - "loss": 2.6662, - "step": 7145 - }, - { - "epoch": 2.21, - "learning_rate": 9.195860189128829e-05, - "loss": 3.0339, - "step": 7146 - }, - { - "epoch": 2.21, - "learning_rate": 9.192297283205106e-05, - "loss": 3.0879, - "step": 7147 - }, - { - "epoch": 2.21, - "learning_rate": 9.188734377281381e-05, - "loss": 2.7355, - "step": 7148 - }, - { - "epoch": 2.21, - "learning_rate": 9.18517147135766e-05, - "loss": 2.7228, - "step": 7149 - }, - { - "epoch": 2.21, - "learning_rate": 9.181608565433936e-05, - "loss": 3.1554, - "step": 7150 - }, - { - "epoch": 2.21, - "learning_rate": 9.178045659510215e-05, - "loss": 2.656, - "step": 7151 - }, - { - "epoch": 2.21, - "learning_rate": 9.17448275358649e-05, - "loss": 2.7158, - "step": 7152 - }, - { - "epoch": 2.21, - "learning_rate": 9.170919847662767e-05, - "loss": 2.8427, - "step": 7153 - }, - { - "epoch": 2.21, - "learning_rate": 9.167356941739045e-05, - "loss": 2.6793, - "step": 7154 - }, - { - "epoch": 2.21, - "learning_rate": 9.163794035815322e-05, - "loss": 2.5299, - "step": 7155 - }, - { - "epoch": 2.21, - "learning_rate": 9.160231129891599e-05, - "loss": 2.7473, - "step": 7156 - }, - { - "epoch": 2.21, - "learning_rate": 9.156668223967876e-05, - "loss": 2.6903, - "step": 7157 - }, - { - "epoch": 2.21, - "learning_rate": 9.153105318044154e-05, - "loss": 2.5411, - "step": 7158 - }, - { - "epoch": 2.21, - "learning_rate": 9.14954241212043e-05, - "loss": 2.4399, - "step": 7159 - }, - { - "epoch": 2.21, - "learning_rate": 9.145979506196706e-05, - "loss": 2.2822, - "step": 7160 - }, - { - "epoch": 2.21, - "learning_rate": 9.142416600272984e-05, - "loss": 2.4888, - "step": 7161 - }, - { - "epoch": 2.21, - "learning_rate": 9.138853694349261e-05, - "loss": 2.2486, - "step": 7162 - }, - { - "epoch": 2.21, - "learning_rate": 9.13529078842554e-05, - "loss": 2.2903, - "step": 7163 - }, - { - "epoch": 2.21, - "learning_rate": 9.131727882501816e-05, - "loss": 2.2899, - "step": 7164 - }, - { - "epoch": 2.21, - "learning_rate": 9.128164976578093e-05, - "loss": 2.2444, - "step": 7165 - }, - { - "epoch": 2.21, - "learning_rate": 9.12460207065437e-05, - "loss": 2.097, - "step": 7166 - }, - { - "epoch": 2.21, - "learning_rate": 9.121039164730648e-05, - "loss": 2.0552, - "step": 7167 - }, - { - "epoch": 2.21, - "learning_rate": 9.117476258806925e-05, - "loss": 2.032, - "step": 7168 - }, - { - "epoch": 2.21, - "learning_rate": 9.1139133528832e-05, - "loss": 2.1937, - "step": 7169 - }, - { - "epoch": 2.21, - "learning_rate": 9.110350446959479e-05, - "loss": 1.9071, - "step": 7170 - }, - { - "epoch": 2.21, - "learning_rate": 9.106787541035755e-05, - "loss": 1.8828, - "step": 7171 - }, - { - "epoch": 2.21, - "learning_rate": 9.103224635112034e-05, - "loss": 1.7573, - "step": 7172 - }, - { - "epoch": 2.21, - "learning_rate": 9.099661729188309e-05, - "loss": 1.829, - "step": 7173 - }, - { - "epoch": 2.21, - "learning_rate": 9.096098823264586e-05, - "loss": 
1.5917, - "step": 7174 - }, - { - "epoch": 2.21, - "learning_rate": 9.092535917340864e-05, - "loss": 1.664, - "step": 7175 - }, - { - "epoch": 2.21, - "learning_rate": 9.088973011417141e-05, - "loss": 1.6731, - "step": 7176 - }, - { - "epoch": 2.22, - "learning_rate": 9.085410105493419e-05, - "loss": 1.3628, - "step": 7177 - }, - { - "epoch": 2.22, - "learning_rate": 9.081847199569695e-05, - "loss": 1.3825, - "step": 7178 - }, - { - "epoch": 2.22, - "learning_rate": 9.078284293645973e-05, - "loss": 1.344, - "step": 7179 - }, - { - "epoch": 2.22, - "learning_rate": 9.07472138772225e-05, - "loss": 1.4166, - "step": 7180 - }, - { - "epoch": 2.22, - "learning_rate": 9.071158481798528e-05, - "loss": 3.9118, - "step": 7181 - }, - { - "epoch": 2.22, - "learning_rate": 9.067595575874803e-05, - "loss": 3.8231, - "step": 7182 - }, - { - "epoch": 2.22, - "learning_rate": 9.06403266995108e-05, - "loss": 3.2446, - "step": 7183 - }, - { - "epoch": 2.22, - "learning_rate": 9.060469764027358e-05, - "loss": 3.4876, - "step": 7184 - }, - { - "epoch": 2.22, - "learning_rate": 9.056906858103635e-05, - "loss": 3.3315, - "step": 7185 - }, - { - "epoch": 2.22, - "learning_rate": 9.05334395217991e-05, - "loss": 3.005, - "step": 7186 - }, - { - "epoch": 2.22, - "learning_rate": 9.049781046256189e-05, - "loss": 3.4032, - "step": 7187 - }, - { - "epoch": 2.22, - "learning_rate": 9.046218140332466e-05, - "loss": 3.1122, - "step": 7188 - }, - { - "epoch": 2.22, - "learning_rate": 9.042655234408744e-05, - "loss": 3.0031, - "step": 7189 - }, - { - "epoch": 2.22, - "learning_rate": 9.039092328485019e-05, - "loss": 2.7823, - "step": 7190 - }, - { - "epoch": 2.22, - "learning_rate": 9.035529422561297e-05, - "loss": 3.2138, - "step": 7191 - }, - { - "epoch": 2.22, - "learning_rate": 9.031966516637574e-05, - "loss": 3.2384, - "step": 7192 - }, - { - "epoch": 2.22, - "learning_rate": 9.028403610713852e-05, - "loss": 3.061, - "step": 7193 - }, - { - "epoch": 2.22, - "learning_rate": 9.024840704790129e-05, - "loss": 2.6907, - "step": 7194 - }, - { - "epoch": 2.22, - "learning_rate": 9.021277798866405e-05, - "loss": 2.8913, - "step": 7195 - }, - { - "epoch": 2.22, - "learning_rate": 9.017714892942683e-05, - "loss": 3.0697, - "step": 7196 - }, - { - "epoch": 2.22, - "learning_rate": 9.01415198701896e-05, - "loss": 2.7372, - "step": 7197 - }, - { - "epoch": 2.22, - "learning_rate": 9.010589081095238e-05, - "loss": 2.8258, - "step": 7198 - }, - { - "epoch": 2.22, - "learning_rate": 9.007026175171513e-05, - "loss": 2.8758, - "step": 7199 - }, - { - "epoch": 2.22, - "learning_rate": 9.00346326924779e-05, - "loss": 2.758, - "step": 7200 - }, - { - "epoch": 2.22, - "learning_rate": 8.999900363324068e-05, - "loss": 2.7042, - "step": 7201 - }, - { - "epoch": 2.22, - "learning_rate": 8.996337457400345e-05, - "loss": 2.8882, - "step": 7202 - }, - { - "epoch": 2.22, - "learning_rate": 8.992774551476622e-05, - "loss": 2.7637, - "step": 7203 - }, - { - "epoch": 2.22, - "learning_rate": 8.989211645552899e-05, - "loss": 2.5469, - "step": 7204 - }, - { - "epoch": 2.22, - "learning_rate": 8.985648739629177e-05, - "loss": 2.5556, - "step": 7205 - }, - { - "epoch": 2.22, - "learning_rate": 8.982085833705454e-05, - "loss": 2.4803, - "step": 7206 - }, - { - "epoch": 2.22, - "learning_rate": 8.97852292778173e-05, - "loss": 2.4158, - "step": 7207 - }, - { - "epoch": 2.22, - "learning_rate": 8.974960021858008e-05, - "loss": 2.4123, - "step": 7208 - }, - { - "epoch": 2.22, - "learning_rate": 8.971397115934284e-05, - "loss": 2.4978, - "step": 7209 - }, - 
{ - "epoch": 2.23, - "learning_rate": 8.967834210010563e-05, - "loss": 2.4013, - "step": 7210 - }, - { - "epoch": 2.23, - "learning_rate": 8.96427130408684e-05, - "loss": 2.1052, - "step": 7211 - }, - { - "epoch": 2.23, - "learning_rate": 8.960708398163116e-05, - "loss": 2.4701, - "step": 7212 - }, - { - "epoch": 2.23, - "learning_rate": 8.957145492239393e-05, - "loss": 2.3424, - "step": 7213 - }, - { - "epoch": 2.23, - "learning_rate": 8.953582586315671e-05, - "loss": 2.2205, - "step": 7214 - }, - { - "epoch": 2.23, - "learning_rate": 8.950019680391948e-05, - "loss": 2.4749, - "step": 7215 - }, - { - "epoch": 2.23, - "learning_rate": 8.946456774468224e-05, - "loss": 2.1082, - "step": 7216 - }, - { - "epoch": 2.23, - "learning_rate": 8.942893868544502e-05, - "loss": 2.2423, - "step": 7217 - }, - { - "epoch": 2.23, - "learning_rate": 8.939330962620779e-05, - "loss": 2.1538, - "step": 7218 - }, - { - "epoch": 2.23, - "learning_rate": 8.935768056697057e-05, - "loss": 2.0366, - "step": 7219 - }, - { - "epoch": 2.23, - "learning_rate": 8.932205150773332e-05, - "loss": 2.0335, - "step": 7220 - }, - { - "epoch": 2.23, - "learning_rate": 8.928642244849609e-05, - "loss": 1.9583, - "step": 7221 - }, - { - "epoch": 2.23, - "learning_rate": 8.925079338925887e-05, - "loss": 1.911, - "step": 7222 - }, - { - "epoch": 2.23, - "learning_rate": 8.921516433002164e-05, - "loss": 1.8713, - "step": 7223 - }, - { - "epoch": 2.23, - "learning_rate": 8.917953527078441e-05, - "loss": 1.7365, - "step": 7224 - }, - { - "epoch": 2.23, - "learning_rate": 8.914390621154718e-05, - "loss": 1.6285, - "step": 7225 - }, - { - "epoch": 2.23, - "learning_rate": 8.910827715230996e-05, - "loss": 1.5177, - "step": 7226 - }, - { - "epoch": 2.23, - "learning_rate": 8.907264809307273e-05, - "loss": 1.6017, - "step": 7227 - }, - { - "epoch": 2.23, - "learning_rate": 8.903701903383551e-05, - "loss": 1.4585, - "step": 7228 - }, - { - "epoch": 2.23, - "learning_rate": 8.900138997459827e-05, - "loss": 1.5473, - "step": 7229 - }, - { - "epoch": 2.23, - "learning_rate": 8.896576091536103e-05, - "loss": 1.4081, - "step": 7230 - }, - { - "epoch": 2.23, - "learning_rate": 8.893013185612382e-05, - "loss": 4.0931, - "step": 7231 - }, - { - "epoch": 2.23, - "learning_rate": 8.889450279688658e-05, - "loss": 3.8113, - "step": 7232 - }, - { - "epoch": 2.23, - "learning_rate": 8.885887373764934e-05, - "loss": 3.5583, - "step": 7233 - }, - { - "epoch": 2.23, - "learning_rate": 8.882324467841212e-05, - "loss": 3.4705, - "step": 7234 - }, - { - "epoch": 2.23, - "learning_rate": 8.878761561917489e-05, - "loss": 3.2775, - "step": 7235 - }, - { - "epoch": 2.23, - "learning_rate": 8.875198655993767e-05, - "loss": 3.144, - "step": 7236 - }, - { - "epoch": 2.23, - "learning_rate": 8.871635750070043e-05, - "loss": 2.9541, - "step": 7237 - }, - { - "epoch": 2.23, - "learning_rate": 8.868072844146321e-05, - "loss": 3.1904, - "step": 7238 - }, - { - "epoch": 2.23, - "learning_rate": 8.864509938222598e-05, - "loss": 2.912, - "step": 7239 - }, - { - "epoch": 2.23, - "learning_rate": 8.860947032298876e-05, - "loss": 3.1277, - "step": 7240 - }, - { - "epoch": 2.23, - "learning_rate": 8.857384126375151e-05, - "loss": 2.9082, - "step": 7241 - }, - { - "epoch": 2.24, - "learning_rate": 8.853821220451428e-05, - "loss": 2.9693, - "step": 7242 - }, - { - "epoch": 2.24, - "learning_rate": 8.850258314527706e-05, - "loss": 2.671, - "step": 7243 - }, - { - "epoch": 2.24, - "learning_rate": 8.846695408603983e-05, - "loss": 2.9176, - "step": 7244 - }, - { - "epoch": 2.24, - 
"learning_rate": 8.843132502680261e-05, - "loss": 2.978, - "step": 7245 - }, - { - "epoch": 2.24, - "learning_rate": 8.839569596756537e-05, - "loss": 3.0011, - "step": 7246 - }, - { - "epoch": 2.24, - "learning_rate": 8.836006690832814e-05, - "loss": 2.8607, - "step": 7247 - }, - { - "epoch": 2.24, - "learning_rate": 8.832443784909092e-05, - "loss": 2.8081, - "step": 7248 - }, - { - "epoch": 2.24, - "learning_rate": 8.82888087898537e-05, - "loss": 2.7689, - "step": 7249 - }, - { - "epoch": 2.24, - "learning_rate": 8.825317973061645e-05, - "loss": 2.7566, - "step": 7250 - }, - { - "epoch": 2.24, - "learning_rate": 8.821755067137922e-05, - "loss": 2.6587, - "step": 7251 - }, - { - "epoch": 2.24, - "learning_rate": 8.8181921612142e-05, - "loss": 2.6488, - "step": 7252 - }, - { - "epoch": 2.24, - "learning_rate": 8.814629255290477e-05, - "loss": 2.662, - "step": 7253 - }, - { - "epoch": 2.24, - "learning_rate": 8.811066349366753e-05, - "loss": 2.7426, - "step": 7254 - }, - { - "epoch": 2.24, - "learning_rate": 8.807503443443031e-05, - "loss": 2.4835, - "step": 7255 - }, - { - "epoch": 2.24, - "learning_rate": 8.803940537519308e-05, - "loss": 2.4472, - "step": 7256 - }, - { - "epoch": 2.24, - "learning_rate": 8.800377631595586e-05, - "loss": 2.7407, - "step": 7257 - }, - { - "epoch": 2.24, - "learning_rate": 8.796814725671861e-05, - "loss": 2.4041, - "step": 7258 - }, - { - "epoch": 2.24, - "learning_rate": 8.79325181974814e-05, - "loss": 2.3973, - "step": 7259 - }, - { - "epoch": 2.24, - "learning_rate": 8.789688913824416e-05, - "loss": 2.194, - "step": 7260 - }, - { - "epoch": 2.24, - "learning_rate": 8.786126007900695e-05, - "loss": 2.3359, - "step": 7261 - }, - { - "epoch": 2.24, - "learning_rate": 8.782563101976971e-05, - "loss": 2.1763, - "step": 7262 - }, - { - "epoch": 2.24, - "learning_rate": 8.779000196053247e-05, - "loss": 2.3156, - "step": 7263 - }, - { - "epoch": 2.24, - "learning_rate": 8.775437290129525e-05, - "loss": 2.0597, - "step": 7264 - }, - { - "epoch": 2.24, - "learning_rate": 8.771874384205802e-05, - "loss": 2.2585, - "step": 7265 - }, - { - "epoch": 2.24, - "learning_rate": 8.76831147828208e-05, - "loss": 2.1648, - "step": 7266 - }, - { - "epoch": 2.24, - "learning_rate": 8.764748572358356e-05, - "loss": 2.3241, - "step": 7267 - }, - { - "epoch": 2.24, - "learning_rate": 8.761185666434632e-05, - "loss": 1.9216, - "step": 7268 - }, - { - "epoch": 2.24, - "learning_rate": 8.75762276051091e-05, - "loss": 1.7388, - "step": 7269 - }, - { - "epoch": 2.24, - "learning_rate": 8.754059854587187e-05, - "loss": 1.8373, - "step": 7270 - }, - { - "epoch": 2.24, - "learning_rate": 8.750496948663464e-05, - "loss": 1.7429, - "step": 7271 - }, - { - "epoch": 2.24, - "learning_rate": 8.746934042739741e-05, - "loss": 1.8981, - "step": 7272 - }, - { - "epoch": 2.24, - "learning_rate": 8.74337113681602e-05, - "loss": 1.8534, - "step": 7273 - }, - { - "epoch": 2.25, - "learning_rate": 8.739808230892296e-05, - "loss": 1.7233, - "step": 7274 - }, - { - "epoch": 2.25, - "learning_rate": 8.736245324968574e-05, - "loss": 1.5557, - "step": 7275 - }, - { - "epoch": 2.25, - "learning_rate": 8.73268241904485e-05, - "loss": 1.6298, - "step": 7276 - }, - { - "epoch": 2.25, - "learning_rate": 8.729119513121127e-05, - "loss": 1.4592, - "step": 7277 - }, - { - "epoch": 2.25, - "learning_rate": 8.725556607197405e-05, - "loss": 1.4525, - "step": 7278 - }, - { - "epoch": 2.25, - "learning_rate": 8.721993701273682e-05, - "loss": 1.4594, - "step": 7279 - }, - { - "epoch": 2.25, - "learning_rate": 
8.718430795349957e-05, - "loss": 1.4226, - "step": 7280 - }, - { - "epoch": 2.25, - "learning_rate": 8.714867889426235e-05, - "loss": 3.9869, - "step": 7281 - }, - { - "epoch": 2.25, - "learning_rate": 8.711304983502512e-05, - "loss": 3.4118, - "step": 7282 - }, - { - "epoch": 2.25, - "learning_rate": 8.70774207757879e-05, - "loss": 3.3932, - "step": 7283 - }, - { - "epoch": 2.25, - "learning_rate": 8.704179171655066e-05, - "loss": 3.4529, - "step": 7284 - }, - { - "epoch": 2.25, - "learning_rate": 8.700616265731344e-05, - "loss": 3.1607, - "step": 7285 - }, - { - "epoch": 2.25, - "learning_rate": 8.697053359807621e-05, - "loss": 3.1739, - "step": 7286 - }, - { - "epoch": 2.25, - "learning_rate": 8.693490453883899e-05, - "loss": 3.0788, - "step": 7287 - }, - { - "epoch": 2.25, - "learning_rate": 8.689927547960175e-05, - "loss": 2.7633, - "step": 7288 - }, - { - "epoch": 2.25, - "learning_rate": 8.686364642036451e-05, - "loss": 2.9515, - "step": 7289 - }, - { - "epoch": 2.25, - "learning_rate": 8.68280173611273e-05, - "loss": 3.1472, - "step": 7290 - }, - { - "epoch": 2.25, - "learning_rate": 8.679238830189006e-05, - "loss": 2.8283, - "step": 7291 - }, - { - "epoch": 2.25, - "learning_rate": 8.675675924265285e-05, - "loss": 2.9223, - "step": 7292 - }, - { - "epoch": 2.25, - "learning_rate": 8.67211301834156e-05, - "loss": 2.7886, - "step": 7293 - }, - { - "epoch": 2.25, - "learning_rate": 8.668550112417838e-05, - "loss": 3.0067, - "step": 7294 - }, - { - "epoch": 2.25, - "learning_rate": 8.664987206494115e-05, - "loss": 2.9514, - "step": 7295 - }, - { - "epoch": 2.25, - "learning_rate": 8.661424300570393e-05, - "loss": 2.8021, - "step": 7296 - }, - { - "epoch": 2.25, - "learning_rate": 8.657861394646669e-05, - "loss": 2.8767, - "step": 7297 - }, - { - "epoch": 2.25, - "learning_rate": 8.654298488722946e-05, - "loss": 3.0472, - "step": 7298 - }, - { - "epoch": 2.25, - "learning_rate": 8.650735582799224e-05, - "loss": 2.7621, - "step": 7299 - }, - { - "epoch": 2.25, - "learning_rate": 8.6471726768755e-05, - "loss": 2.8706, - "step": 7300 - }, - { - "epoch": 2.25, - "learning_rate": 8.643609770951776e-05, - "loss": 2.6389, - "step": 7301 - }, - { - "epoch": 2.25, - "learning_rate": 8.640046865028054e-05, - "loss": 2.4966, - "step": 7302 - }, - { - "epoch": 2.25, - "learning_rate": 8.636483959104331e-05, - "loss": 2.8713, - "step": 7303 - }, - { - "epoch": 2.25, - "learning_rate": 8.632921053180609e-05, - "loss": 2.5554, - "step": 7304 - }, - { - "epoch": 2.25, - "learning_rate": 8.629358147256885e-05, - "loss": 2.5989, - "step": 7305 - }, - { - "epoch": 2.25, - "learning_rate": 8.625795241333163e-05, - "loss": 2.5358, - "step": 7306 - }, - { - "epoch": 2.26, - "learning_rate": 8.62223233540944e-05, - "loss": 2.4453, - "step": 7307 - }, - { - "epoch": 2.26, - "learning_rate": 8.618669429485718e-05, - "loss": 2.4131, - "step": 7308 - }, - { - "epoch": 2.26, - "learning_rate": 8.615106523561995e-05, - "loss": 2.5333, - "step": 7309 - }, - { - "epoch": 2.26, - "learning_rate": 8.61154361763827e-05, - "loss": 2.2447, - "step": 7310 - }, - { - "epoch": 2.26, - "learning_rate": 8.607980711714548e-05, - "loss": 2.236, - "step": 7311 - }, - { - "epoch": 2.26, - "learning_rate": 8.604417805790825e-05, - "loss": 2.3854, - "step": 7312 - }, - { - "epoch": 2.26, - "learning_rate": 8.600854899867103e-05, - "loss": 2.6612, - "step": 7313 - }, - { - "epoch": 2.26, - "learning_rate": 8.597291993943379e-05, - "loss": 2.3226, - "step": 7314 - }, - { - "epoch": 2.26, - "learning_rate": 8.593729088019656e-05, - 
"loss": 1.947, - "step": 7315 - }, - { - "epoch": 2.26, - "learning_rate": 8.590166182095934e-05, - "loss": 2.2354, - "step": 7316 - }, - { - "epoch": 2.26, - "learning_rate": 8.586603276172211e-05, - "loss": 2.1071, - "step": 7317 - }, - { - "epoch": 2.26, - "learning_rate": 8.583040370248488e-05, - "loss": 1.9427, - "step": 7318 - }, - { - "epoch": 2.26, - "learning_rate": 8.579477464324764e-05, - "loss": 1.86, - "step": 7319 - }, - { - "epoch": 2.26, - "learning_rate": 8.575914558401043e-05, - "loss": 2.1141, - "step": 7320 - }, - { - "epoch": 2.26, - "learning_rate": 8.57235165247732e-05, - "loss": 1.7861, - "step": 7321 - }, - { - "epoch": 2.26, - "learning_rate": 8.568788746553595e-05, - "loss": 1.6747, - "step": 7322 - }, - { - "epoch": 2.26, - "learning_rate": 8.565225840629873e-05, - "loss": 1.5317, - "step": 7323 - }, - { - "epoch": 2.26, - "learning_rate": 8.56166293470615e-05, - "loss": 1.4805, - "step": 7324 - }, - { - "epoch": 2.26, - "learning_rate": 8.558100028782428e-05, - "loss": 1.8469, - "step": 7325 - }, - { - "epoch": 2.26, - "learning_rate": 8.554537122858705e-05, - "loss": 1.4073, - "step": 7326 - }, - { - "epoch": 2.26, - "learning_rate": 8.55097421693498e-05, - "loss": 1.6262, - "step": 7327 - }, - { - "epoch": 2.26, - "learning_rate": 8.547411311011259e-05, - "loss": 1.5072, - "step": 7328 - }, - { - "epoch": 2.26, - "learning_rate": 8.543848405087536e-05, - "loss": 1.3857, - "step": 7329 - }, - { - "epoch": 2.26, - "learning_rate": 8.540285499163814e-05, - "loss": 1.2527, - "step": 7330 - }, - { - "epoch": 2.26, - "learning_rate": 8.536722593240089e-05, - "loss": 4.0039, - "step": 7331 - }, - { - "epoch": 2.26, - "learning_rate": 8.533159687316367e-05, - "loss": 3.4839, - "step": 7332 - }, - { - "epoch": 2.26, - "learning_rate": 8.529596781392644e-05, - "loss": 3.5989, - "step": 7333 - }, - { - "epoch": 2.26, - "learning_rate": 8.526033875468922e-05, - "loss": 3.4113, - "step": 7334 - }, - { - "epoch": 2.26, - "learning_rate": 8.522470969545198e-05, - "loss": 3.1882, - "step": 7335 - }, - { - "epoch": 2.26, - "learning_rate": 8.518908063621475e-05, - "loss": 3.107, - "step": 7336 - }, - { - "epoch": 2.26, - "learning_rate": 8.515345157697753e-05, - "loss": 3.115, - "step": 7337 - }, - { - "epoch": 2.26, - "learning_rate": 8.51178225177403e-05, - "loss": 3.3886, - "step": 7338 - }, - { - "epoch": 2.27, - "learning_rate": 8.508219345850307e-05, - "loss": 2.9071, - "step": 7339 - }, - { - "epoch": 2.27, - "learning_rate": 8.504656439926583e-05, - "loss": 3.2551, - "step": 7340 - }, - { - "epoch": 2.27, - "learning_rate": 8.501093534002862e-05, - "loss": 2.6899, - "step": 7341 - }, - { - "epoch": 2.27, - "learning_rate": 8.497530628079138e-05, - "loss": 2.6294, - "step": 7342 - }, - { - "epoch": 2.27, - "learning_rate": 8.493967722155417e-05, - "loss": 2.9224, - "step": 7343 - }, - { - "epoch": 2.27, - "learning_rate": 8.490404816231692e-05, - "loss": 2.8362, - "step": 7344 - }, - { - "epoch": 2.27, - "learning_rate": 8.486841910307969e-05, - "loss": 2.7795, - "step": 7345 - }, - { - "epoch": 2.27, - "learning_rate": 8.483279004384247e-05, - "loss": 3.1438, - "step": 7346 - }, - { - "epoch": 2.27, - "learning_rate": 8.479716098460524e-05, - "loss": 2.8074, - "step": 7347 - }, - { - "epoch": 2.27, - "learning_rate": 8.4761531925368e-05, - "loss": 2.8382, - "step": 7348 - }, - { - "epoch": 2.27, - "learning_rate": 8.472590286613078e-05, - "loss": 2.5524, - "step": 7349 - }, - { - "epoch": 2.27, - "learning_rate": 8.469027380689354e-05, - "loss": 2.6243, - "step": 7350 
- }, - { - "epoch": 2.27, - "learning_rate": 8.465464474765633e-05, - "loss": 2.7082, - "step": 7351 - }, - { - "epoch": 2.27, - "learning_rate": 8.461901568841908e-05, - "loss": 2.581, - "step": 7352 - }, - { - "epoch": 2.27, - "learning_rate": 8.458338662918186e-05, - "loss": 2.839, - "step": 7353 - }, - { - "epoch": 2.27, - "learning_rate": 8.454775756994463e-05, - "loss": 2.4519, - "step": 7354 - }, - { - "epoch": 2.27, - "learning_rate": 8.451212851070741e-05, - "loss": 2.6922, - "step": 7355 - }, - { - "epoch": 2.27, - "learning_rate": 8.447649945147017e-05, - "loss": 2.4649, - "step": 7356 - }, - { - "epoch": 2.27, - "learning_rate": 8.444087039223294e-05, - "loss": 2.6324, - "step": 7357 - }, - { - "epoch": 2.27, - "learning_rate": 8.440524133299572e-05, - "loss": 2.5245, - "step": 7358 - }, - { - "epoch": 2.27, - "learning_rate": 8.436961227375849e-05, - "loss": 2.3805, - "step": 7359 - }, - { - "epoch": 2.27, - "learning_rate": 8.433398321452127e-05, - "loss": 2.2849, - "step": 7360 - }, - { - "epoch": 2.27, - "learning_rate": 8.429835415528402e-05, - "loss": 2.45, - "step": 7361 - }, - { - "epoch": 2.27, - "learning_rate": 8.426272509604679e-05, - "loss": 2.3264, - "step": 7362 - }, - { - "epoch": 2.27, - "learning_rate": 8.422709603680957e-05, - "loss": 2.3016, - "step": 7363 - }, - { - "epoch": 2.27, - "learning_rate": 8.419146697757234e-05, - "loss": 2.2555, - "step": 7364 - }, - { - "epoch": 2.27, - "learning_rate": 8.415583791833511e-05, - "loss": 2.1756, - "step": 7365 - }, - { - "epoch": 2.27, - "learning_rate": 8.412020885909788e-05, - "loss": 1.9768, - "step": 7366 - }, - { - "epoch": 2.27, - "learning_rate": 8.408457979986066e-05, - "loss": 1.9893, - "step": 7367 - }, - { - "epoch": 2.27, - "learning_rate": 8.404895074062343e-05, - "loss": 1.945, - "step": 7368 - }, - { - "epoch": 2.27, - "learning_rate": 8.401332168138618e-05, - "loss": 1.8732, - "step": 7369 - }, - { - "epoch": 2.27, - "learning_rate": 8.397769262214896e-05, - "loss": 1.8894, - "step": 7370 - }, - { - "epoch": 2.27, - "learning_rate": 8.394206356291173e-05, - "loss": 2.0709, - "step": 7371 - }, - { - "epoch": 2.28, - "learning_rate": 8.390643450367451e-05, - "loss": 1.8056, - "step": 7372 - }, - { - "epoch": 2.28, - "learning_rate": 8.387080544443728e-05, - "loss": 1.7692, - "step": 7373 - }, - { - "epoch": 2.28, - "learning_rate": 8.383517638520004e-05, - "loss": 1.6711, - "step": 7374 - }, - { - "epoch": 2.28, - "learning_rate": 8.379954732596282e-05, - "loss": 1.5355, - "step": 7375 - }, - { - "epoch": 2.28, - "learning_rate": 8.376391826672559e-05, - "loss": 1.5167, - "step": 7376 - }, - { - "epoch": 2.28, - "learning_rate": 8.372828920748837e-05, - "loss": 1.3761, - "step": 7377 - }, - { - "epoch": 2.28, - "learning_rate": 8.369266014825112e-05, - "loss": 1.5712, - "step": 7378 - }, - { - "epoch": 2.28, - "learning_rate": 8.36570310890139e-05, - "loss": 1.383, - "step": 7379 - }, - { - "epoch": 2.28, - "learning_rate": 8.362140202977667e-05, - "loss": 1.3392, - "step": 7380 - }, - { - "epoch": 2.28, - "learning_rate": 8.358577297053946e-05, - "loss": 4.1555, - "step": 7381 - }, - { - "epoch": 2.28, - "learning_rate": 8.355014391130221e-05, - "loss": 3.5113, - "step": 7382 - }, - { - "epoch": 2.28, - "learning_rate": 8.351451485206498e-05, - "loss": 3.7362, - "step": 7383 - }, - { - "epoch": 2.28, - "learning_rate": 8.347888579282776e-05, - "loss": 3.3676, - "step": 7384 - }, - { - "epoch": 2.28, - "learning_rate": 8.344325673359053e-05, - "loss": 3.4055, - "step": 7385 - }, - { - "epoch": 2.28, 
- "learning_rate": 8.34076276743533e-05, - "loss": 3.2574, - "step": 7386 - }, - { - "epoch": 2.28, - "learning_rate": 8.337199861511607e-05, - "loss": 3.0435, - "step": 7387 - }, - { - "epoch": 2.28, - "learning_rate": 8.333636955587885e-05, - "loss": 3.2044, - "step": 7388 - }, - { - "epoch": 2.28, - "learning_rate": 8.330074049664162e-05, - "loss": 3.1019, - "step": 7389 - }, - { - "epoch": 2.28, - "learning_rate": 8.32651114374044e-05, - "loss": 3.0177, - "step": 7390 - }, - { - "epoch": 2.28, - "learning_rate": 8.322948237816715e-05, - "loss": 2.835, - "step": 7391 - }, - { - "epoch": 2.28, - "learning_rate": 8.319385331892992e-05, - "loss": 2.9252, - "step": 7392 - }, - { - "epoch": 2.28, - "learning_rate": 8.31582242596927e-05, - "loss": 2.8171, - "step": 7393 - }, - { - "epoch": 2.28, - "learning_rate": 8.312259520045547e-05, - "loss": 2.7037, - "step": 7394 - }, - { - "epoch": 2.28, - "learning_rate": 8.308696614121823e-05, - "loss": 3.0213, - "step": 7395 - }, - { - "epoch": 2.28, - "learning_rate": 8.305133708198101e-05, - "loss": 2.8775, - "step": 7396 - }, - { - "epoch": 2.28, - "learning_rate": 8.301570802274378e-05, - "loss": 2.7093, - "step": 7397 - }, - { - "epoch": 2.28, - "learning_rate": 8.298007896350656e-05, - "loss": 2.9679, - "step": 7398 - }, - { - "epoch": 2.28, - "learning_rate": 8.294444990426931e-05, - "loss": 2.7833, - "step": 7399 - }, - { - "epoch": 2.28, - "learning_rate": 8.29088208450321e-05, - "loss": 2.8266, - "step": 7400 - }, - { - "epoch": 2.28, - "learning_rate": 8.287319178579486e-05, - "loss": 2.68, - "step": 7401 - }, - { - "epoch": 2.28, - "learning_rate": 8.283756272655765e-05, - "loss": 2.5568, - "step": 7402 - }, - { - "epoch": 2.28, - "learning_rate": 8.28019336673204e-05, - "loss": 2.9066, - "step": 7403 - }, - { - "epoch": 2.29, - "learning_rate": 8.276630460808317e-05, - "loss": 2.3456, - "step": 7404 - }, - { - "epoch": 2.29, - "learning_rate": 8.273067554884595e-05, - "loss": 2.3838, - "step": 7405 - }, - { - "epoch": 2.29, - "learning_rate": 8.269504648960872e-05, - "loss": 2.5798, - "step": 7406 - }, - { - "epoch": 2.29, - "learning_rate": 8.26594174303715e-05, - "loss": 2.5223, - "step": 7407 - }, - { - "epoch": 2.29, - "learning_rate": 8.262378837113426e-05, - "loss": 2.4106, - "step": 7408 - }, - { - "epoch": 2.29, - "learning_rate": 8.258815931189702e-05, - "loss": 2.3584, - "step": 7409 - }, - { - "epoch": 2.29, - "learning_rate": 8.25525302526598e-05, - "loss": 2.1321, - "step": 7410 - }, - { - "epoch": 2.29, - "learning_rate": 8.251690119342257e-05, - "loss": 2.2294, - "step": 7411 - }, - { - "epoch": 2.29, - "learning_rate": 8.248127213418534e-05, - "loss": 2.4526, - "step": 7412 - }, - { - "epoch": 2.29, - "learning_rate": 8.244564307494811e-05, - "loss": 1.9779, - "step": 7413 - }, - { - "epoch": 2.29, - "learning_rate": 8.241001401571089e-05, - "loss": 1.9256, - "step": 7414 - }, - { - "epoch": 2.29, - "learning_rate": 8.237438495647366e-05, - "loss": 2.045, - "step": 7415 - }, - { - "epoch": 2.29, - "learning_rate": 8.233875589723642e-05, - "loss": 1.9307, - "step": 7416 - }, - { - "epoch": 2.29, - "learning_rate": 8.23031268379992e-05, - "loss": 1.8251, - "step": 7417 - }, - { - "epoch": 2.29, - "learning_rate": 8.226749777876197e-05, - "loss": 1.8073, - "step": 7418 - }, - { - "epoch": 2.29, - "learning_rate": 8.223186871952475e-05, - "loss": 2.0955, - "step": 7419 - }, - { - "epoch": 2.29, - "learning_rate": 8.21962396602875e-05, - "loss": 1.8342, - "step": 7420 - }, - { - "epoch": 2.29, - "learning_rate": 
8.216061060105027e-05, - "loss": 2.1738, - "step": 7421 - }, - { - "epoch": 2.29, - "learning_rate": 8.212498154181305e-05, - "loss": 1.8269, - "step": 7422 - }, - { - "epoch": 2.29, - "learning_rate": 8.208935248257583e-05, - "loss": 1.6411, - "step": 7423 - }, - { - "epoch": 2.29, - "learning_rate": 8.205372342333859e-05, - "loss": 1.8276, - "step": 7424 - }, - { - "epoch": 2.29, - "learning_rate": 8.201809436410137e-05, - "loss": 1.689, - "step": 7425 - }, - { - "epoch": 2.29, - "learning_rate": 8.198246530486414e-05, - "loss": 1.2682, - "step": 7426 - }, - { - "epoch": 2.29, - "learning_rate": 8.194683624562691e-05, - "loss": 1.3829, - "step": 7427 - }, - { - "epoch": 2.29, - "learning_rate": 8.191120718638968e-05, - "loss": 1.411, - "step": 7428 - }, - { - "epoch": 2.29, - "learning_rate": 8.187557812715246e-05, - "loss": 1.3233, - "step": 7429 - }, - { - "epoch": 2.29, - "learning_rate": 8.183994906791521e-05, - "loss": 1.3459, - "step": 7430 - }, - { - "epoch": 2.29, - "learning_rate": 8.1804320008678e-05, - "loss": 3.9149, - "step": 7431 - }, - { - "epoch": 2.29, - "learning_rate": 8.176869094944076e-05, - "loss": 3.5717, - "step": 7432 - }, - { - "epoch": 2.29, - "learning_rate": 8.173306189020353e-05, - "loss": 3.6903, - "step": 7433 - }, - { - "epoch": 2.29, - "learning_rate": 8.16974328309663e-05, - "loss": 3.0654, - "step": 7434 - }, - { - "epoch": 2.29, - "learning_rate": 8.166180377172908e-05, - "loss": 3.2325, - "step": 7435 - }, - { - "epoch": 2.3, - "learning_rate": 8.162617471249184e-05, - "loss": 2.9976, - "step": 7436 - }, - { - "epoch": 2.3, - "learning_rate": 8.159054565325462e-05, - "loss": 3.003, - "step": 7437 - }, - { - "epoch": 2.3, - "learning_rate": 8.155491659401739e-05, - "loss": 2.9751, - "step": 7438 - }, - { - "epoch": 2.3, - "learning_rate": 8.151928753478016e-05, - "loss": 2.8654, - "step": 7439 - }, - { - "epoch": 2.3, - "learning_rate": 8.148365847554294e-05, - "loss": 2.9618, - "step": 7440 - }, - { - "epoch": 2.3, - "learning_rate": 8.14480294163057e-05, - "loss": 2.7485, - "step": 7441 - }, - { - "epoch": 2.3, - "learning_rate": 8.141240035706847e-05, - "loss": 2.8901, - "step": 7442 - }, - { - "epoch": 2.3, - "learning_rate": 8.137677129783124e-05, - "loss": 2.7413, - "step": 7443 - }, - { - "epoch": 2.3, - "learning_rate": 8.134114223859401e-05, - "loss": 2.9215, - "step": 7444 - }, - { - "epoch": 2.3, - "learning_rate": 8.130551317935678e-05, - "loss": 2.7968, - "step": 7445 - }, - { - "epoch": 2.3, - "learning_rate": 8.126988412011956e-05, - "loss": 3.1416, - "step": 7446 - }, - { - "epoch": 2.3, - "learning_rate": 8.123425506088233e-05, - "loss": 2.7434, - "step": 7447 - }, - { - "epoch": 2.3, - "learning_rate": 8.11986260016451e-05, - "loss": 2.6512, - "step": 7448 - }, - { - "epoch": 2.3, - "learning_rate": 8.116299694240787e-05, - "loss": 2.7086, - "step": 7449 - }, - { - "epoch": 2.3, - "learning_rate": 8.112736788317063e-05, - "loss": 2.5756, - "step": 7450 - }, - { - "epoch": 2.3, - "learning_rate": 8.10917388239334e-05, - "loss": 2.7522, - "step": 7451 - }, - { - "epoch": 2.3, - "learning_rate": 8.105610976469618e-05, - "loss": 2.8828, - "step": 7452 - }, - { - "epoch": 2.3, - "learning_rate": 8.102048070545895e-05, - "loss": 2.5501, - "step": 7453 - }, - { - "epoch": 2.3, - "learning_rate": 8.098485164622172e-05, - "loss": 2.5649, - "step": 7454 - }, - { - "epoch": 2.3, - "learning_rate": 8.09492225869845e-05, - "loss": 2.4903, - "step": 7455 - }, - { - "epoch": 2.3, - "learning_rate": 8.091359352774726e-05, - "loss": 2.483, - "step": 
7456 - }, - { - "epoch": 2.3, - "learning_rate": 8.087796446851004e-05, - "loss": 2.5665, - "step": 7457 - }, - { - "epoch": 2.3, - "learning_rate": 8.084233540927281e-05, - "loss": 2.5078, - "step": 7458 - }, - { - "epoch": 2.3, - "learning_rate": 8.080670635003558e-05, - "loss": 2.4216, - "step": 7459 - }, - { - "epoch": 2.3, - "learning_rate": 8.077107729079834e-05, - "loss": 2.4534, - "step": 7460 - }, - { - "epoch": 2.3, - "learning_rate": 8.073544823156113e-05, - "loss": 2.2759, - "step": 7461 - }, - { - "epoch": 2.3, - "learning_rate": 8.069981917232388e-05, - "loss": 2.3877, - "step": 7462 - }, - { - "epoch": 2.3, - "learning_rate": 8.066419011308666e-05, - "loss": 2.3674, - "step": 7463 - }, - { - "epoch": 2.3, - "learning_rate": 8.062856105384943e-05, - "loss": 2.2703, - "step": 7464 - }, - { - "epoch": 2.3, - "learning_rate": 8.05929319946122e-05, - "loss": 2.2484, - "step": 7465 - }, - { - "epoch": 2.3, - "learning_rate": 8.055730293537497e-05, - "loss": 1.9137, - "step": 7466 - }, - { - "epoch": 2.3, - "learning_rate": 8.052167387613775e-05, - "loss": 2.0042, - "step": 7467 - }, - { - "epoch": 2.3, - "learning_rate": 8.048604481690052e-05, - "loss": 1.8161, - "step": 7468 - }, - { - "epoch": 2.31, - "learning_rate": 8.045041575766329e-05, - "loss": 1.7166, - "step": 7469 - }, - { - "epoch": 2.31, - "learning_rate": 8.041478669842605e-05, - "loss": 1.9269, - "step": 7470 - }, - { - "epoch": 2.31, - "learning_rate": 8.037915763918882e-05, - "loss": 1.9599, - "step": 7471 - }, - { - "epoch": 2.31, - "learning_rate": 8.03435285799516e-05, - "loss": 1.7545, - "step": 7472 - }, - { - "epoch": 2.31, - "learning_rate": 8.030789952071437e-05, - "loss": 1.6191, - "step": 7473 - }, - { - "epoch": 2.31, - "learning_rate": 8.027227046147714e-05, - "loss": 1.7029, - "step": 7474 - }, - { - "epoch": 2.31, - "learning_rate": 8.023664140223991e-05, - "loss": 1.4696, - "step": 7475 - }, - { - "epoch": 2.31, - "learning_rate": 8.020101234300269e-05, - "loss": 1.3825, - "step": 7476 - }, - { - "epoch": 2.31, - "learning_rate": 8.016538328376545e-05, - "loss": 1.6883, - "step": 7477 - }, - { - "epoch": 2.31, - "learning_rate": 8.012975422452823e-05, - "loss": 1.5452, - "step": 7478 - }, - { - "epoch": 2.31, - "learning_rate": 8.0094125165291e-05, - "loss": 1.2535, - "step": 7479 - }, - { - "epoch": 2.31, - "learning_rate": 8.005849610605376e-05, - "loss": 1.3116, - "step": 7480 - }, - { - "epoch": 2.31, - "learning_rate": 8.002286704681653e-05, - "loss": 3.9622, - "step": 7481 - }, - { - "epoch": 2.31, - "learning_rate": 7.998723798757931e-05, - "loss": 3.4976, - "step": 7482 - }, - { - "epoch": 2.31, - "learning_rate": 7.995160892834207e-05, - "loss": 3.4168, - "step": 7483 - }, - { - "epoch": 2.31, - "learning_rate": 7.991597986910485e-05, - "loss": 3.5935, - "step": 7484 - }, - { - "epoch": 2.31, - "learning_rate": 7.988035080986762e-05, - "loss": 2.8975, - "step": 7485 - }, - { - "epoch": 2.31, - "learning_rate": 7.984472175063039e-05, - "loss": 3.2137, - "step": 7486 - }, - { - "epoch": 2.31, - "learning_rate": 7.980909269139316e-05, - "loss": 2.8585, - "step": 7487 - }, - { - "epoch": 2.31, - "learning_rate": 7.977346363215594e-05, - "loss": 3.054, - "step": 7488 - }, - { - "epoch": 2.31, - "learning_rate": 7.973783457291871e-05, - "loss": 2.844, - "step": 7489 - }, - { - "epoch": 2.31, - "learning_rate": 7.970220551368148e-05, - "loss": 2.7525, - "step": 7490 - }, - { - "epoch": 2.31, - "learning_rate": 7.966657645444424e-05, - "loss": 2.9564, - "step": 7491 - }, - { - "epoch": 2.31, - 
"learning_rate": 7.963094739520701e-05, - "loss": 3.0422, - "step": 7492 - }, - { - "epoch": 2.31, - "learning_rate": 7.95953183359698e-05, - "loss": 3.0033, - "step": 7493 - }, - { - "epoch": 2.31, - "learning_rate": 7.955968927673256e-05, - "loss": 2.9347, - "step": 7494 - }, - { - "epoch": 2.31, - "learning_rate": 7.952406021749533e-05, - "loss": 2.7911, - "step": 7495 - }, - { - "epoch": 2.31, - "learning_rate": 7.94884311582581e-05, - "loss": 2.7826, - "step": 7496 - }, - { - "epoch": 2.31, - "learning_rate": 7.945280209902087e-05, - "loss": 2.7779, - "step": 7497 - }, - { - "epoch": 2.31, - "learning_rate": 7.941717303978364e-05, - "loss": 2.7568, - "step": 7498 - }, - { - "epoch": 2.31, - "learning_rate": 7.938154398054642e-05, - "loss": 2.8117, - "step": 7499 - }, - { - "epoch": 2.31, - "learning_rate": 7.934591492130919e-05, - "loss": 2.4924, - "step": 7500 - }, - { - "epoch": 2.31, - "eval_bleu": 2.1186204163334704e-13, - "eval_loss": 3.8835675716400146, - "eval_runtime": 2570.3045, - "eval_samples_per_second": 5.743, - "eval_steps_per_second": 0.718, - "step": 7500 - }, - { - "epoch": 2.32, - "learning_rate": 7.931028586207195e-05, - "loss": 3.1026, - "step": 7501 - }, - { - "epoch": 2.32, - "learning_rate": 7.927465680283472e-05, - "loss": 2.5167, - "step": 7502 - }, - { - "epoch": 2.32, - "learning_rate": 7.923902774359749e-05, - "loss": 2.7082, - "step": 7503 - }, - { - "epoch": 2.32, - "learning_rate": 7.920339868436027e-05, - "loss": 2.4041, - "step": 7504 - }, - { - "epoch": 2.32, - "learning_rate": 7.916776962512304e-05, - "loss": 2.4252, - "step": 7505 - }, - { - "epoch": 2.32, - "learning_rate": 7.913214056588581e-05, - "loss": 2.3165, - "step": 7506 - }, - { - "epoch": 2.32, - "learning_rate": 7.909651150664858e-05, - "loss": 2.5555, - "step": 7507 - }, - { - "epoch": 2.32, - "learning_rate": 7.906088244741136e-05, - "loss": 2.3461, - "step": 7508 - }, - { - "epoch": 2.32, - "learning_rate": 7.902525338817411e-05, - "loss": 2.3082, - "step": 7509 - }, - { - "epoch": 2.32, - "learning_rate": 7.89896243289369e-05, - "loss": 2.6328, - "step": 7510 - }, - { - "epoch": 2.32, - "learning_rate": 7.895399526969966e-05, - "loss": 2.4581, - "step": 7511 - }, - { - "epoch": 2.32, - "learning_rate": 7.891836621046243e-05, - "loss": 2.2054, - "step": 7512 - }, - { - "epoch": 2.32, - "learning_rate": 7.88827371512252e-05, - "loss": 2.0143, - "step": 7513 - }, - { - "epoch": 2.32, - "learning_rate": 7.884710809198798e-05, - "loss": 2.1773, - "step": 7514 - }, - { - "epoch": 2.32, - "learning_rate": 7.881147903275075e-05, - "loss": 2.0321, - "step": 7515 - }, - { - "epoch": 2.32, - "learning_rate": 7.877584997351352e-05, - "loss": 1.9164, - "step": 7516 - }, - { - "epoch": 2.32, - "learning_rate": 7.874022091427629e-05, - "loss": 2.115, - "step": 7517 - }, - { - "epoch": 2.32, - "learning_rate": 7.870459185503906e-05, - "loss": 1.8817, - "step": 7518 - }, - { - "epoch": 2.32, - "learning_rate": 7.866896279580182e-05, - "loss": 1.858, - "step": 7519 - }, - { - "epoch": 2.32, - "learning_rate": 7.86333337365646e-05, - "loss": 1.7912, - "step": 7520 - }, - { - "epoch": 2.32, - "learning_rate": 7.859770467732737e-05, - "loss": 1.5856, - "step": 7521 - }, - { - "epoch": 2.32, - "learning_rate": 7.856207561809014e-05, - "loss": 1.7842, - "step": 7522 - }, - { - "epoch": 2.32, - "learning_rate": 7.852644655885292e-05, - "loss": 1.8922, - "step": 7523 - }, - { - "epoch": 2.32, - "learning_rate": 7.849081749961568e-05, - "loss": 1.735, - "step": 7524 - }, - { - "epoch": 2.32, - 
"learning_rate": 7.845518844037846e-05, - "loss": 1.5129, - "step": 7525 - }, - { - "epoch": 2.32, - "learning_rate": 7.841955938114123e-05, - "loss": 1.345, - "step": 7526 - }, - { - "epoch": 2.32, - "learning_rate": 7.8383930321904e-05, - "loss": 1.352, - "step": 7527 - }, - { - "epoch": 2.32, - "learning_rate": 7.834830126266677e-05, - "loss": 1.2819, - "step": 7528 - }, - { - "epoch": 2.32, - "learning_rate": 7.831267220342955e-05, - "loss": 1.3264, - "step": 7529 - }, - { - "epoch": 2.32, - "learning_rate": 7.82770431441923e-05, - "loss": 1.357, - "step": 7530 - }, - { - "epoch": 2.32, - "learning_rate": 7.824141408495508e-05, - "loss": 3.7947, - "step": 7531 - }, - { - "epoch": 2.32, - "learning_rate": 7.820578502571785e-05, - "loss": 3.7291, - "step": 7532 - }, - { - "epoch": 2.32, - "learning_rate": 7.817015596648062e-05, - "loss": 3.3865, - "step": 7533 - }, - { - "epoch": 2.33, - "learning_rate": 7.813452690724339e-05, - "loss": 3.3081, - "step": 7534 - }, - { - "epoch": 2.33, - "learning_rate": 7.809889784800617e-05, - "loss": 3.1611, - "step": 7535 - }, - { - "epoch": 2.33, - "learning_rate": 7.806326878876893e-05, - "loss": 2.9382, - "step": 7536 - }, - { - "epoch": 2.33, - "learning_rate": 7.802763972953171e-05, - "loss": 3.0917, - "step": 7537 - }, - { - "epoch": 2.33, - "learning_rate": 7.799201067029448e-05, - "loss": 3.0087, - "step": 7538 - }, - { - "epoch": 2.33, - "learning_rate": 7.795638161105724e-05, - "loss": 3.0693, - "step": 7539 - }, - { - "epoch": 2.33, - "learning_rate": 7.792075255182003e-05, - "loss": 2.8523, - "step": 7540 - }, - { - "epoch": 2.33, - "learning_rate": 7.78851234925828e-05, - "loss": 2.8185, - "step": 7541 - }, - { - "epoch": 2.33, - "learning_rate": 7.784949443334556e-05, - "loss": 3.1713, - "step": 7542 - }, - { - "epoch": 2.33, - "learning_rate": 7.781386537410833e-05, - "loss": 2.941, - "step": 7543 - }, - { - "epoch": 2.33, - "learning_rate": 7.77782363148711e-05, - "loss": 2.678, - "step": 7544 - }, - { - "epoch": 2.33, - "learning_rate": 7.774260725563387e-05, - "loss": 2.5384, - "step": 7545 - }, - { - "epoch": 2.33, - "learning_rate": 7.770697819639665e-05, - "loss": 2.5304, - "step": 7546 - }, - { - "epoch": 2.33, - "learning_rate": 7.767134913715942e-05, - "loss": 2.6411, - "step": 7547 - }, - { - "epoch": 2.33, - "learning_rate": 7.763572007792219e-05, - "loss": 2.7365, - "step": 7548 - }, - { - "epoch": 2.33, - "learning_rate": 7.760009101868496e-05, - "loss": 2.6854, - "step": 7549 - }, - { - "epoch": 2.33, - "learning_rate": 7.756446195944772e-05, - "loss": 2.8272, - "step": 7550 - }, - { - "epoch": 2.33, - "learning_rate": 7.752883290021049e-05, - "loss": 2.5442, - "step": 7551 - }, - { - "epoch": 2.33, - "learning_rate": 7.749320384097327e-05, - "loss": 2.4563, - "step": 7552 - }, - { - "epoch": 2.33, - "learning_rate": 7.745757478173604e-05, - "loss": 2.5408, - "step": 7553 - }, - { - "epoch": 2.33, - "learning_rate": 7.742194572249881e-05, - "loss": 2.5569, - "step": 7554 - }, - { - "epoch": 2.33, - "learning_rate": 7.738631666326159e-05, - "loss": 2.4231, - "step": 7555 - }, - { - "epoch": 2.33, - "learning_rate": 7.735068760402436e-05, - "loss": 2.3596, - "step": 7556 - }, - { - "epoch": 2.33, - "learning_rate": 7.731505854478713e-05, - "loss": 2.1234, - "step": 7557 - }, - { - "epoch": 2.33, - "learning_rate": 7.72794294855499e-05, - "loss": 2.3171, - "step": 7558 - }, - { - "epoch": 2.33, - "learning_rate": 7.724380042631267e-05, - "loss": 2.5613, - "step": 7559 - }, - { - "epoch": 2.33, - "learning_rate": 
7.720817136707543e-05, - "loss": 2.3768, - "step": 7560 - }, - { - "epoch": 2.33, - "learning_rate": 7.717254230783822e-05, - "loss": 2.0545, - "step": 7561 - }, - { - "epoch": 2.33, - "learning_rate": 7.713691324860098e-05, - "loss": 2.2502, - "step": 7562 - }, - { - "epoch": 2.33, - "learning_rate": 7.710128418936375e-05, - "loss": 2.2215, - "step": 7563 - }, - { - "epoch": 2.33, - "learning_rate": 7.706565513012652e-05, - "loss": 2.0275, - "step": 7564 - }, - { - "epoch": 2.33, - "learning_rate": 7.703002607088929e-05, - "loss": 2.0886, - "step": 7565 - }, - { - "epoch": 2.34, - "learning_rate": 7.699439701165206e-05, - "loss": 1.8885, - "step": 7566 - }, - { - "epoch": 2.34, - "learning_rate": 7.695876795241484e-05, - "loss": 1.8684, - "step": 7567 - }, - { - "epoch": 2.34, - "learning_rate": 7.692313889317761e-05, - "loss": 2.0401, - "step": 7568 - }, - { - "epoch": 2.34, - "learning_rate": 7.688750983394038e-05, - "loss": 1.6281, - "step": 7569 - }, - { - "epoch": 2.34, - "learning_rate": 7.685188077470316e-05, - "loss": 1.6126, - "step": 7570 - }, - { - "epoch": 2.34, - "learning_rate": 7.681625171546591e-05, - "loss": 1.6012, - "step": 7571 - }, - { - "epoch": 2.34, - "learning_rate": 7.67806226562287e-05, - "loss": 1.8957, - "step": 7572 - }, - { - "epoch": 2.34, - "learning_rate": 7.674499359699146e-05, - "loss": 1.5524, - "step": 7573 - }, - { - "epoch": 2.34, - "learning_rate": 7.670936453775423e-05, - "loss": 1.5746, - "step": 7574 - }, - { - "epoch": 2.34, - "learning_rate": 7.6673735478517e-05, - "loss": 1.4824, - "step": 7575 - }, - { - "epoch": 2.34, - "learning_rate": 7.663810641927978e-05, - "loss": 1.3893, - "step": 7576 - }, - { - "epoch": 2.34, - "learning_rate": 7.660247736004254e-05, - "loss": 1.4933, - "step": 7577 - }, - { - "epoch": 2.34, - "learning_rate": 7.656684830080532e-05, - "loss": 1.2601, - "step": 7578 - }, - { - "epoch": 2.34, - "learning_rate": 7.653121924156809e-05, - "loss": 1.4801, - "step": 7579 - }, - { - "epoch": 2.34, - "learning_rate": 7.649559018233085e-05, - "loss": 1.4058, - "step": 7580 - }, - { - "epoch": 2.34, - "learning_rate": 7.645996112309362e-05, - "loss": 3.7893, - "step": 7581 - }, - { - "epoch": 2.34, - "learning_rate": 7.64243320638564e-05, - "loss": 3.3854, - "step": 7582 - }, - { - "epoch": 2.34, - "learning_rate": 7.638870300461916e-05, - "loss": 3.6847, - "step": 7583 - }, - { - "epoch": 2.34, - "learning_rate": 7.635307394538194e-05, - "loss": 3.3516, - "step": 7584 - }, - { - "epoch": 2.34, - "learning_rate": 7.631744488614471e-05, - "loss": 2.8726, - "step": 7585 - }, - { - "epoch": 2.34, - "learning_rate": 7.628181582690748e-05, - "loss": 2.9758, - "step": 7586 - }, - { - "epoch": 2.34, - "learning_rate": 7.624618676767026e-05, - "loss": 3.2309, - "step": 7587 - }, - { - "epoch": 2.34, - "learning_rate": 7.621055770843303e-05, - "loss": 2.7325, - "step": 7588 - }, - { - "epoch": 2.34, - "learning_rate": 7.61749286491958e-05, - "loss": 2.9371, - "step": 7589 - }, - { - "epoch": 2.34, - "learning_rate": 7.613929958995856e-05, - "loss": 2.9833, - "step": 7590 - }, - { - "epoch": 2.34, - "learning_rate": 7.610367053072133e-05, - "loss": 2.7978, - "step": 7591 - }, - { - "epoch": 2.34, - "learning_rate": 7.60680414714841e-05, - "loss": 2.7741, - "step": 7592 - }, - { - "epoch": 2.34, - "learning_rate": 7.603241241224688e-05, - "loss": 2.8195, - "step": 7593 - }, - { - "epoch": 2.34, - "learning_rate": 7.599678335300965e-05, - "loss": 2.7497, - "step": 7594 - }, - { - "epoch": 2.34, - "learning_rate": 7.596115429377242e-05, - 
"loss": 2.7129, - "step": 7595 - }, - { - "epoch": 2.34, - "learning_rate": 7.592552523453519e-05, - "loss": 2.5611, - "step": 7596 - }, - { - "epoch": 2.34, - "learning_rate": 7.588989617529797e-05, - "loss": 2.6992, - "step": 7597 - }, - { - "epoch": 2.35, - "learning_rate": 7.585426711606073e-05, - "loss": 2.6889, - "step": 7598 - }, - { - "epoch": 2.35, - "learning_rate": 7.581863805682351e-05, - "loss": 2.4659, - "step": 7599 - }, - { - "epoch": 2.35, - "learning_rate": 7.578300899758628e-05, - "loss": 2.5871, - "step": 7600 - }, - { - "epoch": 2.35, - "learning_rate": 7.574737993834904e-05, - "loss": 2.5718, - "step": 7601 - }, - { - "epoch": 2.35, - "learning_rate": 7.571175087911183e-05, - "loss": 2.4882, - "step": 7602 - }, - { - "epoch": 2.35, - "learning_rate": 7.56761218198746e-05, - "loss": 2.2575, - "step": 7603 - }, - { - "epoch": 2.35, - "learning_rate": 7.564049276063736e-05, - "loss": 2.6308, - "step": 7604 - }, - { - "epoch": 2.35, - "learning_rate": 7.560486370140013e-05, - "loss": 2.3178, - "step": 7605 - }, - { - "epoch": 2.35, - "learning_rate": 7.55692346421629e-05, - "loss": 2.4872, - "step": 7606 - }, - { - "epoch": 2.35, - "learning_rate": 7.553360558292567e-05, - "loss": 2.1628, - "step": 7607 - }, - { - "epoch": 2.35, - "learning_rate": 7.549797652368845e-05, - "loss": 2.392, - "step": 7608 - }, - { - "epoch": 2.35, - "learning_rate": 7.546234746445122e-05, - "loss": 2.3006, - "step": 7609 - }, - { - "epoch": 2.35, - "learning_rate": 7.542671840521399e-05, - "loss": 2.2781, - "step": 7610 - }, - { - "epoch": 2.35, - "learning_rate": 7.539108934597675e-05, - "loss": 2.2025, - "step": 7611 - }, - { - "epoch": 2.35, - "learning_rate": 7.535546028673952e-05, - "loss": 1.9751, - "step": 7612 - }, - { - "epoch": 2.35, - "learning_rate": 7.531983122750229e-05, - "loss": 2.0512, - "step": 7613 - }, - { - "epoch": 2.35, - "learning_rate": 7.528420216826507e-05, - "loss": 1.9669, - "step": 7614 - }, - { - "epoch": 2.35, - "learning_rate": 7.524857310902784e-05, - "loss": 2.0624, - "step": 7615 - }, - { - "epoch": 2.35, - "learning_rate": 7.521294404979061e-05, - "loss": 1.8951, - "step": 7616 - }, - { - "epoch": 2.35, - "learning_rate": 7.517731499055338e-05, - "loss": 1.9394, - "step": 7617 - }, - { - "epoch": 2.35, - "learning_rate": 7.514168593131615e-05, - "loss": 1.8421, - "step": 7618 - }, - { - "epoch": 2.35, - "learning_rate": 7.510605687207893e-05, - "loss": 1.739, - "step": 7619 - }, - { - "epoch": 2.35, - "learning_rate": 7.50704278128417e-05, - "loss": 1.9675, - "step": 7620 - }, - { - "epoch": 2.35, - "learning_rate": 7.503479875360446e-05, - "loss": 1.7248, - "step": 7621 - }, - { - "epoch": 2.35, - "learning_rate": 7.499916969436723e-05, - "loss": 1.8458, - "step": 7622 - }, - { - "epoch": 2.35, - "learning_rate": 7.496354063513001e-05, - "loss": 1.736, - "step": 7623 - }, - { - "epoch": 2.35, - "learning_rate": 7.492791157589277e-05, - "loss": 1.6196, - "step": 7624 - }, - { - "epoch": 2.35, - "learning_rate": 7.489228251665555e-05, - "loss": 1.6609, - "step": 7625 - }, - { - "epoch": 2.35, - "learning_rate": 7.485665345741832e-05, - "loss": 1.4505, - "step": 7626 - }, - { - "epoch": 2.35, - "learning_rate": 7.482102439818109e-05, - "loss": 1.4612, - "step": 7627 - }, - { - "epoch": 2.35, - "learning_rate": 7.478539533894386e-05, - "loss": 1.3866, - "step": 7628 - }, - { - "epoch": 2.35, - "learning_rate": 7.474976627970664e-05, - "loss": 1.2129, - "step": 7629 - }, - { - "epoch": 2.35, - "learning_rate": 7.471413722046939e-05, - "loss": 1.45, - "step": 
7630 - }, - { - "epoch": 2.36, - "learning_rate": 7.467850816123217e-05, - "loss": 4.2251, - "step": 7631 - }, - { - "epoch": 2.36, - "learning_rate": 7.464287910199494e-05, - "loss": 3.6898, - "step": 7632 - }, - { - "epoch": 2.36, - "learning_rate": 7.460725004275771e-05, - "loss": 3.4566, - "step": 7633 - }, - { - "epoch": 2.36, - "learning_rate": 7.457162098352048e-05, - "loss": 2.9826, - "step": 7634 - }, - { - "epoch": 2.36, - "learning_rate": 7.453599192428326e-05, - "loss": 3.1194, - "step": 7635 - }, - { - "epoch": 2.36, - "learning_rate": 7.450036286504603e-05, - "loss": 2.9838, - "step": 7636 - }, - { - "epoch": 2.36, - "learning_rate": 7.44647338058088e-05, - "loss": 3.1525, - "step": 7637 - }, - { - "epoch": 2.36, - "learning_rate": 7.442910474657157e-05, - "loss": 3.0632, - "step": 7638 - }, - { - "epoch": 2.36, - "learning_rate": 7.439347568733433e-05, - "loss": 2.8537, - "step": 7639 - }, - { - "epoch": 2.36, - "learning_rate": 7.435784662809712e-05, - "loss": 2.9539, - "step": 7640 - }, - { - "epoch": 2.36, - "learning_rate": 7.432221756885988e-05, - "loss": 2.8468, - "step": 7641 - }, - { - "epoch": 2.36, - "learning_rate": 7.428658850962265e-05, - "loss": 2.8823, - "step": 7642 - }, - { - "epoch": 2.36, - "learning_rate": 7.425095945038542e-05, - "loss": 2.9821, - "step": 7643 - }, - { - "epoch": 2.36, - "learning_rate": 7.42153303911482e-05, - "loss": 2.8934, - "step": 7644 - }, - { - "epoch": 2.36, - "learning_rate": 7.417970133191096e-05, - "loss": 2.7269, - "step": 7645 - }, - { - "epoch": 2.36, - "learning_rate": 7.414407227267374e-05, - "loss": 2.6669, - "step": 7646 - }, - { - "epoch": 2.36, - "learning_rate": 7.410844321343651e-05, - "loss": 2.8512, - "step": 7647 - }, - { - "epoch": 2.36, - "learning_rate": 7.407281415419928e-05, - "loss": 2.7353, - "step": 7648 - }, - { - "epoch": 2.36, - "learning_rate": 7.403718509496204e-05, - "loss": 2.3502, - "step": 7649 - }, - { - "epoch": 2.36, - "learning_rate": 7.400155603572483e-05, - "loss": 2.5331, - "step": 7650 - }, - { - "epoch": 2.36, - "learning_rate": 7.39659269764876e-05, - "loss": 2.4242, - "step": 7651 - }, - { - "epoch": 2.36, - "learning_rate": 7.393029791725036e-05, - "loss": 2.6075, - "step": 7652 - }, - { - "epoch": 2.36, - "learning_rate": 7.389466885801313e-05, - "loss": 2.6051, - "step": 7653 - }, - { - "epoch": 2.36, - "learning_rate": 7.38590397987759e-05, - "loss": 2.6306, - "step": 7654 - }, - { - "epoch": 2.36, - "learning_rate": 7.382341073953868e-05, - "loss": 2.3518, - "step": 7655 - }, - { - "epoch": 2.36, - "learning_rate": 7.378778168030145e-05, - "loss": 2.3441, - "step": 7656 - }, - { - "epoch": 2.36, - "learning_rate": 7.375215262106422e-05, - "loss": 2.5802, - "step": 7657 - }, - { - "epoch": 2.36, - "learning_rate": 7.371652356182699e-05, - "loss": 2.2663, - "step": 7658 - }, - { - "epoch": 2.36, - "learning_rate": 7.368089450258976e-05, - "loss": 2.3043, - "step": 7659 - }, - { - "epoch": 2.36, - "learning_rate": 7.364526544335252e-05, - "loss": 2.3244, - "step": 7660 - }, - { - "epoch": 2.36, - "learning_rate": 7.36096363841153e-05, - "loss": 2.1115, - "step": 7661 - }, - { - "epoch": 2.36, - "learning_rate": 7.357400732487807e-05, - "loss": 2.093, - "step": 7662 - }, - { - "epoch": 2.37, - "learning_rate": 7.353837826564084e-05, - "loss": 2.2874, - "step": 7663 - }, - { - "epoch": 2.37, - "learning_rate": 7.350274920640361e-05, - "loss": 2.1176, - "step": 7664 - }, - { - "epoch": 2.37, - "learning_rate": 7.346712014716638e-05, - "loss": 1.9713, - "step": 7665 - }, - { - "epoch": 
2.37, - "learning_rate": 7.343149108792915e-05, - "loss": 1.9171, - "step": 7666 - }, - { - "epoch": 2.37, - "learning_rate": 7.339586202869193e-05, - "loss": 1.9053, - "step": 7667 - }, - { - "epoch": 2.37, - "learning_rate": 7.33602329694547e-05, - "loss": 1.8139, - "step": 7668 - }, - { - "epoch": 2.37, - "learning_rate": 7.332460391021747e-05, - "loss": 1.8448, - "step": 7669 - }, - { - "epoch": 2.37, - "learning_rate": 7.328897485098025e-05, - "loss": 1.4769, - "step": 7670 - }, - { - "epoch": 2.37, - "learning_rate": 7.3253345791743e-05, - "loss": 1.7171, - "step": 7671 - }, - { - "epoch": 2.37, - "learning_rate": 7.321771673250578e-05, - "loss": 1.7301, - "step": 7672 - }, - { - "epoch": 2.37, - "learning_rate": 7.318208767326855e-05, - "loss": 1.8105, - "step": 7673 - }, - { - "epoch": 2.37, - "learning_rate": 7.314645861403132e-05, - "loss": 1.6341, - "step": 7674 - }, - { - "epoch": 2.37, - "learning_rate": 7.311082955479409e-05, - "loss": 1.6876, - "step": 7675 - }, - { - "epoch": 2.37, - "learning_rate": 7.307520049555687e-05, - "loss": 1.4963, - "step": 7676 - }, - { - "epoch": 2.37, - "learning_rate": 7.303957143631963e-05, - "loss": 1.3702, - "step": 7677 - }, - { - "epoch": 2.37, - "learning_rate": 7.300394237708241e-05, - "loss": 1.3357, - "step": 7678 - }, - { - "epoch": 2.37, - "learning_rate": 7.296831331784518e-05, - "loss": 1.3969, - "step": 7679 - }, - { - "epoch": 2.37, - "learning_rate": 7.293268425860794e-05, - "loss": 1.2642, - "step": 7680 - }, - { - "epoch": 2.37, - "learning_rate": 7.289705519937071e-05, - "loss": 3.4763, - "step": 7681 - }, - { - "epoch": 2.37, - "learning_rate": 7.28614261401335e-05, - "loss": 3.4817, - "step": 7682 - }, - { - "epoch": 2.37, - "learning_rate": 7.282579708089625e-05, - "loss": 3.551, - "step": 7683 - }, - { - "epoch": 2.37, - "learning_rate": 7.279016802165903e-05, - "loss": 3.1054, - "step": 7684 - }, - { - "epoch": 2.37, - "learning_rate": 7.275453896242181e-05, - "loss": 3.072, - "step": 7685 - }, - { - "epoch": 2.37, - "learning_rate": 7.271890990318457e-05, - "loss": 2.9524, - "step": 7686 - }, - { - "epoch": 2.37, - "learning_rate": 7.268328084394735e-05, - "loss": 3.0583, - "step": 7687 - }, - { - "epoch": 2.37, - "learning_rate": 7.264765178471012e-05, - "loss": 2.9368, - "step": 7688 - }, - { - "epoch": 2.37, - "learning_rate": 7.261202272547289e-05, - "loss": 3.1746, - "step": 7689 - }, - { - "epoch": 2.37, - "learning_rate": 7.257639366623565e-05, - "loss": 2.8037, - "step": 7690 - }, - { - "epoch": 2.37, - "learning_rate": 7.254076460699844e-05, - "loss": 2.8145, - "step": 7691 - }, - { - "epoch": 2.37, - "learning_rate": 7.250513554776119e-05, - "loss": 2.661, - "step": 7692 - }, - { - "epoch": 2.37, - "learning_rate": 7.246950648852397e-05, - "loss": 2.9017, - "step": 7693 - }, - { - "epoch": 2.37, - "learning_rate": 7.243387742928674e-05, - "loss": 2.6463, - "step": 7694 - }, - { - "epoch": 2.37, - "learning_rate": 7.239824837004951e-05, - "loss": 2.6542, - "step": 7695 - }, - { - "epoch": 2.38, - "learning_rate": 7.236261931081228e-05, - "loss": 2.7907, - "step": 7696 - }, - { - "epoch": 2.38, - "learning_rate": 7.232699025157506e-05, - "loss": 2.6997, - "step": 7697 - }, - { - "epoch": 2.38, - "learning_rate": 7.229136119233781e-05, - "loss": 2.4586, - "step": 7698 - }, - { - "epoch": 2.38, - "learning_rate": 7.22557321331006e-05, - "loss": 2.8381, - "step": 7699 - }, - { - "epoch": 2.38, - "learning_rate": 7.222010307386336e-05, - "loss": 2.5212, - "step": 7700 - }, - { - "epoch": 2.38, - "learning_rate": 
7.218447401462613e-05, - "loss": 2.51, - "step": 7701 - }, - { - "epoch": 2.38, - "learning_rate": 7.214884495538892e-05, - "loss": 2.3454, - "step": 7702 - }, - { - "epoch": 2.38, - "learning_rate": 7.211321589615168e-05, - "loss": 2.3398, - "step": 7703 - }, - { - "epoch": 2.38, - "learning_rate": 7.207758683691445e-05, - "loss": 2.2402, - "step": 7704 - }, - { - "epoch": 2.38, - "learning_rate": 7.204195777767722e-05, - "loss": 2.457, - "step": 7705 - }, - { - "epoch": 2.38, - "learning_rate": 7.200632871843999e-05, - "loss": 2.6168, - "step": 7706 - }, - { - "epoch": 2.38, - "learning_rate": 7.197069965920276e-05, - "loss": 2.5826, - "step": 7707 - }, - { - "epoch": 2.38, - "learning_rate": 7.193507059996554e-05, - "loss": 2.2733, - "step": 7708 - }, - { - "epoch": 2.38, - "learning_rate": 7.189944154072831e-05, - "loss": 2.417, - "step": 7709 - }, - { - "epoch": 2.38, - "learning_rate": 7.186381248149108e-05, - "loss": 2.2342, - "step": 7710 - }, - { - "epoch": 2.38, - "learning_rate": 7.182818342225384e-05, - "loss": 2.0266, - "step": 7711 - }, - { - "epoch": 2.38, - "learning_rate": 7.179255436301661e-05, - "loss": 2.1959, - "step": 7712 - }, - { - "epoch": 2.38, - "learning_rate": 7.175692530377938e-05, - "loss": 2.1205, - "step": 7713 - }, - { - "epoch": 2.38, - "learning_rate": 7.172129624454216e-05, - "loss": 2.2253, - "step": 7714 - }, - { - "epoch": 2.38, - "learning_rate": 7.168566718530493e-05, - "loss": 1.8714, - "step": 7715 - }, - { - "epoch": 2.38, - "learning_rate": 7.16500381260677e-05, - "loss": 1.8876, - "step": 7716 - }, - { - "epoch": 2.38, - "learning_rate": 7.161440906683048e-05, - "loss": 1.9299, - "step": 7717 - }, - { - "epoch": 2.38, - "learning_rate": 7.157878000759324e-05, - "loss": 2.1508, - "step": 7718 - }, - { - "epoch": 2.38, - "learning_rate": 7.154315094835602e-05, - "loss": 1.8397, - "step": 7719 - }, - { - "epoch": 2.38, - "learning_rate": 7.150752188911879e-05, - "loss": 1.7628, - "step": 7720 - }, - { - "epoch": 2.38, - "learning_rate": 7.147189282988155e-05, - "loss": 1.7074, - "step": 7721 - }, - { - "epoch": 2.38, - "learning_rate": 7.143626377064432e-05, - "loss": 1.773, - "step": 7722 - }, - { - "epoch": 2.38, - "learning_rate": 7.14006347114071e-05, - "loss": 1.7621, - "step": 7723 - }, - { - "epoch": 2.38, - "learning_rate": 7.136500565216986e-05, - "loss": 1.5361, - "step": 7724 - }, - { - "epoch": 2.38, - "learning_rate": 7.132937659293264e-05, - "loss": 1.6136, - "step": 7725 - }, - { - "epoch": 2.38, - "learning_rate": 7.129374753369541e-05, - "loss": 1.5032, - "step": 7726 - }, - { - "epoch": 2.38, - "learning_rate": 7.125811847445818e-05, - "loss": 1.4019, - "step": 7727 - }, - { - "epoch": 2.39, - "learning_rate": 7.122248941522095e-05, - "loss": 1.4135, - "step": 7728 - }, - { - "epoch": 2.39, - "learning_rate": 7.118686035598373e-05, - "loss": 1.2504, - "step": 7729 - }, - { - "epoch": 2.39, - "learning_rate": 7.11512312967465e-05, - "loss": 1.1386, - "step": 7730 - }, - { - "epoch": 2.39, - "learning_rate": 7.111560223750926e-05, - "loss": 3.385, - "step": 7731 - }, - { - "epoch": 2.39, - "learning_rate": 7.107997317827203e-05, - "loss": 3.4787, - "step": 7732 - }, - { - "epoch": 2.39, - "learning_rate": 7.10443441190348e-05, - "loss": 3.3064, - "step": 7733 - }, - { - "epoch": 2.39, - "learning_rate": 7.100871505979758e-05, - "loss": 3.2023, - "step": 7734 - }, - { - "epoch": 2.39, - "learning_rate": 7.097308600056035e-05, - "loss": 2.8663, - "step": 7735 - }, - { - "epoch": 2.39, - "learning_rate": 7.093745694132312e-05, - 
"loss": 3.1788, - "step": 7736 - }, - { - "epoch": 2.39, - "learning_rate": 7.090182788208589e-05, - "loss": 3.1374, - "step": 7737 - }, - { - "epoch": 2.39, - "learning_rate": 7.086619882284867e-05, - "loss": 2.7899, - "step": 7738 - }, - { - "epoch": 2.39, - "learning_rate": 7.083056976361142e-05, - "loss": 2.8757, - "step": 7739 - }, - { - "epoch": 2.39, - "learning_rate": 7.07949407043742e-05, - "loss": 2.7851, - "step": 7740 - }, - { - "epoch": 2.39, - "learning_rate": 7.075931164513697e-05, - "loss": 2.6727, - "step": 7741 - }, - { - "epoch": 2.39, - "learning_rate": 7.072368258589974e-05, - "loss": 2.8675, - "step": 7742 - }, - { - "epoch": 2.39, - "learning_rate": 7.068805352666251e-05, - "loss": 2.6687, - "step": 7743 - }, - { - "epoch": 2.39, - "learning_rate": 7.065242446742529e-05, - "loss": 2.7614, - "step": 7744 - }, - { - "epoch": 2.39, - "learning_rate": 7.061679540818805e-05, - "loss": 2.5521, - "step": 7745 - }, - { - "epoch": 2.39, - "learning_rate": 7.058116634895083e-05, - "loss": 2.7908, - "step": 7746 - }, - { - "epoch": 2.39, - "learning_rate": 7.05455372897136e-05, - "loss": 2.4883, - "step": 7747 - }, - { - "epoch": 2.39, - "learning_rate": 7.050990823047637e-05, - "loss": 2.6457, - "step": 7748 - }, - { - "epoch": 2.39, - "learning_rate": 7.047427917123915e-05, - "loss": 2.8473, - "step": 7749 - }, - { - "epoch": 2.39, - "learning_rate": 7.043865011200192e-05, - "loss": 2.6216, - "step": 7750 - }, - { - "epoch": 2.39, - "learning_rate": 7.040302105276468e-05, - "loss": 2.4056, - "step": 7751 - }, - { - "epoch": 2.39, - "learning_rate": 7.036739199352745e-05, - "loss": 2.432, - "step": 7752 - }, - { - "epoch": 2.39, - "learning_rate": 7.033176293429022e-05, - "loss": 2.3974, - "step": 7753 - }, - { - "epoch": 2.39, - "learning_rate": 7.029613387505299e-05, - "loss": 2.4859, - "step": 7754 - }, - { - "epoch": 2.39, - "learning_rate": 7.026050481581577e-05, - "loss": 2.5073, - "step": 7755 - }, - { - "epoch": 2.39, - "learning_rate": 7.022487575657854e-05, - "loss": 2.5543, - "step": 7756 - }, - { - "epoch": 2.39, - "learning_rate": 7.018924669734131e-05, - "loss": 2.513, - "step": 7757 - }, - { - "epoch": 2.39, - "learning_rate": 7.015361763810408e-05, - "loss": 2.4267, - "step": 7758 - }, - { - "epoch": 2.39, - "learning_rate": 7.011798857886685e-05, - "loss": 2.4187, - "step": 7759 - }, - { - "epoch": 2.4, - "learning_rate": 7.008235951962961e-05, - "loss": 2.1179, - "step": 7760 - }, - { - "epoch": 2.4, - "learning_rate": 7.00467304603924e-05, - "loss": 2.1867, - "step": 7761 - }, - { - "epoch": 2.4, - "learning_rate": 7.001110140115516e-05, - "loss": 2.1019, - "step": 7762 - }, - { - "epoch": 2.4, - "learning_rate": 6.997547234191793e-05, - "loss": 2.0118, - "step": 7763 - }, - { - "epoch": 2.4, - "learning_rate": 6.99398432826807e-05, - "loss": 2.0783, - "step": 7764 - }, - { - "epoch": 2.4, - "learning_rate": 6.990421422344347e-05, - "loss": 1.7735, - "step": 7765 - }, - { - "epoch": 2.4, - "learning_rate": 6.986858516420625e-05, - "loss": 2.1662, - "step": 7766 - }, - { - "epoch": 2.4, - "learning_rate": 6.983295610496902e-05, - "loss": 1.8919, - "step": 7767 - }, - { - "epoch": 2.4, - "learning_rate": 6.979732704573179e-05, - "loss": 1.6556, - "step": 7768 - }, - { - "epoch": 2.4, - "learning_rate": 6.976169798649456e-05, - "loss": 1.9232, - "step": 7769 - }, - { - "epoch": 2.4, - "learning_rate": 6.972606892725734e-05, - "loss": 1.5576, - "step": 7770 - }, - { - "epoch": 2.4, - "learning_rate": 6.96904398680201e-05, - "loss": 1.7531, - "step": 7771 - }, - 
{ - "epoch": 2.4, - "learning_rate": 6.965481080878287e-05, - "loss": 1.7591, - "step": 7772 - }, - { - "epoch": 2.4, - "learning_rate": 6.961918174954564e-05, - "loss": 1.6962, - "step": 7773 - }, - { - "epoch": 2.4, - "learning_rate": 6.958355269030841e-05, - "loss": 1.5536, - "step": 7774 - }, - { - "epoch": 2.4, - "learning_rate": 6.954792363107118e-05, - "loss": 1.5093, - "step": 7775 - }, - { - "epoch": 2.4, - "learning_rate": 6.951229457183396e-05, - "loss": 1.4562, - "step": 7776 - }, - { - "epoch": 2.4, - "learning_rate": 6.947666551259673e-05, - "loss": 1.3053, - "step": 7777 - }, - { - "epoch": 2.4, - "learning_rate": 6.94410364533595e-05, - "loss": 1.2665, - "step": 7778 - }, - { - "epoch": 2.4, - "learning_rate": 6.940540739412227e-05, - "loss": 1.2663, - "step": 7779 - }, - { - "epoch": 2.4, - "learning_rate": 6.936977833488503e-05, - "loss": 1.3505, - "step": 7780 - }, - { - "epoch": 2.4, - "learning_rate": 6.93341492756478e-05, - "loss": 3.7558, - "step": 7781 - }, - { - "epoch": 2.4, - "learning_rate": 6.929852021641058e-05, - "loss": 3.3249, - "step": 7782 - }, - { - "epoch": 2.4, - "learning_rate": 6.926289115717335e-05, - "loss": 3.1812, - "step": 7783 - }, - { - "epoch": 2.4, - "learning_rate": 6.922726209793612e-05, - "loss": 3.0523, - "step": 7784 - }, - { - "epoch": 2.4, - "learning_rate": 6.91916330386989e-05, - "loss": 2.816, - "step": 7785 - }, - { - "epoch": 2.4, - "learning_rate": 6.915600397946166e-05, - "loss": 2.897, - "step": 7786 - }, - { - "epoch": 2.4, - "learning_rate": 6.912037492022444e-05, - "loss": 2.9205, - "step": 7787 - }, - { - "epoch": 2.4, - "learning_rate": 6.908474586098721e-05, - "loss": 2.9991, - "step": 7788 - }, - { - "epoch": 2.4, - "learning_rate": 6.904911680174998e-05, - "loss": 3.0929, - "step": 7789 - }, - { - "epoch": 2.4, - "learning_rate": 6.901348774251274e-05, - "loss": 2.7561, - "step": 7790 - }, - { - "epoch": 2.4, - "learning_rate": 6.897785868327553e-05, - "loss": 2.6904, - "step": 7791 - }, - { - "epoch": 2.4, - "learning_rate": 6.894222962403828e-05, - "loss": 2.6866, - "step": 7792 - }, - { - "epoch": 2.41, - "learning_rate": 6.890660056480106e-05, - "loss": 2.8004, - "step": 7793 - }, - { - "epoch": 2.41, - "learning_rate": 6.887097150556383e-05, - "loss": 2.4816, - "step": 7794 - }, - { - "epoch": 2.41, - "learning_rate": 6.88353424463266e-05, - "loss": 2.7183, - "step": 7795 - }, - { - "epoch": 2.41, - "learning_rate": 6.879971338708937e-05, - "loss": 2.4381, - "step": 7796 - }, - { - "epoch": 2.41, - "learning_rate": 6.876408432785215e-05, - "loss": 2.6938, - "step": 7797 - }, - { - "epoch": 2.41, - "learning_rate": 6.872845526861492e-05, - "loss": 2.5151, - "step": 7798 - }, - { - "epoch": 2.41, - "learning_rate": 6.869282620937769e-05, - "loss": 2.509, - "step": 7799 - }, - { - "epoch": 2.41, - "learning_rate": 6.865719715014045e-05, - "loss": 2.3981, - "step": 7800 - }, - { - "epoch": 2.41, - "learning_rate": 6.862156809090322e-05, - "loss": 2.3659, - "step": 7801 - }, - { - "epoch": 2.41, - "learning_rate": 6.8585939031666e-05, - "loss": 2.4368, - "step": 7802 - }, - { - "epoch": 2.41, - "learning_rate": 6.855030997242877e-05, - "loss": 2.2669, - "step": 7803 - }, - { - "epoch": 2.41, - "learning_rate": 6.851468091319154e-05, - "loss": 2.3747, - "step": 7804 - }, - { - "epoch": 2.41, - "learning_rate": 6.847905185395431e-05, - "loss": 2.3811, - "step": 7805 - }, - { - "epoch": 2.41, - "learning_rate": 6.844342279471708e-05, - "loss": 2.257, - "step": 7806 - }, - { - "epoch": 2.41, - "learning_rate": 
6.840779373547985e-05, - "loss": 2.4318, - "step": 7807 - }, - { - "epoch": 2.41, - "learning_rate": 6.837216467624263e-05, - "loss": 1.9572, - "step": 7808 - }, - { - "epoch": 2.41, - "learning_rate": 6.83365356170054e-05, - "loss": 2.0928, - "step": 7809 - }, - { - "epoch": 2.41, - "learning_rate": 6.830090655776817e-05, - "loss": 2.0709, - "step": 7810 - }, - { - "epoch": 2.41, - "learning_rate": 6.826527749853093e-05, - "loss": 2.0569, - "step": 7811 - }, - { - "epoch": 2.41, - "learning_rate": 6.82296484392937e-05, - "loss": 1.9202, - "step": 7812 - }, - { - "epoch": 2.41, - "learning_rate": 6.819401938005647e-05, - "loss": 1.7589, - "step": 7813 - }, - { - "epoch": 2.41, - "learning_rate": 6.815839032081925e-05, - "loss": 1.9078, - "step": 7814 - }, - { - "epoch": 2.41, - "learning_rate": 6.812276126158202e-05, - "loss": 1.6702, - "step": 7815 - }, - { - "epoch": 2.41, - "learning_rate": 6.808713220234479e-05, - "loss": 1.8967, - "step": 7816 - }, - { - "epoch": 2.41, - "learning_rate": 6.805150314310757e-05, - "loss": 2.0073, - "step": 7817 - }, - { - "epoch": 2.41, - "learning_rate": 6.801587408387034e-05, - "loss": 1.7825, - "step": 7818 - }, - { - "epoch": 2.41, - "learning_rate": 6.798024502463311e-05, - "loss": 1.7133, - "step": 7819 - }, - { - "epoch": 2.41, - "learning_rate": 6.794461596539588e-05, - "loss": 1.6444, - "step": 7820 - }, - { - "epoch": 2.41, - "learning_rate": 6.790898690615864e-05, - "loss": 1.4581, - "step": 7821 - }, - { - "epoch": 2.41, - "learning_rate": 6.787335784692141e-05, - "loss": 1.424, - "step": 7822 - }, - { - "epoch": 2.41, - "learning_rate": 6.78377287876842e-05, - "loss": 1.7781, - "step": 7823 - }, - { - "epoch": 2.41, - "learning_rate": 6.780209972844696e-05, - "loss": 1.4845, - "step": 7824 - }, - { - "epoch": 2.42, - "learning_rate": 6.776647066920973e-05, - "loss": 1.4216, - "step": 7825 - }, - { - "epoch": 2.42, - "learning_rate": 6.77308416099725e-05, - "loss": 1.5291, - "step": 7826 - }, - { - "epoch": 2.42, - "learning_rate": 6.769521255073527e-05, - "loss": 1.2039, - "step": 7827 - }, - { - "epoch": 2.42, - "learning_rate": 6.765958349149804e-05, - "loss": 1.3614, - "step": 7828 - }, - { - "epoch": 2.42, - "learning_rate": 6.762395443226082e-05, - "loss": 1.2885, - "step": 7829 - }, - { - "epoch": 2.42, - "learning_rate": 6.758832537302359e-05, - "loss": 1.1618, - "step": 7830 - }, - { - "epoch": 2.42, - "learning_rate": 6.755269631378635e-05, - "loss": 3.8549, - "step": 7831 - }, - { - "epoch": 2.42, - "learning_rate": 6.751706725454914e-05, - "loss": 3.1316, - "step": 7832 - }, - { - "epoch": 2.42, - "learning_rate": 6.748143819531189e-05, - "loss": 3.1912, - "step": 7833 - }, - { - "epoch": 2.42, - "learning_rate": 6.744580913607467e-05, - "loss": 3.0965, - "step": 7834 - }, - { - "epoch": 2.42, - "learning_rate": 6.741018007683744e-05, - "loss": 3.1449, - "step": 7835 - }, - { - "epoch": 2.42, - "learning_rate": 6.737455101760021e-05, - "loss": 2.8179, - "step": 7836 - }, - { - "epoch": 2.42, - "learning_rate": 6.733892195836298e-05, - "loss": 2.8559, - "step": 7837 - }, - { - "epoch": 2.42, - "learning_rate": 6.730329289912576e-05, - "loss": 2.8135, - "step": 7838 - }, - { - "epoch": 2.42, - "learning_rate": 6.726766383988851e-05, - "loss": 3.0382, - "step": 7839 - }, - { - "epoch": 2.42, - "learning_rate": 6.72320347806513e-05, - "loss": 2.9399, - "step": 7840 - }, - { - "epoch": 2.42, - "learning_rate": 6.719640572141406e-05, - "loss": 2.8862, - "step": 7841 - }, - { - "epoch": 2.42, - "learning_rate": 6.716077666217683e-05, - 
"loss": 2.7485, - "step": 7842 - }, - { - "epoch": 2.42, - "learning_rate": 6.71251476029396e-05, - "loss": 2.6385, - "step": 7843 - }, - { - "epoch": 2.42, - "learning_rate": 6.708951854370238e-05, - "loss": 2.7138, - "step": 7844 - }, - { - "epoch": 2.42, - "learning_rate": 6.705388948446514e-05, - "loss": 2.6547, - "step": 7845 - }, - { - "epoch": 2.42, - "learning_rate": 6.701826042522792e-05, - "loss": 2.7052, - "step": 7846 - }, - { - "epoch": 2.42, - "learning_rate": 6.698263136599069e-05, - "loss": 2.7174, - "step": 7847 - }, - { - "epoch": 2.42, - "learning_rate": 6.694700230675346e-05, - "loss": 2.8173, - "step": 7848 - }, - { - "epoch": 2.42, - "learning_rate": 6.691137324751624e-05, - "loss": 2.5625, - "step": 7849 - }, - { - "epoch": 2.42, - "learning_rate": 6.6875744188279e-05, - "loss": 2.6696, - "step": 7850 - }, - { - "epoch": 2.42, - "learning_rate": 6.684011512904177e-05, - "loss": 2.6878, - "step": 7851 - }, - { - "epoch": 2.42, - "learning_rate": 6.680448606980454e-05, - "loss": 2.3361, - "step": 7852 - }, - { - "epoch": 2.42, - "learning_rate": 6.676885701056731e-05, - "loss": 2.3596, - "step": 7853 - }, - { - "epoch": 2.42, - "learning_rate": 6.673322795133008e-05, - "loss": 2.4507, - "step": 7854 - }, - { - "epoch": 2.42, - "learning_rate": 6.669759889209286e-05, - "loss": 2.5421, - "step": 7855 - }, - { - "epoch": 2.42, - "learning_rate": 6.666196983285563e-05, - "loss": 2.0953, - "step": 7856 - }, - { - "epoch": 2.42, - "learning_rate": 6.66263407736184e-05, - "loss": 2.3242, - "step": 7857 - }, - { - "epoch": 2.43, - "learning_rate": 6.659071171438117e-05, - "loss": 2.2409, - "step": 7858 - }, - { - "epoch": 2.43, - "learning_rate": 6.655508265514395e-05, - "loss": 2.1697, - "step": 7859 - }, - { - "epoch": 2.43, - "learning_rate": 6.65194535959067e-05, - "loss": 2.1331, - "step": 7860 - }, - { - "epoch": 2.43, - "learning_rate": 6.648382453666949e-05, - "loss": 2.2246, - "step": 7861 - }, - { - "epoch": 2.43, - "learning_rate": 6.644819547743225e-05, - "loss": 1.7736, - "step": 7862 - }, - { - "epoch": 2.43, - "learning_rate": 6.641256641819502e-05, - "loss": 2.135, - "step": 7863 - }, - { - "epoch": 2.43, - "learning_rate": 6.63769373589578e-05, - "loss": 1.8572, - "step": 7864 - }, - { - "epoch": 2.43, - "learning_rate": 6.634130829972057e-05, - "loss": 1.9604, - "step": 7865 - }, - { - "epoch": 2.43, - "learning_rate": 6.630567924048334e-05, - "loss": 1.8674, - "step": 7866 - }, - { - "epoch": 2.43, - "learning_rate": 6.627005018124611e-05, - "loss": 1.9261, - "step": 7867 - }, - { - "epoch": 2.43, - "learning_rate": 6.623442112200888e-05, - "loss": 1.7326, - "step": 7868 - }, - { - "epoch": 2.43, - "learning_rate": 6.619879206277165e-05, - "loss": 1.7613, - "step": 7869 - }, - { - "epoch": 2.43, - "learning_rate": 6.616316300353443e-05, - "loss": 1.8381, - "step": 7870 - }, - { - "epoch": 2.43, - "learning_rate": 6.61275339442972e-05, - "loss": 1.7016, - "step": 7871 - }, - { - "epoch": 2.43, - "learning_rate": 6.609190488505996e-05, - "loss": 1.7409, - "step": 7872 - }, - { - "epoch": 2.43, - "learning_rate": 6.605627582582273e-05, - "loss": 1.4567, - "step": 7873 - }, - { - "epoch": 2.43, - "learning_rate": 6.60206467665855e-05, - "loss": 1.4967, - "step": 7874 - }, - { - "epoch": 2.43, - "learning_rate": 6.598501770734827e-05, - "loss": 1.4032, - "step": 7875 - }, - { - "epoch": 2.43, - "learning_rate": 6.594938864811105e-05, - "loss": 1.5433, - "step": 7876 - }, - { - "epoch": 2.43, - "learning_rate": 6.591375958887382e-05, - "loss": 1.2509, - "step": 
7877 - }, - { - "epoch": 2.43, - "learning_rate": 6.587813052963659e-05, - "loss": 1.4249, - "step": 7878 - }, - { - "epoch": 2.43, - "learning_rate": 6.584250147039936e-05, - "loss": 1.2086, - "step": 7879 - }, - { - "epoch": 2.43, - "learning_rate": 6.580687241116212e-05, - "loss": 1.2866, - "step": 7880 - }, - { - "epoch": 2.43, - "learning_rate": 6.57712433519249e-05, - "loss": 4.0232, - "step": 7881 - }, - { - "epoch": 2.43, - "learning_rate": 6.573561429268767e-05, - "loss": 3.3427, - "step": 7882 - }, - { - "epoch": 2.43, - "learning_rate": 6.569998523345044e-05, - "loss": 3.3558, - "step": 7883 - }, - { - "epoch": 2.43, - "learning_rate": 6.566435617421321e-05, - "loss": 3.3439, - "step": 7884 - }, - { - "epoch": 2.43, - "learning_rate": 6.562872711497599e-05, - "loss": 2.969, - "step": 7885 - }, - { - "epoch": 2.43, - "learning_rate": 6.559309805573875e-05, - "loss": 3.0058, - "step": 7886 - }, - { - "epoch": 2.43, - "learning_rate": 6.555746899650153e-05, - "loss": 3.015, - "step": 7887 - }, - { - "epoch": 2.43, - "learning_rate": 6.55218399372643e-05, - "loss": 2.7362, - "step": 7888 - }, - { - "epoch": 2.43, - "learning_rate": 6.548621087802707e-05, - "loss": 2.8832, - "step": 7889 - }, - { - "epoch": 2.44, - "learning_rate": 6.545058181878983e-05, - "loss": 2.889, - "step": 7890 - }, - { - "epoch": 2.44, - "learning_rate": 6.541495275955262e-05, - "loss": 2.6142, - "step": 7891 - }, - { - "epoch": 2.44, - "learning_rate": 6.537932370031537e-05, - "loss": 2.8801, - "step": 7892 - }, - { - "epoch": 2.44, - "learning_rate": 6.534369464107815e-05, - "loss": 2.6102, - "step": 7893 - }, - { - "epoch": 2.44, - "learning_rate": 6.530806558184092e-05, - "loss": 2.567, - "step": 7894 - }, - { - "epoch": 2.44, - "learning_rate": 6.527243652260369e-05, - "loss": 2.7, - "step": 7895 - }, - { - "epoch": 2.44, - "learning_rate": 6.523680746336646e-05, - "loss": 2.526, - "step": 7896 - }, - { - "epoch": 2.44, - "learning_rate": 6.520117840412924e-05, - "loss": 2.6436, - "step": 7897 - }, - { - "epoch": 2.44, - "learning_rate": 6.516554934489201e-05, - "loss": 2.4397, - "step": 7898 - }, - { - "epoch": 2.44, - "learning_rate": 6.512992028565478e-05, - "loss": 2.4029, - "step": 7899 - }, - { - "epoch": 2.44, - "learning_rate": 6.509429122641756e-05, - "loss": 2.3413, - "step": 7900 - }, - { - "epoch": 2.44, - "learning_rate": 6.505866216718031e-05, - "loss": 2.445, - "step": 7901 - }, - { - "epoch": 2.44, - "learning_rate": 6.50230331079431e-05, - "loss": 2.439, - "step": 7902 - }, - { - "epoch": 2.44, - "learning_rate": 6.498740404870586e-05, - "loss": 2.4618, - "step": 7903 - }, - { - "epoch": 2.44, - "learning_rate": 6.495177498946863e-05, - "loss": 2.3954, - "step": 7904 - }, - { - "epoch": 2.44, - "learning_rate": 6.49161459302314e-05, - "loss": 2.4628, - "step": 7905 - }, - { - "epoch": 2.44, - "learning_rate": 6.488051687099418e-05, - "loss": 2.3536, - "step": 7906 - }, - { - "epoch": 2.44, - "learning_rate": 6.484488781175694e-05, - "loss": 2.3192, - "step": 7907 - }, - { - "epoch": 2.44, - "learning_rate": 6.480925875251972e-05, - "loss": 2.2436, - "step": 7908 - }, - { - "epoch": 2.44, - "learning_rate": 6.477362969328249e-05, - "loss": 2.0623, - "step": 7909 - }, - { - "epoch": 2.44, - "learning_rate": 6.473800063404525e-05, - "loss": 2.1135, - "step": 7910 - }, - { - "epoch": 2.44, - "learning_rate": 6.470237157480802e-05, - "loss": 2.1445, - "step": 7911 - }, - { - "epoch": 2.44, - "learning_rate": 6.470237157480802e-05, - "loss": 2.2949, - "step": 7912 - }, - { - "epoch": 2.44, - 
"learning_rate": 6.46667425155708e-05, - "loss": 2.1258, - "step": 7913 - }, - { - "epoch": 2.44, - "learning_rate": 6.463111345633357e-05, - "loss": 1.9946, - "step": 7914 - }, - { - "epoch": 2.44, - "learning_rate": 6.459548439709634e-05, - "loss": 1.7539, - "step": 7915 - }, - { - "epoch": 2.44, - "learning_rate": 6.455985533785911e-05, - "loss": 1.8515, - "step": 7916 - }, - { - "epoch": 2.44, - "learning_rate": 6.452422627862188e-05, - "loss": 1.9675, - "step": 7917 - }, - { - "epoch": 2.44, - "learning_rate": 6.448859721938466e-05, - "loss": 1.6075, - "step": 7918 - }, - { - "epoch": 2.44, - "learning_rate": 6.445296816014743e-05, - "loss": 1.7512, - "step": 7919 - }, - { - "epoch": 2.44, - "learning_rate": 6.44173391009102e-05, - "loss": 1.658, - "step": 7920 - }, - { - "epoch": 2.44, - "learning_rate": 6.438171004167297e-05, - "loss": 1.6142, - "step": 7921 - }, - { - "epoch": 2.45, - "learning_rate": 6.434608098243573e-05, - "loss": 1.6225, - "step": 7922 - }, - { - "epoch": 2.45, - "learning_rate": 6.43104519231985e-05, - "loss": 1.6056, - "step": 7923 - }, - { - "epoch": 2.45, - "learning_rate": 6.427482286396128e-05, - "loss": 1.4985, - "step": 7924 - }, - { - "epoch": 2.45, - "learning_rate": 6.423919380472405e-05, - "loss": 1.4567, - "step": 7925 - }, - { - "epoch": 2.45, - "learning_rate": 6.420356474548682e-05, - "loss": 1.2598, - "step": 7926 - }, - { - "epoch": 2.45, - "learning_rate": 6.416793568624959e-05, - "loss": 1.3144, - "step": 7927 - }, - { - "epoch": 2.45, - "learning_rate": 6.413230662701236e-05, - "loss": 1.2632, - "step": 7928 - }, - { - "epoch": 2.45, - "learning_rate": 6.409667756777513e-05, - "loss": 1.2334, - "step": 7929 - }, - { - "epoch": 2.45, - "learning_rate": 6.406104850853791e-05, - "loss": 1.2923, - "step": 7930 - }, - { - "epoch": 2.45, - "learning_rate": 6.402541944930068e-05, - "loss": 3.5951, - "step": 7931 - }, - { - "epoch": 2.45, - "learning_rate": 6.398979039006344e-05, - "loss": 3.7701, - "step": 7932 - }, - { - "epoch": 2.45, - "learning_rate": 6.395416133082623e-05, - "loss": 3.1551, - "step": 7933 - }, - { - "epoch": 2.45, - "learning_rate": 6.391853227158898e-05, - "loss": 3.2705, - "step": 7934 - }, - { - "epoch": 2.45, - "learning_rate": 6.388290321235176e-05, - "loss": 3.1958, - "step": 7935 - }, - { - "epoch": 2.45, - "learning_rate": 6.384727415311453e-05, - "loss": 3.0288, - "step": 7936 - }, - { - "epoch": 2.45, - "learning_rate": 6.38116450938773e-05, - "loss": 2.8311, - "step": 7937 - }, - { - "epoch": 2.45, - "learning_rate": 6.377601603464007e-05, - "loss": 2.7868, - "step": 7938 - }, - { - "epoch": 2.45, - "learning_rate": 6.374038697540285e-05, - "loss": 3.1898, - "step": 7939 - }, - { - "epoch": 2.45, - "learning_rate": 6.37047579161656e-05, - "loss": 2.8153, - "step": 7940 - }, - { - "epoch": 2.45, - "learning_rate": 6.366912885692839e-05, - "loss": 2.4941, - "step": 7941 - }, - { - "epoch": 2.45, - "learning_rate": 6.363349979769115e-05, - "loss": 2.7173, - "step": 7942 - }, - { - "epoch": 2.45, - "learning_rate": 6.359787073845392e-05, - "loss": 2.6117, - "step": 7943 - }, - { - "epoch": 2.45, - "learning_rate": 6.356224167921669e-05, - "loss": 2.6274, - "step": 7944 - }, - { - "epoch": 2.45, - "learning_rate": 6.352661261997947e-05, - "loss": 2.9317, - "step": 7945 - }, - { - "epoch": 2.45, - "learning_rate": 6.349098356074224e-05, - "loss": 2.6433, - "step": 7946 - }, - { - "epoch": 2.45, - "learning_rate": 6.345535450150501e-05, - "loss": 2.6089, - "step": 7947 - }, - { - "epoch": 2.45, - "learning_rate": 
6.341972544226779e-05, - "loss": 2.6618, - "step": 7948 - }, - { - "epoch": 2.45, - "learning_rate": 6.338409638303055e-05, - "loss": 2.5497, - "step": 7949 - }, - { - "epoch": 2.45, - "learning_rate": 6.334846732379333e-05, - "loss": 2.4734, - "step": 7950 - }, - { - "epoch": 2.45, - "learning_rate": 6.33128382645561e-05, - "loss": 2.6527, - "step": 7951 - }, - { - "epoch": 2.45, - "learning_rate": 6.327720920531886e-05, - "loss": 2.3293, - "step": 7952 - }, - { - "epoch": 2.45, - "learning_rate": 6.324158014608163e-05, - "loss": 2.2725, - "step": 7953 - }, - { - "epoch": 2.45, - "learning_rate": 6.320595108684441e-05, - "loss": 2.1242, - "step": 7954 - }, - { - "epoch": 2.46, - "learning_rate": 6.317032202760717e-05, - "loss": 2.6352, - "step": 7955 - }, - { - "epoch": 2.46, - "learning_rate": 6.313469296836995e-05, - "loss": 2.1152, - "step": 7956 - }, - { - "epoch": 2.46, - "learning_rate": 6.309906390913272e-05, - "loss": 2.5651, - "step": 7957 - }, - { - "epoch": 2.46, - "learning_rate": 6.306343484989549e-05, - "loss": 2.3933, - "step": 7958 - }, - { - "epoch": 2.46, - "learning_rate": 6.302780579065826e-05, - "loss": 2.3546, - "step": 7959 - }, - { - "epoch": 2.46, - "learning_rate": 6.299217673142104e-05, - "loss": 2.2679, - "step": 7960 - }, - { - "epoch": 2.46, - "learning_rate": 6.295654767218379e-05, - "loss": 2.2121, - "step": 7961 - }, - { - "epoch": 2.46, - "learning_rate": 6.292091861294657e-05, - "loss": 2.0256, - "step": 7962 - }, - { - "epoch": 2.46, - "learning_rate": 6.288528955370934e-05, - "loss": 1.6857, - "step": 7963 - }, - { - "epoch": 2.46, - "learning_rate": 6.284966049447211e-05, - "loss": 2.0285, - "step": 7964 - }, - { - "epoch": 2.46, - "learning_rate": 6.28140314352349e-05, - "loss": 1.73, - "step": 7965 - }, - { - "epoch": 2.46, - "learning_rate": 6.277840237599766e-05, - "loss": 1.8257, - "step": 7966 - }, - { - "epoch": 2.46, - "learning_rate": 6.274277331676043e-05, - "loss": 1.7126, - "step": 7967 - }, - { - "epoch": 2.46, - "learning_rate": 6.27071442575232e-05, - "loss": 1.7914, - "step": 7968 - }, - { - "epoch": 2.46, - "learning_rate": 6.267151519828597e-05, - "loss": 1.8199, - "step": 7969 - }, - { - "epoch": 2.46, - "learning_rate": 6.263588613904873e-05, - "loss": 1.7294, - "step": 7970 - }, - { - "epoch": 2.46, - "learning_rate": 6.260025707981152e-05, - "loss": 1.403, - "step": 7971 - }, - { - "epoch": 2.46, - "learning_rate": 6.256462802057429e-05, - "loss": 1.495, - "step": 7972 - }, - { - "epoch": 2.46, - "learning_rate": 6.252899896133705e-05, - "loss": 1.4786, - "step": 7973 - }, - { - "epoch": 2.46, - "learning_rate": 6.249336990209982e-05, - "loss": 1.3949, - "step": 7974 - }, - { - "epoch": 2.46, - "learning_rate": 6.245774084286259e-05, - "loss": 1.5175, - "step": 7975 - }, - { - "epoch": 2.46, - "learning_rate": 6.242211178362536e-05, - "loss": 1.4641, - "step": 7976 - }, - { - "epoch": 2.46, - "learning_rate": 6.238648272438814e-05, - "loss": 1.3942, - "step": 7977 - }, - { - "epoch": 2.46, - "learning_rate": 6.235085366515091e-05, - "loss": 1.0813, - "step": 7978 - }, - { - "epoch": 2.46, - "learning_rate": 6.231522460591368e-05, - "loss": 1.22, - "step": 7979 - }, - { - "epoch": 2.46, - "learning_rate": 6.227959554667646e-05, - "loss": 1.3055, - "step": 7980 - }, - { - "epoch": 2.46, - "learning_rate": 6.224396648743921e-05, - "loss": 3.9451, - "step": 7981 - }, - { - "epoch": 2.46, - "learning_rate": 6.2208337428202e-05, - "loss": 3.5157, - "step": 7982 - }, - { - "epoch": 2.46, - "learning_rate": 6.217270836896476e-05, - 
"loss": 3.0303, - "step": 7983 - }, - { - "epoch": 2.46, - "learning_rate": 6.213707930972753e-05, - "loss": 2.9565, - "step": 7984 - }, - { - "epoch": 2.46, - "learning_rate": 6.21014502504903e-05, - "loss": 2.9541, - "step": 7985 - }, - { - "epoch": 2.46, - "learning_rate": 6.206582119125308e-05, - "loss": 2.8482, - "step": 7986 - }, - { - "epoch": 2.47, - "learning_rate": 6.203019213201584e-05, - "loss": 2.8123, - "step": 7987 - }, - { - "epoch": 2.47, - "learning_rate": 6.199456307277862e-05, - "loss": 2.6416, - "step": 7988 - }, - { - "epoch": 2.47, - "learning_rate": 6.195893401354139e-05, - "loss": 2.7831, - "step": 7989 - }, - { - "epoch": 2.47, - "learning_rate": 6.192330495430416e-05, - "loss": 2.7502, - "step": 7990 - }, - { - "epoch": 2.47, - "learning_rate": 6.188767589506692e-05, - "loss": 2.8059, - "step": 7991 - }, - { - "epoch": 2.47, - "learning_rate": 6.18520468358297e-05, - "loss": 2.711, - "step": 7992 - }, - { - "epoch": 2.47, - "learning_rate": 6.181641777659247e-05, - "loss": 2.6527, - "step": 7993 - }, - { - "epoch": 2.47, - "learning_rate": 6.178078871735524e-05, - "loss": 2.6607, - "step": 7994 - }, - { - "epoch": 2.47, - "learning_rate": 6.174515965811801e-05, - "loss": 2.7427, - "step": 7995 - }, - { - "epoch": 2.47, - "learning_rate": 6.170953059888078e-05, - "loss": 2.4373, - "step": 7996 - }, - { - "epoch": 2.47, - "learning_rate": 6.167390153964356e-05, - "loss": 2.4249, - "step": 7997 - }, - { - "epoch": 2.47, - "learning_rate": 6.163827248040633e-05, - "loss": 2.4883, - "step": 7998 - }, - { - "epoch": 2.47, - "learning_rate": 6.16026434211691e-05, - "loss": 2.4887, - "step": 7999 - }, - { - "epoch": 2.47, - "learning_rate": 6.156701436193187e-05, - "loss": 2.6693, - "step": 8000 - }, - { - "epoch": 2.47, - "eval_bleu": 2.002877100137286e-15, - "eval_loss": 3.558516025543213, - "eval_runtime": 2573.2042, - "eval_samples_per_second": 5.736, - "eval_steps_per_second": 0.717, - "step": 8000 - }, - { - "epoch": 2.47, - "learning_rate": 6.153138530269465e-05, - "loss": 2.2599, - "step": 8001 - }, - { - "epoch": 2.47, - "learning_rate": 6.14957562434574e-05, - "loss": 2.3822, - "step": 8002 - }, - { - "epoch": 2.47, - "learning_rate": 6.146012718422018e-05, - "loss": 2.4467, - "step": 8003 - }, - { - "epoch": 2.47, - "learning_rate": 6.142449812498295e-05, - "loss": 2.5537, - "step": 8004 - }, - { - "epoch": 2.47, - "learning_rate": 6.138886906574572e-05, - "loss": 2.3505, - "step": 8005 - }, - { - "epoch": 2.47, - "learning_rate": 6.138886906574572e-05, - "loss": 2.2556, - "step": 8006 - }, - { - "epoch": 2.47, - "learning_rate": 6.135324000650849e-05, - "loss": 2.1347, - "step": 8007 - }, - { - "epoch": 2.47, - "learning_rate": 6.131761094727127e-05, - "loss": 2.2455, - "step": 8008 - }, - { - "epoch": 2.47, - "learning_rate": 6.128198188803403e-05, - "loss": 2.2675, - "step": 8009 - }, - { - "epoch": 2.47, - "learning_rate": 6.124635282879681e-05, - "loss": 2.1256, - "step": 8010 - }, - { - "epoch": 2.47, - "learning_rate": 6.121072376955958e-05, - "loss": 2.0208, - "step": 8011 - }, - { - "epoch": 2.47, - "learning_rate": 6.117509471032234e-05, - "loss": 2.1248, - "step": 8012 - }, - { - "epoch": 2.47, - "learning_rate": 6.113946565108513e-05, - "loss": 2.0277, - "step": 8013 - }, - { - "epoch": 2.47, - "learning_rate": 6.11038365918479e-05, - "loss": 1.9769, - "step": 8014 - }, - { - "epoch": 2.47, - "learning_rate": 6.106820753261066e-05, - "loss": 1.8413, - "step": 8015 - }, - { - "epoch": 2.47, - "learning_rate": 6.103257847337343e-05, - "loss": 1.976, 
- "step": 8016 - }, - { - "epoch": 2.47, - "learning_rate": 6.0996949414136207e-05, - "loss": 1.6732, - "step": 8017 - }, - { - "epoch": 2.47, - "learning_rate": 6.0961320354898975e-05, - "loss": 1.8202, - "step": 8018 - }, - { - "epoch": 2.47, - "learning_rate": 6.092569129566175e-05, - "loss": 1.6768, - "step": 8019 - }, - { - "epoch": 2.48, - "learning_rate": 6.089006223642451e-05, - "loss": 1.832, - "step": 8020 - }, - { - "epoch": 2.48, - "learning_rate": 6.085443317718729e-05, - "loss": 1.4765, - "step": 8021 - }, - { - "epoch": 2.48, - "learning_rate": 6.0818804117950055e-05, - "loss": 1.6575, - "step": 8022 - }, - { - "epoch": 2.48, - "learning_rate": 6.078317505871283e-05, - "loss": 1.7091, - "step": 8023 - }, - { - "epoch": 2.48, - "learning_rate": 6.07475459994756e-05, - "loss": 1.4353, - "step": 8024 - }, - { - "epoch": 2.48, - "learning_rate": 6.0711916940238373e-05, - "loss": 1.3867, - "step": 8025 - }, - { - "epoch": 2.48, - "learning_rate": 6.0676287881001135e-05, - "loss": 1.3543, - "step": 8026 - }, - { - "epoch": 2.48, - "learning_rate": 6.064065882176391e-05, - "loss": 1.4172, - "step": 8027 - }, - { - "epoch": 2.48, - "learning_rate": 6.060502976252668e-05, - "loss": 1.294, - "step": 8028 - }, - { - "epoch": 2.48, - "learning_rate": 6.0569400703289454e-05, - "loss": 1.2624, - "step": 8029 - }, - { - "epoch": 2.48, - "learning_rate": 6.053377164405223e-05, - "loss": 1.122, - "step": 8030 - }, - { - "epoch": 2.48, - "learning_rate": 6.0498142584815e-05, - "loss": 3.7671, - "step": 8031 - }, - { - "epoch": 2.48, - "learning_rate": 6.046251352557777e-05, - "loss": 3.3127, - "step": 8032 - }, - { - "epoch": 2.48, - "learning_rate": 6.0426884466340534e-05, - "loss": 3.3436, - "step": 8033 - }, - { - "epoch": 2.48, - "learning_rate": 6.039125540710331e-05, - "loss": 2.9302, - "step": 8034 - }, - { - "epoch": 2.48, - "learning_rate": 6.035562634786608e-05, - "loss": 2.8277, - "step": 8035 - }, - { - "epoch": 2.48, - "learning_rate": 6.031999728862885e-05, - "loss": 2.6586, - "step": 8036 - }, - { - "epoch": 2.48, - "learning_rate": 6.028436822939162e-05, - "loss": 2.6908, - "step": 8037 - }, - { - "epoch": 2.48, - "learning_rate": 6.0248739170154396e-05, - "loss": 2.8662, - "step": 8038 - }, - { - "epoch": 2.48, - "learning_rate": 6.021311011091716e-05, - "loss": 2.9442, - "step": 8039 - }, - { - "epoch": 2.48, - "learning_rate": 6.017748105167993e-05, - "loss": 2.6158, - "step": 8040 - }, - { - "epoch": 2.48, - "learning_rate": 6.01418519924427e-05, - "loss": 2.8784, - "step": 8041 - }, - { - "epoch": 2.48, - "learning_rate": 6.0106222933205476e-05, - "loss": 2.6543, - "step": 8042 - }, - { - "epoch": 2.48, - "learning_rate": 6.0070593873968244e-05, - "loss": 2.7793, - "step": 8043 - }, - { - "epoch": 2.48, - "learning_rate": 6.003496481473102e-05, - "loss": 2.6663, - "step": 8044 - }, - { - "epoch": 2.48, - "learning_rate": 5.999933575549378e-05, - "loss": 2.4359, - "step": 8045 - }, - { - "epoch": 2.48, - "learning_rate": 5.9963706696256556e-05, - "loss": 2.5488, - "step": 8046 - }, - { - "epoch": 2.48, - "learning_rate": 5.992807763701933e-05, - "loss": 2.5064, - "step": 8047 - }, - { - "epoch": 2.48, - "learning_rate": 5.98924485777821e-05, - "loss": 2.5299, - "step": 8048 - }, - { - "epoch": 2.48, - "learning_rate": 5.9856819518544874e-05, - "loss": 2.5013, - "step": 8049 - }, - { - "epoch": 2.48, - "learning_rate": 5.982119045930764e-05, - "loss": 2.3937, - "step": 8050 - }, - { - "epoch": 2.48, - "learning_rate": 5.978556140007042e-05, - "loss": 2.4979, - "step": 8051 
- }, - { - "epoch": 2.49, - "learning_rate": 5.974993234083318e-05, - "loss": 2.2496, - "step": 8052 - }, - { - "epoch": 2.49, - "learning_rate": 5.9714303281595954e-05, - "loss": 2.4653, - "step": 8053 - }, - { - "epoch": 2.49, - "learning_rate": 5.967867422235872e-05, - "loss": 2.2077, - "step": 8054 - }, - { - "epoch": 2.49, - "learning_rate": 5.96430451631215e-05, - "loss": 2.2817, - "step": 8055 - }, - { - "epoch": 2.49, - "learning_rate": 5.9607416103884266e-05, - "loss": 2.3318, - "step": 8056 - }, - { - "epoch": 2.49, - "learning_rate": 5.957178704464704e-05, - "loss": 2.0848, - "step": 8057 - }, - { - "epoch": 2.49, - "learning_rate": 5.953615798540981e-05, - "loss": 2.1588, - "step": 8058 - }, - { - "epoch": 2.49, - "learning_rate": 5.9500528926172585e-05, - "loss": 2.2365, - "step": 8059 - }, - { - "epoch": 2.49, - "learning_rate": 5.9464899866935346e-05, - "loss": 2.132, - "step": 8060 - }, - { - "epoch": 2.49, - "learning_rate": 5.942927080769812e-05, - "loss": 1.9113, - "step": 8061 - }, - { - "epoch": 2.49, - "learning_rate": 5.9393641748460896e-05, - "loss": 1.9459, - "step": 8062 - }, - { - "epoch": 2.49, - "learning_rate": 5.9358012689223665e-05, - "loss": 1.7673, - "step": 8063 - }, - { - "epoch": 2.49, - "learning_rate": 5.932238362998644e-05, - "loss": 1.8131, - "step": 8064 - }, - { - "epoch": 2.49, - "learning_rate": 5.928675457074921e-05, - "loss": 2.0157, - "step": 8065 - }, - { - "epoch": 2.49, - "learning_rate": 5.925112551151198e-05, - "loss": 2.0441, - "step": 8066 - }, - { - "epoch": 2.49, - "learning_rate": 5.9215496452274745e-05, - "loss": 1.9258, - "step": 8067 - }, - { - "epoch": 2.49, - "learning_rate": 5.917986739303752e-05, - "loss": 1.6283, - "step": 8068 - }, - { - "epoch": 2.49, - "learning_rate": 5.914423833380029e-05, - "loss": 1.4977, - "step": 8069 - }, - { - "epoch": 2.49, - "learning_rate": 5.910860927456306e-05, - "loss": 1.53, - "step": 8070 - }, - { - "epoch": 2.49, - "learning_rate": 5.907298021532583e-05, - "loss": 1.5743, - "step": 8071 - }, - { - "epoch": 2.49, - "learning_rate": 5.903735115608861e-05, - "loss": 1.7177, - "step": 8072 - }, - { - "epoch": 2.49, - "learning_rate": 5.900172209685137e-05, - "loss": 1.6372, - "step": 8073 - }, - { - "epoch": 2.49, - "learning_rate": 5.896609303761414e-05, - "loss": 1.3698, - "step": 8074 - }, - { - "epoch": 2.49, - "learning_rate": 5.893046397837691e-05, - "loss": 1.5741, - "step": 8075 - }, - { - "epoch": 2.49, - "learning_rate": 5.889483491913969e-05, - "loss": 1.2633, - "step": 8076 - }, - { - "epoch": 2.49, - "learning_rate": 5.8859205859902455e-05, - "loss": 1.4312, - "step": 8077 - }, - { - "epoch": 2.49, - "learning_rate": 5.882357680066523e-05, - "loss": 1.2641, - "step": 8078 - }, - { - "epoch": 2.49, - "learning_rate": 5.8787947741428005e-05, - "loss": 1.1613, - "step": 8079 - }, - { - "epoch": 2.49, - "learning_rate": 5.875231868219077e-05, - "loss": 1.0927, - "step": 8080 - }, - { - "epoch": 2.49, - "learning_rate": 5.871668962295354e-05, - "loss": 3.7584, - "step": 8081 - }, - { - "epoch": 2.49, - "learning_rate": 5.868106056371631e-05, - "loss": 3.29, - "step": 8082 - }, - { - "epoch": 2.49, - "learning_rate": 5.8645431504479085e-05, - "loss": 2.88, - "step": 8083 - }, - { - "epoch": 2.5, - "learning_rate": 5.8609802445241854e-05, - "loss": 2.8912, - "step": 8084 - }, - { - "epoch": 2.5, - "learning_rate": 5.857417338600463e-05, - "loss": 2.9102, - "step": 8085 - }, - { - "epoch": 2.5, - "learning_rate": 5.853854432676739e-05, - "loss": 2.791, - "step": 8086 - }, - { - "epoch": 
2.5, - "learning_rate": 5.8502915267530165e-05, - "loss": 2.8283, - "step": 8087 - }, - { - "epoch": 2.5, - "learning_rate": 5.8467286208292934e-05, - "loss": 2.812, - "step": 8088 - }, - { - "epoch": 2.5, - "learning_rate": 5.843165714905571e-05, - "loss": 2.6868, - "step": 8089 - }, - { - "epoch": 2.5, - "learning_rate": 5.839602808981848e-05, - "loss": 2.7071, - "step": 8090 - }, - { - "epoch": 2.5, - "learning_rate": 5.836039903058125e-05, - "loss": 2.6749, - "step": 8091 - }, - { - "epoch": 2.5, - "learning_rate": 5.8324769971344014e-05, - "loss": 2.324, - "step": 8092 - }, - { - "epoch": 2.5, - "learning_rate": 5.828914091210679e-05, - "loss": 2.4571, - "step": 8093 - }, - { - "epoch": 2.5, - "learning_rate": 5.825351185286956e-05, - "loss": 2.304, - "step": 8094 - }, - { - "epoch": 2.5, - "learning_rate": 5.821788279363233e-05, - "loss": 2.6561, - "step": 8095 - }, - { - "epoch": 2.5, - "learning_rate": 5.818225373439511e-05, - "loss": 2.5749, - "step": 8096 - }, - { - "epoch": 2.5, - "learning_rate": 5.8146624675157876e-05, - "loss": 2.606, - "step": 8097 - }, - { - "epoch": 2.5, - "learning_rate": 5.811099561592065e-05, - "loss": 2.2299, - "step": 8098 - }, - { - "epoch": 2.5, - "learning_rate": 5.807536655668342e-05, - "loss": 2.4148, - "step": 8099 - }, - { - "epoch": 2.5, - "learning_rate": 5.8039737497446194e-05, - "loss": 2.387, - "step": 8100 - }, - { - "epoch": 2.5, - "learning_rate": 5.8004108438208956e-05, - "loss": 2.4299, - "step": 8101 - }, - { - "epoch": 2.5, - "learning_rate": 5.796847937897173e-05, - "loss": 2.4114, - "step": 8102 - }, - { - "epoch": 2.5, - "learning_rate": 5.79328503197345e-05, - "loss": 2.4003, - "step": 8103 - }, - { - "epoch": 2.5, - "learning_rate": 5.7897221260497274e-05, - "loss": 2.279, - "step": 8104 - }, - { - "epoch": 2.5, - "learning_rate": 5.786159220126004e-05, - "loss": 2.1557, - "step": 8105 - }, - { - "epoch": 2.5, - "learning_rate": 5.782596314202282e-05, - "loss": 2.3724, - "step": 8106 - }, - { - "epoch": 2.5, - "learning_rate": 5.779033408278558e-05, - "loss": 2.009, - "step": 8107 - }, - { - "epoch": 2.5, - "learning_rate": 5.7754705023548354e-05, - "loss": 2.1518, - "step": 8108 - }, - { - "epoch": 2.5, - "learning_rate": 5.771907596431112e-05, - "loss": 2.1651, - "step": 8109 - }, - { - "epoch": 2.5, - "learning_rate": 5.76834469050739e-05, - "loss": 1.9886, - "step": 8110 - }, - { - "epoch": 2.5, - "learning_rate": 5.764781784583667e-05, - "loss": 2.0058, - "step": 8111 - }, - { - "epoch": 2.5, - "learning_rate": 5.761218878659944e-05, - "loss": 1.8827, - "step": 8112 - }, - { - "epoch": 2.5, - "learning_rate": 5.7576559727362216e-05, - "loss": 2.0571, - "step": 8113 - }, - { - "epoch": 2.5, - "learning_rate": 5.754093066812498e-05, - "loss": 1.7886, - "step": 8114 - }, - { - "epoch": 2.5, - "learning_rate": 5.750530160888775e-05, - "loss": 1.7758, - "step": 8115 - }, - { - "epoch": 2.5, - "learning_rate": 5.746967254965052e-05, - "loss": 1.6548, - "step": 8116 - }, - { - "epoch": 2.51, - "learning_rate": 5.7434043490413296e-05, - "loss": 1.7934, - "step": 8117 - }, - { - "epoch": 2.51, - "learning_rate": 5.7398414431176065e-05, - "loss": 1.8331, - "step": 8118 - }, - { - "epoch": 2.51, - "learning_rate": 5.736278537193884e-05, - "loss": 1.6385, - "step": 8119 - }, - { - "epoch": 2.51, - "learning_rate": 5.73271563127016e-05, - "loss": 1.4586, - "step": 8120 - }, - { - "epoch": 2.51, - "learning_rate": 5.7291527253464376e-05, - "loss": 1.3626, - "step": 8121 - }, - { - "epoch": 2.51, - "learning_rate": 
5.7255898194227145e-05, - "loss": 1.6016, - "step": 8122 - }, - { - "epoch": 2.51, - "learning_rate": 5.722026913498992e-05, - "loss": 1.4789, - "step": 8123 - }, - { - "epoch": 2.51, - "learning_rate": 5.718464007575269e-05, - "loss": 1.2389, - "step": 8124 - }, - { - "epoch": 2.51, - "learning_rate": 5.714901101651546e-05, - "loss": 1.3368, - "step": 8125 - }, - { - "epoch": 2.51, - "learning_rate": 5.7113381957278225e-05, - "loss": 1.2305, - "step": 8126 - }, - { - "epoch": 2.51, - "learning_rate": 5.7077752898041e-05, - "loss": 1.147, - "step": 8127 - }, - { - "epoch": 2.51, - "learning_rate": 5.7042123838803775e-05, - "loss": 1.3414, - "step": 8128 - }, - { - "epoch": 2.51, - "learning_rate": 5.700649477956654e-05, - "loss": 1.1006, - "step": 8129 - }, - { - "epoch": 2.51, - "learning_rate": 5.697086572032932e-05, - "loss": 1.2604, - "step": 8130 - }, - { - "epoch": 2.51, - "learning_rate": 5.693523666109209e-05, - "loss": 3.5442, - "step": 8131 - }, - { - "epoch": 2.51, - "learning_rate": 5.689960760185486e-05, - "loss": 3.2533, - "step": 8132 - }, - { - "epoch": 2.51, - "learning_rate": 5.686397854261762e-05, - "loss": 2.9217, - "step": 8133 - }, - { - "epoch": 2.51, - "learning_rate": 5.68283494833804e-05, - "loss": 2.8576, - "step": 8134 - }, - { - "epoch": 2.51, - "learning_rate": 5.679272042414317e-05, - "loss": 2.6886, - "step": 8135 - }, - { - "epoch": 2.51, - "learning_rate": 5.675709136490594e-05, - "loss": 2.719, - "step": 8136 - }, - { - "epoch": 2.51, - "learning_rate": 5.672146230566871e-05, - "loss": 2.5422, - "step": 8137 - }, - { - "epoch": 2.51, - "learning_rate": 5.6685833246431485e-05, - "loss": 2.6128, - "step": 8138 - }, - { - "epoch": 2.51, - "learning_rate": 5.665020418719425e-05, - "loss": 2.7544, - "step": 8139 - }, - { - "epoch": 2.51, - "learning_rate": 5.661457512795702e-05, - "loss": 2.6845, - "step": 8140 - }, - { - "epoch": 2.51, - "learning_rate": 5.657894606871979e-05, - "loss": 2.7276, - "step": 8141 - }, - { - "epoch": 2.51, - "learning_rate": 5.6543317009482565e-05, - "loss": 2.5652, - "step": 8142 - }, - { - "epoch": 2.51, - "learning_rate": 5.6507687950245334e-05, - "loss": 2.5879, - "step": 8143 - }, - { - "epoch": 2.51, - "learning_rate": 5.647205889100811e-05, - "loss": 2.268, - "step": 8144 - }, - { - "epoch": 2.51, - "learning_rate": 5.6436429831770884e-05, - "loss": 2.3543, - "step": 8145 - }, - { - "epoch": 2.51, - "learning_rate": 5.640080077253365e-05, - "loss": 2.4159, - "step": 8146 - }, - { - "epoch": 2.51, - "learning_rate": 5.636517171329643e-05, - "loss": 2.698, - "step": 8147 - }, - { - "epoch": 2.51, - "learning_rate": 5.632954265405919e-05, - "loss": 2.5123, - "step": 8148 - }, - { - "epoch": 2.52, - "learning_rate": 5.6293913594821964e-05, - "loss": 2.4639, - "step": 8149 - }, - { - "epoch": 2.52, - "learning_rate": 5.625828453558473e-05, - "loss": 2.3406, - "step": 8150 - }, - { - "epoch": 2.52, - "learning_rate": 5.622265547634751e-05, - "loss": 2.4839, - "step": 8151 - }, - { - "epoch": 2.52, - "learning_rate": 5.6187026417110276e-05, - "loss": 2.3045, - "step": 8152 - }, - { - "epoch": 2.52, - "learning_rate": 5.615139735787305e-05, - "loss": 2.3945, - "step": 8153 - }, - { - "epoch": 2.52, - "learning_rate": 5.611576829863581e-05, - "loss": 2.3175, - "step": 8154 - }, - { - "epoch": 2.52, - "learning_rate": 5.608013923939859e-05, - "loss": 1.9502, - "step": 8155 - }, - { - "epoch": 2.52, - "learning_rate": 5.6044510180161356e-05, - "loss": 2.2553, - "step": 8156 - }, - { - "epoch": 2.52, - "learning_rate": 
5.600888112092413e-05, - "loss": 2.2579, - "step": 8157 - }, - { - "epoch": 2.52, - "learning_rate": 5.59732520616869e-05, - "loss": 1.9157, - "step": 8158 - }, - { - "epoch": 2.52, - "learning_rate": 5.5937623002449674e-05, - "loss": 2.2055, - "step": 8159 - }, - { - "epoch": 2.52, - "learning_rate": 5.590199394321245e-05, - "loss": 1.9103, - "step": 8160 - }, - { - "epoch": 2.52, - "learning_rate": 5.586636488397521e-05, - "loss": 2.0723, - "step": 8161 - }, - { - "epoch": 2.52, - "learning_rate": 5.5830735824737986e-05, - "loss": 2.0191, - "step": 8162 - }, - { - "epoch": 2.52, - "learning_rate": 5.5795106765500754e-05, - "loss": 1.9351, - "step": 8163 - }, - { - "epoch": 2.52, - "learning_rate": 5.575947770626353e-05, - "loss": 1.7624, - "step": 8164 - }, - { - "epoch": 2.52, - "learning_rate": 5.57238486470263e-05, - "loss": 2.0485, - "step": 8165 - }, - { - "epoch": 2.52, - "learning_rate": 5.568821958778907e-05, - "loss": 1.8379, - "step": 8166 - }, - { - "epoch": 2.52, - "learning_rate": 5.5652590528551834e-05, - "loss": 1.9475, - "step": 8167 - }, - { - "epoch": 2.52, - "learning_rate": 5.561696146931461e-05, - "loss": 1.6924, - "step": 8168 - }, - { - "epoch": 2.52, - "learning_rate": 5.558133241007738e-05, - "loss": 1.6278, - "step": 8169 - }, - { - "epoch": 2.52, - "learning_rate": 5.554570335084015e-05, - "loss": 1.4579, - "step": 8170 - }, - { - "epoch": 2.52, - "learning_rate": 5.551007429160292e-05, - "loss": 1.4883, - "step": 8171 - }, - { - "epoch": 2.52, - "learning_rate": 5.5474445232365696e-05, - "loss": 1.5816, - "step": 8172 - }, - { - "epoch": 2.52, - "learning_rate": 5.543881617312846e-05, - "loss": 1.6799, - "step": 8173 - }, - { - "epoch": 2.52, - "learning_rate": 5.540318711389123e-05, - "loss": 1.4614, - "step": 8174 - }, - { - "epoch": 2.52, - "learning_rate": 5.5367558054654e-05, - "loss": 1.4349, - "step": 8175 - }, - { - "epoch": 2.52, - "learning_rate": 5.5331928995416776e-05, - "loss": 1.3221, - "step": 8176 - }, - { - "epoch": 2.52, - "learning_rate": 5.529629993617955e-05, - "loss": 1.2526, - "step": 8177 - }, - { - "epoch": 2.52, - "learning_rate": 5.526067087694232e-05, - "loss": 1.2352, - "step": 8178 - }, - { - "epoch": 2.52, - "learning_rate": 5.5225041817705095e-05, - "loss": 1.0473, - "step": 8179 - }, - { - "epoch": 2.52, - "learning_rate": 5.5189412758467857e-05, - "loss": 1.0403, - "step": 8180 - }, - { - "epoch": 2.52, - "learning_rate": 5.515378369923063e-05, - "loss": 3.7584, - "step": 8181 - }, - { - "epoch": 2.53, - "learning_rate": 5.51181546399934e-05, - "loss": 3.1954, - "step": 8182 - }, - { - "epoch": 2.53, - "learning_rate": 5.5082525580756175e-05, - "loss": 3.0258, - "step": 8183 - }, - { - "epoch": 2.53, - "learning_rate": 5.504689652151894e-05, - "loss": 3.0764, - "step": 8184 - }, - { - "epoch": 2.53, - "learning_rate": 5.501126746228172e-05, - "loss": 2.8416, - "step": 8185 - }, - { - "epoch": 2.53, - "learning_rate": 5.497563840304449e-05, - "loss": 3.0023, - "step": 8186 - }, - { - "epoch": 2.53, - "learning_rate": 5.494000934380726e-05, - "loss": 2.7052, - "step": 8187 - }, - { - "epoch": 2.53, - "learning_rate": 5.4904380284570023e-05, - "loss": 2.9448, - "step": 8188 - }, - { - "epoch": 2.53, - "learning_rate": 5.48687512253328e-05, - "loss": 3.0347, - "step": 8189 - }, - { - "epoch": 2.53, - "learning_rate": 5.483312216609557e-05, - "loss": 2.8951, - "step": 8190 - }, - { - "epoch": 2.53, - "learning_rate": 5.479749310685834e-05, - "loss": 2.5392, - "step": 8191 - }, - { - "epoch": 2.53, - "learning_rate": 
5.476186404762111e-05, - "loss": 2.3996, - "step": 8192 - }, - { - "epoch": 2.53, - "learning_rate": 5.4726234988383885e-05, - "loss": 2.4283, - "step": 8193 - }, - { - "epoch": 2.53, - "learning_rate": 5.469060592914666e-05, - "loss": 2.2762, - "step": 8194 - }, - { - "epoch": 2.53, - "learning_rate": 5.465497686990942e-05, - "loss": 2.582, - "step": 8195 - }, - { - "epoch": 2.53, - "learning_rate": 5.46193478106722e-05, - "loss": 2.6537, - "step": 8196 - }, - { - "epoch": 2.53, - "learning_rate": 5.4583718751434965e-05, - "loss": 2.4001, - "step": 8197 - }, - { - "epoch": 2.53, - "learning_rate": 5.454808969219774e-05, - "loss": 2.3131, - "step": 8198 - }, - { - "epoch": 2.53, - "learning_rate": 5.451246063296051e-05, - "loss": 2.4533, - "step": 8199 - }, - { - "epoch": 2.53, - "learning_rate": 5.4476831573723284e-05, - "loss": 2.486, - "step": 8200 - }, - { - "epoch": 2.53, - "learning_rate": 5.4441202514486046e-05, - "loss": 2.0978, - "step": 8201 - }, - { - "epoch": 2.53, - "learning_rate": 5.440557345524882e-05, - "loss": 2.1464, - "step": 8202 - }, - { - "epoch": 2.53, - "learning_rate": 5.436994439601159e-05, - "loss": 2.1308, - "step": 8203 - }, - { - "epoch": 2.53, - "learning_rate": 5.4334315336774364e-05, - "loss": 2.1413, - "step": 8204 - }, - { - "epoch": 2.53, - "learning_rate": 5.429868627753713e-05, - "loss": 2.2564, - "step": 8205 - }, - { - "epoch": 2.53, - "learning_rate": 5.426305721829991e-05, - "loss": 1.9395, - "step": 8206 - }, - { - "epoch": 2.53, - "learning_rate": 5.422742815906267e-05, - "loss": 1.9924, - "step": 8207 - }, - { - "epoch": 2.53, - "learning_rate": 5.4191799099825444e-05, - "loss": 1.8784, - "step": 8208 - }, - { - "epoch": 2.53, - "learning_rate": 5.415617004058822e-05, - "loss": 2.0902, - "step": 8209 - }, - { - "epoch": 2.53, - "learning_rate": 5.412054098135099e-05, - "loss": 1.8776, - "step": 8210 - }, - { - "epoch": 2.53, - "learning_rate": 5.408491192211376e-05, - "loss": 1.8037, - "step": 8211 - }, - { - "epoch": 2.53, - "learning_rate": 5.404928286287653e-05, - "loss": 1.9831, - "step": 8212 - }, - { - "epoch": 2.53, - "learning_rate": 5.4013653803639306e-05, - "loss": 1.853, - "step": 8213 - }, - { - "epoch": 2.54, - "learning_rate": 5.397802474440207e-05, - "loss": 1.7513, - "step": 8214 - }, - { - "epoch": 2.54, - "learning_rate": 5.394239568516484e-05, - "loss": 2.029, - "step": 8215 - }, - { - "epoch": 2.54, - "learning_rate": 5.390676662592761e-05, - "loss": 1.7699, - "step": 8216 - }, - { - "epoch": 2.54, - "learning_rate": 5.3871137566690386e-05, - "loss": 1.7862, - "step": 8217 - }, - { - "epoch": 2.54, - "learning_rate": 5.3835508507453154e-05, - "loss": 1.8584, - "step": 8218 - }, - { - "epoch": 2.54, - "learning_rate": 5.379987944821593e-05, - "loss": 1.5326, - "step": 8219 - }, - { - "epoch": 2.54, - "learning_rate": 5.376425038897869e-05, - "loss": 1.5104, - "step": 8220 - }, - { - "epoch": 2.54, - "learning_rate": 5.3728621329741466e-05, - "loss": 1.5799, - "step": 8221 - }, - { - "epoch": 2.54, - "learning_rate": 5.3692992270504235e-05, - "loss": 1.4118, - "step": 8222 - }, - { - "epoch": 2.54, - "learning_rate": 5.365736321126701e-05, - "loss": 1.534, - "step": 8223 - }, - { - "epoch": 2.54, - "learning_rate": 5.362173415202978e-05, - "loss": 1.3795, - "step": 8224 - }, - { - "epoch": 2.54, - "learning_rate": 5.358610509279255e-05, - "loss": 1.3729, - "step": 8225 - }, - { - "epoch": 2.54, - "learning_rate": 5.355047603355533e-05, - "loss": 1.2419, - "step": 8226 - }, - { - "epoch": 2.54, - "learning_rate": 
5.351484697431809e-05, - "loss": 1.2568, - "step": 8227 - }, - { - "epoch": 2.54, - "learning_rate": 5.347921791508087e-05, - "loss": 1.2249, - "step": 8228 - }, - { - "epoch": 2.54, - "learning_rate": 5.344358885584363e-05, - "loss": 0.9961, - "step": 8229 - }, - { - "epoch": 2.54, - "learning_rate": 5.340795979660641e-05, - "loss": 1.2348, - "step": 8230 - }, - { - "epoch": 2.54, - "learning_rate": 5.3372330737369177e-05, - "loss": 3.4172, - "step": 8231 - }, - { - "epoch": 2.54, - "learning_rate": 5.333670167813195e-05, - "loss": 3.0685, - "step": 8232 - }, - { - "epoch": 2.54, - "learning_rate": 5.330107261889472e-05, - "loss": 2.9726, - "step": 8233 - }, - { - "epoch": 2.54, - "learning_rate": 5.3265443559657495e-05, - "loss": 2.95, - "step": 8234 - }, - { - "epoch": 2.54, - "learning_rate": 5.3229814500420257e-05, - "loss": 3.1676, - "step": 8235 - }, - { - "epoch": 2.54, - "learning_rate": 5.319418544118303e-05, - "loss": 2.8175, - "step": 8236 - }, - { - "epoch": 2.54, - "learning_rate": 5.31585563819458e-05, - "loss": 2.6649, - "step": 8237 - }, - { - "epoch": 2.54, - "learning_rate": 5.3122927322708575e-05, - "loss": 2.6926, - "step": 8238 - }, - { - "epoch": 2.54, - "learning_rate": 5.3087298263471343e-05, - "loss": 2.7466, - "step": 8239 - }, - { - "epoch": 2.54, - "learning_rate": 5.305166920423412e-05, - "loss": 2.7711, - "step": 8240 - }, - { - "epoch": 2.54, - "learning_rate": 5.301604014499688e-05, - "loss": 2.6793, - "step": 8241 - }, - { - "epoch": 2.54, - "learning_rate": 5.2980411085759655e-05, - "loss": 2.6521, - "step": 8242 - }, - { - "epoch": 2.54, - "learning_rate": 5.294478202652243e-05, - "loss": 2.7193, - "step": 8243 - }, - { - "epoch": 2.54, - "learning_rate": 5.29091529672852e-05, - "loss": 2.5258, - "step": 8244 - }, - { - "epoch": 2.54, - "learning_rate": 5.2873523908047974e-05, - "loss": 2.4916, - "step": 8245 - }, - { - "epoch": 2.55, - "learning_rate": 5.283789484881074e-05, - "loss": 2.5156, - "step": 8246 - }, - { - "epoch": 2.55, - "learning_rate": 5.280226578957352e-05, - "loss": 2.4302, - "step": 8247 - }, - { - "epoch": 2.55, - "learning_rate": 5.276663673033628e-05, - "loss": 2.2268, - "step": 8248 - }, - { - "epoch": 2.55, - "learning_rate": 5.2731007671099054e-05, - "loss": 2.2694, - "step": 8249 - }, - { - "epoch": 2.55, - "learning_rate": 5.269537861186182e-05, - "loss": 2.4475, - "step": 8250 - }, - { - "epoch": 2.55, - "learning_rate": 5.26597495526246e-05, - "loss": 2.3569, - "step": 8251 - }, - { - "epoch": 2.55, - "learning_rate": 5.2624120493387366e-05, - "loss": 2.232, - "step": 8252 - }, - { - "epoch": 2.55, - "learning_rate": 5.258849143415014e-05, - "loss": 2.193, - "step": 8253 - }, - { - "epoch": 2.55, - "learning_rate": 5.25528623749129e-05, - "loss": 2.2134, - "step": 8254 - }, - { - "epoch": 2.55, - "learning_rate": 5.251723331567568e-05, - "loss": 2.2043, - "step": 8255 - }, - { - "epoch": 2.55, - "learning_rate": 5.2481604256438446e-05, - "loss": 2.1611, - "step": 8256 - }, - { - "epoch": 2.55, - "learning_rate": 5.244597519720122e-05, - "loss": 2.0997, - "step": 8257 - }, - { - "epoch": 2.55, - "learning_rate": 5.2410346137963996e-05, - "loss": 1.9993, - "step": 8258 - }, - { - "epoch": 2.55, - "learning_rate": 5.2374717078726764e-05, - "loss": 2.1637, - "step": 8259 - }, - { - "epoch": 2.55, - "learning_rate": 5.233908801948954e-05, - "loss": 2.0658, - "step": 8260 - }, - { - "epoch": 2.55, - "learning_rate": 5.23034589602523e-05, - "loss": 1.9571, - "step": 8261 - }, - { - "epoch": 2.55, - "learning_rate": 
5.2267829901015076e-05, - "loss": 1.9184, - "step": 8262 - }, - { - "epoch": 2.55, - "learning_rate": 5.2232200841777844e-05, - "loss": 2.0527, - "step": 8263 - }, - { - "epoch": 2.55, - "learning_rate": 5.219657178254062e-05, - "loss": 1.7992, - "step": 8264 - }, - { - "epoch": 2.55, - "learning_rate": 5.216094272330339e-05, - "loss": 1.5691, - "step": 8265 - }, - { - "epoch": 2.55, - "learning_rate": 5.212531366406616e-05, - "loss": 1.3656, - "step": 8266 - }, - { - "epoch": 2.55, - "learning_rate": 5.2089684604828924e-05, - "loss": 1.876, - "step": 8267 - }, - { - "epoch": 2.55, - "learning_rate": 5.20540555455917e-05, - "loss": 1.6886, - "step": 8268 - }, - { - "epoch": 2.55, - "learning_rate": 5.201842648635447e-05, - "loss": 1.6744, - "step": 8269 - }, - { - "epoch": 2.55, - "learning_rate": 5.198279742711724e-05, - "loss": 1.5301, - "step": 8270 - }, - { - "epoch": 2.55, - "learning_rate": 5.194716836788001e-05, - "loss": 1.6009, - "step": 8271 - }, - { - "epoch": 2.55, - "learning_rate": 5.1911539308642786e-05, - "loss": 1.3767, - "step": 8272 - }, - { - "epoch": 2.55, - "learning_rate": 5.1875910249405554e-05, - "loss": 1.3455, - "step": 8273 - }, - { - "epoch": 2.55, - "learning_rate": 5.184028119016833e-05, - "loss": 1.3417, - "step": 8274 - }, - { - "epoch": 2.55, - "learning_rate": 5.1804652130931105e-05, - "loss": 1.2838, - "step": 8275 - }, - { - "epoch": 2.55, - "learning_rate": 5.1769023071693866e-05, - "loss": 1.2066, - "step": 8276 - }, - { - "epoch": 2.55, - "learning_rate": 5.173339401245664e-05, - "loss": 1.2598, - "step": 8277 - }, - { - "epoch": 2.55, - "learning_rate": 5.169776495321941e-05, - "loss": 1.414, - "step": 8278 - }, - { - "epoch": 2.56, - "learning_rate": 5.1662135893982185e-05, - "loss": 1.1093, - "step": 8279 - }, - { - "epoch": 2.56, - "learning_rate": 5.162650683474495e-05, - "loss": 1.1603, - "step": 8280 - }, - { - "epoch": 2.56, - "learning_rate": 5.159087777550773e-05, - "loss": 3.4933, - "step": 8281 - }, - { - "epoch": 2.56, - "learning_rate": 5.155524871627049e-05, - "loss": 3.0199, - "step": 8282 - }, - { - "epoch": 2.56, - "learning_rate": 5.1519619657033265e-05, - "loss": 3.0621, - "step": 8283 - }, - { - "epoch": 2.56, - "learning_rate": 5.148399059779603e-05, - "loss": 2.9217, - "step": 8284 - }, - { - "epoch": 2.56, - "learning_rate": 5.144836153855881e-05, - "loss": 2.6092, - "step": 8285 - }, - { - "epoch": 2.56, - "learning_rate": 5.1412732479321577e-05, - "loss": 2.6336, - "step": 8286 - }, - { - "epoch": 2.56, - "learning_rate": 5.137710342008435e-05, - "loss": 2.6181, - "step": 8287 - }, - { - "epoch": 2.56, - "learning_rate": 5.134147436084711e-05, - "loss": 2.836, - "step": 8288 - }, - { - "epoch": 2.56, - "learning_rate": 5.130584530160989e-05, - "loss": 2.5397, - "step": 8289 - }, - { - "epoch": 2.56, - "learning_rate": 5.127021624237266e-05, - "loss": 2.5716, - "step": 8290 - }, - { - "epoch": 2.56, - "learning_rate": 5.123458718313543e-05, - "loss": 2.4341, - "step": 8291 - }, - { - "epoch": 2.56, - "learning_rate": 5.119895812389821e-05, - "loss": 2.4736, - "step": 8292 - }, - { - "epoch": 2.56, - "learning_rate": 5.1163329064660975e-05, - "loss": 2.4941, - "step": 8293 - }, - { - "epoch": 2.56, - "learning_rate": 5.112770000542375e-05, - "loss": 2.3374, - "step": 8294 - }, - { - "epoch": 2.56, - "learning_rate": 5.109207094618651e-05, - "loss": 2.5927, - "step": 8295 - }, - { - "epoch": 2.56, - "learning_rate": 5.105644188694929e-05, - "loss": 2.3975, - "step": 8296 - }, - { - "epoch": 2.56, - "learning_rate": 
5.1020812827712055e-05, - "loss": 2.4854, - "step": 8297 - }, - { - "epoch": 2.56, - "learning_rate": 5.098518376847483e-05, - "loss": 2.6614, - "step": 8298 - }, - { - "epoch": 2.56, - "learning_rate": 5.09495547092376e-05, - "loss": 2.3891, - "step": 8299 - }, - { - "epoch": 2.56, - "learning_rate": 5.0913925650000374e-05, - "loss": 2.2268, - "step": 8300 - }, - { - "epoch": 2.56, - "learning_rate": 5.0878296590763135e-05, - "loss": 2.1707, - "step": 8301 - }, - { - "epoch": 2.56, - "learning_rate": 5.084266753152591e-05, - "loss": 2.2187, - "step": 8302 - }, - { - "epoch": 2.56, - "learning_rate": 5.080703847228868e-05, - "loss": 2.1911, - "step": 8303 - }, - { - "epoch": 2.56, - "learning_rate": 5.0771409413051454e-05, - "loss": 2.2114, - "step": 8304 - }, - { - "epoch": 2.56, - "learning_rate": 5.073578035381422e-05, - "loss": 2.1526, - "step": 8305 - }, - { - "epoch": 2.56, - "learning_rate": 5.0700151294577e-05, - "loss": 2.3647, - "step": 8306 - }, - { - "epoch": 2.56, - "learning_rate": 5.066452223533977e-05, - "loss": 2.0714, - "step": 8307 - }, - { - "epoch": 2.56, - "learning_rate": 5.0628893176102534e-05, - "loss": 2.2013, - "step": 8308 - }, - { - "epoch": 2.56, - "learning_rate": 5.059326411686531e-05, - "loss": 2.1523, - "step": 8309 - }, - { - "epoch": 2.56, - "learning_rate": 5.055763505762808e-05, - "loss": 1.9394, - "step": 8310 - }, - { - "epoch": 2.57, - "learning_rate": 5.052200599839085e-05, - "loss": 1.7759, - "step": 8311 - }, - { - "epoch": 2.57, - "learning_rate": 5.048637693915362e-05, - "loss": 2.0797, - "step": 8312 - }, - { - "epoch": 2.57, - "learning_rate": 5.0450747879916396e-05, - "loss": 1.9336, - "step": 8313 - }, - { - "epoch": 2.57, - "learning_rate": 5.041511882067916e-05, - "loss": 1.8288, - "step": 8314 - }, - { - "epoch": 2.57, - "learning_rate": 5.037948976144193e-05, - "loss": 1.7948, - "step": 8315 - }, - { - "epoch": 2.57, - "learning_rate": 5.03438607022047e-05, - "loss": 1.8427, - "step": 8316 - }, - { - "epoch": 2.57, - "learning_rate": 5.0308231642967476e-05, - "loss": 1.5765, - "step": 8317 - }, - { - "epoch": 2.57, - "learning_rate": 5.0272602583730244e-05, - "loss": 1.6686, - "step": 8318 - }, - { - "epoch": 2.57, - "learning_rate": 5.023697352449302e-05, - "loss": 1.6521, - "step": 8319 - }, - { - "epoch": 2.57, - "learning_rate": 5.020134446525579e-05, - "loss": 1.4198, - "step": 8320 - }, - { - "epoch": 2.57, - "learning_rate": 5.016571540601856e-05, - "loss": 1.549, - "step": 8321 - }, - { - "epoch": 2.57, - "learning_rate": 5.0130086346781324e-05, - "loss": 1.6739, - "step": 8322 - }, - { - "epoch": 2.57, - "learning_rate": 5.00944572875441e-05, - "loss": 1.4012, - "step": 8323 - }, - { - "epoch": 2.57, - "learning_rate": 5.0058828228306874e-05, - "loss": 1.3636, - "step": 8324 - }, - { - "epoch": 2.57, - "learning_rate": 5.002319916906964e-05, - "loss": 1.3276, - "step": 8325 - }, - { - "epoch": 2.57, - "learning_rate": 4.998757010983242e-05, - "loss": 1.3004, - "step": 8326 - }, - { - "epoch": 2.57, - "learning_rate": 4.9951941050595186e-05, - "loss": 1.228, - "step": 8327 - }, - { - "epoch": 2.57, - "learning_rate": 4.991631199135796e-05, - "loss": 1.332, - "step": 8328 - }, - { - "epoch": 2.57, - "learning_rate": 4.988068293212072e-05, - "loss": 1.0974, - "step": 8329 - }, - { - "epoch": 2.57, - "learning_rate": 4.98450538728835e-05, - "loss": 1.1592, - "step": 8330 - }, - { - "epoch": 2.57, - "learning_rate": 4.9809424813646266e-05, - "loss": 3.5586, - "step": 8331 - }, - { - "epoch": 2.57, - "learning_rate": 
4.977379575440904e-05, - "loss": 3.3555, - "step": 8332 - }, - { - "epoch": 2.57, - "learning_rate": 4.973816669517181e-05, - "loss": 3.032, - "step": 8333 - }, - { - "epoch": 2.57, - "learning_rate": 4.9702537635934585e-05, - "loss": 2.678, - "step": 8334 - }, - { - "epoch": 2.57, - "learning_rate": 4.9666908576697346e-05, - "loss": 2.9682, - "step": 8335 - }, - { - "epoch": 2.57, - "learning_rate": 4.963127951746012e-05, - "loss": 2.7617, - "step": 8336 - }, - { - "epoch": 2.57, - "learning_rate": 4.959565045822289e-05, - "loss": 2.6088, - "step": 8337 - }, - { - "epoch": 2.57, - "learning_rate": 4.9560021398985665e-05, - "loss": 2.3312, - "step": 8338 - }, - { - "epoch": 2.57, - "learning_rate": 4.952439233974843e-05, - "loss": 2.6546, - "step": 8339 - }, - { - "epoch": 2.57, - "learning_rate": 4.948876328051121e-05, - "loss": 2.5418, - "step": 8340 - }, - { - "epoch": 2.57, - "learning_rate": 4.9453134221273983e-05, - "loss": 2.452, - "step": 8341 - }, - { - "epoch": 2.57, - "learning_rate": 4.9417505162036745e-05, - "loss": 2.7838, - "step": 8342 - }, - { - "epoch": 2.57, - "learning_rate": 4.938187610279952e-05, - "loss": 2.3311, - "step": 8343 - }, - { - "epoch": 2.58, - "learning_rate": 4.934624704356229e-05, - "loss": 2.3479, - "step": 8344 - }, - { - "epoch": 2.58, - "learning_rate": 4.9310617984325063e-05, - "loss": 2.2318, - "step": 8345 - }, - { - "epoch": 2.58, - "learning_rate": 4.927498892508783e-05, - "loss": 2.4403, - "step": 8346 - }, - { - "epoch": 2.58, - "learning_rate": 4.923935986585061e-05, - "loss": 2.2913, - "step": 8347 - }, - { - "epoch": 2.58, - "learning_rate": 4.920373080661337e-05, - "loss": 2.4643, - "step": 8348 - }, - { - "epoch": 2.58, - "learning_rate": 4.9168101747376144e-05, - "loss": 2.242, - "step": 8349 - }, - { - "epoch": 2.58, - "learning_rate": 4.913247268813891e-05, - "loss": 2.4522, - "step": 8350 - }, - { - "epoch": 2.58, - "learning_rate": 4.909684362890169e-05, - "loss": 2.2363, - "step": 8351 - }, - { - "epoch": 2.58, - "learning_rate": 4.9061214569664455e-05, - "loss": 2.3315, - "step": 8352 - }, - { - "epoch": 2.58, - "learning_rate": 4.902558551042723e-05, - "loss": 2.2199, - "step": 8353 - }, - { - "epoch": 2.58, - "learning_rate": 4.898995645118999e-05, - "loss": 2.1899, - "step": 8354 - }, - { - "epoch": 2.58, - "learning_rate": 4.895432739195277e-05, - "loss": 1.9787, - "step": 8355 - }, - { - "epoch": 2.58, - "learning_rate": 4.891869833271554e-05, - "loss": 2.0953, - "step": 8356 - }, - { - "epoch": 2.58, - "learning_rate": 4.888306927347831e-05, - "loss": 2.0047, - "step": 8357 - }, - { - "epoch": 2.58, - "learning_rate": 4.8847440214241086e-05, - "loss": 2.0378, - "step": 8358 - }, - { - "epoch": 2.58, - "learning_rate": 4.8811811155003854e-05, - "loss": 1.9387, - "step": 8359 - }, - { - "epoch": 2.58, - "learning_rate": 4.877618209576663e-05, - "loss": 1.9875, - "step": 8360 - }, - { - "epoch": 2.58, - "learning_rate": 4.87405530365294e-05, - "loss": 1.9935, - "step": 8361 - }, - { - "epoch": 2.58, - "learning_rate": 4.870492397729217e-05, - "loss": 1.8747, - "step": 8362 - }, - { - "epoch": 2.58, - "learning_rate": 4.8669294918054934e-05, - "loss": 1.8184, - "step": 8363 - }, - { - "epoch": 2.58, - "learning_rate": 4.863366585881771e-05, - "loss": 1.8756, - "step": 8364 - }, - { - "epoch": 2.58, - "learning_rate": 4.859803679958048e-05, - "loss": 1.6471, - "step": 8365 - }, - { - "epoch": 2.58, - "learning_rate": 4.856240774034325e-05, - "loss": 1.6878, - "step": 8366 - }, - { - "epoch": 2.58, - "learning_rate": 
4.852677868110602e-05, - "loss": 1.6078, - "step": 8367 - }, - { - "epoch": 2.58, - "learning_rate": 4.8491149621868796e-05, - "loss": 1.768, - "step": 8368 - }, - { - "epoch": 2.58, - "learning_rate": 4.845552056263156e-05, - "loss": 1.5816, - "step": 8369 - }, - { - "epoch": 2.58, - "learning_rate": 4.841989150339433e-05, - "loss": 1.6848, - "step": 8370 - }, - { - "epoch": 2.58, - "learning_rate": 4.83842624441571e-05, - "loss": 1.422, - "step": 8371 - }, - { - "epoch": 2.58, - "learning_rate": 4.8348633384919876e-05, - "loss": 1.4297, - "step": 8372 - }, - { - "epoch": 2.58, - "learning_rate": 4.831300432568265e-05, - "loss": 1.6117, - "step": 8373 - }, - { - "epoch": 2.58, - "learning_rate": 4.827737526644542e-05, - "loss": 1.3079, - "step": 8374 - }, - { - "epoch": 2.58, - "learning_rate": 4.8241746207208194e-05, - "loss": 1.1261, - "step": 8375 - }, - { - "epoch": 2.59, - "learning_rate": 4.8206117147970956e-05, - "loss": 1.1872, - "step": 8376 - }, - { - "epoch": 2.59, - "learning_rate": 4.817048808873373e-05, - "loss": 1.2389, - "step": 8377 - }, - { - "epoch": 2.59, - "learning_rate": 4.81348590294965e-05, - "loss": 1.1146, - "step": 8378 - }, - { - "epoch": 2.59, - "learning_rate": 4.8099229970259275e-05, - "loss": 1.0483, - "step": 8379 - }, - { - "epoch": 2.59, - "learning_rate": 4.806360091102204e-05, - "loss": 1.1466, - "step": 8380 - }, - { - "epoch": 2.59, - "learning_rate": 4.802797185178482e-05, - "loss": 3.173, - "step": 8381 - }, - { - "epoch": 2.59, - "learning_rate": 4.799234279254758e-05, - "loss": 2.7077, - "step": 8382 - }, - { - "epoch": 2.59, - "learning_rate": 4.7956713733310355e-05, - "loss": 3.0078, - "step": 8383 - }, - { - "epoch": 2.59, - "learning_rate": 4.792108467407312e-05, - "loss": 2.9462, - "step": 8384 - }, - { - "epoch": 2.59, - "learning_rate": 4.78854556148359e-05, - "loss": 2.752, - "step": 8385 - }, - { - "epoch": 2.59, - "learning_rate": 4.7849826555598666e-05, - "loss": 2.691, - "step": 8386 - }, - { - "epoch": 2.59, - "learning_rate": 4.781419749636144e-05, - "loss": 2.9445, - "step": 8387 - }, - { - "epoch": 2.59, - "learning_rate": 4.77785684371242e-05, - "loss": 2.5408, - "step": 8388 - }, - { - "epoch": 2.59, - "learning_rate": 4.774293937788698e-05, - "loss": 2.5275, - "step": 8389 - }, - { - "epoch": 2.59, - "learning_rate": 4.770731031864975e-05, - "loss": 2.4246, - "step": 8390 - }, - { - "epoch": 2.59, - "learning_rate": 4.767168125941252e-05, - "loss": 2.7441, - "step": 8391 - }, - { - "epoch": 2.59, - "learning_rate": 4.7636052200175297e-05, - "loss": 2.5259, - "step": 8392 - }, - { - "epoch": 2.59, - "learning_rate": 4.7600423140938065e-05, - "loss": 2.1714, - "step": 8393 - }, - { - "epoch": 2.59, - "learning_rate": 4.756479408170084e-05, - "loss": 2.502, - "step": 8394 - }, - { - "epoch": 2.59, - "learning_rate": 4.75291650224636e-05, - "loss": 2.425, - "step": 8395 - }, - { - "epoch": 2.59, - "learning_rate": 4.749353596322638e-05, - "loss": 2.2356, - "step": 8396 - }, - { - "epoch": 2.59, - "learning_rate": 4.7457906903989145e-05, - "loss": 2.1971, - "step": 8397 - }, - { - "epoch": 2.59, - "learning_rate": 4.742227784475192e-05, - "loss": 2.1961, - "step": 8398 - }, - { - "epoch": 2.59, - "learning_rate": 4.738664878551469e-05, - "loss": 2.1759, - "step": 8399 - }, - { - "epoch": 2.59, - "learning_rate": 4.7351019726277464e-05, - "loss": 1.8855, - "step": 8400 - }, - { - "epoch": 2.59, - "learning_rate": 4.7315390667040225e-05, - "loss": 2.3121, - "step": 8401 - }, - { - "epoch": 2.59, - "learning_rate": 
4.7279761607803e-05, - "loss": 2.2639, - "step": 8402 - }, - { - "epoch": 2.59, - "learning_rate": 4.724413254856577e-05, - "loss": 2.1183, - "step": 8403 - }, - { - "epoch": 2.59, - "learning_rate": 4.7208503489328544e-05, - "loss": 2.0216, - "step": 8404 - }, - { - "epoch": 2.59, - "learning_rate": 4.717287443009132e-05, - "loss": 1.9545, - "step": 8405 - }, - { - "epoch": 2.59, - "learning_rate": 4.713724537085409e-05, - "loss": 2.0568, - "step": 8406 - }, - { - "epoch": 2.59, - "learning_rate": 4.710161631161686e-05, - "loss": 2.0345, - "step": 8407 - }, - { - "epoch": 2.6, - "learning_rate": 4.706598725237963e-05, - "loss": 1.8838, - "step": 8408 - }, - { - "epoch": 2.6, - "learning_rate": 4.7030358193142406e-05, - "loss": 1.879, - "step": 8409 - }, - { - "epoch": 2.6, - "learning_rate": 4.699472913390517e-05, - "loss": 2.1507, - "step": 8410 - }, - { - "epoch": 2.6, - "learning_rate": 4.695910007466794e-05, - "loss": 1.6642, - "step": 8411 - }, - { - "epoch": 2.6, - "learning_rate": 4.692347101543071e-05, - "loss": 1.7561, - "step": 8412 - }, - { - "epoch": 2.6, - "learning_rate": 4.6887841956193486e-05, - "loss": 1.4998, - "step": 8413 - }, - { - "epoch": 2.6, - "learning_rate": 4.6852212896956254e-05, - "loss": 1.8276, - "step": 8414 - }, - { - "epoch": 2.6, - "learning_rate": 4.681658383771903e-05, - "loss": 1.7421, - "step": 8415 - }, - { - "epoch": 2.6, - "learning_rate": 4.678095477848179e-05, - "loss": 1.4917, - "step": 8416 - }, - { - "epoch": 2.6, - "learning_rate": 4.6745325719244566e-05, - "loss": 1.5564, - "step": 8417 - }, - { - "epoch": 2.6, - "learning_rate": 4.6709696660007334e-05, - "loss": 1.5428, - "step": 8418 - }, - { - "epoch": 2.6, - "learning_rate": 4.667406760077011e-05, - "loss": 1.6164, - "step": 8419 - }, - { - "epoch": 2.6, - "learning_rate": 4.663843854153288e-05, - "loss": 1.5527, - "step": 8420 - }, - { - "epoch": 2.6, - "learning_rate": 4.660280948229565e-05, - "loss": 1.6131, - "step": 8421 - }, - { - "epoch": 2.6, - "learning_rate": 4.656718042305843e-05, - "loss": 1.3453, - "step": 8422 - }, - { - "epoch": 2.6, - "learning_rate": 4.653155136382119e-05, - "loss": 1.5347, - "step": 8423 - }, - { - "epoch": 2.6, - "learning_rate": 4.6495922304583964e-05, - "loss": 1.1795, - "step": 8424 - }, - { - "epoch": 2.6, - "learning_rate": 4.646029324534673e-05, - "loss": 1.2764, - "step": 8425 - }, - { - "epoch": 2.6, - "learning_rate": 4.642466418610951e-05, - "loss": 1.2242, - "step": 8426 - }, - { - "epoch": 2.6, - "learning_rate": 4.6389035126872276e-05, - "loss": 1.3614, - "step": 8427 - }, - { - "epoch": 2.6, - "learning_rate": 4.635340606763505e-05, - "loss": 1.0933, - "step": 8428 - }, - { - "epoch": 2.6, - "learning_rate": 4.631777700839781e-05, - "loss": 1.1188, - "step": 8429 - }, - { - "epoch": 2.6, - "learning_rate": 4.628214794916059e-05, - "loss": 1.0321, - "step": 8430 - }, - { - "epoch": 2.6, - "learning_rate": 4.6246518889923356e-05, - "loss": 3.4742, - "step": 8431 - }, - { - "epoch": 2.6, - "learning_rate": 4.621088983068613e-05, - "loss": 3.3373, - "step": 8432 - }, - { - "epoch": 2.6, - "learning_rate": 4.61752607714489e-05, - "loss": 2.643, - "step": 8433 - }, - { - "epoch": 2.6, - "learning_rate": 4.6139631712211675e-05, - "loss": 3.0011, - "step": 8434 - }, - { - "epoch": 2.6, - "learning_rate": 4.6104002652974436e-05, - "loss": 2.7243, - "step": 8435 - }, - { - "epoch": 2.6, - "learning_rate": 4.606837359373721e-05, - "loss": 2.7661, - "step": 8436 - }, - { - "epoch": 2.6, - "learning_rate": 4.603274453449998e-05, - "loss": 2.6334, - 
"step": 8437 - }, - { - "epoch": 2.6, - "learning_rate": 4.5997115475262755e-05, - "loss": 2.6395, - "step": 8438 - }, - { - "epoch": 2.6, - "learning_rate": 4.596148641602553e-05, - "loss": 2.5444, - "step": 8439 - }, - { - "epoch": 2.6, - "learning_rate": 4.59258573567883e-05, - "loss": 2.5461, - "step": 8440 - }, - { - "epoch": 2.61, - "learning_rate": 4.589022829755107e-05, - "loss": 2.4782, - "step": 8441 - }, - { - "epoch": 2.61, - "learning_rate": 4.5854599238313835e-05, - "loss": 2.2592, - "step": 8442 - }, - { - "epoch": 2.61, - "learning_rate": 4.581897017907661e-05, - "loss": 2.4776, - "step": 8443 - }, - { - "epoch": 2.61, - "learning_rate": 4.578334111983938e-05, - "loss": 2.1969, - "step": 8444 - }, - { - "epoch": 2.61, - "learning_rate": 4.574771206060215e-05, - "loss": 2.4991, - "step": 8445 - }, - { - "epoch": 2.61, - "learning_rate": 4.571208300136492e-05, - "loss": 2.4161, - "step": 8446 - }, - { - "epoch": 2.61, - "learning_rate": 4.56764539421277e-05, - "loss": 2.2196, - "step": 8447 - }, - { - "epoch": 2.61, - "learning_rate": 4.5640824882890465e-05, - "loss": 2.272, - "step": 8448 - }, - { - "epoch": 2.61, - "learning_rate": 4.560519582365324e-05, - "loss": 2.2166, - "step": 8449 - }, - { - "epoch": 2.61, - "learning_rate": 4.5569566764416e-05, - "loss": 2.217, - "step": 8450 - }, - { - "epoch": 2.61, - "learning_rate": 4.553393770517878e-05, - "loss": 2.3055, - "step": 8451 - }, - { - "epoch": 2.61, - "learning_rate": 4.5498308645941545e-05, - "loss": 2.1132, - "step": 8452 - }, - { - "epoch": 2.61, - "learning_rate": 4.546267958670432e-05, - "loss": 2.2798, - "step": 8453 - }, - { - "epoch": 2.61, - "learning_rate": 4.5427050527467095e-05, - "loss": 2.1686, - "step": 8454 - }, - { - "epoch": 2.61, - "learning_rate": 4.5391421468229864e-05, - "loss": 1.96, - "step": 8455 - }, - { - "epoch": 2.61, - "learning_rate": 4.535579240899264e-05, - "loss": 1.9831, - "step": 8456 - }, - { - "epoch": 2.61, - "learning_rate": 4.53201633497554e-05, - "loss": 1.8924, - "step": 8457 - }, - { - "epoch": 2.61, - "learning_rate": 4.5284534290518175e-05, - "loss": 2.4104, - "step": 8458 - }, - { - "epoch": 2.61, - "learning_rate": 4.5248905231280944e-05, - "loss": 1.9125, - "step": 8459 - }, - { - "epoch": 2.61, - "learning_rate": 4.521327617204372e-05, - "loss": 1.9806, - "step": 8460 - }, - { - "epoch": 2.61, - "learning_rate": 4.517764711280649e-05, - "loss": 2.0554, - "step": 8461 - }, - { - "epoch": 2.61, - "learning_rate": 4.514201805356926e-05, - "loss": 1.7737, - "step": 8462 - }, - { - "epoch": 2.61, - "learning_rate": 4.5106388994332024e-05, - "loss": 1.856, - "step": 8463 - }, - { - "epoch": 2.61, - "learning_rate": 4.50707599350948e-05, - "loss": 1.938, - "step": 8464 - }, - { - "epoch": 2.61, - "learning_rate": 4.503513087585757e-05, - "loss": 1.7618, - "step": 8465 - }, - { - "epoch": 2.61, - "learning_rate": 4.499950181662034e-05, - "loss": 1.7406, - "step": 8466 - }, - { - "epoch": 2.61, - "learning_rate": 4.496387275738311e-05, - "loss": 1.5867, - "step": 8467 - }, - { - "epoch": 2.61, - "learning_rate": 4.4928243698145886e-05, - "loss": 1.6939, - "step": 8468 - }, - { - "epoch": 2.61, - "learning_rate": 4.489261463890865e-05, - "loss": 1.4081, - "step": 8469 - }, - { - "epoch": 2.61, - "learning_rate": 4.485698557967142e-05, - "loss": 1.6938, - "step": 8470 - }, - { - "epoch": 2.61, - "learning_rate": 4.48213565204342e-05, - "loss": 1.4048, - "step": 8471 - }, - { - "epoch": 2.61, - "learning_rate": 4.4785727461196966e-05, - "loss": 1.4009, - "step": 8472 - }, - { - 
"epoch": 2.62, - "learning_rate": 4.475009840195974e-05, - "loss": 1.5236, - "step": 8473 - }, - { - "epoch": 2.62, - "learning_rate": 4.471446934272251e-05, - "loss": 1.2313, - "step": 8474 - }, - { - "epoch": 2.62, - "learning_rate": 4.4678840283485284e-05, - "loss": 1.1846, - "step": 8475 - }, - { - "epoch": 2.62, - "learning_rate": 4.4643211224248046e-05, - "loss": 1.2222, - "step": 8476 - }, - { - "epoch": 2.62, - "learning_rate": 4.460758216501082e-05, - "loss": 1.0356, - "step": 8477 - }, - { - "epoch": 2.62, - "learning_rate": 4.457195310577359e-05, - "loss": 1.0133, - "step": 8478 - }, - { - "epoch": 2.62, - "learning_rate": 4.4536324046536364e-05, - "loss": 1.0698, - "step": 8479 - }, - { - "epoch": 2.62, - "learning_rate": 4.450069498729913e-05, - "loss": 1.093, - "step": 8480 - }, - { - "epoch": 2.62, - "learning_rate": 4.446506592806191e-05, - "loss": 3.54, - "step": 8481 - }, - { - "epoch": 2.62, - "learning_rate": 4.442943686882467e-05, - "loss": 2.9909, - "step": 8482 - }, - { - "epoch": 2.62, - "learning_rate": 4.4393807809587444e-05, - "loss": 2.7814, - "step": 8483 - }, - { - "epoch": 2.62, - "learning_rate": 4.435817875035021e-05, - "loss": 2.9187, - "step": 8484 - }, - { - "epoch": 2.62, - "learning_rate": 4.432254969111299e-05, - "loss": 2.8055, - "step": 8485 - }, - { - "epoch": 2.62, - "learning_rate": 4.4286920631875756e-05, - "loss": 2.7927, - "step": 8486 - }, - { - "epoch": 2.62, - "learning_rate": 4.425129157263853e-05, - "loss": 2.6639, - "step": 8487 - }, - { - "epoch": 2.62, - "learning_rate": 4.4215662513401306e-05, - "loss": 2.5406, - "step": 8488 - }, - { - "epoch": 2.62, - "learning_rate": 4.418003345416407e-05, - "loss": 2.2983, - "step": 8489 - }, - { - "epoch": 2.62, - "learning_rate": 4.414440439492685e-05, - "loss": 2.8534, - "step": 8490 - }, - { - "epoch": 2.62, - "learning_rate": 4.410877533568961e-05, - "loss": 2.6029, - "step": 8491 - }, - { - "epoch": 2.62, - "learning_rate": 4.4073146276452386e-05, - "loss": 2.3614, - "step": 8492 - }, - { - "epoch": 2.62, - "learning_rate": 4.4037517217215155e-05, - "loss": 2.3908, - "step": 8493 - }, - { - "epoch": 2.62, - "learning_rate": 4.400188815797793e-05, - "loss": 2.331, - "step": 8494 - }, - { - "epoch": 2.62, - "learning_rate": 4.39662590987407e-05, - "loss": 2.3133, - "step": 8495 - }, - { - "epoch": 2.62, - "learning_rate": 4.393063003950347e-05, - "loss": 2.2488, - "step": 8496 - }, - { - "epoch": 2.62, - "learning_rate": 4.3895000980266235e-05, - "loss": 2.4253, - "step": 8497 - }, - { - "epoch": 2.62, - "learning_rate": 4.385937192102901e-05, - "loss": 2.3801, - "step": 8498 - }, - { - "epoch": 2.62, - "learning_rate": 4.382374286179178e-05, - "loss": 2.2543, - "step": 8499 - }, - { - "epoch": 2.62, - "learning_rate": 4.378811380255455e-05, - "loss": 2.0685, - "step": 8500 - }, - { - "epoch": 2.62, - "eval_bleu": 2.20913340971076e-14, - "eval_loss": 3.2877273559570312, - "eval_runtime": 2579.3551, - "eval_samples_per_second": 5.722, - "eval_steps_per_second": 0.715, - "step": 8500 - }, - { - "epoch": 2.62, - "learning_rate": 4.375248474331732e-05, - "loss": 2.1469, - "step": 8501 - }, - { - "epoch": 2.62, - "learning_rate": 4.37168556840801e-05, - "loss": 2.2822, - "step": 8502 - }, - { - "epoch": 2.62, - "learning_rate": 4.368122662484287e-05, - "loss": 2.2191, - "step": 8503 - }, - { - "epoch": 2.62, - "learning_rate": 4.364559756560563e-05, - "loss": 2.4262, - "step": 8504 - }, - { - "epoch": 2.62, - "learning_rate": 4.360996850636841e-05, - "loss": 2.1317, - "step": 8505 - }, - { - 
"epoch": 2.63, - "learning_rate": 4.357433944713118e-05, - "loss": 1.9848, - "step": 8506 - }, - { - "epoch": 2.63, - "learning_rate": 4.353871038789395e-05, - "loss": 1.9256, - "step": 8507 - }, - { - "epoch": 2.63, - "learning_rate": 4.350308132865672e-05, - "loss": 2.0429, - "step": 8508 - }, - { - "epoch": 2.63, - "learning_rate": 4.3467452269419495e-05, - "loss": 2.0093, - "step": 8509 - }, - { - "epoch": 2.63, - "learning_rate": 4.343182321018226e-05, - "loss": 1.9739, - "step": 8510 - }, - { - "epoch": 2.63, - "learning_rate": 4.339619415094503e-05, - "loss": 1.8503, - "step": 8511 - }, - { - "epoch": 2.63, - "learning_rate": 4.33605650917078e-05, - "loss": 1.9031, - "step": 8512 - }, - { - "epoch": 2.63, - "learning_rate": 4.3324936032470575e-05, - "loss": 1.676, - "step": 8513 - }, - { - "epoch": 2.63, - "learning_rate": 4.3289306973233344e-05, - "loss": 1.9112, - "step": 8514 - }, - { - "epoch": 2.63, - "learning_rate": 4.325367791399612e-05, - "loss": 1.7589, - "step": 8515 - }, - { - "epoch": 2.63, - "learning_rate": 4.321804885475888e-05, - "loss": 1.6799, - "step": 8516 - }, - { - "epoch": 2.63, - "learning_rate": 4.3182419795521655e-05, - "loss": 1.7056, - "step": 8517 - }, - { - "epoch": 2.63, - "learning_rate": 4.3146790736284424e-05, - "loss": 1.55, - "step": 8518 - }, - { - "epoch": 2.63, - "learning_rate": 4.31111616770472e-05, - "loss": 1.3344, - "step": 8519 - }, - { - "epoch": 2.63, - "learning_rate": 4.3075532617809974e-05, - "loss": 1.5643, - "step": 8520 - }, - { - "epoch": 2.63, - "learning_rate": 4.303990355857274e-05, - "loss": 1.382, - "step": 8521 - }, - { - "epoch": 2.63, - "learning_rate": 4.300427449933552e-05, - "loss": 1.569, - "step": 8522 - }, - { - "epoch": 2.63, - "learning_rate": 4.296864544009828e-05, - "loss": 1.3835, - "step": 8523 - }, - { - "epoch": 2.63, - "learning_rate": 4.2933016380861054e-05, - "loss": 1.6066, - "step": 8524 - }, - { - "epoch": 2.63, - "learning_rate": 4.289738732162382e-05, - "loss": 1.2078, - "step": 8525 - }, - { - "epoch": 2.63, - "learning_rate": 4.28617582623866e-05, - "loss": 1.1748, - "step": 8526 - }, - { - "epoch": 2.63, - "learning_rate": 4.2826129203149366e-05, - "loss": 1.0292, - "step": 8527 - }, - { - "epoch": 2.63, - "learning_rate": 4.279050014391214e-05, - "loss": 1.1066, - "step": 8528 - }, - { - "epoch": 2.63, - "learning_rate": 4.27548710846749e-05, - "loss": 0.9119, - "step": 8529 - }, - { - "epoch": 2.63, - "learning_rate": 4.271924202543768e-05, - "loss": 1.0693, - "step": 8530 - }, - { - "epoch": 2.63, - "learning_rate": 4.2683612966200446e-05, - "loss": 3.2263, - "step": 8531 - }, - { - "epoch": 2.63, - "learning_rate": 4.264798390696322e-05, - "loss": 3.5073, - "step": 8532 - }, - { - "epoch": 2.63, - "learning_rate": 4.261235484772599e-05, - "loss": 2.7969, - "step": 8533 - }, - { - "epoch": 2.63, - "learning_rate": 4.2576725788488764e-05, - "loss": 2.5947, - "step": 8534 - }, - { - "epoch": 2.63, - "learning_rate": 4.254109672925153e-05, - "loss": 2.6302, - "step": 8535 - }, - { - "epoch": 2.63, - "learning_rate": 4.250546767001431e-05, - "loss": 2.7069, - "step": 8536 - }, - { - "epoch": 2.63, - "learning_rate": 4.246983861077708e-05, - "loss": 2.3792, - "step": 8537 - }, - { - "epoch": 2.64, - "learning_rate": 4.2434209551539844e-05, - "loss": 2.3323, - "step": 8538 - }, - { - "epoch": 2.64, - "learning_rate": 4.239858049230262e-05, - "loss": 2.5842, - "step": 8539 - }, - { - "epoch": 2.64, - "learning_rate": 4.236295143306539e-05, - "loss": 2.4951, - "step": 8540 - }, - { - "epoch": 2.64, - 
"learning_rate": 4.232732237382816e-05, - "loss": 2.4822, - "step": 8541 - }, - { - "epoch": 2.64, - "learning_rate": 4.229169331459093e-05, - "loss": 2.4839, - "step": 8542 - }, - { - "epoch": 2.64, - "learning_rate": 4.2256064255353706e-05, - "loss": 2.3317, - "step": 8543 - }, - { - "epoch": 2.64, - "learning_rate": 4.222043519611647e-05, - "loss": 2.6171, - "step": 8544 - }, - { - "epoch": 2.64, - "learning_rate": 4.218480613687924e-05, - "loss": 2.3279, - "step": 8545 - }, - { - "epoch": 2.64, - "learning_rate": 4.214917707764201e-05, - "loss": 2.1871, - "step": 8546 - }, - { - "epoch": 2.64, - "learning_rate": 4.2113548018404786e-05, - "loss": 2.3625, - "step": 8547 - }, - { - "epoch": 2.64, - "learning_rate": 4.2077918959167555e-05, - "loss": 2.1307, - "step": 8548 - }, - { - "epoch": 2.64, - "learning_rate": 4.204228989993033e-05, - "loss": 2.2423, - "step": 8549 - }, - { - "epoch": 2.64, - "learning_rate": 4.200666084069309e-05, - "loss": 2.3798, - "step": 8550 - }, - { - "epoch": 2.64, - "learning_rate": 4.1971031781455866e-05, - "loss": 2.0154, - "step": 8551 - }, - { - "epoch": 2.64, - "learning_rate": 4.193540272221864e-05, - "loss": 2.1338, - "step": 8552 - }, - { - "epoch": 2.64, - "learning_rate": 4.189977366298141e-05, - "loss": 2.046, - "step": 8553 - }, - { - "epoch": 2.64, - "learning_rate": 4.1864144603744185e-05, - "loss": 2.1768, - "step": 8554 - }, - { - "epoch": 2.64, - "learning_rate": 4.182851554450695e-05, - "loss": 2.0919, - "step": 8555 - }, - { - "epoch": 2.64, - "learning_rate": 4.179288648526973e-05, - "loss": 2.1003, - "step": 8556 - }, - { - "epoch": 2.64, - "learning_rate": 4.175725742603249e-05, - "loss": 2.0332, - "step": 8557 - }, - { - "epoch": 2.64, - "learning_rate": 4.1721628366795265e-05, - "loss": 2.0287, - "step": 8558 - }, - { - "epoch": 2.64, - "learning_rate": 4.1685999307558033e-05, - "loss": 1.8779, - "step": 8559 - }, - { - "epoch": 2.64, - "learning_rate": 4.165037024832081e-05, - "loss": 1.7839, - "step": 8560 - }, - { - "epoch": 2.64, - "learning_rate": 4.161474118908358e-05, - "loss": 1.7274, - "step": 8561 - }, - { - "epoch": 2.64, - "learning_rate": 4.157911212984635e-05, - "loss": 1.7322, - "step": 8562 - }, - { - "epoch": 2.64, - "learning_rate": 4.1543483070609113e-05, - "loss": 1.8227, - "step": 8563 - }, - { - "epoch": 2.64, - "learning_rate": 4.150785401137189e-05, - "loss": 1.5665, - "step": 8564 - }, - { - "epoch": 2.64, - "learning_rate": 4.147222495213466e-05, - "loss": 1.7331, - "step": 8565 - }, - { - "epoch": 2.64, - "learning_rate": 4.143659589289743e-05, - "loss": 1.5922, - "step": 8566 - }, - { - "epoch": 2.64, - "learning_rate": 4.14009668336602e-05, - "loss": 1.9765, - "step": 8567 - }, - { - "epoch": 2.64, - "learning_rate": 4.1365337774422975e-05, - "loss": 1.5426, - "step": 8568 - }, - { - "epoch": 2.64, - "learning_rate": 4.132970871518575e-05, - "loss": 1.6497, - "step": 8569 - }, - { - "epoch": 2.65, - "learning_rate": 4.129407965594851e-05, - "loss": 1.6288, - "step": 8570 - }, - { - "epoch": 2.65, - "learning_rate": 4.125845059671129e-05, - "loss": 1.5125, - "step": 8571 - }, - { - "epoch": 2.65, - "learning_rate": 4.1222821537474055e-05, - "loss": 1.3643, - "step": 8572 - }, - { - "epoch": 2.65, - "learning_rate": 4.118719247823683e-05, - "loss": 1.3482, - "step": 8573 - }, - { - "epoch": 2.65, - "learning_rate": 4.11515634189996e-05, - "loss": 1.393, - "step": 8574 - }, - { - "epoch": 2.65, - "learning_rate": 4.1115934359762374e-05, - "loss": 1.2808, - "step": 8575 - }, - { - "epoch": 2.65, - 
"learning_rate": 4.1080305300525136e-05, - "loss": 1.1576, - "step": 8576 - }, - { - "epoch": 2.65, - "learning_rate": 4.104467624128792e-05, - "loss": 1.0982, - "step": 8577 - }, - { - "epoch": 2.65, - "learning_rate": 4.1009047182050686e-05, - "loss": 0.8979, - "step": 8578 - }, - { - "epoch": 2.65, - "learning_rate": 4.0973418122813454e-05, - "loss": 1.0367, - "step": 8579 - }, - { - "epoch": 2.65, - "learning_rate": 4.093778906357623e-05, - "loss": 1.1712, - "step": 8580 - }, - { - "epoch": 2.65, - "learning_rate": 4.0902160004339e-05, - "loss": 3.4911, - "step": 8581 - }, - { - "epoch": 2.65, - "learning_rate": 4.0866530945101766e-05, - "loss": 2.8177, - "step": 8582 - }, - { - "epoch": 2.65, - "learning_rate": 4.083090188586454e-05, - "loss": 2.8718, - "step": 8583 - }, - { - "epoch": 2.65, - "learning_rate": 4.079527282662731e-05, - "loss": 2.6964, - "step": 8584 - }, - { - "epoch": 2.65, - "learning_rate": 4.075964376739008e-05, - "loss": 2.6821, - "step": 8585 - }, - { - "epoch": 2.65, - "learning_rate": 4.072401470815285e-05, - "loss": 2.4918, - "step": 8586 - }, - { - "epoch": 2.65, - "learning_rate": 4.068838564891562e-05, - "loss": 2.6305, - "step": 8587 - }, - { - "epoch": 2.65, - "learning_rate": 4.065275658967839e-05, - "loss": 2.6828, - "step": 8588 - }, - { - "epoch": 2.65, - "learning_rate": 4.0617127530441164e-05, - "loss": 2.6812, - "step": 8589 - }, - { - "epoch": 2.65, - "learning_rate": 4.058149847120393e-05, - "loss": 2.4479, - "step": 8590 - }, - { - "epoch": 2.65, - "learning_rate": 4.05458694119667e-05, - "loss": 2.3508, - "step": 8591 - }, - { - "epoch": 2.65, - "learning_rate": 4.0510240352729476e-05, - "loss": 2.5319, - "step": 8592 - }, - { - "epoch": 2.65, - "learning_rate": 4.047461129349225e-05, - "loss": 2.3875, - "step": 8593 - }, - { - "epoch": 2.65, - "learning_rate": 4.043898223425502e-05, - "loss": 2.441, - "step": 8594 - }, - { - "epoch": 2.65, - "learning_rate": 4.040335317501779e-05, - "loss": 2.2016, - "step": 8595 - }, - { - "epoch": 2.65, - "learning_rate": 4.036772411578056e-05, - "loss": 2.3764, - "step": 8596 - }, - { - "epoch": 2.65, - "learning_rate": 4.033209505654333e-05, - "loss": 2.2361, - "step": 8597 - }, - { - "epoch": 2.65, - "learning_rate": 4.02964659973061e-05, - "loss": 2.3448, - "step": 8598 - }, - { - "epoch": 2.65, - "learning_rate": 4.0260836938068875e-05, - "loss": 2.1709, - "step": 8599 - }, - { - "epoch": 2.65, - "learning_rate": 4.022520787883164e-05, - "loss": 2.2646, - "step": 8600 - }, - { - "epoch": 2.65, - "learning_rate": 4.018957881959441e-05, - "loss": 2.2957, - "step": 8601 - }, - { - "epoch": 2.65, - "learning_rate": 4.0153949760357186e-05, - "loss": 1.9544, - "step": 8602 - }, - { - "epoch": 2.66, - "learning_rate": 4.0118320701119955e-05, - "loss": 2.0882, - "step": 8603 - }, - { - "epoch": 2.66, - "learning_rate": 4.008269164188272e-05, - "loss": 1.9899, - "step": 8604 - }, - { - "epoch": 2.66, - "learning_rate": 4.00470625826455e-05, - "loss": 2.1356, - "step": 8605 - }, - { - "epoch": 2.66, - "learning_rate": 4.0011433523408267e-05, - "loss": 2.0048, - "step": 8606 - }, - { - "epoch": 2.66, - "learning_rate": 3.9975804464171035e-05, - "loss": 1.9664, - "step": 8607 - }, - { - "epoch": 2.66, - "learning_rate": 3.994017540493381e-05, - "loss": 2.0805, - "step": 8608 - }, - { - "epoch": 2.66, - "learning_rate": 3.990454634569658e-05, - "loss": 1.9445, - "step": 8609 - }, - { - "epoch": 2.66, - "learning_rate": 3.9868917286459353e-05, - "loss": 1.8846, - "step": 8610 - }, - { - "epoch": 2.66, - 
"learning_rate": 3.983328822722212e-05, - "loss": 2.2493, - "step": 8611 - }, - { - "epoch": 2.66, - "learning_rate": 3.97976591679849e-05, - "loss": 1.6035, - "step": 8612 - }, - { - "epoch": 2.66, - "learning_rate": 3.9762030108747665e-05, - "loss": 1.7305, - "step": 8613 - }, - { - "epoch": 2.66, - "learning_rate": 3.9726401049510433e-05, - "loss": 1.6426, - "step": 8614 - }, - { - "epoch": 2.66, - "learning_rate": 3.969077199027321e-05, - "loss": 1.478, - "step": 8615 - }, - { - "epoch": 2.66, - "learning_rate": 3.965514293103598e-05, - "loss": 1.512, - "step": 8616 - }, - { - "epoch": 2.66, - "learning_rate": 3.9619513871798745e-05, - "loss": 1.5429, - "step": 8617 - }, - { - "epoch": 2.66, - "learning_rate": 3.958388481256152e-05, - "loss": 1.5241, - "step": 8618 - }, - { - "epoch": 2.66, - "learning_rate": 3.954825575332429e-05, - "loss": 1.4434, - "step": 8619 - }, - { - "epoch": 2.66, - "learning_rate": 3.951262669408706e-05, - "loss": 1.4638, - "step": 8620 - }, - { - "epoch": 2.66, - "learning_rate": 3.947699763484983e-05, - "loss": 1.385, - "step": 8621 - }, - { - "epoch": 2.66, - "learning_rate": 3.94413685756126e-05, - "loss": 1.5409, - "step": 8622 - }, - { - "epoch": 2.66, - "learning_rate": 3.9405739516375375e-05, - "loss": 1.3256, - "step": 8623 - }, - { - "epoch": 2.66, - "learning_rate": 3.9370110457138144e-05, - "loss": 1.1654, - "step": 8624 - }, - { - "epoch": 2.66, - "learning_rate": 3.933448139790091e-05, - "loss": 1.3536, - "step": 8625 - }, - { - "epoch": 2.66, - "learning_rate": 3.929885233866369e-05, - "loss": 1.1143, - "step": 8626 - }, - { - "epoch": 2.66, - "learning_rate": 3.926322327942646e-05, - "loss": 1.119, - "step": 8627 - }, - { - "epoch": 2.66, - "learning_rate": 3.922759422018923e-05, - "loss": 1.0633, - "step": 8628 - }, - { - "epoch": 2.66, - "learning_rate": 3.9191965160952e-05, - "loss": 1.0743, - "step": 8629 - }, - { - "epoch": 2.66, - "learning_rate": 3.9156336101714774e-05, - "loss": 0.8956, - "step": 8630 - }, - { - "epoch": 2.66, - "learning_rate": 3.912070704247754e-05, - "loss": 3.2413, - "step": 8631 - }, - { - "epoch": 2.66, - "learning_rate": 3.908507798324031e-05, - "loss": 3.0011, - "step": 8632 - }, - { - "epoch": 2.66, - "learning_rate": 3.9049448924003086e-05, - "loss": 2.8318, - "step": 8633 - }, - { - "epoch": 2.66, - "learning_rate": 3.9013819864765854e-05, - "loss": 2.594, - "step": 8634 - }, - { - "epoch": 2.67, - "learning_rate": 3.897819080552862e-05, - "loss": 2.4316, - "step": 8635 - }, - { - "epoch": 2.67, - "learning_rate": 3.89425617462914e-05, - "loss": 2.3443, - "step": 8636 - }, - { - "epoch": 2.67, - "learning_rate": 3.8906932687054166e-05, - "loss": 2.6312, - "step": 8637 - }, - { - "epoch": 2.67, - "learning_rate": 3.8871303627816934e-05, - "loss": 2.4433, - "step": 8638 - }, - { - "epoch": 2.67, - "learning_rate": 3.883567456857971e-05, - "loss": 2.3638, - "step": 8639 - }, - { - "epoch": 2.67, - "learning_rate": 3.880004550934248e-05, - "loss": 2.2736, - "step": 8640 - }, - { - "epoch": 2.67, - "learning_rate": 3.8764416450105246e-05, - "loss": 2.5882, - "step": 8641 - }, - { - "epoch": 2.67, - "learning_rate": 3.872878739086802e-05, - "loss": 2.2664, - "step": 8642 - }, - { - "epoch": 2.67, - "learning_rate": 3.8693158331630796e-05, - "loss": 2.3464, - "step": 8643 - }, - { - "epoch": 2.67, - "learning_rate": 3.8657529272393564e-05, - "loss": 2.165, - "step": 8644 - }, - { - "epoch": 2.67, - "learning_rate": 3.862190021315633e-05, - "loss": 2.3288, - "step": 8645 - }, - { - "epoch": 2.67, - "learning_rate": 
3.858627115391911e-05, - "loss": 2.5067, - "step": 8646 - }, - { - "epoch": 2.67, - "learning_rate": 3.8550642094681876e-05, - "loss": 2.283, - "step": 8647 - }, - { - "epoch": 2.67, - "learning_rate": 3.8515013035444645e-05, - "loss": 2.3993, - "step": 8648 - }, - { - "epoch": 2.67, - "learning_rate": 3.847938397620742e-05, - "loss": 2.0414, - "step": 8649 - }, - { - "epoch": 2.67, - "learning_rate": 3.844375491697019e-05, - "loss": 2.1921, - "step": 8650 - }, - { - "epoch": 2.67, - "learning_rate": 3.8408125857732956e-05, - "loss": 2.2321, - "step": 8651 - }, - { - "epoch": 2.67, - "learning_rate": 3.837249679849573e-05, - "loss": 1.8299, - "step": 8652 - }, - { - "epoch": 2.67, - "learning_rate": 3.83368677392585e-05, - "loss": 1.9531, - "step": 8653 - }, - { - "epoch": 2.67, - "learning_rate": 3.830123868002127e-05, - "loss": 2.0732, - "step": 8654 - }, - { - "epoch": 2.67, - "learning_rate": 3.826560962078404e-05, - "loss": 2.221, - "step": 8655 - }, - { - "epoch": 2.67, - "learning_rate": 3.822998056154681e-05, - "loss": 2.0567, - "step": 8656 - }, - { - "epoch": 2.67, - "learning_rate": 3.819435150230958e-05, - "loss": 1.9991, - "step": 8657 - }, - { - "epoch": 2.67, - "learning_rate": 3.8158722443072355e-05, - "loss": 1.7683, - "step": 8658 - }, - { - "epoch": 2.67, - "learning_rate": 3.812309338383513e-05, - "loss": 1.9542, - "step": 8659 - }, - { - "epoch": 2.67, - "learning_rate": 3.80874643245979e-05, - "loss": 1.9438, - "step": 8660 - }, - { - "epoch": 2.67, - "learning_rate": 3.8051835265360667e-05, - "loss": 1.7889, - "step": 8661 - }, - { - "epoch": 2.67, - "learning_rate": 3.801620620612344e-05, - "loss": 1.6625, - "step": 8662 - }, - { - "epoch": 2.67, - "learning_rate": 3.798057714688621e-05, - "loss": 1.7258, - "step": 8663 - }, - { - "epoch": 2.67, - "learning_rate": 3.7944948087648985e-05, - "loss": 1.6512, - "step": 8664 - }, - { - "epoch": 2.67, - "learning_rate": 3.7909319028411753e-05, - "loss": 1.8556, - "step": 8665 - }, - { - "epoch": 2.67, - "learning_rate": 3.787368996917452e-05, - "loss": 1.5633, - "step": 8666 - }, - { - "epoch": 2.67, - "learning_rate": 3.78380609099373e-05, - "loss": 1.4296, - "step": 8667 - }, - { - "epoch": 2.68, - "learning_rate": 3.7802431850700065e-05, - "loss": 1.5353, - "step": 8668 - }, - { - "epoch": 2.68, - "learning_rate": 3.7766802791462834e-05, - "loss": 1.4823, - "step": 8669 - }, - { - "epoch": 2.68, - "learning_rate": 3.773117373222561e-05, - "loss": 1.3965, - "step": 8670 - }, - { - "epoch": 2.68, - "learning_rate": 3.769554467298838e-05, - "loss": 1.2066, - "step": 8671 - }, - { - "epoch": 2.68, - "learning_rate": 3.7659915613751145e-05, - "loss": 1.5283, - "step": 8672 - }, - { - "epoch": 2.68, - "learning_rate": 3.762428655451392e-05, - "loss": 1.3302, - "step": 8673 - }, - { - "epoch": 2.68, - "learning_rate": 3.758865749527669e-05, - "loss": 1.215, - "step": 8674 - }, - { - "epoch": 2.68, - "learning_rate": 3.7553028436039464e-05, - "loss": 1.1867, - "step": 8675 - }, - { - "epoch": 2.68, - "learning_rate": 3.751739937680223e-05, - "loss": 1.0024, - "step": 8676 - }, - { - "epoch": 2.68, - "learning_rate": 3.748177031756501e-05, - "loss": 1.0517, - "step": 8677 - }, - { - "epoch": 2.68, - "learning_rate": 3.7446141258327776e-05, - "loss": 1.1019, - "step": 8678 - }, - { - "epoch": 2.68, - "learning_rate": 3.7410512199090544e-05, - "loss": 1.0939, - "step": 8679 - }, - { - "epoch": 2.68, - "learning_rate": 3.737488313985332e-05, - "loss": 1.3092, - "step": 8680 - }, - { - "epoch": 2.68, - "learning_rate": 
3.733925408061609e-05, - "loss": 3.203, - "step": 8681 - }, - { - "epoch": 2.68, - "learning_rate": 3.7303625021378856e-05, - "loss": 3.1734, - "step": 8682 - }, - { - "epoch": 2.68, - "learning_rate": 3.726799596214163e-05, - "loss": 2.9, - "step": 8683 - }, - { - "epoch": 2.68, - "learning_rate": 3.72323669029044e-05, - "loss": 2.9804, - "step": 8684 - }, - { - "epoch": 2.68, - "learning_rate": 3.719673784366717e-05, - "loss": 2.6025, - "step": 8685 - }, - { - "epoch": 2.68, - "learning_rate": 3.716110878442994e-05, - "loss": 2.4717, - "step": 8686 - }, - { - "epoch": 2.68, - "learning_rate": 3.712547972519271e-05, - "loss": 2.3171, - "step": 8687 - }, - { - "epoch": 2.68, - "learning_rate": 3.708985066595548e-05, - "loss": 2.5881, - "step": 8688 - }, - { - "epoch": 2.68, - "learning_rate": 3.7054221606718254e-05, - "loss": 2.6308, - "step": 8689 - }, - { - "epoch": 2.68, - "learning_rate": 3.701859254748102e-05, - "loss": 2.3887, - "step": 8690 - }, - { - "epoch": 2.68, - "learning_rate": 3.69829634882438e-05, - "loss": 2.1645, - "step": 8691 - }, - { - "epoch": 2.68, - "learning_rate": 3.6947334429006566e-05, - "loss": 2.4104, - "step": 8692 - }, - { - "epoch": 2.68, - "learning_rate": 3.691170536976934e-05, - "loss": 2.2669, - "step": 8693 - }, - { - "epoch": 2.68, - "learning_rate": 3.687607631053211e-05, - "loss": 2.256, - "step": 8694 - }, - { - "epoch": 2.68, - "learning_rate": 3.684044725129488e-05, - "loss": 2.2489, - "step": 8695 - }, - { - "epoch": 2.68, - "learning_rate": 3.680481819205765e-05, - "loss": 1.9817, - "step": 8696 - }, - { - "epoch": 2.68, - "learning_rate": 3.676918913282042e-05, - "loss": 2.3393, - "step": 8697 - }, - { - "epoch": 2.68, - "learning_rate": 3.673356007358319e-05, - "loss": 2.1278, - "step": 8698 - }, - { - "epoch": 2.68, - "learning_rate": 3.6697931014345965e-05, - "loss": 2.058, - "step": 8699 - }, - { - "epoch": 2.69, - "learning_rate": 3.666230195510873e-05, - "loss": 2.2174, - "step": 8700 - }, - { - "epoch": 2.69, - "learning_rate": 3.66266728958715e-05, - "loss": 2.3576, - "step": 8701 - }, - { - "epoch": 2.69, - "learning_rate": 3.6591043836634276e-05, - "loss": 2.2886, - "step": 8702 - }, - { - "epoch": 2.69, - "learning_rate": 3.6555414777397045e-05, - "loss": 2.0598, - "step": 8703 - }, - { - "epoch": 2.69, - "learning_rate": 3.651978571815981e-05, - "loss": 2.2779, - "step": 8704 - }, - { - "epoch": 2.69, - "learning_rate": 3.648415665892259e-05, - "loss": 1.9708, - "step": 8705 - }, - { - "epoch": 2.69, - "learning_rate": 3.6448527599685356e-05, - "loss": 2.1095, - "step": 8706 - }, - { - "epoch": 2.69, - "learning_rate": 3.6412898540448125e-05, - "loss": 1.995, - "step": 8707 - }, - { - "epoch": 2.69, - "learning_rate": 3.6377269481210907e-05, - "loss": 1.7666, - "step": 8708 - }, - { - "epoch": 2.69, - "learning_rate": 3.6341640421973675e-05, - "loss": 1.7425, - "step": 8709 - }, - { - "epoch": 2.69, - "learning_rate": 3.630601136273644e-05, - "loss": 1.7809, - "step": 8710 - }, - { - "epoch": 2.69, - "learning_rate": 3.627038230349922e-05, - "loss": 1.53, - "step": 8711 - }, - { - "epoch": 2.69, - "learning_rate": 3.6234753244261987e-05, - "loss": 1.8267, - "step": 8712 - }, - { - "epoch": 2.69, - "learning_rate": 3.6199124185024755e-05, - "loss": 1.7729, - "step": 8713 - }, - { - "epoch": 2.69, - "learning_rate": 3.616349512578753e-05, - "loss": 1.8739, - "step": 8714 - }, - { - "epoch": 2.69, - "learning_rate": 3.61278660665503e-05, - "loss": 1.7379, - "step": 8715 - }, - { - "epoch": 2.69, - "learning_rate": 
3.609223700731307e-05, - "loss": 1.5983, - "step": 8716 - }, - { - "epoch": 2.69, - "learning_rate": 3.605660794807584e-05, - "loss": 1.4337, - "step": 8717 - }, - { - "epoch": 2.69, - "learning_rate": 3.602097888883861e-05, - "loss": 1.3407, - "step": 8718 - }, - { - "epoch": 2.69, - "learning_rate": 3.598534982960138e-05, - "loss": 1.4168, - "step": 8719 - }, - { - "epoch": 2.69, - "learning_rate": 3.5949720770364153e-05, - "loss": 1.4175, - "step": 8720 - }, - { - "epoch": 2.69, - "learning_rate": 3.591409171112692e-05, - "loss": 1.4496, - "step": 8721 - }, - { - "epoch": 2.69, - "learning_rate": 3.587846265188969e-05, - "loss": 1.2093, - "step": 8722 - }, - { - "epoch": 2.69, - "learning_rate": 3.5842833592652465e-05, - "loss": 1.3046, - "step": 8723 - }, - { - "epoch": 2.69, - "learning_rate": 3.580720453341524e-05, - "loss": 1.1475, - "step": 8724 - }, - { - "epoch": 2.69, - "learning_rate": 3.577157547417801e-05, - "loss": 1.3085, - "step": 8725 - }, - { - "epoch": 2.69, - "learning_rate": 3.573594641494078e-05, - "loss": 1.2364, - "step": 8726 - }, - { - "epoch": 2.69, - "learning_rate": 3.570031735570355e-05, - "loss": 1.1155, - "step": 8727 - }, - { - "epoch": 2.69, - "learning_rate": 3.566468829646632e-05, - "loss": 1.1294, - "step": 8728 - }, - { - "epoch": 2.69, - "learning_rate": 3.562905923722909e-05, - "loss": 1.0927, - "step": 8729 - }, - { - "epoch": 2.69, - "learning_rate": 3.5593430177991864e-05, - "loss": 1.1637, - "step": 8730 - }, - { - "epoch": 2.69, - "learning_rate": 3.555780111875463e-05, - "loss": 3.0995, - "step": 8731 - }, - { - "epoch": 2.7, - "learning_rate": 3.55221720595174e-05, - "loss": 2.9785, - "step": 8732 - }, - { - "epoch": 2.7, - "learning_rate": 3.5486543000280176e-05, - "loss": 3.1387, - "step": 8733 - }, - { - "epoch": 2.7, - "learning_rate": 3.5450913941042944e-05, - "loss": 2.8221, - "step": 8734 - }, - { - "epoch": 2.7, - "learning_rate": 3.541528488180571e-05, - "loss": 2.586, - "step": 8735 - }, - { - "epoch": 2.7, - "learning_rate": 3.537965582256849e-05, - "loss": 2.4397, - "step": 8736 - }, - { - "epoch": 2.7, - "learning_rate": 3.5344026763331256e-05, - "loss": 2.4182, - "step": 8737 - }, - { - "epoch": 2.7, - "learning_rate": 3.5308397704094024e-05, - "loss": 2.2567, - "step": 8738 - }, - { - "epoch": 2.7, - "learning_rate": 3.52727686448568e-05, - "loss": 2.6196, - "step": 8739 - }, - { - "epoch": 2.7, - "learning_rate": 3.5237139585619574e-05, - "loss": 2.5208, - "step": 8740 - }, - { - "epoch": 2.7, - "learning_rate": 3.520151052638234e-05, - "loss": 2.3853, - "step": 8741 - }, - { - "epoch": 2.7, - "learning_rate": 3.516588146714511e-05, - "loss": 2.6301, - "step": 8742 - }, - { - "epoch": 2.7, - "learning_rate": 3.5130252407907886e-05, - "loss": 2.3486, - "step": 8743 - }, - { - "epoch": 2.7, - "learning_rate": 3.5094623348670654e-05, - "loss": 2.1794, - "step": 8744 - }, - { - "epoch": 2.7, - "learning_rate": 3.505899428943342e-05, - "loss": 2.1705, - "step": 8745 - }, - { - "epoch": 2.7, - "learning_rate": 3.50233652301962e-05, - "loss": 2.3571, - "step": 8746 - }, - { - "epoch": 2.7, - "learning_rate": 3.4987736170958966e-05, - "loss": 2.0542, - "step": 8747 - }, - { - "epoch": 2.7, - "learning_rate": 3.4952107111721734e-05, - "loss": 2.0125, - "step": 8748 - }, - { - "epoch": 2.7, - "learning_rate": 3.491647805248451e-05, - "loss": 2.2302, - "step": 8749 - }, - { - "epoch": 2.7, - "learning_rate": 3.488084899324728e-05, - "loss": 2.2169, - "step": 8750 - }, - { - "epoch": 2.7, - "learning_rate": 3.484521993401005e-05, - 
"loss": 1.9865, - "step": 8751 - }, - { - "epoch": 2.7, - "learning_rate": 3.480959087477282e-05, - "loss": 1.9271, - "step": 8752 - }, - { - "epoch": 2.7, - "learning_rate": 3.477396181553559e-05, - "loss": 2.08, - "step": 8753 - }, - { - "epoch": 2.7, - "learning_rate": 3.4738332756298365e-05, - "loss": 2.1886, - "step": 8754 - }, - { - "epoch": 2.7, - "learning_rate": 3.470270369706113e-05, - "loss": 2.1434, - "step": 8755 - }, - { - "epoch": 2.7, - "learning_rate": 3.46670746378239e-05, - "loss": 2.1595, - "step": 8756 - }, - { - "epoch": 2.7, - "learning_rate": 3.4631445578586676e-05, - "loss": 2.1558, - "step": 8757 - }, - { - "epoch": 2.7, - "learning_rate": 3.459581651934945e-05, - "loss": 1.9609, - "step": 8758 - }, - { - "epoch": 2.7, - "learning_rate": 3.456018746011222e-05, - "loss": 1.9519, - "step": 8759 - }, - { - "epoch": 2.7, - "learning_rate": 3.452455840087499e-05, - "loss": 1.9261, - "step": 8760 - }, - { - "epoch": 2.7, - "learning_rate": 3.448892934163776e-05, - "loss": 1.8133, - "step": 8761 - }, - { - "epoch": 2.7, - "learning_rate": 3.445330028240053e-05, - "loss": 1.7639, - "step": 8762 - }, - { - "epoch": 2.7, - "learning_rate": 3.44176712231633e-05, - "loss": 1.7807, - "step": 8763 - }, - { - "epoch": 2.7, - "learning_rate": 3.4382042163926075e-05, - "loss": 1.6519, - "step": 8764 - }, - { - "epoch": 2.71, - "learning_rate": 3.434641310468884e-05, - "loss": 1.7491, - "step": 8765 - }, - { - "epoch": 2.71, - "learning_rate": 3.431078404545161e-05, - "loss": 1.4836, - "step": 8766 - }, - { - "epoch": 2.71, - "learning_rate": 3.427515498621439e-05, - "loss": 1.4353, - "step": 8767 - }, - { - "epoch": 2.71, - "learning_rate": 3.4239525926977155e-05, - "loss": 1.4934, - "step": 8768 - }, - { - "epoch": 2.71, - "learning_rate": 3.420389686773992e-05, - "loss": 1.3197, - "step": 8769 - }, - { - "epoch": 2.71, - "learning_rate": 3.41682678085027e-05, - "loss": 1.6196, - "step": 8770 - }, - { - "epoch": 2.71, - "learning_rate": 3.413263874926547e-05, - "loss": 1.2623, - "step": 8771 - }, - { - "epoch": 2.71, - "learning_rate": 3.4097009690028235e-05, - "loss": 1.5216, - "step": 8772 - }, - { - "epoch": 2.71, - "learning_rate": 3.406138063079101e-05, - "loss": 1.2342, - "step": 8773 - }, - { - "epoch": 2.71, - "learning_rate": 3.4025751571553785e-05, - "loss": 1.4132, - "step": 8774 - }, - { - "epoch": 2.71, - "learning_rate": 3.3990122512316554e-05, - "loss": 1.1421, - "step": 8775 - }, - { - "epoch": 2.71, - "learning_rate": 3.395449345307932e-05, - "loss": 1.1525, - "step": 8776 - }, - { - "epoch": 2.71, - "learning_rate": 3.39188643938421e-05, - "loss": 1.0395, - "step": 8777 - }, - { - "epoch": 2.71, - "learning_rate": 3.3883235334604865e-05, - "loss": 1.1098, - "step": 8778 - }, - { - "epoch": 2.71, - "learning_rate": 3.3847606275367634e-05, - "loss": 1.0651, - "step": 8779 - }, - { - "epoch": 2.71, - "learning_rate": 3.381197721613041e-05, - "loss": 1.0496, - "step": 8780 - }, - { - "epoch": 2.71, - "learning_rate": 3.377634815689318e-05, - "loss": 3.3162, - "step": 8781 - }, - { - "epoch": 2.71, - "learning_rate": 3.3740719097655945e-05, - "loss": 3.0573, - "step": 8782 - }, - { - "epoch": 2.71, - "learning_rate": 3.370509003841872e-05, - "loss": 2.6484, - "step": 8783 - }, - { - "epoch": 2.71, - "learning_rate": 3.366946097918149e-05, - "loss": 2.5218, - "step": 8784 - }, - { - "epoch": 2.71, - "learning_rate": 3.363383191994426e-05, - "loss": 2.5642, - "step": 8785 - }, - { - "epoch": 2.71, - "learning_rate": 3.359820286070703e-05, - "loss": 2.6789, - "step": 
8786 - }, - { - "epoch": 2.71, - "learning_rate": 3.35625738014698e-05, - "loss": 2.136, - "step": 8787 - }, - { - "epoch": 2.71, - "learning_rate": 3.352694474223257e-05, - "loss": 2.5443, - "step": 8788 - }, - { - "epoch": 2.71, - "learning_rate": 3.3491315682995344e-05, - "loss": 2.4994, - "step": 8789 - }, - { - "epoch": 2.71, - "learning_rate": 3.345568662375812e-05, - "loss": 2.2999, - "step": 8790 - }, - { - "epoch": 2.71, - "learning_rate": 3.342005756452089e-05, - "loss": 2.2913, - "step": 8791 - }, - { - "epoch": 2.71, - "learning_rate": 3.3384428505283656e-05, - "loss": 2.3712, - "step": 8792 - }, - { - "epoch": 2.71, - "learning_rate": 3.334879944604643e-05, - "loss": 1.8845, - "step": 8793 - }, - { - "epoch": 2.71, - "learning_rate": 3.33131703868092e-05, - "loss": 2.2713, - "step": 8794 - }, - { - "epoch": 2.71, - "learning_rate": 3.3277541327571974e-05, - "loss": 2.4481, - "step": 8795 - }, - { - "epoch": 2.71, - "learning_rate": 3.324191226833474e-05, - "loss": 2.4337, - "step": 8796 - }, - { - "epoch": 2.72, - "learning_rate": 3.320628320909751e-05, - "loss": 2.0293, - "step": 8797 - }, - { - "epoch": 2.72, - "learning_rate": 3.3170654149860286e-05, - "loss": 2.1357, - "step": 8798 - }, - { - "epoch": 2.72, - "learning_rate": 3.3135025090623054e-05, - "loss": 2.0468, - "step": 8799 - }, - { - "epoch": 2.72, - "learning_rate": 3.309939603138582e-05, - "loss": 2.0657, - "step": 8800 - }, - { - "epoch": 2.72, - "learning_rate": 3.30637669721486e-05, - "loss": 2.0085, - "step": 8801 - }, - { - "epoch": 2.72, - "learning_rate": 3.3028137912911366e-05, - "loss": 2.1713, - "step": 8802 - }, - { - "epoch": 2.72, - "learning_rate": 3.2992508853674134e-05, - "loss": 1.851, - "step": 8803 - }, - { - "epoch": 2.72, - "learning_rate": 3.295687979443691e-05, - "loss": 1.842, - "step": 8804 - }, - { - "epoch": 2.72, - "learning_rate": 3.292125073519968e-05, - "loss": 2.1869, - "step": 8805 - }, - { - "epoch": 2.72, - "learning_rate": 3.288562167596245e-05, - "loss": 1.9077, - "step": 8806 - }, - { - "epoch": 2.72, - "learning_rate": 3.284999261672522e-05, - "loss": 1.8607, - "step": 8807 - }, - { - "epoch": 2.72, - "learning_rate": 3.2814363557487996e-05, - "loss": 1.5588, - "step": 8808 - }, - { - "epoch": 2.72, - "learning_rate": 3.2778734498250765e-05, - "loss": 1.7249, - "step": 8809 - }, - { - "epoch": 2.72, - "learning_rate": 3.274310543901353e-05, - "loss": 1.9512, - "step": 8810 - }, - { - "epoch": 2.72, - "learning_rate": 3.270747637977631e-05, - "loss": 1.8065, - "step": 8811 - }, - { - "epoch": 2.72, - "learning_rate": 3.2671847320539076e-05, - "loss": 1.9297, - "step": 8812 - }, - { - "epoch": 2.72, - "learning_rate": 3.2636218261301845e-05, - "loss": 1.8698, - "step": 8813 - }, - { - "epoch": 2.72, - "learning_rate": 3.260058920206462e-05, - "loss": 1.663, - "step": 8814 - }, - { - "epoch": 2.72, - "learning_rate": 3.256496014282739e-05, - "loss": 1.7202, - "step": 8815 - }, - { - "epoch": 2.72, - "learning_rate": 3.2529331083590156e-05, - "loss": 1.4414, - "step": 8816 - }, - { - "epoch": 2.72, - "learning_rate": 3.249370202435293e-05, - "loss": 1.2541, - "step": 8817 - }, - { - "epoch": 2.72, - "learning_rate": 3.24580729651157e-05, - "loss": 1.3554, - "step": 8818 - }, - { - "epoch": 2.72, - "learning_rate": 3.242244390587847e-05, - "loss": 1.4526, - "step": 8819 - }, - { - "epoch": 2.72, - "learning_rate": 3.238681484664124e-05, - "loss": 1.3007, - "step": 8820 - }, - { - "epoch": 2.72, - "learning_rate": 3.235118578740401e-05, - "loss": 1.4615, - "step": 8821 - }, - { - 
"epoch": 2.72, - "learning_rate": 3.231555672816679e-05, - "loss": 1.4956, - "step": 8822 - }, - { - "epoch": 2.72, - "learning_rate": 3.2279927668929555e-05, - "loss": 1.0949, - "step": 8823 - }, - { - "epoch": 2.72, - "learning_rate": 3.224429860969233e-05, - "loss": 1.258, - "step": 8824 - }, - { - "epoch": 2.72, - "learning_rate": 3.22086695504551e-05, - "loss": 1.3014, - "step": 8825 - }, - { - "epoch": 2.72, - "learning_rate": 3.217304049121787e-05, - "loss": 1.0525, - "step": 8826 - }, - { - "epoch": 2.72, - "learning_rate": 3.213741143198064e-05, - "loss": 1.0314, - "step": 8827 - }, - { - "epoch": 2.72, - "learning_rate": 3.210178237274341e-05, - "loss": 1.0685, - "step": 8828 - }, - { - "epoch": 2.72, - "learning_rate": 3.206615331350618e-05, - "loss": 1.0276, - "step": 8829 - }, - { - "epoch": 2.73, - "learning_rate": 3.2030524254268954e-05, - "loss": 0.8688, - "step": 8830 - }, - { - "epoch": 2.73, - "learning_rate": 3.199489519503172e-05, - "loss": 3.5084, - "step": 8831 - }, - { - "epoch": 2.73, - "learning_rate": 3.195926613579449e-05, - "loss": 3.0941, - "step": 8832 - }, - { - "epoch": 2.73, - "learning_rate": 3.1923637076557265e-05, - "loss": 2.9308, - "step": 8833 - }, - { - "epoch": 2.73, - "learning_rate": 3.1888008017320034e-05, - "loss": 2.3758, - "step": 8834 - }, - { - "epoch": 2.73, - "learning_rate": 3.18523789580828e-05, - "loss": 2.3274, - "step": 8835 - }, - { - "epoch": 2.73, - "learning_rate": 3.181674989884558e-05, - "loss": 2.3781, - "step": 8836 - }, - { - "epoch": 2.73, - "learning_rate": 3.1781120839608345e-05, - "loss": 2.5207, - "step": 8837 - }, - { - "epoch": 2.73, - "learning_rate": 3.174549178037112e-05, - "loss": 2.2698, - "step": 8838 - }, - { - "epoch": 2.73, - "learning_rate": 3.1709862721133896e-05, - "loss": 2.2565, - "step": 8839 - }, - { - "epoch": 2.73, - "learning_rate": 3.1674233661896664e-05, - "loss": 2.4983, - "step": 8840 - }, - { - "epoch": 2.73, - "learning_rate": 3.163860460265943e-05, - "loss": 2.6029, - "step": 8841 - }, - { - "epoch": 2.73, - "learning_rate": 3.160297554342221e-05, - "loss": 2.3531, - "step": 8842 - }, - { - "epoch": 2.73, - "learning_rate": 3.1567346484184976e-05, - "loss": 2.1889, - "step": 8843 - }, - { - "epoch": 2.73, - "learning_rate": 3.1531717424947744e-05, - "loss": 1.9821, - "step": 8844 - }, - { - "epoch": 2.73, - "learning_rate": 3.149608836571052e-05, - "loss": 2.2924, - "step": 8845 - }, - { - "epoch": 2.73, - "learning_rate": 3.146045930647329e-05, - "loss": 2.2335, - "step": 8846 - }, - { - "epoch": 2.73, - "learning_rate": 3.1424830247236056e-05, - "loss": 2.038, - "step": 8847 - }, - { - "epoch": 2.73, - "learning_rate": 3.138920118799883e-05, - "loss": 2.1488, - "step": 8848 - }, - { - "epoch": 2.73, - "learning_rate": 3.13535721287616e-05, - "loss": 2.1255, - "step": 8849 - }, - { - "epoch": 2.73, - "learning_rate": 3.131794306952437e-05, - "loss": 2.1363, - "step": 8850 - }, - { - "epoch": 2.73, - "learning_rate": 3.128231401028714e-05, - "loss": 2.1867, - "step": 8851 - }, - { - "epoch": 2.73, - "learning_rate": 3.124668495104991e-05, - "loss": 1.86, - "step": 8852 - }, - { - "epoch": 2.73, - "learning_rate": 3.121105589181268e-05, - "loss": 2.199, - "step": 8853 - }, - { - "epoch": 2.73, - "learning_rate": 3.1175426832575454e-05, - "loss": 2.0914, - "step": 8854 - }, - { - "epoch": 2.73, - "learning_rate": 3.113979777333823e-05, - "loss": 1.7708, - "step": 8855 - }, - { - "epoch": 2.73, - "learning_rate": 3.1104168714101e-05, - "loss": 1.6936, - "step": 8856 - }, - { - "epoch": 2.73, - 
"learning_rate": 3.1068539654863766e-05, - "loss": 1.8469, - "step": 8857 - }, - { - "epoch": 2.73, - "learning_rate": 3.103291059562654e-05, - "loss": 2.1752, - "step": 8858 - }, - { - "epoch": 2.73, - "learning_rate": 3.099728153638931e-05, - "loss": 1.9092, - "step": 8859 - }, - { - "epoch": 2.73, - "learning_rate": 3.096165247715208e-05, - "loss": 1.7292, - "step": 8860 - }, - { - "epoch": 2.73, - "learning_rate": 3.092602341791485e-05, - "loss": 1.8697, - "step": 8861 - }, - { - "epoch": 2.74, - "learning_rate": 3.089039435867762e-05, - "loss": 1.7217, - "step": 8862 - }, - { - "epoch": 2.74, - "learning_rate": 3.085476529944039e-05, - "loss": 1.904, - "step": 8863 - }, - { - "epoch": 2.74, - "learning_rate": 3.0819136240203165e-05, - "loss": 1.4469, - "step": 8864 - }, - { - "epoch": 2.74, - "learning_rate": 3.078350718096593e-05, - "loss": 1.72, - "step": 8865 - }, - { - "epoch": 2.74, - "learning_rate": 3.07478781217287e-05, - "loss": 1.7053, - "step": 8866 - }, - { - "epoch": 2.74, - "learning_rate": 3.0712249062491476e-05, - "loss": 1.5843, - "step": 8867 - }, - { - "epoch": 2.74, - "learning_rate": 3.0676620003254245e-05, - "loss": 1.7706, - "step": 8868 - }, - { - "epoch": 2.74, - "learning_rate": 3.064099094401701e-05, - "loss": 1.3637, - "step": 8869 - }, - { - "epoch": 2.74, - "learning_rate": 3.060536188477979e-05, - "loss": 1.4612, - "step": 8870 - }, - { - "epoch": 2.74, - "learning_rate": 3.056973282554256e-05, - "loss": 1.3208, - "step": 8871 - }, - { - "epoch": 2.74, - "learning_rate": 3.053410376630533e-05, - "loss": 1.2085, - "step": 8872 - }, - { - "epoch": 2.74, - "learning_rate": 3.0498474707068103e-05, - "loss": 1.2714, - "step": 8873 - }, - { - "epoch": 2.74, - "learning_rate": 3.0462845647830875e-05, - "loss": 1.2285, - "step": 8874 - }, - { - "epoch": 2.74, - "learning_rate": 3.0427216588593643e-05, - "loss": 1.0194, - "step": 8875 - }, - { - "epoch": 2.74, - "learning_rate": 3.0391587529356415e-05, - "loss": 1.2047, - "step": 8876 - }, - { - "epoch": 2.74, - "learning_rate": 3.0355958470119187e-05, - "loss": 1.0737, - "step": 8877 - }, - { - "epoch": 2.74, - "learning_rate": 3.0320329410881955e-05, - "loss": 0.9116, - "step": 8878 - }, - { - "epoch": 2.74, - "learning_rate": 3.0284700351644727e-05, - "loss": 1.0016, - "step": 8879 - }, - { - "epoch": 2.74, - "learning_rate": 3.02490712924075e-05, - "loss": 0.9452, - "step": 8880 - }, - { - "epoch": 2.74, - "learning_rate": 3.0213442233170267e-05, - "loss": 3.3411, - "step": 8881 - }, - { - "epoch": 2.74, - "learning_rate": 3.017781317393304e-05, - "loss": 2.9053, - "step": 8882 - }, - { - "epoch": 2.74, - "learning_rate": 3.014218411469581e-05, - "loss": 2.7845, - "step": 8883 - }, - { - "epoch": 2.74, - "learning_rate": 3.010655505545858e-05, - "loss": 2.7785, - "step": 8884 - }, - { - "epoch": 2.74, - "learning_rate": 3.007092599622135e-05, - "loss": 2.7756, - "step": 8885 - }, - { - "epoch": 2.74, - "learning_rate": 3.0035296936984122e-05, - "loss": 2.5861, - "step": 8886 - }, - { - "epoch": 2.74, - "learning_rate": 2.999966787774689e-05, - "loss": 2.4832, - "step": 8887 - }, - { - "epoch": 2.74, - "learning_rate": 2.9964038818509665e-05, - "loss": 2.5987, - "step": 8888 - }, - { - "epoch": 2.74, - "learning_rate": 2.9928409759272437e-05, - "loss": 2.2145, - "step": 8889 - }, - { - "epoch": 2.74, - "learning_rate": 2.989278070003521e-05, - "loss": 2.1036, - "step": 8890 - }, - { - "epoch": 2.74, - "learning_rate": 2.9857151640797977e-05, - "loss": 2.1703, - "step": 8891 - }, - { - "epoch": 2.74, - 
"learning_rate": 2.982152258156075e-05, - "loss": 2.4701, - "step": 8892 - }, - { - "epoch": 2.74, - "learning_rate": 2.978589352232352e-05, - "loss": 2.2195, - "step": 8893 - }, - { - "epoch": 2.75, - "learning_rate": 2.9750264463086292e-05, - "loss": 2.2235, - "step": 8894 - }, - { - "epoch": 2.75, - "learning_rate": 2.971463540384906e-05, - "loss": 2.2967, - "step": 8895 - }, - { - "epoch": 2.75, - "learning_rate": 2.9679006344611832e-05, - "loss": 2.3609, - "step": 8896 - }, - { - "epoch": 2.75, - "learning_rate": 2.9643377285374604e-05, - "loss": 2.1932, - "step": 8897 - }, - { - "epoch": 2.75, - "learning_rate": 2.9607748226137372e-05, - "loss": 2.1367, - "step": 8898 - }, - { - "epoch": 2.75, - "learning_rate": 2.9572119166900144e-05, - "loss": 2.0415, - "step": 8899 - }, - { - "epoch": 2.75, - "learning_rate": 2.9536490107662916e-05, - "loss": 2.208, - "step": 8900 - }, - { - "epoch": 2.75, - "learning_rate": 2.9500861048425684e-05, - "loss": 2.0407, - "step": 8901 - }, - { - "epoch": 2.75, - "learning_rate": 2.9465231989188456e-05, - "loss": 2.103, - "step": 8902 - }, - { - "epoch": 2.75, - "learning_rate": 2.9429602929951228e-05, - "loss": 1.7616, - "step": 8903 - }, - { - "epoch": 2.75, - "learning_rate": 2.9393973870714003e-05, - "loss": 2.1302, - "step": 8904 - }, - { - "epoch": 2.75, - "learning_rate": 2.935834481147677e-05, - "loss": 1.898, - "step": 8905 - }, - { - "epoch": 2.75, - "learning_rate": 2.9322715752239543e-05, - "loss": 2.0871, - "step": 8906 - }, - { - "epoch": 2.75, - "learning_rate": 2.9287086693002314e-05, - "loss": 2.0918, - "step": 8907 - }, - { - "epoch": 2.75, - "learning_rate": 2.9251457633765083e-05, - "loss": 1.7868, - "step": 8908 - }, - { - "epoch": 2.75, - "learning_rate": 2.9215828574527854e-05, - "loss": 1.9503, - "step": 8909 - }, - { - "epoch": 2.75, - "learning_rate": 2.9180199515290626e-05, - "loss": 1.6819, - "step": 8910 - }, - { - "epoch": 2.75, - "learning_rate": 2.9144570456053394e-05, - "loss": 1.8408, - "step": 8911 - }, - { - "epoch": 2.75, - "learning_rate": 2.9108941396816166e-05, - "loss": 1.697, - "step": 8912 - }, - { - "epoch": 2.75, - "learning_rate": 2.9073312337578938e-05, - "loss": 1.4994, - "step": 8913 - }, - { - "epoch": 2.75, - "learning_rate": 2.903768327834171e-05, - "loss": 1.5205, - "step": 8914 - }, - { - "epoch": 2.75, - "learning_rate": 2.9002054219104478e-05, - "loss": 1.6618, - "step": 8915 - }, - { - "epoch": 2.75, - "learning_rate": 2.896642515986725e-05, - "loss": 1.7515, - "step": 8916 - }, - { - "epoch": 2.75, - "learning_rate": 2.893079610063002e-05, - "loss": 1.3434, - "step": 8917 - }, - { - "epoch": 2.75, - "learning_rate": 2.889516704139279e-05, - "loss": 1.5827, - "step": 8918 - }, - { - "epoch": 2.75, - "learning_rate": 2.885953798215556e-05, - "loss": 1.5304, - "step": 8919 - }, - { - "epoch": 2.75, - "learning_rate": 2.8823908922918336e-05, - "loss": 1.3532, - "step": 8920 - }, - { - "epoch": 2.75, - "learning_rate": 2.8788279863681108e-05, - "loss": 1.2369, - "step": 8921 - }, - { - "epoch": 2.75, - "learning_rate": 2.8752650804443876e-05, - "loss": 1.362, - "step": 8922 - }, - { - "epoch": 2.75, - "learning_rate": 2.8717021745206648e-05, - "loss": 1.1346, - "step": 8923 - }, - { - "epoch": 2.75, - "learning_rate": 2.868139268596942e-05, - "loss": 1.126, - "step": 8924 - }, - { - "epoch": 2.75, - "learning_rate": 2.8645763626732188e-05, - "loss": 1.0173, - "step": 8925 - }, - { - "epoch": 2.75, - "learning_rate": 2.861013456749496e-05, - "loss": 1.0653, - "step": 8926 - }, - { - "epoch": 2.76, - 
"learning_rate": 2.857450550825773e-05, - "loss": 1.1413, - "step": 8927 - }, - { - "epoch": 2.76, - "learning_rate": 2.85388764490205e-05, - "loss": 1.0217, - "step": 8928 - }, - { - "epoch": 2.76, - "learning_rate": 2.850324738978327e-05, - "loss": 0.969, - "step": 8929 - }, - { - "epoch": 2.76, - "learning_rate": 2.8467618330546043e-05, - "loss": 1.019, - "step": 8930 - }, - { - "epoch": 2.76, - "learning_rate": 2.843198927130881e-05, - "loss": 3.2161, - "step": 8931 - }, - { - "epoch": 2.76, - "learning_rate": 2.8396360212071583e-05, - "loss": 2.8417, - "step": 8932 - }, - { - "epoch": 2.76, - "learning_rate": 2.8360731152834355e-05, - "loss": 2.7811, - "step": 8933 - }, - { - "epoch": 2.76, - "learning_rate": 2.8325102093597123e-05, - "loss": 2.3102, - "step": 8934 - }, - { - "epoch": 2.76, - "learning_rate": 2.8289473034359895e-05, - "loss": 2.5295, - "step": 8935 - }, - { - "epoch": 2.76, - "learning_rate": 2.8253843975122667e-05, - "loss": 2.4044, - "step": 8936 - }, - { - "epoch": 2.76, - "learning_rate": 2.8218214915885442e-05, - "loss": 2.3383, - "step": 8937 - }, - { - "epoch": 2.76, - "learning_rate": 2.8182585856648214e-05, - "loss": 2.3592, - "step": 8938 - }, - { - "epoch": 2.76, - "learning_rate": 2.8146956797410982e-05, - "loss": 2.1102, - "step": 8939 - }, - { - "epoch": 2.76, - "learning_rate": 2.8111327738173754e-05, - "loss": 2.359, - "step": 8940 - }, - { - "epoch": 2.76, - "learning_rate": 2.8075698678936525e-05, - "loss": 2.3723, - "step": 8941 - }, - { - "epoch": 2.76, - "learning_rate": 2.8040069619699294e-05, - "loss": 2.2799, - "step": 8942 - }, - { - "epoch": 2.76, - "learning_rate": 2.8004440560462065e-05, - "loss": 2.269, - "step": 8943 - }, - { - "epoch": 2.76, - "learning_rate": 2.7968811501224837e-05, - "loss": 2.3846, - "step": 8944 - }, - { - "epoch": 2.76, - "learning_rate": 2.7933182441987605e-05, - "loss": 2.1949, - "step": 8945 - }, - { - "epoch": 2.76, - "learning_rate": 2.7897553382750377e-05, - "loss": 2.1183, - "step": 8946 - }, - { - "epoch": 2.76, - "learning_rate": 2.786192432351315e-05, - "loss": 2.0441, - "step": 8947 - }, - { - "epoch": 2.76, - "learning_rate": 2.7826295264275917e-05, - "loss": 2.221, - "step": 8948 - }, - { - "epoch": 2.76, - "learning_rate": 2.779066620503869e-05, - "loss": 1.7355, - "step": 8949 - }, - { - "epoch": 2.76, - "learning_rate": 2.775503714580146e-05, - "loss": 2.0264, - "step": 8950 - }, - { - "epoch": 2.76, - "learning_rate": 2.771940808656423e-05, - "loss": 1.9366, - "step": 8951 - }, - { - "epoch": 2.76, - "learning_rate": 2.7683779027327e-05, - "loss": 2.0758, - "step": 8952 - }, - { - "epoch": 2.76, - "learning_rate": 2.7648149968089776e-05, - "loss": 2.0584, - "step": 8953 - }, - { - "epoch": 2.76, - "learning_rate": 2.7612520908852547e-05, - "loss": 1.8181, - "step": 8954 - }, - { - "epoch": 2.76, - "learning_rate": 2.7576891849615316e-05, - "loss": 1.9546, - "step": 8955 - }, - { - "epoch": 2.76, - "learning_rate": 2.7541262790378088e-05, - "loss": 1.9036, - "step": 8956 - }, - { - "epoch": 2.76, - "learning_rate": 2.750563373114086e-05, - "loss": 2.015, - "step": 8957 - }, - { - "epoch": 2.76, - "learning_rate": 2.747000467190363e-05, - "loss": 1.9219, - "step": 8958 - }, - { - "epoch": 2.77, - "learning_rate": 2.74343756126664e-05, - "loss": 1.7293, - "step": 8959 - }, - { - "epoch": 2.77, - "learning_rate": 2.739874655342917e-05, - "loss": 1.7931, - "step": 8960 - }, - { - "epoch": 2.77, - "learning_rate": 2.7363117494191943e-05, - "loss": 1.755, - "step": 8961 - }, - { - "epoch": 2.77, - 
"learning_rate": 2.732748843495471e-05, - "loss": 1.7506, - "step": 8962 - }, - { - "epoch": 2.77, - "learning_rate": 2.7291859375717483e-05, - "loss": 2.0728, - "step": 8963 - }, - { - "epoch": 2.77, - "learning_rate": 2.7256230316480254e-05, - "loss": 1.5895, - "step": 8964 - }, - { - "epoch": 2.77, - "learning_rate": 2.7220601257243023e-05, - "loss": 1.6548, - "step": 8965 - }, - { - "epoch": 2.77, - "learning_rate": 2.7184972198005794e-05, - "loss": 1.4509, - "step": 8966 - }, - { - "epoch": 2.77, - "learning_rate": 2.7149343138768566e-05, - "loss": 1.6684, - "step": 8967 - }, - { - "epoch": 2.77, - "learning_rate": 2.7113714079531335e-05, - "loss": 1.4958, - "step": 8968 - }, - { - "epoch": 2.77, - "learning_rate": 2.707808502029411e-05, - "loss": 1.3203, - "step": 8969 - }, - { - "epoch": 2.77, - "learning_rate": 2.704245596105688e-05, - "loss": 1.416, - "step": 8970 - }, - { - "epoch": 2.77, - "learning_rate": 2.7006826901819653e-05, - "loss": 1.4314, - "step": 8971 - }, - { - "epoch": 2.77, - "learning_rate": 2.697119784258242e-05, - "loss": 1.4236, - "step": 8972 - }, - { - "epoch": 2.77, - "learning_rate": 2.6935568783345193e-05, - "loss": 1.2881, - "step": 8973 - }, - { - "epoch": 2.77, - "learning_rate": 2.6899939724107965e-05, - "loss": 1.2562, - "step": 8974 - }, - { - "epoch": 2.77, - "learning_rate": 2.6864310664870733e-05, - "loss": 0.9651, - "step": 8975 - }, - { - "epoch": 2.77, - "learning_rate": 2.6828681605633505e-05, - "loss": 1.0363, - "step": 8976 - }, - { - "epoch": 2.77, - "learning_rate": 2.6793052546396277e-05, - "loss": 1.0563, - "step": 8977 - }, - { - "epoch": 2.77, - "learning_rate": 2.6757423487159045e-05, - "loss": 1.0939, - "step": 8978 - }, - { - "epoch": 2.77, - "learning_rate": 2.6721794427921817e-05, - "loss": 0.8473, - "step": 8979 - }, - { - "epoch": 2.77, - "learning_rate": 2.6686165368684588e-05, - "loss": 1.0253, - "step": 8980 - }, - { - "epoch": 2.77, - "learning_rate": 2.665053630944736e-05, - "loss": 3.0819, - "step": 8981 - }, - { - "epoch": 2.77, - "learning_rate": 2.6614907250210128e-05, - "loss": 2.7862, - "step": 8982 - }, - { - "epoch": 2.77, - "learning_rate": 2.65792781909729e-05, - "loss": 3.0111, - "step": 8983 - }, - { - "epoch": 2.77, - "learning_rate": 2.6543649131735672e-05, - "loss": 2.7109, - "step": 8984 - }, - { - "epoch": 2.77, - "learning_rate": 2.650802007249844e-05, - "loss": 2.61, - "step": 8985 - }, - { - "epoch": 2.77, - "learning_rate": 2.6472391013261215e-05, - "loss": 2.0676, - "step": 8986 - }, - { - "epoch": 2.77, - "learning_rate": 2.6436761954023987e-05, - "loss": 2.7454, - "step": 8987 - }, - { - "epoch": 2.77, - "learning_rate": 2.640113289478676e-05, - "loss": 2.092, - "step": 8988 - }, - { - "epoch": 2.77, - "learning_rate": 2.6365503835549527e-05, - "loss": 2.4148, - "step": 8989 - }, - { - "epoch": 2.77, - "learning_rate": 2.63298747763123e-05, - "loss": 2.1129, - "step": 8990 - }, - { - "epoch": 2.77, - "learning_rate": 2.629424571707507e-05, - "loss": 2.2946, - "step": 8991 - }, - { - "epoch": 2.78, - "learning_rate": 2.625861665783784e-05, - "loss": 2.2556, - "step": 8992 - }, - { - "epoch": 2.78, - "learning_rate": 2.622298759860061e-05, - "loss": 2.2588, - "step": 8993 - }, - { - "epoch": 2.78, - "learning_rate": 2.6187358539363382e-05, - "loss": 2.053, - "step": 8994 - }, - { - "epoch": 2.78, - "learning_rate": 2.615172948012615e-05, - "loss": 2.323, - "step": 8995 - }, - { - "epoch": 2.78, - "learning_rate": 2.6116100420888922e-05, - "loss": 2.0555, - "step": 8996 - }, - { - "epoch": 2.78, - 
"learning_rate": 2.6080471361651694e-05, - "loss": 1.8683, - "step": 8997 - }, - { - "epoch": 2.78, - "learning_rate": 2.6044842302414462e-05, - "loss": 2.2882, - "step": 8998 - }, - { - "epoch": 2.78, - "learning_rate": 2.6009213243177234e-05, - "loss": 2.1629, - "step": 8999 - }, - { - "epoch": 2.78, - "learning_rate": 2.5973584183940006e-05, - "loss": 2.1419, - "step": 9000 - }, - { - "epoch": 2.78, - "eval_bleu": 3.7749272962721704e-15, - "eval_loss": 3.1467533111572266, - "eval_runtime": 2628.7198, - "eval_samples_per_second": 5.615, - "eval_steps_per_second": 0.702, - "step": 9000 - }, - { - "epoch": 2.78, - "learning_rate": 2.5937955124702777e-05, - "loss": 1.97, - "step": 9001 - }, - { - "epoch": 2.78, - "learning_rate": 2.5902326065465552e-05, - "loss": 2.1375, - "step": 9002 - }, - { - "epoch": 2.78, - "learning_rate": 2.586669700622832e-05, - "loss": 2.2439, - "step": 9003 - }, - { - "epoch": 2.78, - "learning_rate": 2.5831067946991092e-05, - "loss": 1.6859, - "step": 9004 - }, - { - "epoch": 2.78, - "learning_rate": 2.5795438887753864e-05, - "loss": 2.096, - "step": 9005 - }, - { - "epoch": 2.78, - "learning_rate": 2.5759809828516632e-05, - "loss": 1.8178, - "step": 9006 - }, - { - "epoch": 2.78, - "learning_rate": 2.5724180769279404e-05, - "loss": 1.8452, - "step": 9007 - }, - { - "epoch": 2.78, - "learning_rate": 2.5688551710042176e-05, - "loss": 1.6782, - "step": 9008 - }, - { - "epoch": 2.78, - "learning_rate": 2.5652922650804944e-05, - "loss": 1.7384, - "step": 9009 - }, - { - "epoch": 2.78, - "learning_rate": 2.5617293591567716e-05, - "loss": 1.7038, - "step": 9010 - }, - { - "epoch": 2.78, - "learning_rate": 2.5581664532330488e-05, - "loss": 1.8304, - "step": 9011 - }, - { - "epoch": 2.78, - "learning_rate": 2.5546035473093256e-05, - "loss": 1.6061, - "step": 9012 - }, - { - "epoch": 2.78, - "learning_rate": 2.5510406413856028e-05, - "loss": 1.8001, - "step": 9013 - }, - { - "epoch": 2.78, - "learning_rate": 2.54747773546188e-05, - "loss": 1.6911, - "step": 9014 - }, - { - "epoch": 2.78, - "learning_rate": 2.5439148295381568e-05, - "loss": 1.3874, - "step": 9015 - }, - { - "epoch": 2.78, - "learning_rate": 2.540351923614434e-05, - "loss": 1.3692, - "step": 9016 - }, - { - "epoch": 2.78, - "learning_rate": 2.536789017690711e-05, - "loss": 1.4226, - "step": 9017 - }, - { - "epoch": 2.78, - "learning_rate": 2.5332261117669886e-05, - "loss": 1.3455, - "step": 9018 - }, - { - "epoch": 2.78, - "learning_rate": 2.5296632058432654e-05, - "loss": 1.2716, - "step": 9019 - }, - { - "epoch": 2.78, - "learning_rate": 2.5261002999195426e-05, - "loss": 1.5093, - "step": 9020 - }, - { - "epoch": 2.78, - "learning_rate": 2.5225373939958198e-05, - "loss": 1.3959, - "step": 9021 - }, - { - "epoch": 2.78, - "learning_rate": 2.5189744880720966e-05, - "loss": 1.3418, - "step": 9022 - }, - { - "epoch": 2.78, - "learning_rate": 2.5154115821483738e-05, - "loss": 1.1889, - "step": 9023 - }, - { - "epoch": 2.79, - "learning_rate": 2.511848676224651e-05, - "loss": 1.1683, - "step": 9024 - }, - { - "epoch": 2.79, - "learning_rate": 2.508285770300928e-05, - "loss": 1.1083, - "step": 9025 - }, - { - "epoch": 2.79, - "learning_rate": 2.504722864377205e-05, - "loss": 1.0367, - "step": 9026 - }, - { - "epoch": 2.79, - "learning_rate": 2.501159958453482e-05, - "loss": 1.1692, - "step": 9027 - }, - { - "epoch": 2.79, - "learning_rate": 2.4975970525297593e-05, - "loss": 0.9068, - "step": 9028 - }, - { - "epoch": 2.79, - "learning_rate": 2.494034146606036e-05, - "loss": 1.0479, - "step": 9029 - }, - { - 
"epoch": 2.79, - "learning_rate": 2.4904712406823133e-05, - "loss": 1.0714, - "step": 9030 - }, - { - "epoch": 2.79, - "learning_rate": 2.4869083347585905e-05, - "loss": 3.3258, - "step": 9031 - }, - { - "epoch": 2.79, - "learning_rate": 2.4833454288348673e-05, - "loss": 2.8721, - "step": 9032 - }, - { - "epoch": 2.79, - "learning_rate": 2.4797825229111445e-05, - "loss": 2.6033, - "step": 9033 - }, - { - "epoch": 2.79, - "learning_rate": 2.4762196169874217e-05, - "loss": 2.5923, - "step": 9034 - }, - { - "epoch": 2.79, - "learning_rate": 2.4726567110636992e-05, - "loss": 2.4301, - "step": 9035 - }, - { - "epoch": 2.79, - "learning_rate": 2.469093805139976e-05, - "loss": 2.4921, - "step": 9036 - }, - { - "epoch": 2.79, - "learning_rate": 2.4655308992162532e-05, - "loss": 2.5695, - "step": 9037 - }, - { - "epoch": 2.79, - "learning_rate": 2.4619679932925303e-05, - "loss": 2.4802, - "step": 9038 - }, - { - "epoch": 2.79, - "learning_rate": 2.4584050873688072e-05, - "loss": 2.1631, - "step": 9039 - }, - { - "epoch": 2.79, - "learning_rate": 2.4548421814450843e-05, - "loss": 2.2621, - "step": 9040 - }, - { - "epoch": 2.79, - "learning_rate": 2.4512792755213615e-05, - "loss": 2.3666, - "step": 9041 - }, - { - "epoch": 2.79, - "learning_rate": 2.4477163695976384e-05, - "loss": 2.185, - "step": 9042 - }, - { - "epoch": 2.79, - "learning_rate": 2.4441534636739155e-05, - "loss": 2.1612, - "step": 9043 - }, - { - "epoch": 2.79, - "learning_rate": 2.4405905577501927e-05, - "loss": 2.1222, - "step": 9044 - }, - { - "epoch": 2.79, - "learning_rate": 2.43702765182647e-05, - "loss": 2.5561, - "step": 9045 - }, - { - "epoch": 2.79, - "learning_rate": 2.4334647459027467e-05, - "loss": 2.2493, - "step": 9046 - }, - { - "epoch": 2.79, - "learning_rate": 2.429901839979024e-05, - "loss": 2.28, - "step": 9047 - }, - { - "epoch": 2.79, - "learning_rate": 2.426338934055301e-05, - "loss": 1.9667, - "step": 9048 - }, - { - "epoch": 2.79, - "learning_rate": 2.422776028131578e-05, - "loss": 1.9692, - "step": 9049 - }, - { - "epoch": 2.79, - "learning_rate": 2.419213122207855e-05, - "loss": 2.1087, - "step": 9050 - }, - { - "epoch": 2.79, - "learning_rate": 2.4156502162841326e-05, - "loss": 1.8012, - "step": 9051 - }, - { - "epoch": 2.79, - "learning_rate": 2.4120873103604097e-05, - "loss": 1.9767, - "step": 9052 - }, - { - "epoch": 2.79, - "learning_rate": 2.4085244044366866e-05, - "loss": 1.9178, - "step": 9053 - }, - { - "epoch": 2.79, - "learning_rate": 2.4049614985129637e-05, - "loss": 1.8851, - "step": 9054 - }, - { - "epoch": 2.79, - "learning_rate": 2.401398592589241e-05, - "loss": 1.9495, - "step": 9055 - }, - { - "epoch": 2.8, - "learning_rate": 2.3978356866655177e-05, - "loss": 2.0378, - "step": 9056 - }, - { - "epoch": 2.8, - "learning_rate": 2.394272780741795e-05, - "loss": 1.8832, - "step": 9057 - }, - { - "epoch": 2.8, - "learning_rate": 2.390709874818072e-05, - "loss": 1.7432, - "step": 9058 - }, - { - "epoch": 2.8, - "learning_rate": 2.387146968894349e-05, - "loss": 1.838, - "step": 9059 - }, - { - "epoch": 2.8, - "learning_rate": 2.383584062970626e-05, - "loss": 1.818, - "step": 9060 - }, - { - "epoch": 2.8, - "learning_rate": 2.3800211570469032e-05, - "loss": 1.6792, - "step": 9061 - }, - { - "epoch": 2.8, - "learning_rate": 2.37645825112318e-05, - "loss": 1.775, - "step": 9062 - }, - { - "epoch": 2.8, - "learning_rate": 2.3728953451994572e-05, - "loss": 1.6518, - "step": 9063 - }, - { - "epoch": 2.8, - "learning_rate": 2.3693324392757344e-05, - "loss": 1.2475, - "step": 9064 - }, - { - "epoch": 2.8, 
- "learning_rate": 2.3657695333520113e-05, - "loss": 1.6546, - "step": 9065 - }, - { - "epoch": 2.8, - "learning_rate": 2.3622066274282884e-05, - "loss": 1.5611, - "step": 9066 - }, - { - "epoch": 2.8, - "learning_rate": 2.358643721504566e-05, - "loss": 1.5995, - "step": 9067 - }, - { - "epoch": 2.8, - "learning_rate": 2.355080815580843e-05, - "loss": 1.316, - "step": 9068 - }, - { - "epoch": 2.8, - "learning_rate": 2.3515179096571203e-05, - "loss": 1.1295, - "step": 9069 - }, - { - "epoch": 2.8, - "learning_rate": 2.347955003733397e-05, - "loss": 1.2791, - "step": 9070 - }, - { - "epoch": 2.8, - "learning_rate": 2.3443920978096743e-05, - "loss": 1.2844, - "step": 9071 - }, - { - "epoch": 2.8, - "learning_rate": 2.3408291918859515e-05, - "loss": 1.1046, - "step": 9072 - }, - { - "epoch": 2.8, - "learning_rate": 2.3372662859622283e-05, - "loss": 1.1296, - "step": 9073 - }, - { - "epoch": 2.8, - "learning_rate": 2.3337033800385055e-05, - "loss": 1.0497, - "step": 9074 - }, - { - "epoch": 2.8, - "learning_rate": 2.3301404741147826e-05, - "loss": 1.4272, - "step": 9075 - }, - { - "epoch": 2.8, - "learning_rate": 2.3265775681910595e-05, - "loss": 1.1721, - "step": 9076 - }, - { - "epoch": 2.8, - "learning_rate": 2.3230146622673366e-05, - "loss": 1.0827, - "step": 9077 - }, - { - "epoch": 2.8, - "learning_rate": 2.3194517563436138e-05, - "loss": 1.0484, - "step": 9078 - }, - { - "epoch": 2.8, - "learning_rate": 2.3158888504198906e-05, - "loss": 0.8893, - "step": 9079 - }, - { - "epoch": 2.8, - "learning_rate": 2.3123259444961678e-05, - "loss": 1.1005, - "step": 9080 - }, - { - "epoch": 2.8, - "learning_rate": 2.308763038572445e-05, - "loss": 3.1303, - "step": 9081 - }, - { - "epoch": 2.8, - "learning_rate": 2.3052001326487218e-05, - "loss": 2.8084, - "step": 9082 - }, - { - "epoch": 2.8, - "learning_rate": 2.301637226724999e-05, - "loss": 2.5695, - "step": 9083 - }, - { - "epoch": 2.8, - "learning_rate": 2.2980743208012765e-05, - "loss": 2.5344, - "step": 9084 - }, - { - "epoch": 2.8, - "learning_rate": 2.2945114148775537e-05, - "loss": 2.677, - "step": 9085 - }, - { - "epoch": 2.8, - "learning_rate": 2.2909485089538305e-05, - "loss": 2.6768, - "step": 9086 - }, - { - "epoch": 2.8, - "learning_rate": 2.2873856030301077e-05, - "loss": 2.2205, - "step": 9087 - }, - { - "epoch": 2.8, - "learning_rate": 2.283822697106385e-05, - "loss": 2.2422, - "step": 9088 - }, - { - "epoch": 2.81, - "learning_rate": 2.280259791182662e-05, - "loss": 2.4955, - "step": 9089 - }, - { - "epoch": 2.81, - "learning_rate": 2.276696885258939e-05, - "loss": 1.9918, - "step": 9090 - }, - { - "epoch": 2.81, - "learning_rate": 2.273133979335216e-05, - "loss": 2.1013, - "step": 9091 - }, - { - "epoch": 2.81, - "learning_rate": 2.2695710734114932e-05, - "loss": 2.6919, - "step": 9092 - }, - { - "epoch": 2.81, - "learning_rate": 2.26600816748777e-05, - "loss": 2.2238, - "step": 9093 - }, - { - "epoch": 2.81, - "learning_rate": 2.2624452615640472e-05, - "loss": 2.1374, - "step": 9094 - }, - { - "epoch": 2.81, - "learning_rate": 2.2588823556403244e-05, - "loss": 2.2546, - "step": 9095 - }, - { - "epoch": 2.81, - "learning_rate": 2.2553194497166012e-05, - "loss": 2.4077, - "step": 9096 - }, - { - "epoch": 2.81, - "learning_rate": 2.2517565437928784e-05, - "loss": 2.2144, - "step": 9097 - }, - { - "epoch": 2.81, - "learning_rate": 2.2481936378691555e-05, - "loss": 2.4087, - "step": 9098 - }, - { - "epoch": 2.81, - "learning_rate": 2.2446307319454324e-05, - "loss": 2.1558, - "step": 9099 - }, - { - "epoch": 2.81, - "learning_rate": 
2.24106782602171e-05, - "loss": 1.9806, - "step": 9100 - }, - { - "epoch": 2.81, - "learning_rate": 2.237504920097987e-05, - "loss": 1.867, - "step": 9101 - }, - { - "epoch": 2.81, - "learning_rate": 2.2339420141742642e-05, - "loss": 1.9735, - "step": 9102 - }, - { - "epoch": 2.81, - "learning_rate": 2.230379108250541e-05, - "loss": 2.0857, - "step": 9103 - }, - { - "epoch": 2.81, - "learning_rate": 2.2268162023268182e-05, - "loss": 1.9958, - "step": 9104 - }, - { - "epoch": 2.81, - "learning_rate": 2.2232532964030954e-05, - "loss": 1.8067, - "step": 9105 - }, - { - "epoch": 2.81, - "learning_rate": 2.2196903904793722e-05, - "loss": 1.9471, - "step": 9106 - }, - { - "epoch": 2.81, - "learning_rate": 2.2161274845556494e-05, - "loss": 1.7782, - "step": 9107 - }, - { - "epoch": 2.81, - "learning_rate": 2.2125645786319266e-05, - "loss": 1.7547, - "step": 9108 - }, - { - "epoch": 2.81, - "learning_rate": 2.2090016727082034e-05, - "loss": 1.7654, - "step": 9109 - }, - { - "epoch": 2.81, - "learning_rate": 2.2054387667844806e-05, - "loss": 1.7527, - "step": 9110 - }, - { - "epoch": 2.81, - "learning_rate": 2.2018758608607577e-05, - "loss": 1.6685, - "step": 9111 - }, - { - "epoch": 2.81, - "learning_rate": 2.198312954937035e-05, - "loss": 1.9333, - "step": 9112 - }, - { - "epoch": 2.81, - "learning_rate": 2.1947500490133117e-05, - "loss": 1.5426, - "step": 9113 - }, - { - "epoch": 2.81, - "learning_rate": 2.191187143089589e-05, - "loss": 1.5614, - "step": 9114 - }, - { - "epoch": 2.81, - "learning_rate": 2.187624237165866e-05, - "loss": 1.5731, - "step": 9115 - }, - { - "epoch": 2.81, - "learning_rate": 2.1840613312421436e-05, - "loss": 1.4687, - "step": 9116 - }, - { - "epoch": 2.81, - "learning_rate": 2.1804984253184204e-05, - "loss": 1.3526, - "step": 9117 - }, - { - "epoch": 2.81, - "learning_rate": 2.1769355193946976e-05, - "loss": 1.5758, - "step": 9118 - }, - { - "epoch": 2.81, - "learning_rate": 2.1733726134709748e-05, - "loss": 1.2259, - "step": 9119 - }, - { - "epoch": 2.81, - "learning_rate": 2.1698097075472516e-05, - "loss": 1.2875, - "step": 9120 - }, - { - "epoch": 2.82, - "learning_rate": 2.1662468016235288e-05, - "loss": 1.3442, - "step": 9121 - }, - { - "epoch": 2.82, - "learning_rate": 2.162683895699806e-05, - "loss": 1.0998, - "step": 9122 - }, - { - "epoch": 2.82, - "learning_rate": 2.1591209897760828e-05, - "loss": 1.3629, - "step": 9123 - }, - { - "epoch": 2.82, - "learning_rate": 2.15555808385236e-05, - "loss": 1.035, - "step": 9124 - }, - { - "epoch": 2.82, - "learning_rate": 2.151995177928637e-05, - "loss": 1.1485, - "step": 9125 - }, - { - "epoch": 2.82, - "learning_rate": 2.148432272004914e-05, - "loss": 1.0344, - "step": 9126 - }, - { - "epoch": 2.82, - "learning_rate": 2.144869366081191e-05, - "loss": 1.0704, - "step": 9127 - }, - { - "epoch": 2.82, - "learning_rate": 2.1413064601574683e-05, - "loss": 0.916, - "step": 9128 - }, - { - "epoch": 2.82, - "learning_rate": 2.137743554233745e-05, - "loss": 1.0182, - "step": 9129 - }, - { - "epoch": 2.82, - "learning_rate": 2.1341806483100223e-05, - "loss": 1.0006, - "step": 9130 - }, - { - "epoch": 2.82, - "learning_rate": 2.1306177423862995e-05, - "loss": 3.3531, - "step": 9131 - }, - { - "epoch": 2.82, - "learning_rate": 2.1270548364625766e-05, - "loss": 3.0445, - "step": 9132 - }, - { - "epoch": 2.82, - "learning_rate": 2.123491930538854e-05, - "loss": 2.4459, - "step": 9133 - }, - { - "epoch": 2.82, - "learning_rate": 2.119929024615131e-05, - "loss": 2.5358, - "step": 9134 - }, - { - "epoch": 2.82, - "learning_rate": 
2.116366118691408e-05, - "loss": 2.1953, - "step": 9135 - }, - { - "epoch": 2.82, - "learning_rate": 2.1128032127676853e-05, - "loss": 2.3559, - "step": 9136 - }, - { - "epoch": 2.82, - "learning_rate": 2.109240306843962e-05, - "loss": 2.4563, - "step": 9137 - }, - { - "epoch": 2.82, - "learning_rate": 2.1056774009202393e-05, - "loss": 2.3574, - "step": 9138 - }, - { - "epoch": 2.82, - "learning_rate": 2.1021144949965165e-05, - "loss": 2.0433, - "step": 9139 - }, - { - "epoch": 2.82, - "learning_rate": 2.0985515890727933e-05, - "loss": 2.3527, - "step": 9140 - }, - { - "epoch": 2.82, - "learning_rate": 2.0949886831490705e-05, - "loss": 2.3369, - "step": 9141 - }, - { - "epoch": 2.82, - "learning_rate": 2.0914257772253477e-05, - "loss": 2.4, - "step": 9142 - }, - { - "epoch": 2.82, - "learning_rate": 2.0878628713016245e-05, - "loss": 2.0807, - "step": 9143 - }, - { - "epoch": 2.82, - "learning_rate": 2.0842999653779017e-05, - "loss": 2.3883, - "step": 9144 - }, - { - "epoch": 2.82, - "learning_rate": 2.080737059454179e-05, - "loss": 2.362, - "step": 9145 - }, - { - "epoch": 2.82, - "learning_rate": 2.0771741535304557e-05, - "loss": 2.0861, - "step": 9146 - }, - { - "epoch": 2.82, - "learning_rate": 2.073611247606733e-05, - "loss": 2.0139, - "step": 9147 - }, - { - "epoch": 2.82, - "learning_rate": 2.07004834168301e-05, - "loss": 2.0064, - "step": 9148 - }, - { - "epoch": 2.82, - "learning_rate": 2.0664854357592875e-05, - "loss": 2.1708, - "step": 9149 - }, - { - "epoch": 2.82, - "learning_rate": 2.0629225298355644e-05, - "loss": 2.0394, - "step": 9150 - }, - { - "epoch": 2.82, - "learning_rate": 2.0593596239118415e-05, - "loss": 1.9574, - "step": 9151 - }, - { - "epoch": 2.82, - "learning_rate": 2.0557967179881187e-05, - "loss": 1.8588, - "step": 9152 - }, - { - "epoch": 2.82, - "learning_rate": 2.052233812064396e-05, - "loss": 2.0818, - "step": 9153 - }, - { - "epoch": 2.83, - "learning_rate": 2.0486709061406727e-05, - "loss": 1.9408, - "step": 9154 - }, - { - "epoch": 2.83, - "learning_rate": 2.04510800021695e-05, - "loss": 1.9927, - "step": 9155 - }, - { - "epoch": 2.83, - "learning_rate": 2.041545094293227e-05, - "loss": 1.9435, - "step": 9156 - }, - { - "epoch": 2.83, - "learning_rate": 2.037982188369504e-05, - "loss": 2.0422, - "step": 9157 - }, - { - "epoch": 2.83, - "learning_rate": 2.034419282445781e-05, - "loss": 1.8605, - "step": 9158 - }, - { - "epoch": 2.83, - "learning_rate": 2.0308563765220582e-05, - "loss": 1.8695, - "step": 9159 - }, - { - "epoch": 2.83, - "learning_rate": 2.027293470598335e-05, - "loss": 1.5394, - "step": 9160 - }, - { - "epoch": 2.83, - "learning_rate": 2.0237305646746126e-05, - "loss": 1.7766, - "step": 9161 - }, - { - "epoch": 2.83, - "learning_rate": 2.0201676587508894e-05, - "loss": 1.7102, - "step": 9162 - }, - { - "epoch": 2.83, - "learning_rate": 2.0166047528271666e-05, - "loss": 1.5642, - "step": 9163 - }, - { - "epoch": 2.83, - "learning_rate": 2.0130418469034437e-05, - "loss": 1.7443, - "step": 9164 - }, - { - "epoch": 2.83, - "learning_rate": 2.0094789409797206e-05, - "loss": 1.4618, - "step": 9165 - }, - { - "epoch": 2.83, - "learning_rate": 2.0059160350559977e-05, - "loss": 1.3652, - "step": 9166 - }, - { - "epoch": 2.83, - "learning_rate": 2.002353129132275e-05, - "loss": 1.4272, - "step": 9167 - }, - { - "epoch": 2.83, - "learning_rate": 1.9987902232085517e-05, - "loss": 1.4589, - "step": 9168 - }, - { - "epoch": 2.83, - "learning_rate": 1.995227317284829e-05, - "loss": 1.2943, - "step": 9169 - }, - { - "epoch": 2.83, - "learning_rate": 
1.991664411361106e-05, - "loss": 1.4461, - "step": 9170 - }, - { - "epoch": 2.83, - "learning_rate": 1.9881015054373833e-05, - "loss": 1.4581, - "step": 9171 - }, - { - "epoch": 2.83, - "learning_rate": 1.9845385995136604e-05, - "loss": 1.3286, - "step": 9172 - }, - { - "epoch": 2.83, - "learning_rate": 1.9809756935899373e-05, - "loss": 1.2587, - "step": 9173 - }, - { - "epoch": 2.83, - "learning_rate": 1.9774127876662144e-05, - "loss": 1.0284, - "step": 9174 - }, - { - "epoch": 2.83, - "learning_rate": 1.9738498817424916e-05, - "loss": 1.1601, - "step": 9175 - }, - { - "epoch": 2.83, - "learning_rate": 1.9702869758187688e-05, - "loss": 1.2744, - "step": 9176 - }, - { - "epoch": 2.83, - "learning_rate": 1.9667240698950456e-05, - "loss": 1.0123, - "step": 9177 - }, - { - "epoch": 2.83, - "learning_rate": 1.963161163971323e-05, - "loss": 1.1021, - "step": 9178 - }, - { - "epoch": 2.83, - "learning_rate": 1.9595982580476e-05, - "loss": 0.915, - "step": 9179 - }, - { - "epoch": 2.83, - "learning_rate": 1.956035352123877e-05, - "loss": 0.9215, - "step": 9180 - }, - { - "epoch": 2.83, - "learning_rate": 1.9524724462001543e-05, - "loss": 3.1756, - "step": 9181 - }, - { - "epoch": 2.83, - "learning_rate": 1.948909540276431e-05, - "loss": 2.6713, - "step": 9182 - }, - { - "epoch": 2.83, - "learning_rate": 1.9453466343527083e-05, - "loss": 2.7708, - "step": 9183 - }, - { - "epoch": 2.83, - "learning_rate": 1.9417837284289855e-05, - "loss": 2.4908, - "step": 9184 - }, - { - "epoch": 2.83, - "learning_rate": 1.9382208225052623e-05, - "loss": 2.5566, - "step": 9185 - }, - { - "epoch": 2.84, - "learning_rate": 1.9346579165815398e-05, - "loss": 2.3188, - "step": 9186 - }, - { - "epoch": 2.84, - "learning_rate": 1.9310950106578166e-05, - "loss": 2.6201, - "step": 9187 - }, - { - "epoch": 2.84, - "learning_rate": 1.9275321047340938e-05, - "loss": 2.2284, - "step": 9188 - }, - { - "epoch": 2.84, - "learning_rate": 1.923969198810371e-05, - "loss": 2.4743, - "step": 9189 - }, - { - "epoch": 2.84, - "learning_rate": 1.9204062928866478e-05, - "loss": 2.0805, - "step": 9190 - }, - { - "epoch": 2.84, - "learning_rate": 1.916843386962925e-05, - "loss": 2.1274, - "step": 9191 - }, - { - "epoch": 2.84, - "learning_rate": 1.913280481039202e-05, - "loss": 2.1963, - "step": 9192 - }, - { - "epoch": 2.84, - "learning_rate": 1.909717575115479e-05, - "loss": 2.2713, - "step": 9193 - }, - { - "epoch": 2.84, - "learning_rate": 1.9061546691917565e-05, - "loss": 2.211, - "step": 9194 - }, - { - "epoch": 2.84, - "learning_rate": 1.9025917632680333e-05, - "loss": 2.1541, - "step": 9195 - }, - { - "epoch": 2.84, - "learning_rate": 1.8990288573443105e-05, - "loss": 2.1459, - "step": 9196 - }, - { - "epoch": 2.84, - "learning_rate": 1.8954659514205877e-05, - "loss": 1.9152, - "step": 9197 - }, - { - "epoch": 2.84, - "learning_rate": 1.891903045496865e-05, - "loss": 1.9448, - "step": 9198 - }, - { - "epoch": 2.84, - "learning_rate": 1.8883401395731417e-05, - "loss": 1.888, - "step": 9199 - }, - { - "epoch": 2.84, - "learning_rate": 1.884777233649419e-05, - "loss": 2.0033, - "step": 9200 - }, - { - "epoch": 2.84, - "learning_rate": 1.881214327725696e-05, - "loss": 1.8299, - "step": 9201 - }, - { - "epoch": 2.84, - "learning_rate": 1.8776514218019732e-05, - "loss": 1.7124, - "step": 9202 - }, - { - "epoch": 2.84, - "learning_rate": 1.8740885158782504e-05, - "loss": 1.8388, - "step": 9203 - }, - { - "epoch": 2.84, - "learning_rate": 1.8705256099545272e-05, - "loss": 1.9312, - "step": 9204 - }, - { - "epoch": 2.84, - "learning_rate": 
1.8669627040308044e-05, - "loss": 1.7445, - "step": 9205 - }, - { - "epoch": 2.84, - "learning_rate": 1.8633997981070815e-05, - "loss": 1.9263, - "step": 9206 - }, - { - "epoch": 2.84, - "learning_rate": 1.8598368921833584e-05, - "loss": 1.7701, - "step": 9207 - }, - { - "epoch": 2.84, - "learning_rate": 1.8562739862596355e-05, - "loss": 1.6873, - "step": 9208 - }, - { - "epoch": 2.84, - "learning_rate": 1.8527110803359127e-05, - "loss": 1.8336, - "step": 9209 - }, - { - "epoch": 2.84, - "learning_rate": 1.84914817441219e-05, - "loss": 1.609, - "step": 9210 - }, - { - "epoch": 2.84, - "learning_rate": 1.845585268488467e-05, - "loss": 1.8059, - "step": 9211 - }, - { - "epoch": 2.84, - "learning_rate": 1.842022362564744e-05, - "loss": 1.7066, - "step": 9212 - }, - { - "epoch": 2.84, - "learning_rate": 1.838459456641021e-05, - "loss": 1.7682, - "step": 9213 - }, - { - "epoch": 2.84, - "learning_rate": 1.8348965507172982e-05, - "loss": 1.4516, - "step": 9214 - }, - { - "epoch": 2.84, - "learning_rate": 1.831333644793575e-05, - "loss": 1.6452, - "step": 9215 - }, - { - "epoch": 2.84, - "learning_rate": 1.8277707388698522e-05, - "loss": 1.3835, - "step": 9216 - }, - { - "epoch": 2.84, - "learning_rate": 1.8242078329461294e-05, - "loss": 1.4088, - "step": 9217 - }, - { - "epoch": 2.84, - "learning_rate": 1.8206449270224062e-05, - "loss": 1.3198, - "step": 9218 - }, - { - "epoch": 2.85, - "learning_rate": 1.8170820210986837e-05, - "loss": 1.4678, - "step": 9219 - }, - { - "epoch": 2.85, - "learning_rate": 1.813519115174961e-05, - "loss": 1.4052, - "step": 9220 - }, - { - "epoch": 2.85, - "learning_rate": 1.8099562092512377e-05, - "loss": 1.3398, - "step": 9221 - }, - { - "epoch": 2.85, - "learning_rate": 1.806393303327515e-05, - "loss": 1.2748, - "step": 9222 - }, - { - "epoch": 2.85, - "learning_rate": 1.802830397403792e-05, - "loss": 1.1996, - "step": 9223 - }, - { - "epoch": 2.85, - "learning_rate": 1.799267491480069e-05, - "loss": 1.2748, - "step": 9224 - }, - { - "epoch": 2.85, - "learning_rate": 1.795704585556346e-05, - "loss": 1.0661, - "step": 9225 - }, - { - "epoch": 2.85, - "learning_rate": 1.7921416796326233e-05, - "loss": 1.1277, - "step": 9226 - }, - { - "epoch": 2.85, - "learning_rate": 1.7885787737089004e-05, - "loss": 0.9891, - "step": 9227 - }, - { - "epoch": 2.85, - "learning_rate": 1.7850158677851776e-05, - "loss": 1.0338, - "step": 9228 - }, - { - "epoch": 2.85, - "learning_rate": 1.7814529618614544e-05, - "loss": 0.9677, - "step": 9229 - }, - { - "epoch": 2.85, - "learning_rate": 1.7778900559377316e-05, - "loss": 0.937, - "step": 9230 - }, - { - "epoch": 2.85, - "learning_rate": 1.7743271500140088e-05, - "loss": 3.4525, - "step": 9231 - }, - { - "epoch": 2.85, - "learning_rate": 1.7707642440902856e-05, - "loss": 2.9205, - "step": 9232 - }, - { - "epoch": 2.85, - "learning_rate": 1.7672013381665628e-05, - "loss": 2.8542, - "step": 9233 - }, - { - "epoch": 2.85, - "learning_rate": 1.76363843224284e-05, - "loss": 2.453, - "step": 9234 - }, - { - "epoch": 2.85, - "learning_rate": 1.760075526319117e-05, - "loss": 2.4218, - "step": 9235 - }, - { - "epoch": 2.85, - "learning_rate": 1.7565126203953943e-05, - "loss": 2.3871, - "step": 9236 - }, - { - "epoch": 2.85, - "learning_rate": 1.752949714471671e-05, - "loss": 2.375, - "step": 9237 - }, - { - "epoch": 2.85, - "learning_rate": 1.7493868085479483e-05, - "loss": 2.4762, - "step": 9238 - }, - { - "epoch": 2.85, - "learning_rate": 1.7458239026242255e-05, - "loss": 2.6032, - "step": 9239 - }, - { - "epoch": 2.85, - "learning_rate": 
1.7422609967005026e-05, - "loss": 2.0634, - "step": 9240 - }, - { - "epoch": 2.85, - "learning_rate": 1.7386980907767795e-05, - "loss": 2.6034, - "step": 9241 - }, - { - "epoch": 2.85, - "learning_rate": 1.7351351848530566e-05, - "loss": 2.0552, - "step": 9242 - }, - { - "epoch": 2.85, - "learning_rate": 1.7315722789293338e-05, - "loss": 2.2523, - "step": 9243 - }, - { - "epoch": 2.85, - "learning_rate": 1.728009373005611e-05, - "loss": 2.1175, - "step": 9244 - }, - { - "epoch": 2.85, - "learning_rate": 1.724446467081888e-05, - "loss": 2.2848, - "step": 9245 - }, - { - "epoch": 2.85, - "learning_rate": 1.720883561158165e-05, - "loss": 1.9653, - "step": 9246 - }, - { - "epoch": 2.85, - "learning_rate": 1.717320655234442e-05, - "loss": 2.2814, - "step": 9247 - }, - { - "epoch": 2.85, - "learning_rate": 1.7137577493107193e-05, - "loss": 1.8074, - "step": 9248 - }, - { - "epoch": 2.85, - "learning_rate": 1.710194843386996e-05, - "loss": 1.9985, - "step": 9249 - }, - { - "epoch": 2.85, - "learning_rate": 1.7066319374632733e-05, - "loss": 2.271, - "step": 9250 - }, - { - "epoch": 2.86, - "learning_rate": 1.7030690315395505e-05, - "loss": 2.0514, - "step": 9251 - }, - { - "epoch": 2.86, - "learning_rate": 1.6995061256158277e-05, - "loss": 2.0182, - "step": 9252 - }, - { - "epoch": 2.86, - "learning_rate": 1.695943219692105e-05, - "loss": 1.961, - "step": 9253 - }, - { - "epoch": 2.86, - "learning_rate": 1.6923803137683817e-05, - "loss": 1.9745, - "step": 9254 - }, - { - "epoch": 2.86, - "learning_rate": 1.688817407844659e-05, - "loss": 1.9143, - "step": 9255 - }, - { - "epoch": 2.86, - "learning_rate": 1.685254501920936e-05, - "loss": 1.6308, - "step": 9256 - }, - { - "epoch": 2.86, - "learning_rate": 1.681691595997213e-05, - "loss": 1.4813, - "step": 9257 - }, - { - "epoch": 2.86, - "learning_rate": 1.67812869007349e-05, - "loss": 1.7538, - "step": 9258 - }, - { - "epoch": 2.86, - "learning_rate": 1.6745657841497672e-05, - "loss": 1.8964, - "step": 9259 - }, - { - "epoch": 2.86, - "learning_rate": 1.6710028782260444e-05, - "loss": 1.7032, - "step": 9260 - }, - { - "epoch": 2.86, - "learning_rate": 1.6674399723023215e-05, - "loss": 1.6049, - "step": 9261 - }, - { - "epoch": 2.86, - "learning_rate": 1.6638770663785987e-05, - "loss": 1.7524, - "step": 9262 - }, - { - "epoch": 2.86, - "learning_rate": 1.6603141604548755e-05, - "loss": 1.8346, - "step": 9263 - }, - { - "epoch": 2.86, - "learning_rate": 1.6567512545311527e-05, - "loss": 1.5288, - "step": 9264 - }, - { - "epoch": 2.86, - "learning_rate": 1.65318834860743e-05, - "loss": 1.4837, - "step": 9265 - }, - { - "epoch": 2.86, - "learning_rate": 1.6496254426837067e-05, - "loss": 1.3777, - "step": 9266 - }, - { - "epoch": 2.86, - "learning_rate": 1.646062536759984e-05, - "loss": 1.4537, - "step": 9267 - }, - { - "epoch": 2.86, - "learning_rate": 1.642499630836261e-05, - "loss": 1.1861, - "step": 9268 - }, - { - "epoch": 2.86, - "learning_rate": 1.6389367249125382e-05, - "loss": 1.3106, - "step": 9269 - }, - { - "epoch": 2.86, - "learning_rate": 1.6353738189888154e-05, - "loss": 1.1824, - "step": 9270 - }, - { - "epoch": 2.86, - "learning_rate": 1.6318109130650922e-05, - "loss": 1.2435, - "step": 9271 - }, - { - "epoch": 2.86, - "learning_rate": 1.6282480071413694e-05, - "loss": 1.1592, - "step": 9272 - }, - { - "epoch": 2.86, - "learning_rate": 1.6246851012176466e-05, - "loss": 1.2488, - "step": 9273 - }, - { - "epoch": 2.86, - "learning_rate": 1.6211221952939234e-05, - "loss": 1.1027, - "step": 9274 - }, - { - "epoch": 2.86, - "learning_rate": 
1.6175592893702006e-05, - "loss": 1.157, - "step": 9275 - }, - { - "epoch": 2.86, - "learning_rate": 1.6139963834464778e-05, - "loss": 1.0814, - "step": 9276 - }, - { - "epoch": 2.86, - "learning_rate": 1.610433477522755e-05, - "loss": 1.0278, - "step": 9277 - }, - { - "epoch": 2.86, - "learning_rate": 1.606870571599032e-05, - "loss": 1.0059, - "step": 9278 - }, - { - "epoch": 2.86, - "learning_rate": 1.603307665675309e-05, - "loss": 1.0831, - "step": 9279 - }, - { - "epoch": 2.86, - "learning_rate": 1.599744759751586e-05, - "loss": 1.0278, - "step": 9280 - }, - { - "epoch": 2.86, - "learning_rate": 1.5961818538278633e-05, - "loss": 2.7525, - "step": 9281 - }, - { - "epoch": 2.86, - "learning_rate": 1.59261894790414e-05, - "loss": 3.1347, - "step": 9282 - }, - { - "epoch": 2.87, - "learning_rate": 1.5890560419804173e-05, - "loss": 2.7701, - "step": 9283 - }, - { - "epoch": 2.87, - "learning_rate": 1.5854931360566948e-05, - "loss": 2.5865, - "step": 9284 - }, - { - "epoch": 2.87, - "learning_rate": 1.5819302301329716e-05, - "loss": 2.2108, - "step": 9285 - }, - { - "epoch": 2.87, - "learning_rate": 1.5783673242092488e-05, - "loss": 2.0711, - "step": 9286 - }, - { - "epoch": 2.87, - "learning_rate": 1.574804418285526e-05, - "loss": 2.3002, - "step": 9287 - }, - { - "epoch": 2.87, - "learning_rate": 1.5712415123618028e-05, - "loss": 2.2201, - "step": 9288 - }, - { - "epoch": 2.87, - "learning_rate": 1.56767860643808e-05, - "loss": 2.3435, - "step": 9289 - }, - { - "epoch": 2.87, - "learning_rate": 1.564115700514357e-05, - "loss": 2.282, - "step": 9290 - }, - { - "epoch": 2.87, - "learning_rate": 1.560552794590634e-05, - "loss": 2.0258, - "step": 9291 - }, - { - "epoch": 2.87, - "learning_rate": 1.5569898886669115e-05, - "loss": 2.3217, - "step": 9292 - }, - { - "epoch": 2.87, - "learning_rate": 1.5534269827431883e-05, - "loss": 2.2276, - "step": 9293 - }, - { - "epoch": 2.87, - "learning_rate": 1.5498640768194655e-05, - "loss": 2.1246, - "step": 9294 - }, - { - "epoch": 2.87, - "learning_rate": 1.5463011708957426e-05, - "loss": 2.1234, - "step": 9295 - }, - { - "epoch": 2.87, - "learning_rate": 1.5427382649720195e-05, - "loss": 2.1468, - "step": 9296 - }, - { - "epoch": 2.87, - "learning_rate": 1.5391753590482966e-05, - "loss": 1.939, - "step": 9297 - }, - { - "epoch": 2.87, - "learning_rate": 1.5356124531245738e-05, - "loss": 2.08, - "step": 9298 - }, - { - "epoch": 2.87, - "learning_rate": 1.5320495472008507e-05, - "loss": 2.0662, - "step": 9299 - }, - { - "epoch": 2.87, - "learning_rate": 1.528486641277128e-05, - "loss": 1.9498, - "step": 9300 - }, - { - "epoch": 2.87, - "learning_rate": 1.5249237353534052e-05, - "loss": 2.1597, - "step": 9301 - }, - { - "epoch": 2.87, - "learning_rate": 1.5213608294296822e-05, - "loss": 1.9647, - "step": 9302 - }, - { - "epoch": 2.87, - "learning_rate": 1.5177979235059593e-05, - "loss": 1.8979, - "step": 9303 - }, - { - "epoch": 2.87, - "learning_rate": 1.5142350175822363e-05, - "loss": 1.7903, - "step": 9304 - }, - { - "epoch": 2.87, - "learning_rate": 1.5106721116585133e-05, - "loss": 1.73, - "step": 9305 - }, - { - "epoch": 2.87, - "learning_rate": 1.5071092057347905e-05, - "loss": 1.8519, - "step": 9306 - }, - { - "epoch": 2.87, - "learning_rate": 1.5035462998110675e-05, - "loss": 1.9056, - "step": 9307 - }, - { - "epoch": 2.87, - "learning_rate": 1.4999833938873445e-05, - "loss": 1.7979, - "step": 9308 - }, - { - "epoch": 2.87, - "learning_rate": 1.4964204879636219e-05, - "loss": 1.4864, - "step": 9309 - }, - { - "epoch": 2.87, - "learning_rate": 
1.4928575820398989e-05, - "loss": 1.7691, - "step": 9310 - }, - { - "epoch": 2.87, - "learning_rate": 1.489294676116176e-05, - "loss": 1.5696, - "step": 9311 - }, - { - "epoch": 2.87, - "learning_rate": 1.485731770192453e-05, - "loss": 1.6051, - "step": 9312 - }, - { - "epoch": 2.87, - "learning_rate": 1.4821688642687302e-05, - "loss": 1.4115, - "step": 9313 - }, - { - "epoch": 2.87, - "learning_rate": 1.4786059583450072e-05, - "loss": 1.4722, - "step": 9314 - }, - { - "epoch": 2.87, - "learning_rate": 1.4750430524212842e-05, - "loss": 1.4686, - "step": 9315 - }, - { - "epoch": 2.88, - "learning_rate": 1.4714801464975614e-05, - "loss": 1.3013, - "step": 9316 - }, - { - "epoch": 2.88, - "learning_rate": 1.4679172405738385e-05, - "loss": 1.6424, - "step": 9317 - }, - { - "epoch": 2.88, - "learning_rate": 1.4643543346501157e-05, - "loss": 1.4999, - "step": 9318 - }, - { - "epoch": 2.88, - "learning_rate": 1.4607914287263927e-05, - "loss": 1.2951, - "step": 9319 - }, - { - "epoch": 2.88, - "learning_rate": 1.4572285228026697e-05, - "loss": 1.2951, - "step": 9320 - }, - { - "epoch": 2.88, - "learning_rate": 1.4536656168789469e-05, - "loss": 1.2499, - "step": 9321 - }, - { - "epoch": 2.88, - "learning_rate": 1.4501027109552239e-05, - "loss": 1.3529, - "step": 9322 - }, - { - "epoch": 2.88, - "learning_rate": 1.446539805031501e-05, - "loss": 1.2637, - "step": 9323 - }, - { - "epoch": 2.88, - "learning_rate": 1.442976899107778e-05, - "loss": 1.0397, - "step": 9324 - }, - { - "epoch": 2.88, - "learning_rate": 1.4394139931840554e-05, - "loss": 1.2979, - "step": 9325 - }, - { - "epoch": 2.88, - "learning_rate": 1.4358510872603324e-05, - "loss": 1.1258, - "step": 9326 - }, - { - "epoch": 2.88, - "learning_rate": 1.4322881813366094e-05, - "loss": 0.988, - "step": 9327 - }, - { - "epoch": 2.88, - "learning_rate": 1.4287252754128866e-05, - "loss": 1.1183, - "step": 9328 - }, - { - "epoch": 2.88, - "learning_rate": 1.4251623694891636e-05, - "loss": 1.0278, - "step": 9329 - }, - { - "epoch": 2.88, - "learning_rate": 1.4215994635654406e-05, - "loss": 0.882, - "step": 9330 - }, - { - "epoch": 2.88, - "learning_rate": 1.4180365576417178e-05, - "loss": 3.0492, - "step": 9331 - }, - { - "epoch": 2.88, - "learning_rate": 1.4144736517179948e-05, - "loss": 2.7018, - "step": 9332 - }, - { - "epoch": 2.88, - "learning_rate": 1.4109107457942721e-05, - "loss": 2.6609, - "step": 9333 - }, - { - "epoch": 2.88, - "learning_rate": 1.4073478398705491e-05, - "loss": 2.6763, - "step": 9334 - }, - { - "epoch": 2.88, - "learning_rate": 1.4037849339468263e-05, - "loss": 2.3435, - "step": 9335 - }, - { - "epoch": 2.88, - "learning_rate": 1.4002220280231033e-05, - "loss": 2.589, - "step": 9336 - }, - { - "epoch": 2.88, - "learning_rate": 1.3966591220993803e-05, - "loss": 2.3427, - "step": 9337 - }, - { - "epoch": 2.88, - "learning_rate": 1.3930962161756574e-05, - "loss": 2.2953, - "step": 9338 - }, - { - "epoch": 2.88, - "learning_rate": 1.3895333102519344e-05, - "loss": 2.4515, - "step": 9339 - }, - { - "epoch": 2.88, - "learning_rate": 1.3859704043282114e-05, - "loss": 2.2911, - "step": 9340 - }, - { - "epoch": 2.88, - "learning_rate": 1.3824074984044888e-05, - "loss": 2.2437, - "step": 9341 - }, - { - "epoch": 2.88, - "learning_rate": 1.3788445924807658e-05, - "loss": 2.2096, - "step": 9342 - }, - { - "epoch": 2.88, - "learning_rate": 1.375281686557043e-05, - "loss": 2.075, - "step": 9343 - }, - { - "epoch": 2.88, - "learning_rate": 1.37171878063332e-05, - "loss": 2.2111, - "step": 9344 - }, - { - "epoch": 2.88, - 
"learning_rate": 1.3681558747095971e-05, - "loss": 2.0948, - "step": 9345 - }, - { - "epoch": 2.88, - "learning_rate": 1.3645929687858741e-05, - "loss": 1.9966, - "step": 9346 - }, - { - "epoch": 2.88, - "learning_rate": 1.3610300628621511e-05, - "loss": 2.1935, - "step": 9347 - }, - { - "epoch": 2.89, - "learning_rate": 1.3574671569384283e-05, - "loss": 2.0304, - "step": 9348 - }, - { - "epoch": 2.89, - "learning_rate": 1.3539042510147055e-05, - "loss": 2.0299, - "step": 9349 - }, - { - "epoch": 2.89, - "learning_rate": 1.3503413450909827e-05, - "loss": 1.9996, - "step": 9350 - }, - { - "epoch": 2.89, - "learning_rate": 1.3467784391672597e-05, - "loss": 1.9886, - "step": 9351 - }, - { - "epoch": 2.89, - "learning_rate": 1.3432155332435367e-05, - "loss": 2.1139, - "step": 9352 - }, - { - "epoch": 2.89, - "learning_rate": 1.3396526273198138e-05, - "loss": 1.8984, - "step": 9353 - }, - { - "epoch": 2.89, - "learning_rate": 1.3360897213960908e-05, - "loss": 1.9278, - "step": 9354 - }, - { - "epoch": 2.89, - "learning_rate": 1.332526815472368e-05, - "loss": 1.9168, - "step": 9355 - }, - { - "epoch": 2.89, - "learning_rate": 1.328963909548645e-05, - "loss": 1.8398, - "step": 9356 - }, - { - "epoch": 2.89, - "learning_rate": 1.325401003624922e-05, - "loss": 1.9367, - "step": 9357 - }, - { - "epoch": 2.89, - "learning_rate": 1.3218380977011993e-05, - "loss": 1.6776, - "step": 9358 - }, - { - "epoch": 2.89, - "learning_rate": 1.3182751917774763e-05, - "loss": 1.4912, - "step": 9359 - }, - { - "epoch": 2.89, - "learning_rate": 1.3147122858537535e-05, - "loss": 1.475, - "step": 9360 - }, - { - "epoch": 2.89, - "learning_rate": 1.3111493799300305e-05, - "loss": 1.888, - "step": 9361 - }, - { - "epoch": 2.89, - "learning_rate": 1.3075864740063075e-05, - "loss": 1.7385, - "step": 9362 - }, - { - "epoch": 2.89, - "learning_rate": 1.3040235680825847e-05, - "loss": 1.5282, - "step": 9363 - }, - { - "epoch": 2.89, - "learning_rate": 1.3004606621588617e-05, - "loss": 1.5744, - "step": 9364 - }, - { - "epoch": 2.89, - "learning_rate": 1.2968977562351389e-05, - "loss": 1.5786, - "step": 9365 - }, - { - "epoch": 2.89, - "learning_rate": 1.293334850311416e-05, - "loss": 1.4349, - "step": 9366 - }, - { - "epoch": 2.89, - "learning_rate": 1.2897719443876932e-05, - "loss": 1.4073, - "step": 9367 - }, - { - "epoch": 2.89, - "learning_rate": 1.2862090384639702e-05, - "loss": 1.146, - "step": 9368 - }, - { - "epoch": 2.89, - "learning_rate": 1.2826461325402472e-05, - "loss": 1.4762, - "step": 9369 - }, - { - "epoch": 2.89, - "learning_rate": 1.2790832266165244e-05, - "loss": 1.2287, - "step": 9370 - }, - { - "epoch": 2.89, - "learning_rate": 1.2755203206928014e-05, - "loss": 1.2606, - "step": 9371 - }, - { - "epoch": 2.89, - "learning_rate": 1.2719574147690784e-05, - "loss": 1.0997, - "step": 9372 - }, - { - "epoch": 2.89, - "learning_rate": 1.2683945088453556e-05, - "loss": 1.2352, - "step": 9373 - }, - { - "epoch": 2.89, - "learning_rate": 1.2648316029216327e-05, - "loss": 1.0461, - "step": 9374 - }, - { - "epoch": 2.89, - "learning_rate": 1.2612686969979099e-05, - "loss": 0.9814, - "step": 9375 - }, - { - "epoch": 2.89, - "learning_rate": 1.2577057910741869e-05, - "loss": 1.1068, - "step": 9376 - }, - { - "epoch": 2.89, - "learning_rate": 1.254142885150464e-05, - "loss": 0.9725, - "step": 9377 - }, - { - "epoch": 2.89, - "learning_rate": 1.250579979226741e-05, - "loss": 1.0271, - "step": 9378 - }, - { - "epoch": 2.89, - "learning_rate": 1.247017073303018e-05, - "loss": 0.8541, - "step": 9379 - }, - { - "epoch": 
2.89, - "learning_rate": 1.2434541673792952e-05, - "loss": 0.95, - "step": 9380 - }, - { - "epoch": 2.9, - "learning_rate": 1.2398912614555722e-05, - "loss": 3.1451, - "step": 9381 - }, - { - "epoch": 2.9, - "learning_rate": 1.2363283555318496e-05, - "loss": 2.7918, - "step": 9382 - }, - { - "epoch": 2.9, - "learning_rate": 1.2327654496081266e-05, - "loss": 2.5621, - "step": 9383 - }, - { - "epoch": 2.9, - "learning_rate": 1.2292025436844036e-05, - "loss": 2.5334, - "step": 9384 - }, - { - "epoch": 2.9, - "learning_rate": 1.2256396377606808e-05, - "loss": 2.2924, - "step": 9385 - }, - { - "epoch": 2.9, - "learning_rate": 1.2220767318369578e-05, - "loss": 2.544, - "step": 9386 - }, - { - "epoch": 2.9, - "learning_rate": 1.218513825913235e-05, - "loss": 2.4469, - "step": 9387 - }, - { - "epoch": 2.9, - "learning_rate": 1.214950919989512e-05, - "loss": 2.1778, - "step": 9388 - }, - { - "epoch": 2.9, - "learning_rate": 1.211388014065789e-05, - "loss": 2.2828, - "step": 9389 - }, - { - "epoch": 2.9, - "learning_rate": 1.2078251081420663e-05, - "loss": 2.2468, - "step": 9390 - }, - { - "epoch": 2.9, - "learning_rate": 1.2042622022183433e-05, - "loss": 2.098, - "step": 9391 - }, - { - "epoch": 2.9, - "learning_rate": 1.2006992962946204e-05, - "loss": 1.9191, - "step": 9392 - }, - { - "epoch": 2.9, - "learning_rate": 1.1971363903708975e-05, - "loss": 2.1173, - "step": 9393 - }, - { - "epoch": 2.9, - "learning_rate": 1.1935734844471745e-05, - "loss": 2.1061, - "step": 9394 - }, - { - "epoch": 2.9, - "learning_rate": 1.1900105785234516e-05, - "loss": 1.9389, - "step": 9395 - }, - { - "epoch": 2.9, - "learning_rate": 1.1864476725997286e-05, - "loss": 2.1359, - "step": 9396 - }, - { - "epoch": 2.9, - "learning_rate": 1.1828847666760056e-05, - "loss": 2.4053, - "step": 9397 - }, - { - "epoch": 2.9, - "learning_rate": 1.179321860752283e-05, - "loss": 2.0414, - "step": 9398 - }, - { - "epoch": 2.9, - "learning_rate": 1.1757589548285601e-05, - "loss": 2.3455, - "step": 9399 - }, - { - "epoch": 2.9, - "learning_rate": 1.1721960489048371e-05, - "loss": 1.9381, - "step": 9400 - }, - { - "epoch": 2.9, - "learning_rate": 1.1686331429811141e-05, - "loss": 1.8706, - "step": 9401 - }, - { - "epoch": 2.9, - "learning_rate": 1.1650702370573913e-05, - "loss": 2.0346, - "step": 9402 - }, - { - "epoch": 2.9, - "learning_rate": 1.1615073311336683e-05, - "loss": 1.8787, - "step": 9403 - }, - { - "epoch": 2.9, - "learning_rate": 1.1579444252099453e-05, - "loss": 1.7475, - "step": 9404 - }, - { - "epoch": 2.9, - "learning_rate": 1.1543815192862225e-05, - "loss": 1.8621, - "step": 9405 - }, - { - "epoch": 2.9, - "learning_rate": 1.1508186133624995e-05, - "loss": 1.6786, - "step": 9406 - }, - { - "epoch": 2.9, - "learning_rate": 1.1472557074387768e-05, - "loss": 1.7458, - "step": 9407 - }, - { - "epoch": 2.9, - "learning_rate": 1.1436928015150538e-05, - "loss": 1.822, - "step": 9408 - }, - { - "epoch": 2.9, - "learning_rate": 1.140129895591331e-05, - "loss": 1.6836, - "step": 9409 - }, - { - "epoch": 2.9, - "learning_rate": 1.136566989667608e-05, - "loss": 1.9154, - "step": 9410 - }, - { - "epoch": 2.9, - "learning_rate": 1.133004083743885e-05, - "loss": 2.0012, - "step": 9411 - }, - { - "epoch": 2.9, - "learning_rate": 1.1294411778201622e-05, - "loss": 1.5844, - "step": 9412 - }, - { - "epoch": 2.91, - "learning_rate": 1.1258782718964392e-05, - "loss": 1.3809, - "step": 9413 - }, - { - "epoch": 2.91, - "learning_rate": 1.1223153659727162e-05, - "loss": 1.5041, - "step": 9414 - }, - { - "epoch": 2.91, - "learning_rate": 
1.1187524600489935e-05, - "loss": 1.3799, - "step": 9415 - }, - { - "epoch": 2.91, - "learning_rate": 1.1151895541252705e-05, - "loss": 1.3562, - "step": 9416 - }, - { - "epoch": 2.91, - "learning_rate": 1.1116266482015477e-05, - "loss": 1.4116, - "step": 9417 - }, - { - "epoch": 2.91, - "learning_rate": 1.1080637422778247e-05, - "loss": 1.4143, - "step": 9418 - }, - { - "epoch": 2.91, - "learning_rate": 1.1045008363541017e-05, - "loss": 1.3605, - "step": 9419 - }, - { - "epoch": 2.91, - "learning_rate": 1.1009379304303789e-05, - "loss": 1.4236, - "step": 9420 - }, - { - "epoch": 2.91, - "learning_rate": 1.0973750245066559e-05, - "loss": 1.3578, - "step": 9421 - }, - { - "epoch": 2.91, - "learning_rate": 1.093812118582933e-05, - "loss": 1.0433, - "step": 9422 - }, - { - "epoch": 2.91, - "learning_rate": 1.0902492126592102e-05, - "loss": 1.3, - "step": 9423 - }, - { - "epoch": 2.91, - "learning_rate": 1.0866863067354874e-05, - "loss": 1.2402, - "step": 9424 - }, - { - "epoch": 2.91, - "learning_rate": 1.0831234008117644e-05, - "loss": 1.0894, - "step": 9425 - }, - { - "epoch": 2.91, - "learning_rate": 1.0795604948880414e-05, - "loss": 1.0451, - "step": 9426 - }, - { - "epoch": 2.91, - "learning_rate": 1.0759975889643186e-05, - "loss": 0.987, - "step": 9427 - }, - { - "epoch": 2.91, - "learning_rate": 1.0724346830405956e-05, - "loss": 1.0492, - "step": 9428 - }, - { - "epoch": 2.91, - "learning_rate": 1.0688717771168726e-05, - "loss": 0.8768, - "step": 9429 - }, - { - "epoch": 2.91, - "learning_rate": 1.0653088711931497e-05, - "loss": 0.9437, - "step": 9430 - }, - { - "epoch": 2.91, - "learning_rate": 1.061745965269427e-05, - "loss": 3.0649, - "step": 9431 - }, - { - "epoch": 2.91, - "learning_rate": 1.058183059345704e-05, - "loss": 2.8014, - "step": 9432 - }, - { - "epoch": 2.91, - "learning_rate": 1.054620153421981e-05, - "loss": 2.5883, - "step": 9433 - }, - { - "epoch": 2.91, - "learning_rate": 1.0510572474982582e-05, - "loss": 2.6257, - "step": 9434 - }, - { - "epoch": 2.91, - "learning_rate": 1.0474943415745352e-05, - "loss": 2.6101, - "step": 9435 - }, - { - "epoch": 2.91, - "learning_rate": 1.0439314356508122e-05, - "loss": 2.2227, - "step": 9436 - }, - { - "epoch": 2.91, - "learning_rate": 1.0403685297270894e-05, - "loss": 2.5429, - "step": 9437 - }, - { - "epoch": 2.91, - "learning_rate": 1.0368056238033664e-05, - "loss": 2.2053, - "step": 9438 - }, - { - "epoch": 2.91, - "learning_rate": 1.0332427178796438e-05, - "loss": 2.0374, - "step": 9439 - }, - { - "epoch": 2.91, - "learning_rate": 1.0296798119559208e-05, - "loss": 2.0571, - "step": 9440 - }, - { - "epoch": 2.91, - "learning_rate": 1.026116906032198e-05, - "loss": 2.3049, - "step": 9441 - }, - { - "epoch": 2.91, - "learning_rate": 1.022554000108475e-05, - "loss": 2.296, - "step": 9442 - }, - { - "epoch": 2.91, - "learning_rate": 1.018991094184752e-05, - "loss": 2.0852, - "step": 9443 - }, - { - "epoch": 2.91, - "learning_rate": 1.0154281882610291e-05, - "loss": 2.061, - "step": 9444 - }, - { - "epoch": 2.92, - "learning_rate": 1.0118652823373063e-05, - "loss": 1.8929, - "step": 9445 - }, - { - "epoch": 2.92, - "learning_rate": 1.0083023764135833e-05, - "loss": 2.0834, - "step": 9446 - }, - { - "epoch": 2.92, - "learning_rate": 1.0047394704898603e-05, - "loss": 2.2027, - "step": 9447 - }, - { - "epoch": 2.92, - "learning_rate": 1.0011765645661375e-05, - "loss": 2.0275, - "step": 9448 - }, - { - "epoch": 2.92, - "learning_rate": 9.976136586424145e-06, - "loss": 2.2227, - "step": 9449 - }, - { - "epoch": 2.92, - 
"learning_rate": 9.940507527186916e-06, - "loss": 1.9459, - "step": 9450 - }, - { - "epoch": 2.92, - "learning_rate": 9.904878467949686e-06, - "loss": 2.1818, - "step": 9451 - }, - { - "epoch": 2.92, - "learning_rate": 9.869249408712458e-06, - "loss": 1.8763, - "step": 9452 - }, - { - "epoch": 2.92, - "learning_rate": 9.833620349475228e-06, - "loss": 1.9124, - "step": 9453 - }, - { - "epoch": 2.92, - "learning_rate": 9.797991290238e-06, - "loss": 2.0718, - "step": 9454 - }, - { - "epoch": 2.92, - "learning_rate": 9.762362231000771e-06, - "loss": 1.8463, - "step": 9455 - }, - { - "epoch": 2.92, - "learning_rate": 9.726733171763541e-06, - "loss": 1.6763, - "step": 9456 - }, - { - "epoch": 2.92, - "learning_rate": 9.691104112526311e-06, - "loss": 1.696, - "step": 9457 - }, - { - "epoch": 2.92, - "learning_rate": 9.655475053289083e-06, - "loss": 1.7865, - "step": 9458 - }, - { - "epoch": 2.92, - "learning_rate": 9.619845994051855e-06, - "loss": 1.9251, - "step": 9459 - }, - { - "epoch": 2.92, - "learning_rate": 9.584216934814625e-06, - "loss": 1.8367, - "step": 9460 - }, - { - "epoch": 2.92, - "learning_rate": 9.548587875577395e-06, - "loss": 1.7728, - "step": 9461 - }, - { - "epoch": 2.92, - "learning_rate": 9.512958816340167e-06, - "loss": 1.6027, - "step": 9462 - }, - { - "epoch": 2.92, - "learning_rate": 9.477329757102938e-06, - "loss": 1.799, - "step": 9463 - }, - { - "epoch": 2.92, - "learning_rate": 9.441700697865708e-06, - "loss": 1.6403, - "step": 9464 - }, - { - "epoch": 2.92, - "learning_rate": 9.40607163862848e-06, - "loss": 1.4852, - "step": 9465 - }, - { - "epoch": 2.92, - "learning_rate": 9.370442579391252e-06, - "loss": 1.4173, - "step": 9466 - }, - { - "epoch": 2.92, - "learning_rate": 9.334813520154022e-06, - "loss": 1.5533, - "step": 9467 - }, - { - "epoch": 2.92, - "learning_rate": 9.299184460916792e-06, - "loss": 1.3633, - "step": 9468 - }, - { - "epoch": 2.92, - "learning_rate": 9.263555401679564e-06, - "loss": 1.3083, - "step": 9469 - }, - { - "epoch": 2.92, - "learning_rate": 9.227926342442335e-06, - "loss": 1.308, - "step": 9470 - }, - { - "epoch": 2.92, - "learning_rate": 9.192297283205105e-06, - "loss": 1.0692, - "step": 9471 - }, - { - "epoch": 2.92, - "learning_rate": 9.156668223967875e-06, - "loss": 1.1021, - "step": 9472 - }, - { - "epoch": 2.92, - "learning_rate": 9.121039164730647e-06, - "loss": 1.2118, - "step": 9473 - }, - { - "epoch": 2.92, - "learning_rate": 9.085410105493419e-06, - "loss": 1.1548, - "step": 9474 - }, - { - "epoch": 2.92, - "learning_rate": 9.049781046256189e-06, - "loss": 1.1081, - "step": 9475 - }, - { - "epoch": 2.92, - "learning_rate": 9.01415198701896e-06, - "loss": 1.1504, - "step": 9476 - }, - { - "epoch": 2.92, - "learning_rate": 8.97852292778173e-06, - "loss": 0.9251, - "step": 9477 - }, - { - "epoch": 2.93, - "learning_rate": 8.942893868544502e-06, - "loss": 0.991, - "step": 9478 - }, - { - "epoch": 2.93, - "learning_rate": 8.907264809307272e-06, - "loss": 0.9884, - "step": 9479 - }, - { - "epoch": 2.93, - "learning_rate": 8.871635750070044e-06, - "loss": 0.9667, - "step": 9480 - }, - { - "epoch": 2.93, - "learning_rate": 8.836006690832814e-06, - "loss": 2.8228, - "step": 9481 - }, - { - "epoch": 2.93, - "learning_rate": 8.800377631595586e-06, - "loss": 3.1934, - "step": 9482 - }, - { - "epoch": 2.93, - "learning_rate": 8.764748572358356e-06, - "loss": 2.6028, - "step": 9483 - }, - { - "epoch": 2.93, - "learning_rate": 8.729119513121127e-06, - "loss": 2.3047, - "step": 9484 - }, - { - "epoch": 2.93, - "learning_rate": 
8.693490453883897e-06, - "loss": 2.1804, - "step": 9485 - }, - { - "epoch": 2.93, - "learning_rate": 8.657861394646669e-06, - "loss": 2.2493, - "step": 9486 - }, - { - "epoch": 2.93, - "learning_rate": 8.62223233540944e-06, - "loss": 2.4625, - "step": 9487 - }, - { - "epoch": 2.93, - "learning_rate": 8.58660327617221e-06, - "loss": 2.3951, - "step": 9488 - }, - { - "epoch": 2.93, - "learning_rate": 8.55097421693498e-06, - "loss": 2.2439, - "step": 9489 - }, - { - "epoch": 2.93, - "learning_rate": 8.515345157697753e-06, - "loss": 2.2117, - "step": 9490 - }, - { - "epoch": 2.93, - "learning_rate": 8.479716098460524e-06, - "loss": 2.1491, - "step": 9491 - }, - { - "epoch": 2.93, - "learning_rate": 8.444087039223294e-06, - "loss": 1.9351, - "step": 9492 - }, - { - "epoch": 2.93, - "learning_rate": 8.408457979986064e-06, - "loss": 1.7992, - "step": 9493 - }, - { - "epoch": 2.93, - "learning_rate": 8.372828920748836e-06, - "loss": 2.0094, - "step": 9494 - }, - { - "epoch": 2.93, - "learning_rate": 8.337199861511608e-06, - "loss": 1.8854, - "step": 9495 - }, - { - "epoch": 2.93, - "learning_rate": 8.301570802274378e-06, - "loss": 1.9184, - "step": 9496 - }, - { - "epoch": 2.93, - "learning_rate": 8.26594174303715e-06, - "loss": 1.9433, - "step": 9497 - }, - { - "epoch": 2.93, - "learning_rate": 8.23031268379992e-06, - "loss": 1.7687, - "step": 9498 - }, - { - "epoch": 2.93, - "learning_rate": 8.194683624562691e-06, - "loss": 2.0878, - "step": 9499 - }, - { - "epoch": 2.93, - "learning_rate": 8.159054565325461e-06, - "loss": 1.9175, - "step": 9500 - }, - { - "epoch": 2.93, - "eval_bleu": 1.5827445667544166e-18, - "eval_loss": 3.0510671138763428, - "eval_runtime": 2619.9089, - "eval_samples_per_second": 5.634, - "eval_steps_per_second": 0.704, - "step": 9500 - }, - { - "epoch": 2.93, - "learning_rate": 8.123425506088233e-06, - "loss": 1.9421, - "step": 9501 - }, - { - "epoch": 2.93, - "learning_rate": 8.087796446851003e-06, - "loss": 1.6852, - "step": 9502 - }, - { - "epoch": 2.93, - "learning_rate": 8.052167387613775e-06, - "loss": 1.9491, - "step": 9503 - }, - { - "epoch": 2.93, - "learning_rate": 8.016538328376545e-06, - "loss": 1.9421, - "step": 9504 - }, - { - "epoch": 2.93, - "learning_rate": 7.980909269139316e-06, - "loss": 1.9617, - "step": 9505 - }, - { - "epoch": 2.93, - "learning_rate": 7.945280209902086e-06, - "loss": 1.8439, - "step": 9506 - }, - { - "epoch": 2.93, - "learning_rate": 7.909651150664858e-06, - "loss": 1.6839, - "step": 9507 - }, - { - "epoch": 2.93, - "learning_rate": 7.87402209142763e-06, - "loss": 1.639, - "step": 9508 - }, - { - "epoch": 2.93, - "learning_rate": 7.8383930321904e-06, - "loss": 1.5324, - "step": 9509 - }, - { - "epoch": 2.94, - "learning_rate": 7.80276397295317e-06, - "loss": 1.7009, - "step": 9510 - }, - { - "epoch": 2.94, - "learning_rate": 7.767134913715942e-06, - "loss": 1.4478, - "step": 9511 - }, - { - "epoch": 2.94, - "learning_rate": 7.731505854478713e-06, - "loss": 1.6354, - "step": 9512 - }, - { - "epoch": 2.94, - "learning_rate": 7.695876795241483e-06, - "loss": 1.532, - "step": 9513 - }, - { - "epoch": 2.94, - "learning_rate": 7.660247736004253e-06, - "loss": 1.4991, - "step": 9514 - }, - { - "epoch": 2.94, - "learning_rate": 7.624618676767026e-06, - "loss": 1.5178, - "step": 9515 - }, - { - "epoch": 2.94, - "learning_rate": 7.588989617529797e-06, - "loss": 1.5689, - "step": 9516 - }, - { - "epoch": 2.94, - "learning_rate": 7.553360558292567e-06, - "loss": 1.5031, - "step": 9517 - }, - { - "epoch": 2.94, - "learning_rate": 
7.5177314990553376e-06, - "loss": 1.6065, - "step": 9518 - }, - { - "epoch": 2.94, - "learning_rate": 7.482102439818109e-06, - "loss": 1.5157, - "step": 9519 - }, - { - "epoch": 2.94, - "learning_rate": 7.44647338058088e-06, - "loss": 1.2304, - "step": 9520 - }, - { - "epoch": 2.94, - "learning_rate": 7.410844321343651e-06, - "loss": 1.1117, - "step": 9521 - }, - { - "epoch": 2.94, - "learning_rate": 7.375215262106421e-06, - "loss": 1.3179, - "step": 9522 - }, - { - "epoch": 2.94, - "learning_rate": 7.339586202869193e-06, - "loss": 1.1632, - "step": 9523 - }, - { - "epoch": 2.94, - "learning_rate": 7.303957143631964e-06, - "loss": 0.9703, - "step": 9524 - }, - { - "epoch": 2.94, - "learning_rate": 7.2683280843947345e-06, - "loss": 1.1412, - "step": 9525 - }, - { - "epoch": 2.94, - "learning_rate": 7.232699025157505e-06, - "loss": 0.8973, - "step": 9526 - }, - { - "epoch": 2.94, - "learning_rate": 7.197069965920277e-06, - "loss": 0.927, - "step": 9527 - }, - { - "epoch": 2.94, - "learning_rate": 7.161440906683047e-06, - "loss": 1.0374, - "step": 9528 - }, - { - "epoch": 2.94, - "learning_rate": 7.125811847445818e-06, - "loss": 0.9249, - "step": 9529 - }, - { - "epoch": 2.94, - "learning_rate": 7.090182788208589e-06, - "loss": 1.0137, - "step": 9530 - }, - { - "epoch": 2.94, - "learning_rate": 7.0545537289713605e-06, - "loss": 3.1992, - "step": 9531 - }, - { - "epoch": 2.94, - "learning_rate": 7.018924669734131e-06, - "loss": 2.955, - "step": 9532 - }, - { - "epoch": 2.94, - "learning_rate": 6.983295610496901e-06, - "loss": 2.4761, - "step": 9533 - }, - { - "epoch": 2.94, - "learning_rate": 6.947666551259672e-06, - "loss": 2.6519, - "step": 9534 - }, - { - "epoch": 2.94, - "learning_rate": 6.912037492022444e-06, - "loss": 2.3562, - "step": 9535 - }, - { - "epoch": 2.94, - "learning_rate": 6.876408432785215e-06, - "loss": 2.318, - "step": 9536 - }, - { - "epoch": 2.94, - "learning_rate": 6.840779373547986e-06, - "loss": 2.2666, - "step": 9537 - }, - { - "epoch": 2.94, - "learning_rate": 6.805150314310756e-06, - "loss": 2.4338, - "step": 9538 - }, - { - "epoch": 2.94, - "learning_rate": 6.769521255073527e-06, - "loss": 1.9336, - "step": 9539 - }, - { - "epoch": 2.94, - "learning_rate": 6.733892195836298e-06, - "loss": 2.0247, - "step": 9540 - }, - { - "epoch": 2.94, - "learning_rate": 6.698263136599069e-06, - "loss": 2.2633, - "step": 9541 - }, - { - "epoch": 2.94, - "learning_rate": 6.66263407736184e-06, - "loss": 2.2944, - "step": 9542 - }, - { - "epoch": 2.95, - "learning_rate": 6.62700501812461e-06, - "loss": 2.0059, - "step": 9543 - }, - { - "epoch": 2.95, - "learning_rate": 6.591375958887382e-06, - "loss": 1.8876, - "step": 9544 - }, - { - "epoch": 2.95, - "learning_rate": 6.555746899650153e-06, - "loss": 2.0158, - "step": 9545 - }, - { - "epoch": 2.95, - "learning_rate": 6.5201178404129234e-06, - "loss": 1.8958, - "step": 9546 - }, - { - "epoch": 2.95, - "learning_rate": 6.484488781175694e-06, - "loss": 1.8154, - "step": 9547 - }, - { - "epoch": 2.95, - "learning_rate": 6.448859721938466e-06, - "loss": 2.0031, - "step": 9548 - }, - { - "epoch": 2.95, - "learning_rate": 6.413230662701236e-06, - "loss": 1.9046, - "step": 9549 - }, - { - "epoch": 2.95, - "learning_rate": 6.377601603464007e-06, - "loss": 1.7662, - "step": 9550 - }, - { - "epoch": 2.95, - "learning_rate": 6.341972544226778e-06, - "loss": 1.8266, - "step": 9551 - }, - { - "epoch": 2.95, - "learning_rate": 6.3063434849895495e-06, - "loss": 1.9874, - "step": 9552 - }, - { - "epoch": 2.95, - "learning_rate": 
6.27071442575232e-06, - "loss": 1.9894, - "step": 9553 - }, - { - "epoch": 2.95, - "learning_rate": 6.23508536651509e-06, - "loss": 1.9142, - "step": 9554 - }, - { - "epoch": 2.95, - "learning_rate": 6.199456307277861e-06, - "loss": 1.9071, - "step": 9555 - }, - { - "epoch": 2.95, - "learning_rate": 6.163827248040633e-06, - "loss": 1.5023, - "step": 9556 - }, - { - "epoch": 2.95, - "learning_rate": 6.128198188803404e-06, - "loss": 1.9704, - "step": 9557 - }, - { - "epoch": 2.95, - "learning_rate": 6.092569129566175e-06, - "loss": 1.711, - "step": 9558 - }, - { - "epoch": 2.95, - "learning_rate": 6.056940070328945e-06, - "loss": 1.494, - "step": 9559 - }, - { - "epoch": 2.95, - "learning_rate": 6.021311011091716e-06, - "loss": 1.5974, - "step": 9560 - }, - { - "epoch": 2.95, - "learning_rate": 5.985681951854487e-06, - "loss": 1.3329, - "step": 9561 - }, - { - "epoch": 2.95, - "learning_rate": 5.950052892617258e-06, - "loss": 1.7576, - "step": 9562 - }, - { - "epoch": 2.95, - "learning_rate": 5.914423833380028e-06, - "loss": 1.4196, - "step": 9563 - }, - { - "epoch": 2.95, - "learning_rate": 5.878794774142801e-06, - "loss": 1.5928, - "step": 9564 - }, - { - "epoch": 2.95, - "learning_rate": 5.843165714905571e-06, - "loss": 1.5253, - "step": 9565 - }, - { - "epoch": 2.95, - "learning_rate": 5.8075366556683416e-06, - "loss": 1.2918, - "step": 9566 - }, - { - "epoch": 2.95, - "learning_rate": 5.7719075964311124e-06, - "loss": 1.3825, - "step": 9567 - }, - { - "epoch": 2.95, - "learning_rate": 5.736278537193884e-06, - "loss": 1.2231, - "step": 9568 - }, - { - "epoch": 2.95, - "learning_rate": 5.700649477956655e-06, - "loss": 1.3448, - "step": 9569 - }, - { - "epoch": 2.95, - "learning_rate": 5.665020418719425e-06, - "loss": 1.2047, - "step": 9570 - }, - { - "epoch": 2.95, - "learning_rate": 5.629391359482196e-06, - "loss": 1.2959, - "step": 9571 - }, - { - "epoch": 2.95, - "learning_rate": 5.593762300244968e-06, - "loss": 1.1479, - "step": 9572 - }, - { - "epoch": 2.95, - "learning_rate": 5.5581332410077385e-06, - "loss": 1.0769, - "step": 9573 - }, - { - "epoch": 2.95, - "learning_rate": 5.5225041817705085e-06, - "loss": 1.005, - "step": 9574 - }, - { - "epoch": 2.96, - "learning_rate": 5.486875122533279e-06, - "loss": 0.9161, - "step": 9575 - }, - { - "epoch": 2.96, - "learning_rate": 5.451246063296051e-06, - "loss": 1.1845, - "step": 9576 - }, - { - "epoch": 2.96, - "learning_rate": 5.415617004058822e-06, - "loss": 1.1245, - "step": 9577 - }, - { - "epoch": 2.96, - "learning_rate": 5.379987944821593e-06, - "loss": 1.0074, - "step": 9578 - }, - { - "epoch": 2.96, - "learning_rate": 5.344358885584363e-06, - "loss": 0.777, - "step": 9579 - }, - { - "epoch": 2.96, - "learning_rate": 5.308729826347135e-06, - "loss": 0.9761, - "step": 9580 - }, - { - "epoch": 2.96, - "learning_rate": 5.273100767109905e-06, - "loss": 3.0781, - "step": 9581 - }, - { - "epoch": 2.96, - "learning_rate": 5.237471707872676e-06, - "loss": 2.7149, - "step": 9582 - }, - { - "epoch": 2.96, - "learning_rate": 5.201842648635447e-06, - "loss": 2.4177, - "step": 9583 - }, - { - "epoch": 2.96, - "learning_rate": 5.166213589398219e-06, - "loss": 2.3557, - "step": 9584 - }, - { - "epoch": 2.96, - "learning_rate": 5.13058453016099e-06, - "loss": 2.3639, - "step": 9585 - }, - { - "epoch": 2.96, - "learning_rate": 5.09495547092376e-06, - "loss": 2.3823, - "step": 9586 - }, - { - "epoch": 2.96, - "learning_rate": 5.059326411686531e-06, - "loss": 2.4537, - "step": 9587 - }, - { - "epoch": 2.96, - "learning_rate": 5.023697352449301e-06, 
- "loss": 2.3316, - "step": 9588 - }, - { - "epoch": 2.96, - "learning_rate": 4.988068293212072e-06, - "loss": 2.0534, - "step": 9589 - }, - { - "epoch": 2.96, - "learning_rate": 4.952439233974843e-06, - "loss": 2.3118, - "step": 9590 - }, - { - "epoch": 2.96, - "learning_rate": 4.916810174737614e-06, - "loss": 2.2839, - "step": 9591 - }, - { - "epoch": 2.96, - "learning_rate": 4.881181115500386e-06, - "loss": 2.0061, - "step": 9592 - }, - { - "epoch": 2.96, - "learning_rate": 4.845552056263156e-06, - "loss": 2.0694, - "step": 9593 - }, - { - "epoch": 2.96, - "learning_rate": 4.8099229970259275e-06, - "loss": 2.2367, - "step": 9594 - }, - { - "epoch": 2.96, - "learning_rate": 4.7742939377886975e-06, - "loss": 2.2238, - "step": 9595 - }, - { - "epoch": 2.96, - "learning_rate": 4.738664878551469e-06, - "loss": 2.1866, - "step": 9596 - }, - { - "epoch": 2.96, - "learning_rate": 4.70303581931424e-06, - "loss": 2.189, - "step": 9597 - }, - { - "epoch": 2.96, - "learning_rate": 4.667406760077011e-06, - "loss": 1.9049, - "step": 9598 - }, - { - "epoch": 2.96, - "learning_rate": 4.631777700839782e-06, - "loss": 1.9036, - "step": 9599 - }, - { - "epoch": 2.96, - "learning_rate": 4.596148641602553e-06, - "loss": 2.2067, - "step": 9600 - }, - { - "epoch": 2.96, - "learning_rate": 4.5605195823653235e-06, - "loss": 1.7287, - "step": 9601 - }, - { - "epoch": 2.96, - "learning_rate": 4.524890523128094e-06, - "loss": 1.8241, - "step": 9602 - }, - { - "epoch": 2.96, - "learning_rate": 4.489261463890865e-06, - "loss": 2.1436, - "step": 9603 - }, - { - "epoch": 2.96, - "learning_rate": 4.453632404653636e-06, - "loss": 1.7473, - "step": 9604 - }, - { - "epoch": 2.96, - "learning_rate": 4.418003345416407e-06, - "loss": 1.5794, - "step": 9605 - }, - { - "epoch": 2.96, - "learning_rate": 4.382374286179178e-06, - "loss": 1.7757, - "step": 9606 - }, - { - "epoch": 2.97, - "learning_rate": 4.346745226941949e-06, - "loss": 1.7814, - "step": 9607 - }, - { - "epoch": 2.97, - "learning_rate": 4.31111616770472e-06, - "loss": 1.9176, - "step": 9608 - }, - { - "epoch": 2.97, - "learning_rate": 4.27548710846749e-06, - "loss": 1.5518, - "step": 9609 - }, - { - "epoch": 2.97, - "learning_rate": 4.239858049230262e-06, - "loss": 1.6683, - "step": 9610 - }, - { - "epoch": 2.97, - "learning_rate": 4.204228989993032e-06, - "loss": 1.6852, - "step": 9611 - }, - { - "epoch": 2.97, - "learning_rate": 4.168599930755804e-06, - "loss": 1.5382, - "step": 9612 - }, - { - "epoch": 2.97, - "learning_rate": 4.132970871518575e-06, - "loss": 1.5865, - "step": 9613 - }, - { - "epoch": 2.97, - "learning_rate": 4.0973418122813456e-06, - "loss": 1.5392, - "step": 9614 - }, - { - "epoch": 2.97, - "learning_rate": 4.0617127530441164e-06, - "loss": 1.6195, - "step": 9615 - }, - { - "epoch": 2.97, - "learning_rate": 4.026083693806887e-06, - "loss": 1.2314, - "step": 9616 - }, - { - "epoch": 2.97, - "learning_rate": 3.990454634569658e-06, - "loss": 1.4378, - "step": 9617 - }, - { - "epoch": 2.97, - "learning_rate": 3.954825575332429e-06, - "loss": 1.3567, - "step": 9618 - }, - { - "epoch": 2.97, - "learning_rate": 3.9191965160952e-06, - "loss": 1.2381, - "step": 9619 - }, - { - "epoch": 2.97, - "learning_rate": 3.883567456857971e-06, - "loss": 1.1308, - "step": 9620 - }, - { - "epoch": 2.97, - "learning_rate": 3.847938397620742e-06, - "loss": 1.4152, - "step": 9621 - }, - { - "epoch": 2.97, - "learning_rate": 3.812309338383513e-06, - "loss": 1.4504, - "step": 9622 - }, - { - "epoch": 2.97, - "learning_rate": 3.7766802791462834e-06, - "loss": 1.0987, - 
"step": 9623 - }, - { - "epoch": 2.97, - "learning_rate": 3.7410512199090546e-06, - "loss": 1.221, - "step": 9624 - }, - { - "epoch": 2.97, - "learning_rate": 3.7054221606718255e-06, - "loss": 1.0713, - "step": 9625 - }, - { - "epoch": 2.97, - "learning_rate": 3.6697931014345964e-06, - "loss": 1.0441, - "step": 9626 - }, - { - "epoch": 2.97, - "learning_rate": 3.6341640421973672e-06, - "loss": 0.8949, - "step": 9627 - }, - { - "epoch": 2.97, - "learning_rate": 3.5985349829601385e-06, - "loss": 0.9717, - "step": 9628 - }, - { - "epoch": 2.97, - "learning_rate": 3.562905923722909e-06, - "loss": 0.8657, - "step": 9629 - }, - { - "epoch": 2.97, - "learning_rate": 3.5272768644856802e-06, - "loss": 0.836, - "step": 9630 - }, - { - "epoch": 2.97, - "learning_rate": 3.4916478052484507e-06, - "loss": 3.2072, - "step": 9631 - }, - { - "epoch": 2.97, - "learning_rate": 3.456018746011222e-06, - "loss": 2.8631, - "step": 9632 - }, - { - "epoch": 2.97, - "learning_rate": 3.420389686773993e-06, - "loss": 2.7104, - "step": 9633 - }, - { - "epoch": 2.97, - "learning_rate": 3.3847606275367637e-06, - "loss": 2.6306, - "step": 9634 - }, - { - "epoch": 2.97, - "learning_rate": 3.3491315682995346e-06, - "loss": 2.8402, - "step": 9635 - }, - { - "epoch": 2.97, - "learning_rate": 3.313502509062305e-06, - "loss": 2.2221, - "step": 9636 - }, - { - "epoch": 2.97, - "learning_rate": 3.2778734498250763e-06, - "loss": 2.3995, - "step": 9637 - }, - { - "epoch": 2.97, - "learning_rate": 3.242244390587847e-06, - "loss": 2.1021, - "step": 9638 - }, - { - "epoch": 2.97, - "learning_rate": 3.206615331350618e-06, - "loss": 2.1753, - "step": 9639 - }, - { - "epoch": 2.98, - "learning_rate": 3.170986272113389e-06, - "loss": 2.2605, - "step": 9640 - }, - { - "epoch": 2.98, - "learning_rate": 3.13535721287616e-06, - "loss": 2.2511, - "step": 9641 - }, - { - "epoch": 2.98, - "learning_rate": 3.0997281536389306e-06, - "loss": 2.2129, - "step": 9642 - }, - { - "epoch": 2.98, - "learning_rate": 3.064099094401702e-06, - "loss": 2.0257, - "step": 9643 - }, - { - "epoch": 2.98, - "learning_rate": 3.0284700351644723e-06, - "loss": 2.0224, - "step": 9644 - }, - { - "epoch": 2.98, - "learning_rate": 2.9928409759272436e-06, - "loss": 1.9464, - "step": 9645 - }, - { - "epoch": 2.98, - "learning_rate": 2.957211916690014e-06, - "loss": 1.921, - "step": 9646 - }, - { - "epoch": 2.98, - "learning_rate": 2.9215828574527854e-06, - "loss": 1.9435, - "step": 9647 - }, - { - "epoch": 2.98, - "learning_rate": 2.8859537982155562e-06, - "loss": 2.2296, - "step": 9648 - }, - { - "epoch": 2.98, - "learning_rate": 2.8503247389783275e-06, - "loss": 1.7179, - "step": 9649 - }, - { - "epoch": 2.98, - "learning_rate": 2.814695679741098e-06, - "loss": 2.0163, - "step": 9650 - }, - { - "epoch": 2.98, - "learning_rate": 2.7790666205038692e-06, - "loss": 1.8746, - "step": 9651 - }, - { - "epoch": 2.98, - "learning_rate": 2.7434375612666397e-06, - "loss": 2.1264, - "step": 9652 - }, - { - "epoch": 2.98, - "learning_rate": 2.707808502029411e-06, - "loss": 1.6816, - "step": 9653 - }, - { - "epoch": 2.98, - "learning_rate": 2.6721794427921814e-06, - "loss": 1.9046, - "step": 9654 - }, - { - "epoch": 2.98, - "learning_rate": 2.6365503835549527e-06, - "loss": 1.7281, - "step": 9655 - }, - { - "epoch": 2.98, - "learning_rate": 2.6009213243177236e-06, - "loss": 1.7914, - "step": 9656 - }, - { - "epoch": 2.98, - "learning_rate": 2.565292265080495e-06, - "loss": 1.5076, - "step": 9657 - }, - { - "epoch": 2.98, - "learning_rate": 2.5296632058432657e-06, - "loss": 1.726, - 
"step": 9658 - }, - { - "epoch": 2.98, - "learning_rate": 2.494034146606036e-06, - "loss": 1.8808, - "step": 9659 - }, - { - "epoch": 2.98, - "learning_rate": 2.458405087368807e-06, - "loss": 1.754, - "step": 9660 - }, - { - "epoch": 2.98, - "learning_rate": 2.422776028131578e-06, - "loss": 1.7944, - "step": 9661 - }, - { - "epoch": 2.98, - "learning_rate": 2.3871469688943487e-06, - "loss": 1.672, - "step": 9662 - }, - { - "epoch": 2.98, - "learning_rate": 2.35151790965712e-06, - "loss": 1.3362, - "step": 9663 - }, - { - "epoch": 2.98, - "learning_rate": 2.315888850419891e-06, - "loss": 1.6027, - "step": 9664 - }, - { - "epoch": 2.98, - "learning_rate": 2.2802597911826617e-06, - "loss": 1.46, - "step": 9665 - }, - { - "epoch": 2.98, - "learning_rate": 2.2446307319454326e-06, - "loss": 1.4726, - "step": 9666 - }, - { - "epoch": 2.98, - "learning_rate": 2.2090016727082035e-06, - "loss": 1.4147, - "step": 9667 - }, - { - "epoch": 2.98, - "learning_rate": 2.1733726134709743e-06, - "loss": 1.3789, - "step": 9668 - }, - { - "epoch": 2.98, - "learning_rate": 2.137743554233745e-06, - "loss": 1.4164, - "step": 9669 - }, - { - "epoch": 2.98, - "learning_rate": 2.102114494996516e-06, - "loss": 1.2731, - "step": 9670 - }, - { - "epoch": 2.98, - "learning_rate": 2.0664854357592874e-06, - "loss": 1.1165, - "step": 9671 - }, - { - "epoch": 2.99, - "learning_rate": 2.0308563765220582e-06, - "loss": 1.0977, - "step": 9672 - }, - { - "epoch": 2.99, - "learning_rate": 1.995227317284829e-06, - "loss": 1.0438, - "step": 9673 - }, - { - "epoch": 2.99, - "learning_rate": 1.9595982580476e-06, - "loss": 1.0962, - "step": 9674 - }, - { - "epoch": 2.99, - "learning_rate": 1.923969198810371e-06, - "loss": 1.0614, - "step": 9675 - }, - { - "epoch": 2.99, - "learning_rate": 1.8883401395731417e-06, - "loss": 0.9073, - "step": 9676 - }, - { - "epoch": 2.99, - "learning_rate": 1.8527110803359128e-06, - "loss": 0.9881, - "step": 9677 - }, - { - "epoch": 2.99, - "learning_rate": 1.8170820210986836e-06, - "loss": 0.9171, - "step": 9678 - }, - { - "epoch": 2.99, - "learning_rate": 1.7814529618614545e-06, - "loss": 1.0174, - "step": 9679 - }, - { - "epoch": 2.99, - "learning_rate": 1.7458239026242253e-06, - "loss": 0.9757, - "step": 9680 - }, - { - "epoch": 2.99, - "learning_rate": 1.7101948433869964e-06, - "loss": 3.1418, - "step": 9681 - }, - { - "epoch": 2.99, - "learning_rate": 1.6745657841497673e-06, - "loss": 2.6657, - "step": 9682 - }, - { - "epoch": 2.99, - "learning_rate": 1.6389367249125381e-06, - "loss": 2.3569, - "step": 9683 - }, - { - "epoch": 2.99, - "learning_rate": 1.603307665675309e-06, - "loss": 2.6663, - "step": 9684 - }, - { - "epoch": 2.99, - "learning_rate": 1.56767860643808e-06, - "loss": 2.4925, - "step": 9685 - }, - { - "epoch": 2.99, - "learning_rate": 1.532049547200851e-06, - "loss": 2.1699, - "step": 9686 - }, - { - "epoch": 2.99, - "learning_rate": 1.4964204879636218e-06, - "loss": 2.2108, - "step": 9687 - }, - { - "epoch": 2.99, - "learning_rate": 1.4607914287263927e-06, - "loss": 2.2998, - "step": 9688 - }, - { - "epoch": 2.99, - "learning_rate": 1.4251623694891638e-06, - "loss": 2.2607, - "step": 9689 - }, - { - "epoch": 2.99, - "learning_rate": 1.3895333102519346e-06, - "loss": 2.1735, - "step": 9690 - }, - { - "epoch": 2.99, - "learning_rate": 1.3539042510147055e-06, - "loss": 2.2706, - "step": 9691 - }, - { - "epoch": 2.99, - "learning_rate": 1.3182751917774763e-06, - "loss": 1.9037, - "step": 9692 - }, - { - "epoch": 2.99, - "learning_rate": 1.2826461325402474e-06, - "loss": 1.8137, - 
"step": 9693 - }, - { - "epoch": 2.99, - "learning_rate": 1.247017073303018e-06, - "loss": 2.0697, - "step": 9694 - }, - { - "epoch": 2.99, - "learning_rate": 1.211388014065789e-06, - "loss": 2.0527, - "step": 9695 - }, - { - "epoch": 2.99, - "learning_rate": 1.17575895482856e-06, - "loss": 1.8727, - "step": 9696 - }, - { - "epoch": 2.99, - "learning_rate": 1.1401298955913309e-06, - "loss": 2.0334, - "step": 9697 - }, - { - "epoch": 2.99, - "learning_rate": 1.1045008363541017e-06, - "loss": 2.0171, - "step": 9698 - }, - { - "epoch": 2.99, - "learning_rate": 1.0688717771168726e-06, - "loss": 1.896, - "step": 9699 - }, - { - "epoch": 2.99, - "learning_rate": 1.0332427178796437e-06, - "loss": 1.8235, - "step": 9700 - }, - { - "epoch": 2.99, - "learning_rate": 9.976136586424145e-07, - "loss": 1.7797, - "step": 9701 - }, - { - "epoch": 2.99, - "learning_rate": 9.619845994051854e-07, - "loss": 2.0114, - "step": 9702 - }, - { - "epoch": 2.99, - "learning_rate": 9.263555401679564e-07, - "loss": 1.636, - "step": 9703 - }, - { - "epoch": 2.99, - "learning_rate": 8.907264809307272e-07, - "loss": 1.653, - "step": 9704 - }, - { - "epoch": 3.0, - "learning_rate": 8.550974216934982e-07, - "loss": 1.5532, - "step": 9705 - }, - { - "epoch": 3.0, - "learning_rate": 8.194683624562691e-07, - "loss": 1.6229, - "step": 9706 - }, - { - "epoch": 3.0, - "learning_rate": 7.8383930321904e-07, - "loss": 1.5868, - "step": 9707 - }, - { - "epoch": 3.0, - "learning_rate": 7.482102439818109e-07, - "loss": 1.5558, - "step": 9708 - }, - { - "epoch": 3.0, - "learning_rate": 7.125811847445819e-07, - "loss": 1.3796, - "step": 9709 - }, - { - "epoch": 3.0, - "learning_rate": 6.769521255073527e-07, - "loss": 1.3825, - "step": 9710 - }, - { - "epoch": 3.0, - "learning_rate": 6.413230662701237e-07, - "loss": 1.3212, - "step": 9711 - }, - { - "epoch": 3.0, - "learning_rate": 6.056940070328945e-07, - "loss": 1.2322, - "step": 9712 - }, - { - "epoch": 3.0, - "learning_rate": 5.700649477956654e-07, - "loss": 1.1678, - "step": 9713 - }, - { - "epoch": 3.0, - "learning_rate": 5.344358885584363e-07, - "loss": 1.2295, - "step": 9714 - }, - { - "epoch": 3.0, - "learning_rate": 4.988068293212073e-07, - "loss": 0.932, - "step": 9715 - }, - { - "epoch": 3.0, - "learning_rate": 4.631777700839782e-07, - "loss": 1.0625, - "step": 9716 - }, - { - "epoch": 3.0, - "learning_rate": 4.275487108467491e-07, - "loss": 0.994, - "step": 9717 - }, - { - "epoch": 3.0, - "learning_rate": 3.9191965160952e-07, - "loss": 1.0231, - "step": 9718 - }, - { - "epoch": 3.0, - "learning_rate": 3.5629059237229094e-07, - "loss": 0.9733, - "step": 9719 - }, - { - "epoch": 3.0, - "learning_rate": 3.2066153313506185e-07, - "loss": 0.9251, - "step": 9720 - }, - { - "epoch": 3.0, - "step": 9720, - "total_flos": 0.0, - "train_loss": 3.260483605049765, - "train_runtime": 93795.5595, - "train_samples_per_second": 6.633, - "train_steps_per_second": 0.104 - }, - { - "epoch": 3.0, - "step": 9720, - "total_flos": 0.0, - "train_loss": 0.0, - "train_runtime": 15.9016, - "train_samples_per_second": 39123.207, - "train_steps_per_second": 611.259 - }, - { - "epoch": 3.0, - "step": 9720, - "total_flos": 0.0, - "train_loss": 0.0, - "train_runtime": 8.7447, - "train_samples_per_second": 71142.69, - "train_steps_per_second": 1111.529 - }, - { - "epoch": 3.0, - "step": 9720, - "total_flos": 0.0, - "train_loss": 0.0, - "train_runtime": 8.4854, - "train_samples_per_second": 73316.525, - "train_steps_per_second": 1145.493 - }, - { - "epoch": 3.0, - "step": 9720, - "total_flos": 0.0, - 
"train_loss": 0.0, - "train_runtime": 8.6541, - "train_samples_per_second": 71887.743, - "train_steps_per_second": 1123.17 - } - ], - "max_steps": 9720, - "num_train_epochs": 3, - "total_flos": 0.0, - "trial_name": null, - "trial_params": null -} diff --git a/training_args.bin b/training_args.bin index 4dc3d7ddf284ecf0136c57ef3fa51e4d7bc325f8..fe34c93c009465054d769e79777793b7d5c2d2cc 100644 --- a/training_args.bin +++ b/training_args.bin @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d0d48663471605abd96c1360ba2f44538ceb5703ea64f57354b42be8d949c099 +oid sha256:8d4c647b971e03f3437cfa5c377bc5c5dd4ecd573f7c676852a13d5a49d1c8db size 3247 diff --git a/wandb/debug-cli.log b/wandb/debug-cli.log index 2df690fd61d92022b155129e9176c5569e79615f..b0c7eeb8cafb93622a88b2f35990794d778fa8d9 100644 --- a/wandb/debug-cli.log +++ b/wandb/debug-cli.log @@ -184,3 +184,135 @@ warmup_steps: 500 2022-05-05 16:48:04 INFO About to run command: python3 run_xtreme_s.py --overwrite_output_dir --freeze_feature_encoder --gradient_checkpointing --predict_with_generate --fp16 --group_by_length --do_train --do_eval --load_best_model_at_end --push_to_hub --use_auth_token --eval_split_name=test --eval_steps=500 --evaluation_strategy=steps --generation_max_length=40 --generation_num_beams=1 --gradient_accumulation_steps=8 --greater_is_better=True --hidden_dropout=0.16627274455555233 --language=fr.en --learning_rate=0.00022154311254852488 --logging_steps=1 --max_duration_in_seconds=20 --metric_for_best_model=bleu --model_name_or_path=./ --num_train_epochs=3 --output_dir=./ --per_device_eval_batch_size=8 --per_device_train_batch_size=8 --save_steps=500 --task=covost2 --warmup_steps=500 2022-05-05 16:48:09 INFO Running runs: ['lr7oonrp'] +2022-05-05 16:51:25 ERROR Detected 5 failed runs in a row, shutting down. 
+2022-05-05 16:51:25 INFO To change this value set WANDB_AGENT_MAX_INITIAL_FAILURES=val +2022-05-05 17:29:38 INFO Running runs: [] +2022-05-05 17:29:38 INFO Agent received command: run +2022-05-05 17:29:38 INFO Agent starting run with config: + eval_split_name: test + eval_steps: 500 + evaluation_strategy: steps + generation_max_length: 40 + generation_num_beams: 1 + gradient_accumulation_steps: 16 + greater_is_better: True + hidden_dropout: 0.2 + language: fr.en + learning_rate: 0.0007057712331944904 + logging_steps: 1 + max_duration_in_seconds: 20 + metric_for_best_model: bleu + model_name_or_path: ./ + num_train_epochs: 3 + output_dir: ./ + per_device_eval_batch_size: 8 + per_device_train_batch_size: 8 + save_steps: 500 + task: covost2 + warmup_steps: 500 +2022-05-05 17:29:38 INFO About to run command: python3 run_xtreme_s.py --overwrite_output_dir --freeze_feature_encoder --gradient_checkpointing --predict_with_generate --fp16 --group_by_length --do_train --do_eval --load_best_model_at_end --push_to_hub --use_auth_token --eval_split_name=test --eval_steps=500 --evaluation_strategy=steps --generation_max_length=40 --generation_num_beams=1 --gradient_accumulation_steps=16 --greater_is_better=True --hidden_dropout=0.2 --language=fr.en --learning_rate=0.0007057712331944904 --logging_steps=1 --max_duration_in_seconds=20 --metric_for_best_model=bleu --model_name_or_path=./ --num_train_epochs=3 --output_dir=./ --per_device_eval_batch_size=8 --per_device_train_batch_size=8 --save_steps=500 --task=covost2 --warmup_steps=500 +2022-05-05 17:29:43 INFO Running runs: ['rwbnpkt2'] +2022-05-05 17:29:50 INFO Running runs: [] +2022-05-05 17:29:51 INFO Agent received command: run +2022-05-05 17:29:51 INFO Agent starting run with config: + eval_split_name: test + eval_steps: 500 + evaluation_strategy: steps + generation_max_length: 40 + generation_num_beams: 1 + gradient_accumulation_steps: 16 + greater_is_better: True + hidden_dropout: 0.2 + language: fr.en + learning_rate: 0.0005587128574267087 + logging_steps: 1 + max_duration_in_seconds: 20 + metric_for_best_model: bleu + model_name_or_path: ./ + num_train_epochs: 3 + output_dir: ./ + per_device_eval_batch_size: 8 + per_device_train_batch_size: 8 + save_steps: 500 + task: covost2 + warmup_steps: 500 +2022-05-05 17:29:51 INFO About to run command: python3 run_xtreme_s.py --overwrite_output_dir --freeze_feature_encoder --gradient_checkpointing --predict_with_generate --fp16 --group_by_length --do_train --do_eval --load_best_model_at_end --push_to_hub --use_auth_token --eval_split_name=test --eval_steps=500 --evaluation_strategy=steps --generation_max_length=40 --generation_num_beams=1 --gradient_accumulation_steps=16 --greater_is_better=True --hidden_dropout=0.2 --language=fr.en --learning_rate=0.0005587128574267087 --logging_steps=1 --max_duration_in_seconds=20 --metric_for_best_model=bleu --model_name_or_path=./ --num_train_epochs=3 --output_dir=./ --per_device_eval_batch_size=8 --per_device_train_batch_size=8 --save_steps=500 --task=covost2 --warmup_steps=500 +2022-05-05 17:29:56 INFO Running runs: ['ydgnpqx2'] +2022-05-05 17:30:19 INFO Cleaning up finished run: rwbnpkt2 +2022-05-05 17:30:19 INFO Agent received command: run +2022-05-05 17:30:19 INFO Agent starting run with config: + eval_split_name: test + eval_steps: 500 + evaluation_strategy: steps + generation_max_length: 40 + generation_num_beams: 1 + gradient_accumulation_steps: 16 + greater_is_better: True + hidden_dropout: 0.2 + language: fr.en + learning_rate: 0.0003851276453057612 + 
logging_steps: 1 + max_duration_in_seconds: 20 + metric_for_best_model: bleu + model_name_or_path: ./ + num_train_epochs: 3 + output_dir: ./ + per_device_eval_batch_size: 8 + per_device_train_batch_size: 8 + save_steps: 500 + task: covost2 + warmup_steps: 500 +2022-05-05 17:30:19 INFO About to run command: python3 run_xtreme_s.py --overwrite_output_dir --freeze_feature_encoder --gradient_checkpointing --predict_with_generate --fp16 --group_by_length --do_train --do_eval --load_best_model_at_end --push_to_hub --use_auth_token --eval_split_name=test --eval_steps=500 --evaluation_strategy=steps --generation_max_length=40 --generation_num_beams=1 --gradient_accumulation_steps=16 --greater_is_better=True --hidden_dropout=0.2 --language=fr.en --learning_rate=0.0003851276453057612 --logging_steps=1 --max_duration_in_seconds=20 --metric_for_best_model=bleu --model_name_or_path=./ --num_train_epochs=3 --output_dir=./ --per_device_eval_batch_size=8 --per_device_train_batch_size=8 --save_steps=500 --task=covost2 --warmup_steps=500 +2022-05-05 17:30:24 INFO Running runs: ['xbi4p92m'] +2022-05-05 17:31:15 INFO Running runs: [] +2022-05-05 17:31:16 INFO Agent received command: run +2022-05-05 17:31:16 INFO Agent starting run with config: + eval_split_name: test + eval_steps: 500 + evaluation_strategy: steps + generation_max_length: 40 + generation_num_beams: 1 + gradient_accumulation_steps: 16 + greater_is_better: True + hidden_dropout: 0.2 + language: fr.en + learning_rate: 0.0003287457929573604 + logging_steps: 1 + max_duration_in_seconds: 20 + metric_for_best_model: bleu + model_name_or_path: ./ + num_train_epochs: 3 + output_dir: ./ + per_device_eval_batch_size: 8 + per_device_train_batch_size: 8 + save_steps: 500 + task: covost2 + warmup_steps: 500 +2022-05-05 17:31:16 INFO About to run command: python3 run_xtreme_s.py --overwrite_output_dir --freeze_feature_encoder --gradient_checkpointing --predict_with_generate --fp16 --group_by_length --do_train --do_eval --load_best_model_at_end --push_to_hub --use_auth_token --eval_split_name=test --eval_steps=500 --evaluation_strategy=steps --generation_max_length=40 --generation_num_beams=1 --gradient_accumulation_steps=16 --greater_is_better=True --hidden_dropout=0.2 --language=fr.en --learning_rate=0.0003287457929573604 --logging_steps=1 --max_duration_in_seconds=20 --metric_for_best_model=bleu --model_name_or_path=./ --num_train_epochs=3 --output_dir=./ --per_device_eval_batch_size=8 --per_device_train_batch_size=8 --save_steps=500 --task=covost2 --warmup_steps=500 +2022-05-05 17:31:21 INFO Running runs: ['ykkm1chu'] +2022-05-05 17:36:50 INFO Running runs: [] +2022-05-05 17:36:50 INFO Agent received command: run +2022-05-05 17:36:50 INFO Agent starting run with config: + eval_split_name: test + eval_steps: 500 + evaluation_strategy: steps + generation_max_length: 40 + generation_num_beams: 1 + gradient_accumulation_steps: 16 + greater_is_better: True + hidden_dropout: 0.2 + language: fr.en + learning_rate: 3e-05 + logging_steps: 1 + max_duration_in_seconds: 20 + metric_for_best_model: bleu + model_name_or_path: ./ + num_train_epochs: 3 + output_dir: ./ + per_device_eval_batch_size: 8 + per_device_train_batch_size: 8 + save_steps: 500 + task: covost2 + warmup_steps: 500 +2022-05-05 17:36:50 INFO About to run command: python3 run_xtreme_s.py --overwrite_output_dir --freeze_feature_encoder --gradient_checkpointing --predict_with_generate --fp16 --group_by_length --do_train --do_eval --load_best_model_at_end --push_to_hub --use_auth_token 
--eval_split_name=test --eval_steps=500 --evaluation_strategy=steps --generation_max_length=40 --generation_num_beams=1 --gradient_accumulation_steps=16 --greater_is_better=True --hidden_dropout=0.2 --language=fr.en --learning_rate=3e-05 --logging_steps=1 --max_duration_in_seconds=20 --metric_for_best_model=bleu --model_name_or_path=./ --num_train_epochs=3 --output_dir=./ --per_device_eval_batch_size=8 --per_device_train_batch_size=8 --save_steps=500 --task=covost2 --warmup_steps=500 +2022-05-05 17:36:55 INFO Running runs: ['b097rk18'] diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log index 58c5ec7da5ac28dd2a78bd294903ddbe4f051451..7d96e38617ae4a49a5357e18de7229b9e6b89003 120000 --- a/wandb/debug-internal.log +++ b/wandb/debug-internal.log @@ -1 +1 @@ -run-20220505_164902-lr7oonrp/logs/debug-internal.log \ No newline at end of file +run-20220505_173748-b097rk18/logs/debug-internal.log \ No newline at end of file diff --git a/wandb/debug.log b/wandb/debug.log index 82d5f1ae64f9fd55a283ddbb07fe94684571ed2d..8ece50b92477ca7fec5b5153010ee838848febfc 120000 --- a/wandb/debug.log +++ b/wandb/debug.log @@ -1 +1 @@ -run-20220505_164902-lr7oonrp/logs/debug.log \ No newline at end of file +run-20220505_173748-b097rk18/logs/debug.log \ No newline at end of file diff --git a/wandb/latest-run b/wandb/latest-run index 431fc61664a82df8f1c1d0ab92ae936ef5a1e7cc..292c21e3360a10ef428be59900a9da1701caaf9b 120000 --- a/wandb/latest-run +++ b/wandb/latest-run @@ -1 +1 @@ -run-20220505_164902-lr7oonrp \ No newline at end of file +run-20220505_173748-b097rk18 \ No newline at end of file diff --git a/wandb/run-20220505_164902-lr7oonrp/files/config.yaml b/wandb/run-20220505_164902-lr7oonrp/files/config.yaml index 605a57ab9d60c939856e09ba50a0c72e4870144e..8ba190a64dc2ed48415d9fc8312db23fba64ac2f 100644 --- a/wandb/run-20220505_164902-lr7oonrp/files/config.yaml +++ b/wandb/run-20220505_164902-lr7oonrp/files/config.yaml @@ -52,7 +52,16 @@ _wandb: - 5 - 11 - 12 + 2: + - 1 + - 2 + - 3 + - 5 + - 11 + - 12 3: + - 1 + - 7 - 13 4: 3.9.5 5: 0.12.10 diff --git a/wandb/run-20220505_164902-lr7oonrp/files/output.log b/wandb/run-20220505_164902-lr7oonrp/files/output.log index d59ad20cdb32307ad202e78fdc3d7d349bd37833..b7715fdc772e6d0f114786a2c1fc97012f1b5f3a 100644 --- a/wandb/run-20220505_164902-lr7oonrp/files/output.log +++ b/wandb/run-20220505_164902-lr7oonrp/files/output.log @@ -51,3 +51,25 @@ To https://huggingface.co/sanchit-gandhi/xtreme_s_xlsr_2_bart_covost2_fr_en Saving model checkpoint to ./ Configuration saved in ./config.json Model weights saved in ./pytorch_model.bin +Feature extractor saved in ./preprocessor_config.json +05/05/2022 16:51:11 - WARNING - huggingface_hub.repository - To https://huggingface.co/sanchit-gandhi/xtreme_s_xlsr_2_bart_covost2_fr_en + 9ad50a4..9638619 main -> main +To https://huggingface.co/sanchit-gandhi/xtreme_s_xlsr_2_bart_covost2_fr_en + 9ad50a4..9638619 main -> main +Traceback (most recent call last): + File "/home/sanchit_huggingface_co/xtreme_s_xlsr_2_bart_covost2_fr_en/run_xtreme_s.py", line 972, in + main() + File "/home/sanchit_huggingface_co/xtreme_s_xlsr_2_bart_covost2_fr_en/run_xtreme_s.py", line 964, in main + trainer.push_to_hub(**kwargs) + File "/home/sanchit_huggingface_co/transformers/src/transformers/trainer.py", line 2944, in push_to_hub + self.create_model_card(model_name=model_name, **kwargs) + File "/home/sanchit_huggingface_co/transformers/src/transformers/trainer.py", line 2841, in create_model_card + training_summary = TrainingSummary.from_trainer( + 
File "/home/sanchit_huggingface_co/transformers/src/transformers/modelcard.py", line 611, in from_trainer + return cls( + File "", line 16, in __init__ + File "/home/sanchit_huggingface_co/transformers/src/transformers/modelcard.py", line 401, in __post_init__ + info = model_info(self.finetuned_from) + File "/home/sanchit_huggingface_co/gcp/lib/python3.9/site-packages/huggingface_hub/hf_api.py", line 870, in model_info + return ModelInfo(**d) +TypeError: huggingface_hub.hf_api.ModelInfo() argument after ** must be a mapping, not list \ No newline at end of file diff --git a/wandb/run-20220505_164902-lr7oonrp/files/wandb-summary.json b/wandb/run-20220505_164902-lr7oonrp/files/wandb-summary.json index 078e072feadb034e036b6b542c03dc8f7cd54d70..ef23ece324522a9bc94a1b09fc62d8dba7c2b769 100644 --- a/wandb/run-20220505_164902-lr7oonrp/files/wandb-summary.json +++ b/wandb/run-20220505_164902-lr7oonrp/files/wandb-summary.json @@ -1 +1 @@ -{"train/train_runtime": 8.6541, "train/train_samples_per_second": 71887.743, "train/train_steps_per_second": 1123.17, "train/total_flos": 0.0, "train/train_loss": 0.0, "train/epoch": 3.0, "train/global_step": 9720, "_runtime": 8, "_timestamp": 1651769350, "_step": 0} \ No newline at end of file +{"train/train_runtime": 8.6541, "train/train_samples_per_second": 71887.743, "train/train_steps_per_second": 1123.17, "train/total_flos": 0.0, "train/train_loss": 0.0, "train/epoch": 3.0, "train/global_step": 9720, "_runtime": 8, "_timestamp": 1651769350, "_step": 0, "_wandb": {"runtime": 134}} \ No newline at end of file diff --git a/wandb/run-20220505_164902-lr7oonrp/logs/debug-internal.log b/wandb/run-20220505_164902-lr7oonrp/logs/debug-internal.log index 885dfcfae3ce834a043e4c8cf2b94b8cd6923997..096df0284d6209d3f3cc911781eda4ced95ef98b 100644 --- a/wandb/run-20220505_164902-lr7oonrp/logs/debug-internal.log +++ b/wandb/run-20220505_164902-lr7oonrp/logs/debug-internal.log @@ -71,3 +71,108 @@ 2022-05-05 16:51:05,586 DEBUG SenderThread:67587 [sender.py:send():235] send: stats 2022-05-05 16:51:06,466 DEBUG HandlerThread:67587 [handler.py:handle_request():131] handle_request: stop_status 2022-05-05 16:51:06,467 DEBUG SenderThread:67587 [sender.py:send_request():249] send_request: stop_status +2022-05-05 16:51:12,618 INFO Thread-8 :67587 [dir_watcher.py:_on_file_modified():230] file/dir modified: /home/sanchit_huggingface_co/xtreme_s_xlsr_2_bart_covost2_fr_en/wandb/run-20220505_164902-lr7oonrp/files/output.log +2022-05-05 16:51:16,619 INFO Thread-8 :67587 [dir_watcher.py:_on_file_modified():230] file/dir modified: /home/sanchit_huggingface_co/xtreme_s_xlsr_2_bart_covost2_fr_en/wandb/run-20220505_164902-lr7oonrp/files/output.log +2022-05-05 16:51:18,214 DEBUG HandlerThread:67587 [handler.py:handle_request():131] handle_request: poll_exit +2022-05-05 16:51:18,214 DEBUG SenderThread:67587 [sender.py:send():235] send: telemetry +2022-05-05 16:51:18,214 DEBUG SenderThread:67587 [sender.py:send():235] send: exit +2022-05-05 16:51:18,215 INFO SenderThread:67587 [sender.py:send_exit():371] handling exit code: 1 +2022-05-05 16:51:18,215 INFO SenderThread:67587 [sender.py:send_exit():373] handling runtime: 134 +2022-05-05 16:51:18,215 INFO SenderThread:67587 [sender.py:_save_file():944] saving file wandb-summary.json with policy end +2022-05-05 16:51:18,215 INFO SenderThread:67587 [sender.py:send_exit():379] send defer +2022-05-05 16:51:18,216 DEBUG SenderThread:67587 [sender.py:send_request():249] send_request: poll_exit +2022-05-05 16:51:18,216 DEBUG HandlerThread:67587 
[handler.py:handle_request():131] handle_request: defer +2022-05-05 16:51:18,216 INFO HandlerThread:67587 [handler.py:handle_request_defer():154] handle defer: 0 +2022-05-05 16:51:18,217 DEBUG SenderThread:67587 [sender.py:send_request():249] send_request: defer +2022-05-05 16:51:18,217 INFO SenderThread:67587 [sender.py:send_request_defer():388] handle sender defer: 0 +2022-05-05 16:51:18,217 INFO SenderThread:67587 [sender.py:transition_state():392] send defer: 1 +2022-05-05 16:51:18,217 DEBUG HandlerThread:67587 [handler.py:handle_request():131] handle_request: defer +2022-05-05 16:51:18,217 INFO HandlerThread:67587 [handler.py:handle_request_defer():154] handle defer: 1 +2022-05-05 16:51:18,266 DEBUG SenderThread:67587 [sender.py:send_request():249] send_request: defer +2022-05-05 16:51:18,266 INFO SenderThread:67587 [sender.py:send_request_defer():388] handle sender defer: 1 +2022-05-05 16:51:18,266 INFO SenderThread:67587 [sender.py:transition_state():392] send defer: 2 +2022-05-05 16:51:18,266 DEBUG SenderThread:67587 [sender.py:send():235] send: stats +2022-05-05 16:51:18,267 DEBUG HandlerThread:67587 [handler.py:handle_request():131] handle_request: defer +2022-05-05 16:51:18,267 INFO HandlerThread:67587 [handler.py:handle_request_defer():154] handle defer: 2 +2022-05-05 16:51:18,267 DEBUG SenderThread:67587 [sender.py:send_request():249] send_request: defer +2022-05-05 16:51:18,268 INFO SenderThread:67587 [sender.py:send_request_defer():388] handle sender defer: 2 +2022-05-05 16:51:18,268 INFO SenderThread:67587 [sender.py:transition_state():392] send defer: 3 +2022-05-05 16:51:18,268 DEBUG HandlerThread:67587 [handler.py:handle_request():131] handle_request: defer +2022-05-05 16:51:18,268 INFO HandlerThread:67587 [handler.py:handle_request_defer():154] handle defer: 3 +2022-05-05 16:51:18,268 DEBUG SenderThread:67587 [sender.py:send():235] send: summary +2022-05-05 16:51:18,269 INFO SenderThread:67587 [sender.py:_save_file():944] saving file wandb-summary.json with policy end +2022-05-05 16:51:18,269 DEBUG SenderThread:67587 [sender.py:send_request():249] send_request: defer +2022-05-05 16:51:18,270 INFO SenderThread:67587 [sender.py:send_request_defer():388] handle sender defer: 3 +2022-05-05 16:51:18,270 INFO SenderThread:67587 [sender.py:transition_state():392] send defer: 4 +2022-05-05 16:51:18,270 DEBUG HandlerThread:67587 [handler.py:handle_request():131] handle_request: defer +2022-05-05 16:51:18,270 INFO HandlerThread:67587 [handler.py:handle_request_defer():154] handle defer: 4 +2022-05-05 16:51:18,270 DEBUG SenderThread:67587 [sender.py:send_request():249] send_request: defer +2022-05-05 16:51:18,270 INFO SenderThread:67587 [sender.py:send_request_defer():388] handle sender defer: 4 +2022-05-05 16:51:18,318 DEBUG HandlerThread:67587 [handler.py:handle_request():131] handle_request: poll_exit +2022-05-05 16:51:18,620 INFO Thread-8 :67587 [dir_watcher.py:_on_file_modified():230] file/dir modified: /home/sanchit_huggingface_co/xtreme_s_xlsr_2_bart_covost2_fr_en/wandb/run-20220505_164902-lr7oonrp/files/output.log +2022-05-05 16:51:18,620 INFO Thread-8 :67587 [dir_watcher.py:_on_file_modified():230] file/dir modified: /home/sanchit_huggingface_co/xtreme_s_xlsr_2_bart_covost2_fr_en/wandb/run-20220505_164902-lr7oonrp/files/wandb-summary.json +2022-05-05 16:51:18,806 INFO SenderThread:67587 [sender.py:transition_state():392] send defer: 5 +2022-05-05 16:51:18,807 DEBUG SenderThread:67587 [sender.py:send_request():249] send_request: poll_exit +2022-05-05 16:51:18,807 DEBUG 
HandlerThread:67587 [handler.py:handle_request():131] handle_request: defer +2022-05-05 16:51:18,808 INFO HandlerThread:67587 [handler.py:handle_request_defer():154] handle defer: 5 +2022-05-05 16:51:18,808 DEBUG SenderThread:67587 [sender.py:send_request():249] send_request: defer +2022-05-05 16:51:18,808 INFO SenderThread:67587 [sender.py:send_request_defer():388] handle sender defer: 5 +2022-05-05 16:51:18,808 INFO SenderThread:67587 [dir_watcher.py:finish():283] shutting down directory watcher +2022-05-05 16:51:18,909 DEBUG HandlerThread:67587 [handler.py:handle_request():131] handle_request: poll_exit +2022-05-05 16:51:19,621 INFO Thread-8 :67587 [dir_watcher.py:_on_file_modified():230] file/dir modified: /home/sanchit_huggingface_co/xtreme_s_xlsr_2_bart_covost2_fr_en/wandb/run-20220505_164902-lr7oonrp/files/config.yaml +2022-05-05 16:51:19,621 INFO SenderThread:67587 [dir_watcher.py:finish():313] scan: /home/sanchit_huggingface_co/xtreme_s_xlsr_2_bart_covost2_fr_en/wandb/run-20220505_164902-lr7oonrp/files +2022-05-05 16:51:19,621 INFO SenderThread:67587 [dir_watcher.py:finish():327] scan save: /home/sanchit_huggingface_co/xtreme_s_xlsr_2_bart_covost2_fr_en/wandb/run-20220505_164902-lr7oonrp/files/wandb-metadata.json wandb-metadata.json +2022-05-05 16:51:19,621 INFO SenderThread:67587 [dir_watcher.py:finish():327] scan save: /home/sanchit_huggingface_co/xtreme_s_xlsr_2_bart_covost2_fr_en/wandb/run-20220505_164902-lr7oonrp/files/output.log output.log +2022-05-05 16:51:19,622 INFO SenderThread:67587 [dir_watcher.py:finish():327] scan save: /home/sanchit_huggingface_co/xtreme_s_xlsr_2_bart_covost2_fr_en/wandb/run-20220505_164902-lr7oonrp/files/wandb-summary.json wandb-summary.json +2022-05-05 16:51:19,622 INFO SenderThread:67587 [dir_watcher.py:finish():327] scan save: /home/sanchit_huggingface_co/xtreme_s_xlsr_2_bart_covost2_fr_en/wandb/run-20220505_164902-lr7oonrp/files/requirements.txt requirements.txt +2022-05-05 16:51:19,625 INFO SenderThread:67587 [dir_watcher.py:finish():327] scan save: /home/sanchit_huggingface_co/xtreme_s_xlsr_2_bart_covost2_fr_en/wandb/run-20220505_164902-lr7oonrp/files/config.yaml config.yaml +2022-05-05 16:51:19,625 INFO SenderThread:67587 [sender.py:transition_state():392] send defer: 6 +2022-05-05 16:51:19,626 DEBUG SenderThread:67587 [sender.py:send_request():249] send_request: poll_exit +2022-05-05 16:51:19,629 DEBUG HandlerThread:67587 [handler.py:handle_request():131] handle_request: defer +2022-05-05 16:51:19,629 INFO HandlerThread:67587 [handler.py:handle_request_defer():154] handle defer: 6 +2022-05-05 16:51:19,629 DEBUG SenderThread:67587 [sender.py:send_request():249] send_request: defer +2022-05-05 16:51:19,629 INFO SenderThread:67587 [sender.py:send_request_defer():388] handle sender defer: 6 +2022-05-05 16:51:19,629 INFO SenderThread:67587 [file_pusher.py:finish():177] shutting down file pusher +2022-05-05 16:51:19,727 DEBUG HandlerThread:67587 [handler.py:handle_request():131] handle_request: poll_exit +2022-05-05 16:51:19,728 DEBUG SenderThread:67587 [sender.py:send_request():249] send_request: poll_exit +2022-05-05 16:51:19,830 DEBUG HandlerThread:67587 [handler.py:handle_request():131] handle_request: poll_exit +2022-05-05 16:51:19,830 DEBUG SenderThread:67587 [sender.py:send_request():249] send_request: poll_exit +2022-05-05 16:51:19,891 INFO Thread-14 :67587 [upload_job.py:push():137] Uploaded file /home/sanchit_huggingface_co/xtreme_s_xlsr_2_bart_covost2_fr_en/wandb/run-20220505_164902-lr7oonrp/files/requirements.txt +2022-05-05 
16:51:19,902 INFO Thread-12 :67587 [upload_job.py:push():137] Uploaded file /home/sanchit_huggingface_co/xtreme_s_xlsr_2_bart_covost2_fr_en/wandb/run-20220505_164902-lr7oonrp/files/output.log +2022-05-05 16:51:19,909 INFO Thread-13 :67587 [upload_job.py:push():137] Uploaded file /home/sanchit_huggingface_co/xtreme_s_xlsr_2_bart_covost2_fr_en/wandb/run-20220505_164902-lr7oonrp/files/wandb-summary.json +2022-05-05 16:51:19,933 INFO Thread-15 :67587 [upload_job.py:push():137] Uploaded file /home/sanchit_huggingface_co/xtreme_s_xlsr_2_bart_covost2_fr_en/wandb/run-20220505_164902-lr7oonrp/files/config.yaml +2022-05-05 16:51:19,934 DEBUG HandlerThread:67587 [handler.py:handle_request():131] handle_request: poll_exit +2022-05-05 16:51:19,934 DEBUG SenderThread:67587 [sender.py:send_request():249] send_request: poll_exit +2022-05-05 16:51:20,036 DEBUG HandlerThread:67587 [handler.py:handle_request():131] handle_request: poll_exit +2022-05-05 16:51:20,037 DEBUG SenderThread:67587 [sender.py:send_request():249] send_request: poll_exit +2022-05-05 16:51:20,134 INFO Thread-7 :67587 [sender.py:transition_state():392] send defer: 7 +2022-05-05 16:51:20,135 DEBUG HandlerThread:67587 [handler.py:handle_request():131] handle_request: defer +2022-05-05 16:51:20,135 INFO HandlerThread:67587 [handler.py:handle_request_defer():154] handle defer: 7 +2022-05-05 16:51:20,135 DEBUG SenderThread:67587 [sender.py:send_request():249] send_request: defer +2022-05-05 16:51:20,135 INFO SenderThread:67587 [sender.py:send_request_defer():388] handle sender defer: 7 +2022-05-05 16:51:20,139 DEBUG HandlerThread:67587 [handler.py:handle_request():131] handle_request: poll_exit +2022-05-05 16:51:20,296 INFO SenderThread:67587 [sender.py:transition_state():392] send defer: 8 +2022-05-05 16:51:20,296 DEBUG SenderThread:67587 [sender.py:send_request():249] send_request: poll_exit +2022-05-05 16:51:20,297 DEBUG HandlerThread:67587 [handler.py:handle_request():131] handle_request: defer +2022-05-05 16:51:20,297 INFO HandlerThread:67587 [handler.py:handle_request_defer():154] handle defer: 8 +2022-05-05 16:51:20,297 DEBUG SenderThread:67587 [sender.py:send_request():249] send_request: defer +2022-05-05 16:51:20,297 INFO SenderThread:67587 [sender.py:send_request_defer():388] handle sender defer: 8 +2022-05-05 16:51:20,297 INFO SenderThread:67587 [sender.py:transition_state():392] send defer: 9 +2022-05-05 16:51:20,298 DEBUG HandlerThread:67587 [handler.py:handle_request():131] handle_request: defer +2022-05-05 16:51:20,298 DEBUG SenderThread:67587 [sender.py:send():235] send: final +2022-05-05 16:51:20,298 INFO HandlerThread:67587 [handler.py:handle_request_defer():154] handle defer: 9 +2022-05-05 16:51:20,298 DEBUG SenderThread:67587 [sender.py:send():235] send: footer +2022-05-05 16:51:20,299 DEBUG SenderThread:67587 [sender.py:send_request():249] send_request: defer +2022-05-05 16:51:20,299 INFO SenderThread:67587 [sender.py:send_request_defer():388] handle sender defer: 9 +2022-05-05 16:51:20,398 DEBUG HandlerThread:67587 [handler.py:handle_request():131] handle_request: poll_exit +2022-05-05 16:51:20,398 DEBUG SenderThread:67587 [sender.py:send_request():249] send_request: poll_exit +2022-05-05 16:51:20,398 INFO SenderThread:67587 [file_pusher.py:join():182] waiting for file pusher +2022-05-05 16:51:21,569 DEBUG HandlerThread:67587 [handler.py:handle_request():131] handle_request: get_summary +2022-05-05 16:51:21,570 DEBUG HandlerThread:67587 [handler.py:handle_request():131] handle_request: sampled_history +2022-05-05 
16:51:21,571 DEBUG HandlerThread:67587 [handler.py:handle_request():131] handle_request: shutdown +2022-05-05 16:51:21,571 INFO HandlerThread:67587 [handler.py:finish():739] shutting down handler +2022-05-05 16:51:22,298 INFO WriterThread:67587 [datastore.py:close():281] close: /home/sanchit_huggingface_co/xtreme_s_xlsr_2_bart_covost2_fr_en/wandb/run-20220505_164902-lr7oonrp/run-lr7oonrp.wandb +2022-05-05 16:51:22,568 INFO SenderThread:67587 [sender.py:finish():1075] shutting down sender +2022-05-05 16:51:22,568 INFO SenderThread:67587 [file_pusher.py:finish():177] shutting down file pusher +2022-05-05 16:51:22,568 INFO SenderThread:67587 [file_pusher.py:join():182] waiting for file pusher +2022-05-05 16:51:22,570 INFO MainThread:67587 [internal.py:handle_exit():79] Internal process exited diff --git a/wandb/run-20220505_164902-lr7oonrp/logs/debug.log b/wandb/run-20220505_164902-lr7oonrp/logs/debug.log index 114103c49cd4440cd30110ea376bcc6892b36b6f..7aa955b9271d77119dafad498c0840464a99a92d 100644 --- a/wandb/run-20220505_164902-lr7oonrp/logs/debug.log +++ b/wandb/run-20220505_164902-lr7oonrp/logs/debug.log @@ -26,3 +26,85 @@ config: {} 2022-05-05 16:49:06,070 INFO MainThread:67410 [wandb_init.py:init():651] run started, returning control to user process 2022-05-05 16:49:06,073 INFO MainThread:67410 [wandb_run.py:_config_callback():966] config_cb None None {'return_dict': True, 'output_hidden_states': False, 'output_attentions': False, 'torchscript': False, 'torch_dtype': 'torch.float32', 'use_bfloat16': False, 'pruned_heads': {}, 'tie_word_embeddings': False, 'is_encoder_decoder': True, 'is_decoder': False, 'cross_attention_hidden_size': None, 'add_cross_attention': False, 'tie_encoder_decoder': False, 'max_length': 40, 'min_length': 0, 'do_sample': False, 'early_stopping': False, 'num_beams': 1, 'num_beam_groups': 1, 'diversity_penalty': 0.0, 'temperature': 1.0, 'top_k': 50, 'top_p': 1.0, 'typical_p': 1.0, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'no_repeat_ngram_size': 0, 'encoder_no_repeat_ngram_size': 0, 'bad_words_ids': None, 'num_return_sequences': 1, 'chunk_size_feed_forward': 0, 'output_scores': False, 'return_dict_in_generate': False, 'forced_bos_token_id': None, 'forced_eos_token_id': None, 'remove_invalid_values': False, 'exponential_decay_length_penalty': None, 'architectures': ['SpeechEncoderDecoderModel'], 'finetuning_task': None, 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'tokenizer_class': None, 'prefix': None, 'bos_token_id': None, 'pad_token_id': 1, 'eos_token_id': 2, 'sep_token_id': None, 'decoder_start_token_id': 0, 'task_specific_params': None, 'problem_type': None, '_name_or_path': './', 'transformers_version': None, 'decoder': {'vocab_size': 50265, 'max_position_embeddings': 1024, 'd_model': 1024, 'encoder_ffn_dim': 4096, 'encoder_layers': 12, 'encoder_attention_heads': 16, 'decoder_ffn_dim': 4096, 'decoder_layers': 12, 'decoder_attention_heads': 16, 'dropout': 0.1, 'attention_dropout': 0.1, 'activation_dropout': 0.1, 'activation_function': 'gelu', 'init_std': 0.02, 'encoder_layerdrop': 0.0, 'decoder_layerdrop': 0.0, 'classifier_dropout': 0.0, 'use_cache': True, 'num_hidden_layers': 12, 'scale_embedding': False, 'return_dict': True, 'output_hidden_states': False, 'output_attentions': False, 'torchscript': False, 'torch_dtype': None, 'use_bfloat16': False, 'pruned_heads': {}, 'tie_word_embeddings': True, 'is_encoder_decoder': False, 'is_decoder': True, 'cross_attention_hidden_size': None, 'add_cross_attention': True, 
'tie_encoder_decoder': False, 'max_length': 20, 'min_length': 0, 'do_sample': False, 'early_stopping': True, 'num_beams': 4, 'num_beam_groups': 1, 'diversity_penalty': 0.0, 'temperature': 1.0, 'top_k': 50, 'top_p': 1.0, 'typical_p': 1.0, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'no_repeat_ngram_size': 3, 'encoder_no_repeat_ngram_size': 0, 'bad_words_ids': None, 'num_return_sequences': 1, 'chunk_size_feed_forward': 0, 'output_scores': False, 'return_dict_in_generate': False, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2, 'remove_invalid_values': False, 'exponential_decay_length_penalty': None, 'architectures': ['BartModel'], 'finetuning_task': None, 'id2label': {0: 'LABEL_0', 1: 'LABEL_1', 2: 'LABEL_2'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1, 'LABEL_2': 2}, 'tokenizer_class': None, 'prefix': None, 'bos_token_id': 0, 'pad_token_id': 1, 'eos_token_id': 2, 'sep_token_id': None, 'decoder_start_token_id': 2, 'task_specific_params': {'summarization': {'length_penalty': 1.0, 'max_length': 128, 'min_length': 12, 'num_beams': 4}, 'summarization_cnn': {'length_penalty': 2.0, 'max_length': 142, 'min_length': 56, 'num_beams': 4}, 'summarization_xsum': {'length_penalty': 1.0, 'max_length': 62, 'min_length': 11, 'num_beams': 6}}, 'problem_type': None, '_name_or_path': 'facebook/bart-large', 'transformers_version': '4.19.0.dev0', 'add_bias_logits': False, 'add_final_layer_norm': False, 'classif_dropout': 0.1, 'gradient_checkpointing': False, 'normalize_before': False, 'model_type': 'bart'}, 'encoder': {'return_dict': True, 'output_hidden_states': False, 'output_attentions': False, 'torchscript': False, 'torch_dtype': 'float32', 'use_bfloat16': False, 'pruned_heads': {}, 'tie_word_embeddings': True, 'is_encoder_decoder': False, 'is_decoder': False, 'cross_attention_hidden_size': None, 'add_cross_attention': False, 'tie_encoder_decoder': False, 'max_length': 20, 'min_length': 0, 'do_sample': False, 'early_stopping': False, 'num_beams': 1, 'num_beam_groups': 1, 'diversity_penalty': 0.0, 'temperature': 1.0, 'top_k': 50, 'top_p': 1.0, 'typical_p': 1.0, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'no_repeat_ngram_size': 0, 'encoder_no_repeat_ngram_size': 0, 'bad_words_ids': None, 'num_return_sequences': 1, 'chunk_size_feed_forward': 0, 'output_scores': False, 'return_dict_in_generate': False, 'forced_bos_token_id': None, 'forced_eos_token_id': None, 'remove_invalid_values': False, 'exponential_decay_length_penalty': None, 'architectures': ['Wav2Vec2ForPreTraining'], 'finetuning_task': None, 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'tokenizer_class': None, 'prefix': None, 'bos_token_id': 1, 'pad_token_id': 0, 'eos_token_id': 2, 'sep_token_id': None, 'decoder_start_token_id': None, 'task_specific_params': None, 'problem_type': None, '_name_or_path': 'facebook/wav2vec2-xls-r-300m', 'transformers_version': '4.19.0.dev0', 'feat_extract_dropout': 0.0, 'gradient_checkpointing': False, 'num_feat_extract_layers': 7, 'hidden_size': 1024, 'feat_extract_norm': 'layer', 'feat_extract_activation': 'gelu', 'conv_dim': [512, 512, 512, 512, 512, 512, 512], 'conv_stride': [5, 2, 2, 2, 2, 2, 2], 'conv_kernel': [10, 3, 3, 3, 3, 2, 2], 'conv_bias': True, 'num_conv_pos_embeddings': 128, 'num_conv_pos_embedding_groups': 16, 'num_hidden_layers': 24, 'intermediate_size': 4096, 'hidden_act': 'gelu', 'num_attention_heads': 16, 'hidden_dropout': 0.16627274455555233, 'attention_dropout': 0.1, 'activation_dropout': 0.0, 'feat_proj_dropout': 0.0, 'final_dropout': 0.0, 
'layerdrop': 0.0, 'layer_norm_eps': 1e-05, 'initializer_range': 0.02, 'vocab_size': 32, 'do_stable_layer_norm': True, 'use_weighted_layer_sum': False, 'apply_spec_augment': True, 'mask_time_prob': 0.1, 'mask_time_length': 10, 'mask_time_min_masks': 2, 'mask_feature_prob': 0.0, 'mask_feature_length': 10, 'mask_feature_min_masks': 0, 'num_codevectors_per_group': 320, 'num_codevector_groups': 2, 'contrastive_logits_temperature': 0.1, 'feat_quantizer_dropout': 0.0, 'num_negatives': 100, 'codevector_dim': 768, 'proj_codevector_dim': 768, 'diversity_loss_weight': 0.1, 'ctc_loss_reduction': 'sum', 'ctc_zero_infinity': False, 'add_adapter': True, 'adapter_kernel_size': 3, 'adapter_stride': 2, 'num_adapter_layers': 3, 'output_hidden_size': 1024, 'classifier_proj_size': 256, 'tdnn_dim': [512, 512, 512, 512, 1500], 'tdnn_kernel': [5, 3, 3, 1, 1], 'tdnn_dilation': [1, 2, 3, 1, 1], 'xvector_output_dim': 512, 'model_type': 'wav2vec2'}, 'model_type': 'speech-encoder-decoder', 'processor_class': 'Wav2Vec2Processor', 'use_cache': False, 'overwrite_output_dir': True, 'do_train': True, 'do_eval': True, 'do_predict': False, 'prediction_loss_only': False, 'per_gpu_train_batch_size': 'None', 'per_gpu_eval_batch_size': 'None', 'eval_accumulation_steps': 'None', 'eval_delay': 0, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'max_steps': -1, 'lr_scheduler_type': 'linear', 'warmup_ratio': 0.0, 'log_level': -1, 'log_level_replica': -1, 'log_on_each_node': True, 'logging_dir': './runs/May05_16-48-07_sanchit--v100', 'logging_strategy': 'steps', 'logging_first_step': False, 'logging_nan_inf_filter': True, 'save_strategy': 'steps', 'save_total_limit': 'None', 'save_on_each_node': False, 'no_cuda': False, 'seed': 42, 'data_seed': 'None', 'bf16': False, 'fp16': True, 'fp16_opt_level': 'O1', 'half_precision_backend': 'amp', 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': 'None', 'local_rank': -1, 'xpu_backend': 'None', 'tpu_num_cores': 'None', 'tpu_metrics_debug': False, 'debug': '[]', 'dataloader_drop_last': False, 'dataloader_num_workers': 0, 'past_index': -1, 'run_name': './', 'disable_tqdm': False, 'remove_unused_columns': True, 'label_names': 'None', 'load_best_model_at_end': True, 'ignore_data_skip': False, 'sharded_ddp': '[]', 'deepspeed': 'None', 'label_smoothing_factor': 0.0, 'optim': 'adamw_hf', 'adafactor': False, 'group_by_length': True, 'length_column_name': 'length', 'report_to': "['tensorboard', 'wandb', 'codecarbon']", 'ddp_find_unused_parameters': 'None', 'ddp_bucket_cap_mb': 'None', 'dataloader_pin_memory': True, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': True, 'resume_from_checkpoint': 'None', 'hub_model_id': 'None', 'hub_strategy': 'every_save', 'hub_token': '', 'hub_private_repo': False, 'gradient_checkpointing': True, 'include_inputs_for_metrics': False, 'fp16_backend': 'auto', 'push_to_hub_model_id': 'None', 'push_to_hub_organization': 'None', 'push_to_hub_token': '', '_n_gpu': 1, 'mp_parameters': '', 'sortish_sampler': False, 'predict_with_generate': True, 'train_batch_size': 8, 'eval_batch_size': 8} 2022-05-05 16:49:06,076 INFO MainThread:67410 [wandb_watch.py:watch():43] Watching +2022-05-05 16:51:15,383 INFO MainThread:67410 [wandb_run.py:_atexit_cleanup():1797] got exitcode: 1 +2022-05-05 16:51:15,388 INFO MainThread:67410 [wandb_run.py:_restore():1769] restore +2022-05-05 16:51:18,216 INFO MainThread:67410 [wandb_run.py:_wait_for_finish():1929] got exit ret: file_counts { + wandb_count: 1 
+} +pusher_stats { + uploaded_bytes: 1972 + total_bytes: 1972 +} + +2022-05-05 16:51:18,807 INFO MainThread:67410 [wandb_run.py:_wait_for_finish():1929] got exit ret: file_counts { + wandb_count: 1 +} +pusher_stats { + uploaded_bytes: 1972 + total_bytes: 1972 +} + +2022-05-05 16:51:19,626 INFO MainThread:67410 [wandb_run.py:_wait_for_finish():1929] got exit ret: file_counts { + wandb_count: 5 +} +pusher_stats { + uploaded_bytes: 1972 + total_bytes: 27373 +} + +2022-05-05 16:51:19,729 INFO MainThread:67410 [wandb_run.py:_wait_for_finish():1929] got exit ret: file_counts { + wandb_count: 5 +} +pusher_stats { + uploaded_bytes: 27373 + total_bytes: 27373 +} + +2022-05-05 16:51:19,831 INFO MainThread:67410 [wandb_run.py:_wait_for_finish():1929] got exit ret: file_counts { + wandb_count: 5 +} +pusher_stats { + uploaded_bytes: 27373 + total_bytes: 27373 +} + +2022-05-05 16:51:19,935 INFO MainThread:67410 [wandb_run.py:_wait_for_finish():1929] got exit ret: file_counts { + wandb_count: 5 +} +pusher_stats { + uploaded_bytes: 27373 + total_bytes: 27373 +} + +2022-05-05 16:51:20,038 INFO MainThread:67410 [wandb_run.py:_wait_for_finish():1929] got exit ret: file_counts { + wandb_count: 5 +} +pusher_stats { + uploaded_bytes: 27373 + total_bytes: 27373 +} + +2022-05-05 16:51:20,297 INFO MainThread:67410 [wandb_run.py:_wait_for_finish():1929] got exit ret: file_counts { + wandb_count: 5 +} +pusher_stats { + uploaded_bytes: 27373 + total_bytes: 27373 +} + +2022-05-05 16:51:21,568 INFO MainThread:67410 [wandb_run.py:_wait_for_finish():1929] got exit ret: done: true +exit_result { +} +file_counts { + wandb_count: 5 +} +pusher_stats { + uploaded_bytes: 27373 + total_bytes: 27373 +} +local_info { +} + +2022-05-05 16:51:23,149 INFO MainThread:67410 [wandb_run.py:_append_history():2144] rendering history +2022-05-05 16:51:23,149 INFO MainThread:67410 [wandb_run.py:_append_summary():2102] rendering summary +2022-05-05 16:51:23,149 INFO MainThread:67410 [wandb_run.py:_append_files():2194] logging synced files diff --git a/wandb/run-20220505_164902-lr7oonrp/run-lr7oonrp.wandb b/wandb/run-20220505_164902-lr7oonrp/run-lr7oonrp.wandb index d3172f8f6d80631dea24bde9c6a1d4c58a4726da..ef48511c39dc89e4cf657d554a9eee228ca21b73 100644 Binary files a/wandb/run-20220505_164902-lr7oonrp/run-lr7oonrp.wandb and b/wandb/run-20220505_164902-lr7oonrp/run-lr7oonrp.wandb differ diff --git a/wandb/run-20220505_173213-ykkm1chu/files/config.yaml b/wandb/run-20220505_173213-ykkm1chu/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..55404a137c067017538875e2a85757ff8848e389 --- /dev/null +++ b/wandb/run-20220505_173213-ykkm1chu/files/config.yaml @@ -0,0 +1,788 @@ +wandb_version: 1 + +_n_gpu: + desc: null + value: 1 +_name_or_path: + desc: null + value: ./ +_wandb: + desc: null + value: + cli_version: 0.12.10 + framework: huggingface + huggingface_version: 4.19.0.dev0 + is_jupyter_run: false + is_kaggle_kernel: false + m: + - 1: train/global_step + 6: + - 3 + - 1: train/train_runtime + 5: 1 + 6: + - 1 + - 1: train/train_samples_per_second + 5: 1 + 6: + - 1 + - 1: train/train_steps_per_second + 5: 1 + 6: + - 1 + - 1: train/total_flos + 5: 1 + 6: + - 1 + - 1: train/train_loss + 5: 1 + 6: + - 1 + - 1: train/epoch + 5: 1 + 6: + - 1 + python_version: 3.9.5 + start_time: 1651771933 + t: + 1: + - 1 + - 2 + - 3 + - 5 + - 11 + - 12 + 3: + - 13 + 4: 3.9.5 + 5: 0.12.10 + 6: 4.19.0.dev0 + 8: + - 5 +adafactor: + desc: null + value: false +adam_beta1: + desc: null + value: 0.9 +adam_beta2: + desc: null + value: 0.999 
+adam_epsilon: + desc: null + value: 1.0e-08 +add_cross_attention: + desc: null + value: false +architectures: + desc: null + value: + - SpeechEncoderDecoderModel +bad_words_ids: + desc: null + value: null +bf16: + desc: null + value: false +bf16_full_eval: + desc: null + value: false +bos_token_id: + desc: null + value: null +chunk_size_feed_forward: + desc: null + value: 0 +cross_attention_hidden_size: + desc: null + value: null +data_seed: + desc: null + value: None +dataloader_drop_last: + desc: null + value: false +dataloader_num_workers: + desc: null + value: 0 +dataloader_pin_memory: + desc: null + value: true +ddp_bucket_cap_mb: + desc: null + value: None +ddp_find_unused_parameters: + desc: null + value: None +debug: + desc: null + value: '[]' +decoder: + desc: null + value: + _name_or_path: facebook/bart-large + activation_dropout: 0.1 + activation_function: gelu + add_bias_logits: false + add_cross_attention: true + add_final_layer_norm: false + architectures: + - BartModel + attention_dropout: 0.1 + bad_words_ids: null + bos_token_id: 0 + chunk_size_feed_forward: 0 + classif_dropout: 0.1 + classifier_dropout: 0.0 + cross_attention_hidden_size: null + d_model: 1024 + decoder_attention_heads: 16 + decoder_ffn_dim: 4096 + decoder_layerdrop: 0.0 + decoder_layers: 12 + decoder_start_token_id: 2 + diversity_penalty: 0.0 + do_sample: false + dropout: 0.1 + early_stopping: true + encoder_attention_heads: 16 + encoder_ffn_dim: 4096 + encoder_layerdrop: 0.0 + encoder_layers: 12 + encoder_no_repeat_ngram_size: 0 + eos_token_id: 2 + exponential_decay_length_penalty: null + finetuning_task: null + forced_bos_token_id: 0 + forced_eos_token_id: 2 + gradient_checkpointing: false + id2label: + '0': LABEL_0 + '1': LABEL_1 + '2': LABEL_2 + init_std: 0.02 + is_decoder: true + is_encoder_decoder: false + label2id: + LABEL_0: 0 + LABEL_1: 1 + LABEL_2: 2 + length_penalty: 1.0 + max_length: 20 + max_position_embeddings: 1024 + min_length: 0 + model_type: bart + no_repeat_ngram_size: 3 + normalize_before: false + num_beam_groups: 1 + num_beams: 4 + num_hidden_layers: 12 + num_return_sequences: 1 + output_attentions: false + output_hidden_states: false + output_scores: false + pad_token_id: 1 + prefix: null + problem_type: null + pruned_heads: {} + remove_invalid_values: false + repetition_penalty: 1.0 + return_dict: true + return_dict_in_generate: false + scale_embedding: false + sep_token_id: null + task_specific_params: + summarization: + length_penalty: 1.0 + max_length: 128 + min_length: 12 + num_beams: 4 + summarization_cnn: + length_penalty: 2.0 + max_length: 142 + min_length: 56 + num_beams: 4 + summarization_xsum: + length_penalty: 1.0 + max_length: 62 + min_length: 11 + num_beams: 6 + temperature: 1.0 + tie_encoder_decoder: false + tie_word_embeddings: true + tokenizer_class: null + top_k: 50 + top_p: 1.0 + torch_dtype: null + torchscript: false + transformers_version: 4.19.0.dev0 + typical_p: 1.0 + use_bfloat16: false + use_cache: true + vocab_size: 50265 +decoder_start_token_id: + desc: null + value: 0 +deepspeed: + desc: null + value: None +disable_tqdm: + desc: null + value: false +diversity_penalty: + desc: null + value: 0.0 +do_eval: + desc: null + value: true +do_predict: + desc: null + value: false +do_sample: + desc: null + value: false +do_train: + desc: null + value: true +early_stopping: + desc: null + value: false +encoder: + desc: null + value: + _name_or_path: facebook/wav2vec2-xls-r-300m + activation_dropout: 0.0 + adapter_kernel_size: 3 + adapter_stride: 2 + add_adapter: true 
+ add_cross_attention: false + apply_spec_augment: true + architectures: + - Wav2Vec2ForPreTraining + attention_dropout: 0.1 + bad_words_ids: null + bos_token_id: 1 + chunk_size_feed_forward: 0 + classifier_proj_size: 256 + codevector_dim: 768 + contrastive_logits_temperature: 0.1 + conv_bias: true + conv_dim: + - 512 + - 512 + - 512 + - 512 + - 512 + - 512 + - 512 + conv_kernel: + - 10 + - 3 + - 3 + - 3 + - 3 + - 2 + - 2 + conv_stride: + - 5 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + cross_attention_hidden_size: null + ctc_loss_reduction: sum + ctc_zero_infinity: false + decoder_start_token_id: null + diversity_loss_weight: 0.1 + diversity_penalty: 0.0 + do_sample: false + do_stable_layer_norm: true + early_stopping: false + encoder_no_repeat_ngram_size: 0 + eos_token_id: 2 + exponential_decay_length_penalty: null + feat_extract_activation: gelu + feat_extract_dropout: 0.0 + feat_extract_norm: layer + feat_proj_dropout: 0.0 + feat_quantizer_dropout: 0.0 + final_dropout: 0.0 + finetuning_task: null + forced_bos_token_id: null + forced_eos_token_id: null + gradient_checkpointing: false + hidden_act: gelu + hidden_dropout: 0.2 + hidden_size: 1024 + id2label: + '0': LABEL_0 + '1': LABEL_1 + initializer_range: 0.02 + intermediate_size: 4096 + is_decoder: false + is_encoder_decoder: false + label2id: + LABEL_0: 0 + LABEL_1: 1 + layer_norm_eps: 1.0e-05 + layerdrop: 0.0 + length_penalty: 1.0 + mask_feature_length: 10 + mask_feature_min_masks: 0 + mask_feature_prob: 0.0 + mask_time_length: 10 + mask_time_min_masks: 2 + mask_time_prob: 0.1 + max_length: 20 + min_length: 0 + model_type: wav2vec2 + no_repeat_ngram_size: 0 + num_adapter_layers: 3 + num_attention_heads: 16 + num_beam_groups: 1 + num_beams: 1 + num_codevector_groups: 2 + num_codevectors_per_group: 320 + num_conv_pos_embedding_groups: 16 + num_conv_pos_embeddings: 128 + num_feat_extract_layers: 7 + num_hidden_layers: 24 + num_negatives: 100 + num_return_sequences: 1 + output_attentions: false + output_hidden_size: 1024 + output_hidden_states: false + output_scores: false + pad_token_id: 0 + prefix: null + problem_type: null + proj_codevector_dim: 768 + pruned_heads: {} + remove_invalid_values: false + repetition_penalty: 1.0 + return_dict: true + return_dict_in_generate: false + sep_token_id: null + task_specific_params: null + tdnn_dilation: + - 1 + - 2 + - 3 + - 1 + - 1 + tdnn_dim: + - 512 + - 512 + - 512 + - 512 + - 1500 + tdnn_kernel: + - 5 + - 3 + - 3 + - 1 + - 1 + temperature: 1.0 + tie_encoder_decoder: false + tie_word_embeddings: true + tokenizer_class: null + top_k: 50 + top_p: 1.0 + torch_dtype: float32 + torchscript: false + transformers_version: 4.19.0.dev0 + typical_p: 1.0 + use_bfloat16: false + use_weighted_layer_sum: false + vocab_size: 32 + xvector_output_dim: 512 +encoder_no_repeat_ngram_size: + desc: null + value: 0 +eos_token_id: + desc: null + value: 2 +eval_accumulation_steps: + desc: null + value: None +eval_batch_size: + desc: null + value: 8 +eval_delay: + desc: null + value: 0 +eval_split_name: + desc: null + value: test +eval_steps: + desc: null + value: 500 +evaluation_strategy: + desc: null + value: steps +exponential_decay_length_penalty: + desc: null + value: null +finetuning_task: + desc: null + value: null +forced_bos_token_id: + desc: null + value: null +forced_eos_token_id: + desc: null + value: null +fp16: + desc: null + value: true +fp16_backend: + desc: null + value: auto +fp16_full_eval: + desc: null + value: false +fp16_opt_level: + desc: null + value: O1 +generation_max_length: + desc: null + value: 40 
+generation_num_beams: + desc: null + value: 1 +gradient_accumulation_steps: + desc: null + value: 16 +gradient_checkpointing: + desc: null + value: true +greater_is_better: + desc: null + value: true +group_by_length: + desc: null + value: true +half_precision_backend: + desc: null + value: amp +hidden_dropout: + desc: null + value: 0.2 +hub_model_id: + desc: null + value: None +hub_private_repo: + desc: null + value: false +hub_strategy: + desc: null + value: every_save +hub_token: + desc: null + value: +id2label: + desc: null + value: + '0': LABEL_0 + '1': LABEL_1 +ignore_data_skip: + desc: null + value: false +include_inputs_for_metrics: + desc: null + value: false +is_decoder: + desc: null + value: false +is_encoder_decoder: + desc: null + value: true +label2id: + desc: null + value: + LABEL_0: 0 + LABEL_1: 1 +label_names: + desc: null + value: None +label_smoothing_factor: + desc: null + value: 0.0 +language: + desc: null + value: fr.en +learning_rate: + desc: null + value: 0.0003287457929573604 +length_column_name: + desc: null + value: length +length_penalty: + desc: null + value: 1.0 +load_best_model_at_end: + desc: null + value: true +local_rank: + desc: null + value: -1 +log_level: + desc: null + value: -1 +log_level_replica: + desc: null + value: -1 +log_on_each_node: + desc: null + value: true +logging_dir: + desc: null + value: ./runs/May05_17-31-19_sanchit--v100 +logging_first_step: + desc: null + value: false +logging_nan_inf_filter: + desc: null + value: true +logging_steps: + desc: null + value: 1 +logging_strategy: + desc: null + value: steps +lr_scheduler_type: + desc: null + value: linear +max_duration_in_seconds: + desc: null + value: 20 +max_grad_norm: + desc: null + value: 1.0 +max_length: + desc: null + value: 40 +max_steps: + desc: null + value: -1 +metric_for_best_model: + desc: null + value: bleu +min_length: + desc: null + value: 0 +model_name_or_path: + desc: null + value: ./ +model_type: + desc: null + value: speech-encoder-decoder +mp_parameters: + desc: null + value: '' +no_cuda: + desc: null + value: false +no_repeat_ngram_size: + desc: null + value: 0 +num_beam_groups: + desc: null + value: 1 +num_beams: + desc: null + value: 1 +num_return_sequences: + desc: null + value: 1 +num_train_epochs: + desc: null + value: 3 +optim: + desc: null + value: adamw_hf +output_attentions: + desc: null + value: false +output_dir: + desc: null + value: ./ +output_hidden_states: + desc: null + value: false +output_scores: + desc: null + value: false +overwrite_output_dir: + desc: null + value: true +pad_token_id: + desc: null + value: 1 +past_index: + desc: null + value: -1 +per_device_eval_batch_size: + desc: null + value: 8 +per_device_train_batch_size: + desc: null + value: 8 +per_gpu_eval_batch_size: + desc: null + value: None +per_gpu_train_batch_size: + desc: null + value: None +predict_with_generate: + desc: null + value: true +prediction_loss_only: + desc: null + value: false +prefix: + desc: null + value: null +problem_type: + desc: null + value: null +processor_class: + desc: null + value: Wav2Vec2Processor +pruned_heads: + desc: null + value: {} +push_to_hub: + desc: null + value: true +push_to_hub_model_id: + desc: null + value: None +push_to_hub_organization: + desc: null + value: None +push_to_hub_token: + desc: null + value: +remove_invalid_values: + desc: null + value: false +remove_unused_columns: + desc: null + value: true +repetition_penalty: + desc: null + value: 1.0 +report_to: + desc: null + value: '[''tensorboard'', ''wandb'', ''codecarbon'']' 
+resume_from_checkpoint: + desc: null + value: None +return_dict: + desc: null + value: true +return_dict_in_generate: + desc: null + value: false +run_name: + desc: null + value: ./ +save_on_each_node: + desc: null + value: false +save_steps: + desc: null + value: 500 +save_strategy: + desc: null + value: steps +save_total_limit: + desc: null + value: None +seed: + desc: null + value: 42 +sep_token_id: + desc: null + value: null +sharded_ddp: + desc: null + value: '[]' +skip_memory_metrics: + desc: null + value: true +sortish_sampler: + desc: null + value: false +task: + desc: null + value: covost2 +task_specific_params: + desc: null + value: null +temperature: + desc: null + value: 1.0 +tf32: + desc: null + value: None +tie_encoder_decoder: + desc: null + value: false +tie_word_embeddings: + desc: null + value: false +tokenizer_class: + desc: null + value: null +top_k: + desc: null + value: 50 +top_p: + desc: null + value: 1.0 +torch_dtype: + desc: null + value: torch.float32 +torchscript: + desc: null + value: false +tpu_metrics_debug: + desc: null + value: false +tpu_num_cores: + desc: null + value: None +train_batch_size: + desc: null + value: 8 +transformers_version: + desc: null + value: null +typical_p: + desc: null + value: 1.0 +use_bfloat16: + desc: null + value: false +use_cache: + desc: null + value: false +use_legacy_prediction_loop: + desc: null + value: false +warmup_ratio: + desc: null + value: 0.0 +warmup_steps: + desc: null + value: 500 +weight_decay: + desc: null + value: 0.0 +xpu_backend: + desc: null + value: None diff --git a/wandb/run-20220505_173213-ykkm1chu/files/output.log b/wandb/run-20220505_173213-ykkm1chu/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..94947a6ea62c05c1c7049222a9296ecd15113c03 --- /dev/null +++ b/wandb/run-20220505_173213-ykkm1chu/files/output.log @@ -0,0 +1,29 @@ +wandb: WARNING Config item 'output_dir' was locked by 'sweep' (ignored update). +wandb: WARNING Config item 'evaluation_strategy' was locked by 'sweep' (ignored update). +wandb: WARNING Config item 'per_device_train_batch_size' was locked by 'sweep' (ignored update). +wandb: WARNING Config item 'per_device_eval_batch_size' was locked by 'sweep' (ignored update). +wandb: WARNING Config item 'gradient_accumulation_steps' was locked by 'sweep' (ignored update). +wandb: WARNING Config item 'learning_rate' was locked by 'sweep' (ignored update). +wandb: WARNING Config item 'num_train_epochs' was locked by 'sweep' (ignored update). +wandb: WARNING Config item 'warmup_steps' was locked by 'sweep' (ignored update). +wandb: WARNING Config item 'logging_steps' was locked by 'sweep' (ignored update). +wandb: WARNING Config item 'save_steps' was locked by 'sweep' (ignored update). +wandb: WARNING Config item 'eval_steps' was locked by 'sweep' (ignored update). +wandb: WARNING Config item 'metric_for_best_model' was locked by 'sweep' (ignored update). +wandb: WARNING Config item 'greater_is_better' was locked by 'sweep' (ignored update). +wandb: WARNING Config item 'generation_max_length' was locked by 'sweep' (ignored update). +wandb: WARNING Config item 'generation_num_beams' was locked by 'sweep' (ignored update). 
+ 0%| | 0/4860 [00:00', 'hub_private_repo': False, 'gradient_checkpointing': True, 'include_inputs_for_metrics': False, 'fp16_backend': 'auto', 'push_to_hub_model_id': 'None', 'push_to_hub_organization': 'None', 'push_to_hub_token': '', '_n_gpu': 1, 'mp_parameters': '', 'sortish_sampler': False, 'predict_with_generate': True, 'train_batch_size': 8, 'eval_batch_size': 8} +2022-05-05 17:32:17,063 INFO MainThread:73744 [wandb_watch.py:watch():43] Watching diff --git a/wandb/run-20220505_173213-ykkm1chu/run-ykkm1chu.wandb b/wandb/run-20220505_173213-ykkm1chu/run-ykkm1chu.wandb new file mode 100644 index 0000000000000000000000000000000000000000..a3aa30c5f5eb37935fdf5fb61a28a257ee72840d Binary files /dev/null and b/wandb/run-20220505_173213-ykkm1chu/run-ykkm1chu.wandb differ diff --git a/wandb/run-20220505_173748-b097rk18/files/config.yaml b/wandb/run-20220505_173748-b097rk18/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e2c9cdf904e7efc1607f71610bd3aa0ba203c81d --- /dev/null +++ b/wandb/run-20220505_173748-b097rk18/files/config.yaml @@ -0,0 +1,9388 @@ +wandb_version: 1 + +_n_gpu: + desc: null + value: 1 +_name_or_path: + desc: null + value: ./ +_wandb: + desc: null + value: + cli_version: 0.12.10 + framework: huggingface + huggingface_version: 4.19.0.dev0 + is_jupyter_run: false + is_kaggle_kernel: false + m: + - 1: train/global_step + 6: + - 3 + - 1: train/loss + 5: 1 + 6: + - 1 + - 1: train/learning_rate + 5: 1 + 6: + - 1 + - 1: train/epoch + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.11\.final_layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.11\.final_layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.11\.final_layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.11\.final_layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.11\.final_layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.11\.final_layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.11\.fc2\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.11\.fc2\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.11\.fc2\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.11\.fc2\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.11\.fc2\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.11\.fc2\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.11\.fc1\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.11\.fc1\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.11\.fc1\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.11\.fc1\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.11\.fc1\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.11\.fc1\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.11\.encoder_attn_layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.11\.encoder_attn_layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.11\.encoder_attn_layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: 
gradients/decoder\.model\.decoder\.layers\.11\.encoder_attn_layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.11\.encoder_attn_layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.11\.encoder_attn_layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.11\.encoder_attn\.out_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.11\.encoder_attn\.out_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.11\.encoder_attn\.out_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.11\.encoder_attn\.out_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.11\.encoder_attn\.out_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.11\.encoder_attn\.out_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.11\.encoder_attn\.v_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.11\.encoder_attn\.v_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.11\.encoder_attn\.v_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.11\.encoder_attn\.v_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.11\.encoder_attn\.v_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.11\.encoder_attn\.v_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.11\.encoder_attn\.k_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.11\.encoder_attn\.k_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.11\.encoder_attn\.k_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.11\.encoder_attn\.k_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.11\.encoder_attn\.k_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.11\.encoder_attn\.k_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.11\.encoder_attn\.q_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.11\.encoder_attn\.q_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.11\.encoder_attn\.q_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.11\.encoder_attn\.q_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.11\.encoder_attn\.q_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.11\.encoder_attn\.q_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.11\.self_attn_layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.11\.self_attn_layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.11\.self_attn_layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.11\.self_attn_layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.11\.self_attn_layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.11\.self_attn_layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: 
gradients/decoder\.model\.decoder\.layers\.11\.self_attn\.out_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.11\.self_attn\.out_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.11\.self_attn\.out_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.11\.self_attn\.out_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.11\.self_attn\.out_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.11\.self_attn\.out_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.11\.self_attn\.v_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.11\.self_attn\.v_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.11\.self_attn\.v_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.11\.self_attn\.v_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.11\.self_attn\.v_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.11\.self_attn\.v_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.11\.self_attn\.k_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.11\.self_attn\.k_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.11\.self_attn\.k_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.11\.self_attn\.k_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.11\.self_attn\.k_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.11\.self_attn\.k_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.11\.self_attn\.q_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.11\.self_attn\.q_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.11\.self_attn\.q_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.11\.self_attn\.q_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.11\.self_attn\.q_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.11\.self_attn\.q_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.10\.final_layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.10\.final_layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.10\.final_layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.10\.final_layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.10\.final_layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.10\.final_layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.10\.fc2\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.10\.fc2\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.10\.fc2\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.10\.fc2\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.10\.fc2\.bias.values + 5: 1 + 6: + - 1 + - 1: 
gradients/decoder\.model\.decoder\.layers\.10\.fc2\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.10\.fc1\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.10\.fc1\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.10\.fc1\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.10\.fc1\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.10\.fc1\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.10\.fc1\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.10\.encoder_attn_layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.10\.encoder_attn_layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.10\.encoder_attn_layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.10\.encoder_attn_layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.10\.encoder_attn_layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.10\.encoder_attn_layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.10\.encoder_attn\.out_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.10\.encoder_attn\.out_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.10\.encoder_attn\.out_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.10\.encoder_attn\.out_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.10\.encoder_attn\.out_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.10\.encoder_attn\.out_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.10\.encoder_attn\.v_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.10\.encoder_attn\.v_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.10\.encoder_attn\.v_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.10\.encoder_attn\.v_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.10\.encoder_attn\.v_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.10\.encoder_attn\.v_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.10\.encoder_attn\.k_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.10\.encoder_attn\.k_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.10\.encoder_attn\.k_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.10\.encoder_attn\.k_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.10\.encoder_attn\.k_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.10\.encoder_attn\.k_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.10\.encoder_attn\.q_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.10\.encoder_attn\.q_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.10\.encoder_attn\.q_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: 
gradients/decoder\.model\.decoder\.layers\.10\.encoder_attn\.q_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.10\.encoder_attn\.q_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.10\.encoder_attn\.q_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.10\.self_attn_layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.10\.self_attn_layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.10\.self_attn_layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.10\.self_attn_layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.10\.self_attn_layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.10\.self_attn_layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.10\.self_attn\.out_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.10\.self_attn\.out_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.10\.self_attn\.out_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.10\.self_attn\.out_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.10\.self_attn\.out_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.10\.self_attn\.out_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.10\.self_attn\.v_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.10\.self_attn\.v_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.10\.self_attn\.v_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.10\.self_attn\.v_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.10\.self_attn\.v_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.10\.self_attn\.v_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.10\.self_attn\.k_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.10\.self_attn\.k_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.10\.self_attn\.k_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.10\.self_attn\.k_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.10\.self_attn\.k_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.10\.self_attn\.k_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.10\.self_attn\.q_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.10\.self_attn\.q_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.10\.self_attn\.q_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.10\.self_attn\.q_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.10\.self_attn\.q_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.10\.self_attn\.q_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.9\.final_layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: 
[The remainder of this wandb metric-definition diff repeats one pattern for every decoder parameter, working down from layer 9 to layer 0. For each of self_attn.{q,k,v,out}_proj, encoder_attn.{q,k,v,out}_proj, self_attn_layer_norm, encoder_attn_layer_norm, final_layer_norm, fc1 and fc2 (weight and bias), it adds the gradient-histogram keys `._type`, `.values` and `.bins` (e.g. `gradients/decoder\.model\.decoder\.layers\.9\.fc1\.weight.values`), each entry defined with `5: 1` and a `6:` list containing `1`.]
gradients/decoder\.model\.decoder\.layers\.0\.encoder_attn\.out_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.0\.encoder_attn\.out_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.0\.encoder_attn\.v_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.0\.encoder_attn\.v_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.0\.encoder_attn\.v_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.0\.encoder_attn\.v_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.0\.encoder_attn\.v_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.0\.encoder_attn\.v_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.0\.encoder_attn\.k_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.0\.encoder_attn\.k_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.0\.encoder_attn\.k_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.0\.encoder_attn\.k_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.0\.encoder_attn\.k_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.0\.encoder_attn\.k_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.0\.encoder_attn\.q_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.0\.encoder_attn\.q_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.0\.encoder_attn\.q_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.0\.encoder_attn\.q_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.0\.encoder_attn\.q_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.0\.encoder_attn\.q_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.0\.self_attn_layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.0\.self_attn_layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.0\.self_attn_layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.0\.self_attn_layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.0\.self_attn_layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.0\.self_attn_layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.0\.self_attn\.out_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.0\.self_attn\.out_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.0\.self_attn\.out_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.0\.self_attn\.out_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.0\.self_attn\.out_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.0\.self_attn\.out_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.0\.self_attn\.v_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.0\.self_attn\.v_proj\.weight.values + 5: 1 + 6: + - 
1 + - 1: gradients/decoder\.model\.decoder\.layers\.0\.self_attn\.v_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.0\.self_attn\.v_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.0\.self_attn\.v_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.0\.self_attn\.v_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.0\.self_attn\.k_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.0\.self_attn\.k_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.0\.self_attn\.k_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.0\.self_attn\.k_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.0\.self_attn\.k_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.0\.self_attn\.k_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.0\.self_attn\.q_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.0\.self_attn\.q_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.0\.self_attn\.q_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.0\.self_attn\.q_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.0\.self_attn\.q_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layers\.0\.self_attn\.q_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layernorm_embedding\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layernorm_embedding\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layernorm_embedding\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layernorm_embedding\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layernorm_embedding\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.layernorm_embedding\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.embed_positions\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.embed_positions\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.embed_positions\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.embed_tokens\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.embed_tokens\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/decoder\.model\.decoder\.embed_tokens\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.adapter\.layers\.2\.conv\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.adapter\.layers\.2\.conv\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.adapter\.layers\.2\.conv\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.adapter\.layers\.2\.conv\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.adapter\.layers\.2\.conv\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.adapter\.layers\.2\.conv\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.adapter\.layers\.1\.conv\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.adapter\.layers\.1\.conv\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.adapter\.layers\.1\.conv\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.adapter\.layers\.1\.conv\.bias._type + 5: 1 + 6: + - 1 + - 1: 
gradients/encoder\.adapter\.layers\.1\.conv\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.adapter\.layers\.1\.conv\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.adapter\.layers\.0\.conv\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.adapter\.layers\.0\.conv\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.adapter\.layers\.0\.conv\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.adapter\.layers\.0\.conv\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.adapter\.layers\.0\.conv\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.adapter\.layers\.0\.conv\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.23\.feed_forward\.output_dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.23\.feed_forward\.output_dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.23\.feed_forward\.output_dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.23\.feed_forward\.output_dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.23\.feed_forward\.output_dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.23\.feed_forward\.output_dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.23\.feed_forward\.intermediate_dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.23\.feed_forward\.intermediate_dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.23\.feed_forward\.intermediate_dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.23\.feed_forward\.intermediate_dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.23\.feed_forward\.intermediate_dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.23\.feed_forward\.intermediate_dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.23\.final_layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.23\.final_layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.23\.final_layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.23\.final_layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.23\.final_layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.23\.final_layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.23\.attention\.out_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.23\.attention\.out_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.23\.attention\.out_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.23\.attention\.out_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.23\.attention\.out_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.23\.attention\.out_proj\.bias.bins + 5: 1 + 
6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.23\.attention\.v_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.23\.attention\.v_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.23\.attention\.v_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.23\.attention\.v_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.23\.attention\.v_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.23\.attention\.v_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.23\.attention\.k_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.23\.attention\.k_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.23\.attention\.k_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.23\.attention\.k_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.23\.attention\.k_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.23\.attention\.k_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.23\.attention\.q_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.23\.attention\.q_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.23\.attention\.q_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.23\.attention\.q_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.23\.attention\.q_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.23\.attention\.q_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.23\.layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.23\.layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.23\.layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.23\.layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.23\.layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.23\.layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.22\.feed_forward\.output_dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.22\.feed_forward\.output_dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.22\.feed_forward\.output_dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.22\.feed_forward\.output_dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.22\.feed_forward\.output_dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.22\.feed_forward\.output_dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.22\.feed_forward\.intermediate_dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.22\.feed_forward\.intermediate_dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.22\.feed_forward\.intermediate_dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.22\.feed_forward\.intermediate_dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.22\.feed_forward\.intermediate_dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.22\.feed_forward\.intermediate_dense\.bias.bins + 5: 
1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.22\.final_layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.22\.final_layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.22\.final_layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.22\.final_layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.22\.final_layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.22\.final_layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.22\.attention\.out_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.22\.attention\.out_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.22\.attention\.out_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.22\.attention\.out_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.22\.attention\.out_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.22\.attention\.out_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.22\.attention\.v_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.22\.attention\.v_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.22\.attention\.v_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.22\.attention\.v_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.22\.attention\.v_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.22\.attention\.v_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.22\.attention\.k_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.22\.attention\.k_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.22\.attention\.k_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.22\.attention\.k_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.22\.attention\.k_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.22\.attention\.k_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.22\.attention\.q_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.22\.attention\.q_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.22\.attention\.q_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.22\.attention\.q_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.22\.attention\.q_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.22\.attention\.q_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.22\.layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.22\.layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.22\.layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.22\.layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.22\.layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.22\.layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.21\.feed_forward\.output_dense\.weight._type + 5: 1 + 6: + - 1 + - 1: 
gradients/encoder\.encoder\.layers\.21\.feed_forward\.output_dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.21\.feed_forward\.output_dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.21\.feed_forward\.output_dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.21\.feed_forward\.output_dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.21\.feed_forward\.output_dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.21\.feed_forward\.intermediate_dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.21\.feed_forward\.intermediate_dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.21\.feed_forward\.intermediate_dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.21\.feed_forward\.intermediate_dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.21\.feed_forward\.intermediate_dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.21\.feed_forward\.intermediate_dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.21\.final_layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.21\.final_layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.21\.final_layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.21\.final_layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.21\.final_layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.21\.final_layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.21\.attention\.out_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.21\.attention\.out_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.21\.attention\.out_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.21\.attention\.out_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.21\.attention\.out_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.21\.attention\.out_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.21\.attention\.v_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.21\.attention\.v_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.21\.attention\.v_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.21\.attention\.v_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.21\.attention\.v_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.21\.attention\.v_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.21\.attention\.k_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.21\.attention\.k_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.21\.attention\.k_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.21\.attention\.k_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.21\.attention\.k_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.21\.attention\.k_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: 
gradients/encoder\.encoder\.layers\.21\.attention\.q_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.21\.attention\.q_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.21\.attention\.q_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.21\.attention\.q_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.21\.attention\.q_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.21\.attention\.q_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.21\.layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.21\.layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.21\.layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.21\.layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.21\.layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.21\.layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.20\.feed_forward\.output_dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.20\.feed_forward\.output_dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.20\.feed_forward\.output_dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.20\.feed_forward\.output_dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.20\.feed_forward\.output_dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.20\.feed_forward\.output_dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.20\.feed_forward\.intermediate_dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.20\.feed_forward\.intermediate_dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.20\.feed_forward\.intermediate_dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.20\.feed_forward\.intermediate_dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.20\.feed_forward\.intermediate_dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.20\.feed_forward\.intermediate_dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.20\.final_layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.20\.final_layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.20\.final_layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.20\.final_layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.20\.final_layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.20\.final_layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.20\.attention\.out_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.20\.attention\.out_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.20\.attention\.out_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.20\.attention\.out_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.20\.attention\.out_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.20\.attention\.out_proj\.bias.bins + 5: 1 + 6: + - 
1 + - 1: gradients/encoder\.encoder\.layers\.20\.attention\.v_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.20\.attention\.v_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.20\.attention\.v_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.20\.attention\.v_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.20\.attention\.v_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.20\.attention\.v_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.20\.attention\.k_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.20\.attention\.k_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.20\.attention\.k_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.20\.attention\.k_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.20\.attention\.k_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.20\.attention\.k_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.20\.attention\.q_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.20\.attention\.q_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.20\.attention\.q_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.20\.attention\.q_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.20\.attention\.q_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.20\.attention\.q_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.20\.layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.20\.layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.20\.layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.20\.layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.20\.layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.20\.layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.19\.feed_forward\.output_dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.19\.feed_forward\.output_dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.19\.feed_forward\.output_dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.19\.feed_forward\.output_dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.19\.feed_forward\.output_dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.19\.feed_forward\.output_dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.19\.feed_forward\.intermediate_dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.19\.feed_forward\.intermediate_dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.19\.feed_forward\.intermediate_dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.19\.feed_forward\.intermediate_dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.19\.feed_forward\.intermediate_dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.19\.feed_forward\.intermediate_dense\.bias.bins + 5: 1 + 6: 
+ - 1 + - 1: gradients/encoder\.encoder\.layers\.19\.final_layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.19\.final_layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.19\.final_layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.19\.final_layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.19\.final_layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.19\.final_layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.19\.attention\.out_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.19\.attention\.out_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.19\.attention\.out_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.19\.attention\.out_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.19\.attention\.out_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.19\.attention\.out_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.19\.attention\.v_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.19\.attention\.v_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.19\.attention\.v_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.19\.attention\.v_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.19\.attention\.v_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.19\.attention\.v_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.19\.attention\.k_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.19\.attention\.k_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.19\.attention\.k_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.19\.attention\.k_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.19\.attention\.k_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.19\.attention\.k_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.19\.attention\.q_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.19\.attention\.q_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.19\.attention\.q_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.19\.attention\.q_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.19\.attention\.q_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.19\.attention\.q_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.19\.layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.19\.layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.19\.layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.19\.layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.19\.layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.19\.layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.18\.feed_forward\.output_dense\.weight._type + 5: 1 + 6: + - 1 + - 1: 
gradients/encoder\.encoder\.layers\.18\.feed_forward\.output_dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.18\.feed_forward\.output_dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.18\.feed_forward\.output_dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.18\.feed_forward\.output_dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.18\.feed_forward\.output_dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.18\.feed_forward\.intermediate_dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.18\.feed_forward\.intermediate_dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.18\.feed_forward\.intermediate_dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.18\.feed_forward\.intermediate_dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.18\.feed_forward\.intermediate_dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.18\.feed_forward\.intermediate_dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.18\.final_layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.18\.final_layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.18\.final_layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.18\.final_layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.18\.final_layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.18\.final_layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.18\.attention\.out_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.18\.attention\.out_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.18\.attention\.out_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.18\.attention\.out_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.18\.attention\.out_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.18\.attention\.out_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.18\.attention\.v_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.18\.attention\.v_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.18\.attention\.v_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.18\.attention\.v_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.18\.attention\.v_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.18\.attention\.v_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.18\.attention\.k_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.18\.attention\.k_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.18\.attention\.k_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.18\.attention\.k_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.18\.attention\.k_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.18\.attention\.k_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: 
gradients/encoder\.encoder\.layers\.18\.attention\.q_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.18\.attention\.q_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.18\.attention\.q_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.18\.attention\.q_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.18\.attention\.q_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.18\.attention\.q_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.18\.layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.18\.layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.18\.layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.18\.layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.18\.layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.18\.layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.17\.feed_forward\.output_dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.17\.feed_forward\.output_dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.17\.feed_forward\.output_dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.17\.feed_forward\.output_dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.17\.feed_forward\.output_dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.17\.feed_forward\.output_dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.17\.feed_forward\.intermediate_dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.17\.feed_forward\.intermediate_dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.17\.feed_forward\.intermediate_dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.17\.feed_forward\.intermediate_dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.17\.feed_forward\.intermediate_dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.17\.feed_forward\.intermediate_dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.17\.final_layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.17\.final_layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.17\.final_layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.17\.final_layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.17\.final_layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.17\.final_layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.17\.attention\.out_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.17\.attention\.out_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.17\.attention\.out_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.17\.attention\.out_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.17\.attention\.out_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.17\.attention\.out_proj\.bias.bins + 5: 1 + 6: + - 
1 + - 1: gradients/encoder\.encoder\.layers\.17\.attention\.v_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.17\.attention\.v_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.17\.attention\.v_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.17\.attention\.v_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.17\.attention\.v_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.17\.attention\.v_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.17\.attention\.k_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.17\.attention\.k_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.17\.attention\.k_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.17\.attention\.k_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.17\.attention\.k_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.17\.attention\.k_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.17\.attention\.q_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.17\.attention\.q_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.17\.attention\.q_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.17\.attention\.q_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.17\.attention\.q_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.17\.attention\.q_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.17\.layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.17\.layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.17\.layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.17\.layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.17\.layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.17\.layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.16\.feed_forward\.output_dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.16\.feed_forward\.output_dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.16\.feed_forward\.output_dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.16\.feed_forward\.output_dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.16\.feed_forward\.output_dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.16\.feed_forward\.output_dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.16\.feed_forward\.intermediate_dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.16\.feed_forward\.intermediate_dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.16\.feed_forward\.intermediate_dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.16\.feed_forward\.intermediate_dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.16\.feed_forward\.intermediate_dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.16\.feed_forward\.intermediate_dense\.bias.bins + 5: 1 + 6: 
+ - 1 + - 1: gradients/encoder\.encoder\.layers\.16\.final_layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.16\.final_layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.16\.final_layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.16\.final_layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.16\.final_layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.16\.final_layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.16\.attention\.out_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.16\.attention\.out_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.16\.attention\.out_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.16\.attention\.out_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.16\.attention\.out_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.16\.attention\.out_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.16\.attention\.v_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.16\.attention\.v_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.16\.attention\.v_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.16\.attention\.v_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.16\.attention\.v_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.16\.attention\.v_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.16\.attention\.k_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.16\.attention\.k_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.16\.attention\.k_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.16\.attention\.k_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.16\.attention\.k_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.16\.attention\.k_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.16\.attention\.q_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.16\.attention\.q_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.16\.attention\.q_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.16\.attention\.q_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.16\.attention\.q_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.16\.attention\.q_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.16\.layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.16\.layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.16\.layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.16\.layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.16\.layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.16\.layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.15\.feed_forward\.output_dense\.weight._type + 5: 1 + 6: + - 1 + - 1: 
gradients/encoder\.encoder\.layers\.15\.feed_forward\.output_dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.15\.feed_forward\.output_dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.15\.feed_forward\.output_dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.15\.feed_forward\.output_dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.15\.feed_forward\.output_dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.15\.feed_forward\.intermediate_dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.15\.feed_forward\.intermediate_dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.15\.feed_forward\.intermediate_dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.15\.feed_forward\.intermediate_dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.15\.feed_forward\.intermediate_dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.15\.feed_forward\.intermediate_dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.15\.final_layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.15\.final_layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.15\.final_layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.15\.final_layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.15\.final_layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.15\.final_layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.15\.attention\.out_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.15\.attention\.out_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.15\.attention\.out_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.15\.attention\.out_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.15\.attention\.out_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.15\.attention\.out_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.15\.attention\.v_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.15\.attention\.v_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.15\.attention\.v_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.15\.attention\.v_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.15\.attention\.v_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.15\.attention\.v_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.15\.attention\.k_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.15\.attention\.k_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.15\.attention\.k_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.15\.attention\.k_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.15\.attention\.k_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.15\.attention\.k_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: 
gradients/encoder\.encoder\.layers\.15\.attention\.q_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.15\.attention\.q_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.15\.attention\.q_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.15\.attention\.q_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.15\.attention\.q_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.15\.attention\.q_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.15\.layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.15\.layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.15\.layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.15\.layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.15\.layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.15\.layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.14\.feed_forward\.output_dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.14\.feed_forward\.output_dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.14\.feed_forward\.output_dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.14\.feed_forward\.output_dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.14\.feed_forward\.output_dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.14\.feed_forward\.output_dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.14\.feed_forward\.intermediate_dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.14\.feed_forward\.intermediate_dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.14\.feed_forward\.intermediate_dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.14\.feed_forward\.intermediate_dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.14\.feed_forward\.intermediate_dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.14\.feed_forward\.intermediate_dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.14\.final_layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.14\.final_layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.14\.final_layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.14\.final_layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.14\.final_layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.14\.final_layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.14\.attention\.out_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.14\.attention\.out_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.14\.attention\.out_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.14\.attention\.out_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.14\.attention\.out_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.14\.attention\.out_proj\.bias.bins + 5: 1 + 6: + - 
1 + - 1: gradients/encoder\.encoder\.layers\.14\.attention\.v_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.14\.attention\.v_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.14\.attention\.v_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.14\.attention\.v_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.14\.attention\.v_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.14\.attention\.v_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.14\.attention\.k_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.14\.attention\.k_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.14\.attention\.k_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.14\.attention\.k_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.14\.attention\.k_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.14\.attention\.k_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.14\.attention\.q_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.14\.attention\.q_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.14\.attention\.q_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.14\.attention\.q_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.14\.attention\.q_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.14\.attention\.q_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.14\.layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.14\.layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.14\.layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.14\.layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.14\.layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.14\.layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.13\.feed_forward\.output_dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.13\.feed_forward\.output_dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.13\.feed_forward\.output_dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.13\.feed_forward\.output_dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.13\.feed_forward\.output_dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.13\.feed_forward\.output_dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.13\.feed_forward\.intermediate_dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.13\.feed_forward\.intermediate_dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.13\.feed_forward\.intermediate_dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.13\.feed_forward\.intermediate_dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.13\.feed_forward\.intermediate_dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.13\.feed_forward\.intermediate_dense\.bias.bins + 5: 1 + 6: 
+ - 1 + - 1: gradients/encoder\.encoder\.layers\.13\.final_layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.13\.final_layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.13\.final_layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.13\.final_layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.13\.final_layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.13\.final_layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.13\.attention\.out_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.13\.attention\.out_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.13\.attention\.out_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.13\.attention\.out_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.13\.attention\.out_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.13\.attention\.out_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.13\.attention\.v_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.13\.attention\.v_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.13\.attention\.v_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.13\.attention\.v_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.13\.attention\.v_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.13\.attention\.v_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.13\.attention\.k_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.13\.attention\.k_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.13\.attention\.k_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.13\.attention\.k_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.13\.attention\.k_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.13\.attention\.k_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.13\.attention\.q_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.13\.attention\.q_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.13\.attention\.q_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.13\.attention\.q_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.13\.attention\.q_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.13\.attention\.q_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.13\.layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.13\.layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.13\.layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.13\.layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.13\.layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.13\.layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.12\.feed_forward\.output_dense\.weight._type + 5: 1 + 6: + - 1 + - 1: 
gradients/encoder\.encoder\.layers\.12\.feed_forward\.output_dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.12\.feed_forward\.output_dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.12\.feed_forward\.output_dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.12\.feed_forward\.output_dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.12\.feed_forward\.output_dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.12\.feed_forward\.intermediate_dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.12\.feed_forward\.intermediate_dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.12\.feed_forward\.intermediate_dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.12\.feed_forward\.intermediate_dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.12\.feed_forward\.intermediate_dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.12\.feed_forward\.intermediate_dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.12\.final_layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.12\.final_layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.12\.final_layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.12\.final_layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.12\.final_layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.12\.final_layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.12\.attention\.out_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.12\.attention\.out_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.12\.attention\.out_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.12\.attention\.out_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.12\.attention\.out_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.12\.attention\.out_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.12\.attention\.v_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.12\.attention\.v_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.12\.attention\.v_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.12\.attention\.v_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.12\.attention\.v_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.12\.attention\.v_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.12\.attention\.k_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.12\.attention\.k_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.12\.attention\.k_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.12\.attention\.k_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.12\.attention\.k_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.12\.attention\.k_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: 
gradients/encoder\.encoder\.layers\.12\.attention\.q_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.12\.attention\.q_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.12\.attention\.q_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.12\.attention\.q_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.12\.attention\.q_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.12\.attention\.q_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.12\.layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.12\.layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.12\.layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.12\.layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.12\.layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.12\.layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.11\.feed_forward\.output_dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.11\.feed_forward\.output_dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.11\.feed_forward\.output_dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.11\.feed_forward\.output_dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.11\.feed_forward\.output_dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.11\.feed_forward\.output_dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.11\.feed_forward\.intermediate_dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.11\.feed_forward\.intermediate_dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.11\.feed_forward\.intermediate_dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.11\.feed_forward\.intermediate_dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.11\.feed_forward\.intermediate_dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.11\.feed_forward\.intermediate_dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.11\.final_layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.11\.final_layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.11\.final_layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.11\.final_layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.11\.final_layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.11\.final_layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.11\.attention\.out_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.11\.attention\.out_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.11\.attention\.out_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.11\.attention\.out_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.11\.attention\.out_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.11\.attention\.out_proj\.bias.bins + 5: 1 + 6: + - 
1 + - 1: gradients/encoder\.encoder\.layers\.11\.attention\.v_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.11\.attention\.v_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.11\.attention\.v_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.11\.attention\.v_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.11\.attention\.v_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.11\.attention\.v_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.11\.attention\.k_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.11\.attention\.k_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.11\.attention\.k_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.11\.attention\.k_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.11\.attention\.k_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.11\.attention\.k_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.11\.attention\.q_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.11\.attention\.q_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.11\.attention\.q_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.11\.attention\.q_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.11\.attention\.q_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.11\.attention\.q_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.11\.layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.11\.layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.11\.layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.11\.layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.11\.layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.11\.layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.10\.feed_forward\.output_dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.10\.feed_forward\.output_dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.10\.feed_forward\.output_dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.10\.feed_forward\.output_dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.10\.feed_forward\.output_dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.10\.feed_forward\.output_dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.10\.feed_forward\.intermediate_dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.10\.feed_forward\.intermediate_dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.10\.feed_forward\.intermediate_dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.10\.feed_forward\.intermediate_dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.10\.feed_forward\.intermediate_dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.10\.feed_forward\.intermediate_dense\.bias.bins + 5: 1 + 6: 
+ - 1 + - 1: gradients/encoder\.encoder\.layers\.10\.final_layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.10\.final_layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.10\.final_layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.10\.final_layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.10\.final_layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.10\.final_layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.10\.attention\.out_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.10\.attention\.out_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.10\.attention\.out_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.10\.attention\.out_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.10\.attention\.out_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.10\.attention\.out_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.10\.attention\.v_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.10\.attention\.v_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.10\.attention\.v_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.10\.attention\.v_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.10\.attention\.v_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.10\.attention\.v_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.10\.attention\.k_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.10\.attention\.k_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.10\.attention\.k_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.10\.attention\.k_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.10\.attention\.k_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.10\.attention\.k_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.10\.attention\.q_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.10\.attention\.q_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.10\.attention\.q_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.10\.attention\.q_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.10\.attention\.q_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.10\.attention\.q_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.10\.layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.10\.layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.10\.layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.10\.layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.10\.layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.10\.layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.9\.feed_forward\.output_dense\.weight._type + 5: 1 + 6: + - 1 + - 1: 
gradients/encoder\.encoder\.layers\.9\.feed_forward\.output_dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.9\.feed_forward\.output_dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.9\.feed_forward\.output_dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.9\.feed_forward\.output_dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.9\.feed_forward\.output_dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.9\.feed_forward\.intermediate_dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.9\.feed_forward\.intermediate_dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.9\.feed_forward\.intermediate_dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.9\.feed_forward\.intermediate_dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.9\.feed_forward\.intermediate_dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.9\.feed_forward\.intermediate_dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.9\.final_layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.9\.final_layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.9\.final_layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.9\.final_layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.9\.final_layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.9\.final_layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.9\.attention\.out_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.9\.attention\.out_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.9\.attention\.out_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.9\.attention\.out_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.9\.attention\.out_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.9\.attention\.out_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.9\.attention\.v_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.9\.attention\.v_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.9\.attention\.v_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.9\.attention\.v_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.9\.attention\.v_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.9\.attention\.v_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.9\.attention\.k_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.9\.attention\.k_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.9\.attention\.k_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.9\.attention\.k_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.9\.attention\.k_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.9\.attention\.k_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.9\.attention\.q_proj\.weight._type + 5: 1 + 6: + - 1 
+ - 1: gradients/encoder\.encoder\.layers\.9\.attention\.q_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.9\.attention\.q_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.9\.attention\.q_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.9\.attention\.q_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.9\.attention\.q_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.9\.layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.9\.layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.9\.layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.9\.layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.9\.layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.9\.layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.8\.feed_forward\.output_dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.8\.feed_forward\.output_dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.8\.feed_forward\.output_dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.8\.feed_forward\.output_dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.8\.feed_forward\.output_dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.8\.feed_forward\.output_dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.8\.feed_forward\.intermediate_dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.8\.feed_forward\.intermediate_dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.8\.feed_forward\.intermediate_dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.8\.feed_forward\.intermediate_dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.8\.feed_forward\.intermediate_dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.8\.feed_forward\.intermediate_dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.8\.final_layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.8\.final_layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.8\.final_layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.8\.final_layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.8\.final_layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.8\.final_layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.8\.attention\.out_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.8\.attention\.out_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.8\.attention\.out_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.8\.attention\.out_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.8\.attention\.out_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.8\.attention\.out_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.8\.attention\.v_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: 
gradients/encoder\.encoder\.layers\.8\.attention\.v_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.8\.attention\.v_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.8\.attention\.v_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.8\.attention\.v_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.8\.attention\.v_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.8\.attention\.k_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.8\.attention\.k_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.8\.attention\.k_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.8\.attention\.k_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.8\.attention\.k_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.8\.attention\.k_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.8\.attention\.q_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.8\.attention\.q_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.8\.attention\.q_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.8\.attention\.q_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.8\.attention\.q_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.8\.attention\.q_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.8\.layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.8\.layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.8\.layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.8\.layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.8\.layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.8\.layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.7\.feed_forward\.output_dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.7\.feed_forward\.output_dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.7\.feed_forward\.output_dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.7\.feed_forward\.output_dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.7\.feed_forward\.output_dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.7\.feed_forward\.output_dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.7\.feed_forward\.intermediate_dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.7\.feed_forward\.intermediate_dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.7\.feed_forward\.intermediate_dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.7\.feed_forward\.intermediate_dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.7\.feed_forward\.intermediate_dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.7\.feed_forward\.intermediate_dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.7\.final_layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: 
gradients/encoder\.encoder\.layers\.7\.final_layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.7\.final_layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.7\.final_layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.7\.final_layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.7\.final_layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.7\.attention\.out_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.7\.attention\.out_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.7\.attention\.out_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.7\.attention\.out_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.7\.attention\.out_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.7\.attention\.out_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.7\.attention\.v_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.7\.attention\.v_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.7\.attention\.v_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.7\.attention\.v_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.7\.attention\.v_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.7\.attention\.v_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.7\.attention\.k_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.7\.attention\.k_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.7\.attention\.k_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.7\.attention\.k_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.7\.attention\.k_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.7\.attention\.k_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.7\.attention\.q_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.7\.attention\.q_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.7\.attention\.q_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.7\.attention\.q_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.7\.attention\.q_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.7\.attention\.q_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.7\.layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.7\.layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.7\.layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.7\.layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.7\.layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.7\.layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.6\.feed_forward\.output_dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.6\.feed_forward\.output_dense\.weight.values + 5: 1 + 6: + - 1 + - 1: 
gradients/encoder\.encoder\.layers\.6\.feed_forward\.output_dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.6\.feed_forward\.output_dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.6\.feed_forward\.output_dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.6\.feed_forward\.output_dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.6\.feed_forward\.intermediate_dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.6\.feed_forward\.intermediate_dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.6\.feed_forward\.intermediate_dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.6\.feed_forward\.intermediate_dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.6\.feed_forward\.intermediate_dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.6\.feed_forward\.intermediate_dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.6\.final_layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.6\.final_layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.6\.final_layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.6\.final_layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.6\.final_layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.6\.final_layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.6\.attention\.out_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.6\.attention\.out_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.6\.attention\.out_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.6\.attention\.out_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.6\.attention\.out_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.6\.attention\.out_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.6\.attention\.v_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.6\.attention\.v_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.6\.attention\.v_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.6\.attention\.v_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.6\.attention\.v_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.6\.attention\.v_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.6\.attention\.k_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.6\.attention\.k_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.6\.attention\.k_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.6\.attention\.k_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.6\.attention\.k_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.6\.attention\.k_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.6\.attention\.q_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.6\.attention\.q_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: 
gradients/encoder\.encoder\.layers\.6\.attention\.q_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.6\.attention\.q_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.6\.attention\.q_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.6\.attention\.q_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.6\.layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.6\.layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.6\.layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.6\.layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.6\.layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.6\.layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.5\.feed_forward\.output_dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.5\.feed_forward\.output_dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.5\.feed_forward\.output_dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.5\.feed_forward\.output_dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.5\.feed_forward\.output_dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.5\.feed_forward\.output_dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.5\.feed_forward\.intermediate_dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.5\.feed_forward\.intermediate_dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.5\.feed_forward\.intermediate_dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.5\.feed_forward\.intermediate_dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.5\.feed_forward\.intermediate_dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.5\.feed_forward\.intermediate_dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.5\.final_layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.5\.final_layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.5\.final_layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.5\.final_layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.5\.final_layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.5\.final_layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.5\.attention\.out_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.5\.attention\.out_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.5\.attention\.out_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.5\.attention\.out_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.5\.attention\.out_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.5\.attention\.out_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.5\.attention\.v_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.5\.attention\.v_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: 
gradients/encoder\.encoder\.layers\.5\.attention\.v_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.5\.attention\.v_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.5\.attention\.v_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.5\.attention\.v_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.5\.attention\.k_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.5\.attention\.k_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.5\.attention\.k_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.5\.attention\.k_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.5\.attention\.k_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.5\.attention\.k_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.5\.attention\.q_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.5\.attention\.q_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.5\.attention\.q_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.5\.attention\.q_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.5\.attention\.q_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.5\.attention\.q_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.5\.layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.5\.layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.5\.layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.5\.layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.5\.layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.5\.layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.4\.feed_forward\.output_dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.4\.feed_forward\.output_dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.4\.feed_forward\.output_dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.4\.feed_forward\.output_dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.4\.feed_forward\.output_dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.4\.feed_forward\.output_dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.4\.feed_forward\.intermediate_dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.4\.feed_forward\.intermediate_dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.4\.feed_forward\.intermediate_dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.4\.feed_forward\.intermediate_dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.4\.feed_forward\.intermediate_dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.4\.feed_forward\.intermediate_dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.4\.final_layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.4\.final_layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: 
gradients/encoder\.encoder\.layers\.4\.final_layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.4\.final_layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.4\.final_layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.4\.final_layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.4\.attention\.out_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.4\.attention\.out_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.4\.attention\.out_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.4\.attention\.out_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.4\.attention\.out_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.4\.attention\.out_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.4\.attention\.v_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.4\.attention\.v_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.4\.attention\.v_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.4\.attention\.v_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.4\.attention\.v_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.4\.attention\.v_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.4\.attention\.k_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.4\.attention\.k_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.4\.attention\.k_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.4\.attention\.k_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.4\.attention\.k_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.4\.attention\.k_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.4\.attention\.q_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.4\.attention\.q_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.4\.attention\.q_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.4\.attention\.q_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.4\.attention\.q_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.4\.attention\.q_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.4\.layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.4\.layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.4\.layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.4\.layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.4\.layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.4\.layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.3\.feed_forward\.output_dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.3\.feed_forward\.output_dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.3\.feed_forward\.output_dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: 
gradients/encoder\.encoder\.layers\.3\.feed_forward\.output_dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.3\.feed_forward\.output_dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.3\.feed_forward\.output_dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.3\.feed_forward\.intermediate_dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.3\.feed_forward\.intermediate_dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.3\.feed_forward\.intermediate_dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.3\.feed_forward\.intermediate_dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.3\.feed_forward\.intermediate_dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.3\.feed_forward\.intermediate_dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.3\.final_layer_norm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.3\.final_layer_norm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.3\.final_layer_norm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.3\.final_layer_norm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.3\.final_layer_norm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.3\.final_layer_norm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.3\.attention\.out_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.3\.attention\.out_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.3\.attention\.out_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.3\.attention\.out_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.3\.attention\.out_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.3\.attention\.out_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.3\.attention\.v_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.3\.attention\.v_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.3\.attention\.v_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.3\.attention\.v_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.3\.attention\.v_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.3\.attention\.v_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.3\.attention\.k_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.3\.attention\.k_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.3\.attention\.k_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.3\.attention\.k_proj\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.3\.attention\.k_proj\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.3\.attention\.k_proj\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.3\.attention\.q_proj\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.3\.attention\.q_proj\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/encoder\.encoder\.layers\.3\.attention\.q_proj\.weight.bins + 5: 1 + 6: + - 1 + - 1: 
[... remainder of the _wandb telemetry block: gradient-histogram metric definitions (._type / .values / .bins) for every parameter of encoder layers 0-3 (q/k/v/out projections, feed-forward dense layers, layer norms), the positional conv embedding, masked_spec_embed and the feature projection, plus eval/loss, eval/bleu, eval/runtime, eval/samples_per_second and eval/steps_per_second; python_version 3.9.5, start_time 1651772268, wandb 0.12.10, transformers 4.19.0.dev0 ...]
+adafactor:
+  desc: null
+  value: false
+adam_beta1:
+  desc: null
+  value: 0.9
+adam_beta2:
+  desc: null
+  value: 0.999
+adam_epsilon:
+  desc: null
+  value: 1.0e-08
+add_cross_attention:
+  desc: null
+  value: false
+architectures:
+  desc: null
+  value:
+  - SpeechEncoderDecoderModel
+bad_words_ids:
+  desc: null
+  value: null
+bf16:
+  desc: null
+  value: false
+bf16_full_eval:
+  desc: null
+  value: false
+bos_token_id:
desc: null + value: null +chunk_size_feed_forward: + desc: null + value: 0 +cross_attention_hidden_size: + desc: null + value: null +data_seed: + desc: null + value: None +dataloader_drop_last: + desc: null + value: false +dataloader_num_workers: + desc: null + value: 0 +dataloader_pin_memory: + desc: null + value: true +ddp_bucket_cap_mb: + desc: null + value: None +ddp_find_unused_parameters: + desc: null + value: None +debug: + desc: null + value: '[]' +decoder: + desc: null + value: + _name_or_path: facebook/bart-large + activation_dropout: 0.1 + activation_function: gelu + add_bias_logits: false + add_cross_attention: true + add_final_layer_norm: false + architectures: + - BartModel + attention_dropout: 0.1 + bad_words_ids: null + bos_token_id: 0 + chunk_size_feed_forward: 0 + classif_dropout: 0.1 + classifier_dropout: 0.0 + cross_attention_hidden_size: null + d_model: 1024 + decoder_attention_heads: 16 + decoder_ffn_dim: 4096 + decoder_layerdrop: 0.0 + decoder_layers: 12 + decoder_start_token_id: 2 + diversity_penalty: 0.0 + do_sample: false + dropout: 0.1 + early_stopping: true + encoder_attention_heads: 16 + encoder_ffn_dim: 4096 + encoder_layerdrop: 0.0 + encoder_layers: 12 + encoder_no_repeat_ngram_size: 0 + eos_token_id: 2 + exponential_decay_length_penalty: null + finetuning_task: null + forced_bos_token_id: 0 + forced_eos_token_id: 2 + gradient_checkpointing: false + id2label: + '0': LABEL_0 + '1': LABEL_1 + '2': LABEL_2 + init_std: 0.02 + is_decoder: true + is_encoder_decoder: false + label2id: + LABEL_0: 0 + LABEL_1: 1 + LABEL_2: 2 + length_penalty: 1.0 + max_length: 20 + max_position_embeddings: 1024 + min_length: 0 + model_type: bart + no_repeat_ngram_size: 3 + normalize_before: false + num_beam_groups: 1 + num_beams: 4 + num_hidden_layers: 12 + num_return_sequences: 1 + output_attentions: false + output_hidden_states: false + output_scores: false + pad_token_id: 1 + prefix: null + problem_type: null + pruned_heads: {} + remove_invalid_values: false + repetition_penalty: 1.0 + return_dict: true + return_dict_in_generate: false + scale_embedding: false + sep_token_id: null + task_specific_params: + summarization: + length_penalty: 1.0 + max_length: 128 + min_length: 12 + num_beams: 4 + summarization_cnn: + length_penalty: 2.0 + max_length: 142 + min_length: 56 + num_beams: 4 + summarization_xsum: + length_penalty: 1.0 + max_length: 62 + min_length: 11 + num_beams: 6 + temperature: 1.0 + tie_encoder_decoder: false + tie_word_embeddings: true + tokenizer_class: null + top_k: 50 + top_p: 1.0 + torch_dtype: null + torchscript: false + transformers_version: 4.19.0.dev0 + typical_p: 1.0 + use_bfloat16: false + use_cache: true + vocab_size: 50265 +decoder_start_token_id: + desc: null + value: 0 +deepspeed: + desc: null + value: None +disable_tqdm: + desc: null + value: false +diversity_penalty: + desc: null + value: 0.0 +do_eval: + desc: null + value: true +do_predict: + desc: null + value: false +do_sample: + desc: null + value: false +do_train: + desc: null + value: true +early_stopping: + desc: null + value: false +encoder: + desc: null + value: + _name_or_path: facebook/wav2vec2-xls-r-300m + activation_dropout: 0.0 + adapter_kernel_size: 3 + adapter_stride: 2 + add_adapter: true + add_cross_attention: false + apply_spec_augment: true + architectures: + - Wav2Vec2ForPreTraining + attention_dropout: 0.1 + bad_words_ids: null + bos_token_id: 1 + chunk_size_feed_forward: 0 + classifier_proj_size: 256 + codevector_dim: 768 + contrastive_logits_temperature: 0.1 + conv_bias: true + 
conv_dim: + - 512 + - 512 + - 512 + - 512 + - 512 + - 512 + - 512 + conv_kernel: + - 10 + - 3 + - 3 + - 3 + - 3 + - 2 + - 2 + conv_stride: + - 5 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + cross_attention_hidden_size: null + ctc_loss_reduction: sum + ctc_zero_infinity: false + decoder_start_token_id: null + diversity_loss_weight: 0.1 + diversity_penalty: 0.0 + do_sample: false + do_stable_layer_norm: true + early_stopping: false + encoder_no_repeat_ngram_size: 0 + eos_token_id: 2 + exponential_decay_length_penalty: null + feat_extract_activation: gelu + feat_extract_dropout: 0.0 + feat_extract_norm: layer + feat_proj_dropout: 0.0 + feat_quantizer_dropout: 0.0 + final_dropout: 0.0 + finetuning_task: null + forced_bos_token_id: null + forced_eos_token_id: null + gradient_checkpointing: false + hidden_act: gelu + hidden_dropout: 0.2 + hidden_size: 1024 + id2label: + '0': LABEL_0 + '1': LABEL_1 + initializer_range: 0.02 + intermediate_size: 4096 + is_decoder: false + is_encoder_decoder: false + label2id: + LABEL_0: 0 + LABEL_1: 1 + layer_norm_eps: 1.0e-05 + layerdrop: 0.0 + length_penalty: 1.0 + mask_feature_length: 10 + mask_feature_min_masks: 0 + mask_feature_prob: 0.0 + mask_time_length: 10 + mask_time_min_masks: 2 + mask_time_prob: 0.1 + max_length: 20 + min_length: 0 + model_type: wav2vec2 + no_repeat_ngram_size: 0 + num_adapter_layers: 3 + num_attention_heads: 16 + num_beam_groups: 1 + num_beams: 1 + num_codevector_groups: 2 + num_codevectors_per_group: 320 + num_conv_pos_embedding_groups: 16 + num_conv_pos_embeddings: 128 + num_feat_extract_layers: 7 + num_hidden_layers: 24 + num_negatives: 100 + num_return_sequences: 1 + output_attentions: false + output_hidden_size: 1024 + output_hidden_states: false + output_scores: false + pad_token_id: 0 + prefix: null + problem_type: null + proj_codevector_dim: 768 + pruned_heads: {} + remove_invalid_values: false + repetition_penalty: 1.0 + return_dict: true + return_dict_in_generate: false + sep_token_id: null + task_specific_params: null + tdnn_dilation: + - 1 + - 2 + - 3 + - 1 + - 1 + tdnn_dim: + - 512 + - 512 + - 512 + - 512 + - 1500 + tdnn_kernel: + - 5 + - 3 + - 3 + - 1 + - 1 + temperature: 1.0 + tie_encoder_decoder: false + tie_word_embeddings: true + tokenizer_class: null + top_k: 50 + top_p: 1.0 + torch_dtype: float32 + torchscript: false + transformers_version: 4.19.0.dev0 + typical_p: 1.0 + use_bfloat16: false + use_weighted_layer_sum: false + vocab_size: 32 + xvector_output_dim: 512 +encoder_no_repeat_ngram_size: + desc: null + value: 0 +eos_token_id: + desc: null + value: 2 +eval_accumulation_steps: + desc: null + value: None +eval_batch_size: + desc: null + value: 8 +eval_delay: + desc: null + value: 0 +eval_split_name: + desc: null + value: test +eval_steps: + desc: null + value: 500 +evaluation_strategy: + desc: null + value: steps +exponential_decay_length_penalty: + desc: null + value: null +finetuning_task: + desc: null + value: null +forced_bos_token_id: + desc: null + value: null +forced_eos_token_id: + desc: null + value: null +fp16: + desc: null + value: true +fp16_backend: + desc: null + value: auto +fp16_full_eval: + desc: null + value: false +fp16_opt_level: + desc: null + value: O1 +generation_max_length: + desc: null + value: 40 +generation_num_beams: + desc: null + value: 1 +gradient_accumulation_steps: + desc: null + value: 16 +gradient_checkpointing: + desc: null + value: true +greater_is_better: + desc: null + value: true +group_by_length: + desc: null + value: true +half_precision_backend: + desc: null + value: amp 
+hidden_dropout: + desc: null + value: 0.2 +hub_model_id: + desc: null + value: None +hub_private_repo: + desc: null + value: false +hub_strategy: + desc: null + value: every_save +hub_token: + desc: null + value: +id2label: + desc: null + value: + '0': LABEL_0 + '1': LABEL_1 +ignore_data_skip: + desc: null + value: false +include_inputs_for_metrics: + desc: null + value: false +is_decoder: + desc: null + value: false +is_encoder_decoder: + desc: null + value: true +label2id: + desc: null + value: + LABEL_0: 0 + LABEL_1: 1 +label_names: + desc: null + value: None +label_smoothing_factor: + desc: null + value: 0.0 +language: + desc: null + value: fr.en +learning_rate: + desc: null + value: 3.0e-05 +length_column_name: + desc: null + value: length +length_penalty: + desc: null + value: 1.0 +load_best_model_at_end: + desc: null + value: true +local_rank: + desc: null + value: -1 +log_level: + desc: null + value: -1 +log_level_replica: + desc: null + value: -1 +log_on_each_node: + desc: null + value: true +logging_dir: + desc: null + value: ./runs/May05_17-36-54_sanchit--v100 +logging_first_step: + desc: null + value: false +logging_nan_inf_filter: + desc: null + value: true +logging_steps: + desc: null + value: 1 +logging_strategy: + desc: null + value: steps +lr_scheduler_type: + desc: null + value: linear +max_duration_in_seconds: + desc: null + value: 20 +max_grad_norm: + desc: null + value: 1.0 +max_length: + desc: null + value: 40 +max_steps: + desc: null + value: -1 +metric_for_best_model: + desc: null + value: bleu +min_length: + desc: null + value: 0 +model_name_or_path: + desc: null + value: ./ +model_type: + desc: null + value: speech-encoder-decoder +mp_parameters: + desc: null + value: '' +no_cuda: + desc: null + value: false +no_repeat_ngram_size: + desc: null + value: 0 +num_beam_groups: + desc: null + value: 1 +num_beams: + desc: null + value: 1 +num_return_sequences: + desc: null + value: 1 +num_train_epochs: + desc: null + value: 3 +optim: + desc: null + value: adamw_hf +output_attentions: + desc: null + value: false +output_dir: + desc: null + value: ./ +output_hidden_states: + desc: null + value: false +output_scores: + desc: null + value: false +overwrite_output_dir: + desc: null + value: true +pad_token_id: + desc: null + value: 1 +past_index: + desc: null + value: -1 +per_device_eval_batch_size: + desc: null + value: 8 +per_device_train_batch_size: + desc: null + value: 8 +per_gpu_eval_batch_size: + desc: null + value: None +per_gpu_train_batch_size: + desc: null + value: None +predict_with_generate: + desc: null + value: true +prediction_loss_only: + desc: null + value: false +prefix: + desc: null + value: null +problem_type: + desc: null + value: null +processor_class: + desc: null + value: Wav2Vec2Processor +pruned_heads: + desc: null + value: {} +push_to_hub: + desc: null + value: true +push_to_hub_model_id: + desc: null + value: None +push_to_hub_organization: + desc: null + value: None +push_to_hub_token: + desc: null + value: +remove_invalid_values: + desc: null + value: false +remove_unused_columns: + desc: null + value: true +repetition_penalty: + desc: null + value: 1.0 +report_to: + desc: null + value: '[''tensorboard'', ''wandb'', ''codecarbon'']' +resume_from_checkpoint: + desc: null + value: None +return_dict: + desc: null + value: true +return_dict_in_generate: + desc: null + value: false +run_name: + desc: null + value: ./ +save_on_each_node: + desc: null + value: false +save_steps: + desc: null + value: 500 +save_strategy: + desc: null + value: steps 
+save_total_limit: + desc: null + value: None +seed: + desc: null + value: 42 +sep_token_id: + desc: null + value: null +sharded_ddp: + desc: null + value: '[]' +skip_memory_metrics: + desc: null + value: true +sortish_sampler: + desc: null + value: false +task: + desc: null + value: covost2 +task_specific_params: + desc: null + value: null +temperature: + desc: null + value: 1.0 +tf32: + desc: null + value: None +tie_encoder_decoder: + desc: null + value: false +tie_word_embeddings: + desc: null + value: false +tokenizer_class: + desc: null + value: null +top_k: + desc: null + value: 50 +top_p: + desc: null + value: 1.0 +torch_dtype: + desc: null + value: torch.float32 +torchscript: + desc: null + value: false +tpu_metrics_debug: + desc: null + value: false +tpu_num_cores: + desc: null + value: None +train_batch_size: + desc: null + value: 8 +transformers_version: + desc: null + value: null +typical_p: + desc: null + value: 1.0 +use_bfloat16: + desc: null + value: false +use_cache: + desc: null + value: false +use_legacy_prediction_loop: + desc: null + value: false +warmup_ratio: + desc: null + value: 0.0 +warmup_steps: + desc: null + value: 500 +weight_decay: + desc: null + value: 0.0 +xpu_backend: + desc: null + value: None diff --git a/wandb/run-20220505_173748-b097rk18/files/output.log b/wandb/run-20220505_173748-b097rk18/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..f3875722a5a2532a488b0e323fee2a8d98d5765d --- /dev/null +++ b/wandb/run-20220505_173748-b097rk18/files/output.log @@ -0,0 +1,4587 @@ +wandb: WARNING Config item 'output_dir' was locked by 'sweep' (ignored update). +wandb: WARNING Config item 'evaluation_strategy' was locked by 'sweep' (ignored update). +wandb: WARNING Config item 'per_device_train_batch_size' was locked by 'sweep' (ignored update). +wandb: WARNING Config item 'per_device_eval_batch_size' was locked by 'sweep' (ignored update). +wandb: WARNING Config item 'gradient_accumulation_steps' was locked by 'sweep' (ignored update). +wandb: WARNING Config item 'learning_rate' was locked by 'sweep' (ignored update). +wandb: WARNING Config item 'num_train_epochs' was locked by 'sweep' (ignored update). +wandb: WARNING Config item 'warmup_steps' was locked by 'sweep' (ignored update). +wandb: WARNING Config item 'logging_steps' was locked by 'sweep' (ignored update). +wandb: WARNING Config item 'save_steps' was locked by 'sweep' (ignored update). +wandb: WARNING Config item 'eval_steps' was locked by 'sweep' (ignored update). +wandb: WARNING Config item 'metric_for_best_model' was locked by 'sweep' (ignored update). +wandb: WARNING Config item 'greater_is_better' was locked by 'sweep' (ignored update). +wandb: WARNING Config item 'generation_max_length' was locked by 'sweep' (ignored update). +wandb: WARNING Config item 'generation_num_beams' was locked by 'sweep' (ignored update). 
+ 0%| | 0/4860 [00:00', 'hub_private_repo': False, 'gradient_checkpointing': True, 'include_inputs_for_metrics': False, 'fp16_backend': 'auto', 'push_to_hub_model_id': 'None', 'push_to_hub_organization': 'None', 'push_to_hub_token': '', '_n_gpu': 1, 'mp_parameters': '', 'sortish_sampler': False, 'predict_with_generate': True, 'train_batch_size': 8, 'eval_batch_size': 8} +2022-05-05 17:37:52,326 INFO MainThread:74387 [wandb_watch.py:watch():43] Watching diff --git a/wandb/run-20220505_173748-b097rk18/run-b097rk18.wandb b/wandb/run-20220505_173748-b097rk18/run-b097rk18.wandb new file mode 100644 index 0000000000000000000000000000000000000000..4d933e25d10c826cf3a1f711f11145388a0ab34a --- /dev/null +++ b/wandb/run-20220505_173748-b097rk18/run-b097rk18.wandb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:efb2ab10fc337236355baa757fd16be25a5b2001f559d1dc8cdcd5021a06def9 +size 107188252 diff --git a/wandb/sweep-gtn0dbwf/config-ykkm1chu.yaml b/wandb/sweep-gtn0dbwf/config-ykkm1chu.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0b99aa0d7186ae83e521c6553ac462a6b9e7075d --- /dev/null +++ b/wandb/sweep-gtn0dbwf/config-ykkm1chu.yaml @@ -0,0 +1,44 @@ +wandb_version: 1 + +eval_split_name: + value: test +eval_steps: + value: 500 +evaluation_strategy: + value: steps +generation_max_length: + value: 40 +generation_num_beams: + value: 1 +gradient_accumulation_steps: + value: 16 +greater_is_better: + value: true +hidden_dropout: + value: 0.2 +language: + value: fr.en +learning_rate: + value: 0.0003287457929573604 +logging_steps: + value: 1 +max_duration_in_seconds: + value: 20 +metric_for_best_model: + value: bleu +model_name_or_path: + value: ./ +num_train_epochs: + value: 3 +output_dir: + value: ./ +per_device_eval_batch_size: + value: 8 +per_device_train_batch_size: + value: 8 +save_steps: + value: 500 +task: + value: covost2 +warmup_steps: + value: 500 diff --git a/wandb/sweep-r3wu4wam/config-b097rk18.yaml b/wandb/sweep-r3wu4wam/config-b097rk18.yaml new file mode 100644 index 0000000000000000000000000000000000000000..03e959d547623a601e2dbc91514334fa3a0ebe4e --- /dev/null +++ b/wandb/sweep-r3wu4wam/config-b097rk18.yaml @@ -0,0 +1,44 @@ +wandb_version: 1 + +eval_split_name: + value: test +eval_steps: + value: 500 +evaluation_strategy: + value: steps +generation_max_length: + value: 40 +generation_num_beams: + value: 1 +gradient_accumulation_steps: + value: 16 +greater_is_better: + value: true +hidden_dropout: + value: 0.2 +language: + value: fr.en +learning_rate: + value: 3.0e-05 +logging_steps: + value: 1 +max_duration_in_seconds: + value: 20 +metric_for_best_model: + value: bleu +model_name_or_path: + value: ./ +num_train_epochs: + value: 3 +output_dir: + value: ./ +per_device_eval_batch_size: + value: 8 +per_device_train_batch_size: + value: 8 +save_steps: + value: 500 +task: + value: covost2 +warmup_steps: + value: 500 diff --git a/wandb/sweep-sndcs9xa/config-rwbnpkt2.yaml b/wandb/sweep-sndcs9xa/config-rwbnpkt2.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f8ce654a3a1a5a8b8050c77302155a8bc394d893 --- /dev/null +++ b/wandb/sweep-sndcs9xa/config-rwbnpkt2.yaml @@ -0,0 +1,44 @@ +wandb_version: 1 + +eval_split_name: + value: test +eval_steps: + value: 500 +evaluation_strategy: + value: steps +generation_max_length: + value: 40 +generation_num_beams: + value: 1 +gradient_accumulation_steps: + value: 16 +greater_is_better: + value: true +hidden_dropout: + value: 0.2 +language: + value: fr.en +learning_rate: + value: 
0.0007057712331944904 +logging_steps: + value: 1 +max_duration_in_seconds: + value: 20 +metric_for_best_model: + value: bleu +model_name_or_path: + value: ./ +num_train_epochs: + value: 3 +output_dir: + value: ./ +per_device_eval_batch_size: + value: 8 +per_device_train_batch_size: + value: 8 +save_steps: + value: 500 +task: + value: covost2 +warmup_steps: + value: 500 diff --git a/wandb/sweep-sndcs9xa/config-xbi4p92m.yaml b/wandb/sweep-sndcs9xa/config-xbi4p92m.yaml new file mode 100644 index 0000000000000000000000000000000000000000..016f4aae44adfc0a176fb9a0e0aab35ec81934bc --- /dev/null +++ b/wandb/sweep-sndcs9xa/config-xbi4p92m.yaml @@ -0,0 +1,44 @@ +wandb_version: 1 + +eval_split_name: + value: test +eval_steps: + value: 500 +evaluation_strategy: + value: steps +generation_max_length: + value: 40 +generation_num_beams: + value: 1 +gradient_accumulation_steps: + value: 16 +greater_is_better: + value: true +hidden_dropout: + value: 0.2 +language: + value: fr.en +learning_rate: + value: 0.0003851276453057612 +logging_steps: + value: 1 +max_duration_in_seconds: + value: 20 +metric_for_best_model: + value: bleu +model_name_or_path: + value: ./ +num_train_epochs: + value: 3 +output_dir: + value: ./ +per_device_eval_batch_size: + value: 8 +per_device_train_batch_size: + value: 8 +save_steps: + value: 500 +task: + value: covost2 +warmup_steps: + value: 500 diff --git a/wandb/sweep-sndcs9xa/config-ydgnpqx2.yaml b/wandb/sweep-sndcs9xa/config-ydgnpqx2.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c58836af8d79b0cbe0e9528cb55a33f90fd9f1b9 --- /dev/null +++ b/wandb/sweep-sndcs9xa/config-ydgnpqx2.yaml @@ -0,0 +1,44 @@ +wandb_version: 1 + +eval_split_name: + value: test +eval_steps: + value: 500 +evaluation_strategy: + value: steps +generation_max_length: + value: 40 +generation_num_beams: + value: 1 +gradient_accumulation_steps: + value: 16 +greater_is_better: + value: true +hidden_dropout: + value: 0.2 +language: + value: fr.en +learning_rate: + value: 0.0005587128574267087 +logging_steps: + value: 1 +max_duration_in_seconds: + value: 20 +metric_for_best_model: + value: bleu +model_name_or_path: + value: ./ +num_train_epochs: + value: 3 +output_dir: + value: ./ +per_device_eval_batch_size: + value: 8 +per_device_train_batch_size: + value: 8 +save_steps: + value: 500 +task: + value: covost2 +warmup_steps: + value: 500