diff --git "a/running_log.txt" "b/running_log.txt"
new file mode 100644--- /dev/null
+++ "b/running_log.txt"
@@ -0,0 +1,1835 @@
+[INFO|parser.py:344] 2024-07-29 16:41:53,515 >> Process rank: 0, device: cuda:0, n_gpu: 1, distributed training: True, compute dtype: torch.bfloat16
+
+07/29/2024 16:41:53 - INFO - llamafactory.hparams.parser - Process rank: 4, device: cuda:4, n_gpu: 1, distributed training: True, compute dtype: torch.bfloat16
+
+07/29/2024 16:41:53 - INFO - llamafactory.hparams.parser - Process rank: 2, device: cuda:2, n_gpu: 1, distributed training: True, compute dtype: torch.bfloat16
+
+07/29/2024 16:41:53 - INFO - llamafactory.hparams.parser - Process rank: 1, device: cuda:1, n_gpu: 1, distributed training: True, compute dtype: torch.bfloat16
+
+07/29/2024 16:41:53 - INFO - llamafactory.hparams.parser - Process rank: 5, device: cuda:5, n_gpu: 1, distributed training: True, compute dtype: torch.bfloat16
+
+07/29/2024 16:41:53 - INFO - llamafactory.hparams.parser - Process rank: 7, device: cuda:7, n_gpu: 1, distributed training: True, compute dtype: torch.bfloat16
+
+07/29/2024 16:41:53 - INFO - llamafactory.hparams.parser - Process rank: 6, device: cuda:6, n_gpu: 1, distributed training: True, compute dtype: torch.bfloat16
+
+[INFO|tokenization_utils_base.py:2289] 2024-07-29 16:41:55,980 >> loading file tokenizer.json from cache at /root/.cache/huggingface/hub/models--meta-llama--Meta-Llama-3.1-8B-Instruct/snapshots/b2a4d0f33b41fcd59a6d31662cc63b8d53367e1e/tokenizer.json
+
+07/29/2024 16:41:56 - INFO - llamafactory.data.template - Replace eos token: <|eot_id|>
+
+07/29/2024 16:41:56 - INFO - llamafactory.data.template - Add pad token: <|eot_id|>
+
+07/29/2024 16:41:56 - INFO - llamafactory.data.template - Replace eos token: <|eot_id|>
+
+07/29/2024 16:41:56 - INFO - llamafactory.data.template - Add pad token: <|eot_id|>
+
+[INFO|tokenization_utils_base.py:2289] 2024-07-29 16:41:55,980 >> loading file added_tokens.json from cache at None
+
+[INFO|tokenization_utils_base.py:2289] 2024-07-29 16:41:55,980 >> loading file special_tokens_map.json from cache at /root/.cache/huggingface/hub/models--meta-llama--Meta-Llama-3.1-8B-Instruct/snapshots/b2a4d0f33b41fcd59a6d31662cc63b8d53367e1e/special_tokens_map.json
+
+[INFO|tokenization_utils_base.py:2289] 2024-07-29 16:41:55,980 >> loading file tokenizer_config.json from cache at /root/.cache/huggingface/hub/models--meta-llama--Meta-Llama-3.1-8B-Instruct/snapshots/b2a4d0f33b41fcd59a6d31662cc63b8d53367e1e/tokenizer_config.json
+
+[INFO|tokenization_utils_base.py:2533] 2024-07-29 16:41:56,271 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
+
+[INFO|template.py:270] 2024-07-29 16:41:56,272 >> Replace eos token: <|eot_id|>
+
+[INFO|template.py:372] 2024-07-29 16:41:56,272 >> Add pad token: <|eot_id|>
+
+[INFO|loader.py:52] 2024-07-29 16:41:56,272 >> Loading dataset convert_finetune_truth_train.json...
+
+07/29/2024 16:41:56 - INFO - llamafactory.data.template - Replace eos token: <|eot_id|>
+
+07/29/2024 16:41:56 - INFO - llamafactory.data.template - Add pad token: <|eot_id|>
+
+07/29/2024 16:41:56 - INFO - llamafactory.data.template - Replace eos token: <|eot_id|>
+
+07/29/2024 16:41:56 - INFO - llamafactory.data.template - Add pad token: <|eot_id|>
+
+07/29/2024 16:41:56 - INFO - llamafactory.data.template - Replace eos token: <|eot_id|>
+
+07/29/2024 16:41:56 - INFO - llamafactory.data.template - Add pad token: <|eot_id|>
+
+07/29/2024 16:41:56 - INFO - llamafactory.data.template - Replace eos token: <|eot_id|>
+
+07/29/2024 16:41:56 - INFO - llamafactory.data.template - Add pad token: <|eot_id|>
+
+07/29/2024 16:41:56 - INFO - llamafactory.data.template - Replace eos token: <|eot_id|>
+
+07/29/2024 16:41:56 - INFO - llamafactory.data.template - Add pad token: <|eot_id|>
+
+07/29/2024 16:41:57 - INFO - llamafactory.data.loader - Loading dataset convert_finetune_truth_train.json...
+
+07/29/2024 16:41:57 - INFO - llamafactory.data.loader - Loading dataset convert_finetune_truth_train.json...
+
+07/29/2024 16:41:57 - INFO - llamafactory.data.loader - Loading dataset convert_finetune_truth_train.json...
+
+07/29/2024 16:41:57 - INFO - llamafactory.data.loader - Loading dataset convert_finetune_truth_train.json...
+
+07/29/2024 16:41:57 - INFO - llamafactory.data.loader - Loading dataset convert_finetune_truth_train.json...
+
+07/29/2024 16:41:57 - INFO - llamafactory.data.loader - Loading dataset convert_finetune_truth_train.json...
+
+07/29/2024 16:41:57 - INFO - llamafactory.data.loader - Loading dataset convert_finetune_truth_train.json...
+
+[INFO|configuration_utils.py:733] 2024-07-29 16:42:02,037 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--meta-llama--Meta-Llama-3.1-8B-Instruct/snapshots/b2a4d0f33b41fcd59a6d31662cc63b8d53367e1e/config.json
+
+[INFO|configuration_utils.py:800] 2024-07-29 16:42:02,039 >> Model config LlamaConfig {
+  "_name_or_path": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+  "architectures": [
+    "LlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": 128000,
+  "eos_token_id": [
+    128001,
+    128008,
+    128009
+  ],
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 131072,
+  "mlp_bias": false,
+  "model_type": "llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": {
+    "factor": 8.0,
+    "high_freq_factor": 4.0,
+    "low_freq_factor": 1.0,
+    "original_max_position_embeddings": 8192,
+    "rope_type": "llama3"
+  },
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": false,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.43.3",
+  "use_cache": true,
+  "vocab_size": 128256
+}
+
+
+[INFO|modeling_utils.py:3634] 2024-07-29 16:42:02,932 >> loading weights file model.safetensors from cache at /root/.cache/huggingface/hub/models--meta-llama--Meta-Llama-3.1-8B-Instruct/snapshots/b2a4d0f33b41fcd59a6d31662cc63b8d53367e1e/model.safetensors.index.json
+
+[INFO|modeling_utils.py:1572] 2024-07-29 16:44:31,929 >> Instantiating LlamaForCausalLM model under default dtype torch.bfloat16.
+
+[INFO|configuration_utils.py:1038] 2024-07-29 16:44:31,933 >> Generate config GenerationConfig {
+  "bos_token_id": 128000,
+  "eos_token_id": [
+    128001,
+    128008,
+    128009
+  ]
+}
+
+
+07/29/2024 16:44:36 - INFO - llamafactory.model.model_utils.checkpointing - Gradient checkpointing enabled.
+
+07/29/2024 16:44:36 - INFO - llamafactory.model.model_utils.attention - Using torch SDPA for faster training and inference.
+
+07/29/2024 16:44:36 - INFO - llamafactory.model.adapter - Upcasting trainable params to float32.
+
+07/29/2024 16:44:36 - INFO - llamafactory.model.adapter - Fine-tuning method: Full
+
+07/29/2024 16:44:36 - INFO - llamafactory.model.loader - trainable params: 8,030,261,248 || all params: 8,030,261,248 || trainable%: 100.0000
+
+[INFO|modeling_utils.py:4463] 2024-07-29 16:44:37,295 >> All model checkpoint weights were used when initializing LlamaForCausalLM.
+
+
+[INFO|modeling_utils.py:4471] 2024-07-29 16:44:37,295 >> All the weights of LlamaForCausalLM were initialized from the model checkpoint at meta-llama/Meta-Llama-3.1-8B-Instruct.
+If your task is similar to the task the model of the checkpoint was trained on, you can already use LlamaForCausalLM for predictions without further training.
+
+07/29/2024 16:44:37 - INFO - llamafactory.model.model_utils.checkpointing - Gradient checkpointing enabled.
+
+07/29/2024 16:44:37 - INFO - llamafactory.model.model_utils.attention - Using torch SDPA for faster training and inference.
+
+07/29/2024 16:44:37 - INFO - llamafactory.model.adapter - Upcasting trainable params to float32.
+
+07/29/2024 16:44:37 - INFO - llamafactory.model.adapter - Fine-tuning method: Full
+
+07/29/2024 16:44:37 - INFO - llamafactory.model.model_utils.checkpointing - Gradient checkpointing enabled.
+
+07/29/2024 16:44:37 - INFO - llamafactory.model.model_utils.attention - Using torch SDPA for faster training and inference.
+
+07/29/2024 16:44:37 - INFO - llamafactory.model.adapter - Upcasting trainable params to float32.
+
+07/29/2024 16:44:37 - INFO - llamafactory.model.adapter - Fine-tuning method: Full
+
+[INFO|configuration_utils.py:993] 2024-07-29 16:44:37,468 >> loading configuration file generation_config.json from cache at /root/.cache/huggingface/hub/models--meta-llama--Meta-Llama-3.1-8B-Instruct/snapshots/b2a4d0f33b41fcd59a6d31662cc63b8d53367e1e/generation_config.json
+
+[INFO|configuration_utils.py:1038] 2024-07-29 16:44:37,468 >> Generate config GenerationConfig {
+  "bos_token_id": 128000,
+  "do_sample": true,
+  "eos_token_id": [
+    128001,
+    128008,
+    128009
+  ],
+  "temperature": 0.6,
+  "top_p": 0.9
+}
+
+
+07/29/2024 16:44:37 - INFO - llamafactory.model.model_utils.checkpointing - Gradient checkpointing enabled.
+
+07/29/2024 16:44:37 - INFO - llamafactory.model.model_utils.attention - Using torch SDPA for faster training and inference.
+
+07/29/2024 16:44:37 - INFO - llamafactory.model.adapter - Upcasting trainable params to float32.
+
+07/29/2024 16:44:37 - INFO - llamafactory.model.adapter - Fine-tuning method: Full
+
+[INFO|checkpointing.py:103] 2024-07-29 16:44:37,476 >> Gradient checkpointing enabled.
+
+[INFO|attention.py:84] 2024-07-29 16:44:37,476 >> Using torch SDPA for faster training and inference.
+
+[INFO|adapter.py:302] 2024-07-29 16:44:37,476 >> Upcasting trainable params to float32.
+
+[INFO|adapter.py:48] 2024-07-29 16:44:37,476 >> Fine-tuning method: Full
+
+07/29/2024 16:44:37 - INFO - llamafactory.model.model_utils.checkpointing - Gradient checkpointing enabled.
+
+07/29/2024 16:44:37 - INFO - llamafactory.model.model_utils.attention - Using torch SDPA for faster training and inference.
+
+07/29/2024 16:44:37 - INFO - llamafactory.model.adapter - Upcasting trainable params to float32.
+
+07/29/2024 16:44:37 - INFO - llamafactory.model.adapter - Fine-tuning method: Full
+
+07/29/2024 16:44:37 - INFO - llamafactory.model.model_utils.checkpointing - Gradient checkpointing enabled.
+
+07/29/2024 16:44:37 - INFO - llamafactory.model.model_utils.attention - Using torch SDPA for faster training and inference.
+
+07/29/2024 16:44:37 - INFO - llamafactory.model.adapter - Upcasting trainable params to float32.
+
+07/29/2024 16:44:37 - INFO - llamafactory.model.adapter - Fine-tuning method: Full
+
+07/29/2024 16:44:37 - INFO - llamafactory.model.loader - trainable params: 8,030,261,248 || all params: 8,030,261,248 || trainable%: 100.0000
+
+07/29/2024 16:44:37 - INFO - llamafactory.model.loader - trainable params: 8,030,261,248 || all params: 8,030,261,248 || trainable%: 100.0000
+
+07/29/2024 16:44:37 - INFO - llamafactory.model.loader - trainable params: 8,030,261,248 || all params: 8,030,261,248 || trainable%: 100.0000
+
+07/29/2024 16:44:37 - INFO - llamafactory.model.model_utils.checkpointing - Gradient checkpointing enabled.
+
+07/29/2024 16:44:37 - INFO - llamafactory.model.model_utils.attention - Using torch SDPA for faster training and inference.
+
+07/29/2024 16:44:37 - INFO - llamafactory.model.adapter - Upcasting trainable params to float32.
+
+07/29/2024 16:44:37 - INFO - llamafactory.model.adapter - Fine-tuning method: Full
+
+[INFO|loader.py:196] 2024-07-29 16:44:37,576 >> trainable params: 8,030,261,248 || all params: 8,030,261,248 || trainable%: 100.0000
+
+07/29/2024 16:44:37 - INFO - llamafactory.model.loader - trainable params: 8,030,261,248 || all params: 8,030,261,248 || trainable%: 100.0000
+
+07/29/2024 16:44:37 - INFO - llamafactory.model.loader - trainable params: 8,030,261,248 || all params: 8,030,261,248 || trainable%: 100.0000
+
+07/29/2024 16:44:37 - INFO - llamafactory.model.loader - trainable params: 8,030,261,248 || all params: 8,030,261,248 || trainable%: 100.0000
+
+[INFO|trainer.py:648] 2024-07-29 16:44:37,582 >> Using auto half precision backend
+
+[INFO|trainer.py:2134] 2024-07-29 16:44:59,827 >> ***** Running training *****
+
+[INFO|trainer.py:2135] 2024-07-29 16:44:59,827 >>   Num examples = 19,880
+
+[INFO|trainer.py:2136] 2024-07-29 16:44:59,827 >>   Num Epochs = 5
+
+[INFO|trainer.py:2137] 2024-07-29 16:44:59,827 >>   Instantaneous batch size per device = 2
+
+[INFO|trainer.py:2140] 2024-07-29 16:44:59,827 >>   Total train batch size (w. parallel, distributed & accumulation) = 128
+
+[INFO|trainer.py:2141] 2024-07-29 16:44:59,827 >>   Gradient Accumulation steps = 8
+
+[INFO|trainer.py:2142] 2024-07-29 16:44:59,827 >>   Total optimization steps = 775
+
+[INFO|trainer.py:2143] 2024-07-29 16:44:59,828 >>   Number of trainable parameters = 8,030,261,248
+
+[INFO|callbacks.py:310] 2024-07-29 16:45:33,237 >> {'loss': 12.2736, 'learning_rate': 5.0000e-07, 'epoch': 0.01, 'throughput': 196.84}
+
+[INFO|callbacks.py:310] 2024-07-29 16:45:46,485 >> {'loss': 12.5518, 'learning_rate': 1.0000e-06, 'epoch': 0.01, 'throughput': 285.33}
+
+[INFO|callbacks.py:310] 2024-07-29 16:45:59,669 >> {'loss': 11.8551, 'learning_rate': 1.5000e-06, 'epoch': 0.02, 'throughput': 333.43}
+
+[INFO|callbacks.py:310] 2024-07-29 16:46:12,884 >> {'loss': 10.9045, 'learning_rate': 2.0000e-06, 'epoch': 0.03, 'throughput': 364.66}
+
+[INFO|callbacks.py:310] 2024-07-29 16:46:26,094 >> {'loss': 8.9845, 'learning_rate': 2.5000e-06, 'epoch': 0.03, 'throughput': 386.72}
+
+[INFO|callbacks.py:310] 2024-07-29 16:46:39,302 >> {'loss': 6.3500, 'learning_rate': 3.0000e-06, 'epoch': 0.04, 'throughput': 401.96}
+
+[INFO|callbacks.py:310] 2024-07-29 16:46:52,516 >> {'loss': 5.4864, 'learning_rate': 3.5000e-06, 'epoch': 0.05, 'throughput': 412.61}
+
+[INFO|callbacks.py:310] 2024-07-29 16:47:05,728 >> {'loss': 2.8013, 'learning_rate': 4.0000e-06, 'epoch': 0.05, 'throughput': 421.04}
+
+[INFO|callbacks.py:310] 2024-07-29 16:47:18,954 >> {'loss': 0.9851, 'learning_rate': 4.5000e-06, 'epoch': 0.06, 'throughput': 430.24}
+
+[INFO|callbacks.py:310] 2024-07-29 16:47:32,150 >> {'loss': 0.5088, 'learning_rate': 5.0000e-06, 'epoch': 0.06, 'throughput': 435.61}
+
+[INFO|callbacks.py:310] 2024-07-29 16:47:45,374 >> {'loss': 2.0220, 'learning_rate': 5.0000e-06, 'epoch': 0.07, 'throughput': 442.08}
+
+[INFO|callbacks.py:310] 2024-07-29 16:47:58,566 >> {'loss': 0.5842, 'learning_rate': 4.9999e-06, 'epoch': 0.08, 'throughput': 447.95}
+
+[INFO|callbacks.py:310] 2024-07-29 16:48:11,784 >> {'loss': 0.8538, 'learning_rate': 4.9998e-06, 'epoch': 0.08, 'throughput': 451.61}
+
+[INFO|callbacks.py:310] 2024-07-29 16:48:24,982 >> {'loss': 0.5843, 'learning_rate': 4.9997e-06, 'epoch': 0.09, 'throughput': 454.14}
+
+[INFO|callbacks.py:310] 2024-07-29 16:48:38,193 >> {'loss': 0.4456, 'learning_rate': 4.9995e-06, 'epoch': 0.10, 'throughput': 456.56}
+
+[INFO|callbacks.py:310] 2024-07-29 16:48:51,402 >> {'loss': 0.3542, 'learning_rate': 4.9992e-06, 'epoch': 0.10, 'throughput': 457.81}
+
+[INFO|callbacks.py:310] 2024-07-29 16:49:04,606 >> {'loss': 0.3416, 'learning_rate': 4.9990e-06, 'epoch': 0.11, 'throughput': 460.31}
+
+[INFO|callbacks.py:310] 2024-07-29 16:49:17,817 >> {'loss': 0.2803, 'learning_rate': 4.9987e-06, 'epoch': 0.12, 'throughput': 462.47}
+
+[INFO|callbacks.py:310] 2024-07-29 16:49:31,031 >> {'loss': 0.2603, 'learning_rate': 4.9983e-06, 'epoch': 0.12, 'throughput': 464.66}
+
+[INFO|callbacks.py:310] 2024-07-29 16:49:44,230 >> {'loss': 0.2456, 'learning_rate': 4.9979e-06, 'epoch': 0.13, 'throughput': 466.61}
+
+[INFO|callbacks.py:310] 2024-07-29 16:49:57,442 >> {'loss': 0.2269, 'learning_rate': 4.9974e-06, 'epoch': 0.14, 'throughput': 468.64}
+
+[INFO|callbacks.py:310] 2024-07-29 16:50:10,645 >> {'loss': 0.2896, 'learning_rate': 4.9970e-06, 'epoch': 0.14, 'throughput': 469.63}
+
+[INFO|callbacks.py:310] 2024-07-29 16:50:23,868 >> {'loss': 0.1415, 'learning_rate': 4.9964e-06, 'epoch': 0.15, 'throughput': 469.52}
+
+[INFO|callbacks.py:310] 2024-07-29 16:50:37,074 >> {'loss': 0.2228, 'learning_rate': 4.9959e-06, 'epoch': 0.15, 'throughput': 469.78}
+
+[INFO|callbacks.py:310] 2024-07-29 16:50:50,286 >> {'loss': 0.1537, 'learning_rate': 4.9953e-06, 'epoch': 0.16, 'throughput': 469.88}
+
+[INFO|callbacks.py:310] 2024-07-29 16:51:03,488 >> {'loss': 0.2146, 'learning_rate': 4.9946e-06, 'epoch': 0.17, 'throughput': 469.94}
+
+[INFO|callbacks.py:310] 2024-07-29 16:51:16,692 >> {'loss': 0.1688, 'learning_rate': 4.9939e-06, 'epoch': 0.17, 'throughput': 472.07}
+
+[INFO|callbacks.py:310] 2024-07-29 16:51:29,886 >> {'loss': 0.1592, 'learning_rate': 4.9932e-06, 'epoch': 0.18, 'throughput': 473.37}
+
+[INFO|callbacks.py:310] 2024-07-29 16:51:43,076 >> {'loss': 0.2333, 'learning_rate': 4.9924e-06, 'epoch': 0.19, 'throughput': 473.68}
+
+[INFO|callbacks.py:310] 2024-07-29 16:51:56,314 >> {'loss': 0.1812, 'learning_rate': 4.9916e-06, 'epoch': 0.19, 'throughput': 474.37}
+
+[INFO|callbacks.py:310] 2024-07-29 16:52:09,516 >> {'loss': 0.1925, 'learning_rate': 4.9907e-06, 'epoch': 0.20, 'throughput': 475.40}
+
+[INFO|callbacks.py:310] 2024-07-29 16:52:22,726 >> {'loss': 0.1864, 'learning_rate': 4.9898e-06, 'epoch': 0.21, 'throughput': 476.28}
+
+[INFO|callbacks.py:310] 2024-07-29 16:52:35,936 >> {'loss': 0.1848, 'learning_rate': 4.9889e-06, 'epoch': 0.21, 'throughput': 477.05}
+
+[INFO|callbacks.py:310] 2024-07-29 16:52:49,155 >> {'loss': 0.1466, 'learning_rate': 4.9879e-06, 'epoch': 0.22, 'throughput': 477.42}
+
+[INFO|callbacks.py:310] 2024-07-29 16:53:02,357 >> {'loss': 0.1481, 'learning_rate': 4.9868e-06, 'epoch': 0.23, 'throughput': 478.48}
+
+[INFO|callbacks.py:310] 2024-07-29 16:53:15,573 >> {'loss': 0.1490, 'learning_rate': 4.9858e-06, 'epoch': 0.23, 'throughput': 478.70}
+
+[INFO|callbacks.py:310] 2024-07-29 16:53:28,783 >> {'loss': 0.1506, 'learning_rate': 4.9846e-06, 'epoch': 0.24, 'throughput': 478.72}
+
+[INFO|callbacks.py:310] 2024-07-29 16:53:41,994 >> {'loss': 0.1709, 'learning_rate': 4.9835e-06, 'epoch': 0.24, 'throughput': 478.96}
+
+[INFO|callbacks.py:310] 2024-07-29 16:53:55,207 >> {'loss': 0.0898, 'learning_rate': 4.9823e-06, 'epoch': 0.25, 'throughput': 478.97}
+
+[INFO|callbacks.py:310] 2024-07-29 16:54:08,390 >> {'loss': 0.1856, 'learning_rate': 4.9811e-06, 'epoch': 0.26, 'throughput': 479.60}
+
+[INFO|callbacks.py:310] 2024-07-29 16:54:21,607 >> {'loss': 0.0947, 'learning_rate': 4.9798e-06, 'epoch': 0.26, 'throughput': 480.45}
+
+[INFO|callbacks.py:310] 2024-07-29 16:54:34,818 >> {'loss': 0.1367, 'learning_rate': 4.9784e-06, 'epoch': 0.27, 'throughput': 481.04}
+
+[INFO|callbacks.py:310] 2024-07-29 16:54:48,033 >> {'loss': 0.1645, 'learning_rate': 4.9771e-06, 'epoch': 0.28, 'throughput': 481.85}
+
+[INFO|callbacks.py:310] 2024-07-29 16:55:01,242 >> {'loss': 0.1179, 'learning_rate': 4.9757e-06, 'epoch': 0.28, 'throughput': 482.49}
+
+[INFO|callbacks.py:310] 2024-07-29 16:55:14,448 >> {'loss': 0.1330, 'learning_rate': 4.9742e-06, 'epoch': 0.29, 'throughput': 482.82}
+
+[INFO|callbacks.py:310] 2024-07-29 16:55:27,644 >> {'loss': 0.0830, 'learning_rate': 4.9727e-06, 'epoch': 0.30, 'throughput': 483.25}
+
+[INFO|callbacks.py:310] 2024-07-29 16:55:40,841 >> {'loss': 0.1748, 'learning_rate': 4.9712e-06, 'epoch': 0.30, 'throughput': 484.08}
+
+[INFO|callbacks.py:310] 2024-07-29 16:55:54,040 >> {'loss': 0.1854, 'learning_rate': 4.9696e-06, 'epoch': 0.31, 'throughput': 485.23}
+
+[INFO|callbacks.py:310] 2024-07-29 16:56:07,259 >> {'loss': 0.1232, 'learning_rate': 4.9680e-06, 'epoch': 0.32, 'throughput': 485.90}
+
+[INFO|callbacks.py:310] 2024-07-29 16:56:20,464 >> {'loss': 0.1172, 'learning_rate': 4.9663e-06, 'epoch': 0.32, 'throughput': 486.06}
+
+[INFO|callbacks.py:310] 2024-07-29 16:56:33,665 >> {'loss': 0.1404, 'learning_rate': 4.9646e-06, 'epoch': 0.33, 'throughput': 487.08}
+
+[INFO|callbacks.py:310] 2024-07-29 16:56:46,867 >> {'loss': 0.1370, 'learning_rate': 4.9629e-06, 'epoch': 0.33, 'throughput': 487.08}
+
+[INFO|callbacks.py:310] 2024-07-29 16:57:00,077 >> {'loss': 0.1191, 'learning_rate': 4.9611e-06, 'epoch': 0.34, 'throughput': 487.59}
+
+[INFO|callbacks.py:310] 2024-07-29 16:57:13,277 >> {'loss': 0.1624, 'learning_rate': 4.9593e-06, 'epoch': 0.35, 'throughput': 487.69}
+
+[INFO|callbacks.py:310] 2024-07-29 16:57:26,482 >> {'loss': 0.1256, 'learning_rate': 4.9574e-06, 'epoch': 0.35, 'throughput': 488.00}
+
+[INFO|callbacks.py:310] 2024-07-29 16:57:39,692 >> {'loss': 0.1274, 'learning_rate': 4.9555e-06, 'epoch': 0.36, 'throughput': 488.11}
+
+[INFO|callbacks.py:310] 2024-07-29 16:57:52,902 >> {'loss': 0.1740, 'learning_rate': 4.9536e-06, 'epoch': 0.37, 'throughput': 487.88}
+
+[INFO|callbacks.py:310] 2024-07-29 16:58:06,106 >> {'loss': 0.0805, 'learning_rate': 4.9516e-06, 'epoch': 0.37, 'throughput': 488.03}
+
+[INFO|callbacks.py:310] 2024-07-29 16:58:19,315 >> {'loss': 0.1181, 'learning_rate': 4.9496e-06, 'epoch': 0.38, 'throughput': 488.37}
+
+[INFO|callbacks.py:310] 2024-07-29 16:58:32,518 >> {'loss': 0.1613, 'learning_rate': 4.9475e-06, 'epoch': 0.39, 'throughput': 488.45}
+
+[INFO|callbacks.py:310] 2024-07-29 16:58:45,723 >> {'loss': 0.0885, 'learning_rate': 4.9454e-06, 'epoch': 0.39, 'throughput': 488.78}
+
+[INFO|callbacks.py:310] 2024-07-29 16:58:58,924 >> {'loss': 0.1526, 'learning_rate': 4.9432e-06, 'epoch': 0.40, 'throughput': 488.74}
+
+[INFO|callbacks.py:310] 2024-07-29 16:59:12,150 >> {'loss': 0.1114, 'learning_rate': 4.9410e-06, 'epoch': 0.41, 'throughput': 488.87}
+
+[INFO|callbacks.py:310] 2024-07-29 16:59:25,370 >> {'loss': 0.1139, 'learning_rate': 4.9388e-06, 'epoch': 0.41, 'throughput': 488.87}
+
+[INFO|callbacks.py:310] 2024-07-29 16:59:38,576 >> {'loss': 0.0878, 'learning_rate': 4.9365e-06, 'epoch': 0.42, 'throughput': 489.21}
+
+[INFO|callbacks.py:310] 2024-07-29 16:59:51,775 >> {'loss': 0.1047, 'learning_rate': 4.9342e-06, 'epoch': 0.42, 'throughput': 489.63}
+
+[INFO|callbacks.py:310] 2024-07-29 17:00:04,986 >> {'loss': 0.1323, 'learning_rate': 4.9318e-06, 'epoch': 0.43, 'throughput': 490.13}
+
+[INFO|callbacks.py:310] 2024-07-29 17:00:18,183 >> {'loss': 0.0950, 'learning_rate': 4.9294e-06, 'epoch': 0.44, 'throughput': 490.51}
+
+[INFO|callbacks.py:310] 2024-07-29 17:00:31,381 >> {'loss': 0.2024, 'learning_rate': 4.9270e-06, 'epoch': 0.44, 'throughput': 490.90}
+
+[INFO|callbacks.py:310] 2024-07-29 17:00:44,566 >> {'loss': 0.1614, 'learning_rate': 4.9245e-06, 'epoch': 0.45, 'throughput': 491.13}
+
+[INFO|callbacks.py:310] 2024-07-29 17:00:57,772 >> {'loss': 0.2073, 'learning_rate': 4.9220e-06, 'epoch': 0.46, 'throughput': 491.09}
+
+[INFO|callbacks.py:310] 2024-07-29 17:01:10,978 >> {'loss': 0.1080, 'learning_rate': 4.9194e-06, 'epoch': 0.46, 'throughput': 490.90}
+
+[INFO|callbacks.py:310] 2024-07-29 17:01:24,194 >> {'loss': 0.1455, 'learning_rate': 4.9168e-06, 'epoch': 0.47, 'throughput': 490.57}
+
+[INFO|callbacks.py:310] 2024-07-29 17:01:37,418 >> {'loss': 0.1310, 'learning_rate': 4.9141e-06, 'epoch': 0.48, 'throughput': 490.69}
+
+[INFO|callbacks.py:310] 2024-07-29 17:01:50,635 >> {'loss': 0.1416, 'learning_rate': 4.9115e-06, 'epoch': 0.48, 'throughput': 490.86}
+
+[INFO|callbacks.py:310] 2024-07-29 17:02:03,838 >> {'loss': 0.1045, 'learning_rate': 4.9087e-06, 'epoch': 0.49, 'throughput': 490.75}
+
+[INFO|callbacks.py:310] 2024-07-29 17:02:17,046 >> {'loss': 0.1466, 'learning_rate': 4.9060e-06, 'epoch': 0.50, 'throughput': 490.84}
+
+[INFO|callbacks.py:310] 2024-07-29 17:02:30,255 >> {'loss': 0.1502, 'learning_rate': 4.9032e-06, 'epoch': 0.50, 'throughput': 491.00}
+
+[INFO|callbacks.py:310] 2024-07-29 17:02:43,471 >> {'loss': 0.1235, 'learning_rate': 4.9003e-06, 'epoch': 0.51, 'throughput': 491.08}
+
+[INFO|callbacks.py:310] 2024-07-29 17:02:56,661 >> {'loss': 0.1022, 'learning_rate': 4.8974e-06, 'epoch': 0.51, 'throughput': 491.22}
+
+[INFO|callbacks.py:310] 2024-07-29 17:03:09,868 >> {'loss': 0.0922, 'learning_rate': 4.8945e-06, 'epoch': 0.52, 'throughput': 491.65}
+
+[INFO|callbacks.py:310] 2024-07-29 17:03:23,085 >> {'loss': 0.0699, 'learning_rate': 4.8915e-06, 'epoch': 0.53, 'throughput': 491.72}
+
+[INFO|callbacks.py:310] 2024-07-29 17:03:36,311 >> {'loss': 0.1057, 'learning_rate': 4.8885e-06, 'epoch': 0.53, 'throughput': 491.50}
+
+[INFO|callbacks.py:310] 2024-07-29 17:03:49,515 >> {'loss': 0.0793, 'learning_rate': 4.8854e-06, 'epoch': 0.54, 'throughput': 491.90}
+
+[INFO|callbacks.py:310] 2024-07-29 17:04:02,733 >> {'loss': 0.1400, 'learning_rate': 4.8824e-06, 'epoch': 0.55, 'throughput': 491.90}
+
+[INFO|callbacks.py:310] 2024-07-29 17:04:15,934 >> {'loss': 0.1538, 'learning_rate': 4.8792e-06, 'epoch': 0.55, 'throughput': 491.89}
+
+[INFO|callbacks.py:310] 2024-07-29 17:04:29,139 >> {'loss': 0.1135, 'learning_rate': 4.8760e-06, 'epoch': 0.56, 'throughput': 492.12}
+
+[INFO|callbacks.py:310] 2024-07-29 17:04:42,351 >> {'loss': 0.1253, 'learning_rate': 4.8728e-06, 'epoch': 0.57, 'throughput': 492.01}
+
+[INFO|callbacks.py:310] 2024-07-29 17:04:55,572 >> {'loss': 0.1408, 'learning_rate': 4.8696e-06, 'epoch': 0.57, 'throughput': 492.23}
+
+[INFO|callbacks.py:310] 2024-07-29 17:05:08,780 >> {'loss': 0.0916, 'learning_rate': 4.8663e-06, 'epoch': 0.58, 'throughput': 492.43}
+
+[INFO|callbacks.py:310] 2024-07-29 17:05:22,001 >> {'loss': 0.1016, 'learning_rate': 4.8630e-06, 'epoch': 0.59, 'throughput': 492.41}
+
+[INFO|callbacks.py:310] 2024-07-29 17:05:35,214 >> {'loss': 0.0936, 'learning_rate': 4.8596e-06, 'epoch': 0.59, 'throughput': 492.48}
+
+[INFO|callbacks.py:310] 2024-07-29 17:05:48,425 >> {'loss': 0.1557, 'learning_rate': 4.8562e-06, 'epoch': 0.60, 'throughput': 492.79}
+
+[INFO|callbacks.py:310] 2024-07-29 17:06:01,639 >> {'loss': 0.0879, 'learning_rate': 4.8527e-06, 'epoch': 0.60, 'throughput': 492.85}
+
+[INFO|callbacks.py:310] 2024-07-29 17:06:14,846 >> {'loss': 0.1399, 'learning_rate': 4.8492e-06, 'epoch': 0.61, 'throughput': 493.15}
+
+[INFO|callbacks.py:310] 2024-07-29 17:06:28,063 >> {'loss': 0.1724, 'learning_rate': 4.8457e-06, 'epoch': 0.62, 'throughput': 492.97}
+
+[INFO|callbacks.py:310] 2024-07-29 17:06:41,274 >> {'loss': 0.1018, 'learning_rate': 4.8421e-06, 'epoch': 0.62, 'throughput': 493.20}
+
+[INFO|callbacks.py:310] 2024-07-29 17:06:54,486 >> {'loss': 0.0398, 'learning_rate': 4.8385e-06, 'epoch': 0.63, 'throughput': 493.48}
+
+[INFO|callbacks.py:310] 2024-07-29 17:07:07,691 >> {'loss': 0.1094, 'learning_rate': 4.8349e-06, 'epoch': 0.64, 'throughput': 493.59}
+
+[INFO|callbacks.py:310] 2024-07-29 17:07:20,901 >> {'loss': 0.1915, 'learning_rate': 4.8312e-06, 'epoch': 0.64, 'throughput': 493.80}
+
+[INFO|callbacks.py:310] 2024-07-29 17:07:34,110 >> {'loss': 0.1518, 'learning_rate': 4.8275e-06, 'epoch': 0.65, 'throughput': 493.68}
+
+[INFO|callbacks.py:310] 2024-07-29 17:07:47,302 >> {'loss': 0.1410, 'learning_rate': 4.8237e-06, 'epoch': 0.66, 'throughput': 493.78}
+
+[INFO|callbacks.py:310] 2024-07-29 17:08:00,533 >> {'loss': 0.1564, 'learning_rate': 4.8199e-06, 'epoch': 0.66, 'throughput': 493.85}
+
+[INFO|callbacks.py:310] 2024-07-29 17:08:13,748 >> {'loss': 0.1267, 'learning_rate': 4.8160e-06, 'epoch': 0.67, 'throughput': 493.87}
+
+[INFO|callbacks.py:310] 2024-07-29 17:08:26,979 >> {'loss': 0.2280, 'learning_rate': 4.8121e-06, 'epoch': 0.68, 'throughput': 494.08}
+
+[INFO|callbacks.py:310] 2024-07-29 17:08:40,183 >> {'loss': 0.1532, 'learning_rate': 4.8082e-06, 'epoch': 0.68, 'throughput': 494.21}
+
+[INFO|callbacks.py:310] 2024-07-29 17:08:53,402 >> {'loss': 0.1482, 'learning_rate': 4.8043e-06, 'epoch': 0.69, 'throughput': 494.13}
+
+[INFO|callbacks.py:310] 2024-07-29 17:09:06,601 >> {'loss': 0.1074, 'learning_rate': 4.8003e-06, 'epoch': 0.70, 'throughput': 494.04}
+
+[INFO|callbacks.py:310] 2024-07-29 17:09:19,791 >> {'loss': 0.1163, 'learning_rate': 4.7962e-06, 'epoch': 0.70, 'throughput': 494.26}
+
+[INFO|callbacks.py:310] 2024-07-29 17:09:32,997 >> {'loss': 0.1871, 'learning_rate': 4.7921e-06, 'epoch': 0.71, 'throughput': 494.48}
+
+[INFO|callbacks.py:310] 2024-07-29 17:09:46,214 >> {'loss': 0.0920, 'learning_rate': 4.7880e-06, 'epoch': 0.71, 'throughput': 494.30}
+
+[INFO|callbacks.py:310] 2024-07-29 17:09:59,411 >> {'loss': 0.0959, 'learning_rate': 4.7839e-06, 'epoch': 0.72, 'throughput': 494.28}
+
+[INFO|callbacks.py:310] 2024-07-29 17:10:12,618 >> {'loss': 0.0853, 'learning_rate': 4.7797e-06, 'epoch': 0.73, 'throughput': 494.23}
+
+[INFO|callbacks.py:310] 2024-07-29 17:10:25,817 >> {'loss': 0.1581, 'learning_rate': 4.7754e-06, 'epoch': 0.73, 'throughput': 494.42}
+
+[INFO|callbacks.py:310] 2024-07-29 17:10:39,028 >> {'loss': 0.0907, 'learning_rate': 4.7712e-06, 'epoch': 0.74, 'throughput': 494.64}
+
+[INFO|callbacks.py:310] 2024-07-29 17:10:52,225 >> {'loss': 0.0826, 'learning_rate': 4.7669e-06, 'epoch': 0.75, 'throughput': 494.82}
+
+[INFO|callbacks.py:310] 2024-07-29 17:11:05,421 >> {'loss': 0.0979, 'learning_rate': 4.7625e-06, 'epoch': 0.75, 'throughput': 494.70}
+
+[INFO|callbacks.py:310] 2024-07-29 17:11:18,622 >> {'loss': 0.1617, 'learning_rate': 4.7581e-06, 'epoch': 0.76, 'throughput': 494.83}
+
+[INFO|callbacks.py:310] 2024-07-29 17:11:31,831 >> {'loss': 0.1306, 'learning_rate': 4.7537e-06, 'epoch': 0.77, 'throughput': 495.01}
+
+[INFO|callbacks.py:310] 2024-07-29 17:11:45,048 >> {'loss': 0.0955, 'learning_rate': 4.7492e-06, 'epoch': 0.77, 'throughput': 495.05}
+
+[INFO|callbacks.py:310] 2024-07-29 17:11:58,252 >> {'loss': 0.0683, 'learning_rate': 4.7447e-06, 'epoch': 0.78, 'throughput': 495.04}
+
+[INFO|callbacks.py:310] 2024-07-29 17:12:11,457 >> {'loss': 0.1117, 'learning_rate': 4.7402e-06, 'epoch': 0.79, 'throughput': 495.21}
+
+[INFO|callbacks.py:310] 2024-07-29 17:12:24,648 >> {'loss': 0.1115, 'learning_rate': 4.7356e-06, 'epoch': 0.79, 'throughput': 495.26}
+
+[INFO|callbacks.py:310] 2024-07-29 17:12:37,835 >> {'loss': 0.0945, 'learning_rate': 4.7310e-06, 'epoch': 0.80, 'throughput': 495.22}
+
+[INFO|callbacks.py:310] 2024-07-29 17:12:51,044 >> {'loss': 0.1125, 'learning_rate': 4.7263e-06, 'epoch': 0.80, 'throughput': 495.35}
+
+[INFO|callbacks.py:310] 2024-07-29 17:13:04,262 >> {'loss': 0.1500, 'learning_rate': 4.7217e-06, 'epoch': 0.81, 'throughput': 495.64}
+
+[INFO|callbacks.py:310] 2024-07-29 17:13:17,461 >> {'loss': 0.1015, 'learning_rate': 4.7169e-06, 'epoch': 0.82, 'throughput': 495.61}
+
+[INFO|callbacks.py:310] 2024-07-29 17:13:30,661 >> {'loss': 0.1271, 'learning_rate': 4.7122e-06, 'epoch': 0.82, 'throughput': 495.60}
+
+[INFO|callbacks.py:310] 2024-07-29 17:13:43,859 >> {'loss': 0.0762, 'learning_rate': 4.7074e-06, 'epoch': 0.83, 'throughput': 495.87}
+
+[INFO|callbacks.py:310] 2024-07-29 17:13:57,065 >> {'loss': 0.1196, 'learning_rate': 4.7025e-06, 'epoch': 0.84, 'throughput': 495.96}
+
+[INFO|callbacks.py:310] 2024-07-29 17:14:10,272 >> {'loss': 0.0866, 'learning_rate': 4.6977e-06, 'epoch': 0.84, 'throughput': 496.03}
+
+[INFO|callbacks.py:310] 2024-07-29 17:14:23,483 >> {'loss': 0.0718, 'learning_rate': 4.6927e-06, 'epoch': 0.85, 'throughput': 496.26}
+
+[INFO|callbacks.py:310] 2024-07-29 17:14:36,696 >> {'loss': 0.0808, 'learning_rate': 4.6878e-06, 'epoch': 0.86, 'throughput': 496.44}
+
+[INFO|callbacks.py:310] 2024-07-29 17:14:49,889 >> {'loss': 0.1069, 'learning_rate': 4.6828e-06, 'epoch': 0.86, 'throughput': 496.55}
+
+[INFO|callbacks.py:310] 2024-07-29 17:15:03,095 >> {'loss': 0.0804, 'learning_rate': 4.6778e-06, 'epoch': 0.87, 'throughput': 496.40}
+
+[INFO|callbacks.py:310] 2024-07-29 17:15:16,308 >> {'loss': 0.0680, 'learning_rate': 4.6727e-06, 'epoch': 0.88, 'throughput': 496.32}
+
+[INFO|callbacks.py:310] 2024-07-29 17:15:29,510 >> {'loss': 0.1193, 'learning_rate': 4.6676e-06, 'epoch': 0.88, 'throughput': 496.41}
+
+[INFO|callbacks.py:310] 2024-07-29 17:15:42,713 >> {'loss': 0.1535, 'learning_rate': 4.6625e-06, 'epoch': 0.89, 'throughput': 496.47}
+
+[INFO|callbacks.py:310] 2024-07-29 17:15:55,902 >> {'loss': 0.1303, 'learning_rate': 4.6573e-06, 'epoch': 0.89, 'throughput': 496.37}
+
+[INFO|callbacks.py:310] 2024-07-29 17:16:09,114 >> {'loss': 0.0879, 'learning_rate': 4.6521e-06, 'epoch': 0.90, 'throughput': 496.30}
+
+[INFO|callbacks.py:310] 2024-07-29 17:16:22,324 >> {'loss': 0.0986, 'learning_rate': 4.6469e-06, 'epoch': 0.91, 'throughput': 496.28}
+
+[INFO|callbacks.py:310] 2024-07-29 17:16:35,530 >> {'loss': 0.1219, 'learning_rate': 4.6416e-06, 'epoch': 0.91, 'throughput': 496.30}
+
+[INFO|callbacks.py:310] 2024-07-29 17:16:48,758 >> {'loss': 0.1303, 'learning_rate': 4.6363e-06, 'epoch': 0.92, 'throughput': 496.54}
+
+[INFO|callbacks.py:310] 2024-07-29 17:17:01,956 >> {'loss': 0.0810, 'learning_rate': 4.6309e-06, 'epoch': 0.93, 'throughput': 496.41}
+
+[INFO|callbacks.py:310] 2024-07-29 17:17:15,174 >> {'loss': 0.0979, 'learning_rate': 4.6255e-06, 'epoch': 0.93, 'throughput': 496.47}
+
+[INFO|callbacks.py:310] 2024-07-29 17:17:28,371 >> {'loss': 0.1446, 'learning_rate': 4.6201e-06, 'epoch': 0.94, 'throughput': 496.49}
+
+[INFO|callbacks.py:310] 2024-07-29 17:17:41,582 >> {'loss': 0.1267, 'learning_rate': 4.6147e-06, 'epoch': 0.95, 'throughput': 496.40}
+
+[INFO|callbacks.py:310] 2024-07-29 17:17:54,796 >> {'loss': 0.1095, 'learning_rate': 4.6092e-06, 'epoch': 0.95, 'throughput': 496.39}
+
+[INFO|callbacks.py:310] 2024-07-29 17:18:08,010 >> {'loss': 0.1252, 'learning_rate': 4.6036e-06, 'epoch': 0.96, 'throughput': 496.48}
+
+[INFO|callbacks.py:310] 2024-07-29 17:18:21,233 >> {'loss': 0.0645, 'learning_rate': 4.5981e-06, 'epoch': 0.97, 'throughput': 496.48}
+
+[INFO|callbacks.py:310] 2024-07-29 17:18:34,447 >> {'loss': 0.0974, 'learning_rate': 4.5925e-06, 'epoch': 0.97, 'throughput': 496.51}
+
+[INFO|callbacks.py:310] 2024-07-29 17:18:47,657 >> {'loss': 0.0833, 'learning_rate': 4.5868e-06, 'epoch': 0.98, 'throughput': 496.55}
+
+[INFO|callbacks.py:310] 2024-07-29 17:19:00,871 >> {'loss': 0.0787, 'learning_rate': 4.5812e-06, 'epoch': 0.98, 'throughput': 496.48}
+
+[INFO|callbacks.py:310] 2024-07-29 17:19:14,087 >> {'loss': 0.0814, 'learning_rate': 4.5755e-06, 'epoch': 0.99, 'throughput': 496.59}
+
+[INFO|callbacks.py:310] 2024-07-29 17:19:27,314 >> {'loss': 0.0278, 'learning_rate': 4.5697e-06, 'epoch': 1.00, 'throughput': 496.67}
+
+[INFO|callbacks.py:310] 2024-07-29 17:19:40,525 >> {'loss': 0.0930, 'learning_rate': 4.5639e-06, 'epoch': 1.00, 'throughput': 496.82}
+
+[INFO|callbacks.py:310] 2024-07-29 17:19:53,725 >> {'loss': 0.0730, 'learning_rate': 4.5581e-06, 'epoch': 1.01, 'throughput': 497.03}
+
+[INFO|callbacks.py:310] 2024-07-29 17:20:06,939 >> {'loss': 0.0625, 'learning_rate': 4.5523e-06, 'epoch': 1.02, 'throughput': 496.97}
+
+[INFO|callbacks.py:310] 2024-07-29 17:20:20,133 >> {'loss': 0.0461, 'learning_rate': 4.5464e-06, 'epoch': 1.02, 'throughput': 496.99}
+
+[INFO|callbacks.py:310] 2024-07-29 17:20:33,356 >> {'loss': 0.0318, 'learning_rate': 4.5405e-06, 'epoch': 1.03, 'throughput': 496.91}
+
+[INFO|callbacks.py:310] 2024-07-29 17:20:46,566 >> {'loss': 0.0068, 'learning_rate': 4.5345e-06, 'epoch': 1.04, 'throughput': 496.93}
+
+[INFO|callbacks.py:310] 2024-07-29 17:20:59,795 >> {'loss': 0.0462, 'learning_rate': 4.5286e-06, 'epoch': 1.04, 'throughput': 496.94}
+
+[INFO|callbacks.py:310] 2024-07-29 17:21:12,995 >> {'loss': 0.0487, 'learning_rate': 4.5225e-06, 'epoch': 1.05, 'throughput': 496.95}
+
+[INFO|callbacks.py:310] 2024-07-29 17:21:26,197 >> {'loss': 0.0664, 'learning_rate': 4.5165e-06, 'epoch': 1.06, 'throughput': 497.10}
+
+[INFO|callbacks.py:310] 2024-07-29 17:21:39,408 >> {'loss': 0.0347, 'learning_rate': 4.5104e-06, 'epoch': 1.06, 'throughput': 497.06}
+
+[INFO|callbacks.py:310] 2024-07-29 17:21:52,617 >> {'loss': 0.0202, 'learning_rate': 4.5043e-06, 'epoch': 1.07, 'throughput': 497.04}
+
+[INFO|callbacks.py:310] 2024-07-29 17:22:05,837 >> {'loss': 0.0573, 'learning_rate': 4.4981e-06, 'epoch': 1.07, 'throughput': 497.09}
+
+[INFO|callbacks.py:310] 2024-07-29 17:22:19,041 >> {'loss': 0.0848, 'learning_rate': 4.4919e-06, 'epoch': 1.08, 'throughput': 497.24}
+
+[INFO|callbacks.py:310] 2024-07-29 17:22:32,265 >> {'loss': 0.0464, 'learning_rate': 4.4857e-06, 'epoch': 1.09, 'throughput': 497.27}
+
+[INFO|callbacks.py:310] 2024-07-29 17:22:45,472 >> {'loss': 0.1004, 'learning_rate': 4.4795e-06, 'epoch': 1.09, 'throughput': 497.41}
+
+[INFO|callbacks.py:310] 2024-07-29 17:22:58,676 >> {'loss': 0.0842, 'learning_rate': 4.4732e-06, 'epoch': 1.10, 'throughput': 497.47}
+
+[INFO|callbacks.py:310] 2024-07-29 17:23:11,888 >> {'loss': 0.0452, 'learning_rate': 4.4669e-06, 'epoch': 1.11, 'throughput': 497.39}
+
+[INFO|callbacks.py:310] 2024-07-29 17:23:25,085 >> {'loss': 0.0225, 'learning_rate': 4.4605e-06, 'epoch': 1.11, 'throughput': 497.32}
+
+[INFO|callbacks.py:310] 2024-07-29 17:23:38,294 >> {'loss': 0.0058, 'learning_rate': 4.4541e-06, 'epoch': 1.12, 'throughput': 497.29}
+
+[INFO|callbacks.py:310] 2024-07-29 17:23:51,501 >> {'loss': 0.0565, 'learning_rate': 4.4477e-06, 'epoch': 1.13, 'throughput': 497.34}
+
+[INFO|callbacks.py:310] 2024-07-29 17:24:04,707 >> {'loss': 0.0591, 'learning_rate': 4.4412e-06, 'epoch': 1.13, 'throughput': 497.53}
+
+[INFO|callbacks.py:310] 2024-07-29 17:24:17,921 >> {'loss': 0.0488, 'learning_rate': 4.4348e-06, 'epoch': 1.14, 'throughput': 497.61}
+
+[INFO|callbacks.py:310] 2024-07-29 17:24:31,124 >> {'loss': 0.0318, 'learning_rate': 4.4282e-06, 'epoch': 1.15, 'throughput': 497.52}
+
+[INFO|callbacks.py:310] 2024-07-29 17:24:44,314 >> {'loss': 0.0640, 'learning_rate': 4.4217e-06, 'epoch': 1.15, 'throughput': 497.68}
+
+[INFO|callbacks.py:310] 2024-07-29 17:24:57,527 >> {'loss': 0.0465, 'learning_rate': 4.4151e-06, 'epoch': 1.16, 'throughput': 497.73}
+
+[INFO|callbacks.py:310] 2024-07-29 17:25:10,727 >> {'loss': 0.0344, 'learning_rate': 4.4085e-06, 'epoch': 1.16, 'throughput': 497.78}
+
+[INFO|callbacks.py:310] 2024-07-29 17:25:23,923 >> {'loss': 0.0747, 'learning_rate': 4.4018e-06, 'epoch': 1.17, 'throughput': 497.86}
+
+[INFO|callbacks.py:310] 2024-07-29 17:25:37,107 >> {'loss': 0.0730, 'learning_rate': 4.3952e-06, 'epoch': 1.18, 'throughput': 497.95}
+
+[INFO|callbacks.py:310] 2024-07-29 17:25:50,318 >> {'loss': 0.0652, 'learning_rate': 4.3885e-06, 'epoch': 1.18, 'throughput': 498.00}
+
+[INFO|callbacks.py:310] 2024-07-29 17:26:03,524 >> {'loss': 0.0881, 'learning_rate': 4.3817e-06, 'epoch': 1.19, 'throughput': 497.93}
+
+[INFO|callbacks.py:310] 2024-07-29 17:26:16,717 >> {'loss': 0.0676, 'learning_rate': 4.3749e-06, 'epoch': 1.20, 'throughput': 498.02}
+
+[INFO|callbacks.py:310] 2024-07-29 17:26:29,928 >> {'loss': 0.0481, 'learning_rate': 4.3681e-06, 'epoch': 1.20, 'throughput': 497.99}
+
+[INFO|callbacks.py:310] 2024-07-29 17:26:43,129 >> {'loss': 0.0477, 'learning_rate': 4.3613e-06, 'epoch': 1.21, 'throughput': 498.15}
+
+[INFO|callbacks.py:310] 2024-07-29 17:26:56,325 >> {'loss': 0.0456, 'learning_rate': 4.3544e-06, 'epoch': 1.22, 'throughput': 498.32}
+
+[INFO|callbacks.py:310] 2024-07-29 17:27:09,520 >> {'loss': 0.1330, 'learning_rate': 4.3475e-06, 'epoch': 1.22, 'throughput': 498.52}
+
+[INFO|callbacks.py:310] 2024-07-29 17:27:22,747 >> {'loss': 0.0672, 'learning_rate': 4.3406e-06, 'epoch': 1.23, 'throughput': 498.51}
+
+[INFO|callbacks.py:310] 2024-07-29 17:27:35,941 >> {'loss': 0.0352, 'learning_rate': 4.3336e-06, 'epoch': 1.24, 'throughput': 498.46}
+
+[INFO|callbacks.py:310] 2024-07-29 17:27:49,148 >> {'loss': 0.0361, 'learning_rate': 4.3266e-06, 'epoch': 1.24, 'throughput': 498.39}
+
+[INFO|callbacks.py:310] 2024-07-29 17:28:02,339 >> {'loss': 0.0432, 'learning_rate': 4.3196e-06, 'epoch': 1.25, 'throughput': 498.44}
+
+[INFO|callbacks.py:310] 2024-07-29 17:28:15,541 >> {'loss': 0.0398, 'learning_rate': 4.3126e-06, 'epoch': 1.26, 'throughput': 498.43}
+
+[INFO|callbacks.py:310] 2024-07-29 17:28:28,735 >> {'loss': 0.0473, 'learning_rate': 4.3055e-06, 'epoch': 1.26, 'throughput': 498.53}
+
+[INFO|callbacks.py:310] 2024-07-29 17:28:41,944 >> {'loss': 0.0394, 'learning_rate': 4.2983e-06, 'epoch': 1.27, 'throughput': 498.65}
+
+[INFO|callbacks.py:310] 2024-07-29 17:28:55,142 >> {'loss': 0.0645, 'learning_rate': 4.2912e-06, 'epoch': 1.27, 'throughput': 498.55}
+
+[INFO|callbacks.py:310] 2024-07-29 17:29:08,366 >> {'loss': 0.0663, 'learning_rate': 4.2840e-06, 'epoch': 1.28, 'throughput': 498.37}
+
+[INFO|callbacks.py:310] 2024-07-29 17:29:21,559 >> {'loss': 0.1067, 'learning_rate': 4.2768e-06, 'epoch': 1.29, 'throughput': 498.30}
+
+[INFO|callbacks.py:310] 2024-07-29 17:29:34,751 >> {'loss': 0.0603, 'learning_rate': 4.2696e-06, 'epoch': 1.29, 'throughput': 498.30}
+
+[INFO|callbacks.py:310] 2024-07-29 17:29:47,948 >> {'loss': 0.0055, 'learning_rate': 4.2623e-06, 'epoch': 1.30, 'throughput': 498.44}
+
+[INFO|callbacks.py:310] 2024-07-29 17:30:01,146 >> {'loss': 0.0547, 'learning_rate': 4.2550e-06, 'epoch': 1.31, 'throughput': 498.36}
+
+[INFO|callbacks.py:310] 2024-07-29 17:30:14,328 >> {'loss': 0.0684, 'learning_rate': 4.2477e-06, 'epoch': 1.31, 'throughput': 498.47}
+
+[INFO|callbacks.py:310] 2024-07-29 17:30:27,547 >> {'loss': 0.0484, 'learning_rate': 4.2403e-06, 'epoch': 1.32, 'throughput': 498.46}
+
+[INFO|callbacks.py:310] 2024-07-29 17:30:40,749 >> {'loss': 0.0225, 'learning_rate': 4.2329e-06, 'epoch': 1.33, 'throughput': 498.41}
+
+[INFO|callbacks.py:310] 2024-07-29 17:30:53,968 >> {'loss': 0.0278, 'learning_rate': 4.2255e-06, 'epoch': 1.33, 'throughput': 498.49}
+
+[INFO|callbacks.py:310] 2024-07-29 17:31:07,169 >> {'loss': 0.0340, 'learning_rate': 4.2181e-06, 'epoch': 1.34, 'throughput': 498.38}
+
+[INFO|callbacks.py:310] 2024-07-29 17:31:20,394 >> {'loss': 0.0453, 'learning_rate': 4.2106e-06, 'epoch': 1.35, 'throughput': 498.29}
+
+[INFO|callbacks.py:310] 2024-07-29 17:31:33,580 >> {'loss': 0.0564, 'learning_rate': 4.2031e-06, 'epoch': 1.35, 'throughput': 498.24}
+
+[INFO|callbacks.py:310] 2024-07-29 17:31:46,796 >> {'loss': 0.0318, 'learning_rate': 4.1956e-06, 'epoch': 1.36, 'throughput': 498.25}
+
+[INFO|callbacks.py:310] 2024-07-29 17:31:59,987 >> {'loss': 0.0575, 'learning_rate': 4.1880e-06, 'epoch': 1.36, 'throughput': 498.30}
+
+[INFO|callbacks.py:310] 2024-07-29 17:32:13,203 >> {'loss': 0.0448, 'learning_rate': 4.1804e-06, 'epoch': 1.37, 'throughput': 498.33}
+
+[INFO|callbacks.py:310] 2024-07-29 17:32:26,416 >> {'loss': 0.0493, 'learning_rate': 4.1728e-06, 'epoch': 1.38, 'throughput': 498.34}
+
+[INFO|callbacks.py:310] 2024-07-29 17:32:39,613 >> {'loss': 0.0478, 'learning_rate': 4.1652e-06, 'epoch': 1.38, 'throughput': 498.42}
+
+[INFO|callbacks.py:310] 2024-07-29 17:32:52,829 >> {'loss': 0.0887, 'learning_rate': 4.1575e-06, 'epoch': 1.39, 'throughput': 498.48}
+
+[INFO|callbacks.py:310] 2024-07-29 17:33:06,034 >> {'loss': 0.0634, 'learning_rate': 4.1498e-06, 'epoch': 1.40, 'throughput': 498.65}
+
+[INFO|callbacks.py:310] 2024-07-29 17:33:19,224 >> {'loss': 0.0349, 'learning_rate': 4.1421e-06, 'epoch': 1.40, 'throughput': 498.63}
+
+[INFO|callbacks.py:310] 2024-07-29 17:33:32,421 >> {'loss': 0.0459, 'learning_rate': 4.1343e-06, 'epoch': 1.41, 'throughput': 498.66}
+
+[INFO|callbacks.py:310] 2024-07-29 17:33:45,641 >> {'loss': 0.0304, 'learning_rate': 4.1265e-06, 'epoch': 1.42, 'throughput': 498.71}
+
+[INFO|callbacks.py:310] 2024-07-29 17:33:58,855 >> {'loss': 0.0658, 'learning_rate': 4.1187e-06, 'epoch': 1.42, 'throughput': 498.68}
+
+[INFO|callbacks.py:310] 2024-07-29 17:34:12,048 >> {'loss': 0.0870, 'learning_rate': 4.1109e-06, 'epoch': 1.43, 'throughput': 498.81}
+
+[INFO|callbacks.py:310] 2024-07-29 17:34:25,255 >> {'loss': 0.0952, 'learning_rate': 4.1030e-06, 'epoch': 1.44, 'throughput': 498.80}
+
+[INFO|callbacks.py:310] 2024-07-29 17:34:38,462 >> {'loss': 0.0358, 'learning_rate': 4.0951e-06, 'epoch': 1.44, 'throughput': 498.77}
+
+[INFO|callbacks.py:310] 2024-07-29 17:34:51,672 >> {'loss': 0.0527, 'learning_rate': 4.0872e-06, 'epoch': 1.45, 'throughput': 498.81}
+
+[INFO|callbacks.py:310] 2024-07-29 17:35:04,888 >> {'loss': 0.0452, 'learning_rate': 4.0793e-06, 'epoch': 1.45, 'throughput': 498.74}
+
+[INFO|callbacks.py:310] 2024-07-29 17:35:18,078 >> {'loss': 0.0313, 'learning_rate': 4.0713e-06, 'epoch': 1.46, 'throughput': 498.73}
+
+[INFO|callbacks.py:310] 2024-07-29 17:35:31,305 >> {'loss': 0.0544, 'learning_rate': 4.0633e-06, 'epoch': 1.47, 'throughput': 498.63}
+
+[INFO|callbacks.py:310] 2024-07-29 17:35:44,513 >> {'loss': 0.0428, 'learning_rate': 4.0553e-06, 'epoch': 1.47, 'throughput': 498.50}
+
+[INFO|callbacks.py:310] 2024-07-29 17:35:57,720 >> {'loss': 0.0183, 'learning_rate': 4.0472e-06, 'epoch': 1.48, 'throughput': 498.52}
+
+[INFO|callbacks.py:310] 2024-07-29 17:36:10,933 >> {'loss': 0.0662, 'learning_rate': 4.0392e-06, 'epoch': 1.49, 'throughput': 498.47}
+
+[INFO|callbacks.py:310] 2024-07-29 17:36:24,122 >> {'loss': 0.0258, 'learning_rate': 4.0311e-06, 'epoch': 1.49, 'throughput': 498.40}
+
+[INFO|callbacks.py:310] 2024-07-29 17:36:37,336 >> {'loss': 0.0584, 'learning_rate': 4.0229e-06, 'epoch': 1.50, 'throughput': 498.41}
+
+[INFO|callbacks.py:310] 2024-07-29 17:36:50,535 >> {'loss': 0.0916, 'learning_rate': 4.0148e-06, 'epoch': 1.51, 'throughput': 498.42}
+
+[INFO|callbacks.py:310] 2024-07-29 17:37:03,756 >> {'loss': 0.0397, 'learning_rate': 4.0066e-06, 'epoch': 1.51, 'throughput': 498.38}
+
+[INFO|callbacks.py:310] 2024-07-29 17:37:16,972 >> {'loss': 0.0405, 'learning_rate': 3.9984e-06, 'epoch': 1.52, 'throughput': 498.37}
+
+[INFO|callbacks.py:310] 2024-07-29 17:37:30,176 >> {'loss': 0.0645, 'learning_rate': 3.9902e-06, 'epoch': 1.53, 'throughput': 498.43}
+
+[INFO|callbacks.py:310] 2024-07-29 17:37:43,379 >> {'loss': 0.0389, 'learning_rate': 3.9819e-06, 'epoch': 1.53, 'throughput': 498.46}
+
+[INFO|callbacks.py:310] 2024-07-29 17:37:56,587 >> {'loss': 0.0341, 'learning_rate': 3.9736e-06, 'epoch': 1.54, 'throughput': 498.45}
+
+[INFO|callbacks.py:310] 2024-07-29 17:38:09,782 >> {'loss': 0.0625, 'learning_rate': 3.9653e-06, 'epoch': 1.54, 'throughput': 498.43}
+
+[INFO|callbacks.py:310] 2024-07-29 17:38:22,974 >> {'loss': 0.0651, 'learning_rate': 3.9570e-06, 'epoch': 1.55, 'throughput': 498.39}
+
+[INFO|callbacks.py:310] 2024-07-29 17:38:36,177 >> {'loss': 0.0391, 'learning_rate': 3.9486e-06, 'epoch': 1.56, 'throughput': 498.37}
+
+[INFO|callbacks.py:310] 2024-07-29 17:38:49,401 >> {'loss': 0.0486, 'learning_rate': 3.9402e-06, 'epoch': 1.56, 'throughput': 498.40}
+
+[INFO|callbacks.py:310] 2024-07-29 17:39:02,583 >> {'loss': 0.1192, 'learning_rate': 3.9318e-06, 'epoch': 1.57, 'throughput': 498.37}
+
+[INFO|callbacks.py:310] 2024-07-29 17:39:15,788 >> {'loss': 0.0679, 'learning_rate': 3.9234e-06, 'epoch': 1.58, 'throughput': 498.42}
+
+[INFO|callbacks.py:310] 2024-07-29 17:39:28,987 >> {'loss': 0.0493, 'learning_rate': 3.9150e-06, 'epoch': 1.58, 'throughput': 498.48}
+
+[INFO|callbacks.py:310] 2024-07-29 17:39:42,195 >> {'loss': 0.0456, 'learning_rate': 3.9065e-06, 'epoch': 1.59, 'throughput': 498.48}
+
+[INFO|callbacks.py:310] 2024-07-29 17:39:55,386 >> {'loss': 0.0391, 'learning_rate': 3.8980e-06, 'epoch': 1.60, 'throughput': 498.50}
+
+[INFO|callbacks.py:310] 2024-07-29 17:40:08,593 >> {'loss': 0.0501, 'learning_rate': 3.8895e-06, 'epoch': 1.60, 'throughput': 498.65}
+
+[INFO|callbacks.py:310] 2024-07-29 17:40:21,807 >> {'loss': 0.0397, 'learning_rate': 3.8809e-06, 'epoch': 1.61, 'throughput': 498.71}
+
+[INFO|callbacks.py:310] 2024-07-29 17:40:35,008 >> {'loss': 0.0846, 'learning_rate': 3.8723e-06, 'epoch': 1.62, 'throughput': 498.83}
+
+[INFO|callbacks.py:310] 2024-07-29 17:40:48,213 >> {'loss': 0.0443, 'learning_rate': 3.8637e-06, 'epoch': 1.62, 'throughput': 498.80}
+
+[INFO|callbacks.py:310] 2024-07-29 17:41:01,411 >> {'loss': 0.0450, 'learning_rate': 3.8551e-06, 'epoch': 1.63, 'throughput': 498.82}
+
+[INFO|callbacks.py:310] 2024-07-29 17:41:14,601 >> {'loss': 0.0172, 'learning_rate': 3.8465e-06, 'epoch': 1.63, 'throughput': 498.90}
+
+[INFO|callbacks.py:310] 2024-07-29 17:41:27,798 >> {'loss': 0.0949, 'learning_rate': 3.8378e-06, 'epoch': 1.64, 'throughput': 498.89}
+
+[INFO|callbacks.py:310] 2024-07-29 17:41:41,004 >> {'loss': 0.0277, 'learning_rate': 3.8291e-06, 'epoch': 1.65, 'throughput': 499.03}
+
+[INFO|callbacks.py:310] 2024-07-29 17:41:54,221 >> {'loss': 0.0247, 'learning_rate': 3.8204e-06, 'epoch': 1.65, 'throughput': 498.92}
+
+[INFO|callbacks.py:310] 2024-07-29 17:42:07,440 >> {'loss': 0.0549, 'learning_rate': 3.8117e-06, 'epoch': 1.66, 'throughput': 498.89}
+
+[INFO|callbacks.py:310] 2024-07-29 17:42:20,639 >> {'loss': 0.0342, 'learning_rate': 3.8030e-06, 'epoch': 1.67, 'throughput': 499.00}
+
+[INFO|callbacks.py:310] 2024-07-29 17:42:33,840 >> {'loss': 0.0617, 'learning_rate': 3.7942e-06, 'epoch': 1.67, 'throughput': 498.99}
+
+[INFO|callbacks.py:310] 2024-07-29 17:42:47,056 >> {'loss': 0.0675, 'learning_rate': 3.7854e-06, 'epoch': 1.68, 'throughput': 499.00}
+
+[INFO|callbacks.py:310] 2024-07-29 17:43:00,250 >> {'loss': 0.1055, 'learning_rate': 3.7766e-06, 'epoch': 1.69, 'throughput': 499.01}
+
+[INFO|callbacks.py:310] 2024-07-29 17:43:13,453 >> {'loss': 0.0355, 'learning_rate': 3.7677e-06, 'epoch': 1.69, 'throughput': 499.00}
+
+[INFO|callbacks.py:310] 2024-07-29 17:43:26,648 >> {'loss': 0.0622, 'learning_rate': 3.7589e-06, 'epoch': 1.70, 'throughput': 498.96}
+
+[INFO|callbacks.py:310] 2024-07-29 17:43:39,856 >> {'loss': 0.0297, 'learning_rate': 3.7500e-06, 'epoch': 1.71, 'throughput': 499.01}
+
+[INFO|callbacks.py:310] 2024-07-29 17:43:53,069 >> {'loss': 0.0312, 'learning_rate': 3.7411e-06, 'epoch': 1.71, 'throughput': 498.96}
+
+[INFO|callbacks.py:310] 2024-07-29 17:44:06,262 >> {'loss': 0.0811, 'learning_rate': 3.7322e-06, 'epoch': 1.72, 'throughput': 498.92}
+
+[INFO|callbacks.py:310] 2024-07-29 17:44:19,465 >> {'loss': 0.0670, 'learning_rate': 3.7232e-06, 'epoch': 1.72, 'throughput': 498.94}
+
+[INFO|callbacks.py:310] 2024-07-29 17:44:32,666 >> {'loss': 0.0553, 'learning_rate': 3.7143e-06, 'epoch': 1.73, 'throughput': 498.91}
+
+[INFO|callbacks.py:310] 2024-07-29 17:44:45,852 >> {'loss': 0.0210, 'learning_rate': 3.7053e-06, 'epoch': 1.74, 'throughput': 498.96}
+
+[INFO|callbacks.py:310] 2024-07-29 17:44:59,066 >> {'loss': 0.0589, 'learning_rate': 3.6963e-06, 'epoch': 1.74, 'throughput': 498.91}
+
+[INFO|callbacks.py:310] 2024-07-29 17:45:12,268 >> {'loss': 0.0521, 'learning_rate': 3.6873e-06, 'epoch': 1.75, 'throughput': 498.96}
+
+[INFO|callbacks.py:310] 2024-07-29 17:45:25,477 >> {'loss': 0.0402, 'learning_rate': 3.6782e-06, 'epoch': 1.76, 'throughput': 499.02}
+
+[INFO|callbacks.py:310] 2024-07-29 17:45:38,674 >> {'loss': 0.0363, 'learning_rate': 3.6691e-06, 'epoch': 1.76, 'throughput': 499.01}
+
+[INFO|callbacks.py:310] 2024-07-29 17:45:51,897 >> {'loss': 0.0523, 'learning_rate': 3.6601e-06, 'epoch': 1.77, 'throughput': 498.99}
+
+[INFO|callbacks.py:310] 2024-07-29 17:46:05,094 >> {'loss': 0.0255, 'learning_rate': 3.6510e-06, 'epoch': 1.78, 'throughput': 499.02}
+
+[INFO|callbacks.py:310] 2024-07-29 17:46:18,289 >> {'loss': 0.1031, 'learning_rate': 3.6418e-06, 'epoch': 1.78, 'throughput': 499.04}
+
+[INFO|callbacks.py:310] 2024-07-29 17:46:31,492 >> {'loss': 0.0452, 'learning_rate': 3.6327e-06, 'epoch': 1.79, 'throughput': 499.04}
+
+[INFO|callbacks.py:310] 2024-07-29 17:46:44,705 >> {'loss': 0.0537, 'learning_rate': 3.6235e-06, 'epoch': 1.80, 'throughput': 499.06}
+
+[INFO|callbacks.py:310] 2024-07-29 17:46:57,903 >> {'loss': 0.0572, 'learning_rate': 3.6143e-06, 'epoch': 1.80, 'throughput': 499.09}
+
+[INFO|callbacks.py:310] 2024-07-29 17:47:11,093 >> {'loss': 0.0298, 'learning_rate': 3.6051e-06, 'epoch': 1.81, 'throughput': 499.07}
+
+[INFO|callbacks.py:310] 2024-07-29 17:47:24,291 >> {'loss': 0.0370, 'learning_rate': 3.5959e-06, 'epoch': 1.81, 'throughput': 499.11}
+
+[INFO|callbacks.py:310] 2024-07-29 17:47:37,499 >> {'loss': 0.0783, 'learning_rate': 3.5867e-06, 'epoch': 1.82, 'throughput': 499.05}
+
+[INFO|callbacks.py:310] 2024-07-29 17:47:50,688 >> {'loss': 0.0436, 'learning_rate': 3.5774e-06, 'epoch': 1.83, 'throughput': 499.02}
+
+[INFO|callbacks.py:310] 2024-07-29 17:48:03,887 >> {'loss': 0.0258, 'learning_rate': 3.5682e-06, 'epoch': 1.83, 'throughput': 499.04}
+
+[INFO|callbacks.py:310] 2024-07-29 17:48:17,099 >> {'loss': 0.0235, 'learning_rate': 3.5589e-06, 'epoch': 1.84, 'throughput': 499.01}
+
+[INFO|callbacks.py:310] 2024-07-29 17:48:30,327 >> {'loss': 0.0541, 'learning_rate': 3.5496e-06, 'epoch': 1.85, 'throughput': 499.06}
+
+[INFO|callbacks.py:310] 2024-07-29 17:48:43,526 >> {'loss': 0.0767, 'learning_rate': 3.5402e-06, 'epoch': 1.85, 'throughput': 499.07}
+
+[INFO|callbacks.py:310] 2024-07-29 17:48:56,720 >> {'loss': 0.0558, 'learning_rate': 3.5309e-06, 'epoch': 1.86, 'throughput': 499.03}
+
+[INFO|callbacks.py:310] 2024-07-29 17:49:09,917 >> {'loss': 0.0483, 'learning_rate': 3.5215e-06, 'epoch': 1.87, 'throughput': 498.97}
+
+[INFO|callbacks.py:310] 2024-07-29 17:49:23,101 >> {'loss': 0.0389, 'learning_rate': 3.5121e-06, 'epoch': 1.87, 'throughput': 498.91}
+
+[INFO|callbacks.py:310] 2024-07-29 17:49:36,292 >> {'loss': 0.0340, 'learning_rate': 3.5028e-06, 'epoch': 1.88, 'throughput': 498.95}
+
+[INFO|callbacks.py:310] 2024-07-29 17:49:49,511 >> {'loss': 0.0921, 'learning_rate': 3.4933e-06, 'epoch': 1.89, 'throughput': 498.99}
+
+[INFO|callbacks.py:310] 2024-07-29 17:50:02,728 >> {'loss': 0.0661, 'learning_rate': 3.4839e-06, 'epoch': 1.89, 'throughput': 498.99}
+
+[INFO|callbacks.py:310] 2024-07-29 17:50:15,935 >> {'loss': 0.0301, 'learning_rate': 3.4745e-06, 'epoch': 1.90, 'throughput': 498.97}
+
+[INFO|callbacks.py:310] 2024-07-29 17:50:29,125 >> {'loss': 0.0878, 'learning_rate': 3.4650e-06, 'epoch': 1.91, 'throughput': 498.92}
+
+[INFO|callbacks.py:310] 2024-07-29 17:50:42,323 >> {'loss': 0.1126, 'learning_rate': 3.4555e-06, 'epoch': 1.91, 'throughput': 499.11}
+
+[INFO|callbacks.py:310] 2024-07-29 17:50:55,526 >> {'loss': 0.0679, 'learning_rate': 3.4460e-06, 'epoch': 1.92, 'throughput': 499.05}
+
+[INFO|callbacks.py:310] 2024-07-29 17:51:08,745 >> {'loss': 0.0310, 'learning_rate': 3.4365e-06, 'epoch': 1.92, 'throughput': 499.05}
+
+[INFO|callbacks.py:310] 2024-07-29 17:51:21,930 >> {'loss': 0.0687, 'learning_rate': 3.4270e-06, 'epoch': 1.93, 'throughput': 499.01}
+
+[INFO|callbacks.py:310] 2024-07-29 17:51:35,139 >> {'loss': 0.0497, 'learning_rate': 3.4174e-06, 'epoch': 1.94, 'throughput': 499.10}
+
+[INFO|callbacks.py:310] 2024-07-29 17:51:48,329 >> {'loss': 0.0829, 'learning_rate': 3.4079e-06, 'epoch': 1.94, 'throughput': 499.20}
+
+[INFO|callbacks.py:310] 2024-07-29 17:52:01,530 >> {'loss': 0.0578, 'learning_rate': 3.3983e-06, 'epoch': 1.95, 'throughput': 499.22}
+
+[INFO|callbacks.py:310] 2024-07-29 17:52:14,737 >> {'loss': 0.0398, 'learning_rate': 3.3887e-06, 'epoch': 1.96, 'throughput': 499.24}
+
+[INFO|callbacks.py:310] 2024-07-29 17:52:27,939 >> {'loss': 0.0754, 'learning_rate': 3.3791e-06, 'epoch': 1.96, 'throughput': 499.14}
+
+[INFO|callbacks.py:310] 2024-07-29 17:52:41,147 >> {'loss': 0.0395, 'learning_rate': 3.3695e-06, 'epoch': 1.97, 'throughput': 499.07}
+
+[INFO|callbacks.py:310] 2024-07-29 17:52:54,335 >> {'loss': 0.0575, 'learning_rate': 3.3599e-06, 'epoch': 1.98, 'throughput': 499.06}
+
+[INFO|callbacks.py:310] 2024-07-29 17:53:07,548 >> {'loss': 0.0472, 'learning_rate': 3.3502e-06, 'epoch': 1.98, 'throughput': 498.99}
+
+[INFO|callbacks.py:310] 2024-07-29 17:53:20,750 >> {'loss': 0.0498, 'learning_rate': 3.3406e-06, 'epoch': 1.99, 'throughput': 499.05}
+
+[INFO|callbacks.py:310] 2024-07-29 17:53:33,950 >> {'loss': 0.0447, 'learning_rate': 3.3309e-06, 'epoch': 2.00, 'throughput': 499.07}
+
+[INFO|callbacks.py:310] 2024-07-29 17:53:47,140 >> {'loss': 0.0155, 'learning_rate': 3.3212e-06, 'epoch': 2.00, 'throughput': 499.08}
+
+[INFO|callbacks.py:310] 2024-07-29 17:54:00,338 >> {'loss': 0.0183, 'learning_rate': 3.3115e-06, 'epoch': 2.01, 'throughput': 499.07}
+
+[INFO|callbacks.py:310] 2024-07-29 17:54:13,550 >> {'loss': 0.0102, 'learning_rate': 3.3018e-06, 'epoch': 2.01, 'throughput': 499.09}
+
+[INFO|callbacks.py:310] 2024-07-29 17:54:26,748 >> {'loss': 0.0119, 'learning_rate': 3.2920e-06, 'epoch': 2.02, 'throughput': 499.07}
+
+[INFO|callbacks.py:310] 2024-07-29 17:54:39,942 >> {'loss': 0.0037, 'learning_rate': 3.2823e-06, 'epoch': 2.03, 'throughput': 499.05}
+
+[INFO|callbacks.py:310] 2024-07-29 17:54:53,141 >> {'loss': 0.0048, 'learning_rate': 3.2725e-06, 'epoch': 2.03, 'throughput': 499.07}
+
+[INFO|callbacks.py:310] 2024-07-29 17:55:06,341 >> {'loss': 0.0040, 'learning_rate': 3.2628e-06, 'epoch': 2.04, 'throughput': 499.06}
+
+[INFO|callbacks.py:310] 2024-07-29 17:55:19,555 >> {'loss': 0.0472, 'learning_rate': 3.2530e-06, 'epoch': 2.05, 'throughput': 499.13}
+
+[INFO|callbacks.py:310] 2024-07-29 17:55:32,755 >> {'loss': 0.0008, 'learning_rate': 3.2432e-06, 'epoch': 2.05, 'throughput': 499.08}
+
+[INFO|callbacks.py:310] 2024-07-29 17:55:45,956 >> {'loss': 0.0332, 'learning_rate': 3.2334e-06, 'epoch': 2.06, 'throughput': 499.11}
+
+[INFO|callbacks.py:310] 2024-07-29 17:55:59,157 >> {'loss': 0.0434, 'learning_rate': 3.2236e-06, 'epoch': 2.07, 'throughput': 499.05}
+
+[INFO|callbacks.py:310] 2024-07-29 17:56:12,334 >> {'loss': 0.0105, 'learning_rate': 3.2137e-06, 'epoch': 2.07, 'throughput': 499.03}
+
+[INFO|callbacks.py:310] 2024-07-29 17:56:25,546 >> {'loss': 0.0188, 'learning_rate': 3.2039e-06, 'epoch': 2.08, 'throughput': 498.94}
+
+[INFO|callbacks.py:310] 2024-07-29 17:56:38,770 >> {'loss': 0.0382, 'learning_rate': 3.1940e-06, 'epoch': 2.09, 'throughput': 498.88}
+
+[INFO|callbacks.py:310] 2024-07-29 17:56:51,984 >> {'loss': 0.0002, 'learning_rate': 3.1842e-06, 'epoch': 2.09, 'throughput': 498.87}
+
+[INFO|callbacks.py:310] 2024-07-29 17:57:05,186 >> {'loss': 0.0473, 'learning_rate': 3.1743e-06, 'epoch': 2.10, 'throughput': 498.92}
+
+[INFO|callbacks.py:310] 2024-07-29 17:57:18,382 >> {'loss': 0.0008, 'learning_rate': 3.1644e-06, 'epoch': 2.10, 'throughput': 498.89}
+
+[INFO|callbacks.py:310] 2024-07-29 17:57:31,578 >> {'loss': 0.0123, 'learning_rate': 3.1545e-06, 'epoch': 2.11, 'throughput': 498.91}
+
+[INFO|callbacks.py:310] 2024-07-29 17:57:44,771 >> {'loss': 0.0300, 'learning_rate': 3.1446e-06, 'epoch': 2.12, 'throughput': 498.90}
+
+[INFO|callbacks.py:310] 2024-07-29 17:57:57,984 >> {'loss': 0.0207, 'learning_rate': 3.1346e-06, 'epoch': 2.12, 'throughput': 498.81}
+
+[INFO|callbacks.py:310] 2024-07-29 17:58:11,198 >> {'loss': 0.0172, 'learning_rate': 3.1247e-06, 'epoch': 2.13, 'throughput': 498.84}
+
+[INFO|callbacks.py:310] 2024-07-29 17:58:24,402 >> {'loss': 0.0115, 'learning_rate': 3.1148e-06, 'epoch': 2.14, 'throughput': 498.85}
+
+[INFO|callbacks.py:310] 2024-07-29 17:58:37,618 >> {'loss': 0.0217, 'learning_rate': 3.1048e-06, 'epoch': 2.14, 'throughput': 498.77}
+
+[INFO|callbacks.py:310] 2024-07-29 17:58:50,821 >> {'loss': 0.0035, 'learning_rate': 3.0948e-06, 'epoch': 2.15, 'throughput': 498.73}
+
+[INFO|callbacks.py:310] 2024-07-29 17:59:04,030 >> {'loss': 0.0426, 'learning_rate': 3.0849e-06, 'epoch': 2.16, 'throughput': 498.72}
+
+[INFO|callbacks.py:310] 2024-07-29 17:59:17,234 >> {'loss': 0.0382, 'learning_rate': 3.0749e-06, 'epoch': 2.16, 'throughput': 498.69}
+
+[INFO|callbacks.py:310] 2024-07-29 17:59:30,447 >> {'loss': 0.0082, 'learning_rate': 3.0649e-06, 'epoch': 2.17, 'throughput': 498.76}
+
+[INFO|callbacks.py:310] 2024-07-29 17:59:43,657 >> {'loss': 0.0338, 'learning_rate': 3.0549e-06, 'epoch': 2.18, 'throughput': 498.76}
+
+[INFO|callbacks.py:310] 2024-07-29 17:59:56,869 >> {'loss': 0.0141, 'learning_rate': 3.0449e-06, 'epoch': 2.18, 'throughput': 498.73}
+
+[INFO|callbacks.py:310] 2024-07-29 18:00:10,066 >> {'loss': 0.0287, 'learning_rate': 3.0348e-06, 'epoch': 2.19, 'throughput': 498.75}
+
+[INFO|callbacks.py:310] 2024-07-29 18:00:23,278 >> {'loss': 0.0027, 'learning_rate': 3.0248e-06, 'epoch': 2.19, 'throughput': 498.73}
+
+[INFO|callbacks.py:310] 2024-07-29 18:00:36,465 >> {'loss': 0.0026, 'learning_rate': 3.0148e-06, 'epoch': 2.20, 'throughput': 498.73}
+
+[INFO|callbacks.py:310] 2024-07-29 18:00:49,672 >> {'loss': 0.0390, 'learning_rate': 3.0047e-06, 'epoch': 2.21, 'throughput': 498.77}
+
+[INFO|callbacks.py:310] 2024-07-29 18:01:02,866 >> {'loss': 0.0154, 'learning_rate': 2.9946e-06, 'epoch': 2.21, 'throughput': 498.78}
+
+[INFO|callbacks.py:310] 2024-07-29 18:01:16,084 >> {'loss': 0.0231, 'learning_rate': 2.9846e-06, 'epoch': 2.22, 'throughput': 498.87}
+
+[INFO|callbacks.py:310] 2024-07-29 18:01:29,274 >> {'loss': 0.0400, 'learning_rate': 2.9745e-06, 'epoch': 2.23, 'throughput': 498.90}
+
+[INFO|callbacks.py:310] 2024-07-29 18:01:42,472 >> {'loss': 0.0282, 'learning_rate': 2.9644e-06, 'epoch': 2.23, 'throughput': 498.90}
+
+[INFO|callbacks.py:310] 2024-07-29 18:01:55,678 >> {'loss': 0.0143, 'learning_rate': 2.9543e-06, 'epoch': 2.24, 'throughput': 498.96}
+
+[INFO|callbacks.py:310] 2024-07-29 18:02:08,889 >> {'loss': 0.0194, 'learning_rate': 2.9442e-06, 'epoch': 2.25, 'throughput': 498.96}
+
+[INFO|callbacks.py:310] 2024-07-29 18:02:22,066 >> {'loss': 0.0051, 'learning_rate': 2.9341e-06, 'epoch': 2.25, 'throughput': 498.93}
+
+[INFO|callbacks.py:310] 2024-07-29 18:02:35,267 >> {'loss': 0.0162, 'learning_rate': 2.9240e-06, 'epoch': 2.26, 'throughput': 498.94}
+
+[INFO|callbacks.py:310] 2024-07-29 18:02:48,484 >> {'loss': 0.0313, 'learning_rate': 2.9139e-06, 'epoch': 2.27, 'throughput': 498.93}
+
+[INFO|callbacks.py:310] 2024-07-29 18:03:01,690 >> {'loss': 0.0041, 'learning_rate': 2.9038e-06, 'epoch': 2.27, 'throughput': 498.91}
+
+[INFO|callbacks.py:310] 2024-07-29 18:03:14,887 >> {'loss': 0.0321, 'learning_rate': 2.8936e-06, 'epoch': 2.28, 'throughput': 498.88}
+
+[INFO|callbacks.py:310] 2024-07-29 18:03:28,088 >> {'loss': 0.0064, 'learning_rate': 2.8835e-06, 'epoch': 2.28, 'throughput': 498.92}
+
+[INFO|callbacks.py:310] 2024-07-29 18:03:41,301 >> {'loss': 0.0237, 'learning_rate': 2.8733e-06, 'epoch': 2.29, 'throughput': 498.90}
+
+[INFO|callbacks.py:310] 2024-07-29 18:03:54,506 >> {'loss': 0.0152, 'learning_rate': 2.8632e-06, 'epoch': 2.30, 'throughput': 498.83}
+
+[INFO|callbacks.py:310] 2024-07-29 18:04:07,699 >> {'loss': 0.0038, 'learning_rate': 2.8530e-06, 'epoch': 2.30, 'throughput': 498.81}
+
+[INFO|callbacks.py:310] 2024-07-29 18:04:20,908 >> {'loss': 0.0168, 'learning_rate': 2.8428e-06, 'epoch': 2.31, 'throughput': 498.83}
+
+[INFO|callbacks.py:310] 2024-07-29 18:04:34,121 >> {'loss': 0.0049, 'learning_rate': 2.8327e-06, 'epoch': 2.32, 'throughput': 498.88}
+
+[INFO|callbacks.py:310] 2024-07-29 18:04:47,315 >> {'loss': 0.0114, 'learning_rate': 2.8225e-06, 'epoch': 2.32, 'throughput': 498.82}
+
+[INFO|callbacks.py:310] 2024-07-29 18:05:00,517 >> {'loss': 0.0220, 'learning_rate': 2.8123e-06, 'epoch': 2.33, 'throughput': 498.83}
+
+[INFO|callbacks.py:310] 2024-07-29 18:05:13,715 >> {'loss': 0.0154, 'learning_rate': 2.8021e-06, 'epoch': 2.34, 'throughput': 498.82}
+
+[INFO|callbacks.py:310] 2024-07-29 18:05:26,935 >> {'loss': 0.0204, 'learning_rate': 2.7919e-06, 'epoch': 2.34, 'throughput': 498.82}
+
+[INFO|callbacks.py:310] 2024-07-29 18:05:40,131 >> {'loss': 0.0334, 'learning_rate': 2.7817e-06, 'epoch': 2.35, 'throughput': 498.82}
+
+[INFO|callbacks.py:310] 2024-07-29 18:05:53,317 >> {'loss': 0.0132, 'learning_rate': 2.7715e-06, 'epoch': 2.36, 'throughput': 498.79}
+
+[INFO|callbacks.py:310] 2024-07-29 18:06:06,529 >> {'loss': 0.0291, 'learning_rate': 2.7613e-06, 'epoch': 2.36, 'throughput': 498.84}
+
+[INFO|callbacks.py:310] 2024-07-29 18:06:19,726 >> {'loss': 0.0089, 'learning_rate': 2.7511e-06, 'epoch': 2.37, 'throughput': 498.96}
+
+[INFO|callbacks.py:310] 2024-07-29 18:06:32,929 >> {'loss': 0.0306, 'learning_rate': 2.7409e-06, 'epoch': 2.37, 'throughput': 498.90}
+
+[INFO|callbacks.py:310] 2024-07-29 18:06:46,144 >> {'loss': 0.0297, 'learning_rate': 2.7307e-06, 'epoch': 2.38, 'throughput': 498.93}
+
+[INFO|callbacks.py:310] 2024-07-29 18:06:59,353 >> {'loss': 0.0247, 'learning_rate': 2.7204e-06, 'epoch': 2.39, 'throughput': 498.97}
+
+[INFO|callbacks.py:310] 2024-07-29 18:07:12,561 >> {'loss': 0.0234, 'learning_rate': 2.7102e-06, 'epoch': 2.39, 'throughput': 498.96}
+
+[INFO|callbacks.py:310] 2024-07-29 18:07:25,748 >> {'loss': 0.0192, 'learning_rate': 2.7000e-06, 'epoch': 2.40, 'throughput': 499.00}
+
+[INFO|callbacks.py:310] 2024-07-29 18:07:38,961 >> {'loss': 0.0324, 'learning_rate': 2.6898e-06, 'epoch': 2.41, 'throughput': 498.99}
+
+[INFO|callbacks.py:310] 2024-07-29 18:07:52,167 >> {'loss': 0.0332, 'learning_rate': 2.6795e-06, 'epoch': 2.41, 'throughput': 499.02}
+
+[INFO|callbacks.py:310] 2024-07-29 18:08:05,370 >> {'loss': 0.0043, 'learning_rate': 2.6693e-06, 'epoch': 2.42, 'throughput': 499.02}
+
+[INFO|callbacks.py:310] 2024-07-29 18:08:18,568 >> {'loss': 0.0197, 'learning_rate': 2.6590e-06, 'epoch': 2.43, 'throughput': 499.08}
+
+[INFO|callbacks.py:310] 2024-07-29 18:08:31,768 >> {'loss': 0.0366, 'learning_rate': 2.6488e-06, 'epoch': 2.43, 'throughput': 499.15}
+
+[INFO|callbacks.py:310] 2024-07-29 18:08:44,976 >> {'loss': 0.0173, 'learning_rate': 2.6385e-06, 'epoch': 2.44, 'throughput': 499.21}
+
+[INFO|callbacks.py:310] 2024-07-29 18:08:58,164 >> {'loss': 0.0102, 'learning_rate': 2.6283e-06, 'epoch': 2.45, 'throughput': 499.22}
+
+[INFO|callbacks.py:310] 2024-07-29 18:09:11,378 >> {'loss': 0.0361, 'learning_rate': 2.6180e-06, 'epoch': 2.45, 'throughput': 499.27}
+
+[INFO|callbacks.py:310] 2024-07-29 18:09:24,575 >> {'loss': 0.0292, 'learning_rate': 2.6078e-06, 'epoch': 2.46, 'throughput': 499.26}
+
+[INFO|callbacks.py:310] 2024-07-29 18:09:37,765 >> {'loss': 0.0095, 'learning_rate': 2.5975e-06, 'epoch': 2.47, 'throughput': 499.28}
+
+[INFO|callbacks.py:310] 2024-07-29 18:09:50,976 >> {'loss': 0.0150, 'learning_rate': 2.5872e-06, 'epoch': 2.47, 'throughput': 499.34}
+
+[INFO|callbacks.py:310] 2024-07-29 18:10:04,177 >> {'loss': 0.0276, 'learning_rate': 2.5770e-06, 'epoch': 2.48, 'throughput': 499.37}
+
+[INFO|callbacks.py:310] 2024-07-29 18:10:17,374 >> {'loss': 0.0284, 'learning_rate': 2.5667e-06, 'epoch': 2.48, 'throughput': 499.37}
+
+[INFO|callbacks.py:310] 2024-07-29 18:10:30,575 >> {'loss': 0.0366, 'learning_rate': 2.5565e-06, 'epoch': 2.49, 'throughput': 499.37}
+
+[INFO|callbacks.py:310] 2024-07-29 18:10:43,776 >> {'loss': 0.0322, 'learning_rate': 2.5462e-06, 'epoch': 2.50, 'throughput': 499.35}
+
+[INFO|callbacks.py:310] 2024-07-29 18:10:56,981 >> {'loss': 0.0057, 'learning_rate': 2.5359e-06, 'epoch': 2.50, 'throughput': 499.31}
+
+[INFO|callbacks.py:310] 2024-07-29 18:11:10,184 >> {'loss': 0.0054, 'learning_rate': 2.5257e-06, 'epoch': 2.51, 'throughput': 499.29}
+
+[INFO|callbacks.py:310] 2024-07-29 18:11:23,371 >> {'loss': 0.0199, 'learning_rate': 2.5154e-06, 'epoch': 2.52, 'throughput': 499.37}
+
+[INFO|callbacks.py:310] 2024-07-29 18:11:36,566 >> {'loss': 0.0045, 'learning_rate': 2.5051e-06, 'epoch': 2.52, 'throughput': 499.35}
+
+[INFO|callbacks.py:310] 2024-07-29 18:11:49,768 >> {'loss': 0.0283, 'learning_rate': 2.4949e-06, 'epoch': 2.53, 'throughput': 499.35}
+
+[INFO|callbacks.py:310] 2024-07-29 18:12:02,974 >> {'loss': 0.0248, 'learning_rate': 2.4846e-06, 'epoch': 2.54, 'throughput': 499.35}
+
+[INFO|callbacks.py:310] 2024-07-29 18:12:16,158 >> {'loss': 0.0132, 'learning_rate': 2.4743e-06, 'epoch': 2.54, 'throughput': 499.33}
+
+[INFO|callbacks.py:310] 2024-07-29 18:12:29,374 >> {'loss': 0.0400, 'learning_rate': 2.4641e-06, 'epoch': 2.55, 'throughput': 499.40}
+
+[INFO|callbacks.py:310] 2024-07-29 18:12:42,568 >> {'loss': 0.0029, 'learning_rate': 2.4538e-06, 'epoch': 2.56, 'throughput': 499.36}
+
+[INFO|callbacks.py:310] 2024-07-29 18:12:55,762 >> {'loss': 0.0254, 'learning_rate': 2.4435e-06, 'epoch': 2.56, 'throughput': 499.38}
+
+[INFO|callbacks.py:310] 2024-07-29 18:13:08,966 >> {'loss': 0.0108, 'learning_rate': 2.4333e-06, 'epoch': 2.57, 'throughput': 499.41}
+
+[INFO|callbacks.py:310] 2024-07-29 18:13:22,165 >> {'loss': 0.0269, 'learning_rate': 2.4230e-06, 'epoch': 2.57, 'throughput': 499.36}
+
+[INFO|callbacks.py:310] 2024-07-29 18:13:35,376 >> {'loss': 0.0111, 'learning_rate': 2.4128e-06, 'epoch': 2.58, 'throughput': 499.40}
+
+[INFO|callbacks.py:310] 2024-07-29 18:13:48,568 >> {'loss': 0.0222, 'learning_rate': 2.4025e-06, 'epoch': 2.59, 'throughput': 499.38}
+
+[INFO|callbacks.py:310] 2024-07-29 18:14:01,780 >> {'loss': 0.0101, 'learning_rate': 2.3922e-06, 'epoch': 2.59, 'throughput': 499.39}
+
+[INFO|callbacks.py:310] 2024-07-29 18:14:14,991 >> {'loss': 0.0138, 'learning_rate': 2.3820e-06, 'epoch': 2.60, 'throughput': 499.42}
+
+[INFO|callbacks.py:310] 2024-07-29 18:14:28,194 >> {'loss': 0.0281, 'learning_rate': 2.3717e-06, 'epoch': 2.61, 'throughput': 499.37}
+
+[INFO|callbacks.py:310] 2024-07-29 18:14:41,383 >> {'loss': 0.0042, 'learning_rate': 2.3615e-06, 'epoch': 2.61, 'throughput': 499.42}
+
+[INFO|callbacks.py:310] 2024-07-29 18:14:54,582 >> {'loss': 0.0015, 'learning_rate': 2.3512e-06, 'epoch': 2.62, 'throughput': 499.46}
+
+[INFO|callbacks.py:310] 2024-07-29 18:15:07,768 >> {'loss': 0.0279, 'learning_rate': 2.3410e-06, 'epoch': 2.63, 'throughput': 499.51}
+
+[INFO|callbacks.py:310] 2024-07-29 18:15:20,984 >> {'loss': 0.0399, 'learning_rate': 2.3307e-06, 'epoch': 2.63, 'throughput': 499.44}
+
+[INFO|callbacks.py:310] 2024-07-29 18:15:34,184 >> {'loss': 0.0074, 'learning_rate': 2.3205e-06, 'epoch': 2.64, 'throughput': 499.52}
+
+[INFO|callbacks.py:310] 2024-07-29 18:15:47,400 >> {'loss': 0.0078, 'learning_rate': 2.3102e-06, 'epoch': 2.65, 'throughput': 499.50}
+
+[INFO|callbacks.py:310] 2024-07-29 18:16:00,615 >> {'loss': 0.0406, 'learning_rate': 2.3000e-06, 'epoch': 2.65, 'throughput': 499.50}
+
+[INFO|callbacks.py:310] 2024-07-29 18:16:13,820 >> {'loss': 0.0023, 'learning_rate': 2.2898e-06, 'epoch': 2.66, 'throughput': 499.38}
+
+[INFO|callbacks.py:310] 2024-07-29 18:16:27,016 >> {'loss': 0.0060, 'learning_rate': 2.2796e-06, 'epoch': 2.66, 'throughput': 499.42}
+
+[INFO|callbacks.py:310] 2024-07-29 18:16:40,217 >> {'loss': 0.0236, 'learning_rate': 2.2693e-06, 'epoch': 2.67, 'throughput': 499.46}
+
+[INFO|callbacks.py:310] 2024-07-29 18:16:53,430 >> {'loss': 0.0136, 'learning_rate': 2.2591e-06, 'epoch': 2.68, 'throughput': 499.45}
+
+[INFO|callbacks.py:310] 2024-07-29 18:17:06,628 >> {'loss': 0.0150, 'learning_rate': 2.2489e-06, 'epoch': 2.68, 'throughput': 499.52}
+
+[INFO|callbacks.py:310] 2024-07-29 18:17:19,840 >> {'loss': 0.0483, 'learning_rate': 2.2387e-06, 'epoch': 2.69, 'throughput': 499.48}
+
+[INFO|callbacks.py:310] 2024-07-29 18:17:33,062 >> {'loss': 0.0014, 'learning_rate': 2.2285e-06, 'epoch': 2.70, 'throughput': 499.47}
+
+[INFO|callbacks.py:310] 2024-07-29 18:17:46,264 >> {'loss': 0.0279, 'learning_rate': 2.2183e-06, 'epoch': 2.70, 'throughput': 499.45}
+
+[INFO|callbacks.py:310] 2024-07-29 18:17:59,464 >> {'loss': 0.0705, 'learning_rate': 2.2081e-06, 'epoch': 2.71, 'throughput': 499.48}
+
+[INFO|callbacks.py:310] 2024-07-29 18:18:12,659 >> {'loss': 0.0050, 'learning_rate': 2.1979e-06, 'epoch': 2.72, 'throughput': 499.55}
+
+[INFO|callbacks.py:310] 2024-07-29 18:18:25,868 >> {'loss': 0.0453, 'learning_rate': 2.1877e-06, 'epoch': 2.72, 'throughput': 499.52}
+
+[INFO|callbacks.py:310] 2024-07-29 18:18:39,044 >> {'loss': 0.0095, 'learning_rate': 2.1775e-06, 'epoch': 2.73, 'throughput': 499.55}
+
+[INFO|callbacks.py:310] 2024-07-29 18:18:52,269 >> {'loss': 0.0095, 'learning_rate': 2.1673e-06, 'epoch': 2.74, 'throughput': 499.48}
+
+[INFO|callbacks.py:310] 2024-07-29 18:19:05,477 >> {'loss': 0.0138, 'learning_rate': 2.1572e-06, 'epoch': 2.74, 'throughput': 499.49}
+
+[INFO|callbacks.py:310] 2024-07-29 18:19:18,684 >> {'loss': 0.0272, 'learning_rate': 2.1470e-06, 'epoch': 2.75, 'throughput': 499.43}
+
+[INFO|callbacks.py:310] 2024-07-29 18:19:31,887 >> {'loss': 0.0026, 'learning_rate': 2.1368e-06, 'epoch': 2.75, 'throughput': 499.46}
+
+[INFO|callbacks.py:310] 2024-07-29 18:19:45,089 >> {'loss': 0.0016, 'learning_rate': 2.1267e-06, 'epoch': 2.76, 'throughput': 499.52}
+
+[INFO|callbacks.py:310] 2024-07-29 18:19:58,277 >> {'loss': 0.0633, 'learning_rate': 2.1165e-06, 'epoch': 2.77, 'throughput': 499.56}
+
+[INFO|callbacks.py:310] 2024-07-29 18:20:11,475 >> {'loss': 0.0191, 'learning_rate': 2.1064e-06, 'epoch': 2.77, 'throughput': 499.54}
+
+[INFO|callbacks.py:310] 2024-07-29 18:20:24,692 >> {'loss': 0.0075, 'learning_rate': 2.0962e-06, 'epoch': 2.78, 'throughput': 499.51}
+
+[INFO|callbacks.py:310] 2024-07-29 18:20:37,900 >> {'loss': 0.0371, 'learning_rate': 2.0861e-06, 'epoch': 2.79, 'throughput': 499.46}
+
+[INFO|callbacks.py:310] 2024-07-29 18:20:51,097 >> {'loss': 0.0275, 'learning_rate': 2.0760e-06, 'epoch': 2.79, 'throughput': 499.45}
+
+[INFO|callbacks.py:310] 2024-07-29 18:21:04,304 >> {'loss': 0.0441, 'learning_rate': 2.0659e-06, 'epoch': 2.80, 'throughput': 499.47}
+
+[INFO|callbacks.py:310] 2024-07-29 18:21:17,494 >> {'loss': 0.0074, 'learning_rate': 2.0558e-06, 'epoch': 2.81, 'throughput': 499.43}
+
+[INFO|callbacks.py:310] 2024-07-29 18:21:30,695 >> {'loss': 0.0412, 'learning_rate': 2.0457e-06, 'epoch': 2.81, 'throughput': 499.47}
+
+[INFO|callbacks.py:310] 2024-07-29 18:21:43,901 >> {'loss': 0.0070, 'learning_rate': 2.0356e-06, 'epoch': 2.82, 'throughput': 499.41}
+
+[INFO|callbacks.py:310] 2024-07-29 18:21:57,096 >> {'loss': 0.0195, 'learning_rate': 2.0255e-06, 'epoch': 2.83, 'throughput': 499.43}
+
+[INFO|callbacks.py:310] 2024-07-29 18:22:10,303 >> {'loss': 0.0104, 'learning_rate': 2.0154e-06, 'epoch': 2.83, 'throughput': 499.42}
+
+[INFO|callbacks.py:310] 2024-07-29 18:22:23,500 >> {'loss': 0.0196, 'learning_rate': 2.0054e-06, 'epoch': 2.84, 'throughput': 499.43}
+
+[INFO|callbacks.py:310] 2024-07-29 18:22:36,701 >> {'loss': 0.0131, 'learning_rate': 1.9953e-06, 'epoch': 2.84, 'throughput': 499.40}
+
+[INFO|callbacks.py:310] 2024-07-29 18:22:49,913 >> {'loss': 0.0072, 'learning_rate': 1.9852e-06, 'epoch': 2.85, 'throughput': 499.39}
+
+[INFO|callbacks.py:310] 2024-07-29 18:23:03,115 >> {'loss': 0.0058, 'learning_rate': 1.9752e-06, 'epoch': 2.86, 'throughput': 499.43}
+
+[INFO|callbacks.py:310] 2024-07-29 18:23:16,324 >> {'loss': 0.0242, 'learning_rate': 1.9652e-06, 'epoch': 2.86, 'throughput': 499.48}
+
+[INFO|callbacks.py:310] 2024-07-29 18:23:29,511 >> {'loss': 0.0323, 'learning_rate': 1.9551e-06, 'epoch': 2.87, 'throughput': 499.51}
+
+[INFO|callbacks.py:310] 2024-07-29 18:23:42,722 >> {'loss': 0.0186, 'learning_rate': 1.9451e-06, 'epoch': 2.88, 'throughput': 499.49}
+
+[INFO|callbacks.py:310] 2024-07-29 18:23:55,935 >> {'loss': 0.0071, 'learning_rate': 1.9351e-06, 'epoch': 2.88, 'throughput': 499.54}
+
+[INFO|callbacks.py:310] 2024-07-29 18:24:09,140 >> {'loss': 0.0177, 'learning_rate': 1.9251e-06, 'epoch': 2.89, 'throughput': 499.53}
+
+[INFO|callbacks.py:310] 2024-07-29 18:24:22,332 >> {'loss': 0.0440, 'learning_rate': 1.9151e-06, 'epoch': 2.90, 'throughput': 499.59}
+
+[INFO|callbacks.py:310] 2024-07-29 18:24:35,531 >> {'loss': 0.0119, 'learning_rate': 1.9052e-06, 'epoch': 2.90, 'throughput': 499.65}
+
+[INFO|callbacks.py:310] 2024-07-29 18:24:48,739 >> {'loss': 0.0121, 'learning_rate': 1.8952e-06, 'epoch': 2.91, 'throughput': 499.65}
+
+[INFO|callbacks.py:310] 2024-07-29 18:25:01,946 >> {'loss': 0.0319, 'learning_rate': 1.8852e-06, 'epoch': 2.92, 'throughput': 499.62}
+
+[INFO|callbacks.py:310] 2024-07-29 18:25:15,150 >> {'loss': 0.0024, 'learning_rate': 1.8753e-06, 'epoch': 2.92, 'throughput': 499.62}
+
+[INFO|callbacks.py:310] 2024-07-29 18:25:28,360 >> {'loss': 0.0102, 'learning_rate': 1.8654e-06, 'epoch': 2.93, 'throughput': 499.63}
+
+[INFO|callbacks.py:310] 2024-07-29 18:25:41,560 >> {'loss': 0.0419, 'learning_rate': 1.8554e-06, 'epoch': 2.93, 'throughput': 499.65}
+
+[INFO|callbacks.py:310] 2024-07-29 18:25:54,768 >> {'loss': 0.0109, 'learning_rate': 1.8455e-06, 'epoch': 2.94, 'throughput': 499.65}
+
+[INFO|callbacks.py:310] 2024-07-29 18:26:07,972 >> {'loss': 0.0092, 'learning_rate': 1.8356e-06, 'epoch': 2.95, 'throughput': 499.67}
+
+[INFO|callbacks.py:310] 2024-07-29 18:26:21,175 >> {'loss': 0.0449, 'learning_rate': 1.8257e-06, 'epoch': 2.95, 'throughput': 499.71}
+
+[INFO|callbacks.py:310] 2024-07-29 18:26:34,363 >> {'loss': 0.0140, 'learning_rate': 1.8158e-06, 'epoch': 2.96, 'throughput': 499.72}
+
+[INFO|callbacks.py:310] 2024-07-29 18:26:47,565 >> {'loss': 0.0035, 'learning_rate': 1.8060e-06, 'epoch': 2.97, 'throughput': 499.74}
+
+[INFO|callbacks.py:310] 2024-07-29 18:27:00,760 >> {'loss': 0.0160, 'learning_rate': 1.7961e-06, 'epoch': 2.97, 'throughput': 499.72}
+
+[INFO|callbacks.py:310] 2024-07-29 18:27:13,969 >> {'loss': 0.0151, 'learning_rate': 1.7863e-06, 'epoch': 2.98, 'throughput': 499.76}
+
+[INFO|callbacks.py:310] 2024-07-29 18:27:27,186 >> {'loss': 0.0039, 'learning_rate': 1.7764e-06, 'epoch': 2.99, 'throughput': 499.79}
+
+[INFO|callbacks.py:310] 2024-07-29 18:27:40,383 >> {'loss': 0.0361, 'learning_rate': 1.7666e-06, 'epoch': 2.99, 'throughput': 499.77}
+
+[INFO|callbacks.py:310] 2024-07-29 18:27:53,576 >> {'loss': 0.0609, 'learning_rate': 1.7568e-06, 'epoch': 3.00, 'throughput': 499.80}
+
+[INFO|callbacks.py:310] 2024-07-29 18:28:06,806 >> {'loss': 0.0027, 'learning_rate': 1.7470e-06, 'epoch': 3.01, 'throughput': 499.69}
+
+[INFO|callbacks.py:310] 2024-07-29 18:28:19,994 >> {'loss': 0.0097, 'learning_rate': 1.7372e-06, 'epoch': 3.01, 'throughput': 499.75}
+
+[INFO|callbacks.py:310] 2024-07-29 18:28:33,215 >> {'loss': 0.0091, 'learning_rate': 1.7275e-06, 'epoch': 3.02, 'throughput': 499.75}
+
+[INFO|callbacks.py:310] 2024-07-29 18:28:46,423 >> {'loss': 0.0026, 'learning_rate': 1.7177e-06, 'epoch': 3.02, 'throughput': 499.72}
+
+[INFO|callbacks.py:310] 2024-07-29 18:28:59,638 >> {'loss': 0.0311, 'learning_rate': 1.7080e-06, 'epoch': 3.03, 'throughput': 499.73}
+
+[INFO|callbacks.py:310] 2024-07-29 18:29:12,835 >> {'loss': 0.0014, 'learning_rate': 1.6982e-06, 'epoch': 3.04, 'throughput': 499.70}
+
+[INFO|callbacks.py:310] 2024-07-29 18:29:26,036 >> {'loss': 0.0185, 'learning_rate': 1.6885e-06, 'epoch': 3.04, 'throughput': 499.66}
+
+[INFO|callbacks.py:310] 2024-07-29 18:29:39,230 >> {'loss': 0.0075, 'learning_rate': 1.6788e-06, 'epoch': 3.05, 'throughput': 499.68}
+
+[INFO|callbacks.py:310] 2024-07-29 18:29:52,420 >> {'loss': 0.0069, 'learning_rate': 1.6691e-06, 'epoch': 3.06, 'throughput': 499.72}
+
+[INFO|callbacks.py:310] 2024-07-29 18:30:05,637 >> {'loss': 0.0009, 'learning_rate': 1.6594e-06, 'epoch': 3.06, 'throughput': 499.71}
+
+[INFO|callbacks.py:310] 2024-07-29 18:30:18,859 >> {'loss': 0.0015, 'learning_rate': 1.6498e-06, 'epoch': 3.07, 'throughput': 499.76}
+
+[INFO|callbacks.py:310] 2024-07-29 18:30:32,054 >> {'loss': 0.0007, 'learning_rate': 1.6401e-06, 'epoch': 3.08, 'throughput': 499.75}
+
+[INFO|callbacks.py:310] 2024-07-29 18:30:45,260 >> {'loss': 0.0012, 'learning_rate': 1.6305e-06, 'epoch': 3.08, 'throughput': 499.74}
+
+[INFO|callbacks.py:310] 2024-07-29 18:30:58,454 >> {'loss': 0.0018, 'learning_rate': 1.6209e-06, 'epoch': 3.09, 'throughput': 499.79}
+
+[INFO|callbacks.py:310] 2024-07-29 18:31:11,654 >> {'loss': 0.0003, 'learning_rate': 1.6113e-06, 'epoch': 3.10, 'throughput': 499.79}
+
+[INFO|callbacks.py:310] 2024-07-29 18:31:24,840 >> {'loss': 0.0044, 'learning_rate': 1.6017e-06, 'epoch': 3.10, 'throughput': 499.75}
+
+[INFO|callbacks.py:310] 2024-07-29 18:31:38,032 >> {'loss': 0.0019, 'learning_rate': 1.5921e-06, 'epoch': 3.11, 'throughput': 499.73}
+
+[INFO|callbacks.py:310] 2024-07-29 18:31:51,232 >> {'loss': 0.0059, 'learning_rate': 1.5826e-06, 'epoch': 3.12, 'throughput': 499.70}
+
+[INFO|callbacks.py:310] 2024-07-29 18:32:04,444 >> {'loss': 0.0036, 'learning_rate': 1.5730e-06, 'epoch': 3.12, 'throughput': 499.71}
+
+[INFO|callbacks.py:310] 2024-07-29 18:32:17,643 >> {'loss': 0.0002, 'learning_rate': 1.5635e-06, 'epoch': 3.13, 'throughput': 499.71}
+
+[INFO|callbacks.py:310] 2024-07-29 18:32:30,851 >> {'loss': 0.0019, 'learning_rate': 1.5540e-06, 'epoch': 3.13, 'throughput': 499.74}
+
+[INFO|callbacks.py:310] 2024-07-29 18:32:44,059 >> {'loss': 0.0009, 'learning_rate': 1.5445e-06, 'epoch': 3.14, 'throughput': 499.76}
+
+[INFO|callbacks.py:310] 2024-07-29 18:32:57,264 >> {'loss': 0.0318, 'learning_rate': 1.5350e-06, 'epoch': 3.15, 'throughput': 499.77}
+
+[INFO|callbacks.py:310] 2024-07-29 18:33:10,448 >> {'loss': 0.0216, 'learning_rate': 1.5255e-06, 'epoch': 3.15, 'throughput': 499.79}
+
+[INFO|callbacks.py:310] 2024-07-29 18:33:23,667 >> {'loss': 0.0004, 'learning_rate': 1.5161e-06, 'epoch': 3.16, 'throughput': 499.81}
+
+[INFO|callbacks.py:310] 2024-07-29 18:33:36,881 >> {'loss': 0.0286, 'learning_rate': 1.5067e-06, 'epoch': 3.17, 'throughput': 499.81}
+
+[INFO|callbacks.py:310] 2024-07-29 18:33:50,074 >> {'loss': 0.0254, 'learning_rate': 1.4972e-06, 'epoch': 3.17, 'throughput': 499.85}
+
+[INFO|callbacks.py:310] 2024-07-29 18:34:03,269 >> {'loss': 0.0039, 'learning_rate': 1.4879e-06, 'epoch': 3.18, 'throughput': 499.82}
+
+[INFO|callbacks.py:310] 2024-07-29 18:34:16,473 >> {'loss': 0.0005, 'learning_rate': 1.4785e-06, 'epoch': 3.19, 'throughput': 499.80}
+
+[INFO|callbacks.py:310] 2024-07-29 18:34:29,674 >> {'loss': 0.0196, 'learning_rate': 1.4691e-06, 'epoch': 3.19, 'throughput': 499.76}
+
+[INFO|callbacks.py:310] 2024-07-29 18:34:42,863 >> {'loss': 0.0015, 'learning_rate': 1.4598e-06, 'epoch': 3.20, 'throughput': 499.72}
+
+[INFO|callbacks.py:310] 2024-07-29 18:34:56,064 >> {'loss': 0.0088, 'learning_rate': 1.4504e-06, 'epoch': 3.21, 'throughput': 499.77}
+
+[INFO|callbacks.py:310] 2024-07-29 18:35:09,278 >> {'loss': 0.0023, 'learning_rate': 1.4411e-06, 'epoch': 3.21, 'throughput': 499.76}
+
+[INFO|callbacks.py:310] 2024-07-29 18:35:22,465 >> {'loss': 0.0071, 'learning_rate': 1.4318e-06, 'epoch': 3.22, 'throughput': 499.86}
+
+[INFO|callbacks.py:310] 2024-07-29 18:35:35,664 >> {'loss': 0.0006, 'learning_rate': 1.4226e-06, 'epoch': 3.22, 'throughput': 499.83}
+
+[INFO|callbacks.py:310] 2024-07-29 18:35:48,853 >> {'loss': 0.0011, 'learning_rate': 1.4133e-06, 'epoch': 3.23, 'throughput': 499.85}
+
+[INFO|callbacks.py:310] 2024-07-29 18:36:02,055 >> {'loss': 0.0057, 'learning_rate': 1.4041e-06, 'epoch': 3.24, 'throughput': 499.83}
+
+[INFO|callbacks.py:310] 2024-07-29 18:36:15,250 >> {'loss': 0.0006, 'learning_rate': 1.3949e-06, 'epoch': 3.24, 'throughput': 499.83}
+
+[INFO|callbacks.py:310] 2024-07-29 18:36:28,454 >> {'loss': 0.0066, 'learning_rate': 1.3857e-06, 'epoch': 3.25, 'throughput': 499.82}
+
+[INFO|callbacks.py:310] 2024-07-29 18:36:41,661 >> {'loss': 0.0079, 'learning_rate': 1.3765e-06, 'epoch': 3.26, 'throughput': 499.82}
+
+[INFO|callbacks.py:310] 2024-07-29 18:36:54,849 >> {'loss': 0.0215, 'learning_rate': 1.3673e-06, 'epoch': 3.26, 'throughput': 499.82}
+
+[INFO|callbacks.py:310] 2024-07-29 18:37:08,041 >> {'loss': 0.0031, 'learning_rate': 1.3582e-06, 'epoch': 3.27, 'throughput': 499.84}
+
+[INFO|callbacks.py:310] 2024-07-29 18:37:21,264 >> {'loss': 0.0024, 'learning_rate': 1.3490e-06, 'epoch': 3.28, 'throughput': 499.85}
+
+[INFO|callbacks.py:310] 2024-07-29 18:37:34,465 >> {'loss': 0.0094, 'learning_rate': 1.3399e-06, 'epoch': 3.28, 'throughput': 499.80}
+
+[INFO|callbacks.py:310] 2024-07-29 18:37:47,664 >> {'loss': 0.0054, 'learning_rate': 1.3309e-06, 'epoch': 3.29, 'throughput': 499.87}
+
+[INFO|callbacks.py:310] 2024-07-29 18:38:00,859 >> {'loss': 0.0008, 'learning_rate': 1.3218e-06, 'epoch': 3.30, 'throughput': 499.85}
+
+[INFO|callbacks.py:310] 2024-07-29 18:38:14,077 >> {'loss': 0.0053, 'learning_rate': 1.3127e-06, 'epoch': 3.30, 'throughput': 499.79}
+
+[INFO|callbacks.py:310] 2024-07-29 18:38:27,284 >> {'loss': 0.0060, 'learning_rate': 1.3037e-06, 'epoch': 3.31, 'throughput': 499.80}
+
+[INFO|callbacks.py:310] 2024-07-29 18:38:40,489 >> {'loss': 0.0004, 'learning_rate': 1.2947e-06, 'epoch': 3.31, 'throughput': 499.82}
+
+[INFO|callbacks.py:310] 2024-07-29 18:38:53,692 >> {'loss': 0.0135, 'learning_rate': 1.2857e-06, 'epoch': 3.32, 'throughput': 499.83}
+
+[INFO|callbacks.py:310] 2024-07-29 18:39:06,897 >> {'loss': 0.0083, 'learning_rate': 1.2768e-06, 'epoch': 3.33, 'throughput': 499.85}
+
+[INFO|callbacks.py:310] 2024-07-29 18:39:20,097 >> {'loss': 0.0004, 'learning_rate': 1.2678e-06, 'epoch': 3.33, 'throughput': 499.88}
+
+[INFO|callbacks.py:310] 2024-07-29 18:39:33,291 >> {'loss': 0.0038, 'learning_rate': 1.2589e-06, 'epoch': 3.34, 'throughput': 499.87}
+
+[INFO|callbacks.py:310] 2024-07-29 18:39:46,516 >> {'loss': 0.0269, 'learning_rate': 1.2500e-06, 'epoch': 3.35, 'throughput': 499.83}
+
+[INFO|callbacks.py:310] 2024-07-29 18:39:59,735 >> {'loss': 0.0062, 'learning_rate': 1.2411e-06, 'epoch': 3.35, 'throughput': 499.83}
+
+[INFO|callbacks.py:310] 2024-07-29 18:40:12,935 >> {'loss': 0.0004, 'learning_rate': 1.2323e-06, 'epoch': 3.36, 'throughput': 499.80}
+
+[INFO|callbacks.py:310] 2024-07-29 18:40:26,134 >> {'loss': 0.0011, 'learning_rate': 1.2234e-06, 'epoch': 3.37, 'throughput': 499.80}
+
+[INFO|callbacks.py:310] 2024-07-29 18:40:39,344 >> {'loss': 0.0026, 'learning_rate': 1.2146e-06, 'epoch': 3.37, 'throughput': 499.84}
+
+[INFO|callbacks.py:310] 2024-07-29 18:40:52,526 >> {'loss': 0.0183, 'learning_rate': 1.2058e-06, 'epoch': 3.38, 'throughput': 499.80}
+
+[INFO|callbacks.py:310] 2024-07-29 18:41:05,716 >> {'loss': 0.0079, 'learning_rate': 1.1970e-06, 'epoch': 3.39, 'throughput': 499.77}
+
+[INFO|callbacks.py:310] 2024-07-29 18:41:18,935 >> {'loss': 0.0123, 'learning_rate': 1.1883e-06, 'epoch': 3.39, 'throughput': 499.76}
+
+[INFO|callbacks.py:310] 2024-07-29 18:41:32,150 >> {'loss': 0.0018, 'learning_rate': 1.1796e-06, 'epoch': 3.40, 'throughput': 499.74}
+
+[INFO|callbacks.py:310] 2024-07-29 18:41:45,360 >> {'loss': 0.0014, 'learning_rate': 1.1709e-06, 'epoch': 3.40, 'throughput': 499.71}
+
+[INFO|callbacks.py:310] 2024-07-29 18:41:58,555 >> {'loss': 0.0009, 'learning_rate': 1.1622e-06, 'epoch': 3.41, 'throughput': 499.69}
+
+[INFO|callbacks.py:310] 2024-07-29 18:42:11,760 >> {'loss': 0.0078, 'learning_rate': 1.1535e-06, 'epoch': 3.42, 'throughput': 499.67}
+
+[INFO|callbacks.py:310] 2024-07-29 18:42:24,964 >> {'loss': 0.0027, 'learning_rate': 1.1449e-06, 'epoch': 3.42, 'throughput': 499.65}
+
+[INFO|callbacks.py:310] 2024-07-29 18:42:38,161 >> {'loss': 0.0004, 'learning_rate': 1.1363e-06, 'epoch': 3.43, 'throughput': 499.63}
+
+[INFO|callbacks.py:310] 2024-07-29 18:42:51,358 >> {'loss': 0.0064, 'learning_rate': 1.1277e-06, 'epoch': 3.44, 'throughput': 499.68}
+
+[INFO|callbacks.py:310] 2024-07-29 18:43:04,564 >> {'loss': 0.0070, 'learning_rate': 1.1191e-06, 'epoch': 3.44, 'throughput': 499.71}
+
+[INFO|callbacks.py:310] 2024-07-29 18:43:17,771 >> {'loss': 0.0005, 'learning_rate': 1.1105e-06, 'epoch': 3.45, 'throughput': 499.68}
+
+[INFO|callbacks.py:310] 2024-07-29 18:43:30,958 >> {'loss': 0.0004, 'learning_rate': 1.1020e-06, 'epoch': 3.46, 'throughput': 499.68}
+
+[INFO|callbacks.py:310] 2024-07-29 18:43:44,159 >> {'loss': 0.0294, 'learning_rate': 1.0935e-06, 'epoch': 3.46, 'throughput': 499.66}
+
+[INFO|callbacks.py:310] 2024-07-29 18:43:57,349 >> {'loss': 0.0036, 'learning_rate': 1.0850e-06, 'epoch': 3.47, 'throughput': 499.73}
+
+[INFO|callbacks.py:310] 2024-07-29 18:44:10,552 >> {'loss': 0.0051, 'learning_rate': 1.0766e-06, 'epoch': 3.48, 'throughput': 499.71}
+
+[INFO|callbacks.py:310] 2024-07-29 18:44:23,747 >> {'loss': 0.0012, 'learning_rate': 1.0682e-06, 'epoch': 3.48, 'throughput': 499.69}
+
+[INFO|callbacks.py:310] 2024-07-29 18:44:36,965 >> {'loss': 0.0231, 'learning_rate': 1.0598e-06, 'epoch': 3.49, 'throughput': 499.71}
+
+[INFO|callbacks.py:310] 2024-07-29 18:44:50,177 >> {'loss': 0.0041, 'learning_rate': 1.0514e-06, 'epoch': 3.49, 'throughput': 499.72}
+
+[INFO|callbacks.py:310] 2024-07-29 18:45:03,391 >> {'loss': 0.0124, 'learning_rate': 1.0430e-06, 'epoch': 3.50, 'throughput': 499.75}
+
+[INFO|callbacks.py:310] 2024-07-29 18:45:16,593 >> {'loss': 0.0171, 'learning_rate': 1.0347e-06, 'epoch': 3.51, 'throughput': 499.75}
+
+[INFO|callbacks.py:310] 2024-07-29 18:45:29,789 >> {'loss': 0.0080, 'learning_rate': 1.0264e-06, 'epoch': 3.51, 'throughput': 499.74}
+
+[INFO|callbacks.py:310] 2024-07-29 18:45:42,972 >> {'loss': 0.0035, 'learning_rate': 1.0181e-06, 'epoch': 3.52, 'throughput': 499.74}
+
+[INFO|callbacks.py:310] 2024-07-29 18:45:56,172 >> {'loss': 0.0006, 'learning_rate': 1.0098e-06, 'epoch': 3.53, 'throughput': 499.73}
+
+[INFO|callbacks.py:310] 2024-07-29 18:46:09,400 >> {'loss': 0.0020, 'learning_rate': 1.0016e-06, 'epoch': 3.53, 'throughput': 499.71}
+
+[INFO|callbacks.py:310] 2024-07-29 18:46:22,593 >> {'loss': 0.0003, 'learning_rate': 9.9341e-07, 'epoch': 3.54, 'throughput': 499.72}
+
+[INFO|callbacks.py:310] 2024-07-29 18:46:35,795 >> {'loss': 0.0004, 'learning_rate': 9.8523e-07, 'epoch': 3.55, 'throughput': 499.74}
+
+[INFO|callbacks.py:310] 2024-07-29 18:46:48,982 >> {'loss': 0.0016, 'learning_rate': 9.7708e-07, 'epoch': 3.55, 'throughput': 499.74}
+
+[INFO|callbacks.py:310] 2024-07-29 18:47:02,170 >> {'loss': 0.0012, 'learning_rate': 9.6895e-07, 'epoch': 3.56, 'throughput': 499.80}
+
+[INFO|callbacks.py:310] 2024-07-29 18:47:15,372 >> {'loss': 0.0057, 'learning_rate': 9.6085e-07, 'epoch': 3.57, 'throughput': 499.80}
+
+[INFO|callbacks.py:310] 2024-07-29 18:47:28,582 >> {'loss': 0.0168, 'learning_rate': 9.5277e-07, 'epoch': 3.57, 'throughput': 499.83}
+
+[INFO|callbacks.py:310] 2024-07-29 18:47:41,791 >> {'loss': 0.0012, 'learning_rate': 9.4472e-07, 'epoch': 3.58, 'throughput': 499.85}
+
+[INFO|callbacks.py:310] 2024-07-29 18:47:54,990 >> {'loss': 0.0002, 'learning_rate': 9.3669e-07, 'epoch': 3.58, 'throughput': 499.82}
+
+[INFO|callbacks.py:310] 2024-07-29 18:48:08,176 >> {'loss': 0.0025, 'learning_rate': 9.2869e-07, 'epoch': 3.59, 'throughput': 499.84}
+
+[INFO|callbacks.py:310] 2024-07-29 18:48:21,376 >> {'loss': 0.0194, 'learning_rate': 9.2072e-07, 'epoch': 3.60, 'throughput': 499.80}
+
+[INFO|callbacks.py:310] 2024-07-29 18:48:34,577 >> {'loss': 0.0055, 'learning_rate': 9.1278e-07, 'epoch': 3.60, 'throughput': 499.80}
+
+[INFO|callbacks.py:310] 2024-07-29 18:48:47,797 >> {'loss': 0.0215, 'learning_rate': 9.0486e-07, 'epoch': 3.61, 'throughput': 499.83}
+
+[INFO|callbacks.py:310] 2024-07-29 18:49:01,006 >> {'loss': 0.0003, 'learning_rate': 8.9697e-07, 'epoch': 3.62, 'throughput': 499.81}
+
+[INFO|callbacks.py:310] 2024-07-29 18:49:14,215 >> {'loss': 0.0242, 'learning_rate': 8.8910e-07, 'epoch': 3.62, 'throughput': 499.85}
+
+[INFO|callbacks.py:310] 2024-07-29 18:49:27,433 >> {'loss': 0.0206, 'learning_rate': 8.8126e-07, 'epoch': 3.63, 'throughput': 499.85}
+
+[INFO|callbacks.py:310] 2024-07-29 18:49:40,658 >> {'loss': 0.0024, 'learning_rate': 8.7345e-07, 'epoch': 3.64, 'throughput': 499.87}
+
+[INFO|callbacks.py:310] 2024-07-29 18:49:53,860 >> {'loss': 0.0087, 'learning_rate': 8.6567e-07, 'epoch': 3.64, 'throughput': 499.89}
+
+[INFO|callbacks.py:310] 2024-07-29 18:50:07,067 >> {'loss': 0.0089, 'learning_rate': 8.5792e-07, 'epoch': 3.65, 'throughput': 499.86}
+
+[INFO|callbacks.py:310] 2024-07-29 18:50:20,271 >> {'loss': 0.0003, 'learning_rate': 8.5019e-07, 'epoch': 3.66, 'throughput': 499.92}
+
+[INFO|callbacks.py:310] 2024-07-29 18:50:33,471 >> {'loss': 0.0002, 'learning_rate': 8.4249e-07, 'epoch': 3.66, 'throughput': 499.90}
+
+[INFO|callbacks.py:310] 2024-07-29 18:50:46,665 >> {'loss': 0.0009, 'learning_rate': 8.3482e-07, 'epoch': 3.67, 'throughput': 499.94}
+
+[INFO|callbacks.py:310] 2024-07-29 18:50:59,887 >> {'loss': 0.0154, 'learning_rate': 8.2717e-07, 'epoch': 3.67, 'throughput': 499.95}
+
+[INFO|callbacks.py:310] 2024-07-29 18:51:13,096 >> {'loss': 0.0012, 'learning_rate': 8.1956e-07, 'epoch': 3.68, 'throughput': 499.93}
+
+[INFO|callbacks.py:310] 2024-07-29 18:51:26,298 >> {'loss': 0.0008, 'learning_rate': 8.1197e-07, 'epoch': 3.69, 'throughput': 499.94}
+
+[INFO|callbacks.py:310] 2024-07-29 18:51:39,491 >> {'loss': 0.0081, 'learning_rate': 8.0441e-07, 'epoch': 3.69, 'throughput': 499.95}
+
+[INFO|callbacks.py:310] 2024-07-29 18:51:52,704 >> {'loss': 0.0080, 'learning_rate': 7.9688e-07, 'epoch': 3.70, 'throughput': 499.94}
+
+[INFO|callbacks.py:310] 2024-07-29 18:52:05,905 >> {'loss': 0.0073, 'learning_rate': 7.8938e-07, 'epoch': 3.71, 'throughput': 500.01}
+
+[INFO|callbacks.py:310] 2024-07-29 18:52:19,100 >> {'loss': 0.0062, 'learning_rate': 7.8191e-07, 'epoch': 3.71, 'throughput': 500.07}
+
+[INFO|callbacks.py:310] 2024-07-29 18:52:32,295 >> {'loss': 0.0015, 'learning_rate': 7.7446e-07, 'epoch': 3.72, 'throughput': 500.07}
+
+[INFO|callbacks.py:310] 2024-07-29 18:52:45,507 >> {'loss': 0.0268, 'learning_rate': 7.6705e-07, 'epoch': 3.73, 'throughput': 500.07}
+
+[INFO|callbacks.py:310] 2024-07-29 18:52:58,725 >> {'loss': 0.0007, 'learning_rate': 7.5967e-07, 'epoch': 3.73, 'throughput': 500.06}
+
+[INFO|callbacks.py:310] 2024-07-29 18:53:11,917 >> {'loss': 0.0016, 'learning_rate': 7.5231e-07, 'epoch': 3.74, 'throughput': 500.05}
+
+[INFO|callbacks.py:310] 2024-07-29 18:53:25,128 >> {'loss': 0.0631, 'learning_rate': 7.4498e-07, 'epoch': 3.75, 'throughput': 500.07}
+
+[INFO|callbacks.py:310] 2024-07-29 18:53:38,339 >> {'loss': 0.0128, 'learning_rate': 7.3769e-07, 'epoch': 3.75, 'throughput': 500.05}
+
+[INFO|callbacks.py:310] 2024-07-29 18:53:51,540 >> {'loss': 0.0034, 'learning_rate': 7.3042e-07, 'epoch': 3.76, 'throughput': 500.05}
+
+[INFO|callbacks.py:310] 2024-07-29 18:54:04,733 >> {'loss': 0.0633, 'learning_rate': 7.2318e-07, 'epoch': 3.77, 'throughput': 500.04}
+
+[INFO|callbacks.py:310] 2024-07-29 18:54:17,937 >> {'loss': 0.0007, 'learning_rate': 7.1597e-07, 'epoch': 3.77, 'throughput': 500.03}
+
+[INFO|callbacks.py:310] 2024-07-29 18:54:31,144 >> {'loss': 0.0211, 'learning_rate': 7.0880e-07, 'epoch': 3.78, 'throughput': 500.01}
+
+[INFO|callbacks.py:310] 2024-07-29 18:54:44,336 >> {'loss': 0.0090, 'learning_rate': 7.0165e-07, 'epoch': 3.78, 'throughput': 500.01}
+
+[INFO|callbacks.py:310] 2024-07-29 18:54:57,545 >> {'loss': 0.0152, 'learning_rate': 6.9453e-07, 'epoch': 3.79, 'throughput': 499.99}
+
+[INFO|callbacks.py:310] 2024-07-29 18:55:10,766 >> {'loss': 0.0022, 'learning_rate': 6.8745e-07, 'epoch': 3.80, 'throughput': 499.94}
+
+[INFO|callbacks.py:310] 2024-07-29 18:55:23,972 >> {'loss': 0.0140, 'learning_rate': 6.8039e-07, 'epoch': 3.80, 'throughput': 499.91}
+
+[INFO|callbacks.py:310] 2024-07-29 18:55:37,172 >> {'loss': 0.0085, 'learning_rate': 6.7337e-07, 'epoch': 3.81, 'throughput': 499.94}
+
+[INFO|callbacks.py:310] 2024-07-29 18:55:50,385 >> {'loss': 0.0161, 'learning_rate': 6.6637e-07, 'epoch': 3.82, 'throughput': 499.96}
+
+[INFO|callbacks.py:310] 2024-07-29 18:56:03,588 >> {'loss': 0.0234, 'learning_rate': 6.5941e-07, 'epoch': 3.82, 'throughput': 499.98}
+
+[INFO|callbacks.py:310] 2024-07-29 18:56:16,775 >> {'loss': 0.0186, 'learning_rate': 6.5248e-07, 'epoch': 3.83, 'throughput': 499.98}
+
+[INFO|callbacks.py:310] 2024-07-29 18:56:29,962 >> {'loss': 0.0199, 'learning_rate': 6.4558e-07, 'epoch': 3.84, 'throughput': 500.02}
+
+[INFO|callbacks.py:310] 2024-07-29 18:56:43,167 >> {'loss': 0.0052, 'learning_rate': 6.3871e-07, 'epoch': 3.84, 'throughput': 500.02}
+
+[INFO|callbacks.py:310] 2024-07-29 18:56:56,371 >> {'loss': 0.0165, 'learning_rate': 6.3187e-07, 'epoch': 3.85, 'throughput': 500.03}
+
+[INFO|callbacks.py:310] 2024-07-29 18:57:09,566 >> {'loss': 0.0100, 'learning_rate': 6.2506e-07, 'epoch': 3.86, 'throughput': 500.03}
+
+[INFO|callbacks.py:310] 2024-07-29 18:57:22,758 >> {'loss': 0.0047, 'learning_rate': 6.1829e-07, 'epoch': 3.86, 'throughput': 500.04}
+
+[INFO|callbacks.py:310] 2024-07-29 18:57:35,960 >> {'loss': 0.0039, 'learning_rate': 6.1154e-07, 'epoch': 3.87, 'throughput': 500.05}
+
+[INFO|callbacks.py:310] 2024-07-29 18:57:49,144 >> {'loss': 0.0022, 'learning_rate': 6.0483e-07, 'epoch': 3.87, 'throughput': 500.07}
+
+[INFO|callbacks.py:310] 2024-07-29 18:58:02,342 >> {'loss': 0.0042, 'learning_rate': 5.9815e-07, 'epoch': 3.88, 'throughput': 500.07}
+
+[INFO|callbacks.py:310] 2024-07-29 18:58:15,549 >> {'loss': 0.0076, 'learning_rate': 5.9150e-07, 'epoch': 3.89, 'throughput': 500.05}
+
+[INFO|callbacks.py:310] 2024-07-29 18:58:28,752 >> {'loss': 0.0019, 'learning_rate': 5.8489e-07, 'epoch': 3.89, 'throughput': 500.06}
+
+[INFO|callbacks.py:310] 2024-07-29 18:58:41,955 >> {'loss': 0.0014, 'learning_rate': 5.7831e-07, 'epoch': 3.90, 'throughput': 500.11}
+
+[INFO|callbacks.py:310] 2024-07-29 18:58:55,136 >> {'loss': 0.0022, 'learning_rate': 5.7176e-07, 'epoch': 3.91, 'throughput': 500.16}
+
+[INFO|callbacks.py:310] 2024-07-29 18:59:08,354 >> {'loss': 0.0024, 'learning_rate': 5.6524e-07, 'epoch': 3.91, 'throughput': 500.15}
+
+[INFO|callbacks.py:310] 2024-07-29 18:59:21,570 >> {'loss': 0.0013, 'learning_rate': 5.5875e-07, 'epoch': 3.92, 'throughput': 500.11}
+
+[INFO|callbacks.py:310] 2024-07-29 18:59:34,756 >> {'loss': 0.0064, 'learning_rate': 5.5230e-07, 'epoch': 3.93, 'throughput': 500.09}
+
+[INFO|callbacks.py:310] 2024-07-29 18:59:47,959 >> {'loss': 0.0084, 'learning_rate': 5.4588e-07, 'epoch': 3.93, 'throughput': 500.11}
+
+[INFO|callbacks.py:310] 2024-07-29 19:00:01,181 >> {'loss': 0.0009, 'learning_rate': 5.3949e-07, 'epoch': 3.94, 'throughput': 500.11}
+
+[INFO|callbacks.py:310] 2024-07-29 19:00:14,371 >> {'loss': 0.0010, 'learning_rate': 5.3314e-07, 'epoch': 3.95, 'throughput': 500.13}
+
+[INFO|callbacks.py:310] 2024-07-29 19:00:27,574 >> {'loss': 0.0064, 'learning_rate': 5.2682e-07, 'epoch': 3.95, 'throughput': 500.10}
+
+[INFO|callbacks.py:310] 2024-07-29 19:00:40,785 >> {'loss': 0.0033, 'learning_rate': 5.2053e-07, 'epoch': 3.96, 'throughput': 500.09}
+
+[INFO|callbacks.py:310] 2024-07-29 19:00:53,979 >> {'loss': 0.0065, 'learning_rate': 5.1428e-07, 'epoch': 3.96, 'throughput': 500.10}
+
+[INFO|callbacks.py:310] 2024-07-29 19:01:07,179 >> {'loss': 0.0272, 'learning_rate': 5.0805e-07, 'epoch': 3.97, 'throughput': 500.12}
+
+[INFO|callbacks.py:310] 2024-07-29 19:01:20,372 >> {'loss': 0.0015, 'learning_rate': 5.0187e-07, 'epoch': 3.98, 'throughput': 500.16}
+
+[INFO|callbacks.py:310] 2024-07-29 19:01:33,578 >> {'loss': 0.0013, 'learning_rate': 4.9571e-07, 'epoch': 3.98, 'throughput': 500.23}
+
+[INFO|callbacks.py:310] 2024-07-29 19:01:46,776 >> {'loss': 0.0148, 'learning_rate': 4.8959e-07, 'epoch': 3.99, 'throughput': 500.22}
+
+[INFO|callbacks.py:310] 2024-07-29 19:01:59,979 >> {'loss': 0.0063, 'learning_rate': 4.8351e-07, 'epoch': 4.00, 'throughput': 500.23}
+
+[INFO|callbacks.py:310] 2024-07-29 19:02:13,193 >> {'loss': 0.0003, 'learning_rate': 4.7746e-07, 'epoch': 4.00, 'throughput': 500.24}
+
+[INFO|callbacks.py:310] 2024-07-29 19:02:26,401 >> {'loss': 0.0010, 'learning_rate': 4.7144e-07, 'epoch': 4.01, 'throughput': 500.26}
+
+[INFO|callbacks.py:310] 2024-07-29 19:02:39,611 >> {'loss': 0.0001, 'learning_rate': 4.6546e-07, 'epoch': 4.02, 'throughput': 500.25}
+
+[INFO|callbacks.py:310] 2024-07-29 19:02:52,825 >> {'loss': 0.0193, 'learning_rate': 4.5951e-07, 'epoch': 4.02, 'throughput': 500.22}
+
+[INFO|callbacks.py:310] 2024-07-29 19:03:06,039 >> {'loss': 0.0049, 'learning_rate': 4.5359e-07, 'epoch': 4.03, 'throughput': 500.19}
+
+[INFO|callbacks.py:310] 2024-07-29 19:03:19,242 >> {'loss': 0.0015, 'learning_rate': 4.4771e-07, 'epoch': 4.04, 'throughput': 500.19}
+
+[INFO|callbacks.py:310] 2024-07-29 19:03:32,447 >> {'loss': 0.0019, 'learning_rate': 4.4187e-07, 'epoch': 4.04, 'throughput': 500.17}
+
+[INFO|callbacks.py:310] 2024-07-29 19:03:45,648 >> {'loss': 0.0034, 'learning_rate': 4.3606e-07, 'epoch': 4.05, 'throughput': 500.17}
+
+[INFO|callbacks.py:310] 2024-07-29 19:03:58,855 >> {'loss': 0.0003, 'learning_rate': 4.3028e-07, 'epoch': 4.05, 'throughput': 500.18}
+
+[INFO|callbacks.py:310] 2024-07-29 19:04:12,059 >> {'loss': 0.0031, 'learning_rate': 4.2454e-07, 'epoch': 4.06, 'throughput': 500.21}
+
+[INFO|callbacks.py:310] 2024-07-29 19:04:25,263 >> {'loss': 0.0004, 'learning_rate': 4.1883e-07, 'epoch': 4.07, 'throughput': 500.25}
+
+[INFO|callbacks.py:310] 2024-07-29 19:04:38,470 >> {'loss': 0.0027, 'learning_rate': 4.1316e-07, 'epoch': 4.07, 'throughput': 500.22}
+
+[INFO|callbacks.py:310] 2024-07-29 19:04:51,680 >> {'loss': 0.0047, 'learning_rate': 4.0753e-07, 'epoch': 4.08, 'throughput': 500.25}
+
+[INFO|callbacks.py:310] 2024-07-29 19:05:04,882 >> {'loss': 0.0003, 'learning_rate': 4.0193e-07, 'epoch': 4.09, 'throughput': 500.26}
+
+[INFO|callbacks.py:310] 2024-07-29 19:05:18,088 >> {'loss': 0.0204, 'learning_rate': 3.9636e-07, 'epoch': 4.09, 'throughput': 500.26}
+
+[INFO|callbacks.py:310] 2024-07-29 19:05:31,292 >> {'loss': 0.0043, 'learning_rate': 3.9083e-07, 'epoch': 4.10, 'throughput': 500.27}
+
+[INFO|callbacks.py:310] 2024-07-29 19:05:44,508 >> {'loss': 0.0002, 'learning_rate': 3.8534e-07, 'epoch': 4.11, 'throughput': 500.26}
+
+[INFO|callbacks.py:310] 2024-07-29 19:05:57,704 >> {'loss': 0.0009, 'learning_rate': 3.7988e-07, 'epoch': 4.11, 'throughput': 500.24}
+
+[INFO|callbacks.py:310] 2024-07-29 19:06:10,893 >> {'loss': 0.0004, 'learning_rate': 3.7446e-07, 'epoch': 4.12, 'throughput': 500.27}
+
+[INFO|callbacks.py:310] 2024-07-29 19:06:24,099 >> {'loss': 0.0003, 'learning_rate': 3.6907e-07, 'epoch': 4.13, 'throughput': 500.22}
+
+[INFO|callbacks.py:310] 2024-07-29 19:06:37,303 >> {'loss': 0.0015, 'learning_rate': 3.6372e-07, 'epoch': 4.13, 'throughput': 500.22}
+
+[INFO|callbacks.py:310] 2024-07-29 19:06:50,498 >> {'loss': 0.0012, 'learning_rate': 3.5840e-07, 'epoch': 4.14, 'throughput': 500.25}
+
+[INFO|callbacks.py:310] 2024-07-29 19:07:03,709 >> {'loss': 0.0041, 'learning_rate': 3.5313e-07, 'epoch': 4.14, 'throughput': 500.23}
+
+[INFO|callbacks.py:310] 2024-07-29 19:07:16,923 >> {'loss': 0.0087, 'learning_rate': 3.4788e-07, 'epoch': 4.15, 'throughput': 500.19}
+
+[INFO|callbacks.py:310] 2024-07-29 19:07:30,118 >> {'loss': 0.0025, 'learning_rate': 3.4268e-07, 'epoch': 4.16, 'throughput': 500.21}
+
+[INFO|callbacks.py:310] 2024-07-29 19:07:43,322 >> {'loss': 0.0012, 'learning_rate': 3.3751e-07, 'epoch': 4.16, 'throughput': 500.20}
+
+[INFO|callbacks.py:310] 2024-07-29 19:07:56,527 >> {'loss': 0.0111, 'learning_rate': 3.3237e-07, 'epoch': 4.17, 'throughput': 500.19}
+
+[INFO|callbacks.py:310] 2024-07-29 19:08:09,739 >> {'loss': 0.0026, 'learning_rate': 3.2728e-07, 'epoch': 4.18, 'throughput': 500.19}
+
+[INFO|callbacks.py:310] 2024-07-29 19:08:22,919 >> {'loss': 0.0004, 'learning_rate': 3.2222e-07, 'epoch': 4.18, 'throughput': 500.18}
+
+[INFO|callbacks.py:310] 2024-07-29 19:08:36,113 >> {'loss': 0.0002, 'learning_rate': 3.1719e-07, 'epoch': 4.19, 'throughput': 500.18}
+
+[INFO|callbacks.py:310] 2024-07-29 19:08:49,337 >> {'loss': 0.0118, 'learning_rate': 3.1221e-07, 'epoch': 4.20, 'throughput': 500.16}
+
+[INFO|callbacks.py:310] 2024-07-29 19:09:02,541 >> {'loss': 0.0001, 'learning_rate': 3.0726e-07, 'epoch': 4.20, 'throughput': 500.17}
+
+[INFO|callbacks.py:310] 2024-07-29 19:09:15,739 >> {'loss': 0.0151, 'learning_rate': 3.0235e-07, 'epoch': 4.21, 'throughput': 500.17}
+
+[INFO|callbacks.py:310] 2024-07-29 19:09:28,933 >> {'loss': 0.0001, 'learning_rate': 2.9747e-07, 'epoch': 4.22, 'throughput': 500.13}
+
+[INFO|callbacks.py:310] 2024-07-29 19:09:42,146 >> {'loss': 0.0019, 'learning_rate': 2.9263e-07, 'epoch': 4.22, 'throughput': 500.13}
+
+[INFO|callbacks.py:310] 2024-07-29 19:09:55,346 >> {'loss': 0.0009, 'learning_rate': 2.8783e-07, 'epoch': 4.23, 'throughput': 500.18}
+
+[INFO|callbacks.py:310] 2024-07-29 19:10:08,550 >> {'loss': 0.0051, 'learning_rate': 2.8307e-07, 'epoch': 4.23, 'throughput': 500.20}
+
+[INFO|callbacks.py:310] 2024-07-29 19:10:21,767 >> {'loss': 0.0008, 'learning_rate': 2.7834e-07, 'epoch': 4.24, 'throughput': 500.20}
+
+[INFO|callbacks.py:310] 2024-07-29 19:10:34,975 >> {'loss': 0.0001, 'learning_rate': 2.7365e-07, 'epoch': 4.25, 'throughput': 500.21}
+
+[INFO|callbacks.py:310] 2024-07-29 19:10:48,190 >> {'loss': 0.0043, 'learning_rate': 2.6900e-07, 'epoch': 4.25, 'throughput': 500.18}
+
+[INFO|callbacks.py:310] 2024-07-29 19:11:01,399 >> {'loss': 0.0004, 'learning_rate': 2.6438e-07, 'epoch': 4.26, 'throughput': 500.23}
+
+[INFO|callbacks.py:310] 2024-07-29 19:11:14,602 >> {'loss': 0.0092, 'learning_rate': 2.5981e-07, 'epoch': 4.27, 'throughput': 500.23}
+
+[INFO|callbacks.py:310] 2024-07-29 19:11:27,811 >> {'loss': 0.0006, 'learning_rate': 2.5527e-07, 'epoch': 4.27, 'throughput': 500.22}
+
+[INFO|callbacks.py:310] 2024-07-29 19:11:41,010 >> {'loss': 0.0049, 'learning_rate': 2.5077e-07, 'epoch': 4.28, 'throughput': 500.18}
+
+[INFO|callbacks.py:310] 2024-07-29 19:11:54,213 >> {'loss': 0.0058, 'learning_rate': 2.4631e-07, 'epoch': 4.29, 'throughput': 500.17}
+
+[INFO|callbacks.py:310] 2024-07-29 19:12:07,427 >> {'loss': 0.0001, 'learning_rate': 2.4188e-07, 'epoch': 4.29, 'throughput': 500.14}
+
+[INFO|callbacks.py:310] 2024-07-29 19:12:20,629 >> {'loss': 0.0063, 'learning_rate': 2.3750e-07, 'epoch': 4.30, 'throughput': 500.15}
+
+[INFO|callbacks.py:310] 2024-07-29 19:12:33,832 >> {'loss': 0.0116, 'learning_rate': 2.3315e-07, 'epoch': 4.31, 'throughput': 500.12}
+
+[INFO|callbacks.py:310] 2024-07-29 19:12:47,036 >> {'loss': 0.0030, 'learning_rate': 2.2884e-07, 'epoch': 4.31, 'throughput': 500.13}
+
+[INFO|callbacks.py:310] 2024-07-29 19:13:00,242 >> {'loss': 0.0001, 'learning_rate': 2.2456e-07, 'epoch': 4.32, 'throughput': 500.16}
+
+[INFO|callbacks.py:310] 2024-07-29 19:13:13,427 >> {'loss': 0.0001, 'learning_rate': 2.2033e-07, 'epoch': 4.33, 'throughput': 500.19}
+
+[INFO|callbacks.py:310] 2024-07-29 19:13:26,636 >> {'loss': 0.0008, 'learning_rate': 2.1614e-07, 'epoch': 4.33, 'throughput': 500.18}
+
+[INFO|callbacks.py:310] 2024-07-29 19:13:39,852 >> {'loss': 0.0002, 'learning_rate': 2.1198e-07, 'epoch': 4.34, 'throughput': 500.16}
+
+[INFO|callbacks.py:310] 2024-07-29 19:13:53,066 >> {'loss': 0.0001, 'learning_rate': 2.0786e-07, 'epoch': 4.34, 'throughput': 500.16}
+
+[INFO|callbacks.py:310] 2024-07-29 19:14:06,261 >> {'loss': 0.0105, 'learning_rate': 2.0378e-07, 'epoch': 4.35, 'throughput': 500.19}
+
+[INFO|callbacks.py:310] 2024-07-29 19:14:19,462 >> {'loss': 0.0001, 'learning_rate': 1.9974e-07, 'epoch': 4.36, 'throughput': 500.18}
+
+[INFO|callbacks.py:310] 2024-07-29 19:14:32,679 >> {'loss': 0.0001, 'learning_rate': 1.9574e-07, 'epoch': 4.36, 'throughput': 500.12}
+
+[INFO|callbacks.py:310] 2024-07-29 19:14:45,877 >> {'loss': 0.0076, 'learning_rate': 1.9178e-07, 'epoch': 4.37, 'throughput': 500.14}
+
+[INFO|callbacks.py:310] 2024-07-29 19:14:59,061 >> {'loss': 0.0001, 'learning_rate': 1.8785e-07, 'epoch': 4.38, 'throughput': 500.17}
+
+[INFO|callbacks.py:310] 2024-07-29 19:15:12,281 >> {'loss': 0.0001, 'learning_rate': 1.8397e-07, 'epoch': 4.38, 'throughput': 500.13}
+
+[INFO|callbacks.py:310] 2024-07-29 19:15:25,497 >> {'loss': 0.0020, 'learning_rate': 1.8012e-07, 'epoch': 4.39, 'throughput': 500.12}
+
+[INFO|callbacks.py:310] 2024-07-29 19:15:38,698 >> {'loss': 0.0088, 'learning_rate': 1.7632e-07, 'epoch': 4.40, 'throughput': 500.13}
+
+[INFO|callbacks.py:310] 2024-07-29 19:15:51,904 >> {'loss': 0.0023, 'learning_rate': 1.7255e-07, 'epoch': 4.40, 'throughput': 500.13}
+
+[INFO|callbacks.py:310] 2024-07-29 19:16:05,102 >> {'loss': 0.0002, 'learning_rate': 1.6882e-07, 'epoch': 4.41, 'throughput': 500.12}
+
+[INFO|callbacks.py:310] 2024-07-29 19:16:18,300 >> {'loss': 0.0017, 'learning_rate': 1.6513e-07, 'epoch': 4.42, 'throughput': 500.15}
+
+[INFO|callbacks.py:310] 2024-07-29 19:16:31,509 >> {'loss': 0.0002, 'learning_rate': 1.6148e-07, 'epoch': 4.42, 'throughput': 500.13}
+
+[INFO|callbacks.py:310] 2024-07-29 19:16:44,718 >> {'loss': 0.0073, 'learning_rate': 1.5787e-07, 'epoch': 4.43, 'throughput': 500.14}
+
+[INFO|callbacks.py:310] 2024-07-29 19:16:57,919 >> {'loss': 0.0013, 'learning_rate': 1.5430e-07, 'epoch': 4.43, 'throughput': 500.11}
+
+[INFO|callbacks.py:310] 2024-07-29 19:17:11,123 >> {'loss': 0.0003, 'learning_rate': 1.5077e-07, 'epoch': 4.44, 'throughput': 500.09}
+
+[INFO|callbacks.py:310] 2024-07-29 19:17:24,315 >> {'loss': 0.0149, 'learning_rate': 1.4728e-07, 'epoch': 4.45, 'throughput': 500.10}
+
+[INFO|callbacks.py:310] 2024-07-29 19:17:37,525 >> {'loss': 0.0026, 'learning_rate': 1.4382e-07, 'epoch': 4.45, 'throughput': 500.17}
+
+[INFO|callbacks.py:310] 2024-07-29 19:17:50,735 >> {'loss': 0.0001, 'learning_rate': 1.4041e-07, 'epoch': 4.46, 'throughput': 500.17}
+
+[INFO|callbacks.py:310] 2024-07-29 19:18:03,943 >> {'loss': 0.0002, 'learning_rate': 1.3704e-07, 'epoch': 4.47, 'throughput': 500.13}
+
+[INFO|callbacks.py:310] 2024-07-29 19:18:17,134 >> {'loss': 0.0060, 'learning_rate': 1.3371e-07, 'epoch': 4.47, 'throughput': 500.15}
+
+[INFO|callbacks.py:310] 2024-07-29 19:18:30,339 >> {'loss': 0.0004, 'learning_rate': 1.3042e-07, 'epoch': 4.48, 'throughput': 500.14}
+
+[INFO|callbacks.py:310] 2024-07-29 19:18:43,543 >> {'loss': 0.0004, 'learning_rate': 1.2716e-07, 'epoch': 4.49, 'throughput': 500.17}
+
+[INFO|callbacks.py:310] 2024-07-29 19:18:56,754 >> {'loss': 0.0007, 'learning_rate': 1.2395e-07, 'epoch': 4.49, 'throughput': 500.17}
+
+[INFO|callbacks.py:310] 2024-07-29 19:19:09,959 >> {'loss': 0.0002, 'learning_rate': 1.2078e-07, 'epoch': 4.50, 'throughput': 500.17}
+
+[INFO|callbacks.py:310] 2024-07-29 19:19:23,150 >> {'loss': 0.0001, 'learning_rate': 1.1764e-07, 'epoch': 4.51, 'throughput': 500.17}
+
+[INFO|callbacks.py:310] 2024-07-29 19:19:36,344 >> {'loss': 0.0033, 'learning_rate': 1.1455e-07, 'epoch': 4.51, 'throughput': 500.18}
+
+[INFO|callbacks.py:310] 2024-07-29 19:19:49,550 >> {'loss': 0.0009, 'learning_rate': 1.1150e-07, 'epoch': 4.52, 'throughput': 500.20}
+
+[INFO|callbacks.py:310] 2024-07-29 19:20:02,778 >> {'loss': 0.0216, 'learning_rate': 1.0849e-07, 'epoch': 4.52, 'throughput': 500.20}
+
+[INFO|callbacks.py:310] 2024-07-29 19:20:15,982 >> {'loss': 0.0035, 'learning_rate': 1.0552e-07, 'epoch': 4.53, 'throughput': 500.22}
+
+[INFO|callbacks.py:310] 2024-07-29 19:20:29,179 >> {'loss': 0.0073, 'learning_rate': 1.0259e-07, 'epoch': 4.54, 'throughput': 500.20}
+
+[INFO|callbacks.py:310] 2024-07-29 19:20:42,402 >> {'loss': 0.0003, 'learning_rate': 9.9696e-08, 'epoch': 4.54, 'throughput': 500.20}
+
+[INFO|callbacks.py:310] 2024-07-29 19:20:55,597 >> {'loss': 0.0007, 'learning_rate': 9.6846e-08, 'epoch': 4.55, 'throughput': 500.16}
+
+[INFO|callbacks.py:310] 2024-07-29 19:21:08,781 >> {'loss': 0.0003, 'learning_rate': 9.4036e-08, 'epoch': 4.56, 'throughput': 500.15}
+
+[INFO|callbacks.py:310] 2024-07-29 19:21:21,992 >> {'loss': 0.0168, 'learning_rate': 9.1267e-08, 'epoch': 4.56, 'throughput': 500.13}
+
+[INFO|callbacks.py:310] 2024-07-29 19:21:35,203 >> {'loss': 0.0015, 'learning_rate': 8.8539e-08, 'epoch': 4.57, 'throughput': 500.13}
+
+[INFO|callbacks.py:310] 2024-07-29 19:21:48,413 >> {'loss': 0.0001, 'learning_rate': 8.5851e-08, 'epoch': 4.58, 'throughput': 500.18}
+
+[INFO|callbacks.py:310] 2024-07-29 19:22:01,612 >> {'loss': 0.0036, 'learning_rate': 8.3204e-08, 'epoch': 4.58, 'throughput': 500.16}
+
+[INFO|callbacks.py:310] 2024-07-29 19:22:14,818 >> {'loss': 0.0022, 'learning_rate': 8.0598e-08, 'epoch': 4.59, 'throughput': 500.14}
+
+[INFO|callbacks.py:310] 2024-07-29 19:22:28,027 >> {'loss': 0.0085, 'learning_rate': 7.8032e-08, 'epoch': 4.60, 'throughput': 500.17}
+
+[INFO|callbacks.py:310] 2024-07-29 19:22:41,238 >> {'loss': 0.0001, 'learning_rate': 7.5508e-08, 'epoch': 4.60, 'throughput': 500.17}
+
+[INFO|callbacks.py:310] 2024-07-29 19:22:54,431 >> {'loss': 0.0001, 'learning_rate': 7.3024e-08, 'epoch': 4.61, 'throughput': 500.18}
+
+[INFO|callbacks.py:310] 2024-07-29 19:23:07,618 >> {'loss': 0.0070, 'learning_rate': 7.0581e-08, 'epoch': 4.61, 'throughput': 500.21}
+
+[INFO|callbacks.py:310] 2024-07-29 19:23:20,838 >> {'loss': 0.0002, 'learning_rate': 6.8179e-08, 'epoch': 4.62, 'throughput': 500.21}
+
+[INFO|callbacks.py:310] 2024-07-29 19:23:34,051 >> {'loss': 0.0002, 'learning_rate': 6.5819e-08, 'epoch': 4.63, 'throughput': 500.24}
+
+[INFO|callbacks.py:310] 2024-07-29 19:23:47,245 >> {'loss': 0.0008, 'learning_rate': 6.3499e-08, 'epoch': 4.63, 'throughput': 500.24}
+
+[INFO|callbacks.py:310] 2024-07-29 19:24:00,444 >> {'loss': 0.0043, 'learning_rate': 6.1220e-08, 'epoch': 4.64, 'throughput': 500.24}
+
+[INFO|callbacks.py:310] 2024-07-29 19:24:13,638 >> {'loss': 0.0004, 'learning_rate': 5.8983e-08, 'epoch': 4.65, 'throughput': 500.22}
+
+[INFO|callbacks.py:310] 2024-07-29 19:24:26,834 >> {'loss': 0.0024, 'learning_rate': 5.6786e-08, 'epoch': 4.65, 'throughput': 500.22}
+
+[INFO|callbacks.py:310] 2024-07-29 19:24:40,025 >> {'loss': 0.0024, 'learning_rate': 5.4631e-08, 'epoch': 4.66, 'throughput': 500.22}
+
+[INFO|callbacks.py:310] 2024-07-29 19:24:53,246 >> {'loss': 0.0001, 'learning_rate': 5.2517e-08, 'epoch': 4.67, 'throughput': 500.22}
+
+[INFO|callbacks.py:310] 2024-07-29 19:25:06,459 >> {'loss': 0.0001, 'learning_rate': 5.0444e-08, 'epoch': 4.67, 'throughput': 500.22}
+
+[INFO|callbacks.py:310] 2024-07-29 19:25:19,657 >> {'loss': 0.0002, 'learning_rate': 4.8413e-08, 'epoch': 4.68, 'throughput': 500.27}
+
+[INFO|callbacks.py:310] 2024-07-29 19:25:32,853 >> {'loss': 0.0000, 'learning_rate': 4.6423e-08, 'epoch': 4.69, 'throughput': 500.30}
+
+[INFO|callbacks.py:310] 2024-07-29 19:25:46,053 >> {'loss': 0.0003, 'learning_rate': 4.4474e-08, 'epoch': 4.69, 'throughput': 500.29}
+
+[INFO|callbacks.py:310] 2024-07-29 19:25:59,252 >> {'loss': 0.0004, 'learning_rate': 4.2567e-08, 'epoch': 4.70, 'throughput': 500.29}
+
+[INFO|callbacks.py:310] 2024-07-29 19:26:12,466 >> {'loss': 0.0010, 'learning_rate': 4.0701e-08, 'epoch': 4.70, 'throughput': 500.27}
+
+[INFO|callbacks.py:310] 2024-07-29 19:26:25,683 >> {'loss': 0.0001, 'learning_rate': 3.8877e-08, 'epoch': 4.71, 'throughput': 500.27}
+
+[INFO|callbacks.py:310] 2024-07-29 19:26:38,882 >> {'loss': 0.0031, 'learning_rate': 3.7094e-08, 'epoch': 4.72, 'throughput': 500.30}
+
+[INFO|callbacks.py:310] 2024-07-29 19:26:52,073 >> {'loss': 0.0006, 'learning_rate': 3.5353e-08, 'epoch': 4.72, 'throughput': 500.28}
+
+[INFO|callbacks.py:310] 2024-07-29 19:27:05,271 >> {'loss': 0.0001, 'learning_rate': 3.3653e-08, 'epoch': 4.73, 'throughput': 500.29}
+
+[INFO|callbacks.py:310] 2024-07-29 19:27:18,468 >> {'loss': 0.0060, 'learning_rate': 3.1995e-08, 'epoch': 4.74, 'throughput': 500.27}
+
+[INFO|callbacks.py:310] 2024-07-29 19:27:31,680 >> {'loss': 0.0001, 'learning_rate': 3.0379e-08, 'epoch': 4.74, 'throughput': 500.28}
+
+[INFO|callbacks.py:310] 2024-07-29 19:27:44,877 >> {'loss': 0.0010, 'learning_rate': 2.8804e-08, 'epoch': 4.75, 'throughput': 500.31}
+
+[INFO|callbacks.py:310] 2024-07-29 19:27:58,085 >> {'loss': 0.0019, 'learning_rate': 2.7271e-08, 'epoch': 4.76, 'throughput': 500.32}
+
+[INFO|callbacks.py:310] 2024-07-29 19:28:11,301 >> {'loss': 0.0047, 'learning_rate': 2.5780e-08, 'epoch': 4.76, 'throughput': 500.32}
+
+[INFO|callbacks.py:310] 2024-07-29 19:28:24,509 >> {'loss': 0.0002, 'learning_rate': 2.4330e-08, 'epoch': 4.77, 'throughput': 500.31}
+
+[INFO|callbacks.py:310] 2024-07-29 19:28:37,683 >> {'loss': 0.0002, 'learning_rate': 2.2922e-08, 'epoch': 4.78, 'throughput': 500.38}
+
+[INFO|callbacks.py:310] 2024-07-29 19:28:50,892 >> {'loss': 0.0006, 'learning_rate': 2.1556e-08, 'epoch': 4.78, 'throughput': 500.40}
+
+[INFO|callbacks.py:310] 2024-07-29 19:29:04,114 >> {'loss': 0.0001, 'learning_rate': 2.0231e-08, 'epoch': 4.79, 'throughput': 500.44}
+
+[INFO|callbacks.py:310] 2024-07-29 19:29:17,330 >> {'loss': 0.0038, 'learning_rate': 1.8949e-08, 'epoch': 4.79, 'throughput': 500.45}
+
+[INFO|callbacks.py:310] 2024-07-29 19:29:30,542 >> {'loss': 0.0088, 'learning_rate': 1.7708e-08, 'epoch': 4.80, 'throughput': 500.43}
+
+[INFO|callbacks.py:310] 2024-07-29 19:29:43,752 >> {'loss': 0.0002, 'learning_rate': 1.6509e-08, 'epoch': 4.81, 'throughput': 500.43}
+
+[INFO|callbacks.py:310] 2024-07-29 19:29:56,968 >> {'loss': 0.0020, 'learning_rate': 1.5352e-08, 'epoch': 4.81, 'throughput': 500.44}
+
+[INFO|callbacks.py:310] 2024-07-29 19:30:10,172 >> {'loss': 0.0012, 'learning_rate': 1.4237e-08, 'epoch': 4.82, 'throughput': 500.52}
+
+[INFO|callbacks.py:310] 2024-07-29 19:30:23,364 >> {'loss': 0.0038, 'learning_rate': 1.3164e-08, 'epoch': 4.83, 'throughput': 500.49}
+
+[INFO|callbacks.py:310] 2024-07-29 19:30:36,566 >> {'loss': 0.0003, 'learning_rate': 1.2133e-08, 'epoch': 4.83, 'throughput': 500.48}
+
+[INFO|callbacks.py:310] 2024-07-29 19:30:49,764 >> {'loss': 0.0023, 'learning_rate': 1.1143e-08, 'epoch': 4.84, 'throughput': 500.51}
+
+[INFO|callbacks.py:310] 2024-07-29 19:31:02,963 >> {'loss': 0.0001, 'learning_rate': 1.0196e-08, 'epoch': 4.85, 'throughput': 500.56}
+
+[INFO|callbacks.py:310] 2024-07-29 19:31:16,178 >> {'loss': 0.0044, 'learning_rate': 9.2909e-09, 'epoch': 4.85, 'throughput': 500.56}
+
+[INFO|callbacks.py:310] 2024-07-29 19:31:29,403 >> {'loss': 0.0040, 'learning_rate': 8.4276e-09, 'epoch': 4.86, 'throughput': 500.56}
+
+[INFO|callbacks.py:310] 2024-07-29 19:31:42,604 >> {'loss': 0.0042, 'learning_rate': 7.6063e-09, 'epoch': 4.87, 'throughput': 500.54}
+
+[INFO|callbacks.py:310] 2024-07-29 19:31:55,808 >> {'loss': 0.0004, 'learning_rate': 6.8271e-09, 'epoch': 4.87, 'throughput': 500.53}
+
+[INFO|callbacks.py:310] 2024-07-29 19:32:09,010 >> {'loss': 0.0005, 'learning_rate': 6.0899e-09, 'epoch': 4.88, 'throughput': 500.51}
+
+[INFO|callbacks.py:310] 2024-07-29 19:32:22,218 >> {'loss': 0.0000, 'learning_rate': 5.3947e-09, 'epoch': 4.88, 'throughput': 500.49}
+
+[INFO|callbacks.py:310] 2024-07-29 19:32:35,414 >> {'loss': 0.0001, 'learning_rate': 4.7417e-09, 'epoch': 4.89, 'throughput': 500.47}
+
+[INFO|callbacks.py:310] 2024-07-29 19:32:48,619 >> {'loss': 0.0089, 'learning_rate': 4.1307e-09, 'epoch': 4.90, 'throughput': 500.48}
+
+[INFO|callbacks.py:310] 2024-07-29 19:33:01,832 >> {'loss': 0.0118, 'learning_rate': 3.5618e-09, 'epoch': 4.90, 'throughput': 500.48}
+
+[INFO|callbacks.py:310] 2024-07-29 19:33:15,029 >> {'loss': 0.0003, 'learning_rate': 3.0350e-09, 'epoch': 4.91, 'throughput': 500.47}
+
+[INFO|callbacks.py:310] 2024-07-29 19:33:28,234 >> {'loss': 0.0013, 'learning_rate': 2.5503e-09, 'epoch': 4.92, 'throughput': 500.49}
+
+[INFO|callbacks.py:310] 2024-07-29 19:33:41,437 >> {'loss': 0.0058, 'learning_rate': 2.1078e-09, 'epoch': 4.92, 'throughput': 500.50}
+
+[INFO|callbacks.py:310] 2024-07-29 19:33:54,649 >> {'loss': 0.0002, 'learning_rate': 1.7073e-09, 'epoch': 4.93, 'throughput': 500.46}
+
+[INFO|callbacks.py:310] 2024-07-29 19:34:07,844 >> {'loss': 0.0059, 'learning_rate': 1.3490e-09, 'epoch': 4.94, 'throughput': 500.43}
+
+[INFO|callbacks.py:310] 2024-07-29 19:34:21,041 >> {'loss': 0.0068, 'learning_rate': 1.0329e-09, 'epoch': 4.94, 'throughput': 500.45}
+
+[INFO|callbacks.py:310] 2024-07-29 19:34:34,246 >> {'loss': 0.0003, 'learning_rate': 7.5887e-10, 'epoch': 4.95, 'throughput': 500.48}
+
+[INFO|callbacks.py:310] 2024-07-29 19:34:47,456 >> {'loss': 0.0251, 'learning_rate': 5.2700e-10, 'epoch': 4.96, 'throughput': 500.51}
+
+[INFO|callbacks.py:310] 2024-07-29 19:35:00,654 >> {'loss': 0.0054, 'learning_rate': 3.3729e-10, 'epoch': 4.96, 'throughput': 500.51}
+
+[INFO|callbacks.py:310] 2024-07-29 19:35:13,860 >> {'loss': 0.0007, 'learning_rate': 1.8972e-10, 'epoch': 4.97, 'throughput': 500.50}
+
+[INFO|callbacks.py:310] 2024-07-29 19:35:27,054 >> {'loss': 0.0002, 'learning_rate': 8.4323e-11, 'epoch': 4.98, 'throughput': 500.52}
+
+[INFO|callbacks.py:310] 2024-07-29 19:35:40,250 >> {'loss': 0.0032, 'learning_rate': 2.1081e-11, 'epoch': 4.98, 'throughput': 500.55}
+
+[INFO|callbacks.py:310] 2024-07-29 19:35:53,436 >> {'loss': 0.0107, 'learning_rate': 0.0000e+00, 'epoch': 4.99, 'throughput': 500.53}
+
+[INFO|trainer.py:3503] 2024-07-29 19:36:02,175 >> Saving model checkpoint to saves/LLaMA3.1-8B-Chat/full/train_2024-07-29-16-36-04_llama3.1_reeval/checkpoint-775
+
+[INFO|configuration_utils.py:472] 2024-07-29 19:36:02,178 >> Configuration saved in saves/LLaMA3.1-8B-Chat/full/train_2024-07-29-16-36-04_llama3.1_reeval/checkpoint-775/config.json
+
+[INFO|configuration_utils.py:807] 2024-07-29 19:36:02,178 >> Configuration saved in saves/LLaMA3.1-8B-Chat/full/train_2024-07-29-16-36-04_llama3.1_reeval/checkpoint-775/generation_config.json
+
+[INFO|modeling_utils.py:2763] 2024-07-29 19:36:19,377 >> The model is bigger than the maximum size per checkpoint (5GB) and is going to be split in 4 checkpoint shards. You can find where each parameters has been saved in the index located at saves/LLaMA3.1-8B-Chat/full/train_2024-07-29-16-36-04_llama3.1_reeval/checkpoint-775/model.safetensors.index.json.
+
+[INFO|tokenization_utils_base.py:2702] 2024-07-29 19:36:19,380 >> tokenizer config file saved in saves/LLaMA3.1-8B-Chat/full/train_2024-07-29-16-36-04_llama3.1_reeval/checkpoint-775/tokenizer_config.json
+
+[INFO|tokenization_utils_base.py:2711] 2024-07-29 19:36:19,381 >> Special tokens file saved in saves/LLaMA3.1-8B-Chat/full/train_2024-07-29-16-36-04_llama3.1_reeval/checkpoint-775/special_tokens_map.json
+
+[INFO|trainer.py:2394] 2024-07-29 19:36:57,790 >> 
+
+Training completed. Do not forget to share your model on huggingface.co/models =)
+
+
+
+[INFO|trainer.py:3503] 2024-07-29 19:37:06,485 >> Saving model checkpoint to saves/LLaMA3.1-8B-Chat/full/train_2024-07-29-16-36-04_llama3.1_reeval
+
+[INFO|configuration_utils.py:472] 2024-07-29 19:37:06,488 >> Configuration saved in saves/LLaMA3.1-8B-Chat/full/train_2024-07-29-16-36-04_llama3.1_reeval/config.json
+
+[INFO|configuration_utils.py:807] 2024-07-29 19:37:06,488 >> Configuration saved in saves/LLaMA3.1-8B-Chat/full/train_2024-07-29-16-36-04_llama3.1_reeval/generation_config.json
+
+[INFO|modeling_utils.py:2763] 2024-07-29 19:37:24,386 >> The model is bigger than the maximum size per checkpoint (5GB) and is going to be split in 4 checkpoint shards. You can find where each parameters has been saved in the index located at saves/LLaMA3.1-8B-Chat/full/train_2024-07-29-16-36-04_llama3.1_reeval/model.safetensors.index.json.
+
+[INFO|tokenization_utils_base.py:2702] 2024-07-29 19:37:24,389 >> tokenizer config file saved in saves/LLaMA3.1-8B-Chat/full/train_2024-07-29-16-36-04_llama3.1_reeval/tokenizer_config.json
+
+[INFO|tokenization_utils_base.py:2711] 2024-07-29 19:37:24,390 >> Special tokens file saved in saves/LLaMA3.1-8B-Chat/full/train_2024-07-29-16-36-04_llama3.1_reeval/special_tokens_map.json
+
+[WARNING|ploting.py:89] 2024-07-29 19:37:25,805 >> No metric eval_loss to plot.
+
+[WARNING|ploting.py:89] 2024-07-29 19:37:25,806 >> No metric eval_accuracy to plot.
+
+[INFO|modelcard.py:449] 2024-07-29 19:37:25,806 >> Dropping the following result as it does not have all the necessary fields:
+{'task': {'name': 'Causal Language Modeling', 'type': 'text-generation'}}
+